From 47f68794dc154088e164e2ad7ebea2a48eceb1e9 Mon Sep 17 00:00:00 2001
From: Marc Nixon
Date: Fri, 1 Sep 2023 09:58:29 -0700
Subject: [PATCH] initial commit

---
 pyproject.toml                      | 24 +++++++++++++
 snowscraper/__init__.py             |  0
 snowscraper/cli.py                  | 37 ++++++++++++++++++++
 snowscraper/controller.py           | 25 ++++++++++++++
 snowscraper/helpers.py              | 19 +++++++++++
 snowscraper/scraper.py              | 23 +++++++++++++
 snowscraper/scrapers/medium.py      | 33 ++++++++++++++++++
 snowscraper/scrapers/quickstarts.py | 53 +++++++++++++++++++++++++++++
 8 files changed, 214 insertions(+)
 create mode 100644 pyproject.toml
 create mode 100644 snowscraper/__init__.py
 create mode 100644 snowscraper/cli.py
 create mode 100644 snowscraper/controller.py
 create mode 100644 snowscraper/helpers.py
 create mode 100644 snowscraper/scraper.py
 create mode 100644 snowscraper/scrapers/medium.py
 create mode 100644 snowscraper/scrapers/quickstarts.py

diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..195f36c
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,24 @@
+[build-system]
+requires = ["setuptools>=67.8"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "snowscraper"
+version = "0.0.1"
+authors = [{ name = "pdf", email = "git@nixon.mozmail.com" }]
+description = "Snowflake scraper"
+requires-python = ">=3.9"
+license = { text = "MIT" }
+dependencies = ["pydantic==1.10.10", "scrapy>=2.10.0", "feedparser>=6.0.10"]
+
+[project.scripts]
+snowscraper = "snowscraper.cli:run"
+
+[tool.setuptools]
+py-modules = ["snowscraper"]
+
+[tool.bandit]
+exclude_dirs = ["/doc", "/build"]
+
+[tool.black]
+line-length = 120
diff --git a/snowscraper/__init__.py b/snowscraper/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/snowscraper/cli.py b/snowscraper/cli.py
new file mode 100644
index 0000000..1ec5e11
--- /dev/null
+++ b/snowscraper/cli.py
@@ -0,0 +1,37 @@
+import argparse
+import sys
+from datetime import datetime
+from datetime import timezone
+
+from .controller import import_scrapers
+from .controller import run_all
+
+
+def main(args: argparse.Namespace) -> None:
+    import_scrapers()
+    run_all(args)
+
+
+def run():
+    parser = argparse.ArgumentParser(description="Snowflake scraper")
+    parser.add_argument(
+        "--after",
+        type=str,
+        required=False,
+        default=None,
+        help="Scrape only after a specific date in the format 'MM-DD-YYYY' in UTC",
+    )
+
+    args = parser.parse_args()
+
+    if args.after:
+        try:
+            args.after = datetime.strptime(args.after, "%m-%d-%Y").replace(tzinfo=timezone.utc)
+        except ValueError:
+            print(f"Error: The 'after' argument should be in the format MM-DD-YYYY. You provided: {args.after}")
+            sys.exit(1)
+    main(args)
+
+
+if __name__ == "__main__":
+    run()
diff --git a/snowscraper/controller.py b/snowscraper/controller.py
new file mode 100644
index 0000000..e6d77f2
--- /dev/null
+++ b/snowscraper/controller.py
@@ -0,0 +1,25 @@
+import argparse
+import importlib
+import pkgutil
+from pathlib import Path
+
+SCRAPERS = {}
+
+
+def register_scraper(cls):
+    SCRAPERS[cls.__name__] = cls
+    return cls
+
+
+def run_all(args: argparse.Namespace):
+    results = {}
+    for scraper_cls in SCRAPERS.values():
+        scraper = scraper_cls(after=args.after)
+        results |= scraper.scrape()
+    print(results)
+
+
+def import_scrapers():
+    directory = Path(__file__).resolve().parent / "scrapers"
+    for module_loader, name, ispkg in pkgutil.iter_modules([directory]):
+        importlib.import_module(f".scrapers.{name}", __package__)
diff --git a/snowscraper/helpers.py b/snowscraper/helpers.py
new file mode 100644
index 0000000..dc7849d
--- /dev/null
+++ b/snowscraper/helpers.py
@@ -0,0 +1,19 @@
+from datetime import datetime, timezone
+
+
+def string_to_datetime(date_string):
+    try:
+        # try ISO 8601
+        if "Z" in date_string:
+            return datetime.fromisoformat(date_string.replace("Z", "+00:00"))
+        return datetime.fromisoformat(date_string)
+    except ValueError:
+        pass
+
+    try:
+        # try RFC 1123; %Z does not set tzinfo, so mark the parsed value as UTC
+        return datetime.strptime(date_string, "%a, %d %b %Y %H:%M:%S %Z").replace(tzinfo=timezone.utc)
+    except ValueError:
+        pass
+
+    raise ValueError(f"Unsupported date format: {date_string}")
diff --git a/snowscraper/scraper.py b/snowscraper/scraper.py
new file mode 100644
index 0000000..a478987
--- /dev/null
+++ b/snowscraper/scraper.py
@@ -0,0 +1,23 @@
+from abc import ABC
+from abc import abstractmethod
+
+
+class BaseScraper(ABC):
+    def run(self, validate=True):
+        self.scraped_json = self.scrape()
+        self.transformed_json = self.transform()
+        if validate:
+            self.validate()
+        return self.transformed_json
+
+    def validate(self):
+        if not self.transformed_json:
+            return self.transformed_json
+
+    @abstractmethod
+    def scrape(self):
+        pass
+
+    @abstractmethod
+    def transform(self):
+        pass
diff --git a/snowscraper/scrapers/medium.py b/snowscraper/scrapers/medium.py
new file mode 100644
index 0000000..10b2e59
--- /dev/null
+++ b/snowscraper/scrapers/medium.py
@@ -0,0 +1,33 @@
+from datetime import datetime
+from datetime import timezone
+
+import feedparser
+
+from ..controller import register_scraper
+from ..helpers import string_to_datetime
+from ..scraper import BaseScraper
+
+
+@register_scraper
+class MediumScraper(BaseScraper):
+    url = "https://medium.com/feed/snowflake"
+
+    def __init__(self, *args, **kwargs):
+        super(MediumScraper, self).__init__()
+        self.data = {}
+        self.after = kwargs.get("after") or datetime(1970, 1, 1, tzinfo=timezone.utc)
+
+    def scrape(self):
+        print("Scraping Medium")
+        for entry in feedparser.parse(MediumScraper.url)["entries"]:
+            updated = string_to_datetime(entry["updated"])
+            if updated > self.after:
+                self.data[entry["link"]] = {
+                    "title": entry["title"],
+                    "published": string_to_datetime(entry["published"]),
+                    "updated": updated,
+                }
+        return self.data
+
+    def transform(self):
+        return self.data
diff --git a/snowscraper/scrapers/quickstarts.py b/snowscraper/scrapers/quickstarts.py
new file mode 100644
index 0000000..0f8b329
--- /dev/null
+++ b/snowscraper/scrapers/quickstarts.py
@@ -0,0 +1,53 @@
+from datetime import datetime
+from datetime import timezone
+
+import scrapy
+from scrapy import signals
+from scrapy.crawler import CrawlerProcess
+from scrapy.signalmanager import dispatcher
+
+from ..controller import register_scraper
+from ..scraper import BaseScraper
+from snowscraper.helpers import string_to_datetime
+
+QuickStartsURL = "https://quickstarts.snowflake.com/"
+
+
+@register_scraper
+class QuickstartScraper(BaseScraper, scrapy.Spider):
+    name = "snowflakespider"
+
+    def __init__(self, *args, **kwargs):
+        super(QuickstartScraper, self).__init__(*args, **kwargs)
+        self.data = {}
+        self.after = kwargs.get("after") or datetime(1970, 1, 1, tzinfo=timezone.utc)
+
+    def start_requests(self):
+        yield scrapy.Request(url=QuickStartsURL, callback=self.parse)
+
+    def signal_handler(self, signal, sender, item, response, spider):
+        self.data[item["key"]] = item
+        self.data[item["key"]].pop("key")
+
+    def scrape(self):
+        print("Scraping Quickstarts")
+        dispatcher.connect(self.signal_handler, signal=signals.item_scraped)
+        process = CrawlerProcess({"LOG_LEVEL": "ERROR"})
+        process.crawl(QuickstartScraper, after=self.after)
+        process.start()
+        return self.data
+
+    def parse(self, response):
+        for card in response.css("card-sorter#cards > a.codelab-card"):
+            updated = string_to_datetime(card.attrib["data-updated"])
+            if updated > self.after:
+                key = QuickStartsURL.rstrip("/") + card.attrib["href"]
+                yield {
+                    "key": key,
+                    "title": card.attrib["data-title"],
+                    "updated": updated,
+                    "tags": card.attrib["data-tags"],
+                }
+
+    def transform(self):
+        return self.data
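
For orientation, a minimal sketch (not part of the commit) of how the registration flow above is exercised once the package is installed, e.g. with pip install -e .:

    # import_scrapers() imports every module under snowscraper/scrapers/,
    # so each @register_scraper decorator adds its class to the SCRAPERS dict.
    from snowscraper.controller import SCRAPERS, import_scrapers

    import_scrapers()
    print(sorted(SCRAPERS))  # expected: ['MediumScraper', 'QuickstartScraper']

The console script declared in pyproject.toml drives the same path end to end, for example snowscraper --after 01-31-2023, where the date is only an illustration of the MM-DD-YYYY format parsed by cli.py.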