commit c09e02c44122c702f746d8725bcd4693564f773d
Author: hackish
Date:   Mon Aug 14 15:30:24 2023 -0700

    initial

diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000..dd0767d
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,3 @@
+[flake8]
+max-line-length = 160
+exclude = docs/*, .git, __pycache__, build
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ee0401b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,161 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+**/.DS_Store
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..1073594
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,28 @@
+repos:
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v3.2.0
+  hooks:
+  - id: trailing-whitespace
+  - id: end-of-file-fixer
+  - id: check-yaml
+  - id: check-added-large-files
+- repo: https://github.com/asottile/reorder_python_imports
+  rev: v3.9.0
+  hooks:
+  - id: reorder-python-imports
+    args: [--application-directories, '.:snowscraper', --py39-plus]
+- repo: https://github.com/psf/black
+  rev: 23.3.0
+  hooks:
+  - id: black
+    language_version: python3.11
+- repo: https://github.com/PyCQA/bandit
+  rev: 1.7.5
+  hooks:
+  - id: bandit
+    args: ["-c", "pyproject.toml"]
+    additional_dependencies: ["bandit[toml]"]
+- repo: https://github.com/pycqa/flake8
+  rev: 6.0.0
+  hooks:
+  - id: flake8
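With this config in place, the hooks would typically be activated with the standard pre-commit workflow (commands illustrative, not part of the commit):

    pip install pre-commit
    pre-commit install          # run the hooks on every git commit
    pre-commit run --all-files  # one-off run across the whole tree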
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..f7f3ff8
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,32 @@
+[build-system]
+requires = ["setuptools>=67.8"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "snowscraper"
+version = "0.0.1"
+authors = [{ name = "pdf", email = "git@nixon.mozmail.com" }]
+description = "Snowflake scraper"
+requires-python = ">=3.9"
+license = { text = "MIT" }
+dependencies = [
+    "pydantic==1.10.10",
+    "scrapy>=2.10.0",
+    "feedparser>=6.0.10",
+    "uvloop>=0.17.0",
+    "rocketry>=2.5.1",
+    "fastapi>=0.101.0",
+    "uvicorn>=0.22.0",
+]
+
+[project.scripts]
+snowscraper = "snowscraper.cli:run"
+
+[tool.setuptools]
+py-modules = ["snowscraper"]
+
+[tool.bandit]
+exclude_dirs = ["/doc", "/build"]
+
+[tool.black]
+line-length = 120
diff --git a/snowscraper/__init__.py b/snowscraper/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/snowscraper/api.py b/snowscraper/api.py
new file mode 100644
index 0000000..df2e285
--- /dev/null
+++ b/snowscraper/api.py
@@ -0,0 +1,3 @@
+from fastapi import APIRouter
+
+app = APIRouter()
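api.py only creates an empty APIRouter; any route added to it would be served under the /api/v1 prefix that webserver.py (later in this diff) applies via include_router. A minimal sketch, where the /health endpoint is hypothetical and not in this commit:

    from fastapi import APIRouter

    app = APIRouter()

    @app.get("/health")
    async def health() -> dict:
        # Mounted by webserver.py as GET /api/v1/health
        return {"status": "ok"}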
diff --git a/snowscraper/cli.py b/snowscraper/cli.py
new file mode 100644
index 0000000..f07fc21
--- /dev/null
+++ b/snowscraper/cli.py
@@ -0,0 +1,73 @@
+import argparse
+import asyncio
+import sys
+from datetime import datetime
+from datetime import timezone
+from types import FrameType
+
+import uvicorn
+import uvloop
+
+from .controller import import_scrapers
+from .controller import run_all
+
+app_fastapi = None
+app_rocketry = None
+
+
+class SnowScraper(uvicorn.Server):
+    def handle_exit(self, sig: int, frame: FrameType) -> None:
+        print("Shutting down SnowScraper")
+        return super().handle_exit(sig, frame)
+
+
+async def start_server() -> None:
+    print("Starting SnowScraper")
+    server = SnowScraper(config=uvicorn.Config(app_fastapi, workers=1, loop="uvloop"))
+    fastapi = asyncio.create_task(server.serve())
+    rocket = asyncio.create_task(app_rocketry.serve())
+
+    # Run both apps concurrently; return as soon as either one exits
+    await asyncio.wait([rocket, fastapi], return_when=asyncio.FIRST_COMPLETED)
+
+
+def main(args: argparse.Namespace) -> None:
+    global app_fastapi, app_rocketry
+
+    from .scheduler import app as app_rocketry
+    from .webserver import app as app_fastapi
+
+    import_scrapers()
+    run_all(args)
+
+    if sys.version_info >= (3, 11):
+        with asyncio.Runner(loop_factory=uvloop.new_event_loop) as runner:
+            runner.run(start_server())
+    else:
+        uvloop.install()
+        asyncio.run(start_server())
+
+
+def run():
+    parser = argparse.ArgumentParser(description="Snowflake scraper")
+    parser.add_argument(
+        "--after",
+        type=str,
+        required=False,
+        default=None,
+        help="Scrape only after a specific date in the format 'MM-DD-YYYY' in UTC",
+    )
+
+    args = parser.parse_args()
+
+    if args.after:
+        try:
+            args.after = datetime.strptime(args.after, "%m-%d-%Y").replace(tzinfo=timezone.utc)
+        except ValueError:
+            print(f"Error: The 'after' argument should be in the format MM-DD-YYYY. You provided: {args.after}")
+            sys.exit(1)
+    main(args)
+
+
+if __name__ == "__main__":
+    run()
diff --git a/snowscraper/controller.py b/snowscraper/controller.py
new file mode 100644
index 0000000..d750043
--- /dev/null
+++ b/snowscraper/controller.py
@@ -0,0 +1,24 @@
+import argparse
+import importlib
+import pkgutil
+from pathlib import Path
+
+SCRAPERS = {}
+
+
+def register_scraper(cls):
+    SCRAPERS[cls.__name__] = cls
+    return cls
+
+
+def run_all(args: argparse.Namespace):
+    for scraper_cls in SCRAPERS.values():
+        # args.after is None when --after is not given; fall back to each scraper's default
+        scraper = scraper_cls(after=args.after) if args.after else scraper_cls()
+        scraper.scrape()
+
+
+def import_scrapers():
+    directory = Path(__file__).resolve().parent / "scrapers"
+    for module_loader, name, ispkg in pkgutil.iter_modules([directory]):
+        importlib.import_module(f".scrapers.{name}", __package__)
diff --git a/snowscraper/helpers.py b/snowscraper/helpers.py
new file mode 100644
index 0000000..a1311e4
--- /dev/null
+++ b/snowscraper/helpers.py
@@ -0,0 +1,37 @@
+from datetime import datetime
+from datetime import timezone
+
+
+def string_to_datetime(date_string):
+    """
+    Convert a date string to a datetime object.
+
+    This function supports the following formats:
+    - ISO 8601 with and without timezone offset: e.g., '2023-08-02T09:27:54-05:00' or '2023-08-07T18:32:05.537Z'
+    - RFC 1123: e.g., 'Mon, 07 Aug 2023 18:32:05 GMT'
+
+    Args:
+    - date_string (str): The date string to convert
+
+    Returns:
+    - datetime.datetime: The datetime representation of the provided date string.
+
+    Raises:
+    - ValueError: If the provided date string doesn't match any of the supported formats.
+    """
+    try:
+        # First, try ISO 8601 format
+        if "Z" in date_string:
+            return datetime.fromisoformat(date_string.replace("Z", "+00:00"))
+        return datetime.fromisoformat(date_string)
+    except ValueError:
+        pass
+
+    try:
+        # Then, try RFC 1123 format; %Z parses to a naive datetime, so attach UTC explicitly
+        return datetime.strptime(date_string, "%a, %d %b %Y %H:%M:%S %Z").replace(tzinfo=timezone.utc)
+    except ValueError:
+        pass
+
+    # If neither format matches, raise an exception
+    raise ValueError(f"Unsupported date format: {date_string}")
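A quick sanity check of string_to_datetime against the formats named in its docstring (illustrative only):

    from snowscraper.helpers import string_to_datetime

    # ISO 8601 with 'Z' suffix -> UTC-aware datetime
    print(string_to_datetime("2023-08-07T18:32:05.537Z"))
    # ISO 8601 with an explicit offset
    print(string_to_datetime("2023-08-02T09:27:54-05:00"))
    # RFC 1123 -> parsed with %Z, UTC attached explicitly
    print(string_to_datetime("Mon, 07 Aug 2023 18:32:05 GMT"))
    # Anything else raises ValueError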
+ """ + try: + # First, try ISO 8601 format + if "Z" in date_string: + return datetime.fromisoformat(date_string.replace("Z", "+00:00")) + return datetime.fromisoformat(date_string) + except ValueError: + pass + + try: + # Then, try RFC 1123 format + return datetime.strptime(date_string, "%a, %d %b %Y %H:%M:%S %Z") + except ValueError: + pass + + # If neither format matches, raise an exception + raise ValueError(f"Unsupported date format: {date_string}") diff --git a/snowscraper/scheduler.py b/snowscraper/scheduler.py new file mode 100644 index 0000000..fdf4c12 --- /dev/null +++ b/snowscraper/scheduler.py @@ -0,0 +1,6 @@ +from rocketry import Rocketry + +app = Rocketry(execution="async") + +if __name__ == "__main__": + app.run() diff --git a/snowscraper/scraper.py b/snowscraper/scraper.py new file mode 100644 index 0000000..a478987 --- /dev/null +++ b/snowscraper/scraper.py @@ -0,0 +1,23 @@ +from abc import ABC +from abc import abstractmethod + + +class BaseScraper(ABC): + def run(self, validate=True): + self.scraped_json = self._scrape() + self.transformed_json = self._transform(self.scraped_json) + if validate: + self.validate() + return self.transformed_json + + def validate(self): + if not self.transformed_json: + return self.transformed_json + + @abstractmethod + def scrape(self): + pass + + @abstractmethod + def transform(self): + pass diff --git a/snowscraper/scrapers/medium.py b/snowscraper/scrapers/medium.py new file mode 100644 index 0000000..d951dad --- /dev/null +++ b/snowscraper/scrapers/medium.py @@ -0,0 +1,34 @@ +from datetime import datetime +from datetime import timezone + +import feedparser + +from ..controller import register_scraper +from ..helpers import string_to_datetime +from ..scraper import BaseScraper + + +@register_scraper +class MediumScraper(BaseScraper): + url = "https://medium.com/feed/snowflake" + + def __init__(self, after=datetime(1970, 1, 1, tzinfo=timezone.utc), *args, **kwargs): + super(MediumScraper, self).__init__(*args, **kwargs) + self.data = {} + self.after = after + + def scrape(self): + print("Scraping Medium") + for entry in feedparser.parse(MediumScraper.url)["entries"]: + updated = string_to_datetime(entry["updated"]) + if updated > self.after: + self.data[entry["link"]] = { + "title": entry["title"], + "published": string_to_datetime(entry["published"]), + "updated": updated, + } + print(self.data) + return self.data + + def transform(self): + return self.data diff --git a/snowscraper/scrapers/quickstarts.py b/snowscraper/scrapers/quickstarts.py new file mode 100644 index 0000000..79a61c5 --- /dev/null +++ b/snowscraper/scrapers/quickstarts.py @@ -0,0 +1,47 @@ +from datetime import datetime +from datetime import timezone + +import scrapy +from scrapy.crawler import CrawlerProcess + +from ..controller import register_scraper +from ..scraper import BaseScraper +from snowscraper.helpers import string_to_datetime + +QuickStartsURL = "https://quickstarts.snowflake.com/" + + +@register_scraper +class QuickstartScraper(BaseScraper, scrapy.Spider): + name = "snowflakespider" + + def __init__(self, after=datetime(1970, 1, 1, tzinfo=timezone.utc), *args, **kwargs): + super(QuickstartScraper, self).__init__(*args, **kwargs) + self.data = {} + self.after = after + + def start_requests(self): + yield scrapy.Request(url=QuickStartsURL, callback=self.parse) + + def scrape(self): + print("Scraping Quickstarts") + process = CrawlerProcess({"LOG_LEVEL": "ERROR"}) + process.crawl(QuickstartScraper, after=self.after) + process.start() + + def parse(self, 
diff --git a/snowscraper/scrapers/quickstarts.py b/snowscraper/scrapers/quickstarts.py
new file mode 100644
index 0000000..79a61c5
--- /dev/null
+++ b/snowscraper/scrapers/quickstarts.py
@@ -0,0 +1,47 @@
+from datetime import datetime
+from datetime import timezone
+
+import scrapy
+from scrapy.crawler import CrawlerProcess
+
+from ..controller import register_scraper
+from ..helpers import string_to_datetime
+from ..scraper import BaseScraper
+
+QuickStartsURL = "https://quickstarts.snowflake.com/"
+
+
+@register_scraper
+class QuickstartScraper(BaseScraper, scrapy.Spider):
+    name = "snowflakespider"
+
+    def __init__(self, after=datetime(1970, 1, 1, tzinfo=timezone.utc), *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.data = {}
+        self.after = after
+
+    def start_requests(self):
+        yield scrapy.Request(url=QuickStartsURL, callback=self.parse)
+
+    def scrape(self):
+        print("Scraping Quickstarts")
+        # Note: CrawlerProcess instantiates its own spider, so scraped items are
+        # collected on that instance rather than on this registry instance.
+        process = CrawlerProcess({"LOG_LEVEL": "ERROR"})
+        process.crawl(QuickstartScraper, after=self.after)
+        process.start()
+
+    def parse(self, response):
+        for card in response.css("card-sorter#cards > a.codelab-card"):
+            updated = string_to_datetime(card.attrib["data-updated"])
+            if updated > self.after:
+                key = QuickStartsURL.rstrip("/") + card.attrib["href"]
+                self.data[key] = {
+                    "title": card.attrib["data-title"],
+                    "updated": updated,
+                    "tags": card.attrib["data-tags"],
+                }
+                yield self.data[key]
+
+    def transform(self):
+        return self.data
diff --git a/snowscraper/scrapers/youtube.py b/snowscraper/scrapers/youtube.py
new file mode 100644
index 0000000..f6c18dc
--- /dev/null
+++ b/snowscraper/scrapers/youtube.py
@@ -0,0 +1,20 @@
+from datetime import datetime
+from datetime import timezone
+
+from ..controller import register_scraper
+from ..scraper import BaseScraper
+
+
+@register_scraper
+class YoutubeScraper(BaseScraper):
+    def __init__(self, after=datetime(1970, 1, 1, tzinfo=timezone.utc), *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.data = {}
+        self.after = after
+
+    def scrape(self):
+        print("Scraping YouTube... Unimplemented")
+        self.data = {}
+
+    def transform(self):
+        return self.data
diff --git a/snowscraper/webserver.py b/snowscraper/webserver.py
new file mode 100644
index 0000000..31a4426
--- /dev/null
+++ b/snowscraper/webserver.py
@@ -0,0 +1,12 @@
+import uvicorn
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+
+from .api import app as api_v1
+
+app = FastAPI()
+app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["GET"], allow_headers=["*"])
+app.include_router(api_v1, prefix="/api/v1")
+
+if __name__ == "__main__":
+    uvicorn.run(app)
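Assuming the layout above, the scraper would typically be installed and invoked via the console script declared in pyproject.toml (commands illustrative):

    pip install -e .
    snowscraper --after 01-01-2023   # MM-DD-YYYY, interpreted as UTC
    # or, without the entry point:
    python -m snowscraper.cli --after 01-01-2023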