This commit is contained in:
hackish 2023-08-14 15:30:24 -07:00
commit c09e02c441
15 changed files with 500 additions and 0 deletions

3
.flake8 Normal file
View file

@ -0,0 +1,3 @@
[flake8]
# Generous limit; black (configured to 120 in pyproject.toml) reflows most
# lines well under this, so flake8 only flags extreme cases.
max-line-length = 160
exclude = docs/*, .git, __pycache__, build

161
.gitignore vendored Normal file
View file

@ -0,0 +1,161 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
**/.DS_Store

28
.pre-commit-config.yaml Normal file
View file

@ -0,0 +1,28 @@
# pre-commit hook configuration (https://pre-commit.com).
# Run `pre-commit install` once; hooks then run automatically on each commit.
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v3.2.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-yaml
      - id: check-added-large-files
  - repo: https://github.com/asottile/reorder_python_imports
    rev: v3.9.0
    hooks:
      - id: reorder-python-imports
        args: [--application-directories, '.:snowscraper', --py39-plus]
  - repo: https://github.com/psf/black
    rev: 23.3.0
    hooks:
      - id: black
        language_version: python3.11
  - repo: https://github.com/PyCQA/bandit
    rev: 1.7.5
    hooks:
      - id: bandit
        # bandit reads its settings from [tool.bandit] in pyproject.toml.
        args: ["-c", "pyproject.toml"]
        additional_dependencies: ["bandit[toml]"]
  - repo: https://github.com/pycqa/flake8
    rev: 6.0.0
    hooks:
      - id: flake8

32
pyproject.toml Normal file
View file

@ -0,0 +1,32 @@
[build-system]
requires = ["setuptools>=67.8"]
build-backend = "setuptools.build_meta"

[project]
name = "snowscraper"
version = "0.0.1"
authors = [{ name = "pdf", email = "git@nixon.mozmail.com" }]
description = "Snowflake scraper"
requires-python = ">=3.9"
license = { text = "MIT" }
dependencies = [
    # NOTE(review): pydantic pinned to an exact v1 release — presumably for
    # rocketry/fastapi compatibility; confirm before upgrading.
    "pydantic==1.10.10",
    "scrapy>=2.10.0",
    "feedparser>=6.0.10",
    "uvloop>=0.17.0",
    "rocketry>=2.5.1",
    "fastapi>=0.101.0",
    "uvicorn>=0.22.0",
]

[project.scripts]
# Console entry point: `snowscraper` runs snowscraper.cli:run.
snowscraper = "snowscraper.cli:run"

[tool.setuptools]
py-modules = ["snowscraper"]

[tool.bandit]
exclude_dirs = ["/doc", "/build"]

[tool.black]
line-length = 120

0
snowscraper/__init__.py Normal file
View file

3
snowscraper/api.py Normal file
View file

@ -0,0 +1,3 @@
from fastapi import APIRouter

# Versioned API router; webserver.py mounts it under the /api/v1 prefix.
app = APIRouter()

73
snowscraper/cli.py Normal file
View file

@ -0,0 +1,73 @@
import argparse
import asyncio
import sys
from datetime import datetime
from datetime import timezone
from types import FrameType
import uvicorn
import uvloop
from .controller import import_scrapers
from .controller import run_all
# Populated by main() via deferred imports before the servers start;
# None until then.
app_fastapi = None
app_rocketry = None


class SnowScraper(uvicorn.Server):
    """uvicorn.Server subclass that logs a message when shutting down."""

    def handle_exit(self, sig: int, frame: FrameType) -> None:
        # Invoked by uvicorn's signal handling (e.g. SIGINT/SIGTERM); log,
        # then defer to the default shutdown behavior.
        # NOTE(review): uvicorn may pass frame=None here — an
        # Optional[FrameType] annotation would be stricter; confirm.
        print("Shutting down SnowScraper")
        return super().handle_exit(sig, frame)
async def start_server() -> None:
    """Run the uvicorn web server and the Rocketry scheduler concurrently.

    Reads the module-level ``app_fastapi`` and ``app_rocketry`` globals,
    which ``main()`` populates before invoking this coroutine. Returns as
    soon as either task finishes (FIRST_COMPLETED), so a crash or shutdown
    of one service ends the other's supervision as well.
    """
    print("Starting SnowScraper")
    server = SnowScraper(config=uvicorn.Config(app_fastapi, workers=1, loop="uvloop"))
    fastapi = asyncio.create_task(server.serve())
    rocket = asyncio.create_task(app_rocketry.serve())
    # BUG FIX: removed the stray statement ``app_rocketry.task`` that
    # evaluated an attribute and discarded the result (a no-op left over
    # from development).
    await asyncio.wait([rocket, fastapi], return_when=asyncio.FIRST_COMPLETED)
def main(args: argparse.Namespace) -> None:
    """Wire up the web app and scheduler, run all scrapers, then serve.

    Args:
        args: Parsed CLI namespace; forwarded to ``run_all`` (which reads
            ``args.after``).
    """
    global app_fastapi, app_rocketry
    # Deferred imports: the FastAPI and Rocketry apps are only constructed
    # when actually serving. The ``as`` aliases rebind the module-level
    # globals declared above (the import statement is a name binding).
    from .scheduler import app as app_rocketry
    from .webserver import app as app_fastapi

    import_scrapers()
    run_all(args)
    if sys.version_info >= (3, 11):
        # asyncio.Runner is new in 3.11; drive it with uvloop's loop factory.
        with asyncio.Runner(loop_factory=uvloop.new_event_loop) as runner:
            runner.run(start_server())
    else:
        # Pre-3.11 fallback: install uvloop as the global event-loop policy.
        uvloop.install()
        asyncio.run(start_server())
def run():
    """CLI entry point: parse arguments, normalize --after, and dispatch.

    ``--after`` is accepted as an 'MM-DD-YYYY' string and converted to an
    aware UTC datetime before being handed to ``main``. An unparseable value
    prints an error and exits with status 1.
    """
    arg_parser = argparse.ArgumentParser(description="Snowflake scraper")
    arg_parser.add_argument(
        "--after",
        type=str,
        required=False,
        default=None,
        help="Scrape only after a specific date in the format 'MM-DD-YYYY' in UTC",
    )
    args = arg_parser.parse_args()
    if args.after:
        try:
            parsed = datetime.strptime(args.after, "%m-%d-%Y")
        except ValueError:
            print(f"Error: The 'after' argument should be in the format MM-DD-YYYY. You provided: {args.after}")
            sys.exit(1)
        else:
            args.after = parsed.replace(tzinfo=timezone.utc)
    main(args)


if __name__ == "__main__":
    run()

23
snowscraper/controller.py Normal file
View file

@ -0,0 +1,23 @@
import argparse
import importlib
import pkgutil
from pathlib import Path
# Registry of scraper classes keyed by class name; filled in as a side
# effect of importing scraper modules (each applies @register_scraper).
SCRAPERS = {}


def register_scraper(cls):
    """Class decorator: record *cls* in SCRAPERS under its class name and
    return the class unchanged."""
    SCRAPERS.update({cls.__name__: cls})
    return cls
def run_all(args: argparse.Namespace):
    """Instantiate each registered scraper with the CLI cutoff and run it.

    Args:
        args: Parsed CLI namespace; only ``args.after`` is read.
    """
    for scraper_name in SCRAPERS:
        SCRAPERS[scraper_name](after=args.after).scrape()
def import_scrapers():
    """Import every module in the sibling ``scrapers`` package.

    Importing a scraper module fires its ``@register_scraper`` decorator,
    which adds the scraper class to ``SCRAPERS`` — this is the discovery
    step that ``run_all`` depends on.
    """
    directory = Path(__file__).resolve().parent / "scrapers"
    # BUG FIX: pkgutil path entries should be strings; passing a bare Path
    # can fail to match the FileFinder path hook on some Python versions,
    # so convert explicitly. Unused tuple members are named _-prefixed.
    for _finder, name, _ispkg in pkgutil.iter_modules([str(directory)]):
        importlib.import_module(f".scrapers.{name}", __package__)

36
snowscraper/helpers.py Normal file
View file

@ -0,0 +1,36 @@
from datetime import datetime
from email.utils import parsedate_to_datetime


def string_to_datetime(date_string):
    """
    Convert a date string to a datetime object.

    This function supports the following formats:
    - ISO 8601 with and without timezone offset: e.g., '2023-08-02T09:27:54-05:00' or '2023-08-07T18:32:05.537Z'
    - RFC 1123: e.g., 'Mon, 07 Aug 2023 18:32:05 GMT'

    Args:
    - date_string (str): The date string to convert

    Returns:
    - datetime.datetime: The datetime representation of the provided date string.
      RFC 1123 input always yields a timezone-aware result; ISO input is aware
      whenever the string carries an offset (a trailing 'Z' is normalized to
      '+00:00').

    Raises:
    - ValueError: If the provided date string doesn't match any of the supported formats.
    """
    try:
        # First, try ISO 8601 format. fromisoformat() rejects a literal 'Z'
        # suffix on Python < 3.11, so normalize it to the '+00:00' offset.
        if "Z" in date_string:
            return datetime.fromisoformat(date_string.replace("Z", "+00:00"))
        return datetime.fromisoformat(date_string)
    except ValueError:
        pass
    try:
        # Then, try RFC 1123 format.
        # BUG FIX: strptime with %Z returned a *naive* datetime (the zone
        # name matched but tzinfo was not set), which raised TypeError when
        # scrapers compared it against their aware `after` cutoffs.
        # parsedate_to_datetime returns an aware datetime with the correct
        # offset.
        return parsedate_to_datetime(date_string)
    except (TypeError, ValueError):
        # TypeError on Python <= 3.9, ValueError on >= 3.10 for bad input.
        pass
    # If neither format matches, raise an exception
    raise ValueError(f"Unsupported date format: {date_string}")

6
snowscraper/scheduler.py Normal file
View file

@ -0,0 +1,6 @@
from rocketry import Rocketry

# Rocketry scheduler app. "async" execution runs tasks on the event loop,
# letting the scheduler share a loop with uvicorn (see cli.start_server).
app = Rocketry(execution="async")

if __name__ == "__main__":
    # Allow running the scheduler standalone, without the web server.
    app.run()

23
snowscraper/scraper.py Normal file
View file

@ -0,0 +1,23 @@
from abc import ABC
from abc import abstractmethod


class BaseScraper(ABC):
    """Abstract base class for all scrapers.

    Subclasses implement ``scrape()`` (fetch raw data) and ``transform()``
    (shape it for output); ``run()`` drives the two-step pipeline and stores
    intermediate results on the instance.
    """

    def run(self, validate=True):
        """Scrape, transform, optionally validate, and return the result.

        Args:
            validate (bool): When True (default), call ``validate()`` after
                transforming.

        Returns:
            The transformed payload produced by ``transform()``.
        """
        # BUG FIX: the original called self._scrape() and
        # self._transform(self.scraped_json), but the abstract hooks that
        # subclasses implement are scrape() and transform() (no underscore,
        # no argument), so run() always raised AttributeError. Call the
        # hooks that actually exist.
        self.scraped_json = self.scrape()
        self.transformed_json = self.transform()
        if validate:
            self.validate()
        return self.transformed_json

    def validate(self):
        """Validation hook; the base implementation only short-circuits on a
        falsy payload (returning it unchanged) and otherwise does nothing.
        Callers currently ignore the return value."""
        if not self.transformed_json:
            return self.transformed_json

    @abstractmethod
    def scrape(self):
        """Fetch raw data; must be implemented by subclasses."""
        pass

    @abstractmethod
    def transform(self):
        """Shape scraped data for output; must be implemented by subclasses."""
        pass

View file

@ -0,0 +1,34 @@
from datetime import datetime
from datetime import timezone
import feedparser
from ..controller import register_scraper
from ..helpers import string_to_datetime
from ..scraper import BaseScraper
@register_scraper
class MediumScraper(BaseScraper):
    """Scrape the Snowflake Medium RSS feed for recently updated posts."""

    # RSS feed for the Snowflake publication on Medium.
    url = "https://medium.com/feed/snowflake"

    def __init__(self, after=datetime(1970, 1, 1, tzinfo=timezone.utc), *args, **kwargs):
        """
        Args:
            after: Aware UTC cutoff; only entries updated strictly after this
                moment are kept. Defaults to the epoch (keep everything).
        """
        super(MediumScraper, self).__init__(*args, **kwargs)
        # Maps entry link -> {"title", "published", "updated"}.
        self.data = {}
        self.after = after

    def scrape(self):
        """Fetch the feed and collect entries updated after ``self.after``.

        Returns:
            dict: The accumulated ``self.data`` mapping.
        """
        print("Scraping Medium")
        for entry in feedparser.parse(MediumScraper.url)["entries"]:
            # NOTE(review): this comparison requires both sides to be aware
            # datetimes — confirm string_to_datetime yields aware values for
            # this feed's date format.
            updated = string_to_datetime(entry["updated"])
            if updated > self.after:
                self.data[entry["link"]] = {
                    "title": entry["title"],
                    "published": string_to_datetime(entry["published"]),
                    "updated": updated,
                }
        print(self.data)
        return self.data

    def transform(self):
        """Return scraped data unchanged (no transformation needed)."""
        return self.data

View file

@ -0,0 +1,47 @@
from datetime import datetime
from datetime import timezone
import scrapy
from scrapy.crawler import CrawlerProcess
from ..controller import register_scraper
from ..scraper import BaseScraper
from snowscraper.helpers import string_to_datetime
# Base URL of the Snowflake Quickstarts catalog.
QuickStartsURL = "https://quickstarts.snowflake.com/"


@register_scraper
class QuickstartScraper(BaseScraper, scrapy.Spider):
    """Scrapy spider that collects Quickstart cards updated after a cutoff.

    NOTE(review): scrape() starts a CrawlerProcess, which constructs its
    *own* spider instance — so the ``self.data`` of the object whose
    scrape() was called is never populated and scrape() returns None,
    unlike MediumScraper.scrape(), which returns its data. Verify whether
    callers expect the collected mapping back.
    """

    # Spider name required by scrapy.
    name = "snowflakespider"

    def __init__(self, after=datetime(1970, 1, 1, tzinfo=timezone.utc), *args, **kwargs):
        """
        Args:
            after: Aware UTC cutoff; only cards updated strictly after this
                moment are yielded. Defaults to the epoch (keep everything).
        """
        super(QuickstartScraper, self).__init__(*args, **kwargs)
        # Maps absolute card URL -> {"title", "updated", "tags"}.
        self.data = {}
        self.after = after

    def start_requests(self):
        # Scrapy entry point: fetch the catalog page and parse it.
        yield scrapy.Request(url=QuickStartsURL, callback=self.parse)

    def scrape(self):
        """Run a blocking scrapy crawl of the Quickstarts catalog."""
        print("Scraping Quickstarts")
        process = CrawlerProcess({"LOG_LEVEL": "ERROR"})
        # NOTE(review): CrawlerProcess.start() blocks and can only be
        # started once per process — confirm this fits the scheduler's
        # repeated-run model.
        process.crawl(QuickstartScraper, after=self.after)
        process.start()

    def parse(self, response):
        """Yield one item per codelab card newer than ``self.after``."""
        for card in response.css("card-sorter#cards > a.codelab-card"):
            updated = string_to_datetime(card.attrib["data-updated"])
            if updated > self.after:
                print(f"Updated: {updated} > {self.after}")
                # href is site-relative; join it onto the catalog base URL.
                key = QuickStartsURL.rstrip("/") + card.attrib["href"]
                self.data[key] = {
                    "title": card.attrib["data-title"],
                    "updated": updated,
                    "tags": card.attrib["data-tags"],
                }
                print(key, self.data[key])
                yield self.data[key]

    def transform(self):
        """Return the mapping accumulated by parse()."""
        return self.data

View file

@ -0,0 +1,20 @@
from datetime import datetime
from datetime import timezone
from ..controller import register_scraper
from ..scraper import BaseScraper
@register_scraper
class YoutubeScraper(BaseScraper):
    """Placeholder scraper for Snowflake YouTube content (not implemented)."""

    def __init__(self, after=datetime(1970, 1, 1, tzinfo=timezone.utc), *args, **kwargs):
        """Store the cutoff date and an empty result mapping.

        Args:
            after: Aware UTC cutoff; unused until scraping is implemented.
        """
        super().__init__(*args, **kwargs)
        self.after = after
        self.data = {}

    def scrape(self):
        """Log that YouTube scraping is not implemented yet."""
        print("Scraping YouTube... Unimplemented")
        self.data = {}

    def transform(self):
        """Return the (empty) result mapping."""
        return self.data

11
snowscraper/webserver.py Normal file
View file

@ -0,0 +1,11 @@
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

from .api import app as api_v1

# FastAPI application served by uvicorn (see cli.start_server).
app = FastAPI()
# Wide-open CORS restricted to GET; tighten allow_origins before exposing
# this service publicly.
app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["GET"], allow_headers=["*"])
app.include_router(api_v1, prefix="/api/v1")

if __name__ == "__main__":
    # BUG FIX: FastAPI applications have no .run() method, so the original
    # `app.run()` raised AttributeError when this module was executed
    # directly. Serve the app with uvicorn instead.
    import uvicorn

    uvicorn.run(app)