initial

2023-08-14 15:30:24 -07:00 · 2023-08-14 15:30:24 -07:00 · c09e02c441
commit c09e02c441
15 changed files with 500 additions and 0 deletions
--- a/snowscraper/init.py
+++ b/snowscraper/init.py
--- a/snowscraper/api.py
+++ b/snowscraper/api.py
@ -0,0 +1,3 @@
+from fastapi import APIRouter
+
+# Router object for the API; webserver.py imports it as `api_v1` and mounts it
+# under the "/api/v1" prefix. No routes are registered on it in this module.
+app = APIRouter()
--- a/snowscraper/cli.py
+++ b/snowscraper/cli.py
@ -0,0 +1,73 @@
+import argparse
+import asyncio
+import sys
+from datetime import datetime
+from datetime import timezone
+from types import FrameType
+
+import uvicorn
+import uvloop
+
+from .controller import import_scrapers
+from .controller import run_all
+
+# Module-level placeholders. main() rebinds both (via `global`) when it imports
+# the webserver and scheduler apps; start_server() reads them afterwards.
+app_fastapi = None
+app_rocketry = None
+
+
+class SnowScraper(uvicorn.Server):
+    """uvicorn server that prints a shutdown notice when asked to exit."""
+
+    def handle_exit(self, sig: int, frame: FrameType) -> None:
+        """Announce the shutdown, then defer to ``uvicorn.Server.handle_exit``.
+
+        Args:
+        - sig (int): Signal number forwarded to the parent handler.
+        - frame (FrameType): Stack frame forwarded to the parent handler.
+        """
+        print("Shutting down SnowScraper")
+        parent_result = super().handle_exit(sig, frame)
+        return parent_result
+
+
+async def start_server() -> None:
+    """Run the web server and the scheduler concurrently on the current loop.
+
+    Both ``server.serve()`` and ``app_rocketry.serve()`` are wrapped in tasks;
+    this coroutine returns as soon as the first of the two finishes.
+
+    Reads the module globals ``app_fastapi`` and ``app_rocketry``, which
+    main() must have bound before this coroutine is run.
+    """
+    print("Starting SnowScraper")
+    server = SnowScraper(config=uvicorn.Config(app_fastapi, workers=1, loop="uvloop"))
+    fastapi = asyncio.create_task(server.serve())
+    rocket = asyncio.create_task(app_rocketry.serve())
+
+    # Return when either side stops; the other task is left pending for the
+    # caller's event-loop teardown to deal with.
+    await asyncio.wait([rocket, fastapi], return_when=asyncio.FIRST_COMPLETED)
+
+
+def main(args: argparse.Namespace) -> None:
+    """Load the apps, run every scraper once, then start the servers.
+
+    Args:
+    - args (argparse.Namespace): Parsed command-line options, passed to run_all().
+    """
+    global app_fastapi, app_rocketry
+
+    # Imported here rather than at module top; the imports rebind the two
+    # module-level globals declared above.
+    from .scheduler import app as app_rocketry
+    from .webserver import app as app_fastapi
+
+    import_scrapers()
+    run_all(args)
+
+    # Interpreters older than 3.11 have no asyncio.Runner: install uvloop
+    # globally and use asyncio.run() instead.
+    if sys.version_info < (3, 11):
+        uvloop.install()
+        asyncio.run(start_server())
+        return
+
+    with asyncio.Runner(loop_factory=uvloop.new_event_loop) as runner:
+        runner.run(start_server())
+
+
+def run():
+    """Command-line entry point: parse ``--after`` and call main().
+
+    ``--after`` is optional. When given it must be ``MM-DD-YYYY``; it is
+    converted to a UTC-aware datetime. A malformed value prints an error and
+    exits with status 1.
+    """
+    cli = argparse.ArgumentParser(description="Snowflake scraper")
+    cli.add_argument(
+        "--after",
+        type=str,
+        required=False,
+        default=None,
+        help="Scrape only after a specific date in the format 'MM-DD-YYYY' in UTC",
+    )
+    args = cli.parse_args()
+
+    if args.after:
+        try:
+            parsed_after = datetime.strptime(args.after, "%m-%d-%Y")
+        except ValueError:
+            print(f"Error: The 'after' argument should be in the format MM-DD-YYYY. You provided: {args.after}")
+            sys.exit(1)
+        args.after = parsed_after.replace(tzinfo=timezone.utc)
+
+    main(args)
+
+
+# Invoke the CLI entry point when this module is executed as the main program.
+if __name__ == "__main__":
+    run()
--- a/snowscraper/controller.py
+++ b/snowscraper/controller.py
@ -0,0 +1,23 @@
+import argparse
+import importlib
+import pkgutil
+from pathlib import Path
+
+# Registry mapping a scraper class's __name__ to the class itself.
+# Filled by register_scraper() and iterated by run_all().
+SCRAPERS = {}
+
+
+def register_scraper(cls):
+    """Class decorator that records ``cls`` in SCRAPERS under its class name.
+
+    Args:
+    - cls (type): The class to register.
+
+    Returns:
+    - type: ``cls`` itself, unchanged, so the decorated name still refers to it.
+    """
+    SCRAPERS.update({cls.__name__: cls})
+    return cls
+
+
+def run_all(args: argparse.Namespace):
+    """Instantiate every registered scraper and call its ``scrape()`` method.
+
+    Args:
+    - args (argparse.Namespace): Parsed CLI options; only ``after`` is read.
+
+    ``after`` is forwarded to the scraper constructors only when it is set.
+    When it is missing or None the keyword is omitted, so each scraper uses
+    its own default cutoff instead of receiving None.
+    """
+    after = getattr(args, "after", None)
+    # Passing after=None would override the constructors' default value.
+    init_kwargs = {} if after is None else {"after": after}
+    for scraper_cls in SCRAPERS.values():
+        scraper = scraper_cls(**init_kwargs)
+        scraper.scrape()
+
+
+def import_scrapers():
+    """Import every module found in the ``scrapers`` directory next to this file.
+
+    Each module is imported relative to the current package as
+    ``.scrapers.<name>``; importing it runs any ``@register_scraper``
+    decorators the module contains.
+    """
+    directory = Path(__file__).resolve().parent / "scrapers"
+    # Hand pkgutil a plain string path: not every interpreter release handles
+    # pathlib.Path entries here, and a str works on all of them.
+    for module_info in pkgutil.iter_modules([str(directory)]):
+        importlib.import_module(f".scrapers.{module_info.name}", __package__)
--- a/snowscraper/helpers.py
+++ b/snowscraper/helpers.py
@ -0,0 +1,36 @@
+from datetime import datetime
+
+
+def string_to_datetime(date_string):
+    """
+    Convert a date string to a datetime object.
+
+    This function supports the following formats:
+    - ISO 8601 with and without timezone offset: e.g., '2023-08-02T09:27:54-05:00' or '2023-08-07T18:32:05.537Z'
+    - RFC 1123: e.g., 'Mon, 07 Aug 2023 18:32:05 GMT'
+
+    Args:
+    - date_string (str): The date string to convert
+
+    Returns:
+    - datetime.datetime: The datetime representation of the provided date string.
+      ISO 8601 strings carrying an offset (or a trailing 'Z') and RFC 1123
+      strings ending in 'GMT' or 'UTC' yield timezone-aware values; an ISO 8601
+      string without an offset yields a naive value.
+
+    Raises:
+    - ValueError: If the provided date string doesn't match any of the supported formats.
+    """
+    # Imported locally because this module only imports `datetime` at the top.
+    from datetime import timezone
+
+    try:
+        # First, try ISO 8601 format. Only a trailing "Z" is a UTC designator,
+        # so rewrite just that suffix (fromisoformat() did not accept "Z"
+        # before Python 3.11).
+        if date_string.endswith("Z"):
+            return datetime.fromisoformat(date_string[:-1] + "+00:00")
+        return datetime.fromisoformat(date_string)
+    except ValueError:
+        pass
+
+    try:
+        # Then, try RFC 1123 format
+        parsed = datetime.strptime(date_string, "%a, %d %b %Y %H:%M:%S %Z")
+    except ValueError:
+        pass
+    else:
+        # strptime() accepts the zone name for %Z but leaves tzinfo unset.
+        # Attach UTC when the string says so, so the result can be compared
+        # with timezone-aware datetimes.
+        if date_string.endswith(("GMT", "UTC")):
+            return parsed.replace(tzinfo=timezone.utc)
+        return parsed
+
+    # If neither format matches, raise an exception
+    raise ValueError(f"Unsupported date format: {date_string}")
--- a/snowscraper/scheduler.py
+++ b/snowscraper/scheduler.py
@ -0,0 +1,6 @@
+from rocketry import Rocketry
+
+# Scheduler application object; cli.py imports it and awaits app.serve().
+# NOTE(review): execution="async" is passed straight to Rocketry; presumably it
+# selects the default task execution mode. Confirm against the Rocketry docs.
+app = Rocketry(execution="async")
+
+# Start the scheduler on its own when this module is executed as the main program.
+if __name__ == "__main__":
+    app.run()
--- a/snowscraper/scraper.py
+++ b/snowscraper/scraper.py
@ -0,0 +1,23 @@
+from abc import ABC
+from abc import abstractmethod
+
+
+class BaseScraper(ABC):
+    """Abstract base class for scrapers.
+
+    Subclasses implement ``scrape()`` and ``transform()``; ``run()`` chains
+    them and optionally calls ``validate()``.
+    """
+
+    def run(self, validate=True):
+        """Scrape, transform and optionally validate.
+
+        Args:
+        - validate (bool): When true, call ``validate()`` after transforming.
+
+        Returns:
+        - The value returned by ``transform()``.
+        """
+        # Call the abstract hooks by their declared names. ``_scrape`` and
+        # ``_transform`` are not defined on this class, and ``transform()``
+        # is declared without a data argument.
+        self.scraped_json = self.scrape()
+        self.transformed_json = self.transform()
+        if validate:
+            self.validate()
+        return self.transformed_json
+
+    def validate(self):
+        """Return the transformed data when it is falsy; otherwise return None.
+
+        NOTE(review): run() ignores this return value, so the intended
+        validation contract is unclear. Confirm before relying on it.
+        """
+        if not self.transformed_json:
+            return self.transformed_json
+
+    @abstractmethod
+    def scrape(self):
+        """Collect the raw data; must be implemented by subclasses."""
+        pass
+
+    @abstractmethod
+    def transform(self):
+        """Return the processed data; must be implemented by subclasses."""
+        pass
--- a/snowscraper/scrapers/medium.py
+++ b/snowscraper/scrapers/medium.py
@ -0,0 +1,34 @@
+from datetime import datetime
+from datetime import timezone
+
+import feedparser
+
+from ..controller import register_scraper
+from ..helpers import string_to_datetime
+from ..scraper import BaseScraper
+
+
+@register_scraper
+class MediumScraper(BaseScraper):
+    """Scraper for the feed at ``MediumScraper.url``, parsed with feedparser."""
+
+    url = "https://medium.com/feed/snowflake"
+
+    def __init__(self, after=datetime(1970, 1, 1, tzinfo=timezone.utc), *args, **kwargs):
+        """Create the scraper.
+
+        Args:
+        - after (datetime.datetime | None): Only entries updated after this
+          moment are kept. None is treated like the default (the Unix epoch, UTC).
+        """
+        super(MediumScraper, self).__init__(*args, **kwargs)
+        self.data = {}
+        # An explicit None would make the `updated > self.after` comparison in
+        # scrape() raise TypeError, so fall back to the epoch default.
+        self.after = datetime(1970, 1, 1, tzinfo=timezone.utc) if after is None else after
+
+    def scrape(self):
+        """Fetch the feed and store entries updated after ``self.after``.
+
+        Returns:
+        - dict: ``self.data``, keyed by entry link; each value holds the
+          entry's title plus its parsed ``published`` and ``updated`` dates.
+        """
+        print("Scraping Medium")
+        for entry in feedparser.parse(MediumScraper.url)["entries"]:
+            updated = string_to_datetime(entry["updated"])
+            if updated > self.after:
+                self.data[entry["link"]] = {
+                    "title": entry["title"],
+                    "published": string_to_datetime(entry["published"]),
+                    "updated": updated,
+                }
+        print(self.data)
+        return self.data
+
+    def transform(self):
+        """Return the collected data unchanged."""
+        return self.data
--- a/snowscraper/scrapers/quickstarts.py
+++ b/snowscraper/scrapers/quickstarts.py
@ -0,0 +1,47 @@
+from datetime import datetime
+from datetime import timezone
+
+import scrapy
+from scrapy.crawler import CrawlerProcess
+
+from ..controller import register_scraper
+from ..scraper import BaseScraper
+from snowscraper.helpers import string_to_datetime
+
+QuickStartsURL = "https://quickstarts.snowflake.com/"
+
+
+@register_scraper
+class QuickstartScraper(BaseScraper, scrapy.Spider):
+    """Scrapy spider that collects quickstart cards from ``QuickStartsURL``."""
+
+    name = "snowflakespider"
+
+    def __init__(self, after=datetime(1970, 1, 1, tzinfo=timezone.utc), *args, **kwargs):
+        """Create the scraper/spider.
+
+        Args:
+        - after (datetime.datetime | None): Only cards updated after this
+          moment are kept. None is treated like the default (the Unix epoch, UTC).
+        """
+        super(QuickstartScraper, self).__init__(*args, **kwargs)
+        self.data = {}
+        # An explicit None would make the `updated > self.after` comparison in
+        # parse() raise TypeError, so fall back to the epoch default.
+        self.after = datetime(1970, 1, 1, tzinfo=timezone.utc) if after is None else after
+
+    def start_requests(self):
+        """Yield the single request for the quickstarts index page."""
+        yield scrapy.Request(url=QuickStartsURL, callback=self.parse)
+
+    def scrape(self):
+        """Run a crawl in a new CrawlerProcess and return the collected data.
+
+        Returns:
+        - dict: ``self.data``, keyed by quickstart URL.
+        """
+        print("Scraping Quickstarts")
+        process = CrawlerProcess({"LOG_LEVEL": "ERROR"})
+        # Scrapy builds its own spider instance from the class, so results
+        # accumulate on that instance rather than on `self`. Keep a handle on
+        # the crawler to copy them back once the crawl has finished.
+        crawler = process.create_crawler(QuickstartScraper)
+        process.crawl(crawler, after=self.after)
+        process.start()
+        spider = getattr(crawler, "spider", None)
+        if spider is not None and spider is not self:
+            self.data.update(spider.data)
+        return self.data
+
+    def parse(self, response):
+        """Record and yield every card updated after ``self.after``.
+
+        Args:
+        - response: The response whose ``card-sorter#cards > a.codelab-card``
+          elements are read.
+        """
+        for card in response.css("card-sorter#cards > a.codelab-card"):
+            updated = string_to_datetime(card.attrib["data-updated"])
+            if updated > self.after:
+                print(f"Updated: {updated} > {self.after}")
+                key = QuickStartsURL.rstrip("/") + card.attrib["href"]
+                self.data[key] = {
+                    "title": card.attrib["data-title"],
+                    "updated": updated,
+                    "tags": card.attrib["data-tags"],
+                }
+                print(key, self.data[key])
+                yield self.data[key]
+
+    def transform(self):
+        """Return the collected data unchanged."""
+        return self.data
--- a/snowscraper/scrapers/youtube.py
+++ b/snowscraper/scrapers/youtube.py
@ -0,0 +1,20 @@
+from datetime import datetime
+from datetime import timezone
+
+from ..controller import register_scraper
+from ..scraper import BaseScraper
+
+
+@register_scraper
+class YoutubeScraper(BaseScraper):
+    """Placeholder YouTube scraper; scraping is not implemented yet."""
+
+    def __init__(self, after=datetime(1970, 1, 1, tzinfo=timezone.utc), *args, **kwargs):
+        """Create the scraper.
+
+        Args:
+        - after (datetime.datetime | None): Cutoff stored on the instance.
+          None is treated like the default (the Unix epoch, UTC).
+        """
+        super(YoutubeScraper, self).__init__(*args, **kwargs)
+        self.data = {}
+        # Normalise None so self.after is always a datetime.
+        self.after = datetime(1970, 1, 1, tzinfo=timezone.utc) if after is None else after
+
+    def scrape(self):
+        """Reset the collected data and return it (always empty for now).
+
+        Returns:
+        - dict: ``self.data``, which is empty because nothing is scraped.
+        """
+        print("Scraping YouTube... Unimplemented")
+        self.data = {}
+        return self.data
+
+    def transform(self):
+        """Return the collected data unchanged."""
+        return self.data
--- a/snowscraper/webserver.py
+++ b/snowscraper/webserver.py
@ -0,0 +1,11 @@
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+
+from .api import app as api_v1
+
+# The web application: CORS middleware first, then the API router mounted
+# under /api/v1.
+app = FastAPI()
+# CORS policy: any origin and any request header, GET requests only.
+app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["GET"], allow_headers=["*"])
+app.include_router(api_v1, prefix="/api/v1")
+
+if __name__ == "__main__":
+    # A FastAPI instance has no run() method; serve it through uvicorn instead.
+    # Imported here so that importing this module does not need uvicorn.
+    import uvicorn
+
+    uvicorn.run(app)