diff --git a/pyproject.toml b/pyproject.toml index 195f36c..f7f3ff8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,15 @@ authors = [{ name = "pdf", email = "git@nixon.mozmail.com" }] description = "Snowflake scraper" requires-python = ">=3.9" license = { text = "MIT" } -dependencies = ["pydantic==1.10.10", "scrapy>=2.10.0", "feedparser>=6.0.10"] +dependencies = [ + "pydantic==1.10.10", + "scrapy>=2.10.0", + "feedparser>=6.0.10", + "uvloop>=0.17.0", + "rocketry>=2.5.1", + "fastapi>=0.101.0", + "uvicorn>=0.22.0", +] [project.scripts] snowscraper = "snowscraper.cli:run" diff --git a/snowscraper/api.py b/snowscraper/api.py new file mode 100644 index 0000000..df2e285 --- /dev/null +++ b/snowscraper/api.py @@ -0,0 +1,3 @@ +from fastapi import APIRouter + +app = APIRouter() diff --git a/snowscraper/cli.py b/snowscraper/cli.py index 1ec5e11..f07fc21 100644 --- a/snowscraper/cli.py +++ b/snowscraper/cli.py @@ -1,16 +1,51 @@ import argparse +import asyncio import sys from datetime import datetime from datetime import timezone +from types import FrameType + +import uvicorn +import uvloop from .controller import import_scrapers from .controller import run_all +app_fastapi = None +app_rocketry = None + + +class SnowScraper(uvicorn.Server): + def handle_exit(self, sig: int, frame: FrameType) -> None: + print("Shutting down SnowScraper") + return super().handle_exit(sig, frame) + + +async def start_server() -> None: + print("Starting SnowScraper") + server = SnowScraper(config=uvicorn.Config(app_fastapi, workers=1, loop="uvloop")) + fastapi = asyncio.create_task(server.serve()) + rocket = asyncio.create_task(app_rocketry.serve()) + + await asyncio.wait([rocket, fastapi], return_when=asyncio.FIRST_COMPLETED) + def main(args: argparse.Namespace) -> None: + global app_fastapi, app_rocketry + + from .scheduler import app as app_rocketry + from .webserver import app as app_fastapi + import_scrapers() run_all(args) + if sys.version_info >= (3, 11): + 
with asyncio.Runner(loop_factory=uvloop.new_event_loop) as runner: + runner.run(start_server()) + else: + uvloop.install() + asyncio.run(start_server()) + def run(): parser = argparse.ArgumentParser(description="Snowflake scraper") diff --git a/snowscraper/controller.py b/snowscraper/controller.py index e6d77f2..d750043 100644 --- a/snowscraper/controller.py +++ b/snowscraper/controller.py @@ -12,11 +12,9 @@ def register_scraper(cls): def run_all(args: argparse.Namespace): - results = {} for scraper_cls in SCRAPERS.values(): scraper = scraper_cls(after=args.after) - results |= scraper.scrape() - print(results) + scraper.scrape() def import_scrapers(): diff --git a/snowscraper/helpers.py b/snowscraper/helpers.py index dc7849d..a1311e4 100644 --- a/snowscraper/helpers.py +++ b/snowscraper/helpers.py @@ -2,8 +2,24 @@ from datetime import datetime def string_to_datetime(date_string): + """ + Convert a date string to a datetime object. + + This function supports the following formats: + - ISO 8601 with and without timezone offset: e.g., '2023-08-02T09:27:54-05:00' or '2023-08-07T18:32:05.537Z' + - RFC 1123: e.g., 'Mon, 07 Aug 2023 18:32:05 GMT' + + Args: + - date_string (str): The date string to convert + + Returns: + - datetime.datetime: The datetime representation of the provided date string. + + Raises: + - ValueError: If the provided date string doesn't match any of the supported formats. 
+ """ try: - # try ISO 8601 + # First, try ISO 8601 format if "Z" in date_string: return datetime.fromisoformat(date_string.replace("Z", "+00:00")) return datetime.fromisoformat(date_string) @@ -11,9 +27,10 @@ def string_to_datetime(date_string): pass try: - # try RFC 1123 + # Then, try RFC 1123 format return datetime.strptime(date_string, "%a, %d %b %Y %H:%M:%S %Z") except ValueError: pass + # If neither format matches, raise an exception raise ValueError(f"Unsupported date format: {date_string}") diff --git a/snowscraper/scheduler.py b/snowscraper/scheduler.py new file mode 100644 index 0000000..fdf4c12 --- /dev/null +++ b/snowscraper/scheduler.py @@ -0,0 +1,6 @@ +from rocketry import Rocketry + +app = Rocketry(execution="async") + +if __name__ == "__main__": + app.run() diff --git a/snowscraper/scrapers/medium.py b/snowscraper/scrapers/medium.py index 3825801..3620789 100644 --- a/snowscraper/scrapers/medium.py +++ b/snowscraper/scrapers/medium.py @@ -27,6 +27,7 @@ class MediumScraper(BaseScraper): "published": string_to_datetime(entry["published"]), "updated": updated, } + # print(self.data) return self.data def transform(self): diff --git a/snowscraper/scrapers/quickstarts.py b/snowscraper/scrapers/quickstarts.py index 4b3ff07..2ed37ad 100644 --- a/snowscraper/scrapers/quickstarts.py +++ b/snowscraper/scrapers/quickstarts.py @@ -2,9 +2,7 @@ from datetime import datetime from datetime import timezone import scrapy -from scrapy import signals from scrapy.crawler import CrawlerProcess -from scrapy.signalmanager import dispatcher from ..controller import register_scraper from ..scraper import BaseScraper @@ -25,29 +23,24 @@ class QuickstartScraper(BaseScraper, scrapy.Spider): def start_requests(self): yield scrapy.Request(url=QuickStartsURL, callback=self.parse) - def signal_handler(self, signal, sender, item, response, spider): - self.data[item["key"]] = item - self.data[item["key"]].pop("key") - def scrape(self): print("Scraping Quickstarts") - 
dispatcher.connect(self.signal_handler, signal=signals.item_scraped) process = CrawlerProcess({"LOG_LEVEL": "ERROR"}) process.crawl(QuickstartScraper, after=self.after) process.start() - return self.data def parse(self, response): for card in response.css("card-sorter#cards > a.codelab-card"): updated = string_to_datetime(card.attrib["data-updated"]) if updated > self.after: key = QuickStartsURL.rstrip("/") + card.attrib["href"] - yield { - "key": key, + self.data[key] = { "title": card.attrib["data-title"], "updated": updated, "tags": card.attrib["data-tags"], } + # print(key, self.data[key]) + yield self.data[key] def transform(self): return self.data diff --git a/snowscraper/scrapers/youtube.py b/snowscraper/scrapers/youtube.py new file mode 100644 index 0000000..f6c18dc --- /dev/null +++ b/snowscraper/scrapers/youtube.py @@ -0,0 +1,20 @@ +from datetime import datetime +from datetime import timezone + +from ..controller import register_scraper +from ..scraper import BaseScraper + + +@register_scraper +class YoutubeScraper(BaseScraper): + def __init__(self, after=datetime(1970, 1, 1, tzinfo=timezone.utc), *args, **kwargs): + super(YoutubeScraper, self).__init__(*args, **kwargs) + self.data = {} + self.after = after + + def scrape(self): + print("Scraping YouTube... Unimplemented") + self.data = {} + + def transform(self): + return self.data diff --git a/snowscraper/webserver.py b/snowscraper/webserver.py new file mode 100644 index 0000000..31a4426 --- /dev/null +++ b/snowscraper/webserver.py @@ -0,0 +1,12 @@ +import uvicorn +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware + +from .api import app as api_v1 + +app = FastAPI() +app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["GET"], allow_headers=["*"]) +app.include_router(api_v1, prefix="/api/v1") + +if __name__ == "__main__": + uvicorn.run(app)