From 3c8569814cbf1572741dee317e251be55f0e49d2 Mon Sep 17 00:00:00 2001 From: hackish Date: Mon, 28 Aug 2023 18:18:29 -0700 Subject: [PATCH 1/3] Remove Rocketry --- pyproject.toml | 1 - snowscraper/cli.py | 8 ++------ snowscraper/scheduler.py | 6 ------ 3 files changed, 2 insertions(+), 13 deletions(-) delete mode 100644 snowscraper/scheduler.py diff --git a/pyproject.toml b/pyproject.toml index f7f3ff8..a56e149 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,6 @@ dependencies = [ "scrapy>=2.10.0", "feedparser>=6.0.10", "uvloop>=0.17.0", - "rocketry>=2.5.1", "fastapi>=0.101.0", "uvicorn>=0.22.0", ] diff --git a/snowscraper/cli.py b/snowscraper/cli.py index f07fc21..287dfbf 100644 --- a/snowscraper/cli.py +++ b/snowscraper/cli.py @@ -12,7 +12,6 @@ from .controller import import_scrapers from .controller import run_all app_fastapi = None -app_rocketry = None class SnowScraper(uvicorn.Server): @@ -25,16 +24,13 @@ async def start_server() -> None: print("Starting SnowScraper") server = SnowScraper(config=uvicorn.Config(app_fastapi, workers=1, loop="uvloop")) fastapi = asyncio.create_task(server.serve()) - rocket = asyncio.create_task(app_rocketry.serve()) - app_rocketry.task - await asyncio.wait([rocket, fastapi], return_when=asyncio.FIRST_COMPLETED) + await asyncio.wait([fastapi], return_when=asyncio.FIRST_COMPLETED) def main(args: argparse.Namespace) -> None: - global app_fastapi, app_rocketry + global app_fastapi - from .scheduler import app as app_rocketry from .webserver import app as app_fastapi import_scrapers() diff --git a/snowscraper/scheduler.py b/snowscraper/scheduler.py deleted file mode 100644 index fdf4c12..0000000 --- a/snowscraper/scheduler.py +++ /dev/null @@ -1,6 +0,0 @@ -from rocketry import Rocketry - -app = Rocketry(execution="async") - -if __name__ == "__main__": - app.run() From 16243cf8d04466e9d819e4da181cf7dda504572b Mon Sep 17 00:00:00 2001 From: hackish Date: Mon, 28 Aug 2023 18:26:06 -0700 Subject: [PATCH 2/3] Remove FastAPI --- pyproject.toml | 9 +-------- snowscraper/api.py | 3 --- snowscraper/cli.py | 32 -------------------------------- snowscraper/helpers.py | 21 ++------------------- snowscraper/scrapers/youtube.py | 20 -------------------- snowscraper/webserver.py | 11 ----------- 6 files changed, 3 insertions(+), 93 deletions(-) delete mode 100644 snowscraper/api.py delete mode 100644 snowscraper/scrapers/youtube.py delete mode 100644 snowscraper/webserver.py diff --git a/pyproject.toml b/pyproject.toml index a56e149..195f36c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,14 +9,7 @@ authors = [{ name = "pdf", email = "git@nixon.mozmail.com" }] description = "Snowflake scraper" requires-python = ">=3.9" license = { text = "MIT" } -dependencies = [ - "pydantic==1.10.10", - "scrapy>=2.10.0", - "feedparser>=6.0.10", - "uvloop>=0.17.0", - "fastapi>=0.101.0", - "uvicorn>=0.22.0", -] +dependencies = ["pydantic==1.10.10", "scrapy>=2.10.0", "feedparser>=6.0.10"] [project.scripts] snowscraper = "snowscraper.cli:run" diff --git a/snowscraper/api.py b/snowscraper/api.py deleted file mode 100644 index df2e285..0000000 --- a/snowscraper/api.py +++ /dev/null @@ -1,3 +0,0 @@ -from fastapi import APIRouter - -app = APIRouter() diff --git a/snowscraper/cli.py b/snowscraper/cli.py index 287dfbf..1ec5e11 100644 --- a/snowscraper/cli.py +++ b/snowscraper/cli.py @@ -1,48 +1,16 @@ import argparse -import asyncio import sys from datetime import datetime from datetime import timezone -from types import FrameType - -import uvicorn -import uvloop from .controller import import_scrapers from .controller import run_all -app_fastapi = None - - -class SnowScraper(uvicorn.Server): - def handle_exit(self, sig: int, frame: FrameType) -> None: - print("Shutting down SnowScraper") - return super().handle_exit(sig, frame) - - -async def start_server() -> None: - print("Starting SnowScraper") - server = SnowScraper(config=uvicorn.Config(app_fastapi, workers=1, loop="uvloop")) - fastapi = asyncio.create_task(server.serve()) - - await asyncio.wait([fastapi], return_when=asyncio.FIRST_COMPLETED) - def main(args: argparse.Namespace) -> None: - global app_fastapi - - from .webserver import app as app_fastapi - import_scrapers() run_all(args) - if sys.version_info >= (3, 11): - with asyncio.Runner(loop_factory=uvloop.new_event_loop) as runner: - runner.run(start_server()) - else: - uvloop.install() - asyncio.run(start_server()) - def run(): parser = argparse.ArgumentParser(description="Snowflake scraper") diff --git a/snowscraper/helpers.py b/snowscraper/helpers.py index a1311e4..dc7849d 100644 --- a/snowscraper/helpers.py +++ b/snowscraper/helpers.py @@ -2,24 +2,8 @@ from datetime import datetime def string_to_datetime(date_string): - """ - Convert a date string to a datetime object. - - This function supports the following formats: - - ISO 8601 with and without timezone offset: e.g., '2023-08-02T09:27:54-05:00' or '2023-08-07T18:32:05.537Z' - - RFC 1123: e.g., 'Mon, 07 Aug 2023 18:32:05 GMT' - - Args: - - date_string (str): The date string to convert - - Returns: - - datetime.datetime: The datetime representation of the provided date string. - - Raises: - - ValueError: If the provided date string doesn't match any of the supported formats. - """ try: - # First, try ISO 8601 format + # try ISO 8601 if "Z" in date_string: return datetime.fromisoformat(date_string.replace("Z", "+00:00")) return datetime.fromisoformat(date_string) @@ -27,10 +11,9 @@ def string_to_datetime(date_string): pass try: - # Then, try RFC 1123 format + # try RFC 1123 return datetime.strptime(date_string, "%a, %d %b %Y %H:%M:%S %Z") except ValueError: pass - # If neither format matches, raise an exception raise ValueError(f"Unsupported date format: {date_string}") diff --git a/snowscraper/scrapers/youtube.py b/snowscraper/scrapers/youtube.py deleted file mode 100644 index f6c18dc..0000000 --- a/snowscraper/scrapers/youtube.py +++ /dev/null @@ -1,20 +0,0 @@ -from datetime import datetime -from datetime import timezone - -from ..controller import register_scraper -from ..scraper import BaseScraper - - -@register_scraper -class YoutubeScraper(BaseScraper): - def __init__(self, after=datetime(1970, 1, 1, tzinfo=timezone.utc), *args, **kwargs): - super(YoutubeScraper, self).__init__(*args, **kwargs) - self.data = {} - self.after = after - - def scrape(self): - print("Scraping YouTube... Unimplemented") - self.data = {} - - def transform(self): - return self.data diff --git a/snowscraper/webserver.py b/snowscraper/webserver.py deleted file mode 100644 index 31a4426..0000000 --- a/snowscraper/webserver.py +++ /dev/null @@ -1,11 +0,0 @@ -from fastapi import FastAPI -from fastapi.middleware.cors import CORSMiddleware - -from .api import app as api_v1 - -app = FastAPI() -app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["GET"], allow_headers=["*"]) -app.include_router(api_v1, prefix="/api/v1") - -if __name__ == "__main__": - app.run() From 8ce31b70baa6f4944ca82bcd42331812440ea010 Mon Sep 17 00:00:00 2001 From: hackish Date: Mon, 28 Aug 2023 18:59:05 -0700 Subject: [PATCH 3/3] ready for github action? --- snowscraper/controller.py | 4 +++- snowscraper/scrapers/medium.py | 1 - snowscraper/scrapers/quickstarts.py | 13 ++++++++++--- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/snowscraper/controller.py b/snowscraper/controller.py index d750043..e6d77f2 100644 --- a/snowscraper/controller.py +++ b/snowscraper/controller.py @@ -12,9 +12,11 @@ def register_scraper(cls): def run_all(args: argparse.Namespace): + results = {} for scraper_cls in SCRAPERS.values(): scraper = scraper_cls(after=args.after) - scraper.scrape() + results |= scraper.scrape() + print(results) def import_scrapers(): diff --git a/snowscraper/scrapers/medium.py b/snowscraper/scrapers/medium.py index 3620789..3825801 100644 --- a/snowscraper/scrapers/medium.py +++ b/snowscraper/scrapers/medium.py @@ -27,7 +27,6 @@ class MediumScraper(BaseScraper): "published": string_to_datetime(entry["published"]), "updated": updated, } - # print(self.data) return self.data def transform(self): diff --git a/snowscraper/scrapers/quickstarts.py b/snowscraper/scrapers/quickstarts.py index 2ed37ad..4b3ff07 100644 --- a/snowscraper/scrapers/quickstarts.py +++ b/snowscraper/scrapers/quickstarts.py @@ -2,7 +2,9 @@ from datetime import datetime from datetime import timezone import scrapy +from scrapy import signals from scrapy.crawler import CrawlerProcess +from scrapy.signalmanager import dispatcher from ..controller import register_scraper from ..scraper import BaseScraper @@ -23,24 +25,29 @@ class QuickstartScraper(BaseScraper, scrapy.Spider): def start_requests(self): yield scrapy.Request(url=QuickStartsURL, callback=self.parse) + def signal_handler(self, signal, sender, item, response, spider): + self.data[item["key"]] = item + self.data[item["key"]].pop("key") + def scrape(self): print("Scraping Quickstarts") + dispatcher.connect(self.signal_handler, signal=signals.item_scraped) process = CrawlerProcess({"LOG_LEVEL": "ERROR"}) process.crawl(QuickstartScraper, after=self.after) process.start() + return self.data def parse(self, response): for card in response.css("card-sorter#cards > a.codelab-card"): updated = string_to_datetime(card.attrib["data-updated"]) if updated > self.after: key = QuickStartsURL.rstrip("/") + card.attrib["href"] - self.data[key] = { + yield { + "key": key, "title": card.attrib["data-title"], "updated": updated, "tags": card.attrib["data-tags"], } - # print(key, self.data[key]) - yield self.data[key] def transform(self): return self.data