From 8ce31b70baa6f4944ca82bcd42331812440ea010 Mon Sep 17 00:00:00 2001
From: hackish
Date: Mon, 28 Aug 2023 18:59:05 -0700
Subject: [PATCH] ready for github action?

---
 snowscraper/controller.py           |  4 +++-
 snowscraper/scrapers/medium.py      |  1 -
 snowscraper/scrapers/quickstarts.py | 13 ++++++++++---
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/snowscraper/controller.py b/snowscraper/controller.py
index d750043..e6d77f2 100644
--- a/snowscraper/controller.py
+++ b/snowscraper/controller.py
@@ -12,9 +12,11 @@ def register_scraper(cls):
 
 
 def run_all(args: argparse.Namespace):
+    results = {}
     for scraper_cls in SCRAPERS.values():
         scraper = scraper_cls(after=args.after)
-        scraper.scrape()
+        results |= scraper.scrape()
+        print(results)
 
 
 def import_scrapers():
diff --git a/snowscraper/scrapers/medium.py b/snowscraper/scrapers/medium.py
index 3620789..3825801 100644
--- a/snowscraper/scrapers/medium.py
+++ b/snowscraper/scrapers/medium.py
@@ -27,7 +27,6 @@ class MediumScraper(BaseScraper):
                 "published": string_to_datetime(entry["published"]),
                 "updated": updated,
             }
-        # print(self.data)
         return self.data
 
     def transform(self):
diff --git a/snowscraper/scrapers/quickstarts.py b/snowscraper/scrapers/quickstarts.py
index 2ed37ad..4b3ff07 100644
--- a/snowscraper/scrapers/quickstarts.py
+++ b/snowscraper/scrapers/quickstarts.py
@@ -2,7 +2,9 @@ from datetime import datetime
 from datetime import timezone
 
 import scrapy
+from scrapy import signals
 from scrapy.crawler import CrawlerProcess
+from scrapy.signalmanager import dispatcher
 
 from ..controller import register_scraper
 from ..scraper import BaseScraper
@@ -23,24 +25,29 @@ class QuickstartScraper(BaseScraper, scrapy.Spider):
     def start_requests(self):
         yield scrapy.Request(url=QuickStartsURL, callback=self.parse)
 
+    def signal_handler(self, signal, sender, item, response, spider):
+        self.data[item["key"]] = item
+        self.data[item["key"]].pop("key")
+
     def scrape(self):
         print("Scraping Quickstarts")
+        dispatcher.connect(self.signal_handler, signal=signals.item_scraped)
         process = CrawlerProcess({"LOG_LEVEL": "ERROR"})
         process.crawl(QuickstartScraper, after=self.after)
         process.start()
+        return self.data
 
     def parse(self, response):
         for card in response.css("card-sorter#cards > a.codelab-card"):
             updated = string_to_datetime(card.attrib["data-updated"])
             if updated > self.after:
                 key = QuickStartsURL.rstrip("/") + card.attrib["href"]
-                self.data[key] = {
+                yield {
+                    "key": key,
                     "title": card.attrib["data-title"],
                     "updated": updated,
                     "tags": card.attrib["data-tags"],
                 }
-                # print(key, self.data[key])
-                yield self.data[key]
 
     def transform(self):
         return self.data
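
The quickstarts hunk works around a scrapy constraint: parse() yields items
and CrawlerProcess.start() blocks until the crawl ends, so scrape() cannot
receive items as a return value. The patch bridges the gap by connecting a
handler to the item_scraped signal and folding each item into self.data.
Below is a minimal, self-contained sketch of that same pattern; DemoSpider,
collect_items, and the example.com URL are hypothetical stand-ins, while the
scrapy calls (signals.item_scraped, dispatcher.connect, CrawlerProcess) are
the ones the patch uses.

    import scrapy
    from scrapy import signals
    from scrapy.crawler import CrawlerProcess
    from scrapy.signalmanager import dispatcher


    class DemoSpider(scrapy.Spider):
        name = "demo"
        start_urls = ["https://example.com"]

        def parse(self, response):
            # Spiders yield items; they cannot hand them back to the caller
            # of process.start(), which is why the signal hook below exists.
            yield {
                "key": response.url,
                "title": response.css("title::text").get(),
            }


    def collect_items():
        collected = {}

        def on_item_scraped(item, response, spider):
            # Mirror the patch's signal_handler: index by "key" and drop
            # that field from the stored value.
            entry = dict(item)
            collected[entry.pop("key")] = entry

        dispatcher.connect(on_item_scraped, signal=signals.item_scraped)
        process = CrawlerProcess({"LOG_LEVEL": "ERROR"})
        process.crawl(DemoSpider)
        process.start()  # blocks until the crawl finishes
        return collected


    if __name__ == "__main__":
        print(collect_items())

The handler receives only the signal arguments it names; pydispatch filters
the rest, which is why the patch's signal_handler can list signal and sender
explicitly while this sketch omits them.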