ready for github action?

hackish 2023-08-28 18:59:05 -07:00
parent 16243cf8d0
commit 8ce31b70ba
3 changed files with 13 additions and 5 deletions


@@ -12,9 +12,11 @@ def register_scraper(cls):
 def run_all(args: argparse.Namespace):
+    results = {}
     for scraper_cls in SCRAPERS.values():
         scraper = scraper_cls(after=args.after)
-        scraper.scrape()
+        results |= scraper.scrape()
+    print(results)
 
 
 def import_scrapers():
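
Note on the run_all change: dict |= dict merges the right-hand mapping
into the left in place (Python 3.9+), so run_all now expects every
scrape() to return a dict instead of only mutating internal state. A
minimal sketch of the pattern, with placeholder data standing in for
real scraper output:

    # Each scrape() is assumed to return a dict keyed by item URL;
    # |= merges it into the shared results, later keys overwriting
    # earlier ones on collision.
    results = {}
    results |= {"https://example.com/a": {"title": "A"}}
    results |= {"https://example.com/b": {"title": "B"}}
    print(results)  # both entries, keyed by URL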


@@ -27,7 +27,6 @@ class MediumScraper(BaseScraper):
                 "published": string_to_datetime(entry["published"]),
                 "updated": updated,
             }
-        # print(self.data)
         return self.data
 
     def transform(self):


@@ -2,7 +2,9 @@ from datetime import datetime
 from datetime import timezone
 
 import scrapy
+from scrapy import signals
 from scrapy.crawler import CrawlerProcess
+from scrapy.signalmanager import dispatcher
 
 from ..controller import register_scraper
 from ..scraper import BaseScraper
@@ -23,24 +25,29 @@ class QuickstartScraper(BaseScraper, scrapy.Spider):
     def start_requests(self):
         yield scrapy.Request(url=QuickStartsURL, callback=self.parse)
 
+    def signal_handler(self, signal, sender, item, response, spider):
+        self.data[item["key"]] = item
+        self.data[item["key"]].pop("key")
+
     def scrape(self):
         print("Scraping Quickstarts")
+        dispatcher.connect(self.signal_handler, signal=signals.item_scraped)
         process = CrawlerProcess({"LOG_LEVEL": "ERROR"})
         process.crawl(QuickstartScraper, after=self.after)
         process.start()
+        return self.data
 
     def parse(self, response):
         for card in response.css("card-sorter#cards > a.codelab-card"):
             updated = string_to_datetime(card.attrib["data-updated"])
             if updated > self.after:
                 key = QuickStartsURL.rstrip("/") + card.attrib["href"]
-                self.data[key] = {
+                yield {
+                    "key": key,
                     "title": card.attrib["data-title"],
                     "updated": updated,
                     "tags": card.attrib["data-tags"],
                 }
-                # print(key, self.data[key])
-                yield self.data[key]
 
     def transform(self):
         return self.data
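
Note on the Quickstarts change: items a Scrapy spider yields from
parse() are not handed back to whoever calls process.start(), so
scrape() would otherwise have nothing to return. Connecting a handler
to the item_scraped signal is one way to collect them as they pass
through the pipeline, which is what the diff does with self.data. A
minimal self-contained sketch of that pattern (the spider name, URL,
and fields here are hypothetical, not the repo's):

    import scrapy
    from scrapy import signals
    from scrapy.crawler import CrawlerProcess
    from scrapy.signalmanager import dispatcher

    collected = []

    def on_item_scraped(item, response, spider):
        # Called once for every item any spider yields.
        collected.append(item)

    class DemoSpider(scrapy.Spider):
        name = "demo"
        start_urls = ["https://example.com/"]

        def parse(self, response):
            yield {"url": response.url,
                   "title": response.css("title::text").get()}

    dispatcher.connect(on_item_scraped, signal=signals.item_scraped)
    process = CrawlerProcess({"LOG_LEVEL": "ERROR"})
    process.crawl(DemoSpider)
    process.start()  # blocks until the crawl finishes
    print(collected)

In the diff, the yielded dicts carry a "key" field only so that
signal_handler can index self.data by it; the handler pops the field
back out before storing the item.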