Ready for GitHub Actions?

This commit is contained in:
hackish 2023-08-28 18:59:05 -07:00
parent 16243cf8d0
commit 8ce31b70ba
3 changed files with 13 additions and 5 deletions

View file

@ -12,9 +12,11 @@ def register_scraper(cls):
def run_all(args: argparse.Namespace):
    """Run every registered scraper once and print the merged results.

    Args:
        args: Parsed command-line arguments; ``args.after`` is forwarded to
            each scraper's constructor as ``after``.
    """
    results = {}
    for scraper_cls in SCRAPERS.values():
        scraper = scraper_cls(after=args.after)
        # Call scrape() exactly once per scraper: an extra bare call would
        # repeat the whole scrape and throw its result away.
        results |= scraper.scrape()
    print(results)
def import_scrapers():

View file

@ -27,7 +27,6 @@ class MediumScraper(BaseScraper):
"published": string_to_datetime(entry["published"]),
"updated": updated,
}
# print(self.data)
return self.data
def transform(self):

View file

@ -2,7 +2,9 @@ from datetime import datetime
from datetime import timezone
import scrapy
from scrapy import signals
from scrapy.crawler import CrawlerProcess
from scrapy.signalmanager import dispatcher
from ..controller import register_scraper
from ..scraper import BaseScraper
@ -23,24 +25,29 @@ class QuickstartScraper(BaseScraper, scrapy.Spider):
def start_requests(self):
    """Yield the single request for the quickstarts listing page.

    The response is handed to ``self.parse``.
    """
    listing_request = scrapy.Request(url=QuickStartsURL, callback=self.parse)
    yield listing_request
def signal_handler(self, signal, sender, item, response, spider):
    """Store a scraped item in ``self.data`` under its ``"key"`` field.

    The item object itself is stored (not a copy), and its ``"key"`` entry
    is removed afterwards, so the caller's item is mutated in place.
    Only ``item`` is used; the other arguments are accepted and ignored.
    """
    record_key = item["key"]
    self.data[record_key] = item
    # The key now lives in the mapping, so drop the duplicate from the item.
    item.pop("key")
def scrape(self):
    """Run the quickstarts crawl and return the collected items.

    Connects ``signal_handler`` to Scrapy's ``item_scraped`` signal, runs a
    ``QuickstartScraper`` crawl in a ``CrawlerProcess`` with the log level
    set to ERROR, and returns ``self.data`` after ``start()`` returns.
    """
    print("Scraping Quickstarts")
    # Connect before the crawl is started so the handler is in place first.
    dispatcher.connect(self.signal_handler, signal=signals.item_scraped)
    crawl_settings = {"LOG_LEVEL": "ERROR"}
    crawler_process = CrawlerProcess(crawl_settings)
    crawler_process.crawl(QuickstartScraper, after=self.after)
    crawler_process.start()
    return self.data
def parse(self, response):
    """Yield one item per quickstart card updated after ``self.after``.

    Each item carries its own ``"key"`` (``QuickStartsURL`` without a
    trailing slash, followed by the card's ``href``) alongside the title,
    update time and tags. ``signal_handler`` reads that field to file the
    item into ``self.data``, so nothing is written to ``self.data`` here.

    Args:
        response: Response for the quickstarts listing page.

    Yields:
        dict with the keys ``"key"``, ``"title"``, ``"updated"`` and
        ``"tags"``.
    """
    for card in response.css("card-sorter#cards > a.codelab-card"):
        updated = string_to_datetime(card.attrib["data-updated"])
        if updated > self.after:
            # Yield exactly once per card; the item is collected through
            # the item_scraped signal rather than stored directly.
            yield {
                "key": QuickStartsURL.rstrip("/") + card.attrib["href"],
                "title": card.attrib["data-title"],
                "updated": updated,
                "tags": card.attrib["data-tags"],
            }
def transform(self):
    """Return the collected records unchanged; no transformation is applied."""
    collected = self.data
    return collected