From 652b4b6acfad2aebae45687d636ce962cde8679c Mon Sep 17 00:00:00 2001
From: hackish
Date: Mon, 14 Aug 2023 15:55:09 -0700
Subject: [PATCH] Add tag filter for medium

---
 snowscraper/scrapers/medium.py      | 8 +++++---
 snowscraper/scrapers/quickstarts.py | 7 +++----
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/snowscraper/scrapers/medium.py b/snowscraper/scrapers/medium.py
index d951dad..b10f954 100644
--- a/snowscraper/scrapers/medium.py
+++ b/snowscraper/scrapers/medium.py
@@ -12,14 +12,16 @@ from ..scraper import BaseScraper
 class MediumScraper(BaseScraper):
     url = "https://medium.com/feed/snowflake"
 
-    def __init__(self, after=datetime(1970, 1, 1, tzinfo=timezone.utc), *args, **kwargs):
+    def __init__(self, after, *args, **kwargs):
         super(MediumScraper, self).__init__(*args, **kwargs)
         self.data = {}
-        self.after = after
+        self.after = after or datetime(1970, 1, 1, tzinfo=timezone.utc)
 
     def scrape(self):
         print("Scraping Medium")
         for entry in feedparser.parse(MediumScraper.url)["entries"]:
+            if not any(tag["term"] == "snowflake" for tag in entry["tags"]):
+                continue
             updated = string_to_datetime(entry["updated"])
             if updated > self.after:
                 self.data[entry["link"]] = {
@@ -27,7 +29,7 @@ class MediumScraper(BaseScraper):
                     "published": string_to_datetime(entry["published"]),
                     "updated": updated,
                 }
-        print(self.data)
+        # print(self.data)
         return self.data
 
     def transform(self):
diff --git a/snowscraper/scrapers/quickstarts.py b/snowscraper/scrapers/quickstarts.py
index 79a61c5..2ed37ad 100644
--- a/snowscraper/scrapers/quickstarts.py
+++ b/snowscraper/scrapers/quickstarts.py
@@ -15,10 +15,10 @@ QuickStartsURL = "https://quickstarts.snowflake.com/"
 class QuickstartScraper(BaseScraper, scrapy.Spider):
     name = "snowflakespider"
 
-    def __init__(self, after=datetime(1970, 1, 1, tzinfo=timezone.utc), *args, **kwargs):
+    def __init__(self, after, *args, **kwargs):
         super(QuickstartScraper, self).__init__(*args, **kwargs)
         self.data = {}
-        self.after = after
+        self.after = after or datetime(1970, 1, 1, tzinfo=timezone.utc)
 
     def start_requests(self):
         yield scrapy.Request(url=QuickStartsURL, callback=self.parse)
@@ -33,14 +33,13 @@ class QuickstartScraper(BaseScraper, scrapy.Spider):
         for card in response.css("card-sorter#cards > a.codelab-card"):
            updated = string_to_datetime(card.attrib["data-updated"])
             if updated > self.after:
-                print(f"Updated: {updated} > {self.after}")
                 key = QuickStartsURL.rstrip("/") + card.attrib["href"]
                 self.data[key] = {
                     "title": card.attrib["data-title"],
                     "updated": updated,
                     "tags": card.attrib["data-tags"],
                 }
-                print(key, self.data[key])
+                # print(key, self.data[key])
                 yield self.data[key]
 
     def transform(self):
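
A minimal standalone sketch of the tag filter the medium.py hunk adds, runnable
outside the scraper. It assumes only the feedparser package; the file name and
helper are hypothetical, not part of the patch. Unlike the patch, it reads tags
with entry.get("tags", []), so feed items that carry no "tags" key are skipped
rather than raising KeyError:

    # sketch_tag_filter.py -- hypothetical standalone demo, not part of the patch
    import feedparser

    FEED_URL = "https://medium.com/feed/snowflake"

    def snowflake_links(url=FEED_URL):
        """Yield links of feed entries tagged 'snowflake'."""
        for entry in feedparser.parse(url)["entries"]:
            tags = entry.get("tags", [])  # some entries omit tags entirely
            if any(tag["term"] == "snowflake" for tag in tags):
                yield entry["link"]

    if __name__ == "__main__":
        for link in snowflake_links():
            print(link)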
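
Both constructors now require an "after" argument but accept None, falling back
to the Unix epoch. A hypothetical call-site sketch (the patch does not touch any
callers, and this assumes BaseScraper.__init__ needs no further arguments):

    # hypothetical call sites -- not part of the patch
    from datetime import datetime, timezone

    from snowscraper.scrapers.medium import MediumScraper

    everything = MediumScraper(after=None)  # falls back to 1970-01-01 UTC
    recent = MediumScraper(after=datetime(2023, 8, 14, tzinfo=timezone.utc))
    # "after" should stay timezone-aware: scrape() compares it against the
    # tz-aware datetimes presumably returned by string_to_datetime.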