Add tag filter for medium

This commit is contained in:
hackish 2023-08-14 15:55:09 -07:00
parent c09e02c441
commit 652b4b6acf
2 changed files with 8 additions and 7 deletions

View file

@@ -12,14 +12,16 @@ from ..scraper import BaseScraper
class MediumScraper(BaseScraper): class MediumScraper(BaseScraper):
url = "https://medium.com/feed/snowflake" url = "https://medium.com/feed/snowflake"
def __init__(self, after=datetime(1970, 1, 1, tzinfo=timezone.utc), *args, **kwargs): def __init__(self, after, *args, **kwargs):
super(MediumScraper, self).__init__(*args, **kwargs) super(MediumScraper, self).__init__(*args, **kwargs)
self.data = {} self.data = {}
self.after = after self.after = after or datetime(1970, 1, 1, tzinfo=timezone.utc)
def scrape(self): def scrape(self):
print("Scraping Medium") print("Scraping Medium")
for entry in feedparser.parse(MediumScraper.url)["entries"]: for entry in feedparser.parse(MediumScraper.url)["entries"]:
if not any(tag["term"] == "snowflake" for tag in entry["tags"]):
continue
updated = string_to_datetime(entry["updated"]) updated = string_to_datetime(entry["updated"])
if updated > self.after: if updated > self.after:
self.data[entry["link"]] = { self.data[entry["link"]] = {
@@ -27,7 +29,7 @@ class MediumScraper(BaseScraper):
"published": string_to_datetime(entry["published"]), "published": string_to_datetime(entry["published"]),
"updated": updated, "updated": updated,
} }
print(self.data) # print(self.data)
return self.data return self.data
def transform(self): def transform(self):

View file

@@ -15,10 +15,10 @@ QuickStartsURL = "https://quickstarts.snowflake.com/"
class QuickstartScraper(BaseScraper, scrapy.Spider): class QuickstartScraper(BaseScraper, scrapy.Spider):
name = "snowflakespider" name = "snowflakespider"
def __init__(self, after=datetime(1970, 1, 1, tzinfo=timezone.utc), *args, **kwargs): def __init__(self, after, *args, **kwargs):
super(QuickstartScraper, self).__init__(*args, **kwargs) super(QuickstartScraper, self).__init__(*args, **kwargs)
self.data = {} self.data = {}
self.after = after self.after = after or datetime(1970, 1, 1, tzinfo=timezone.utc)
def start_requests(self): def start_requests(self):
yield scrapy.Request(url=QuickStartsURL, callback=self.parse) yield scrapy.Request(url=QuickStartsURL, callback=self.parse)
@@ -33,14 +33,13 @@ class QuickstartScraper(BaseScraper, scrapy.Spider):
for card in response.css("card-sorter#cards > a.codelab-card"): for card in response.css("card-sorter#cards > a.codelab-card"):
updated = string_to_datetime(card.attrib["data-updated"]) updated = string_to_datetime(card.attrib["data-updated"])
if updated > self.after: if updated > self.after:
print(f"Updated: {updated} > {self.after}")
key = QuickStartsURL.rstrip("/") + card.attrib["href"] key = QuickStartsURL.rstrip("/") + card.attrib["href"]
self.data[key] = { self.data[key] = {
"title": card.attrib["data-title"], "title": card.attrib["data-title"],
"updated": updated, "updated": updated,
"tags": card.attrib["data-tags"], "tags": card.attrib["data-tags"],
} }
print(key, self.data[key]) # print(key, self.data[key])
yield self.data[key] yield self.data[key]
def transform(self): def transform(self):