Add tag filter for Medium
This commit is contained in:
parent
c09e02c441
commit
652b4b6acf
2 changed files with 8 additions and 7 deletions
|
@ -12,14 +12,16 @@ from ..scraper import BaseScraper
|
|||
class MediumScraper(BaseScraper):
|
||||
url = "https://medium.com/feed/snowflake"
|
||||
|
||||
def __init__(self, after=datetime(1970, 1, 1, tzinfo=timezone.utc), *args, **kwargs):
|
||||
def __init__(self, after, *args, **kwargs):
|
||||
super(MediumScraper, self).__init__(*args, **kwargs)
|
||||
self.data = {}
|
||||
self.after = after
|
||||
self.after = after or datetime(1970, 1, 1, tzinfo=timezone.utc)
|
||||
|
||||
def scrape(self):
|
||||
print("Scraping Medium")
|
||||
for entry in feedparser.parse(MediumScraper.url)["entries"]:
|
||||
if not any(tag["term"] == "snowflake" for tag in entry["tags"]):
|
||||
continue
|
||||
updated = string_to_datetime(entry["updated"])
|
||||
if updated > self.after:
|
||||
self.data[entry["link"]] = {
|
||||
|
@ -27,7 +29,7 @@ class MediumScraper(BaseScraper):
|
|||
"published": string_to_datetime(entry["published"]),
|
||||
"updated": updated,
|
||||
}
|
||||
print(self.data)
|
||||
# print(self.data)
|
||||
return self.data
|
||||
|
||||
def transform(self):
|
||||
|
|
|
@ -15,10 +15,10 @@ QuickStartsURL = "https://quickstarts.snowflake.com/"
|
|||
class QuickstartScraper(BaseScraper, scrapy.Spider):
|
||||
name = "snowflakespider"
|
||||
|
||||
def __init__(self, after=datetime(1970, 1, 1, tzinfo=timezone.utc), *args, **kwargs):
|
||||
def __init__(self, after, *args, **kwargs):
|
||||
super(QuickstartScraper, self).__init__(*args, **kwargs)
|
||||
self.data = {}
|
||||
self.after = after
|
||||
self.after = after or datetime(1970, 1, 1, tzinfo=timezone.utc)
|
||||
|
||||
def start_requests(self):
|
||||
yield scrapy.Request(url=QuickStartsURL, callback=self.parse)
|
||||
|
@ -33,14 +33,13 @@ class QuickstartScraper(BaseScraper, scrapy.Spider):
|
|||
for card in response.css("card-sorter#cards > a.codelab-card"):
|
||||
updated = string_to_datetime(card.attrib["data-updated"])
|
||||
if updated > self.after:
|
||||
print(f"Updated: {updated} > {self.after}")
|
||||
key = QuickStartsURL.rstrip("/") + card.attrib["href"]
|
||||
self.data[key] = {
|
||||
"title": card.attrib["data-title"],
|
||||
"updated": updated,
|
||||
"tags": card.attrib["data-tags"],
|
||||
}
|
||||
print(key, self.data[key])
|
||||
# print(key, self.data[key])
|
||||
yield self.data[key]
|
||||
|
||||
def transform(self):
|
||||
|
|
Loading…
Reference in a new issue