Add tag filter for medium
parent c09e02c441
commit 652b4b6acf
2 changed files with 8 additions and 7 deletions
@@ -12,14 +12,16 @@ from ..scraper import BaseScraper
 class MediumScraper(BaseScraper):
     url = "https://medium.com/feed/snowflake"
 
-    def __init__(self, after=datetime(1970, 1, 1, tzinfo=timezone.utc), *args, **kwargs):
+    def __init__(self, after, *args, **kwargs):
         super(MediumScraper, self).__init__(*args, **kwargs)
         self.data = {}
-        self.after = after
+        self.after = after or datetime(1970, 1, 1, tzinfo=timezone.utc)
 
     def scrape(self):
         print("Scraping Medium")
         for entry in feedparser.parse(MediumScraper.url)["entries"]:
+            if not any(tag["term"] == "snowflake" for tag in entry["tags"]):
+                continue
             updated = string_to_datetime(entry["updated"])
             if updated > self.after:
                 self.data[entry["link"]] = {
@@ -27,7 +29,7 @@ class MediumScraper(BaseScraper):
                     "published": string_to_datetime(entry["published"]),
                     "updated": updated,
                 }
-        print(self.data)
+        # print(self.data)
         return self.data
 
     def transform(self):
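For context, the new filter leans on feedparser's representation of RSS categories: each entry carries a "tags" list of dicts whose "term" field holds the tag name. A minimal standalone sketch of the same check (the .get("tags", []) fallback for untagged entries is a defensive addition here; the diff indexes entry["tags"] directly):

import feedparser

FEED_URL = "https://medium.com/feed/snowflake"

for entry in feedparser.parse(FEED_URL)["entries"]:
    # entry["tags"] looks like [{"term": "snowflake", ...}, ...];
    # skip any post that is not tagged "snowflake"
    if not any(tag["term"] == "snowflake" for tag in entry.get("tags", [])):
        continue
    print(entry["link"])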
@@ -15,10 +15,10 @@ QuickStartsURL = "https://quickstarts.snowflake.com/"
 class QuickstartScraper(BaseScraper, scrapy.Spider):
     name = "snowflakespider"
 
-    def __init__(self, after=datetime(1970, 1, 1, tzinfo=timezone.utc), *args, **kwargs):
+    def __init__(self, after, *args, **kwargs):
         super(QuickstartScraper, self).__init__(*args, **kwargs)
         self.data = {}
-        self.after = after
+        self.after = after or datetime(1970, 1, 1, tzinfo=timezone.utc)
 
     def start_requests(self):
         yield scrapy.Request(url=QuickStartsURL, callback=self.parse)
@@ -33,14 +33,13 @@ class QuickstartScraper(BaseScraper, scrapy.Spider):
         for card in response.css("card-sorter#cards > a.codelab-card"):
             updated = string_to_datetime(card.attrib["data-updated"])
             if updated > self.after:
-                print(f"Updated: {updated} > {self.after}")
                 key = QuickStartsURL.rstrip("/") + card.attrib["href"]
                 self.data[key] = {
                     "title": card.attrib["data-title"],
                     "updated": updated,
                     "tags": card.attrib["data-tags"],
                 }
-                print(key, self.data[key])
+                # print(key, self.data[key])
                 yield self.data[key]
 
     def transform(self):
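Both constructors now apply the same pattern: the epoch default moves from the signature into the body, so an explicit after=None (e.g. from an optional CLI argument that was never set) falls back to the epoch instead of leaking through and breaking the updated > self.after comparison. A small illustrative sketch of the difference, not part of the commit:

from datetime import datetime, timezone

EPOCH = datetime(1970, 1, 1, tzinfo=timezone.utc)

def old_init(after=EPOCH):
    return after           # after=None slips through as None

def new_init(after):
    return after or EPOCH  # None (or any falsy value) falls back to the epoch

assert old_init(after=None) is None   # would make `updated > self.after` raise
assert new_init(after=None) == EPOCH  # comparison stays well-defined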