This commit is contained in:
hackish 2023-08-14 15:30:24 -07:00
commit c09e02c441
15 changed files with 500 additions and 0 deletions

View file

@ -0,0 +1,34 @@
from datetime import datetime
from datetime import timezone
import feedparser
from ..controller import register_scraper
from ..helpers import string_to_datetime
from ..scraper import BaseScraper
@register_scraper
class MediumScraper(BaseScraper):
    """Scrape the Snowflake Medium RSS feed for recently-updated posts."""

    url = "https://medium.com/feed/snowflake"

    def __init__(self, after=datetime(1970, 1, 1, tzinfo=timezone.utc), *args, **kwargs):
        # Entries whose "updated" timestamp is at or before `after` are skipped;
        # the default (Unix epoch, UTC) keeps everything.
        super().__init__(*args, **kwargs)
        self.after = after
        self.data = {}

    def scrape(self):
        """Fetch the feed and record entries newer than ``self.after``, keyed by link."""
        print("Scraping Medium")
        feed = feedparser.parse(MediumScraper.url)
        fresh = {
            entry["link"]: {
                "title": entry["title"],
                "published": string_to_datetime(entry["published"]),
                "updated": string_to_datetime(entry["updated"]),
            }
            for entry in feed["entries"]
            if string_to_datetime(entry["updated"]) > self.after
        }
        self.data.update(fresh)
        print(self.data)
        return self.data

    def transform(self):
        """Return the scraped entries unchanged."""
        return self.data

View file

@ -0,0 +1,47 @@
from datetime import datetime
from datetime import timezone
import scrapy
from scrapy.crawler import CrawlerProcess
from ..controller import register_scraper
from ..scraper import BaseScraper
from snowscraper.helpers import string_to_datetime
QuickStartsURL = "https://quickstarts.snowflake.com/"
@register_scraper
class QuickstartScraper(BaseScraper, scrapy.Spider):
    """Scrapy spider that collects quickstart cards from the Snowflake Quickstarts index."""

    # Spider name required by scrapy.
    name = "snowflakespider"

    def __init__(self, after=datetime(1970, 1, 1, tzinfo=timezone.utc), *args, **kwargs):
        # `after`: cutoff datetime — only cards updated strictly after it are kept.
        # Defaults to the Unix epoch (UTC), i.e. keep everything.
        super(QuickstartScraper, self).__init__(*args, **kwargs)
        self.data = {}
        self.after = after

    def start_requests(self):
        # Scrapy entry point: request the quickstarts index page; results go to parse().
        yield scrapy.Request(url=QuickStartsURL, callback=self.parse)

    def scrape(self):
        # Runs a blocking crawl of this spider class.
        # NOTE(review): process.crawl() instantiates a *new* QuickstartScraper, so
        # the data collected in parse() lands on that crawler-owned instance, not
        # on `self` — this instance's self.data appears to stay empty after
        # scrape() returns (and nothing is returned here, unlike the other
        # scrapers' scrape()). Verify intent against the caller.
        print("Scraping Quickstarts")
        process = CrawlerProcess({"LOG_LEVEL": "ERROR"})
        process.crawl(QuickstartScraper, after=self.after)
        process.start()

    def parse(self, response):
        # Each quickstart is an <a class="codelab-card"> under card-sorter#cards,
        # carrying its metadata in data-* attributes.
        for card in response.css("card-sorter#cards > a.codelab-card"):
            updated = string_to_datetime(card.attrib["data-updated"])
            if updated > self.after:
                print(f"Updated: {updated} > {self.after}")
                # Absolute URL of the quickstart doubles as the dict key.
                key = QuickStartsURL.rstrip("/") + card.attrib["href"]
                self.data[key] = {
                    "title": card.attrib["data-title"],
                    "updated": updated,
                    "tags": card.attrib["data-tags"],
                }
                print(key, self.data[key])
                yield self.data[key]

    def transform(self):
        # Pass-through: return whatever parse() accumulated on this instance.
        return self.data

View file

@ -0,0 +1,20 @@
from datetime import datetime
from datetime import timezone
from ..controller import register_scraper
from ..scraper import BaseScraper
@register_scraper
class YoutubeScraper(BaseScraper):
    """Placeholder scraper for YouTube; actual scraping is not implemented yet."""

    def __init__(self, after=datetime(1970, 1, 1, tzinfo=timezone.utc), *args, **kwargs):
        # `after` is accepted for interface parity with the other scrapers,
        # even though the stub never filters by it.
        super().__init__(*args, **kwargs)
        self.after = after
        self.data = {}

    def scrape(self):
        """Stub: print a notice and reset the data dict to empty."""
        print("Scraping YouTube... Unimplemented")
        self.data = {}

    def transform(self):
        """Return the (currently always empty) scraped data."""
        return self.data