initial commit
commit 47f68794dc
8 changed files with 214 additions and 0 deletions
24  pyproject.toml  Normal file
@@ -0,0 +1,24 @@
[build-system]
requires = ["setuptools>=67.8"]
build-backend = "setuptools.build_meta"

[project]
name = "snowscraper"
version = "0.0.1"
authors = [{ name = "pdf", email = "git@nixon.mozmail.com" }]
description = "Snowflake scraper"
requires-python = ">=3.9"
license = { text = "MIT" }
dependencies = ["pydantic==1.10.10", "scrapy>=2.10.0", "feedparser>=6.0.10"]

[project.scripts]
snowscraper = "snowscraper.cli:run"

[tool.setuptools]
py-modules = ["snowscraper"]

[tool.bandit]
exclude_dirs = ["/doc", "/build"]

[tool.black]
line-length = 120
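The [project.scripts] table maps the snowscraper console command to snowscraper.cli:run, so once the package is installed (for example with pip install -e ., an assumption, not shown in this commit) the scraper can be launched from the shell or, equivalently, from Python:

# Equivalent of the installed snowscraper console script; assumes the
# package is importable (e.g. installed in editable mode).
from snowscraper.cli import run

run()  # parses sys.argv, e.g. snowscraper --after 01-01-2024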
0  snowscraper/__init__.py  Normal file
37  snowscraper/cli.py  Normal file
@@ -0,0 +1,37 @@
import argparse
import sys
from datetime import datetime
from datetime import timezone

from .controller import import_scrapers
from .controller import run_all


def main(args: argparse.Namespace) -> None:
    import_scrapers()
    run_all(args)


def run():
    parser = argparse.ArgumentParser(description="Snowflake scraper")
    parser.add_argument(
        "--after",
        type=str,
        required=False,
        default=None,
        help="Scrape only after a specific date in the format 'MM-DD-YYYY' in UTC",
    )

    args = parser.parse_args()

    if args.after:
        try:
            args.after = datetime.strptime(args.after, "%m-%d-%Y").replace(tzinfo=timezone.utc)
        except ValueError:
            print(f"Error: The 'after' argument should be in the format MM-DD-YYYY. You provided: {args.after}")
            sys.exit(1)
    main(args)


if __name__ == "__main__":
    run()
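run() accepts the cutoff as an MM-DD-YYYY string and converts it into a timezone-aware UTC datetime before calling main(), which imports and runs every registered scraper. A minimal sketch of that conversion (the date is illustrative):

# The same conversion run() applies to --after: MM-DD-YYYY becomes an
# aware UTC datetime used as the scrape cutoff.
from datetime import datetime, timezone

after = datetime.strptime("06-01-2023", "%m-%d-%Y").replace(tzinfo=timezone.utc)
print(after)  # 2023-06-01 00:00:00+00:00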
25  snowscraper/controller.py  Normal file
@@ -0,0 +1,25 @@
import argparse
import importlib
import pkgutil
from pathlib import Path

SCRAPERS = {}


def register_scraper(cls):
    SCRAPERS[cls.__name__] = cls
    return cls


def run_all(args: argparse.Namespace):
    results = {}
    for scraper_cls in SCRAPERS.values():
        scraper = scraper_cls(after=args.after)
        results |= scraper.scrape()
    print(results)


def import_scrapers():
    directory = Path(__file__).resolve().parent / "scrapers"
    for module_loader, name, ispkg in pkgutil.iter_modules([directory]):
        importlib.import_module(f".scrapers.{name}", __package__)
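register_scraper is a class decorator that records each scraper class in the module-level SCRAPERS dict, and import_scrapers imports every module under snowscraper/scrapers so those decorators run. run_all then instantiates each registered class with the after cutoff and merges the dicts returned by scrape(). A minimal sketch with a hypothetical scraper:

# Hypothetical scraper registered the same way the bundled ones are;
# run_all expects the constructor to accept after= and scrape() to return a dict.
from snowscraper.controller import SCRAPERS, register_scraper


@register_scraper
class DummyScraper:
    def __init__(self, after=None):
        self.after = after

    def scrape(self):
        return {"https://example.com/post": {"title": "example"}}


print(SCRAPERS)  # {'DummyScraper': <class '...DummyScraper'>}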
19  snowscraper/helpers.py  Normal file
@@ -0,0 +1,19 @@
from datetime import datetime


def string_to_datetime(date_string):
    try:
        # try ISO 8601
        if "Z" in date_string:
            return datetime.fromisoformat(date_string.replace("Z", "+00:00"))
        return datetime.fromisoformat(date_string)
    except ValueError:
        pass

    try:
        # try RFC 1123
        return datetime.strptime(date_string, "%a, %d %b %Y %H:%M:%S %Z")
    except ValueError:
        pass

    raise ValueError(f"Unsupported date format: {date_string}")
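string_to_datetime tries ISO 8601 first, normalizing a trailing Z to +00:00, then falls back to RFC 1123, and raises ValueError for anything else. Note that the ISO branch can return an aware datetime while the RFC 1123 branch returns a naive one, because %Z does not attach a tzinfo. A short illustration (dates are examples):

# Both formats accepted by string_to_datetime.
from snowscraper.helpers import string_to_datetime

print(string_to_datetime("2023-06-01T12:00:00Z"))           # 2023-06-01 12:00:00+00:00 (aware)
print(string_to_datetime("Thu, 01 Jun 2023 12:00:00 GMT"))  # 2023-06-01 12:00:00 (naive)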
23  snowscraper/scraper.py  Normal file
@@ -0,0 +1,23 @@
from abc import ABC
from abc import abstractmethod


class BaseScraper(ABC):
    def run(self, validate=True):
        self.scraped_json = self.scrape()
        self.transformed_json = self.transform()
        if validate:
            self.validate()
        return self.transformed_json

    def validate(self):
        if not self.transformed_json:
            return self.transformed_json

    @abstractmethod
    def scrape(self):
        pass

    @abstractmethod
    def transform(self):
        pass
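Concrete scrapers subclass BaseScraper and implement scrape() and transform(); run_all calls scrape() directly and merges the returned dicts. A minimal sketch of a conforming scraper (the class and data here are illustrative, not part of the commit):

# Hypothetical minimal scraper satisfying the BaseScraper interface.
from snowscraper.scraper import BaseScraper


class StaticScraper(BaseScraper):
    def __init__(self, after=None):
        self.after = after
        self.data = {"https://example.com/a": {"title": "A"}}

    def scrape(self):
        return self.data

    def transform(self):
        return self.data


print(StaticScraper().scrape())  # {'https://example.com/a': {'title': 'A'}}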
33  snowscraper/scrapers/medium.py  Normal file
@@ -0,0 +1,33 @@
from datetime import datetime
from datetime import timezone

import feedparser

from ..controller import register_scraper
from ..helpers import string_to_datetime
from ..scraper import BaseScraper


@register_scraper
class MediumScraper(BaseScraper):
    url = "https://medium.com/feed/snowflake"

    def __init__(self, *args, **kwargs):
        self.after = kwargs.pop("after", None) or datetime(1970, 1, 1, tzinfo=timezone.utc)
        super(MediumScraper, self).__init__(*args, **kwargs)
        self.data = {}

    def scrape(self):
        print("Scraping Medium")
        for entry in feedparser.parse(MediumScraper.url)["entries"]:
            updated = string_to_datetime(entry["updated"])
            if updated > self.after:
                self.data[entry["link"]] = {
                    "title": entry["title"],
                    "published": string_to_datetime(entry["published"]),
                    "updated": updated,
                }
        return self.data

    def transform(self):
        return self.data
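MediumScraper reads the Snowflake publication's RSS feed with feedparser and keeps entries whose updated timestamp is newer than the cutoff, keyed by link. A standalone sketch (network access to medium.com assumed; the cutoff date is illustrative):

# Running the Medium scraper outside run_all.
from datetime import datetime, timezone

from snowscraper.scrapers.medium import MediumScraper

scraper = MediumScraper(after=datetime(2023, 1, 1, tzinfo=timezone.utc))
posts = scraper.scrape()  # {link: {"title": ..., "published": ..., "updated": ...}}
print(len(posts), "posts updated since 2023-01-01")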
53  snowscraper/scrapers/quickstarts.py  Normal file
@@ -0,0 +1,53 @@
from datetime import datetime
from datetime import timezone

import scrapy
from scrapy import signals
from scrapy.crawler import CrawlerProcess
from scrapy.signalmanager import dispatcher

from ..controller import register_scraper
from ..scraper import BaseScraper
from snowscraper.helpers import string_to_datetime

QuickStartsURL = "https://quickstarts.snowflake.com/"


@register_scraper
class QuickstartScraper(BaseScraper, scrapy.Spider):
    name = "snowflakespider"

    def __init__(self, *args, **kwargs):
        self.after = kwargs.pop("after", None) or datetime(1970, 1, 1, tzinfo=timezone.utc)
        super(QuickstartScraper, self).__init__(*args, **kwargs)
        self.data = {}

    def start_requests(self):
        yield scrapy.Request(url=QuickStartsURL, callback=self.parse)

    def signal_handler(self, signal, sender, item, response, spider):
        self.data[item["key"]] = item
        self.data[item["key"]].pop("key")

    def scrape(self):
        print("Scraping Quickstarts")
        dispatcher.connect(self.signal_handler, signal=signals.item_scraped)
        process = CrawlerProcess({"LOG_LEVEL": "ERROR"})
        process.crawl(QuickstartScraper, after=self.after)
        process.start()
        return self.data

    def parse(self, response):
        for card in response.css("card-sorter#cards > a.codelab-card"):
            updated = string_to_datetime(card.attrib["data-updated"])
            if updated > self.after:
                key = QuickStartsURL.rstrip("/") + card.attrib["href"]
                yield {
                    "key": key,
                    "title": card.attrib["data-title"],
                    "updated": updated,
                    "tags": card.attrib["data-tags"],
                }

    def transform(self):
        return self.data
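QuickstartScraper doubles as a Scrapy spider: scrape() connects the item_scraped signal to collect yielded cards, starts a CrawlerProcess, and blocks on process.start() until the crawl of quickstarts.snowflake.com finishes. A standalone sketch (network access assumed; note that a CrawlerProcess can only be started once per Python process, since the Twisted reactor is not restartable):

# Running the Quickstarts scraper outside run_all; blocks until the crawl ends.
from snowscraper.scrapers.quickstarts import QuickstartScraper

scraper = QuickstartScraper()  # cutoff defaults to 1970-01-01 UTC
cards = scraper.scrape()
print(len(cards), "quickstarts found")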