initial commit
Commit 47f68794dc
8 changed files with 214 additions and 0 deletions
pyproject.toml (new file, 24 lines)

[build-system]
requires = ["setuptools>=67.8"]
build-backend = "setuptools.build_meta"

[project]
name = "snowscraper"
version = "0.0.1"
authors = [{ name = "pdf", email = "git@nixon.mozmail.com" }]
description = "Snowflake scraper"
requires-python = ">=3.9"
license = { text = "MIT" }
dependencies = ["pydantic==1.10.10", "scrapy>=2.10.0", "feedparser>=6.0.10"]

[project.scripts]
snowscraper = "snowscraper.cli:run"

[tool.setuptools]
py-modules = ["snowscraper"]

[tool.bandit]
exclude_dirs = ["/doc", "/build"]

[tool.black]
line-length = 120
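Note: the [project.scripts] table maps the snowscraper command to snowscraper.cli:run. A minimal sketch of the equivalent programmatic entry point, assuming the package is installed:

# Equivalent to invoking the `snowscraper` console script declared in
# [project.scripts]; run() parses sys.argv itself, so nothing is passed here.
from snowscraper.cli import run

if __name__ == "__main__":
    run()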
snowscraper/__init__.py (new file, empty)
snowscraper/cli.py (new file, 37 lines)

import argparse
import sys
from datetime import datetime
from datetime import timezone

from .controller import import_scrapers
from .controller import run_all


def main(args: argparse.Namespace) -> None:
    import_scrapers()
    run_all(args)


def run():
    parser = argparse.ArgumentParser(description="Snowflake scraper")
    parser.add_argument(
        "--after",
        type=str,
        required=False,
        default=None,
        help="Scrape only after a specific date in the format 'MM-DD-YYYY' in UTC",
    )

    args = parser.parse_args()

    if args.after:
        try:
            args.after = datetime.strptime(args.after, "%m-%d-%Y").replace(tzinfo=timezone.utc)
        except ValueError:
            print(f"Error: The 'after' argument should be in the format MM-DD-YYYY. You provided: {args.after}")
            sys.exit(1)
    main(args)


if __name__ == "__main__":
    run()
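Note: the --after flag is parsed with datetime.strptime and pinned to UTC before being handed to the scrapers. A small sketch with a hypothetical date value:

from datetime import datetime, timezone

# Hypothetical input; the CLI expects MM-DD-YYYY and attaches UTC explicitly.
after = datetime.strptime("01-31-2024", "%m-%d-%Y").replace(tzinfo=timezone.utc)
print(after)  # 2024-01-31 00:00:00+00:00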
snowscraper/controller.py (new file, 25 lines)

import argparse
import importlib
import pkgutil
from pathlib import Path

SCRAPERS = {}


def register_scraper(cls):
    SCRAPERS[cls.__name__] = cls
    return cls


def run_all(args: argparse.Namespace):
    results = {}
    for scraper_cls in SCRAPERS.values():
        scraper = scraper_cls(after=args.after)
        results |= scraper.scrape()
    print(results)


def import_scrapers():
    directory = Path(__file__).resolve().parent / "scrapers"
    for module_loader, name, ispkg in pkgutil.iter_modules([directory]):
        importlib.import_module(f".scrapers.{name}", __package__)
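Note: register_scraper keys each class by its __name__, and run_all instantiates every registered class with the parsed after date before merging the per-scraper result dicts. A minimal sketch with a hypothetical scraper, for illustration only:

import argparse

from snowscraper.controller import SCRAPERS, register_scraper, run_all


# Hypothetical class used only to illustrate registration; the real scrapers
# subclass BaseScraper and live under snowscraper/scrapers/.
@register_scraper
class DummyScraper:
    def __init__(self, after=None):
        self.after = after

    def scrape(self):
        return {"https://example.com/post": {"title": "example"}}


print(SCRAPERS)                          # {'DummyScraper': <class '...DummyScraper'>}
run_all(argparse.Namespace(after=None))  # prints the merged results dict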
snowscraper/helpers.py (new file, 19 lines)

from datetime import datetime


def string_to_datetime(date_string):
    try:
        # try ISO 8601
        if "Z" in date_string:
            return datetime.fromisoformat(date_string.replace("Z", "+00:00"))
        return datetime.fromisoformat(date_string)
    except ValueError:
        pass

    try:
        # try RFC 1123
        return datetime.strptime(date_string, "%a, %d %b %Y %H:%M:%S %Z")
    except ValueError:
        pass

    raise ValueError(f"Unsupported date format: {date_string}")
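Note: ISO 8601 timestamps with a trailing Z are rewritten to an explicit +00:00 offset before fromisoformat, and RFC 1123 strings fall through to the strptime branch. A short usage sketch with hypothetical inputs:

from snowscraper.helpers import string_to_datetime

# ISO 8601 with a trailing "Z" is normalized to an explicit UTC offset
print(string_to_datetime("2023-07-01T12:30:00Z"))  # 2023-07-01 12:30:00+00:00
# RFC 1123, as commonly emitted by RSS feeds
print(string_to_datetime("Sat, 01 Jul 2023 12:30:00 GMT"))
# Anything else raises ValueError("Unsupported date format: ...")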
snowscraper/scraper.py (new file, 23 lines)

from abc import ABC
from abc import abstractmethod


class BaseScraper(ABC):
    def run(self, validate=True):
        self.scraped_json = self.scrape()
        self.transformed_json = self.transform()
        if validate:
            self.validate()
        return self.transformed_json

    def validate(self):
        if not self.transformed_json:
            return self.transformed_json

    @abstractmethod
    def scrape(self):
        pass

    @abstractmethod
    def transform(self):
        pass
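Note: BaseScraper leaves scrape() and transform() to subclasses; run() chains them and optionally validates. A minimal illustrative subclass (not part of this commit) showing that contract:

from snowscraper.scraper import BaseScraper


# Hypothetical subclass, for illustration only.
class EchoScraper(BaseScraper):
    def scrape(self):
        return {"hello": "world"}

    def transform(self):
        # Real scrapers reshape self.scraped_json here; this one passes it through.
        return self.scraped_json


print(EchoScraper().run())  # {'hello': 'world'}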
snowscraper/scrapers/medium.py (new file, 33 lines)

from datetime import datetime
from datetime import timezone

import feedparser

from ..controller import register_scraper
from ..helpers import string_to_datetime
from ..scraper import BaseScraper


@register_scraper
class MediumScraper(BaseScraper):
    url = "https://medium.com/feed/snowflake"

    def __init__(self, *args, **kwargs):
        super(MediumScraper, self).__init__(*args, **kwargs)
        self.data = {}
        self.after = datetime(1970, 1, 1, tzinfo=timezone.utc)

    def scrape(self):
        print("Scraping Medium")
        for entry in feedparser.parse(MediumScraper.url)["entries"]:
            updated = string_to_datetime(entry["updated"])
            if updated > self.after:
                self.data[entry["link"]] = {
                    "title": entry["title"],
                    "published": string_to_datetime(entry["published"]),
                    "updated": updated,
                }
        return self.data

    def transform(self):
        return self.data
snowscraper/scrapers/quickstarts.py (new file, 53 lines)

from datetime import datetime
from datetime import timezone

import scrapy
from scrapy import signals
from scrapy.crawler import CrawlerProcess
from scrapy.signalmanager import dispatcher

from ..controller import register_scraper
from ..scraper import BaseScraper
from snowscraper.helpers import string_to_datetime

QuickStartsURL = "https://quickstarts.snowflake.com/"


@register_scraper
class QuickstartScraper(BaseScraper, scrapy.Spider):
    name = "snowflakespider"

    def __init__(self, *args, **kwargs):
        super(QuickstartScraper, self).__init__(*args, **kwargs)
        self.data = {}
        self.after = datetime(1970, 1, 1, tzinfo=timezone.utc)

    def start_requests(self):
        yield scrapy.Request(url=QuickStartsURL, callback=self.parse)

    def signal_handler(self, signal, sender, item, response, spider):
        self.data[item["key"]] = item
        self.data[item["key"]].pop("key")

    def scrape(self):
        print("Scraping Quickstarts")
        dispatcher.connect(self.signal_handler, signal=signals.item_scraped)
        process = CrawlerProcess({"LOG_LEVEL": "ERROR"})
        process.crawl(QuickstartScraper, after=self.after)
        process.start()
        return self.data

    def parse(self, response):
        for card in response.css("card-sorter#cards > a.codelab-card"):
            updated = string_to_datetime(card.attrib["data-updated"])
            if updated > self.after:
                key = QuickStartsURL.rstrip("/") + card.attrib["href"]
                yield {
                    "key": key,
                    "title": card.attrib["data-title"],
                    "updated": updated,
                    "tags": card.attrib["data-tags"],
                }

    def transform(self):
        return self.data