Compare commits

...

3 commits

Author SHA1 Message Date
hackish
8ce31b70ba ready for github action? 2023-08-28 18:59:05 -07:00
hackish
16243cf8d0 Remove FastAPI 2023-08-28 18:26:06 -07:00
hackish
3c8569814c Remove Rocketry 2023-08-28 18:18:29 -07:00
10 changed files with 16 additions and 109 deletions

View file

@@ -9,15 +9,7 @@ authors = [{ name = "pdf", email = "git@nixon.mozmail.com" }]
description = "Snowflake scraper" description = "Snowflake scraper"
requires-python = ">=3.9" requires-python = ">=3.9"
license = { text = "MIT" } license = { text = "MIT" }
dependencies = [ dependencies = ["pydantic==1.10.10", "scrapy>=2.10.0", "feedparser>=6.0.10"]
"pydantic==1.10.10",
"scrapy>=2.10.0",
"feedparser>=6.0.10",
"uvloop>=0.17.0",
"rocketry>=2.5.1",
"fastapi>=0.101.0",
"uvicorn>=0.22.0",
]
[project.scripts] [project.scripts]
snowscraper = "snowscraper.cli:run" snowscraper = "snowscraper.cli:run"

View file

@@ -1,3 +0,0 @@
from fastapi import APIRouter
app = APIRouter()

View file

@@ -1,52 +1,16 @@
import argparse import argparse
import asyncio
import sys import sys
from datetime import datetime from datetime import datetime
from datetime import timezone from datetime import timezone
from types import FrameType
import uvicorn
import uvloop
from .controller import import_scrapers from .controller import import_scrapers
from .controller import run_all from .controller import run_all
app_fastapi = None
app_rocketry = None
class SnowScraper(uvicorn.Server):
def handle_exit(self, sig: int, frame: FrameType) -> None:
print("Shutting down SnowScraper")
return super().handle_exit(sig, frame)
async def start_server() -> None:
print("Starting SnowScraper")
server = SnowScraper(config=uvicorn.Config(app_fastapi, workers=1, loop="uvloop"))
fastapi = asyncio.create_task(server.serve())
rocket = asyncio.create_task(app_rocketry.serve())
app_rocketry.task
await asyncio.wait([rocket, fastapi], return_when=asyncio.FIRST_COMPLETED)
def main(args: argparse.Namespace) -> None: def main(args: argparse.Namespace) -> None:
global app_fastapi, app_rocketry
from .scheduler import app as app_rocketry
from .webserver import app as app_fastapi
import_scrapers() import_scrapers()
run_all(args) run_all(args)
if sys.version_info >= (3, 11):
with asyncio.Runner(loop_factory=uvloop.new_event_loop) as runner:
runner.run(start_server())
else:
uvloop.install()
asyncio.run(start_server())
def run(): def run():
parser = argparse.ArgumentParser(description="Snowflake scraper") parser = argparse.ArgumentParser(description="Snowflake scraper")

View file

@@ -12,9 +12,11 @@ def register_scraper(cls):
def run_all(args: argparse.Namespace): def run_all(args: argparse.Namespace):
results = {}
for scraper_cls in SCRAPERS.values(): for scraper_cls in SCRAPERS.values():
scraper = scraper_cls(after=args.after) scraper = scraper_cls(after=args.after)
scraper.scrape() results |= scraper.scrape()
print(results)
def import_scrapers(): def import_scrapers():

View file

@@ -2,24 +2,8 @@ from datetime import datetime
def string_to_datetime(date_string): def string_to_datetime(date_string):
"""
Convert a date string to a datetime object.
This function supports the following formats:
- ISO 8601 with and without timezone offset: e.g., '2023-08-02T09:27:54-05:00' or '2023-08-07T18:32:05.537Z'
- RFC 1123: e.g., 'Mon, 07 Aug 2023 18:32:05 GMT'
Args:
- date_string (str): The date string to convert
Returns:
- datetime.datetime: The datetime representation of the provided date string.
Raises:
- ValueError: If the provided date string doesn't match any of the supported formats.
"""
try: try:
# First, try ISO 8601 format # try ISO 8601
if "Z" in date_string: if "Z" in date_string:
return datetime.fromisoformat(date_string.replace("Z", "+00:00")) return datetime.fromisoformat(date_string.replace("Z", "+00:00"))
return datetime.fromisoformat(date_string) return datetime.fromisoformat(date_string)
@@ -27,10 +11,9 @@ def string_to_datetime(date_string):
pass pass
try: try:
# Then, try RFC 1123 format # try RFC 1123
return datetime.strptime(date_string, "%a, %d %b %Y %H:%M:%S %Z") return datetime.strptime(date_string, "%a, %d %b %Y %H:%M:%S %Z")
except ValueError: except ValueError:
pass pass
# If neither format matches, raise an exception
raise ValueError(f"Unsupported date format: {date_string}") raise ValueError(f"Unsupported date format: {date_string}")

View file

@@ -1,6 +0,0 @@
from rocketry import Rocketry
app = Rocketry(execution="async")
if __name__ == "__main__":
app.run()

View file

@@ -27,7 +27,6 @@ class MediumScraper(BaseScraper):
"published": string_to_datetime(entry["published"]), "published": string_to_datetime(entry["published"]),
"updated": updated, "updated": updated,
} }
# print(self.data)
return self.data return self.data
def transform(self): def transform(self):

View file

@@ -2,7 +2,9 @@ from datetime import datetime
from datetime import timezone from datetime import timezone
import scrapy import scrapy
from scrapy import signals
from scrapy.crawler import CrawlerProcess from scrapy.crawler import CrawlerProcess
from scrapy.signalmanager import dispatcher
from ..controller import register_scraper from ..controller import register_scraper
from ..scraper import BaseScraper from ..scraper import BaseScraper
@@ -23,24 +25,29 @@ class QuickstartScraper(BaseScraper, scrapy.Spider):
def start_requests(self): def start_requests(self):
yield scrapy.Request(url=QuickStartsURL, callback=self.parse) yield scrapy.Request(url=QuickStartsURL, callback=self.parse)
def signal_handler(self, signal, sender, item, response, spider):
self.data[item["key"]] = item
self.data[item["key"]].pop("key")
def scrape(self): def scrape(self):
print("Scraping Quickstarts") print("Scraping Quickstarts")
dispatcher.connect(self.signal_handler, signal=signals.item_scraped)
process = CrawlerProcess({"LOG_LEVEL": "ERROR"}) process = CrawlerProcess({"LOG_LEVEL": "ERROR"})
process.crawl(QuickstartScraper, after=self.after) process.crawl(QuickstartScraper, after=self.after)
process.start() process.start()
return self.data
def parse(self, response): def parse(self, response):
for card in response.css("card-sorter#cards > a.codelab-card"): for card in response.css("card-sorter#cards > a.codelab-card"):
updated = string_to_datetime(card.attrib["data-updated"]) updated = string_to_datetime(card.attrib["data-updated"])
if updated > self.after: if updated > self.after:
key = QuickStartsURL.rstrip("/") + card.attrib["href"] key = QuickStartsURL.rstrip("/") + card.attrib["href"]
self.data[key] = { yield {
"key": key,
"title": card.attrib["data-title"], "title": card.attrib["data-title"],
"updated": updated, "updated": updated,
"tags": card.attrib["data-tags"], "tags": card.attrib["data-tags"],
} }
# print(key, self.data[key])
yield self.data[key]
def transform(self): def transform(self):
return self.data return self.data

View file

@@ -1,20 +0,0 @@
from datetime import datetime
from datetime import timezone
from ..controller import register_scraper
from ..scraper import BaseScraper
@register_scraper
class YoutubeScraper(BaseScraper):
def __init__(self, after=datetime(1970, 1, 1, tzinfo=timezone.utc), *args, **kwargs):
super(YoutubeScraper, self).__init__(*args, **kwargs)
self.data = {}
self.after = after
def scrape(self):
print("Scraping YouTube... Unimplemented")
self.data = {}
def transform(self):
return self.data

View file

@@ -1,11 +0,0 @@
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from .api import app as api_v1
app = FastAPI()
app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["GET"], allow_headers=["*"])
app.include_router(api_v1, prefix="/api/v1")
if __name__ == "__main__":
app.run()