From d91f6512239a120273971c6215a44bedd98e4071 Mon Sep 17 00:00:00 2001 From: hackish Date: Mon, 18 Sep 2023 20:55:21 -0700 Subject: [PATCH] Use GraphQL calls for Medium --- snowscraper/helpers.py | 10 + snowscraper/scrapers/medium.py | 1910 ++++++++++++++++++++++++++- snowscraper/scrapers/quickstarts.py | 53 - 3 files changed, 1910 insertions(+), 63 deletions(-) delete mode 100644 snowscraper/scrapers/quickstarts.py diff --git a/snowscraper/helpers.py b/snowscraper/helpers.py index dc7849d..5afbcdb 100644 --- a/snowscraper/helpers.py +++ b/snowscraper/helpers.py @@ -1,5 +1,15 @@ from datetime import datetime +def unix_to_datetime_utc(timestamp_millis): + # Convert to seconds from milliseconds + timestamp_seconds = timestamp_millis / 1000.0 + + # Create a datetime object in UTC + dt_object = datetime.utcfromtimestamp(timestamp_seconds) + + # Format the datetime object as an ISO 8601 string + return dt_object.isoformat() + 'Z' # 'Z' indicates UTC time + def string_to_datetime(date_string): try: diff --git a/snowscraper/scrapers/medium.py b/snowscraper/scrapers/medium.py index 10b2e59..fe91c88 100644 --- a/snowscraper/scrapers/medium.py +++ b/snowscraper/scrapers/medium.py @@ -1,32 +1,1922 @@ from datetime import datetime from datetime import timezone +import requests import feedparser from ..controller import register_scraper -from ..helpers import string_to_datetime +from ..helpers import unix_to_datetime_utc from ..scraper import BaseScraper +LONG_QUERY = {"query": """ +query PublicationHomepageQuery($collectionId: ID!, $homepagePostsLimit: PaginationLimit = 25, $homepagePostsFrom: String, $includeDistributedResponses: Boolean = false) { + collection(id: $collectionId) { + __typename + id + ...PublicationHomepage_collection + } +} + +fragment PublicationHomepage_collection on Collection { + id + ...PublisherHeader_publisher + ...PublisherHomepagePosts_publisher + ...usePublicationAnalytics_collection + ...CollectionMetadata_collection + __typename +} + +fragment PublisherHeader_publisher on Publisher { + id + ...PublisherHeaderBackground_publisher + ...PublisherHeaderNameplate_publisher + ...PublisherHeaderActions_publisher + ...PublisherHeaderNav_publisher + __typename +} + +fragment PublisherHeaderBackground_publisher on Publisher { + __typename + id + customStyleSheet { + ...PublisherHeaderBackground_customStyleSheet + __typename + id + } + ... on Collection { + colorPalette { + tintBackgroundSpectrum { + backgroundColor + __typename + } + __typename + } + isAuroraVisible + legacyHeaderBackgroundImage { + id + originalWidth + focusPercentX + focusPercentY + __typename + } + ...collectionTintBackgroundTheme_collection + __typename + id + } + ...publisherUrl_publisher +} + +fragment PublisherHeaderBackground_customStyleSheet on CustomStyleSheet { + id + global { + colorPalette { + background { + rgb + __typename + } + __typename + } + __typename + } + header { + headerScale + backgroundImageDisplayMode + backgroundImageVerticalAlignment + backgroundColorDisplayMode + backgroundColor { + alpha + rgb + ...getHexFromColorValue_colorValue + ...getOpaqueHexFromColorValue_colorValue + __typename + } + secondaryBackgroundColor { + ...getHexFromColorValue_colorValue + __typename + } + postBackgroundColor { + ...getHexFromColorValue_colorValue + __typename + } + backgroundImage { + id + originalWidth + __typename + } + __typename + } + __typename +} + +fragment getHexFromColorValue_colorValue on ColorValue { + rgb + alpha + __typename +} + +fragment getOpaqueHexFromColorValue_colorValue on ColorValue { + rgb + __typename +} + +fragment collectionTintBackgroundTheme_collection on Collection { + colorPalette { + ...collectionTintBackgroundTheme_colorPalette + __typename + } + customStyleSheet { + id + ...collectionTintBackgroundTheme_customStyleSheet + __typename + } + __typename + id +} + +fragment collectionTintBackgroundTheme_colorPalette on ColorPalette { + ...customTintBackgroundTheme_colorPalette + __typename +} + +fragment customTintBackgroundTheme_colorPalette on ColorPalette { + tintBackgroundSpectrum { + ...ThemeUtil_colorSpectrum + __typename + } + __typename +} + +fragment ThemeUtil_colorSpectrum on ColorSpectrum { + backgroundColor + ...ThemeUtilInterpolateHelpers_colorSpectrum + __typename +} + +fragment ThemeUtilInterpolateHelpers_colorSpectrum on ColorSpectrum { + colorPoints { + ...ThemeUtil_colorPoint + __typename + } + __typename +} + +fragment ThemeUtil_colorPoint on ColorPoint { + color + point + __typename +} + +fragment collectionTintBackgroundTheme_customStyleSheet on CustomStyleSheet { + id + ...customTintBackgroundTheme_customStyleSheet + __typename +} + +fragment customTintBackgroundTheme_customStyleSheet on CustomStyleSheet { + id + global { + colorPalette { + primary { + colorPalette { + ...customTintBackgroundTheme_colorPalette + __typename + } + __typename + } + __typename + } + __typename + } + __typename +} + +fragment publisherUrl_publisher on Publisher { + id + __typename + ... on Collection { + ...collectionUrl_collection + __typename + id + } + ... on User { + ...userUrl_user + __typename + id + } +} + +fragment collectionUrl_collection on Collection { + id + domain + slug + __typename +} + +fragment userUrl_user on User { + __typename + id + customDomainState { + live { + domain + __typename + } + __typename + } + hasSubdomain + username +} + +fragment PublisherHeaderNameplate_publisher on Publisher { + ...PublisherAvatar_publisher + ...PublisherHeaderLogo_publisher + ...PublisherFollowersCount_publisher + __typename +} + +fragment PublisherAvatar_publisher on Publisher { + __typename + ... on Collection { + id + ...CollectionAvatar_collection + __typename + } + ... on User { + id + ...UserAvatar_user + __typename + } +} + +fragment CollectionAvatar_collection on Collection { + name + avatar { + id + __typename + } + ...collectionUrl_collection + __typename + id +} + +fragment UserAvatar_user on User { + __typename + id + imageId + mediumMemberAt + name + username + ...userUrl_user +} + +fragment PublisherHeaderLogo_publisher on Publisher { + __typename + id + customStyleSheet { + id + header { + logoImage { + ...PublisherHeaderLogo_image + __typename + } + appNameColor { + ...getHexFromColorValue_colorValue + __typename + } + appNameTreatment + __typename + } + __typename + } + name + ... on Collection { + isAuroraVisible + logo { + id + originalHeight + originalWidth + __typename + } + __typename + id + } + ... on User { + ...useIsVerifiedBookAuthor_user + __typename + id + } + ...CustomHeaderTooltip_publisher + ...publisherUrl_publisher +} + +fragment PublisherHeaderLogo_image on ImageMetadata { + id + originalHeight + originalWidth + __typename +} + +fragment useIsVerifiedBookAuthor_user on User { + verifications { + isBookAuthor + __typename + } + __typename + id +} + +fragment CustomHeaderTooltip_publisher on Publisher { + __typename + id + customStyleSheet { + id + header { + appNameTreatment + nameTreatment + __typename + } + __typename + } + ... on Collection { + isAuroraVisible + slug + __typename + id + } +} + +fragment PublisherFollowersCount_publisher on Publisher { + id + __typename + id + ... on Collection { + slug + subscriberCount + ...collectionUrl_collection + __typename + id + } + ... on User { + socialStats { + followerCount + __typename + } + username + ...userUrl_user + __typename + id + } +} + +fragment PublisherHeaderActions_publisher on Publisher { + __typename + ...MetaHeaderPubMenu_publisher + ... on Collection { + ...CollectionFollowButton_collection + __typename + id + } + ... on User { + ...FollowAndSubscribeButtons_user + __typename + id + } +} + +fragment MetaHeaderPubMenu_publisher on Publisher { + __typename + ... on Collection { + ...MetaHeaderPubMenu_publisher_collection + __typename + id + } + ... on User { + ...MetaHeaderPubMenu_publisher_user + __typename + id + } +} + +fragment MetaHeaderPubMenu_publisher_collection on Collection { + id + slug + name + domain + newsletterV3 { + slug + __typename + id + } + ...MutePopoverOptions_collection + __typename +} + +fragment MutePopoverOptions_collection on Collection { + id + __typename +} + +fragment MetaHeaderPubMenu_publisher_user on User { + id + username + ...MutePopoverOptions_creator + __typename +} + +fragment MutePopoverOptions_creator on User { + id + __typename +} + +fragment CollectionFollowButton_collection on Collection { + __typename + id + name + slug + ...collectionUrl_collection + ...SusiClickable_collection +} + +fragment SusiClickable_collection on Collection { + ...SusiContainer_collection + __typename + id +} + +fragment SusiContainer_collection on Collection { + name + ...SignInOptions_collection + ...SignUpOptions_collection + __typename + id +} + +fragment SignInOptions_collection on Collection { + id + name + __typename +} + +fragment SignUpOptions_collection on Collection { + id + name + __typename +} + +fragment FollowAndSubscribeButtons_user on User { + ...UserFollowButton_user + ...UserSubscribeButton_user + __typename + id +} + +fragment UserFollowButton_user on User { + ...UserFollowButtonSignedIn_user + ...UserFollowButtonSignedOut_user + __typename + id +} + +fragment UserFollowButtonSignedIn_user on User { + id + name + __typename +} + +fragment UserFollowButtonSignedOut_user on User { + id + ...SusiClickable_user + __typename +} + +fragment SusiClickable_user on User { + ...SusiContainer_user + __typename + id +} + +fragment SusiContainer_user on User { + ...SignInOptions_user + ...SignUpOptions_user + __typename + id +} + +fragment SignInOptions_user on User { + id + name + __typename +} + +fragment SignUpOptions_user on User { + id + name + __typename +} + +fragment UserSubscribeButton_user on User { + id + isPartnerProgramEnrolled + name + viewerEdge { + id + isFollowing + isUser + __typename + } + viewerIsUser + newsletterV3 { + id + ...useNewsletterV3Subscription_newsletterV3 + __typename + } + ...useNewsletterV3Subscription_user + ...MembershipUpsellModal_user + __typename +} + +fragment useNewsletterV3Subscription_newsletterV3 on NewsletterV3 { + id + type + slug + name + collection { + slug + __typename + id + } + user { + id + name + username + newsletterV3 { + id + __typename + } + __typename + } + __typename +} + +fragment useNewsletterV3Subscription_user on User { + id + username + newsletterV3 { + ...useNewsletterV3Subscription_newsletterV3 + __typename + id + } + __typename +} + +fragment MembershipUpsellModal_user on User { + id + name + imageId + postSubscribeMembershipUpsellShownAt + newsletterV3 { + id + __typename + } + __typename +} + +fragment PublisherHeaderNav_publisher on Publisher { + __typename + id + customStyleSheet { + navigation { + navItems { + name + ...PublisherHeaderNavLink_headerNavigationItem + __typename + } + __typename + } + __typename + id + } + ...PublisherHeaderNavLink_publisher + ... on Collection { + domain + isAuroraVisible + slug + navItems { + tagSlug + title + url + __typename + } + __typename + id + } + ... on User { + customDomainState { + live { + domain + __typename + } + __typename + } + hasSubdomain + username + homePostsPublished: homepagePostsConnection(paging: {limit: 1}) { + posts { + id + __typename + } + __typename + } + ...useIsVerifiedBookAuthor_user + __typename + id + } +} + +fragment PublisherHeaderNavLink_headerNavigationItem on HeaderNavigationItem { + href + name + tags { + id + normalizedTagSlug + __typename + } + type + __typename +} + +fragment PublisherHeaderNavLink_publisher on Publisher { + __typename + id + ... on Collection { + slug + __typename + id + } +} + +fragment PublisherHomepagePosts_publisher on Publisher { + __typename + id + homepagePostsConnection( + paging: {limit: $homepagePostsLimit, from: $homepagePostsFrom} + includeDistributedResponses: $includeDistributedResponses + ) { + posts { + inResponseToPostResult { + __typename + } + ...WithResponsesSidebar_post + ...PostPreview_post + __typename + } + pagingInfo { + next { + from + limit + __typename + } + __typename + } + __typename + } + ...CardByline_publisher + ...NewsletterV3Promo_publisher + ...PublisherHomepagePosts_user +} + +fragment WithResponsesSidebar_post on Post { + id + ...ThreadedResponsesSidebar_post + __typename +} + +fragment ThreadedResponsesSidebar_post on Post { + id + ...ThreadedResponsesSidebarContent_post + __typename +} + +fragment ThreadedResponsesSidebarContent_post on Post { + id + postResponses { + count + __typename + } + collection { + id + viewerEdge { + id + isEditor + __typename + } + __typename + } + creator { + id + __typename + } + ...ThreadedReplies_post + __typename +} + +fragment ThreadedReplies_post on Post { + __typename + id + ...ThreadedReply_post +} + +fragment ThreadedReply_post on Post { + __typename + id + ...ReadOrEditSimpleResponse_post + ...StoryResponse_post +} + +fragment ReadOrEditSimpleResponse_post on Post { + __typename + id + ...SimpleResponse_post +} + +fragment SimpleResponse_post on Post { + id + ...ResponseHeader_post + __typename +} + +fragment ResponseHeader_post on Post { + __typename + id + createdAt + firstPublishedAt + latestPublishedAt + creator { + id + name + ...UserAvatar_user + ...useIsVerifiedBookAuthor_user + ...UserMentionTooltip_user + __typename + } + ...ResponsePopoverMenu_post +} + +fragment UserMentionTooltip_user on User { + id + name + username + bio + imageId + mediumMemberAt + ...UserAvatar_user + ...UserFollowButton_user + ...useIsVerifiedBookAuthor_user + __typename +} + +fragment ResponsePopoverMenu_post on Post { + id + ...ReportUserMenuItem_post + ...HideResponseMenuItem_post + ...BlockUserMenuItem_post + ...UndoClapsMenuItem_post + __typename +} + +fragment ReportUserMenuItem_post on Post { + __typename + id + creator { + id + __typename + } + ...SusiClickable_post +} + +fragment SusiClickable_post on Post { + id + mediumUrl + ...SusiContainer_post + __typename +} + +fragment SusiContainer_post on Post { + id + __typename +} + +fragment HideResponseMenuItem_post on Post { + __typename + id + collection { + id + viewerEdge { + id + isEditor + __typename + } + __typename + } + creator { + id + __typename + } +} + +fragment BlockUserMenuItem_post on Post { + __typename + id + creator { + id + __typename + } +} + +fragment UndoClapsMenuItem_post on Post { + id + clapCount + __typename +} + +fragment StoryResponse_post on Post { + id + ...ResponseHeader_post + __typename +} + +fragment PostPreview_post on Post { + id + creator { + ...PostPreview_user + __typename + id + } + collection { + ...CardByline_collection + ...ExpandablePostByline_collection + __typename + id + } + ...InteractivePostBody_postPreview + firstPublishedAt + isLocked + isSeries + latestPublishedAt + inResponseToCatalogResult { + __typename + } + pinnedAt + pinnedByCreatorAt + previewImage { + id + focusPercentX + focusPercentY + __typename + } + readingTime + sequence { + slug + __typename + } + title + uniqueSlug + ...CardByline_post + ...PostFooterActionsBar_post + ...InResponseToEntityPreview_post + ...PostScrollTracker_post + ...HighDensityPreview_post + __typename +} + +fragment PostPreview_user on User { + __typename + name + username + ...CardByline_user + ...ExpandablePostByline_user + id +} + +fragment CardByline_user on User { + __typename + id + name + username + mediumMemberAt + socialStats { + followerCount + __typename + } + ...useIsVerifiedBookAuthor_user + ...userUrl_user + ...UserMentionTooltip_user +} + +fragment ExpandablePostByline_user on User { + __typename + id + name + imageId + ...userUrl_user + ...useIsVerifiedBookAuthor_user +} + +fragment CardByline_collection on Collection { + name + ...collectionUrl_collection + __typename + id +} + +fragment ExpandablePostByline_collection on Collection { + __typename + id + name + domain + slug +} + +fragment InteractivePostBody_postPreview on Post { + extendedPreviewContent( + truncationConfig: {previewParagraphsWordCountThreshold: 400, minimumWordLengthForTruncation: 150, truncateAtEndOfSentence: true, showFullImageCaptions: true, shortformPreviewParagraphsWordCountThreshold: 30, shortformMinimumWordLengthForTruncation: 30} + ) { + bodyModel { + ...PostBody_bodyModel + __typename + } + isFullContent + __typename + } + __typename + id +} + +fragment PostBody_bodyModel on RichText { + sections { + name + startIndex + textLayout + imageLayout + backgroundImage { + id + originalHeight + originalWidth + __typename + } + videoLayout + backgroundVideo { + videoId + originalHeight + originalWidth + previewImageId + __typename + } + __typename + } + paragraphs { + id + ...PostBodySection_paragraph + __typename + } + ...normalizedBodyModel_richText + __typename +} + +fragment PostBodySection_paragraph on Paragraph { + name + ...PostBodyParagraph_paragraph + __typename + id +} + +fragment PostBodyParagraph_paragraph on Paragraph { + name + type + ...ImageParagraph_paragraph + ...TextParagraph_paragraph + ...IframeParagraph_paragraph + ...MixtapeParagraph_paragraph + ...CodeBlockParagraph_paragraph + __typename + id +} + +fragment ImageParagraph_paragraph on Paragraph { + href + layout + metadata { + id + originalHeight + originalWidth + focusPercentX + focusPercentY + alt + __typename + } + ...Markups_paragraph + ...ParagraphRefsMapContext_paragraph + ...PostAnnotationsMarker_paragraph + __typename + id +} + +fragment Markups_paragraph on Paragraph { + name + text + hasDropCap + dropCapImage { + ...MarkupNode_data_dropCapImage + __typename + id + } + markups { + ...Markups_markup + __typename + } + __typename + id +} + +fragment MarkupNode_data_dropCapImage on ImageMetadata { + ...DropCap_image + __typename + id +} + +fragment DropCap_image on ImageMetadata { + id + originalHeight + originalWidth + __typename +} + +fragment Markups_markup on Markup { + type + start + end + href + anchorType + userId + linkMetadata { + httpStatus + __typename + } + __typename +} + +fragment ParagraphRefsMapContext_paragraph on Paragraph { + id + name + text + __typename +} + +fragment PostAnnotationsMarker_paragraph on Paragraph { + ...PostViewNoteCard_paragraph + __typename + id +} + +fragment PostViewNoteCard_paragraph on Paragraph { + name + __typename + id +} + +fragment TextParagraph_paragraph on Paragraph { + type + hasDropCap + codeBlockMetadata { + mode + lang + __typename + } + ...Markups_paragraph + ...ParagraphRefsMapContext_paragraph + __typename + id +} + +fragment IframeParagraph_paragraph on Paragraph { + type + iframe { + mediaResource { + id + iframeSrc + iframeHeight + iframeWidth + title + __typename + } + __typename + } + layout + ...Markups_paragraph + __typename + id +} + +fragment MixtapeParagraph_paragraph on Paragraph { + type + mixtapeMetadata { + href + mediaResource { + mediumCatalog { + id + __typename + } + __typename + } + __typename + } + ...GenericMixtapeParagraph_paragraph + __typename + id +} + +fragment GenericMixtapeParagraph_paragraph on Paragraph { + text + mixtapeMetadata { + href + thumbnailImageId + __typename + } + markups { + start + end + type + href + __typename + } + __typename + id +} + +fragment CodeBlockParagraph_paragraph on Paragraph { + codeBlockMetadata { + lang + mode + __typename + } + __typename + id +} + +fragment normalizedBodyModel_richText on RichText { + paragraphs { + ...normalizedBodyModel_richText_paragraphs + __typename + } + sections { + startIndex + ...getSectionEndIndex_section + __typename + } + ...getParagraphStyles_richText + ...getParagraphSpaces_richText + __typename +} + +fragment normalizedBodyModel_richText_paragraphs on Paragraph { + markups { + ...normalizedBodyModel_richText_paragraphs_markups + __typename + } + codeBlockMetadata { + lang + mode + __typename + } + ...getParagraphHighlights_paragraph + ...getParagraphPrivateNotes_paragraph + __typename + id +} + +fragment normalizedBodyModel_richText_paragraphs_markups on Markup { + type + __typename +} + +fragment getParagraphHighlights_paragraph on Paragraph { + name + __typename + id +} + +fragment getParagraphPrivateNotes_paragraph on Paragraph { + name + __typename + id +} + +fragment getSectionEndIndex_section on Section { + startIndex + __typename +} + +fragment getParagraphStyles_richText on RichText { + paragraphs { + text + type + __typename + } + sections { + ...getSectionEndIndex_section + __typename + } + __typename +} + +fragment getParagraphSpaces_richText on RichText { + paragraphs { + layout + metadata { + originalHeight + originalWidth + id + __typename + } + type + ...paragraphExtendsImageGrid_paragraph + __typename + } + ...getSeriesParagraphTopSpacings_richText + ...getPostParagraphTopSpacings_richText + __typename +} + +fragment paragraphExtendsImageGrid_paragraph on Paragraph { + layout + type + __typename + id +} + +fragment getSeriesParagraphTopSpacings_richText on RichText { + paragraphs { + id + __typename + } + sections { + ...getSectionEndIndex_section + __typename + } + __typename +} + +fragment getPostParagraphTopSpacings_richText on RichText { + paragraphs { + type + layout + text + codeBlockMetadata { + lang + mode + __typename + } + __typename + } + sections { + ...getSectionEndIndex_section + __typename + } + __typename +} + +fragment CardByline_post on Post { + ...DraftStatus_post + ...Star_post + ...shouldShowPublishedInStatus_post + __typename + id +} + +fragment DraftStatus_post on Post { + id + pendingCollection { + id + creator { + id + __typename + } + ...BoldCollectionName_collection + __typename + } + statusForCollection + creator { + id + __typename + } + isPublished + __typename +} + +fragment BoldCollectionName_collection on Collection { + id + name + __typename +} + +fragment Star_post on Post { + id + creator { + id + __typename + } + __typename +} + +fragment shouldShowPublishedInStatus_post on Post { + statusForCollection + isPublished + __typename + id +} + +fragment PostFooterActionsBar_post on Post { + id + visibility + allowResponses + postResponses { + count + __typename + } + isLimitedState + creator { + id + __typename + } + collection { + id + __typename + } + ...MultiVote_post + ...PostSharePopover_post + ...OverflowMenuButtonWithNegativeSignal_post + ...PostPageBookmarkButton_post + __typename +} + +fragment MultiVote_post on Post { + id + creator { + id + ...SusiClickable_user + __typename + } + isPublished + ...SusiClickable_post + collection { + id + slug + __typename + } + isLimitedState + ...MultiVoteCount_post + __typename +} + +fragment MultiVoteCount_post on Post { + id + __typename +} + +fragment PostSharePopover_post on Post { + id + mediumUrl + title + isPublished + isLocked + ...usePostUrl_post + ...FriendLink_post + __typename +} + +fragment usePostUrl_post on Post { + id + creator { + ...userUrl_user + __typename + id + } + collection { + id + domain + slug + __typename + } + isSeries + mediumUrl + sequence { + slug + __typename + } + uniqueSlug + __typename +} + +fragment FriendLink_post on Post { + id + ...SusiClickable_post + ...useCopyFriendLink_post + __typename +} + +fragment useCopyFriendLink_post on Post { + ...usePostUrl_post + __typename + id +} + +fragment OverflowMenuButtonWithNegativeSignal_post on Post { + id + visibility + ...OverflowMenuWithNegativeSignal_post + __typename +} + +fragment OverflowMenuWithNegativeSignal_post on Post { + id + creator { + id + __typename + } + collection { + id + __typename + } + ...OverflowMenuItemUndoClaps_post + ...AddToCatalogBase_post + __typename +} + +fragment OverflowMenuItemUndoClaps_post on Post { + id + clapCount + ...ClapMutation_post + __typename +} + +fragment ClapMutation_post on Post { + __typename + id + clapCount + ...MultiVoteCount_post +} + +fragment AddToCatalogBase_post on Post { + id + isPublished + __typename +} + +fragment PostPageBookmarkButton_post on Post { + ...AddToCatalogBookmarkButton_post + __typename + id +} + +fragment AddToCatalogBookmarkButton_post on Post { + ...AddToCatalogBase_post + __typename + id +} + +fragment InResponseToEntityPreview_post on Post { + id + inResponseToEntityType + __typename +} + +fragment PostScrollTracker_post on Post { + id + collection { + id + __typename + } + sequence { + sequenceId + __typename + } + __typename +} + +fragment HighDensityPreview_post on Post { + id + title + previewImage { + id + focusPercentX + focusPercentY + __typename + } + extendedPreviewContent( + truncationConfig: {previewParagraphsWordCountThreshold: 400, minimumWordLengthForTruncation: 150, truncateAtEndOfSentence: true, showFullImageCaptions: true, shortformPreviewParagraphsWordCountThreshold: 30, shortformMinimumWordLengthForTruncation: 30} + ) { + subtitle + __typename + } + ...HighDensityFooter_post + __typename +} + +fragment HighDensityFooter_post on Post { + id + readingTime + tags { + ...TopicPill_tag + __typename + } + ...BookmarkButton_post + ...ExpandablePostCardOverflowButton_post + ...OverflowMenuButtonWithNegativeSignal_post + __typename +} + +fragment TopicPill_tag on Tag { + __typename + id + displayTitle + normalizedTagSlug +} + +fragment BookmarkButton_post on Post { + visibility + ...SusiClickable_post + ...AddToCatalogBookmarkButton_post + __typename + id +} + +fragment ExpandablePostCardOverflowButton_post on Post { + creator { + id + __typename + } + ...ExpandablePostCardReaderButton_post + __typename + id +} + +fragment ExpandablePostCardReaderButton_post on Post { + id + collection { + id + __typename + } + creator { + id + __typename + } + clapCount + ...ClapMutation_post + __typename +} + +fragment CardByline_publisher on Publisher { + __typename + ... on User { + id + ...CardByline_user + __typename + } + ... on Collection { + id + ...CardByline_collection + __typename + } +} + +fragment NewsletterV3Promo_publisher on Publisher { + __typename + ... on User { + ...NewsletterV3Promo_user + __typename + id + } + ... on Collection { + ...NewsletterV3Promo_collection + __typename + id + } +} + +fragment NewsletterV3Promo_user on User { + id + username + name + viewerEdge { + isUser + __typename + id + } + newsletterV3 { + id + ...NewsletterV3Promo_newsletterV3 + __typename + } + __typename +} + +fragment NewsletterV3Promo_newsletterV3 on NewsletterV3 { + slug + name + description + promoHeadline + promoBody + ...NewsletterSubscribeComponent_newsletterV3 + __typename + id +} + +fragment NewsletterSubscribeComponent_newsletterV3 on NewsletterV3 { + ...NewsletterV3SubscribeButton_newsletterV3 + ...NewsletterV3SubscribeByEmail_newsletterV3 + __typename + id +} + +fragment NewsletterV3SubscribeButton_newsletterV3 on NewsletterV3 { + id + name + slug + type + user { + id + name + username + __typename + } + collection { + slug + ...SusiClickable_collection + ...collectionDefaultBackgroundTheme_collection + __typename + id + } + ...SusiClickable_newsletterV3 + ...useNewsletterV3Subscription_newsletterV3 + __typename +} + +fragment collectionDefaultBackgroundTheme_collection on Collection { + colorPalette { + ...collectionDefaultBackgroundTheme_colorPalette + __typename + } + customStyleSheet { + id + ...collectionDefaultBackgroundTheme_customStyleSheet + __typename + } + __typename + id +} + +fragment collectionDefaultBackgroundTheme_colorPalette on ColorPalette { + ...customDefaultBackgroundTheme_colorPalette + __typename +} + +fragment customDefaultBackgroundTheme_colorPalette on ColorPalette { + highlightSpectrum { + ...ThemeUtil_colorSpectrum + __typename + } + defaultBackgroundSpectrum { + ...ThemeUtil_colorSpectrum + __typename + } + tintBackgroundSpectrum { + ...ThemeUtil_colorSpectrum + __typename + } + __typename +} + +fragment collectionDefaultBackgroundTheme_customStyleSheet on CustomStyleSheet { + id + ...customDefaultBackgroundTheme_customStyleSheet + __typename +} + +fragment customDefaultBackgroundTheme_customStyleSheet on CustomStyleSheet { + id + global { + colorPalette { + primary { + colorPalette { + ...customDefaultBackgroundTheme_colorPalette + __typename + } + __typename + } + background { + colorPalette { + ...customDefaultBackgroundTheme_colorPalette + __typename + } + __typename + } + __typename + } + __typename + } + __typename +} + +fragment SusiClickable_newsletterV3 on NewsletterV3 { + ...SusiContainer_newsletterV3 + __typename + id +} + +fragment SusiContainer_newsletterV3 on NewsletterV3 { + ...SignInOptions_newsletterV3 + ...SignUpOptions_newsletterV3 + __typename + id +} + +fragment SignInOptions_newsletterV3 on NewsletterV3 { + id + name + __typename +} + +fragment SignUpOptions_newsletterV3 on NewsletterV3 { + id + name + __typename +} + +fragment NewsletterV3SubscribeByEmail_newsletterV3 on NewsletterV3 { + id + slug + type + user { + id + name + username + __typename + } + collection { + ...collectionDefaultBackgroundTheme_collection + ...collectionUrl_collection + __typename + id + } + __typename +} + +fragment NewsletterV3Promo_collection on Collection { + id + slug + domain + name + newsletterV3 { + id + ...NewsletterV3Promo_newsletterV3 + __typename + } + __typename +} + +fragment PublisherHomepagePosts_user on User { + id + ...useShowAuthorNewsletterV3Promo_user + __typename +} + +fragment useShowAuthorNewsletterV3Promo_user on User { + id + username + newsletterV3 { + id + showPromo + slug + __typename + } + __typename +} + +fragment usePublicationAnalytics_collection on Collection { + id + googleAnalyticsId + __typename +} + +fragment CollectionMetadata_collection on Collection { + avatar { + id + focusPercentX + focusPercentY + originalHeight + originalWidth + __typename + } + creator { + id + twitterScreenName + ...userUrl_user + __typename + } + description + domain + facebookPageId + name + tags + twitterUsername + createdAt + ptsQualifiedAt + customDomainState { + live { + status + isSubdomain + __typename + } + __typename + } + ...collectionUrl_collection + ...CollectionJsonLd_collection + __typename + id +} + +fragment CollectionJsonLd_collection on Collection { + id + logo { + ...PrepareLogoForJsonLd_imageMetadata + __typename + id + } + avatar { + id + focusPercentX + focusPercentY + originalHeight + originalWidth + __typename + } + domain + name + ...collectionUrl_collection + __typename +} + +fragment PrepareLogoForJsonLd_imageMetadata on ImageMetadata { + id + originalWidth + originalHeight + __typename +} +""" } + @register_scraper class MediumScraper(BaseScraper): - url = "https://medium.com/feed/snowflake" + url = "https://medium.com/_/graphql" def __init__(self, *args, **kwargs): super(MediumScraper, self).__init__(*args, **kwargs) self.data = {} self.after = datetime(1970, 1, 1, tzinfo=timezone.utc) + def make_request(self, query_vars): + response = requests.post(self.url, json=query_vars) + post_data = response.json()["data"]["collection"]["homepagePostsConnection"] + paging_info = post_data['pagingInfo'] + return post_data["posts"], paging_info + def scrape(self): print("Scraping Medium") - for entry in feedparser.parse(MediumScraper.url)["entries"]: - updated = string_to_datetime(entry["updated"]) - if updated > self.after: - self.data[entry["link"]] = { - "title": entry["title"], - "published": string_to_datetime(entry["published"]), - "updated": updated, - } + query_vars = LONG_QUERY | { + "variables": { + "homepagePostsLimit": 25, + "includeDistributedResponses": False, + "collectionId": "34b6daafc07", + "homepagePostsFrom": "0" + } + } + + while True: + posts, paging_info = self.make_request(query_vars) + + for post in posts: + if post["visibility"] == "PUBLIC": + self.data[post["mediumUrl"]] = { + "title": post["title"], + "published": unix_to_datetime_utc(post["firstPublishedAt"]), + "updated": unix_to_datetime_utc(post["latestPublishedAt"]) + } + + if paging_info is None: + break + + query_vars['variables']['homepagePostsFrom'] = paging_info['next']['from'] + query_vars['variables']['homepagePostsLimit'] = paging_info['next']['limit'] + return self.data def transform(self): diff --git a/snowscraper/scrapers/quickstarts.py b/snowscraper/scrapers/quickstarts.py deleted file mode 100644 index 0f8b329..0000000 --- a/snowscraper/scrapers/quickstarts.py +++ /dev/null @@ -1,53 +0,0 @@ -from datetime import datetime -from datetime import timezone - -import scrapy -from scrapy import signals -from scrapy.crawler import CrawlerProcess -from scrapy.signalmanager import dispatcher - -from ..controller import register_scraper -from ..scraper import BaseScraper -from snowscraper.helpers import string_to_datetime - -QuickStartsURL = "https://quickstarts.snowflake.com/" - - -@register_scraper -class QuickstartScraper(BaseScraper, scrapy.Spider): - name = "snowflakespider" - - def __init__(self, *args, **kwargs): - super(QuickstartScraper, self).__init__(*args, **kwargs) - self.data = {} - self.after = datetime(1970, 1, 1, tzinfo=timezone.utc) - - def start_requests(self): - yield scrapy.Request(url=QuickStartsURL, callback=self.parse) - - def signal_handler(self, signal, sender, item, response, spider): - self.data[item["key"]] = item - self.data[item["key"]].pop("key") - - def scrape(self): - print("Scraping Quickstarts") - dispatcher.connect(self.signal_handler, signal=signals.item_scraped) - process = CrawlerProcess({"LOG_LEVEL": "ERROR"}) - process.crawl(QuickstartScraper, after=self.after) - process.start() - return self.data - - def parse(self, response): - for card in response.css("card-sorter#cards > a.codelab-card"): - updated = string_to_datetime(card.attrib["data-updated"]) - if updated > self.after: - key = QuickStartsURL.rstrip("/") + card.attrib["href"] - yield { - "key": key, - "title": card.attrib["data-title"], - "updated": updated, - "tags": card.attrib["data-tags"], - } - - def transform(self): - return self.data