From d044f849ea9a100d5603b78f4168cb996ebad17d Mon Sep 17 00:00:00 2001 From: hackish Date: Tue, 19 Sep 2023 12:12:19 -0700 Subject: [PATCH] Reduce GraphQL query, add tags --- snowscraper/scrapers/medium.py | 1815 +------------------------------- 1 file changed, 7 insertions(+), 1808 deletions(-) diff --git a/snowscraper/scrapers/medium.py b/snowscraper/scrapers/medium.py index fe91c88..0bed523 100644 --- a/snowscraper/scrapers/medium.py +++ b/snowscraper/scrapers/medium.py @@ -2,1877 +2,74 @@ from datetime import datetime from datetime import timezone import requests -import feedparser from ..controller import register_scraper from ..helpers import unix_to_datetime_utc from ..scraper import BaseScraper -LONG_QUERY = {"query": """ +LONG_QUERY = """ query PublicationHomepageQuery($collectionId: ID!, $homepagePostsLimit: PaginationLimit = 25, $homepagePostsFrom: String, $includeDistributedResponses: Boolean = false) { collection(id: $collectionId) { - __typename - id ...PublicationHomepage_collection } } fragment PublicationHomepage_collection on Collection { - id - ...PublisherHeader_publisher ...PublisherHomepagePosts_publisher - ...usePublicationAnalytics_collection - ...CollectionMetadata_collection - __typename -} - -fragment PublisherHeader_publisher on Publisher { - id - ...PublisherHeaderBackground_publisher - ...PublisherHeaderNameplate_publisher - ...PublisherHeaderActions_publisher - ...PublisherHeaderNav_publisher - __typename -} - -fragment PublisherHeaderBackground_publisher on Publisher { - __typename - id - customStyleSheet { - ...PublisherHeaderBackground_customStyleSheet - __typename - id - } - ... on Collection { - colorPalette { - tintBackgroundSpectrum { - backgroundColor - __typename - } - __typename - } - isAuroraVisible - legacyHeaderBackgroundImage { - id - originalWidth - focusPercentX - focusPercentY - __typename - } - ...collectionTintBackgroundTheme_collection - __typename - id - } - ...publisherUrl_publisher -} - -fragment PublisherHeaderBackground_customStyleSheet on CustomStyleSheet { - id - global { - colorPalette { - background { - rgb - __typename - } - __typename - } - __typename - } - header { - headerScale - backgroundImageDisplayMode - backgroundImageVerticalAlignment - backgroundColorDisplayMode - backgroundColor { - alpha - rgb - ...getHexFromColorValue_colorValue - ...getOpaqueHexFromColorValue_colorValue - __typename - } - secondaryBackgroundColor { - ...getHexFromColorValue_colorValue - __typename - } - postBackgroundColor { - ...getHexFromColorValue_colorValue - __typename - } - backgroundImage { - id - originalWidth - __typename - } - __typename - } - __typename -} - -fragment getHexFromColorValue_colorValue on ColorValue { - rgb - alpha - __typename -} - -fragment getOpaqueHexFromColorValue_colorValue on ColorValue { - rgb - __typename -} - -fragment collectionTintBackgroundTheme_collection on Collection { - colorPalette { - ...collectionTintBackgroundTheme_colorPalette - __typename - } - customStyleSheet { - id - ...collectionTintBackgroundTheme_customStyleSheet - __typename - } - __typename - id -} - -fragment collectionTintBackgroundTheme_colorPalette on ColorPalette { - ...customTintBackgroundTheme_colorPalette - __typename -} - -fragment customTintBackgroundTheme_colorPalette on ColorPalette { - tintBackgroundSpectrum { - ...ThemeUtil_colorSpectrum - __typename - } - __typename -} - -fragment ThemeUtil_colorSpectrum on ColorSpectrum { - backgroundColor - ...ThemeUtilInterpolateHelpers_colorSpectrum - __typename -} - -fragment ThemeUtilInterpolateHelpers_colorSpectrum on ColorSpectrum { - colorPoints { - ...ThemeUtil_colorPoint - __typename - } - __typename -} - -fragment ThemeUtil_colorPoint on ColorPoint { - color - point - __typename -} - -fragment collectionTintBackgroundTheme_customStyleSheet on CustomStyleSheet { - id - ...customTintBackgroundTheme_customStyleSheet - __typename -} - -fragment customTintBackgroundTheme_customStyleSheet on CustomStyleSheet { - id - global { - colorPalette { - primary { - colorPalette { - ...customTintBackgroundTheme_colorPalette - __typename - } - __typename - } - __typename - } - __typename - } - __typename -} - -fragment publisherUrl_publisher on Publisher { - id - __typename - ... on Collection { - ...collectionUrl_collection - __typename - id - } - ... on User { - ...userUrl_user - __typename - id - } -} - -fragment collectionUrl_collection on Collection { - id - domain - slug - __typename -} - -fragment userUrl_user on User { - __typename - id - customDomainState { - live { - domain - __typename - } - __typename - } - hasSubdomain - username -} - -fragment PublisherHeaderNameplate_publisher on Publisher { - ...PublisherAvatar_publisher - ...PublisherHeaderLogo_publisher - ...PublisherFollowersCount_publisher - __typename -} - -fragment PublisherAvatar_publisher on Publisher { - __typename - ... on Collection { - id - ...CollectionAvatar_collection - __typename - } - ... on User { - id - ...UserAvatar_user - __typename - } -} - -fragment CollectionAvatar_collection on Collection { - name - avatar { - id - __typename - } - ...collectionUrl_collection - __typename - id -} - -fragment UserAvatar_user on User { - __typename - id - imageId - mediumMemberAt - name - username - ...userUrl_user -} - -fragment PublisherHeaderLogo_publisher on Publisher { - __typename - id - customStyleSheet { - id - header { - logoImage { - ...PublisherHeaderLogo_image - __typename - } - appNameColor { - ...getHexFromColorValue_colorValue - __typename - } - appNameTreatment - __typename - } - __typename - } - name - ... on Collection { - isAuroraVisible - logo { - id - originalHeight - originalWidth - __typename - } - __typename - id - } - ... on User { - ...useIsVerifiedBookAuthor_user - __typename - id - } - ...CustomHeaderTooltip_publisher - ...publisherUrl_publisher -} - -fragment PublisherHeaderLogo_image on ImageMetadata { - id - originalHeight - originalWidth - __typename -} - -fragment useIsVerifiedBookAuthor_user on User { - verifications { - isBookAuthor - __typename - } - __typename - id -} - -fragment CustomHeaderTooltip_publisher on Publisher { - __typename - id - customStyleSheet { - id - header { - appNameTreatment - nameTreatment - __typename - } - __typename - } - ... on Collection { - isAuroraVisible - slug - __typename - id - } -} - -fragment PublisherFollowersCount_publisher on Publisher { - id - __typename - id - ... on Collection { - slug - subscriberCount - ...collectionUrl_collection - __typename - id - } - ... on User { - socialStats { - followerCount - __typename - } - username - ...userUrl_user - __typename - id - } -} - -fragment PublisherHeaderActions_publisher on Publisher { - __typename - ...MetaHeaderPubMenu_publisher - ... on Collection { - ...CollectionFollowButton_collection - __typename - id - } - ... on User { - ...FollowAndSubscribeButtons_user - __typename - id - } -} - -fragment MetaHeaderPubMenu_publisher on Publisher { - __typename - ... on Collection { - ...MetaHeaderPubMenu_publisher_collection - __typename - id - } - ... on User { - ...MetaHeaderPubMenu_publisher_user - __typename - id - } -} - -fragment MetaHeaderPubMenu_publisher_collection on Collection { - id - slug - name - domain - newsletterV3 { - slug - __typename - id - } - ...MutePopoverOptions_collection - __typename -} - -fragment MutePopoverOptions_collection on Collection { - id - __typename -} - -fragment MetaHeaderPubMenu_publisher_user on User { - id - username - ...MutePopoverOptions_creator - __typename -} - -fragment MutePopoverOptions_creator on User { - id - __typename -} - -fragment CollectionFollowButton_collection on Collection { - __typename - id - name - slug - ...collectionUrl_collection - ...SusiClickable_collection -} - -fragment SusiClickable_collection on Collection { - ...SusiContainer_collection - __typename - id -} - -fragment SusiContainer_collection on Collection { - name - ...SignInOptions_collection - ...SignUpOptions_collection - __typename - id -} - -fragment SignInOptions_collection on Collection { - id - name - __typename -} - -fragment SignUpOptions_collection on Collection { - id - name - __typename -} - -fragment FollowAndSubscribeButtons_user on User { - ...UserFollowButton_user - ...UserSubscribeButton_user - __typename - id -} - -fragment UserFollowButton_user on User { - ...UserFollowButtonSignedIn_user - ...UserFollowButtonSignedOut_user - __typename - id -} - -fragment UserFollowButtonSignedIn_user on User { - id - name - __typename -} - -fragment UserFollowButtonSignedOut_user on User { - id - ...SusiClickable_user - __typename -} - -fragment SusiClickable_user on User { - ...SusiContainer_user - __typename - id -} - -fragment SusiContainer_user on User { - ...SignInOptions_user - ...SignUpOptions_user - __typename - id -} - -fragment SignInOptions_user on User { - id - name - __typename -} - -fragment SignUpOptions_user on User { - id - name - __typename -} - -fragment UserSubscribeButton_user on User { - id - isPartnerProgramEnrolled - name - viewerEdge { - id - isFollowing - isUser - __typename - } - viewerIsUser - newsletterV3 { - id - ...useNewsletterV3Subscription_newsletterV3 - __typename - } - ...useNewsletterV3Subscription_user - ...MembershipUpsellModal_user - __typename -} - -fragment useNewsletterV3Subscription_newsletterV3 on NewsletterV3 { - id - type - slug - name - collection { - slug - __typename - id - } - user { - id - name - username - newsletterV3 { - id - __typename - } - __typename - } - __typename -} - -fragment useNewsletterV3Subscription_user on User { - id - username - newsletterV3 { - ...useNewsletterV3Subscription_newsletterV3 - __typename - id - } - __typename -} - -fragment MembershipUpsellModal_user on User { - id - name - imageId - postSubscribeMembershipUpsellShownAt - newsletterV3 { - id - __typename - } - __typename -} - -fragment PublisherHeaderNav_publisher on Publisher { - __typename - id - customStyleSheet { - navigation { - navItems { - name - ...PublisherHeaderNavLink_headerNavigationItem - __typename - } - __typename - } - __typename - id - } - ...PublisherHeaderNavLink_publisher - ... on Collection { - domain - isAuroraVisible - slug - navItems { - tagSlug - title - url - __typename - } - __typename - id - } - ... on User { - customDomainState { - live { - domain - __typename - } - __typename - } - hasSubdomain - username - homePostsPublished: homepagePostsConnection(paging: {limit: 1}) { - posts { - id - __typename - } - __typename - } - ...useIsVerifiedBookAuthor_user - __typename - id - } -} - -fragment PublisherHeaderNavLink_headerNavigationItem on HeaderNavigationItem { - href - name - tags { - id - normalizedTagSlug - __typename - } - type - __typename -} - -fragment PublisherHeaderNavLink_publisher on Publisher { - __typename - id - ... on Collection { - slug - __typename - id - } } fragment PublisherHomepagePosts_publisher on Publisher { - __typename - id homepagePostsConnection( paging: {limit: $homepagePostsLimit, from: $homepagePostsFrom} includeDistributedResponses: $includeDistributedResponses ) { posts { - inResponseToPostResult { - __typename - } - ...WithResponsesSidebar_post ...PostPreview_post - __typename } pagingInfo { next { from limit - __typename } - __typename } - __typename } - ...CardByline_publisher - ...NewsletterV3Promo_publisher - ...PublisherHomepagePosts_user -} - -fragment WithResponsesSidebar_post on Post { - id - ...ThreadedResponsesSidebar_post - __typename -} - -fragment ThreadedResponsesSidebar_post on Post { - id - ...ThreadedResponsesSidebarContent_post - __typename -} - -fragment ThreadedResponsesSidebarContent_post on Post { - id - postResponses { - count - __typename - } - collection { - id - viewerEdge { - id - isEditor - __typename - } - __typename - } - creator { - id - __typename - } - ...ThreadedReplies_post - __typename -} - -fragment ThreadedReplies_post on Post { - __typename - id - ...ThreadedReply_post -} - -fragment ThreadedReply_post on Post { - __typename - id - ...ReadOrEditSimpleResponse_post - ...StoryResponse_post -} - -fragment ReadOrEditSimpleResponse_post on Post { - __typename - id - ...SimpleResponse_post -} - -fragment SimpleResponse_post on Post { - id - ...ResponseHeader_post - __typename -} - -fragment ResponseHeader_post on Post { - __typename - id - createdAt - firstPublishedAt - latestPublishedAt - creator { - id - name - ...UserAvatar_user - ...useIsVerifiedBookAuthor_user - ...UserMentionTooltip_user - __typename - } - ...ResponsePopoverMenu_post -} - -fragment UserMentionTooltip_user on User { - id - name - username - bio - imageId - mediumMemberAt - ...UserAvatar_user - ...UserFollowButton_user - ...useIsVerifiedBookAuthor_user - __typename -} - -fragment ResponsePopoverMenu_post on Post { - id - ...ReportUserMenuItem_post - ...HideResponseMenuItem_post - ...BlockUserMenuItem_post - ...UndoClapsMenuItem_post - __typename -} - -fragment ReportUserMenuItem_post on Post { - __typename - id - creator { - id - __typename - } - ...SusiClickable_post -} - -fragment SusiClickable_post on Post { - id - mediumUrl - ...SusiContainer_post - __typename -} - -fragment SusiContainer_post on Post { - id - __typename -} - -fragment HideResponseMenuItem_post on Post { - __typename - id - collection { - id - viewerEdge { - id - isEditor - __typename - } - __typename - } - creator { - id - __typename - } -} - -fragment BlockUserMenuItem_post on Post { - __typename - id - creator { - id - __typename - } -} - -fragment UndoClapsMenuItem_post on Post { - id - clapCount - __typename -} - -fragment StoryResponse_post on Post { - id - ...ResponseHeader_post - __typename } fragment PostPreview_post on Post { - id - creator { - ...PostPreview_user - __typename - id - } - collection { - ...CardByline_collection - ...ExpandablePostByline_collection - __typename - id - } - ...InteractivePostBody_postPreview firstPublishedAt - isLocked - isSeries latestPublishedAt - inResponseToCatalogResult { - __typename - } - pinnedAt - pinnedByCreatorAt - previewImage { - id - focusPercentX - focusPercentY - __typename - } - readingTime - sequence { - slug - __typename - } title uniqueSlug - ...CardByline_post ...PostFooterActionsBar_post - ...InResponseToEntityPreview_post - ...PostScrollTracker_post ...HighDensityPreview_post - __typename -} - -fragment PostPreview_user on User { - __typename - name - username - ...CardByline_user - ...ExpandablePostByline_user - id -} - -fragment CardByline_user on User { - __typename - id - name - username - mediumMemberAt - socialStats { - followerCount - __typename - } - ...useIsVerifiedBookAuthor_user - ...userUrl_user - ...UserMentionTooltip_user -} - -fragment ExpandablePostByline_user on User { - __typename - id - name - imageId - ...userUrl_user - ...useIsVerifiedBookAuthor_user -} - -fragment CardByline_collection on Collection { - name - ...collectionUrl_collection - __typename - id -} - -fragment ExpandablePostByline_collection on Collection { - __typename - id - name - domain - slug -} - -fragment InteractivePostBody_postPreview on Post { - extendedPreviewContent( - truncationConfig: {previewParagraphsWordCountThreshold: 400, minimumWordLengthForTruncation: 150, truncateAtEndOfSentence: true, showFullImageCaptions: true, shortformPreviewParagraphsWordCountThreshold: 30, shortformMinimumWordLengthForTruncation: 30} - ) { - bodyModel { - ...PostBody_bodyModel - __typename - } - isFullContent - __typename - } - __typename - id -} - -fragment PostBody_bodyModel on RichText { - sections { - name - startIndex - textLayout - imageLayout - backgroundImage { - id - originalHeight - originalWidth - __typename - } - videoLayout - backgroundVideo { - videoId - originalHeight - originalWidth - previewImageId - __typename - } - __typename - } - paragraphs { - id - ...PostBodySection_paragraph - __typename - } - ...normalizedBodyModel_richText - __typename -} - -fragment PostBodySection_paragraph on Paragraph { - name - ...PostBodyParagraph_paragraph - __typename - id -} - -fragment PostBodyParagraph_paragraph on Paragraph { - name - type - ...ImageParagraph_paragraph - ...TextParagraph_paragraph - ...IframeParagraph_paragraph - ...MixtapeParagraph_paragraph - ...CodeBlockParagraph_paragraph - __typename - id -} - -fragment ImageParagraph_paragraph on Paragraph { - href - layout - metadata { - id - originalHeight - originalWidth - focusPercentX - focusPercentY - alt - __typename - } - ...Markups_paragraph - ...ParagraphRefsMapContext_paragraph - ...PostAnnotationsMarker_paragraph - __typename - id -} - -fragment Markups_paragraph on Paragraph { - name - text - hasDropCap - dropCapImage { - ...MarkupNode_data_dropCapImage - __typename - id - } - markups { - ...Markups_markup - __typename - } - __typename - id -} - -fragment MarkupNode_data_dropCapImage on ImageMetadata { - ...DropCap_image - __typename - id -} - -fragment DropCap_image on ImageMetadata { - id - originalHeight - originalWidth - __typename -} - -fragment Markups_markup on Markup { - type - start - end - href - anchorType - userId - linkMetadata { - httpStatus - __typename - } - __typename -} - -fragment ParagraphRefsMapContext_paragraph on Paragraph { - id - name - text - __typename -} - -fragment PostAnnotationsMarker_paragraph on Paragraph { - ...PostViewNoteCard_paragraph - __typename - id -} - -fragment PostViewNoteCard_paragraph on Paragraph { - name - __typename - id -} - -fragment TextParagraph_paragraph on Paragraph { - type - hasDropCap - codeBlockMetadata { - mode - lang - __typename - } - ...Markups_paragraph - ...ParagraphRefsMapContext_paragraph - __typename - id -} - -fragment IframeParagraph_paragraph on Paragraph { - type - iframe { - mediaResource { - id - iframeSrc - iframeHeight - iframeWidth - title - __typename - } - __typename - } - layout - ...Markups_paragraph - __typename - id -} - -fragment MixtapeParagraph_paragraph on Paragraph { - type - mixtapeMetadata { - href - mediaResource { - mediumCatalog { - id - __typename - } - __typename - } - __typename - } - ...GenericMixtapeParagraph_paragraph - __typename - id -} - -fragment GenericMixtapeParagraph_paragraph on Paragraph { - text - mixtapeMetadata { - href - thumbnailImageId - __typename - } - markups { - start - end - type - href - __typename - } - __typename - id -} - -fragment CodeBlockParagraph_paragraph on Paragraph { - codeBlockMetadata { - lang - mode - __typename - } - __typename - id -} - -fragment normalizedBodyModel_richText on RichText { - paragraphs { - ...normalizedBodyModel_richText_paragraphs - __typename - } - sections { - startIndex - ...getSectionEndIndex_section - __typename - } - ...getParagraphStyles_richText - ...getParagraphSpaces_richText - __typename -} - -fragment normalizedBodyModel_richText_paragraphs on Paragraph { - markups { - ...normalizedBodyModel_richText_paragraphs_markups - __typename - } - codeBlockMetadata { - lang - mode - __typename - } - ...getParagraphHighlights_paragraph - ...getParagraphPrivateNotes_paragraph - __typename - id -} - -fragment normalizedBodyModel_richText_paragraphs_markups on Markup { - type - __typename -} - -fragment getParagraphHighlights_paragraph on Paragraph { - name - __typename - id -} - -fragment getParagraphPrivateNotes_paragraph on Paragraph { - name - __typename - id -} - -fragment getSectionEndIndex_section on Section { - startIndex - __typename -} - -fragment getParagraphStyles_richText on RichText { - paragraphs { - text - type - __typename - } - sections { - ...getSectionEndIndex_section - __typename - } - __typename -} - -fragment getParagraphSpaces_richText on RichText { - paragraphs { - layout - metadata { - originalHeight - originalWidth - id - __typename - } - type - ...paragraphExtendsImageGrid_paragraph - __typename - } - ...getSeriesParagraphTopSpacings_richText - ...getPostParagraphTopSpacings_richText - __typename -} - -fragment paragraphExtendsImageGrid_paragraph on Paragraph { - layout - type - __typename - id -} - -fragment getSeriesParagraphTopSpacings_richText on RichText { - paragraphs { - id - __typename - } - sections { - ...getSectionEndIndex_section - __typename - } - __typename -} - -fragment getPostParagraphTopSpacings_richText on RichText { - paragraphs { - type - layout - text - codeBlockMetadata { - lang - mode - __typename - } - __typename - } - sections { - ...getSectionEndIndex_section - __typename - } - __typename -} - -fragment CardByline_post on Post { - ...DraftStatus_post - ...Star_post - ...shouldShowPublishedInStatus_post - __typename - id -} - -fragment DraftStatus_post on Post { - id - pendingCollection { - id - creator { - id - __typename - } - ...BoldCollectionName_collection - __typename - } - statusForCollection - creator { - id - __typename - } - isPublished - __typename -} - -fragment BoldCollectionName_collection on Collection { - id - name - __typename -} - -fragment Star_post on Post { - id - creator { - id - __typename - } - __typename -} - -fragment shouldShowPublishedInStatus_post on Post { - statusForCollection - isPublished - __typename - id } fragment PostFooterActionsBar_post on Post { - id visibility - allowResponses - postResponses { - count - __typename - } - isLimitedState - creator { - id - __typename - } - collection { - id - __typename - } - ...MultiVote_post ...PostSharePopover_post - ...OverflowMenuButtonWithNegativeSignal_post - ...PostPageBookmarkButton_post - __typename } - -fragment MultiVote_post on Post { - id - creator { - id - ...SusiClickable_user - __typename - } - isPublished - ...SusiClickable_post - collection { - id - slug - __typename - } - isLimitedState - ...MultiVoteCount_post - __typename -} - -fragment MultiVoteCount_post on Post { - id - __typename -} - + fragment PostSharePopover_post on Post { - id mediumUrl - title isPublished - isLocked - ...usePostUrl_post - ...FriendLink_post - __typename -} - -fragment usePostUrl_post on Post { - id - creator { - ...userUrl_user - __typename - id - } - collection { - id - domain - slug - __typename - } - isSeries - mediumUrl - sequence { - slug - __typename - } - uniqueSlug - __typename -} - -fragment FriendLink_post on Post { - id - ...SusiClickable_post - ...useCopyFriendLink_post - __typename -} - -fragment useCopyFriendLink_post on Post { - ...usePostUrl_post - __typename - id -} - -fragment OverflowMenuButtonWithNegativeSignal_post on Post { - id - visibility - ...OverflowMenuWithNegativeSignal_post - __typename -} - -fragment OverflowMenuWithNegativeSignal_post on Post { - id - creator { - id - __typename - } - collection { - id - __typename - } - ...OverflowMenuItemUndoClaps_post - ...AddToCatalogBase_post - __typename -} - -fragment OverflowMenuItemUndoClaps_post on Post { - id - clapCount - ...ClapMutation_post - __typename -} - -fragment ClapMutation_post on Post { - __typename - id - clapCount - ...MultiVoteCount_post -} - -fragment AddToCatalogBase_post on Post { - id - isPublished - __typename -} - -fragment PostPageBookmarkButton_post on Post { - ...AddToCatalogBookmarkButton_post - __typename - id -} - -fragment AddToCatalogBookmarkButton_post on Post { - ...AddToCatalogBase_post - __typename - id -} - -fragment InResponseToEntityPreview_post on Post { - id - inResponseToEntityType - __typename -} - -fragment PostScrollTracker_post on Post { - id - collection { - id - __typename - } - sequence { - sequenceId - __typename - } - __typename } fragment HighDensityPreview_post on Post { - id - title - previewImage { - id - focusPercentX - focusPercentY - __typename - } - extendedPreviewContent( - truncationConfig: {previewParagraphsWordCountThreshold: 400, minimumWordLengthForTruncation: 150, truncateAtEndOfSentence: true, showFullImageCaptions: true, shortformPreviewParagraphsWordCountThreshold: 30, shortformMinimumWordLengthForTruncation: 30} - ) { - subtitle - __typename - } ...HighDensityFooter_post - __typename } fragment HighDensityFooter_post on Post { - id - readingTime tags { ...TopicPill_tag - __typename } - ...BookmarkButton_post - ...ExpandablePostCardOverflowButton_post - ...OverflowMenuButtonWithNegativeSignal_post - __typename } fragment TopicPill_tag on Tag { - __typename - id displayTitle normalizedTagSlug } - -fragment BookmarkButton_post on Post { - visibility - ...SusiClickable_post - ...AddToCatalogBookmarkButton_post - __typename - id -} - -fragment ExpandablePostCardOverflowButton_post on Post { - creator { - id - __typename - } - ...ExpandablePostCardReaderButton_post - __typename - id -} - -fragment ExpandablePostCardReaderButton_post on Post { - id - collection { - id - __typename - } - creator { - id - __typename - } - clapCount - ...ClapMutation_post - __typename -} - -fragment CardByline_publisher on Publisher { - __typename - ... on User { - id - ...CardByline_user - __typename - } - ... on Collection { - id - ...CardByline_collection - __typename - } -} - -fragment NewsletterV3Promo_publisher on Publisher { - __typename - ... on User { - ...NewsletterV3Promo_user - __typename - id - } - ... on Collection { - ...NewsletterV3Promo_collection - __typename - id - } -} - -fragment NewsletterV3Promo_user on User { - id - username - name - viewerEdge { - isUser - __typename - id - } - newsletterV3 { - id - ...NewsletterV3Promo_newsletterV3 - __typename - } - __typename -} - -fragment NewsletterV3Promo_newsletterV3 on NewsletterV3 { - slug - name - description - promoHeadline - promoBody - ...NewsletterSubscribeComponent_newsletterV3 - __typename - id -} - -fragment NewsletterSubscribeComponent_newsletterV3 on NewsletterV3 { - ...NewsletterV3SubscribeButton_newsletterV3 - ...NewsletterV3SubscribeByEmail_newsletterV3 - __typename - id -} - -fragment NewsletterV3SubscribeButton_newsletterV3 on NewsletterV3 { - id - name - slug - type - user { - id - name - username - __typename - } - collection { - slug - ...SusiClickable_collection - ...collectionDefaultBackgroundTheme_collection - __typename - id - } - ...SusiClickable_newsletterV3 - ...useNewsletterV3Subscription_newsletterV3 - __typename -} - -fragment collectionDefaultBackgroundTheme_collection on Collection { - colorPalette { - ...collectionDefaultBackgroundTheme_colorPalette - __typename - } - customStyleSheet { - id - ...collectionDefaultBackgroundTheme_customStyleSheet - __typename - } - __typename - id -} - -fragment collectionDefaultBackgroundTheme_colorPalette on ColorPalette { - ...customDefaultBackgroundTheme_colorPalette - __typename -} - -fragment customDefaultBackgroundTheme_colorPalette on ColorPalette { - highlightSpectrum { - ...ThemeUtil_colorSpectrum - __typename - } - defaultBackgroundSpectrum { - ...ThemeUtil_colorSpectrum - __typename - } - tintBackgroundSpectrum { - ...ThemeUtil_colorSpectrum - __typename - } - __typename -} - -fragment collectionDefaultBackgroundTheme_customStyleSheet on CustomStyleSheet { - id - ...customDefaultBackgroundTheme_customStyleSheet - __typename -} - -fragment customDefaultBackgroundTheme_customStyleSheet on CustomStyleSheet { - id - global { - colorPalette { - primary { - colorPalette { - ...customDefaultBackgroundTheme_colorPalette - __typename - } - __typename - } - background { - colorPalette { - ...customDefaultBackgroundTheme_colorPalette - __typename - } - __typename - } - __typename - } - __typename - } - __typename -} - -fragment SusiClickable_newsletterV3 on NewsletterV3 { - ...SusiContainer_newsletterV3 - __typename - id -} - -fragment SusiContainer_newsletterV3 on NewsletterV3 { - ...SignInOptions_newsletterV3 - ...SignUpOptions_newsletterV3 - __typename - id -} - -fragment SignInOptions_newsletterV3 on NewsletterV3 { - id - name - __typename -} - -fragment SignUpOptions_newsletterV3 on NewsletterV3 { - id - name - __typename -} - -fragment NewsletterV3SubscribeByEmail_newsletterV3 on NewsletterV3 { - id - slug - type - user { - id - name - username - __typename - } - collection { - ...collectionDefaultBackgroundTheme_collection - ...collectionUrl_collection - __typename - id - } - __typename -} - -fragment NewsletterV3Promo_collection on Collection { - id - slug - domain - name - newsletterV3 { - id - ...NewsletterV3Promo_newsletterV3 - __typename - } - __typename -} - -fragment PublisherHomepagePosts_user on User { - id - ...useShowAuthorNewsletterV3Promo_user - __typename -} - -fragment useShowAuthorNewsletterV3Promo_user on User { - id - username - newsletterV3 { - id - showPromo - slug - __typename - } - __typename -} - -fragment usePublicationAnalytics_collection on Collection { - id - googleAnalyticsId - __typename -} - -fragment CollectionMetadata_collection on Collection { - avatar { - id - focusPercentX - focusPercentY - originalHeight - originalWidth - __typename - } - creator { - id - twitterScreenName - ...userUrl_user - __typename - } - description - domain - facebookPageId - name - tags - twitterUsername - createdAt - ptsQualifiedAt - customDomainState { - live { - status - isSubdomain - __typename - } - __typename - } - ...collectionUrl_collection - ...CollectionJsonLd_collection - __typename - id -} - -fragment CollectionJsonLd_collection on Collection { - id - logo { - ...PrepareLogoForJsonLd_imageMetadata - __typename - id - } - avatar { - id - focusPercentX - focusPercentY - originalHeight - originalWidth - __typename - } - domain - name - ...collectionUrl_collection - __typename -} - -fragment PrepareLogoForJsonLd_imageMetadata on ImageMetadata { - id - originalWidth - originalHeight - __typename -} -""" } +""" @register_scraper class MediumScraper(BaseScraper): @@ -1891,7 +88,8 @@ class MediumScraper(BaseScraper): def scrape(self): print("Scraping Medium") - query_vars = LONG_QUERY | { + query_vars = { + "query": LONG_QUERY, "variables": { "homepagePostsLimit": 25, "includeDistributedResponses": False, @@ -1908,7 +106,8 @@ class MediumScraper(BaseScraper): self.data[post["mediumUrl"]] = { "title": post["title"], "published": unix_to_datetime_utc(post["firstPublishedAt"]), - "updated": unix_to_datetime_utc(post["latestPublishedAt"]) + "updated": unix_to_datetime_utc(post["latestPublishedAt"]), + "tags": ",".join(tag["normalizedTagSlug"] for tag in post["tags"]) } if paging_info is None: