首页 > 解决方案 > 在 Scrapy 中模拟 POST 请求时使用哪些请求标头?

问题描述

我想从这个网站上抓取一些信息:https://londonreal.tv/e/

该页面采用无限滚动,通过 POST 请求来加载新数据。我想使用 Scrapy 手动构造并发出这个请求,以访问第一页之后各页上的内容。但结果得到的是 400 响应状态码,即错误请求(Bad Request)。有人可以给我一个提示,我的代码中可能有什么问题吗?也许我应该发送不同的请求标头?目前,我已在请求中包含了在开发人员控制台中能看到的所有标头。

这是我的代码:

# Follow links to article pages.
# The [::2] slice keeps every second href returned by the selector
# (presumably each article is linked twice on the page -- TODO confirm).
    for href in response.css('article a::attr(href)').extract()[::2]:
        yield response.follow(href, self.parse_article, headers=self.headers, meta={'dont_redirect': True})

    # Take the text of the inline <style> element and slice it from index 1 up
    # to (not including) the first "."; the result is sent as 'random_id' in
    # the form data further down.
    # NOTE(review): extract_first() yields None when the selector matches
    # nothing, and str.index() raises ValueError when no "." is present;
    # neither case is handled here -- confirm the element always exists.
    style_text = response.css('aside.ul-custom-css-inline style::text').extract_first()
    dot_index = style_text.index(".")
    random_id = style_text[1:dot_index]

    header = { ':authority': 'londonreal.tv', ':path': '/wp-admin/admin-ajax.php', ':scheme': 'https', 'origin': 'https://londonreal.tv', ':method': 'POST',
        'referer': 'https://londonreal.tv/e/', 'x-requested-with': 'XMLHttpRequest', 'content-type': 'application/x-www-form-urlencoded; charset=UTF-8', 
        'content-length': '6010', 'accept': 'text/html, */*; q=0.01', 'accept-encoding': 'gzip, deflate, br', 'accept-language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7', 
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
        'cookie': '__cfduid=d8904c21e1fef19c4a14e9f922648c9631530003050; ajs_user_id=null; ajs_group_id=null; ajs_anonymous_id=%2227b2bd7b-a5fb-4255-ab8a-dc503212c0b4%22; _ga=GA1.2.1470534344.1530003059; gs_v_GSN-230219-G=; ct-ultimate-gdpr-cookie=eyJjb25zZW50X2RlY2xpbmVkIjpmYWxzZSwiY29uc2VudF9leHBpcmVfdGltZSI6MTU2MTU2MzE3OSwiY29uc2VudF9sZXZlbCI6MiwiY29uc2VudF90aW1lIjoxNTMwMDI3MTc5fQ%3D%3D; wisepops=%7B%22cross_subdomain%22%3Atrue%2C%22last_req_date%22%3Anull%2C%22popins%22%3A%7B%22116719%22%3A%7B%22display_count%22%3A1%2C%22display_date%22%3A%222018-06-26T15%3A34%3A16.903Z%22%7D%2C%22116876%22%3A%7B%22display_count%22%3A2%2C%22display_date%22%3A%222018-06-26T15%3A33%3A24.723Z%22%7D%2C%22124130%22%3A%7B%22display_count%22%3A1%2C%22display_date%22%3A%222018-07-04T11%3A27%3A31.617Z%22%7D%7D%2C%22ucrn%22%3A28%2C%22uid%22%3A%2210327%22%2C%22version%22%3A3%7D; gs_v_GSN-230219-G=; gs_u_GSN-230219-G=563193887e842c22fd7937b8695b14b1:8047:10685:1532507562214; _gid=GA1.2.1810722376.1532630788; PHPSESSID=df0b1537fb2f3723aa9a8e92657462a3; __unam=767b664-1643cbace12-79ef2868-7; wisepops_visits=%5B%222018-07-27T17%3A45%3A59.594Z%22%2C%222018-07-27T17%3A45%3A40.830Z%22%2C%222018-07-27T17%3A41%3A50.092Z%22%2C%222018-07-27T16%3A15%3A53.673Z%22%2C%222018-07-27T15%3A51%3A15.844Z%22%2C%222018-07-26T19%3A43%3A37.413Z%22%2C%222018-07-26T19%3A09%3A53.095Z%22%2C%222018-07-26T18%3A46%3A26.617Z%22%2C%222018-07-25T08%3A55%3A11.462Z%22%2C%222018-07-25T08%3A50%3A00.071Z%22%2C%222018-07-25T08%3A32%3A41.795Z%22%2C%222018-07-25T08%3A31%3A26.439Z%22%2C%222018-07-25T08%3A29%3A59.935Z%22%2C%222018-07-25T08%3A25%3A22.776Z%22%2C%222018-07-25T08%3A04%3A19.861Z%22%2C%222018-07-25T08%3A02%3A10.582Z%22%2C%222018-07-25T08%3A01%3A55.066Z%22%2C%222018-07-25T07%3A59%3A13.806Z%22%2C%222018-07-04T11%3A27%3A30.620Z%22%5D; 
wisepops_session=%7B%22arrivalOnSite%22%3A%222018-07-27T17%3A45%3A59.594Z%22%2C%22mtime%22%3A%222018-07-27T17%3A45%3A59.681Z%22%2C%22pageviews%22%3A1%2C%22popins%22%3A%7B%7D%2C%22src%22%3Anull%2C%22utm%22%3A%7B%7D%7D; gs_u_GSN-230219-G=563193887e842c22fd7937b8695b14b1:8595:11064:1532713560352; _drip_client_4297898=vid%253Df8e5b7505b4b01369c9b0e97d2a2a872%2526pageViews%253D29%2526sessionPageCount%253D3%2526lastVisitedAt%253D1532713560749%2526weeklySessionCount%253D4%2526lastSessionAt%253D1532713311365' }

    formdata = { 'query_params[post_types]': 'post', 'query_params[i_attachment]': '', 'query_params[taxonomies]': 'category', 'query_params[multi_post_types]': '',
                 'query_params[multi_taxonomies]': '', 'query_params[query_types]': '0', 'query_params[i_taxonomies]': '509', 'query_params[e_taxonomies]': '',
                 'query_params[i_ids]': '', 'query_params[cq_operator]': '0', 'query_params[e_ids]': '', 'query_params[query_author]': '', 'query_params[query_offset]': '',
                 'query_params[query_include_children]': '1', 'query_params[today_post]': '0', 'query_params[datetime_meta]': '','query_params[woo_query]': '0', 
                 'query_params[post_count]': '1000', 'query_params[posts_per_page]': '16', 'filter': '','order':'DESC','orderby': 'date','sub_opt_query[meta_key_query]': '',
                 'sub_opt_query[paged]': '2','sub_opt_query[first_query]': 'off','sub_opt_query[total_pages]': '29','sub_opt_query[items_last_page]': '8',
                 'sub_opt_query[query_operator]': '', 'sub_opt_query[query_relation]': '', 'options[layout_style]': '0', 'options[button_gallery_name]': 'Gallery',
                 'options[grid_style]': '0', 'options[list_style]': '0','options[carousel_t_style]': '0','options[carousel_f_style]': '0','options[creative_style]': '0',
                 'options[timeline_style]': '0','options[block_content_style]': '0','options[sync_slider_style]': '0','options[grid_masonry]': '0',
                 'options[show_arrows]': '1','options[arrows_outside]': '0','options[show_dots]': '1','options[infinite]': '1','options[autoplay]': '1',
                 'options[autoplayspeed]': '5000','options[scrollperpage]': '1','options[speed]': '500','options[centermode]': '0','options[sync_slider_height_d]': '', 
                 'options[sync_slider_height_t]': '', 'options[s_cb_categories]': '', 'options[s_categories_target]': '0',
                 'options[sync_slider_height_m]': '','options[sync_slider_width_d]': '','options[sync_slider_width_t]': '','options[sync_slider_width_m]': '',
                 'options[show_elements]': '0','options[av_content]': '0','options[cc_mobile]': '0','options[cc_portrait_tablet]': '0','options[cc_landscape_tablet]': '3',
                 'options[cc_small_desktop]': '4','options[cc_medium_desktop]': '4','options[cc_large_desktop]': '5','options[cc_extra_large_desktop]': '6',
                 'options[image_size]': '400x225_ul_grid_16_9_1x', 'options[image_size_s]': 'thumbnail', 'options[s_image]': '1', 'options[s_image_link]': '1', 
                 'options[s_image_link_target]': '0','options[s_icon_lightbox_video]': '0','options[video_url_meta]': '0','options[video_url_meta_key]': '',
                 'options[s_icon_lightbox_image]': '0', 'options[s_icon_link]': '0','options[s_icon_link_target]': '0', 'options[s_image_hover_effect]': '2',
                 'options[s_overlay_hover_effect]': 'ultimate-layouts-hover-css-sweep-to-right', 'options[s_overlay_settings]': '0',
                 'options[s_overlay_color]': '','options[s_image_post_format]': '1','options[s_image_post_format_pos]': 'ul-top-right',
                 'options[s_image_avatar]': '0','options[s_title]': '1','options[s_title_limit]': '0','options[s_title_link]': '1','options[s_title_link_target]': '0',
                 'options[s_excerpt]': '0','options[s_excerpt_rbtn]': '0','options[s_excerpt_f]': 'get_the_excerpt','options[s_excerpt_sc]': '1',
                 'options[s_excerpt_sh]': '1','options[s_excerpt_length]': '0','options[s_categories]': '0','options[s_s_categories]': '0',
                 'options[s_s_categories_parent]': '0','options[ex_items_taxonomies]': '','options[s_c_categories]': '0','options[s_ct_categories]': '',                 
                'options[s_metas_o]': '0','options[s_metas_o_author]': '1','options[s_metas_t_readmore]': '1', 'options[woo_show_cart]': '0',
                'options[s_metas_o_author_avatar]': '0','options[s_metas_o_time]': '1','options[time_format]': 'F j, Y','options[s_metas_o_comment]': '1',
                'options[s_metas_o_like]': '0','options[s_metas_o_share]': '0','options[custom_meta_o]': '','options[s_metas_t]': '0',
                'options[s_metas_t_author]': '0','options[s_metas_t_author_avatar]': '0','options[s_metas_t_time]': '0','options[time_format_t]': 'F j, Y',
                'options[s_metas_t_comment]': '0','options[s_metas_t_like]': '1','options[s_metas_t_share]': '1','options[custom_meta_t]': '',
                'options[s_metas_t_readmore_link_target]': '0','options[share_text]': '','options[read_more_text]': '','options[before_author_text]': '',
                'options[pagination]': '3','options[loadmore_text]': '','options[prev_text]': '','options[next_text]': '','options[animate]': 'default',
                'options[lazyload]': '0','options[lazyload_p]': '','options[geodirectory_rating]': '0','options[quick_view]': '0','options[quick_view_mode]': '0',
                'options[extra_class]': '','options[rnd_id]': 'ul76912','options[s_title_small]': '1','options[s_title_limit_small]': '0','options[s_title_link_small]': '1',
                'options[s_title_link_target_small]': '0','options[s_categories_small]': '1','options[s_s_categories_small]': '0','options[s_s_categories_parent_small]':'0',
                'options[ex_items_taxonomies_small]': '','options[s_c_categories_small]': '0','options[s_ct_categories_small]': '','options[s_cb_categories_small]': '',
                'options[s_categories_target_small]': '0','options[s_metas_o_small]': '1','options[s_metas_o_author_small]': '1','options[s_metas_o_author_avatar_small]':'0',
                'options[s_metas_o_time_small]': '1','options[time_format_small]': 'F j, Y','options[s_metas_o_comment_small]': '1','options[s_metas_o_like_small]': '0',
                'options[s_metas_o_share_small]': '0','options[custom_meta_o_small]': '','options[woo_show_price]': '0','options[woo_show_rating]': '0',
                'options[qv_s_title]': '1','options[qv_s_categories]': '1','options[qv_s_s_categories]': '0','options[qv_s_s_categories_parent]': '0',
                'options[qv_ex_items_taxonomies]': '','options[qv_s_c_categories]': '0','options[qv_s_ct_categories]': '','options[qv_s_cb_categories]': '',
                'options[qv_s_categories_target]': '0','options[qv_s_metas_o]': '1','options[qv_s_metas_o_author]': '1','options[qv_s_metas_o_author_avatar]': '0',
                'options[qv_s_metas_o_time]': '1','options[qv_time_format]': 'F j, Y', 'options[qv_s_metas_o_comment]': '1','options[qv_s_metas_o_like]': '1',
                'options[qv_custom_meta_o]': '','options[qv_show_content]': '1','options[qv_content_stripsc]': '0','options[qv_show_share]': '1',
                'options[qv_woo_show_rating]': '1','options[qv_s_featured_image]': '1','options[goo_ads_client]': '','options[goo_ads_id]': '',
                'options[goo_ads_offset]': '1','options[css_class]': '','random_id': random_id,'action': 'ultimatelayoutsajaxaction' }


    # follow pagination links
    #paging_data = json.loads(response.text.split("while(1);</x>")[1])['payload']['paging']

    # NOTE(review): formdata is a dict, so `formdata != 0` is always true and
    # the request below is yielded on every call.
    if formdata != 0:
        # NOTE(review): the body is JSON-encoded here, while `header` declares
        # 'content-type': 'application/x-www-form-urlencoded; charset=UTF-8'
        # and a fixed 'content-length' of '6010'. This mismatch is a likely
        # cause of the 400 response; scrapy.FormRequest(url, formdata=formdata)
        # would url-encode the fields instead -- confirm.
        # NOTE(review): `header` also carries HTTP/2 pseudo-header keys
        # (':authority', ':path', ':scheme', ':method'); presumably these were
        # copied from the browser dev tools and should not be sent as regular
        # request headers -- verify.
        yield scrapy.Request('https://londonreal.tv/wp-admin/admin-ajax.php', method='POST', body=json.dumps(formdata), headers=header, callback=self.parse)

标签: ajax http post web-scraping scrapy

解决方案


推荐阅读