python - 在第 6/7 页后处理 Cookie 弹出窗口
问题描述
在本站一些成员的帮助下,我已经构建了一个抓取房地产数据的网络爬虫。
它运行得很好,但在爬到第 6、7 页或更靠后的页面时,会弹出一个典型的 cookie 提示窗口,这似乎破坏了我输出到 CSV 文件中的内容。
有没有办法处理弹出窗口?
from selenium import webdriver
from bs4 import BeautifulSoup
import re
import time
import requests
import pandas as pd
#open('output.csv', 'w').close()
# Module-level Chrome session shared by the scraper below; the argument is
# the local path to the chromedriver executable.
browser = webdriver.Chrome('C:/Users/907133/Pythonstuff/chromedriver')
# Place the browser window at screen coordinates (0, 0).
browser.set_window_position(0,0)
def _parse_listing(huis):
    """Return [street, address, price, pricetag] for one 'property-info' card."""
    # Only the first three whitespace-separated tokens of the heading are kept.
    heading = huis.find('h2').get_text(separator='\r\n', strip=True)
    street = ' '.join(heading.split()[:3])
    address = huis.find('div').find('div').text.strip()
    price_text = huis.find('div', {'class': 'price-info'}).find('div').text.strip()
    # Keep the digits only, dropping currency symbols and separators.
    price = ''.join(re.findall(r'\d', price_text))
    pricetag = huis.find('div', {'class': 'property-price'}).find('span').text.strip()
    return [street, address, price, pricetag]


def _parse_details(page_source):
    """Return [soort, bouwjaar, woonoppervlakte, inhoud, perceel] from a detail page.

    Missing values are filled with 'Unknown' so every row has five fields,
    even when the 'kenmerken' tab is absent or has fewer than five cells.
    """
    kenmerken = BeautifulSoup(page_source, 'html.parser')
    details = kenmerken.find_all('div', {'class': 'detail-tab-content kenmerken'})
    cells = details[0].find_all('td', {'class': 'value'}) if details else []
    values = [cell.get_text(separator='\n', strip=True) for cell in cells[:5]]
    values.extend(['Unknown'] * (5 - len(values)))
    return values


def jaap_spider(max_pages):
    """Scrape jaap.nl Amsterdam listings and write them to 'output.csv'.

    Visits result pages 1..max_pages with the module-level ``browser``,
    reads every listing card on the page, opens each listing's detail page
    for its characteristics, and finally writes one CSV row per listing.

    Args:
        max_pages: Number of the last result page to visit (inclusive).
    """
    columns = ['street', 'address', 'price', 'pricetag',
               'soort', 'bouwjaar', 'woonoppervlakte', 'inhoud', 'perceel']
    # Rows are collected across ALL pages; re-creating the lists inside the
    # loop would leave only the last page in the CSV.
    rows = []
    for page in range(1, max_pages + 1):
        url = 'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{}'.format(page)
        browser.get(url)
        time.sleep(15)
        # Dismiss the cookie banner if it showed up, then give the page a
        # moment to settle before reading its source.
        cookie_buttons = browser.find_elements_by_xpath("//a[@class='CookiesOK']")
        if cookie_buttons:
            cookie_buttons[0].click()
            time.sleep(5)
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        info = soup.find_all('div', {'class': 'property-info'})
        # The original passed {'href'} as a third positional argument, which
        # BeautifulSoup reads as ``recursive``; it filtered nothing.
        inside = soup.find_all('a', {'class': 'property-inner'})

        listing_rows = [_parse_listing(huis) for huis in info]

        detail_rows = []
        for item in inside:
            href = item.get('href')
            if not href:
                # Keep the row count in step with the listing cards.
                detail_rows.append(['Unknown'] * 5)
                continue
            browser.get(href)
            detail_rows.append(_parse_details(browser.page_source))

        # Pair cards with their detail rows per page, so a mismatch on one
        # page cannot shift the rows of later pages.
        rows.extend(a + b for a, b in zip(listing_rows, detail_rows))

    # Transform to a pandas DataFrame and export as CSV.
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv('output.csv', index=False)
# Run the scraper with max_pages=15.
jaap_spider(15)
网站中的 cookie 脚本:
(function () {
  // NOTE(review): this script was pasted minified and was damaged by
  // translation/markup stripping (e.g. a translated `try` keyword and cookie
  // attribute, and what look like lost "*" characters). The tokens below are
  // reconstructed from what the syntax requires; confirm against the live script.

  // Wrap a Web Storage object with get/set/remove that honour an expiry date.
  function makeExpiringStore(storage) {
    return {
      get: function (key) {
        var entry = JSON.parse(storage.getItem(key));
        if (!entry || Date.parse(entry.expires) <= new Date().getTime()) {
          // Missing or expired: drop it and report nothing stored.
          storage.removeItem(key);
          return null;
        }
        return entry.value;
      },
      set: function (key, value, expires) {
        var entry = { value: value, expires: expires.toUTCString() };
        storage.setItem(key, JSON.stringify(entry));
      },
      remove: function (key) {
        storage.removeItem(key);
      }
    };
  }

  // Handler for one key: which backend it uses, which origins may read it,
  // which origins may write/remove it, and whether the key is used as-is
  // (otherwise ":" + siteId is appended).
  function CommandHandler(backendName, readOrigins, writeOrigins, useBareKey) {
    this.parseCommand = function (command, origin) {
      var backend = backends[backendName],
          action = command.action,
          key = command.key,
          messageId = command.messageId,
          storageKey = useBareKey ? key : key + ":" + command.siteId,
          value = command.value,
          // Lifetime in minutes; falls back to expiresDays (default 365) days.
          lifetimeMinutes = command.expiresMinutes || 1440 * (command.expiresDays || 365),
          expires = new Date();
      expires.setTime(expires.getTime() + 6E4 * lifetimeMinutes);

      // Send the looked-up value back to the parent window.
      function reply() {
        var payload = JSON.stringify({ messageId: messageId, value: value || false });
        window.parent.postMessage(payload, "*");
      }

      var allowedOrigins = {
        _hjSet: writeOrigins,
        _hjGet: readOrigins,
        _hjRemove: writeOrigins
      }[action] || [];
      if (!(0 <= allowedOrigins.indexOf("*") || 0 <= allowedOrigins.indexOf(origin))) {
        throw Error("Command " + action + " not allowed on key: " + key);
      }

      switch (action) {
        case "_hjSet":
          backend.set(storageKey, value, expires);
          break;
        case "_hjGet":
          value = backend.get(storageKey);
          reply();
          break;
        case "_hjRemove":
          backend.remove(storageKey);
      }
    };
  }

  // "message" event listener: parse the payload and dispatch it to the
  // handler registered for its key; malformed or disallowed messages are ignored.
  function onMessage(event) {
    try {
      var command = JSON.parse(event.data);
      command.key && handlers[command.key] && handlers[command.key].parseCommand(command, event.origin);
    } catch (err) {
      return null;
    }
  }

  var backends;
  try {
    backends = {
      cookie: {
        get: function (name) {
          var match = RegExp("(?:^|; )" + name + "=([^;]*)").exec(document.cookie);
          return match ? match[1] : void 0;
        },
        set: function (name, value, expires) {
          document.cookie = name + "=" + value + "; path=/; expires=" + expires.toUTCString();
        },
        remove: function (name) {
          // A date in the past makes the browser delete the cookie.
          document.cookie = name + "=; expires=Tue, 13 Mar 1979 00:00:00 UTC;path=/;";
        }
      },
      localStorage: makeExpiringStore(localStorage),
      sessionStorage: makeExpiringStore(sessionStorage)
    };
  } catch (err) {
    // Storage is unavailable (e.g. blocked by the browser): do nothing.
    return;
  }

  var handlers = {
    // NOTE(review): the pasted source shows only three arguments here; the
    // read-origin list is assumed to be ["*"] like the other entries - confirm.
    _hjOptOut: new CommandHandler(
      "cookie",
      ["*"],
      [
        "http://local.hotjar.com",
        "https://insights-staging.hotjar.com",
        "http://insights-staging.hotjar.com"
      ],
      true
    ),
    grant_consent: new CommandHandler("cookie", ["*"], ["*"], false),
    screenshot_retake: new CommandHandler("localStorage", ["*"], ["*"], false),
    screenshot_active_retake: new CommandHandler("sessionStorage", ["*"], ["*"], false)
  };

  if (window.addEventListener) {
    window.addEventListener("message", onMessage, false);
  } else {
    window.attachEvent("onmessage", onMessage);
  }
})();
解决方案
要解决弹出窗口的问题,只需在页面加载后检查是否出现了弹出窗口;如果出现,就点击其中的按钮。希望对您有所帮助。
# Excerpt of the page loop from jaap_spider, with the cookie banner handled.
page = 1
while page <= max_pages:
    url = 'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{}'.format(page)
    browser.get(url)
    time.sleep(10)
    # Look for the cookie banner's accept link once; an empty list means the
    # banner did not show up on this page load.
    cookie_buttons = browser.find_elements_by_xpath("//a[@class='CookiesOK']")
    if cookie_buttons:
        cookie_buttons[0].click()
        # Give the page a moment to settle after the banner closes.
        time.sleep(5)
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    info = soup.find_all('div', {'class':'property-info'})
    # The stray third argument {'href'} was read as ``recursive`` and
    # filtered nothing, so it is dropped.
    inside = soup.find_all('a', {'class': 'property-inner'})
    # ... the rest of the loop body from the question (row parsing and
    # ``page += 1``) follows here unchanged.
推荐阅读
- google-apps-script - 单元格为空时设置透明背景
- performance - Elasticsearch:include_in_parent - 对性能有什么影响?默认值为 false 的任何原因
- ios - Swift:两个扩展中的相同功能,不同的模块
- javascript - 读取来自应用服务器的 Javascript cookie
- spring-boot - 如何在 Spring Boot Controller 中获取多个模型对象
- java - DataOutputStream 只写一个字符串 Java
- java - 对数组进行排序并找到重复的数字
- java - 这是简单的抽象演示代码,我无法创建suresh类的对象,有人可以帮我解决这里出了什么问题
- javascript - 按 id 数组多维求和
- angular - 使用 Angular 5 在组件 + 服务中测试函数 submit()