python - xPath 似乎没有转到下一页
问题描述
这是我的脚本:
from selenium import webdriver
import time
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from selenium.webdriver.common.keys import Keys
PATH = "driver\chromedriver.exe"
options = webdriver.ChromeOptions()
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1200,900")
options.add_argument('enable-logging')
url = 'https://www.booking.com/index.fr.html?label=gen173nr-1DCA0oTUIMZWx5c2Vlc3VuaW9uSA1YBGhNiAEBmAENuAEXyAEM2AED6AEB-AECiAIBqAIDuAL_5ZqEBsACAdICJDcxYjgyZmI2LTFlYWQtNGZjOS04Y2U2LTkwNTQyZjI5OWY1YtgCBOACAQ&sid=303509179a2849df63e4d1e5bc1ab1e3&srpvid=e6ae6d1417bd00a1&click_from_logo=1'
driver = webdriver.Chrome(options=options, executable_path=PATH)
driver.get('https://www.booking.com/index.fr.html?label=gen173nr-1DCA0oTUIMZWx5c2Vlc3VuaW9uSA1YBGhNiAEBmAENuAEXyAEM2AED6AEB-AECiAIBqAIDuAL_5ZqEBsACAdICJDcxYjgyZmI2LTFlYWQtNGZjOS04Y2U2LTkwNTQyZjI5OWY1YtgCBOACAQ&sid=303509179a2849df63e4d1e5bc1ab1e3&srpvid=e6ae6d1417bd00a1&click_from_logo=1')
driver.maximize_window()
time.sleep(2)
headers= {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}
cookie = driver.find_element_by_xpath('//*[@id="onetrust-accept-btn-handler"]')
try:
cookie.click()
except:
pass
time.sleep(2)
job_title = driver.find_element_by_xpath('//*[@id="ss"]')
job_title.click()
job_title.send_keys('Paris') #ici on renseigne la ville, attention à la syntaxe
time.sleep(3)
search = driver.find_element_by_xpath('//*[@id="frm"]/div[1]/div[4]/div[2]/button')
search.click()
time.sleep(6)
linksfinal = []
n = 1 #le nombre de pages que vous voulez parcourir
for x in range(n): #iterate over n pages
time.sleep(3)
my_elems = driver.find_elements_by_xpath('//a[@class="js-sr-hotel-link hotel_name_link url"]')
links = [my_elem.get_attribute("href") for my_elem in my_elems]
links = [link.replace('\n','') for link in links]
linksfinal = linksfinal + links
time.sleep(3)
next = driver.find_element_by_xpath('//*[@class="bk-icon -iconset-navarrow_right bui-pagination__icon"]')
next.click()
#print(linksfinal)
nameshotel = []
for url in linksfinal:
results = requests.get(url, headers = headers)
soup = BeautifulSoup(results.text, "html.parser")
name = soup.find("h2",attrs={"id":"hp_hotel_name"}).text.strip("\n").split("\n")[1]
nameshotel.append(name)
#linksfinal = ['https://www.booking.com/hotel/fr/d-argentine.fr.html?label=gen173nr-1DEgdyZXZpZXdzKIICOOgHSDNYBGhNiAEBmAENuAEXyAEM2AED6AEBiAIBqAIDuAKr2vuGBsACAdICJDE1YjBlZDY1LTI2NzEtNGM3Mi04OWQ1LWE5MjQ3OWFmNzE2NtgCBOACAQ;sid=303509179a2849df63e4d1e5bc1ab1e3;dest_id=-1456928;dest_type=city;dist=0;group_adults=2;group_children=0;hapos=1;hpos=1;no_rooms=1;room1=A%2CA;sb_price_type=total;sr_order=popularity;srepoch=1625662317;srpvid=b3d45a76bf450049;type=total;ucfs=1&#tab-main']
for i in range(len(linksfinal)) :
linksfinal[i] = linksfinal[i].split("fr")[1]
linksfinal[i] = linksfinal[i][1:]
linksfinal[i] = linksfinal[i][:-1]
urlfinal = []
for i in range(len(linksfinal)) :
urlfinal.append('https://www.booking.com/reviewlist.fr.html?cc1=fr&dist=1&pagename=' + str(linksfinal[i]) + '&type=total&offset=0&rows=25')
j = 3
for url, name in zip(urlfinal, nameshotel) :
commspos = []
commsneg = []
header = []
notes = []
dates = []
datestostay = []
driver.get(url)
results = requests.get(url, headers = headers)
soup = BeautifulSoup(results.text, "html.parser")
reviews = soup.find_all('li', class_ = "review_list_new_item_block")
for k in range(j): #iterate over n pages
for review in reviews:
try:
commpos = review.find("div", class_ = "c-review__row").text[11:].strip()
except:
commpos = 'NA'
commspos.append(commpos)
try:
commneg = review.find("div", class_ = "c-review__row lalala").text[17:].strip()
except:
commneg = 'NA'
commsneg.append(commneg)
note = review.find('div', class_ = 'bui-review-score__badge').text.strip()
notes.append(note)
date = review.find('span', class_ = 'c-review-block__date').text.strip()
dates.append(date)
try:
datestay = review.find('ul', class_ = 'bui-list bui-list--text bui-list--icon bui_font_caption c-review-block__row c-review-block__stay-date').text[16:].strip()
datestostay.append(datestay)
except:
datestostay.append('NaN')
time.sleep(3)
nextpages = driver.find_element_by_xpath('//a[@class="pagenext"]')
urlnext = nextpages.get_attribute("href")
results2 = requests.get(urlnext)
driver.get(urlnext)
time.sleep(3)
soup = BeautifulSoup(results2.text, "html.parser")
reviews = soup.find_all('li', class_ = "review_list_new_item_block")
data = pd.DataFrame({
'commspos' : commspos,
'commsneg' : commsneg,
'notes' : notes,
'dates' : dates,
'datestostay' : datestostay,
})
data.to_csv(f"{name}.csv", sep=';', index=False, encoding = 'utf_8_sig')
time.sleep(3)
我不知道为什么,但是我告诉脚本转到下一页的部分似乎不起作用:
nextpages = driver.find_element_by_xpath('//a[@class="pagenext"]')
urlnext = nextpages.get_attribute("href")
results2 = requests.get(urlnext)
driver.get(urlnext)
time.sleep(3)
soup = BeautifulSoup(results2.text, "html.parser")
reviews = soup.find_all('li', class_ = "review_list_new_item_block")
我的脚本只刮了 25 条评论,对于每个链接,每页有 25 条评论,所以它似乎只刮了一页,而它应该在那里刮 3 页:
然而,当我查看源代码时,似乎href
进入了下一页。因此,我将这个 href 实现为一个循环,如上所示:
任何想法为什么它不能按预期工作?
解决方案
如果单击或查找 pagenext 对您不起作用,则只需更改 url。offset=
当您进入下一页时,我发现这种情况正在发生变化。所以。如果您的刮刀刮第一页然后更改offset=0
为offset=10
然后它将为您提供下一页的数据。或者在任何你想抓取数据的地方运行循环并将偏移量增加 10。
这是示例:
https://www.booking.com/reviewlist.fr.html?cc1=fr&dist=1&pagename=hotelistria&type=total&offset=0&rows=20
这是第一页,如果我们更改offset=10
它,它会为我们提供下一页的数据,例如:
https://www.booking.com/reviewlist.fr.html?cc1=fr&dist=1&pagename=hotelistria&type=total&offset=10&rows=20
你可以这样循环:
for i in range(0,<up to you want>,10):
url=f"https://www.booking.com/reviewlist.fr.html?cc1=fr&dist=1&pagename=hotelistria&type=total&offset={i}&rows=20"
如果有任何进一步的疑问,请在评论中提问。
推荐阅读
- python - 数据集和 Gram 矩阵的平均值
- google-apps-script - 从 V8 Runtime 转换为 Rhino Runtime (Legacy) 我需要进行哪些脚本调整?
- javascript - 如何在javascript中按间隔设置对象值
- python - ValueError:形状 (None, 1) 和 (None, 64) 不兼容 Keras
- html - 如何使用 flexbox 垂直居中文本?
- laravel - Laravel:按年份和班级过滤的 Highchart 列
- lua - 像javascript这样的lua映射数据结构有?
- mysql - .append 上的 Laravel ajax“无效或意外令牌”
- ios - ld:当 lib 存在于 .../BuildProductsPath/Debug-iphoneos/Amplitude 时,找不到 -lAmplitude 的库
- vba - 用于打印的 VBA 语句仅在调试模式下执行