首页 > 解决方案 > xPath 似乎没有转到下一页

问题描述

这是我的脚本:

from selenium import webdriver
import time

from selenium.webdriver.support.select import Select

from selenium.webdriver.support.ui import WebDriverWait     
from selenium.webdriver.common.by import By     
from selenium.webdriver.support import expected_conditions as EC

import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

from selenium.webdriver.common.keys import Keys

PATH = "driver\chromedriver.exe"

options = webdriver.ChromeOptions() 
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1200,900")
options.add_argument('enable-logging')
url = 'https://www.booking.com/index.fr.html?label=gen173nr-1DCA0oTUIMZWx5c2Vlc3VuaW9uSA1YBGhNiAEBmAENuAEXyAEM2AED6AEB-AECiAIBqAIDuAL_5ZqEBsACAdICJDcxYjgyZmI2LTFlYWQtNGZjOS04Y2U2LTkwNTQyZjI5OWY1YtgCBOACAQ&sid=303509179a2849df63e4d1e5bc1ab1e3&srpvid=e6ae6d1417bd00a1&click_from_logo=1'

driver = webdriver.Chrome(options=options, executable_path=PATH)

driver.get('https://www.booking.com/index.fr.html?label=gen173nr-1DCA0oTUIMZWx5c2Vlc3VuaW9uSA1YBGhNiAEBmAENuAEXyAEM2AED6AEB-AECiAIBqAIDuAL_5ZqEBsACAdICJDcxYjgyZmI2LTFlYWQtNGZjOS04Y2U2LTkwNTQyZjI5OWY1YtgCBOACAQ&sid=303509179a2849df63e4d1e5bc1ab1e3&srpvid=e6ae6d1417bd00a1&click_from_logo=1')
driver.maximize_window()
time.sleep(2)

headers= {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}


cookie = driver.find_element_by_xpath('//*[@id="onetrust-accept-btn-handler"]')
try:
    cookie.click()
except:
    pass

time.sleep(2)



job_title = driver.find_element_by_xpath('//*[@id="ss"]')
job_title.click()
job_title.send_keys('Paris') #ici on renseigne la ville, attention à la syntaxe
time.sleep(3)

search = driver.find_element_by_xpath('//*[@id="frm"]/div[1]/div[4]/div[2]/button')
search.click()
time.sleep(6)

linksfinal = []

n = 1 #le nombre de pages que vous voulez parcourir

for x in range(n): #iterate over n pages

    time.sleep(3)

    my_elems = driver.find_elements_by_xpath('//a[@class="js-sr-hotel-link hotel_name_link url"]')

    links = [my_elem.get_attribute("href") for my_elem in my_elems]

    links = [link.replace('\n','') for link in links]

    linksfinal = linksfinal + links

    time.sleep(3) 

    next = driver.find_element_by_xpath('//*[@class="bk-icon -iconset-navarrow_right bui-pagination__icon"]')
      
    next.click()

#print(linksfinal)
nameshotel = []
for url in linksfinal:
    results = requests.get(url, headers = headers)
    soup = BeautifulSoup(results.text, "html.parser")
    name = soup.find("h2",attrs={"id":"hp_hotel_name"}).text.strip("\n").split("\n")[1]
    nameshotel.append(name)


#linksfinal = ['https://www.booking.com/hotel/fr/d-argentine.fr.html?label=gen173nr-1DEgdyZXZpZXdzKIICOOgHSDNYBGhNiAEBmAENuAEXyAEM2AED6AEBiAIBqAIDuAKr2vuGBsACAdICJDE1YjBlZDY1LTI2NzEtNGM3Mi04OWQ1LWE5MjQ3OWFmNzE2NtgCBOACAQ;sid=303509179a2849df63e4d1e5bc1ab1e3;dest_id=-1456928;dest_type=city;dist=0;group_adults=2;group_children=0;hapos=1;hpos=1;no_rooms=1;room1=A%2CA;sb_price_type=total;sr_order=popularity;srepoch=1625662317;srpvid=b3d45a76bf450049;type=total;ucfs=1&#tab-main']

for i in range(len(linksfinal)) :
    linksfinal[i] = linksfinal[i].split("fr")[1]
    linksfinal[i] = linksfinal[i][1:]
    linksfinal[i] = linksfinal[i][:-1]

urlfinal = []
for i in range(len(linksfinal)) :
    urlfinal.append('https://www.booking.com/reviewlist.fr.html?cc1=fr&dist=1&pagename=' + str(linksfinal[i]) + '&type=total&offset=0&rows=25')

j = 3

for url, name in zip(urlfinal, nameshotel) :

    commspos = []
    commsneg = []
    header = []
    notes = []
    dates = []
    datestostay = []

    driver.get(url)

    results = requests.get(url, headers = headers)

    soup = BeautifulSoup(results.text, "html.parser")

    reviews = soup.find_all('li', class_ = "review_list_new_item_block")

    for k in range(j): #iterate over n pages
   
        for review in reviews:
            try:
                commpos  = review.find("div", class_  = "c-review__row").text[11:].strip()
            except:
                commpos = 'NA'

            commspos.append(commpos)

            try:
                commneg  = review.find("div", class_  = "c-review__row lalala").text[17:].strip()
            except:
                commneg = 'NA'

            commsneg.append(commneg)
  
            note = review.find('div', class_ = 'bui-review-score__badge').text.strip()
            notes.append(note)

            date = review.find('span', class_ = 'c-review-block__date').text.strip()
            dates.append(date)

            try:
                datestay = review.find('ul', class_ = 'bui-list bui-list--text bui-list--icon bui_font_caption c-review-block__row c-review-block__stay-date').text[16:].strip()
                datestostay.append(datestay)
            except:
                datestostay.append('NaN')

            time.sleep(3)

        nextpages = driver.find_element_by_xpath('//a[@class="pagenext"]')

        urlnext = nextpages.get_attribute("href")

        results2 = requests.get(urlnext)

        driver.get(urlnext)

        time.sleep(3)

        soup = BeautifulSoup(results2.text, "html.parser")

        reviews = soup.find_all('li', class_ = "review_list_new_item_block")


    data = pd.DataFrame({
        'commspos' : commspos,
        'commsneg' : commsneg,
        'notes' : notes,
        'dates' : dates,
        'datestostay' : datestostay,
        })

    data.to_csv(f"{name}.csv", sep=';', index=False, encoding = 'utf_8_sig')

    time.sleep(3)

我不知道为什么,但是我告诉脚本转到下一页的部分似乎不起作用:

nextpages = driver.find_element_by_xpath('//a[@class="pagenext"]')

urlnext = nextpages.get_attribute("href")

results2 = requests.get(urlnext)

driver.get(urlnext)

time.sleep(3)

soup = BeautifulSoup(results2.text, "html.parser")

reviews = soup.find_all('li', class_ = "review_list_new_item_block")

我的脚本只刮了 25 条评论,对于每个链接,每页有 25 条评论,所以它似乎只刮了一页,而它应该在那里刮 3 页:

输出

然而,当我查看源代码时,似乎href进入了下一页。因此,我将这个 href 实现为一个循环,如上所示:

下一页

任何想法为什么它不能按预期工作?

标签: pythonpython-3.xseleniumweb-scrapingbeautifulsoup

解决方案


如果单击或查找 pagenext 对您不起作用,则只需更改 url。offset=当您进入下一页时,我发现这种情况正在发生变化。所以。如果您的刮刀刮第一页然后更改offset=0offset=10然后它将为您提供下一页的数据。或者在任何你想抓取数据的地方运行循环并将偏移量增加 10。

这是示例:

https://www.booking.com/reviewlist.fr.html?cc1=fr&dist=1&pagename=hotelistria&type=total&offset=0&rows=20

这是第一页,如果我们更改offset=10它,它会为我们提供下一页的数据,例如:

https://www.booking.com/reviewlist.fr.html?cc1=fr&dist=1&pagename=hotelistria&type=total&offset=10&rows=20

你可以这样循环:

for i in range(0,<up to you want>,10):
    url=f"https://www.booking.com/reviewlist.fr.html?cc1=fr&dist=1&pagename=hotelistria&type=total&offset={i}&rows=20"

如果有任何进一步的疑问,请在评论中提问。


推荐阅读