首页 > 解决方案 > BeautifulSoup 没有返回网站的完整 HTML

问题描述

我正在尝试抓取一个名为 Autochek 的网站上列出的所有待售汽车。用 BeautifulSoup 解析页面(创建 soup 对象)后,我从 soup 中提取了两个子列表,打算迭代它们来获取需要放入 DataFrame 的信息,但由于某种原因,soup 只返回前 8 辆车。我认为这与必须滚动页面才能加载数据有关,但我不确定。你能帮忙吗?

from bs4 import BeautifulSoup
from requests import get
import pandas as pd
import selenium
from selenium import webdriver # conda install selenium
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time


url = 'https://autochek.africa/ng/cars-for-sale'

# Launch a visible Chrome session and open the listing page.
driver = webdriver.Chrome()
driver.get(url)
driver.maximize_window()

# Open the page-size dropdown and pick its 4th option so more cars render.
# NOTE: Selenium 4 removed driver.find_element_by_xpath(); the modern API is
# driver.find_element(By.XPATH, ...). Explicit waits replace fixed sleeps so
# we proceed as soon as the elements are actually clickable.
xpath_search = r'//*[@id="__next"]/div/div[2]/div/div/div[2]/div[3]/div[2]/div[3]/div/div'
element = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, xpath_search)))
element.click()

element_2 = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((
        By.XPATH,
        r'//*[@id="__next"]/div/div[2]/div/div/div[2]/div[3]/div[2]/div[3]/div/ul/li[4]')))
element_2.click()
time.sleep(10)  # give the client-side app time to re-render the listing

new_url = driver.current_url
print(new_url)

# BUG FIX for "only 8 cars": the site renders its listing with JavaScript.
# Re-fetching new_url with requests returns only the server-side HTML (the
# first 8 cards). The fully rendered DOM already lives in the Selenium
# session, so parse driver.page_source instead of a fresh HTTP response.
soup = BeautifulSoup(driver.page_source, 'html.parser')

tag = 'div'
attributes = {'class': 'car-grid-container'}

# One grid container per results section on the page.
content_list = soup.find_all(tag, attributes)

# Per container: the anchor of each car card, and its details strip.
basic_info = [item.find_all('a', {'class': 'hover:tw-shadow-md'})
              for item in content_list]

text_info = [item.find_all(
    'div',
    {'class': 'other-details tw-flex tw-flex-row tw-justify-between tw-items-stretch'})
    for item in content_list]

标签: python, selenium, web-scraping, beautifulsoup

解决方案


这里不需要 Selenium。直接从数据源获取数据:

import requests
import pandas as pd

# The site is a Next.js app: the listing data is available as plain JSON
# from its _next/data endpoint, so no browser automation is required.
url = 'https://autochek.africa/_next/data/78f0e3518ea1a34c27af75e4181bc80d2c112ebf/ng/cars-for-sale.json'

rows = []
total_pages = None  # unknown until the first response reports pagination
for page in range(1, 100):  # hard upper bound as a safety net
    # Stop once we have walked past the last page reported by the API.
    if total_pages is not None and page > total_pages:
        break
    print(page)
    payload = {
        'country': 'ng',
        'page_number': f'{page}',
    }

    # timeout avoids hanging forever; raise_for_status surfaces HTTP errors
    # instead of failing later with a confusing JSON decode error.
    response = requests.get(url, params=payload, timeout=30)
    response.raise_for_status()
    jsonData = response.json()

    # 'pageSize' here is the total number of pages in the result set.
    total_pages = jsonData['pageProps']['pagination']['pageSize']

    rows += jsonData['pageProps']['cars']

df = pd.DataFrame(rows)

输出:

print(df)
            id                 title  ... hasCleanTitle  soldDate
0    -e4h58sQe          Toyota Camry  ...          True       NaN
1    SWbyrtv5P          Toyota Venza  ...         False       NaN
2    R9_sIKnJy     Toyota Highlander  ...           NaN       NaN
3    4DAiUe1oC          Toyota Camry  ...          True       NaN
4    ECIkVk1aJ          Toyota Camry  ...          True       NaN
..         ...                   ...  ...           ...       ...
571  l7fFI5Q3w          Toyota Camry  ...          True       NaN
572  GquAWYVWu          Toyota Camry  ...          True       NaN
573  SVkuXCLiV         Toyota Matrix  ...           NaN       NaN
574  2O0OKpL_d        Toyota Corolla  ...          True       NaN
575  lp2PtU1oO  Mercedes-Benz ML 350  ...           NaN       NaN

[576 rows x 24 columns]

推荐阅读