python - BeautifulSoup 没有返回网站的完整 HTML
问题描述
我正在尝试抓取一个名为 Autochek 的网站,以获取其列出的所有待售汽车。创建 BeautifulSoup 对象(soup)之后,我从中提取了两个子列表用于迭代,以获取所需的信息并放入 DataFrame;但不知为何,soup 只返回了前 8 辆车。我猜这与必须滚动页面才能加载更多数据有关,但我不确定。你能帮忙吗?
from bs4 import BeautifulSoup
from requests import get
import pandas as pd
import selenium
from selenium import webdriver # conda install selenium
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
# Open the listings page in a Selenium-driven Chrome session.
url = 'https://autochek.africa/ng/cars-for-sale'
driver = webdriver.Chrome()
driver.get(url)
driver.maximize_window()
time.sleep(2)  # fixed pause before looking up elements

# Click the element at the first XPath, then the 4th <li> of the neighbouring
# <ul>, and wait before reading the URL the browser ended up on.
xpath_search = r'//*[@id="__next"]/div/div[2]/div/div/div[2]/div[3]/div[2]/div[3]/div/div'
# find_element(By.XPATH, ...) replaces the find_element_by_xpath helper, which
# newer Selenium releases no longer provide.
element = driver.find_element(By.XPATH, xpath_search)
element.click()
element_2 = driver.find_element(By.XPATH, r'//*[@id="__next"]/div/div[2]/div/div/div[2]/div[3]/div[2]/div[3]/div/ul/li[4]')
element_2.click()
time.sleep(10)
new_url = driver.current_url
print(new_url)

# Adjacent string literals are concatenated, so the header value is one
# unbroken User-Agent string regardless of how these lines are indented.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit'
        '/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    ),
}

# NOTE: the page is downloaded again with requests here; the HTML parsed below
# is this HTTP response body, not the document held by the Selenium browser.
url = new_url
response = get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')

# Every <div class="car-grid-container"> in the downloaded HTML.
tag = 'div'
attributes = {'class': 'car-grid-container'}
content_list = soup.find_all(tag, attributes)

# For each container, collect the matching <a> elements ...
basic_info = []
for item in content_list:
    basic_info.append(item.find_all('a', {'class': 'hover:tw-shadow-md'}))

# ... and the matching detail <div> elements.
text_info = []
for item in content_list:
    text_info.append(item.find_all('div', {'class': 'other-details tw-flex tw-flex-row tw-justify-between tw-items-stretch'}))
解决方案
这里不需要 Selenium。可以直接从数据源(页面背后的 JSON 接口)获取数据:
import requests
import pandas as pd
# JSON endpoint queried once per page.
# NOTE(review): the long hex segment in the path looks build-specific and may
# go stale when the site is redeployed - confirm before reuse.
url = 'https://autochek.africa/_next/data/78f0e3518ea1a34c27af75e4181bc80d2c112ebf/ng/cars-for-sale.json'

# Placeholder upper bound; overwritten from each response's pagination data.
pageSize = 9999
rows = []

# Request pages 1..99, stopping early once the page number exceeds the
# 'pageSize' value reported by the previous response.
for page in range(1, 100):
    if page > pageSize:
        break
    print(page)

    payload = {
        'country': 'ng',
        'page_number': '%s' % page,
    }
    jsonData = requests.get(url, params=payload).json()

    pageSize = jsonData['pageProps']['pagination']['pageSize']
    rows += jsonData['pageProps']['cars']

# One DataFrame row per entry accumulated from the 'cars' lists.
df = pd.DataFrame(rows)
输出:
print(df)
id title ... hasCleanTitle soldDate
0 -e4h58sQe Toyota Camry ... True NaN
1 SWbyrtv5P Toyota Venza ... False NaN
2 R9_sIKnJy Toyota Highlander ... NaN NaN
3 4DAiUe1oC Toyota Camry ... True NaN
4 ECIkVk1aJ Toyota Camry ... True NaN
.. ... ... ... ... ...
571 l7fFI5Q3w Toyota Camry ... True NaN
572 GquAWYVWu Toyota Camry ... True NaN
573 SVkuXCLiV Toyota Matrix ... NaN NaN
574 2O0OKpL_d Toyota Corolla ... True NaN
575 lp2PtU1oO Mercedes-Benz ML 350 ... NaN NaN
[576 rows x 24 columns]
推荐阅读
- outlook - Outlook 桌面中的自适应卡操作错误,但在 OOTW 中没有
- javascript - 隐藏 `prototype` 属性以防止自定义函数覆盖本机函数
- javascript - 如何在 Node.Js 中组织与数据库的通信?
- c# - C#将多个单元格从Excel复制到同一列中的Word
- r - 按 customerID 和 YTD Sales 计算该客户的 3 个月平均值
- algorithm - 将边集合转换为三角形集合的算法
- php - 无法从 20 Mb 文件中获取文件大小()
- dynamics-crm - Dynamics 8.1 和 365:“Publisher”是否可移植用于托管解决方案?
- android - 如何为 64 位 Android 构建 cordova 项目?
- algorithm - 查找与单词查询匹配的顶级文档