python - 无法使用python从网站中提取记录
问题描述
当我运行代码时,它能打印出从网站抓取的数据。问题是,虽然我在代码中使用了自动滚动功能,但它无法滚动到网站的最后一页,因此 CSV 文件中只得到 122 条记录,而网站上直到最后一页还有很多记录。请帮帮我。
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from selenium import webdriver
# Scrape product titles and brand names from an infinitely scrolling
# category page: scroll until the page stops growing, parse the final DOM
# once, and write the rows to dawaii.csv.
url = 'https://dawaai.pk/medicine-category/alimentary-tract-metabolism-2'

# Seconds to wait after each scroll so lazily loaded products can arrive.
SCROLL_PAUSE_TIME = 1
# Number of consecutive scrolls that must leave the page height unchanged
# before the listing is considered fully loaded. A single unchanged check
# is not enough: when a batch takes longer than SCROLL_PAUSE_TIME to load,
# the height looks stable for a moment and the scrape would stop early.
MAX_STALLED_SCROLLS = 5

driver = webdriver.Chrome('E:/chromedriver')
try:
    driver.get(url)

    # document.documentElement.scrollHeight is used on purpose; the original
    # author noted that document.body.scrollHeight is unreliable on pages
    # with floating elements.
    last_height = driver.execute_script(
        "return document.documentElement.scrollHeight")
    stalled_scrolls = 0
    while stalled_scrolls < MAX_STALLED_SCROLLS:
        # Scroll down to the current bottom of the page.
        driver.execute_script(
            "window.scrollTo(0,document.documentElement.scrollHeight);")
        # Give the page time to load the next batch.
        time.sleep(SCROLL_PAUSE_TIME)
        new_height = driver.execute_script(
            "return document.documentElement.scrollHeight")
        if new_height == last_height:
            stalled_scrolls += 1
        else:
            # The page grew, so more content may still follow.
            stalled_scrolls = 0
            last_height = new_height
    print("break")

    # Grab the DOM once, after scrolling has finished, instead of
    # re-parsing the whole page on every scroll iteration.
    pageSource = driver.page_source
finally:
    # Always release the browser, even if scrolling fails part-way.
    driver.quit()

soup = BeautifulSoup(pageSource, 'html.parser')
aaa = soup.find('div', class_='columns systemic-products-div')
if aaa is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise RuntimeError(
        "product container 'columns systemic-products-div' not found")
conte = aaa.find_all('div', class_='column col-3 mb-20')

suit = []
for items in conte:
    title_tag = items.find("h2")
    brand_tag = items.find("p")
    if title_tag is None or brand_tag is None:
        # Skip cards that lack a title or a brand rather than crashing.
        continue
    product_title = title_tag.text.strip()
    product_Brand_Name = brand_tag.text.strip()
    print(len(product_title))
    fabric = {
        'productname': product_title,
        'Product_Brand_Name': product_Brand_Name,
    }
    suit.append(fabric)

print("Importing to Data into CSV File...!!")
df = pd.DataFrame(suit)
df.to_csv('dawaii.csv', index=False)
# Report success only after the file has actually been written.
print("Saved Sucessfully....")
解决方案
你可以试试另一种方法:不用 Selenium 滚动页面,而是直接用 requests 逐页请求网站的分页接口。
import requests
from bs4 import BeautifulSoup
import pandas as pd
# HTTP request headers sent with every pagination request below.
# 'x-requested-with': 'XMLHttpRequest' marks the request as an AJAX call and
# 'referer' points at the category page being scraped.
# NOTE(review): these values look like they were copied from a single
# browser session (Chrome 95 on Windows) - confirm before reuse.
headers = {
'authority': 'dawaai.pk',
'sec-ch-ua': '"Google Chrome";v="95", "Chromium";v="95", ";Not A Brand";v="99"',
'accept': '*/*',
'x-requested-with': 'XMLHttpRequest',
# NOTE(review): meaning of this value is not visible here; presumably
# checked by the server - verify whether it is still accepted.
'digital-signature': '49621438',
'sec-ch-ua-mobile': '?0',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-site': 'same-origin',
'sec-fetch-mode': 'cors',
'sec-fetch-dest': 'empty',
'referer': 'https://dawaai.pk/medicine-category/alimentary-tract-metabolism-2',
'accept-language': 'en-US,en;q=0.9',
# NOTE(review): hard-coded session cookie (includes ci_session and
# analytics ids); presumably expires - replace with a fresh one if
# requests start failing.
'cookie': 'locationCityId=48504; locationCityName=Karachi; _gid=GA1.2.615032251.1635224873; _fbp=fb.1.1635224873607.1935785452; _hjid=91298dd5-b6dc-44d0-8efa-c8e077b32803; _hjFirstSeen=1; _hjAbsoluteSessionInProgress=0; _CEFT=Q%3D%3D%3D; _ce.s=v11.rlc~1635224874425; moe_uuid=ac52076a-c5c0-4680-a523-91844e23a37e; __zlcmid=16kkQu7WdDEd4MT; ci_session=4e9bh9uog3g4su26tgvd90v4uoso2p5s; mp_1b439ed4835dcc8528bac6ce6d25c879_mixpanel=%7B%22distinct_id%22%3A%20%2217cbaff65754cb-0deaaefbe322db-57b193e-100200-17cbaff657647a%22%2C%22%24device_id%22%3A%20%2217cbaff65754cb-0deaaefbe322db-57b193e-100200-17cbaff657647a%22%2C%22%24initial_referrer%22%3A%20%22%24direct%22%2C%22%24initial_referring_domain%22%3A%20%22%24direct%22%7D; _ga=GA1.2.1379999337.1635224871; _gat=1; _ga_M50BR48D77=GS1.1.1635224870.1.1.1635225184.0',
}
# Walk the site's pagination endpoint page by page, collect the
# [title, brand] pair from every product card, and write them to dawaii.csv.

# Upper bound on the number of pages requested. The original script always
# fetched pages 1..99; the cap is kept as a safety net in case the endpoint
# never returns an empty page.
MAX_PAGES = 99
# Seconds to wait for each HTTP response; without a timeout a stalled
# connection would hang the script indefinitely.
REQUEST_TIMEOUT = 30

suit = []
for i in range(1, MAX_PAGES + 1):
    response = requests.get(
        f'https://dawaai.pk/systemic_class/pagination/{i}',
        headers=headers,
        timeout=REQUEST_TIMEOUT,
    )
    # Surface HTTP errors instead of silently parsing an error page.
    response.raise_for_status()
    print(response.url)

    soup = BeautifulSoup(response.content, 'html.parser')
    data = soup.select('div.card-body')
    if not data:
        # A page without product cards means the listing is exhausted;
        # stop instead of requesting the remaining empty pages.
        break

    for title_brand in data:
        title_tag = title_brand.find('h2')
        brand_tag = title_brand.find('p')
        if title_tag is None or brand_tag is None:
            # Skip cards that lack a title or a brand rather than crashing.
            continue
        suit.append([title_tag.text, brand_tag.text])

df = pd.DataFrame(suit)
df.to_csv('dawaii.csv', index=False)
# Report success only after the file has actually been written.
print("Saved Sucessfully....")
推荐阅读
- sqlite - 无论时区如何,如何在 sqlite 中搜索一小时?
- php - JQUERY ajax 更新 laravel 中的字段不起作用
- xml - 为什么 clojure.xml/emit 在标签内的字符串内容周围打印新行?
- sql - 如果子数据共享相同的父数据怎么办?
- java - 如何在 Retina 显示器上的 Java Swing 中进行双缓冲而不丢失更高分辨率?
- workbox - 如何设置workbox-webpack-plugin生成的workbox文件的位置
- laravel - Laravel - 电子邮件验证上的 403 无效签名
- html - 如何从页面确定图像的格式
- algorithm - 为什么段树上的范围查询最多返回 ceil(log_2 N) 个节点?
- mongodb - 数据库中是否存在尚未执行 sh.enableSharding() 的主分片?