Unable to extract records from a website using Python

Problem description

When I run the code, it prints data from the website. The problem is that although the code uses auto-scrolling, it does not scroll all the way to the last page of the site, so I only get 122 records in the CSV file, while the website has many more records further down. Please help.

import time
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from selenium import webdriver
url ='https://dawaai.pk/medicine-category/alimentary-tract-metabolism-2'
driver = webdriver.Chrome('E:/chromedriver')

driver.get(url)
SCROLL_PAUSE_TIME = 1
#time.sleep(4)

# Get scroll height
# Note: "return document.body.scrollHeight" doesn't work reliably here because of
# floating web elements, so document.documentElement.scrollHeight is used instead.

last_height = driver.execute_script("return document.documentElement.scrollHeight")
conte = None

while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0,document.documentElement.scrollHeight);")

    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)

    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.documentElement.scrollHeight")
    
    if new_height == last_height and conte:
       print("break")
       break
       
    last_height = new_height
    #time.sleep(5)
    pageSource = driver.page_source
    soup = BeautifulSoup(pageSource, 'html.parser')
    aaa = soup.find('div',class_='columns systemic-products-div')  
    conte= aaa.find_all('div',class_='column col-3 mb-20')
    #print(conte)        
suit = []

for items in conte:
    product_title = items.find("h2").text.strip()
    product_Brand_Name = items.find("p").text.strip()
    print(len(product_title))

    fabric = {
        'productname': product_title,
        'Product_Brand_Name': product_Brand_Name
    }
    suit.append(fabric)
print ("Importing to Data into CSV File...!!")
df = pd.DataFrame(suit)
print("Saved Sucessfully....")
df.to_csv('dawaii.csv', index=False)

Tags: python, pandas, beautifulsoup, requests, webdriver

Solution


Here is another approach you can try.

import requests
from bs4 import BeautifulSoup 
import pandas as pd 

# Note: the 'cookie' and 'digital-signature' values below are session-specific
# (copied from a browser session) and may need to be refreshed if the requests fail.
headers = {
    'authority': 'dawaai.pk',
    'sec-ch-ua': '"Google Chrome";v="95", "Chromium";v="95", ";Not A Brand";v="99"',
    'accept': '*/*',
    'x-requested-with': 'XMLHttpRequest',
    'digital-signature': '49621438',
    'sec-ch-ua-mobile': '?0',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-mode': 'cors',
    'sec-fetch-dest': 'empty',
    'referer': 'https://dawaai.pk/medicine-category/alimentary-tract-metabolism-2',
    'accept-language': 'en-US,en;q=0.9',
    'cookie': 'locationCityId=48504; locationCityName=Karachi; _gid=GA1.2.615032251.1635224873; _fbp=fb.1.1635224873607.1935785452; _hjid=91298dd5-b6dc-44d0-8efa-c8e077b32803; _hjFirstSeen=1; _hjAbsoluteSessionInProgress=0; _CEFT=Q%3D%3D%3D; _ce.s=v11.rlc~1635224874425; moe_uuid=ac52076a-c5c0-4680-a523-91844e23a37e; __zlcmid=16kkQu7WdDEd4MT; ci_session=4e9bh9uog3g4su26tgvd90v4uoso2p5s; mp_1b439ed4835dcc8528bac6ce6d25c879_mixpanel=%7B%22distinct_id%22%3A%20%2217cbaff65754cb-0deaaefbe322db-57b193e-100200-17cbaff657647a%22%2C%22%24device_id%22%3A%20%2217cbaff65754cb-0deaaefbe322db-57b193e-100200-17cbaff657647a%22%2C%22%24initial_referrer%22%3A%20%22%24direct%22%2C%22%24initial_referring_domain%22%3A%20%22%24direct%22%7D; _ga=GA1.2.1379999337.1635224871; _gat=1; _ga_M50BR48D77=GS1.1.1635224870.1.1.1635225184.0',
}

suit = []
i = 1
while i < 100:
    response = requests.get(f'https://dawaai.pk/systemic_class/pagination/{i}', headers=headers)
    print(response.url)
    soup = BeautifulSoup(response.content, 'html.parser')
    data = soup.select('div.card-body')
    if not data:
        # An empty page means there are no more products, so stop paging.
        break
    for title_brand in data:
        title = title_brand.find('h2').text
        brand = title_brand.find('p').text
        suit.append([title, brand])
    i += 1


df = pd.DataFrame(suit, columns=['productname', 'Product_Brand_Name'])
df.to_csv('dawaii.csv', index=False)
print("Saved successfully.")
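
If you would rather keep the Selenium approach from the question, a common fix is to stop scrolling when the number of loaded product cards stops growing, rather than comparing scroll heights, which can look unchanged if a new batch of cards loads slower than SCROLL_PAUSE_TIME. Below is a minimal sketch; it assumes the Selenium 3 style API used in the question and that the product cards still carry the "column col-3 mb-20" class, so treat the selector and the timings as assumptions to verify.

import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

# Assumptions: Selenium 3 style API (as in the question) and the same
# 'column col-3 mb-20' card markup; adjust the selector if the site differs.
driver = webdriver.Chrome('E:/chromedriver')
driver.get('https://dawaai.pk/medicine-category/alimentary-tract-metabolism-2')

card_count = 0
retries = 0
while True:
    # Scroll to the bottom and give the lazy loader time to append new cards.
    driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
    time.sleep(2)
    cards = driver.find_elements_by_css_selector('div.column.col-3.mb-20')
    if len(cards) == card_count:
        retries += 1
        if retries >= 3:
            break  # no new cards after several attempts, assume the end was reached
    else:
        retries = 0
        card_count = len(cards)

soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()

rows = []
for card in soup.select('div.column.col-3.mb-20'):
    rows.append({
        'productname': card.find('h2').text.strip(),
        'Product_Brand_Name': card.find('p').text.strip(),
    })

pd.DataFrame(rows).to_csv('dawaii.csv', index=False)

Retrying a few times before breaking guards against the case where a batch of cards takes longer than the pause to load, which is the most likely reason the original height-comparison loop stopped after only 122 records.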
