首页 > 解决方案 > 调整python(美汤)代码抓取多页

问题描述

非常感谢您的支持,我正在使用 Python BeautifulSoup:

我需要让这段代码在多个页面上运行(也就是在第 1 到第 1290 页上抓取相同的数据)。我是新手,但我想这应该不会太复杂,因为 URL 很简单,只是页码参数不同

from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq

my_url = 'https://www.propertyfinder.eg/en/search?c=1&ob=mr&page=1&t=3000'

# Fetch the listing page; the context manager guarantees the connection
# is closed even if read() raises.
with uReq(my_url) as uClient:
    page_html = uClient.read()

# Parse the HTML; each <div class="card__content"> is one property listing.
page_soup = soup(page_html, "html.parser")
cards = page_soup.findAll("div", {"class": "card__content"})

filename = "propertyfinder.csv"
# `with` closes (and flushes) the file even on error; an explicit encoding
# keeps the output stable across platforms.
with open(filename, "w", encoding="utf-8") as f:
    f.write("title,address,area,bedrooms,bathrooms,price\n")

    for card in cards:
        try:
            # Each field is the text of the first matching element in the card.
            title = card.findAll("h2", {"class": "card__title card__title-link"})[0].text
            address = card.findAll("span", {"class": "card__location-text"})[0].text
            price = card.findAll("span", {"class": "card__price-value"})[0].text.strip().replace("EGP", "")
            bedrooms = card.findAll("p", {"class": "card__property-amenity card__property-amenity--bedrooms"})[0].text.strip()
            bathrooms = card.findAll("p", {"class": "card__property-amenity card__property-amenity--bathrooms"})[0].text.strip()
            area = card.findAll("p", {"class": "card__property-amenity card__property-amenity--area"})[0].text
        except IndexError:
            # A card missing any field (e.g. no bedroom count) previously
            # crashed the whole run with an unhandled IndexError; skip it.
            continue

        print(title)
        print(address)
        print(area)
        print(bedrooms)
        print(bathrooms)
        print(price)
        # Commas inside free-text fields would break the CSV columns,
        # so they are replaced with "|" (and stripped from the price).
        f.write(title.replace(",", "|") + "," + address.replace(",", "|") + ","
                + area + "," + bedrooms + "," + bathrooms + ","
                + price.replace(",", "") + "\n")

标签: pythonweb-scrapingbeautifulsoup

解决方案


我自己想出来了,代码如下,供大家参考:

from bs4 import BeautifulSoup
import requests

def scrape_properties(page):
    """Scrape one results page of propertyfinder.eg and write the listings
    to propertyfinder.csv.

    Page 1 creates the file and writes the CSV header; every later page
    appends, so calling this for pages 1..N in order builds one file.

    :param page: 1-based results-page number to fetch.
    :raises requests.HTTPError: if the site returns an error status.
    """
    my_url = f'https://www.propertyfinder.eg/en/search?c=1&ob=mr&page={page}&t=3000'

    # Browser-like headers: a plain requests user agent tends to be blocked.
    request_headers = {
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
        'referer': 'https://google.com',
    }
    response = requests.get(my_url, headers=request_headers)
    # Fail loudly on a blocked/missing page instead of silently parsing
    # an error document and writing nothing.
    response.raise_for_status()

    # Each <div class="card__content"> is one property listing.
    page_soup = BeautifulSoup(response.text, "html.parser")
    cards = page_soup.find_all("div", {"class": "card__content"})

    filename = "propertyfinder.csv"
    # "w" on the first page starts a fresh file; "a" afterwards appends.
    mode = "w" if page == 1 else "a"
    # `with` closes the file even if parsing raises mid-page.
    with open(filename, mode, encoding="utf-8") as f:
        if page == 1:
            f.write("title,address,area,bedrooms,bathrooms,price,ptype\n")

        for card in cards:
            try:
                title = card.find_all("h2", {"class": "card__title card__title-link"})[0].text.strip()
                address = card.find_all("span", {"class": "card__location-text"})[0].text.strip()
                price = card.find_all("span", {"class": "card__price-value"})[0].text.strip().replace("EGP", "").strip()
                bedrooms = card.find_all("p", {"class": "card__property-amenity card__property-amenity--bedrooms"})[0].text.strip()
                bathrooms = card.find_all("p", {"class": "card__property-amenity card__property-amenity--bathrooms"})[0].text.strip()
                area = card.find_all("p", {"class": "card__property-amenity card__property-amenity--area"})[0].text.strip()
                ptype = card.find_all("p", {"class": "card__property-amenity card__property-amenity--property-type"})[0].text.strip()
            except IndexError:
                # Only a missing field is expected here; the original bare
                # `except: pass` also hid real bugs (typos, network errors).
                continue

            print(title)
            print(address)
            print(area)
            print(bedrooms)
            print(bathrooms)
            print(price)
            print(ptype)
            # Commas in free-text fields are replaced so they don't break
            # the CSV columns; thousands separators are dropped from price.
            f.write(title.replace(",", "|") + "," + address.replace(",", "|") + ","
                    + area.replace(",", "") + "," + bedrooms + "," + bathrooms + ","
                    + price.replace(",", "") + "," + ptype + "\n")

# The question asks for pages 1 through 1290; range() excludes its stop
# value, so the original range(1, 100) only covered pages 1-99.
FIRST_PAGE = 1
LAST_PAGE = 1290
for page in range(FIRST_PAGE, LAST_PAGE + 1):
    scrape_properties(page)


推荐阅读