python - 调整 Python(Beautiful Soup)代码以抓取多个页面
问题描述
非常感谢您的支持,我正在使用 Python BeautifulSoup:
我需要简单地在多个页面上运行此代码(也就是在第 1 到 1290 页上抓取相同的数据)。我是新手,我可以想象它并没有那么复杂,因为 URL 非常简单,带有页码
# Scrape one results page of propertyfinder.eg listings into propertyfinder.csv.
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
import csv

my_url = 'https://www.propertyfinder.eg/en/search?c=1&ob=mr&page=1&t=3000'

# Open the connection and grab the page; close the handle even if read() fails.
uClient = uReq(my_url)
try:
    page_html = uClient.read()
finally:
    uClient.close()

# HTML parsing: each listing lives in a div.card__content
page_soup = soup(page_html, "html.parser")
cards = page_soup.find_all("div", {"class": "card__content"})

# Use the csv module so commas/quotes inside fields are quoted correctly
# instead of being mangled by hand (replacing "," with "|").
with open("propertyfinder.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["title", "address", "area", "bedrooms", "bathrooms", "price"])
    for card in cards:
        try:
            # TITLE
            title = card.find("h2", {"class": "card__title card__title-link"}).text.strip()
            # ADDRESS
            address = card.find("span", {"class": "card__location-text"}).text.strip()
            # PRICE (strip the currency label and thousands separators)
            price = card.find("span", {"class": "card__price-value"}).text.strip()
            price = price.replace("EGP", "").replace(",", "").strip()
            # BEDROOMS
            bedrooms = card.find("p", {"class": "card__property-amenity card__property-amenity--bedrooms"}).text.strip()
            # BATHROOMS
            bathrooms = card.find("p", {"class": "card__property-amenity card__property-amenity--bathrooms"}).text.strip()
            # AREA
            area = card.find("p", {"class": "card__property-amenity card__property-amenity--area"}).text.strip()
        except AttributeError:
            # A card missing one of the expected fields (e.g. an advert) -- skip it
            # instead of crashing the whole run on an IndexError/AttributeError.
            continue
        print(title)
        print(address)
        print(area)
        print(bedrooms)
        print(bathrooms)
        print(price)
        writer.writerow([title, address, area, bedrooms, bathrooms, price])
解决方案
我自己想出了如下解决方案,供大家参考:
import csv

import requests
from bs4 import BeautifulSoup
def scrape_properties(page):
    """Scrape one search-results page and write its listings to propertyfinder.csv.

    Page 1 truncates the file and writes the header row; every later page
    appends, so calling this for pages 1..N in order builds one combined CSV.

    :param page: 1-based page number to fetch.
    """
    my_url = f'https://www.propertyfinder.eg/en/search?c=1&ob=mr&page={page}&t=3000'
    # Browser-like headers: the site rejects the default client user agent.
    request_headers = {
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
        'referer': 'https://google.com',
    }
    response = requests.get(my_url, headers=request_headers)
    # html parsing
    page_soup = BeautifulSoup(response.text, "html.parser")
    cards = page_soup.find_all("div", {"class": "card__content"})

    # Write with the csv module so commas/quotes inside fields are quoted
    # correctly instead of being replaced with "|" by hand; the context
    # manager guarantees the file is closed even if a request blows up.
    mode = "w" if page == 1 else "a"
    with open("propertyfinder.csv", mode, newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        if page == 1:
            writer.writerow(["title", "address", "area", "bedrooms",
                             "bathrooms", "price", "ptype"])
        ##DATA
        for card in cards:
            # Only AttributeError (field missing from this card) is expected
            # here; a bare `except: pass` would also hide real bugs.
            try:
                # TITLE
                title = card.find("h2", {"class": "card__title card__title-link"}).text.strip()
                # ADDRESS
                address = card.find("span", {"class": "card__location-text"}).text.strip()
                # PRICE (drop currency label and thousands separators)
                price = card.find("span", {"class": "card__price-value"}).text.strip()
                price = price.replace("EGP", "").replace(",", "").strip()
                # BEDROOMS
                bedrooms = card.find("p", {"class": "card__property-amenity card__property-amenity--bedrooms"}).text.strip()
                # BATHROOMS
                bathrooms = card.find("p", {"class": "card__property-amenity card__property-amenity--bathrooms"}).text.strip()
                # AREA
                area = card.find("p", {"class": "card__property-amenity card__property-amenity--area"}).text.strip()
                # PTYPE (property type, e.g. apartment/villa)
                ptype = card.find("p", {"class": "card__property-amenity card__property-amenity--property-type"}).text.strip()
            except AttributeError:
                continue
            print(title)
            print(address)
            print(area)
            print(bedrooms)
            print(bathrooms)
            print(price)
            print(ptype)
            writer.writerow([title, address, area, bedrooms, bathrooms, price, ptype])
# The question asks for pages 1 through 1290; range(1, 100) only covered
# pages 1-99, so the stop value must be 1291 (range excludes the stop).
for page in range(1, 1291):
    scrape_properties(page)
推荐阅读
- mysql - 字段列表MySql存储过程中的未知列
- html - Vba Web 导航和身份验证
- javascript - 如何在 chrome 中获取我的 JS 错误以显示正确的行号?
- python - Discord money bot 将用户 ID 保存在 json 文件中。当 Bot 重新启动时,它会为每个人创建一个新的(但相同的)ID
- autohotkey - 如何改变循环?
- javascript - 添加、删除、更新记录到数据库后的最佳实践
- go - 为什么当阻塞读取连接关闭时,这个 goroutine 随机无法退出?
- hibernate - Hibernate:本机查询无法处理用单引号括起来的值
- kubernetes - 无法从 ansible 运行 kubectl
- mysql - 如何在一行中添加三个不同表中的三个数字mysql phpmyadmin