python - Scraping multiple websites with a loop
Problem description
In the code below I created a function for web scraping. What I want to do next is:
- create a loop that goes over the links provided in area_links
- save the result to a DataFrame whose name comes from the areas tuple.
Of course I could do everything separately, but that is not the point of this exercise.
# import libraries
from bs4 import BeautifulSoup
import urllib.request
import csv
import re
import requests
import pandas as pd
from IPython import display
# query the website and return the html to the variable 'page'
page = urllib.request.urlopen(urlpage)
# parse the html using beautiful soup and store in variable 'soup'
soup = BeautifulSoup(page, 'html.parser')
areas_links = ('https://www.morizon.pl/ceny/warszawa/',
               'https://www.morizon.pl/ceny/warszawa/bemowo/')
areas = ('warszawa','bemowo')

def web_scrape(url):
    # query the website and return the html to the variable 'page'
    page = urllib.request.urlopen(urlpage)
    # parse the html using beautiful soup and store in variable 'soup'
    soup = BeautifulSoup(page, 'html.parser')
    # find a string starting from generatechartData
    data = soup.find(string=re.compile('generatechartData()'))
    # finding pattern for Price
    pattern = '\r?\ndescription: "([^"]+)"(?:\r?[^()]+)\)'
    rPrices = re.findall(pattern, data)
    # finding pattern for date
    pattern = '\r?(?:\r?\n(?!newDate\.setFullYear\().*)*\r?\nnewDate\.setFullYear\(([^()]+)\);'
    rdates = re.findall(pattern, data)
    # removal of '\n' in the data
    rdates = [x.replace('\n', '') for x in rdates]
    # split dates on Years and Months
    rYear = [i.split(',')[0] for i in rdates]
    rMonth = [i.split(',')[1] for i in rdates]
    # data cleaning on December. For some reason it's '0' instead of '12'
    rMonth = [re.sub(r'\b0\b', '12', i) for i in rMonth]
    # building dataframe for the data
    df = pd.DataFrame(
        {'Prices': rPrices,
         'Year': rYear,
         'Month': rMonth
        })

for i in areas_links:
    web_scrap(i)
Solution
I am not sure what you expected, because most of your code already works: it runs with the urls from areas_links and it produces a df, which you only have to save (under different names).
In def web_scrap(url) you have to use the variable name urlpage instead of url, so that urlopen(urlpage) uses the function's argument rather than an undefined global, and you have to return df at the end. (The function also has to be called by the same name it is defined with; below it is web_scrap everywhere.)
Then the loop works: it gives a df for every url, which you can save to a separate file.
areas_links = ('https://www.morizon.pl/ceny/warszawa/',
               'https://www.morizon.pl/ceny/warszawa/bemowo/')
areas = ('warszawa','bemowo')

for url, name in zip(areas_links, areas):
    df = web_scrap(url)
    df.to_csv(name + '.csv')
    print(df)
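The question also asks for DataFrames named after the entries of the areas tuple. Rather than creating separate variables, a minimal sketch (assuming the corrected web_scrap from the full code below; the dict name dfs is my own choice) is to collect the results in a dict keyed by area name:

# collect one DataFrame per area in a dict (assumed name `dfs`),
# so they can be accessed as dfs['warszawa'], dfs['bemowo']
dfs = {}
for url, name in zip(areas_links, areas):
    dfs[name] = web_scrap(url)

print(dfs['bemowo'].head())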
Full code
# import libraries
from bs4 import BeautifulSoup
import urllib.request
#import csv
import re
#import requests
import pandas as pd
#from IPython import display

# --- functions ---

def web_scrap(urlpage):
    # query the website and return the html to the variable 'page'
    page = urllib.request.urlopen(urlpage)
    # parse the html using beautiful soup and store in variable 'soup'
    soup = BeautifulSoup(page, 'html.parser')
    # find a string starting from generatechartData
    data = soup.find(string=re.compile('generatechartData()'))
    # finding pattern for Price
    pattern = '\r?\ndescription: "([^"]+)"(?:\r?[^()]+)\)'
    rPrices = re.findall(pattern, data)
    # finding pattern for date
    pattern = '\r?(?:\r?\n(?!newDate\.setFullYear\().*)*\r?\nnewDate\.setFullYear\(([^()]+)\);'
    rdates = re.findall(pattern, data)
    # removal of '\n' in the data
    rdates = [x.replace('\n', '') for x in rdates]
    # split dates on Years and Months
    rYear = [i.split(',')[0] for i in rdates]
    rMonth = [i.split(',')[1] for i in rdates]
    # data cleaning on December. For some reason it's '0' instead of '12'
    rMonth = [re.sub(r'\b0\b', '12', i) for i in rMonth]
    # building dataframe for the data
    df = pd.DataFrame({
        'Prices': rPrices,
        'Year': rYear,
        'Month': rMonth
    })
    return df

# --- main ---

areas_links = ('https://www.morizon.pl/ceny/warszawa/',
               'https://www.morizon.pl/ceny/warszawa/bemowo/')
areas = ('warszawa','bemowo')

for url, name in zip(areas_links, areas):
    df = web_scrap(url)
    df.to_csv(name + '.csv')
    print(df)
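If one of the pages fails to load, urlopen raises an HTTPError or URLError, and if the page no longer contains the generatechartData script, soup.find returns None and re.findall then raises a TypeError. A hedged sketch of the main loop that skips a failing area instead of aborting (this behaviour is my assumption, not part of the original answer):

for url, name in zip(areas_links, areas):
    try:
        df = web_scrap(url)
    except Exception as exc:  # e.g. urllib.error.HTTPError, or TypeError if the chart data is missing
        print('skipping', name, '-', exc)
        continue
    df.to_csv(name + '.csv')
    print(df)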