Scraping multiple websites with a loop

Problem description

In the code below I created a function for web scraping. What I'd like to do next is:

  1. Create a loop that iterates over the links provided in areas_links
  2. Save each result to a dataframe whose name comes from the areas tuple.

Of course I could do all of it separately, but that is not the point of this exercise.

# import libraries
from bs4 import BeautifulSoup
import urllib.request
import csv
import re
import requests
import pandas as pd 
from IPython import display  

# query the website and return the html to the variable 'page'
page = urllib.request.urlopen(urlpage)
# parse the html using beautiful soup and store in variable 'soup'
soup = BeautifulSoup(page, 'html.parser')

areas_links =  ('https://www.morizon.pl/ceny/warszawa/',
      'https://www.morizon.pl/ceny/warszawa/bemowo/')

areas = ('warszawa','bemowo')

def web_scrape(url):
    # query the website and return the html to the variable 'page'
    page = urllib.request.urlopen(urlpage)
    # parse the html using beautiful soup and store in variable 'soup'
    soup = BeautifulSoup(page, 'html.parser')
    # find a string starting from generatechartData
    data = soup.find(string=re.compile('generatechartData()'))
    # find the pattern for prices (raw string avoids invalid-escape warnings)
    pattern = r'\r?\ndescription: "([^"]+)"(?:\r?[^()]+)\)'
    rPrices = re.findall(pattern, data) 
    # find the pattern for dates
    pattern = r'\r?(?:\r?\n(?!newDate\.setFullYear\().*)*\r?\nnewDate\.setFullYear\(([^()]+)\);'
    rdates = re.findall(pattern, data)
    # removal of '\n' in the data
    rdates = [x.replace('\n', '') for x in rdates]
    # split dates on Years and Months 
    rYear = [i.split(',')[0] for i in rdates] 
    rMonth = [i.split(',')[1] for i in rdates] 
    # data cleaning for December; for some reason it's '0' instead of '12'
    rMonth = [re.sub(r'\b0\b', '12', i) for i in rMonth]
    #building dataframe for the data
    df = pd.DataFrame(
    {'Prices': rPrices,
     'Year': rYear,
     'Month': rMonth
     })

for i in areas_links:
    web_scrap(i)

Tags: python, web-scraping

Solution


I'm not sure what you expected, because most of your code already works: it runs with the urls from areas_links and it produces a df that you only have to save (under a different name for each area).

In def web_scrap(url) you have to use the variable urlpage instead of url, so the parameter name matches what the body uses, and you have to return df at the end.
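Condensed, those two changes look like this (a minimal skeleton, not the full function; the real body is in the full code below):

import pandas as pd

def web_scrap(urlpage):    # parameter renamed to urlpage, matching the body
    df = pd.DataFrame()    # ...scraping logic goes here...
    return df              # return the result instead of discarding it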

Then the loop works: it gives a df that you can save to a separate file for each area.

areas_links =  ('https://www.morizon.pl/ceny/warszawa/',
      'https://www.morizon.pl/ceny/warszawa/bemowo/')

areas = ('warszawa','bemowo')
    
for url, name in zip(areas_links, areas):
    df = web_scrap(url)
    df.to_csv(name + '.csv')
    print(df)
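If you also want dataframes literally named from the areas tuple (point 2 of the question), note that building variable names from strings is awkward in Python; the idiomatic alternative is a dict keyed by area name. A minimal sketch reusing web_scrap:

dfs = {}
for url, name in zip(areas_links, areas):
    # keep every dataframe in memory, keyed by its area name
    dfs[name] = web_scrap(url)

print(dfs['bemowo'].head())

You can also pass index=False to to_csv if you don't want the row index written to the files.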

Full code

# import libraries
from bs4 import BeautifulSoup
import urllib.request
#import csv
import re
#import requests
import pandas as pd 
#from IPython import display  

# --- functions ---

def web_scrap(urlpage):
    # query the website and return the html to the variable 'page'
    page = urllib.request.urlopen(urlpage)

    # parse the html using beautiful soup and store in variable 'soup'
    soup = BeautifulSoup(page, 'html.parser')

    # find a string starting from generatechartData
    data = soup.find(string=re.compile('generatechartData()'))

    # find the pattern for prices (raw string avoids invalid-escape warnings)
    pattern = r'\r?\ndescription: "([^"]+)"(?:\r?[^()]+)\)'
    rPrices = re.findall(pattern, data) 

    # find the pattern for dates
    pattern = r'\r?(?:\r?\n(?!newDate\.setFullYear\().*)*\r?\nnewDate\.setFullYear\(([^()]+)\);'
    rdates = re.findall(pattern, data)

    # removal of '\n' in the data
    rdates = [x.replace('\n', '') for x in rdates]

    # split dates on Years and Months 
    rYear = [i.split(',')[0] for i in rdates] 
    rMonth = [i.split(',')[1] for i in rdates] 

    # data cleaning for December; for some reason it's '0' instead of '12'
    rMonth = [re.sub(r'\b0\b', '12', i) for i in rMonth]

    #building dataframe for the data
    df = pd.DataFrame({
             'Prices': rPrices,
             'Year': rYear,
             'Month': rMonth
         })

    return df

# --- main ---

areas_links =  ('https://www.morizon.pl/ceny/warszawa/',
      'https://www.morizon.pl/ceny/warszawa/bemowo/')

areas = ('warszawa','bemowo')
    
for url, name in zip(areas_links, areas):
    df = web_scrap(url)
    df.to_csv(name + '.csv')
    print(df)
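As more areas are added to the tuples, a single unreachable page will abort the whole run. Assuming it is acceptable to skip a failed area and continue, the loop can be wrapped in a try/except (a sketch, not part of the original answer):

import urllib.error

for url, name in zip(areas_links, areas):
    try:
        df = web_scrap(url)
    except urllib.error.URLError as e:
        # skip an unreachable or failing page instead of stopping the loop
        print('skipping', name, '->', e)
        continue
    df.to_csv(name + '.csv')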
