Create a for loop to scrape multiple pages from multiple URLs with BeautifulSoup

Problem description

I am trying to scrape multiple pages from multiple URLs efficiently. I have been able to scrape multiple pages from a single URL successfully, but I cannot get this working for multiple URLs. Any help would be appreciated. Thank you.

Current loop code:

BASE = 'https://www.unegui.mn'
URL = f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/5-r/?page='
COLUMNS=['Name','Date','Address','District','City','Price','Area_sqm','Rooms','Floor','Commission_year',
         'Building_floors','Garage', 'Balcony','Windows','Window_type','Floor_type','door_type','Leasing','Description','Link']
with requests.Session() as session:
    page = 0
    while True:
        (r := session.get(f'{URL}{page+1}')).raise_for_status()
        m = re.search(r'.*page=(\d+)$', r.url)
        if m and int(m.group(1)) == page:
            break
        page += 1
        print(f'Scraping page {page}')
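For reference, the stop condition above works because r.url reports the final URL after any redirect: once the requested page number exceeds the last page, the page parameter in r.url stops advancing. That check can be factored into a generator, which makes it easy to reuse for several base URLs later. A minimal sketch (fetch_pages is a hypothetical helper name, not part of the original code):

import re
import requests

def fetch_pages(session, url):
    # Yield the HTML of each results page until the page number in the
    # final (possibly redirected) URL stops advancing.
    page = 0
    while True:
        (r := session.get(f'{url}{page + 1}')).raise_for_status()
        m = re.search(r'.*page=(\d+)$', r.url)
        if m and int(m.group(1)) == page:
            break  # redirected back to the previous page: no more results
        page += 1
        yield r.text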

Desired URL loop:

The only part that changes in each URL is the 1-r, 2-r, 3-r segment. There are 5 URLs in total.

URL = [f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/1-r/?page=',
       f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/2-r/?page=',
       f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/3-r/?page=',
       f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/4-r/?page=',
       f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/5-r/?page='
       ]
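Since only the district index changes, the same list can also be built programmatically rather than written out by hand. A minimal sketch (the URLS name is illustrative):

BASE = 'https://www.unegui.mn'
# Generate the five district URLs from the index that varies
URLS = [f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/{i}-r/?page='
        for i in range(1, 6)]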

Full code:

import requests
from bs4 import BeautifulSoup as BS
from datetime import datetime
import pandas as pd
import re
import csv

today = datetime.today().strftime('%y%m%d ')

def main():
    page = 0
    name = []
    date = []
    address = []
    district = []
    city = []
    price = []
    area_sqm = []
    rooms = []
    floor = []
    commission_year = []
    building_floors = []
    garage = []
    balcony = []
    windows = []
    window_type = []
    floor_type = []
    door_type = []
    leasing = []
    description = []
    link = []

    BASE = 'https://www.unegui.mn'
    URL = f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/5-r/?page='
    COLUMNS=['Name','Date','Address','District','City','Price','Area_sqm','Rooms','Floor','Commission_year',
             'Building_floors','Garage', 'Balcony','Windows','Window_type','Floor_type','door_type','Leasing','Description','Link']
    with requests.Session() as session:
        while True:
            (r := session.get(f'{URL}{page+1}')).raise_for_status()
            m = re.search(r'.*page=(\d+)$', r.url)
            if m and int(m.group(1)) == page:
                break
            page += 1
            print(f'Scraping page {page}')
            soup = BS(r.text, 'lxml')
            for tag in soup.findAll('div', class_='list-announcement-block'):
                _name = tag.find('a', attrs={'itemprop': 'name'})
                name.append(_name.get('content', 'N/A'))
                if (_link := _name.get('href', None)):
                    link.append(f'{BASE}{_link}')
                    (_r := session.get(link[-1])).raise_for_status()
                    _spanlist = BS(_r.text, 'lxml').find_all('span', class_='value-chars')
                    floor_type.append(_spanlist[0].get_text().strip())
                    balcony.append(_spanlist[1].get_text().strip())
                    garage.append(_spanlist[2].get_text().strip())
                    window_type.append(_spanlist[3].get_text().strip())
                    door_type.append(_spanlist[4].get_text().strip())   
                    windows.append(_spanlist[5].get_text().strip())
                    
                    _alist = BS(_r.text, 'lxml').find_all('a', class_='value-chars')
                    commission_year.append(_alist[0].get_text().strip())
                    building_floors.append(_alist[1].get_text().strip())
                    area_sqm.append(_alist[2].get_text().strip())
                    floor.append(_alist[3].get_text().strip())
                    leasing.append(_alist[4].get_text().strip())
                    district.append(_alist[5].get_text().strip())
                    address.append(_alist[6].get_text().strip())
                    
                rooms.append(tag.find('div', class_='announcement-block__breadcrumbs').get_text().split('»')[1].strip())
                description.append(tag.find('div', class_='announcement-block__description').get_text().strip())
                date.append(tag.find('div', class_='announcement-block__date').get_text().split(',')[0].strip())
                city.append((tag.find('meta', attrs={'itemprop': 'areaServed'})).get('content'))
                if (_price := tag.find('meta', attrs={'itemprop': 'price'})) is None:
                    _price = tag.find('div', class_='announcement-block__price _premium')
                price.append(_price.get_text().strip() if _price else 'N/A')
        df = pd.DataFrame(zip(name, date, address, district, city, 
                                  price, area_sqm, rooms, floor, commission_year,
                                  building_floors, garage, balcony, windows, window_type,
                                  floor_type, door_type, leasing, description, link), columns=COLUMNS)
        return df

if __name__ == '__main__':
    df = main()
    df.to_csv(f'{today}HPD.csv', encoding='cp1251', errors='ignore', index=False)

Tags: python, loops, for-loop, web-scraping, beautifulsoup

Solution


You can combine for loops with Python's range() function.

The range() function provides a sequence of integers based on the arguments passed to it.

range(start, stop[, step])

The start argument is the first value in the range. If range() is called with only one argument, Python assumes start = 0.
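For example, with a single argument the sequence starts at 0:

>>> list(range(4))
[0, 1, 2, 3]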

The stop argument is the upper bound of the range. It is important to realize that this upper value is not included in the range.
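So range(1, 6) produces the integers 1 through 5, with the stop value 6 excluded:

>>> list(range(1, 6))
[1, 2, 3, 4, 5]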

Example:

for i in range(1, 6):
    BASE = 'https://www.unegui.mn'
    URL = f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/{i}-r/?page='
    print(URL)

Output:

https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/1-r/?page=
https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/2-r/?page=
https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/3-r/?page=
https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/4-r/?page=
https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/5-r/?page=
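Putting it together, the range() loop can wrap the original pagination loop so that every district URL is scraped in turn. A minimal sketch, assuming the pagination logic from the question stays as-is; the per-page BeautifulSoup parsing from main() would go where the comment indicates, and note that page must be reset to 0 for each URL:

import re
import requests

BASE = 'https://www.unegui.mn'

with requests.Session() as session:
    for i in range(1, 6):  # districts 1-r through 5-r
        URL = f'{BASE}/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/{i}-r/?page='
        page = 0           # reset pagination for each district URL
        while True:
            (r := session.get(f'{URL}{page + 1}')).raise_for_status()
            m = re.search(r'.*page=(\d+)$', r.url)
            if m and int(m.group(1)) == page:
                break      # redirected back to the last page: stop
            page += 1
            print(f'Scraping {i}-r page {page}')
            # ... parse r.text with BeautifulSoup here, as in the question ...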
