BeautifulSoup web scraping: UnboundLocalError: local variable 'soup' referenced before assignment

Problem description

I'm trying to do some web scraping with Beautiful Soup and requests while following along with a YouTube video. Everything went fine until I hit this error, even though the same code worked for the tutor.

import requests
from bs4 import BeautifulSoup

def get_data(url):
    if not response.ok:
        print('Server Responded: {}'.format(response.status_code))
    else:
        soup = BeautifulSoup(response.text, 'lxml')
    return(soup)

def get_detail_data(soup):
    try:
        title = soup.find('h1', id='itemTitle').text.strip()
    except:
        title = ''
        
    try:
        p = soup.find('span', id='prcIsum').text.strip()
        currency, price = p.split(' ')
    except:
        currency = ''
        price = ''
    
    try:
        sold = soup.find('span', class_='vi-qtyS-hot-red').a.text.strip().split(' ')[0]
    except:
        sold = ''
    
    data = {
        'title' : title,
        'currency' : currency,
        'price' : price,
        'total units sold' : sold
    }

    return data

def get_index_data(soup):
    try:
        links = soup.find_all('a', class_='s-item__link')
    except:
        links = []

    
    urls = [item.get('href') for item in links]
    return urls

def main():
    url = 'https://www.ebay.com/sch/i.html?_nkw=mens+shoes&_sacat=0'
    
    products = get_index_data(get_data(url))

    for link in products:
        data =  get_detail_data(get_data(link))


if __name__ == '__main__':
    main()

Tags: python, python-3.x, web-scraping, beautifulsoup, python-requests

Solution


In get_data you never actually make the request and store the result in response. You also need to assign soup = None when response.ok is not True; otherwise soup is only bound inside the else branch, and the return statement raises the UnboundLocalError. Finally, elsewhere you need to test whether soup is None before attempting to call methods on it.
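
A minimal, hypothetical snippet (not from the question) showing why a name that is only assigned in one branch triggers this error:

def get_data_broken(ok):
    if not ok:
        print('Server responded with an error')
    else:
        soup = 'parsed page'
    # When ok is False, soup was never assigned, so this line raises
    # UnboundLocalError: local variable 'soup' referenced before assignment
    return soup

get_data_broken(False)

The corrected version of the full script: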

import requests
from bs4 import BeautifulSoup

def get_data(url):
    
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})  # this was missing
    
    if not response.ok:
        print('Server Responded: {}'.format(response.status_code))
        soup = None
    else:
        soup = BeautifulSoup(response.text, 'lxml')
    return soup

def get_detail_data(soup):
    
    try:
        title = soup.find('h1', id='itemTitle').text.strip()
    except:
        title = ''
        
    try:
        p = soup.find('span', id='prcIsum').text.strip()
        currency, price = p.split(' ')
    except:
        currency = ''
        price = ''
    
    try:
        sold = soup.find('span', class_='vi-qtyS-hot-red').a.text.strip().split(' ')[0]
    except:
        sold = ''
    
    data = {
        'title' : title,
        'currency' : currency,
        'price' : price,
        'total units sold' : sold
    }

    return data

def get_index_data(soup):
    try:
        links = soup.find_all('a', class_='s-item__link')
    except:
        links = []

    
    urls = [item.get('href') for item in links]
    return urls

def main():
    
    url = 'https://www.ebay.com/sch/i.html?_nkw=mens+shoes&_sacat=0'
    soup = get_data(url)
    
    if soup is not None:
        
        products = get_index_data(soup)
        #print(products)

        for link in products:
            
            soup = get_data(link)
            
            if soup is not None:
                
                data =  get_detail_data(soup)
                print(data)

if __name__ == '__main__':
    main()
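
As a variation (not part of the original answer), get_data could raise on a bad response instead of returning None, using requests' built-in raise_for_status, and the caller would handle the failure with try/except. A minimal sketch under that assumption, reusing the same eBay search URL:

import requests
from bs4 import BeautifulSoup

def get_data(url):
    # Raises requests.HTTPError for 4xx/5xx status codes instead of returning None.
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'lxml')

def main():
    url = 'https://www.ebay.com/sch/i.html?_nkw=mens+shoes&_sacat=0'
    try:
        soup = get_data(url)
    except requests.RequestException as e:
        print('Request failed: {}'.format(e))
        return
    # continue with get_index_data(soup) and get_detail_data(soup) as above

Either approach avoids calling find or find_all on an unassigned (or None) soup, which is what produced the original error.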
