python - BeautifulSoup 网页抓取:UnboundLocalError: local variable 'soup' referenced before assignment(局部变量 'soup' 在赋值前被引用)
问题描述
我正在跟着一个 YouTube 视频教程,尝试用 BeautifulSoup 和 requests 做网页抓取。一切都很顺利,直到我遇到了这个错误,而视频里的讲师运行同样的代码却没有问题。
import requests
from bs4 import BeautifulSoup
def get_data(url):
    """Return the page behind ``url`` parsed with BeautifulSoup (as posted in the question).

    This is the asker's original, failing version; it is kept unchanged so the
    reported error can still be reproduced from the listing.
    """
    # NOTE(review): ``response`` is never assigned in this function and ``url``
    # is never used, so no HTTP request is made here. Unless ``response``
    # exists at module level (not shown), this line raises NameError.
    if not response.ok:
        print('Server Responded: {}'.format(response.status_code))
        # NOTE(review): this branch does not bind ``soup``, so the ``return``
        # below raises UnboundLocalError - the error named in the title.
    else:
        soup = BeautifulSoup(response.text, 'lxml')
    return(soup)
def get_detail_data(soup):
    """Pull title, currency, price and units sold out of a parsed product page.

    Every field is looked up independently; whenever a lookup fails for any
    reason the field falls back to an empty string instead of raising.

    Returns:
        dict with the keys ``'title'``, ``'currency'``, ``'price'`` and
        ``'total units sold'``, all holding strings.
    """
    # ``except BaseException`` has exactly the reach of a bare ``except:``.
    try:
        heading = soup.find('h1', id='itemTitle')
        title = heading.text.strip()
    except BaseException:
        title = ''

    try:
        price_text = soup.find('span', id='prcIsum').text.strip()
        # Unpacking requires exactly two space-separated parts.
        currency, price = price_text.split(' ')
    except BaseException:
        currency, price = '', ''

    try:
        sold_anchor = soup.find('span', class_='vi-qtyS-hot-red').a
        # Keep only the first word of the anchor text.
        sold = sold_anchor.text.strip().split(' ')[0]
    except BaseException:
        sold = ''

    return {
        'title': title,
        'currency': currency,
        'price': price,
        'total units sold': sold,
    }
def get_index_data(soup):
    """Return the ``href`` of every ``a.s-item__link`` anchor in ``soup``.

    If the anchor lookup fails for any reason, an empty list is returned.
    Anchors without an ``href`` contribute ``None`` to the result.
    """
    try:
        anchors = soup.find_all('a', class_='s-item__link')
    except BaseException:  # same reach as a bare ``except:``
        anchors = []

    hrefs = []
    for anchor in anchors:
        hrefs.append(anchor.get('href'))
    return hrefs
def main():
    """Run get_data/get_index_data on the search URL, then get_detail_data on each link."""
    url = 'https://www.ebay.com/sch/i.html?_nkw=mens+shoes&_sacat=0'
    # NOTE(review): the result of get_data() is handed on without any check.
    products = get_index_data(get_data(url))
    for link in products:
        # ``data`` is assigned but not printed or otherwise used in this version.
        data = get_detail_data(get_data(link))
# Call main() only when this file is run as a script, not when it is imported.
if __name__ == '__main__':
    main()
解决方案
In `get_data` you never actually make the request and store the result in `response`. You also need to assign `soup = None` when `response.ok` is not true. Finally, wherever the result of `get_data` is used, you need to check whether `soup` is `None` before attempting to call methods on it.
import requests
from bs4 import BeautifulSoup
def get_data(url, timeout=10):
    """Download ``url`` and return it parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a silent host.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when the request
        could not be completed or the server answered with an error status.
    """
    try:
        # The request itself - this call is what the question's version left out.
        response = requests.get(
            url,
            headers={'User-Agent': 'Mozilla/5.0'},  # browser-like User-Agent
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # One unreachable page should not abort the whole crawl.
        print('Request failed for {}: {}'.format(url, exc))
        return None

    if not response.ok:
        print('Server Responded: {}'.format(response.status_code))
        return None

    return BeautifulSoup(response.text, 'lxml')
def get_detail_data(soup):
    """Extract title, currency, price and units sold from a parsed product page.

    Each field is looked up independently. A field whose element is missing
    (or whose text does not have the expected shape) is returned as an empty
    string, so a partially matching page still yields a usable record.

    Args:
        soup: Parsed page exposing ``find``; ``None`` is tolerated and yields
            a record of empty strings.

    Returns:
        dict with the keys ``'title'``, ``'currency'``, ``'price'`` and
        ``'total units sold'``, all holding strings.
    """
    # AttributeError covers both ``soup is None`` and a lookup that found
    # nothing (``find`` returned ``None``). Anything else - KeyboardInterrupt
    # in particular - is deliberately allowed to propagate.
    try:
        title = soup.find('h1', id='itemTitle').text.strip()
    except AttributeError:
        title = ''

    try:
        price_text = soup.find('span', id='prcIsum').text.strip()
        # ValueError: the text is not exactly "<currency> <price>".
        currency, price = price_text.split(' ')
    except (AttributeError, ValueError):
        currency = ''
        price = ''

    try:
        sold_text = soup.find('span', class_='vi-qtyS-hot-red').a.text.strip()
        # Keep only the leading word of the anchor text.
        sold = sold_text.split(' ')[0]
    except AttributeError:
        sold = ''

    return {
        'title': title,
        'currency': currency,
        'price': price,
        'total units sold': sold,
    }
def get_index_data(soup):
    """Collect the link targets of all ``a.s-item__link`` anchors in ``soup``.

    Args:
        soup: Parsed page exposing ``find_all``; ``None`` is tolerated.

    Returns:
        List of ``href`` strings in document order. Anchors without an
        ``href`` are skipped so callers never receive ``None`` as a URL.
    """
    try:
        anchors = soup.find_all('a', class_='s-item__link')
    except AttributeError:
        # ``soup`` is None (or not a parsed document): there are no links.
        return []

    hrefs = (anchor.get('href') for anchor in anchors)
    return [href for href in hrefs if href]
def main():
    """Scrape the eBay "mens shoes" search page and print each product's details."""
    url = 'https://www.ebay.com/sch/i.html?_nkw=mens+shoes&_sacat=0'

    index_page = get_data(url)
    if index_page is None:
        # The search page could not be loaded, so there is nothing to crawl.
        return

    for link in get_index_data(index_page):
        product_page = get_data(link)
        if product_page is None:
            # Skip listings that failed to load and carry on with the rest.
            continue
        print(get_detail_data(product_page))
# Call main() only when this file is run as a script, not when it is imported.
if __name__ == '__main__':
    main()
推荐阅读
- c# - 查找包含给定 SyntaxNode 的 MethodDeclarationSyntax
- python - 如何以有效的方式在python中的sql上与外部连接器进行内部连接
- arrays - 无法在索引集合 laravel 中创建新的数组 foreach 记录
- reactjs - StopPropogation 基于反应中的条件
- c# - 当我们将非可空属性复制到可空属性时获取空值
- javascript - 使用 Cognito 用户池组调用没有联合身份的 Lambda 函数
- sql - 3 个表不支持加入表达式
- performance - 并行处理加速 S(n)
- wordpress - 自定义帖子类型的自定义 URL
- android - 当您已经在活动3时如何顺序执行活动1、2、3