python - Web 抓取到 CSV - ValueError 无法将输入数组从形状 (2) 广播到形状 (1)
问题描述
最近几天我一直在玩python和beautifulsoup。我已经开始尝试抓取当地的枪支广告平台。
我一边摸索一边得到了不少帮助,这很棒。每解决一个阶段的问题,我就会推进到下一个“好了,那接下来该怎么做……”的问题。
这就是我现在的位置:
我正在将我的数据导出(失败)到 csv。
我收到错误 - ValueError - 无法将输入数组从形状 (2) 广播到形状 (1)。
查阅了一些资料后,似乎是我把二维数组塞进了一维数组?但检查我的代码,我看不出可能在哪里这样做了。
有人能帮忙看一下我的代码,指出我哪里出错了吗?
谢谢!
完整脚本:
from bs4 import BeautifulSoup
import requests
import urllib.request
import csv
import pandas
from pandas import DataFrame
import re
# Question script as posted. Indentation was lost when the page was extracted,
# so the nesting of the statements below cannot be recovered from this text.
# Overall flow: create the CSV with a header row, collect links to individual
# gun pages, then scrape each page into a one-row DataFrame and write it out.
#csv creation
# mode='w' creates/truncates the file; only the header row is written here.
with open('Guntrader_Dealer.csv', mode='w') as csv_file:
fieldnames = ['Title', 'Make', 'Model', 'Licence', 'Orientation', 'Barrel Length', 'Stock Length', 'Chokes', 'Origin', 'Trigger', 'Ejection', 'Scope', 'Serial No', 'Stock No', 'Condition', 'Description', 'Price']
writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
writer.writeheader()
#all links list
all_links=[]
#grab all links which contain the href specified
url="https://www.guntrader.uk/dealers/minsterley/minsterley-ranges/guns?page={}"
# NOTE(review): `url` keeps its literal `{}` placeholder - `page` is never
# substituted into it, so both iterations request the same address.
for page in range(1,3):
res=requests.get(url).text
soup=BeautifulSoup(res,'html.parser')
for link in soup.select('a[href*="dealers/minsterley/minsterley-ranges/guns/"]'):
all_links.append("https://www.guntrader.uk" + link['href'])
for a_link in all_links:
#Defining the span text in GunDetails lookups
# Each function below is a filter for soup.find(): it accepts a <span> whose
# parent's first child contains the given label text.
def make_span(make):
return make.name=='span' and 'Make:' in make.parent.contents[0]
def model_span(model):
return model.name=='span' and 'Model:' in model.parent.contents[0]
def licence_span(licence):
return licence.name=='span' and 'Licence:' in licence.parent.contents[0]
def orient_span(orient):
return orient.name=='span' and 'Orient.:' in orient.parent.contents[0]
def barrel_span(barrel):
return barrel.name=='span' and 'Barrel:' in barrel.parent.contents[0]
def stock_span(stock):
return stock.name=='span' and 'Stock:' in stock.parent.contents[0]
def choke_span(choke):
return choke.name=='span' and 'Chokes:' in choke.parent.contents[0]
def origin_span(origin):
return origin.name=='span' and 'Origin:' in origin.parent.contents[0]
def trigger_span(trigger):
return trigger.name=='span' and 'Trigger:' in trigger.parent.contents[0]
def ejection_span(ejection):
return ejection.name=='span' and 'Ejection:' in ejection.parent.contents[0]
def serial_span(serial):
return serial.name=='span' and 'Serial #:' in serial.parent.contents[0]
def stockno_span(stockno):
return stockno.name=='span' and 'Stock #:' in stockno.parent.contents[0]
def condition_span(condition):
return condition.name=='span' and 'Condition:' in condition.parent.contents[0]
def scope_span(scope):
return scope.name=='span' and 'Scope:' in scope.parent.contents[0]
# Fetch and parse the individual gun page.
res = urllib.request.urlopen(a_link)
soup = BeautifulSoup(res, 'html.parser')
#soup searches using the define criteria
# Every lookup falls back to the string 'none' when no matching span is found.
makes = soup.find(make_span)
# NOTE(review): `.content` (singular) here, while every other field below uses
# `.contents` - likely a typo; confirm.
gun_makes = makes.content if makes else 'none'
models = soup.find(model_span)
# NOTE(review): per the BeautifulSoup docs `.contents` is a list of child
# nodes, not a string; presumably a list with two items is what cannot be
# broadcast into the single-row frame built below - confirm.
gun_models = models.contents if models else 'none'
licences = soup.find(licence_span)
gun_licences = licences.contents if licences else 'none'
orients = soup.find(orient_span)
gun_orients = orients.contents if orients else 'none'
barrels = soup.find(barrel_span)
gun_barrels = barrels.contents if barrels else 'none'
stocks = soup.find(stock_span)
gun_stocks = stocks.contents if stocks else 'none'
chokes = soup.find(choke_span)
gun_chokes = chokes.contents if chokes else 'none'
origins = soup.find(origin_span)
gun_origins = origins.contents if origins else 'none'
triggers = soup.find(trigger_span)
gun_triggers = triggers.contents if triggers else 'none'
ejections = soup.find(ejection_span)
gun_ejections = ejections.contents if ejections else 'none'
scopes = soup.find(scope_span)
gun_scopes = scopes.contents if scopes else 'none'
serials = soup.find(serial_span)
gun_serials = serials.contents if serials else 'none'
stocknos = soup.find(stockno_span)
gun_stocknos = stocknos.contents if stocknos else 'none'
conditions = soup.find(condition_span)
gun_conditions = conditions.contents if conditions else 'none'
#title price and description
# These three use CSS selectors and `.text`, so they are plain strings.
title = soup.select_one('h1[itemprop="name"]')
gun_title = title.text if title else 'none'
price = soup.select_one('p.price')
gun_price = price.text if price else 'none'
description = soup.select_one('p[itemprop="description"]')
gun_description = description.text if description else 'none'
# Assemble one record keyed by the CSV column names.
data = { 'Title': gun_title, 'Make': gun_makes, 'Model': gun_models, 'Licence': gun_licences, 'Orientation': gun_orients, 'Barrel Length': gun_barrels, 'Stock Length': gun_stocks, 'Chokes': gun_chokes, 'Origin': gun_origins, 'Trigger': gun_triggers, 'Ejection': gun_ejections, 'Scope': gun_scopes, 'Serial No': gun_serials, 'Stock No': gun_stocknos, 'Condition': gun_conditions, 'Description': gun_description, 'Price': gun_price}
# index=[0] declares a single-row frame.
df = DataFrame(data, columns = ['Title', 'Make', 'Model', 'Licence', 'Orientation', 'Barrel Length', 'Stock Length', 'Chokes', 'Origin', 'Trigger', 'Ejection', 'Scope', 'Serial No', 'Stock No', 'Condition', 'Description', 'Price'], index=[0])
# NOTE(review): this writes to the same path as the header-only file created
# at the top; if it runs once per link, each write would replace the previous
# one - confirm the intended nesting.
df.to_csv(r'Guntrader_Dealer.csv')
解决方案
我已经为你改好了脚本。我没有让每个 df 反复覆盖同一个文件,而是创建了一个主 DataFrame(df_main),把 for 循环中每个页面抓取到的数据都汇总到其中,最后一次性写入文件。
这是最终代码:
from bs4 import BeautifulSoup
import requests
import csv
import pandas
from pandas import DataFrame
import re
import os
import locale
os.environ["PYTHONIOENCODING"] = "utf-8"
# Scrape every gun listed by the dealer and export the details to a CSV file.
#
# Fixes relative to the previous version of this script:
#   * the page number is now actually substituted into the listing URL;
#   * field values are extracted as plain text instead of `.contents` lists
#     (a list whose length differs from the one-row index is what raises
#     "could not broadcast input array from shape (2) into shape (1)");
#   * `.content` (a typo for `.contents`) no longer silently yields None;
#   * rows are collected in a list and turned into a DataFrame once, because
#     `DataFrame.append` was removed in pandas 2.0;
#   * the redundant header-only pre-write of the CSV was dropped, since
#     `to_csv` below overwrites the file (header included) anyway;
#   * HTTP requests carry a timeout so a stalled connection cannot hang the run.

# Column order of the exported CSV.
fieldnames = ['Title', 'Make', 'Model', 'Licence', 'Orientation', 'Barrel Length', 'Stock Length', 'Chokes', 'Origin', 'Trigger', 'Ejection', 'Scope', 'Serial No', 'Stock No', 'Condition', 'Description', 'Price']

# CSV column -> label text that precedes the value's <span> on a gun page.
span_labels = {
    'Make': 'Make:',
    'Model': 'Model:',
    'Licence': 'Licence:',
    'Orientation': 'Orient.:',
    'Barrel Length': 'Barrel:',
    'Stock Length': 'Stock:',
    'Chokes': 'Chokes:',
    'Origin': 'Origin:',
    'Trigger': 'Trigger:',
    'Ejection': 'Ejection:',
    'Scope': 'Scope:',
    'Serial No': 'Serial #:',
    'Stock No': 'Stock #:',
    'Condition': 'Condition:',
}

# CSV column -> CSS selector for values that are not label/span pairs.
text_selectors = {
    'Title': 'h1[itemprop="name"]',
    'Price': 'p.price',
    'Description': 'p[itemprop="description"]',
}


def span_for_label(label):
    """Return a ``soup.find`` filter matching the <span> that follows *label*.

    The filter accepts a tag when it is a <span> and the first child of its
    parent contains the label text (e.g. ``'Make:'``).
    """
    def matches(tag):
        return tag.name == 'span' and label in tag.parent.contents[0]
    return matches


def scrape_gun(page_html):
    """Parse one gun page and return a dict with one string per CSV column.

    Any field that cannot be found on the page is reported as ``'none'``.
    """
    soup = BeautifulSoup(page_html, 'html.parser')
    row = {}
    for column, label in span_labels.items():
        span = soup.find(span_for_label(label))
        # get_text() always yields a single string, even when the span has
        # several child nodes, so every column stays scalar.
        row[column] = span.get_text(' ', strip=True) if span is not None else 'none'
    for column, selector in text_selectors.items():
        node = soup.select_one(selector)
        row[column] = node.text if node is not None else 'none'
    return row


all_links = []
#grab all links which contain the href specified
url = "https://www.guntrader.uk/dealers/minsterley/minsterley-ranges/guns?page={}"
for page in range(1, 3):
    # Substitute the page number; without .format() the same URL is fetched twice.
    res = requests.get(url.format(page), timeout=30).text
    soup = BeautifulSoup(res, 'html.parser')
    for link in soup.select('a[href*="dealers/minsterley/minsterley-ranges/guns/"]'):
        all_links.append("https://www.guntrader.uk" + link['href'])

# One dict per gun page; building the frame once avoids repeated copying.
rows = [scrape_gun(requests.get(a_link, timeout=30).text) for a_link in all_links]
df_main = DataFrame(rows, columns=fieldnames)
df_main.to_csv('Guntrader_Dealer.csv', encoding='UTF-8')
推荐阅读
- python - 如何匹配正则表达式并获取先例词
- javascript - 同时设置:关注轮播和物品的变化
- django - FileNotFoundError [Errno 2] 没有这样的文件或目录:调整图像大小后
- c# - “全局”解析服务不再导致属性注入
- python - 基于linux在某个conda环境中设置环境变量
- javascript - 从 API 获取时如何存储数据
- uwp - Windows 运行时组件中未解析的外部 __imp_strdup
- c# - iOS 设备上的 Xamarin.Forms AbsoluteLayout 问题(内容不可见)
- python - 空闲编辑器中的正则表达式替换
- continuous-deployment - 如何根据 Docker 注册表中的更改触发 Google Cloud Build