python - BeautifulSoup - Scraped content is only written to the first text file, not to subsequent files
Problem Description
I am currently using the code below to scrape data from a sports TV schedule website and write the information out to text files. With the code I have at the moment, the data prints correctly to the console, and the data from the first URL (https://sport-tv-guide.live/live/darts) is written to its text file as expected.
The problem is that the content from the second URL (https://sport-tv-guide.live/live/boxing/) is not written to its text file: the file is created, but it is empty.
The code I am using is below:
import requests
import time
from bs4 import BeautifulSoup

def makesoup(url):
    cookies = {'mycountries' : '101,28,3,102,42,10,18,4,2,22', 'user_time_zone': 'Europe/London', 'user_time_zone_id': '1'}
    r = requests.post(url, cookies=cookies)
    return BeautifulSoup(r.text,"lxml")

def linkscrape(links, savefile):
    baseurl = "https://sport-tv-guide.live"
    urllist = []
    for link in links:
        finalurl = (baseurl+ link['href'])
        urllist.append(finalurl)
        # print(finalurl)
    for singleurl in urllist:
        soup2=makesoup(url=singleurl)
        g_data=soup2.find_all('div', {'id': 'channelInfo'})
        c_data=soup2.find_all('div', {'class': 'liveOtherStations clearfix'})
        with open(savefile ,"w") as text_file:
            for match in g_data:
                try:
                    hometeam = match.find_previous('div', class_='cell40 text-center teamName1').text.strip()
                    awayteam = match.find_previous('div', class_='cell40 text-center teamName2').text.strip()
                    print("Competitors; ", hometeam +" "+ "vs" +" "+ awayteam)
                except:
                    hometeam = "Home Team element not found"
                    awayteam = "Away Team element not found"
                try:
                    startime = match.find('div', class_='time full').text.strip()
                    print("Time; ", startime)
                except:
                    startime = "Time element not found"
                try:
                    event= match.find('div', class_='title full').text.strip()
                    print("Event:", event)
                except:
                    event = "Event element not found"
                try:
                    dateandtime = match.find('div', class_='date full').text.strip()
                    print("Date:", dateandtime)
                except:
                    dateandtime = "Date not found"
                try:
                    sport = match.find('div', class_='text full').text.strip()
                    print("Sport:", sport)
                except:
                    sport = "Sport element not found"
                try:
                    singlechannel = match.find('div', class_='station full').text.strip()
                    print("Main Channel:", singlechannel)
                    print("-----")
                except:
                    singlechannel = "Single Channel element not found"
                for channel in c_data:
                    try:
                        channels = match.find('div', class_='stationLive active col-wrap')
                        print("Extra Channels:", channel.text)
                    except:
                        channels = "No channels found"
                        print(channels)
                    print("-------")
                    text_file.writelines("__**Sport:**__" +':' + ' '+ sport +" \n"+"__**Competitors:**__" +':' + ' '+ hometeam + awayteam + event+" \n"+"__**Match Date:**__" +':' + ' ' +dateandtime +" \n"+"__**Match Time:**__"+':' + ' ' +startime +" \n"+ "__**Main Channel**__"+':' + ' '+singlechannel+" \n" + "__**Channels**__"+':' + ' '+channel.text+" \n"+'-' *20 + " \n")

def matches():
    dict = {"https://sport-tv-guide.live/live/darts/":"/home/brendan/Desktop/testing,txt",
            "https://sport-tv-guide.live/live/boxing/":"/home/brendan/Desktop/boxing.txt"}
    for key, value in dict.items():
        soup=makesoup(url = key)
        linkscrape(links= soup.find_all('a', {'class': 'article flag', 'href' : True}) , savefile = value)

matches()
I think the problem may be with the placement of the with statement that opens the text file, so that the file gets created but the actual writelines call no longer runs correctly after the first text file has been written successfully. I have tried moving all of the code from the with statement onwards, but this made no difference to the output.
Unfortunately, I am not sure how to proceed from here.
Thanks to anyone who can help with or solve this problem.
Solution
Found the problem. In your code, the boxing URL (https://sport-tv-guide.live/live/boxing/) has no extra channels, so control never enters the inner for channel in c_data loop, and the writelines call inside that loop never runs, leaving the file empty.
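A minimal, self-contained illustration of the failure mode (the names mirror the code above; the empty list stands in for what find_all returns on the boxing pages):

    # Stand-in for soup2.find_all(...) matching nothing on a boxing page
    c_data = []

    for channel in c_data:
        # Never reached when c_data is empty, so the writelines call
        # inside this loop never runs and the file stays empty.
        print("would write a line for", channel)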
You can collect all of the extra channels in a list first and then write to the file:
import requests
import time
from bs4 import BeautifulSoup

def makesoup(url):
    cookies = {'mycountries' : '101,28,3,102,42,10,18,4,2,22', 'user_time_zone': 'Europe/London', 'user_time_zone_id': '1'}
    r = requests.post(url, cookies=cookies)
    return BeautifulSoup(r.text,"lxml")

def linkscrape(links, savefile):
    baseurl = "https://sport-tv-guide.live"
    urllist = []
    print(savefile)
    for link in links:
        finalurl = (baseurl+ link['href'])
        urllist.append(finalurl)
        # print(finalurl)
    for singleurl in urllist:
        soup2=makesoup(url=singleurl)
        g_data=soup2.find_all('div', {'id': 'channelInfo'})
        c_data=soup2.find_all('div', {'class': 'liveOtherStations clearfix'})
        with open(savefile ,"w") as text_file:
            for match in g_data:
                try:
                    hometeam = match.find_previous('div', class_='cell40 text-center teamName1').text.strip()
                    awayteam = match.find_previous('div', class_='cell40 text-center teamName2').text.strip()
                    print("Competitors; ", hometeam +" "+ "vs" +" "+ awayteam)
                except:
                    hometeam = "Home Team element not found"
                    awayteam = "Away Team element not found"
                try:
                    startime = match.find('div', class_='time full').text.strip()
                    print("Time; ", startime)
                except:
                    startime = "Time element not found"
                try:
                    event= match.find('div', class_='title full').text.strip()
                    print("Event:", event)
                except:
                    event = "Event element not found"
                try:
                    dateandtime = match.find('div', class_='date full').text.strip()
                    print("Date:", dateandtime)
                except:
                    dateandtime = "Date not found"
                try:
                    sport = match.find('div', class_='text full').text.strip()
                    print("Sport:", sport)
                except:
                    sport = "Sport element not found"
                try:
                    singlechannel = match.find('div', class_='station full').text.strip()
                    print("Main Channel:", singlechannel)
                    print("-----")
                except:
                    singlechannel = "Single Channel element not found"
                # Collect the extra channels in a list instead of writing
                # from inside the loop, so a match with no extra channels
                # still gets written to the file.
                extra_channels = []
                for channel in c_data:
                    try:
                        channels = match.find('div', class_='stationLive active col-wrap')
                        print("Extra Channels:", channel.text)
                        extra_channels.append(channel.text)
                    except:
                        channels = "No channels found"
                        print(channels)
                        extra_channels.append(channel.text)
                    print("-------")
                if extra_channels:
                    # One line per extra channel.
                    for channel in extra_channels:
                        text_file.writelines("__**Sport:**__" +':' + ' '+ sport +" \n"+"__**Competitors:**__" +':' + ' '+ hometeam + awayteam + event+" \n"+"__**Match Date:**__" +':' + ' ' +dateandtime +" \n"+"__**Match Time:**__"+':' + ' ' +startime +" \n"+ "__**Main Channel**__"+':' + ' '+singlechannel+" \n" + "__**Channels**__"+':' + ' '+channel+" \n"+'-' *20 + " \n")
                else:
                    # No extra channels: write the match details anyway.
                    text_file.writelines("__**Sport:**__" +':' + ' '+ sport +" \n"+"__**Competitors:**__" +':' + ' '+ hometeam + awayteam + event+" \n"+"__**Match Date:**__" +':' + ' ' +dateandtime +" \n"+"__**Match Time:**__"+':' + ' ' +startime +" \n"+ "__**Main Channel**__"+':' + ' '+singlechannel+" \n" + "__**Channels**__"+':' + " \n"+'-' *20 + " \n")

def matches():
    dict = {"https://sport-tv-guide.live/live/darts/":"testing.txt",
            "https://sport-tv-guide.live/live/boxing/":"boxing.txt"}
    for key, value in dict.items():
        soup=makesoup(url = key)
        linkscrape(links= soup.find_all('a', {'class': 'article flag', 'href' : True}) , savefile = value)

matches()
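One remaining caveat in both versions: open(savefile, "w") sits inside the for singleurl in urllist loop, so every event URL re-truncates the file and only the last URL's matches survive. A small self-contained sketch of the effect (demo.txt is a hypothetical throwaway file):

    # Mode "w" truncates on every open, mirroring the per-URL loop above.
    for text in ("first url", "second url"):
        with open("demo.txt", "w") as f:
            f.write(text + "\n")

    print(open("demo.txt").read())   # prints only "second url"

Opening the file once before the URL loop (or truncating once up front and then opening with append mode "a") would keep the output from every URL.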