python - 使用抓取的数据创建数据框
问题描述
我正在尝试按如下方式抓取数据:
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
# One timestamp per calendar day from 2015-02-25 through 2020-09-16;
# each one is turned into a YYYYMMDD archive URL by main() below.
daterange = pd.date_range('02-25-2015', '09-16-2020', freq='D')
# Fetch the politics archive page for `date` through `req` (anything with
# a .get(url) method, here a requests session) and return a single
# (date text, title text, announce text) tuple for that page.
def main(req, date):
r = req.get(f"https://it.sputniknews.com/politica/{date.strftime('%Y%m%d')}")
# Debug output: the response object and its raw body.
print(r, r.content)
soup = BeautifulSoup(r.content, 'html.parser')
# NOTE(review): `tag` is never reassigned because the loop below is
# commented out, so the return statement calls .select_one on None and
# raises AttributeError.
tag=None
# NOTE(review): "b-plainlist" has no leading dot, so it is a tag-name
# selector rather than a class selector.
print (soup.select("b-plainlist"))
#for tag in soup.select(".b-plainlist "):
#print(tag.select_one(".b-plainlist__date").text)
#print(tag.select_one(".b-plainlist__title").text)
#print(tag.find_next(class_="b-plainlist__announce").text.strip())
# NOTE(review): even with `tag` bound, this returns one tuple per page,
# i.e. at most one row per requested day.
return tag.select_one(".b-plainlist__date").text, tag.select_one(".b-plainlist__title").text, tag.find_next(class_="b-plainlist__announce").text.strip()
# Submit one main() call per day to a pool of 30 worker threads, all
# sharing a single requests session.
with ThreadPoolExecutor(max_workers=30) as executor:
with requests.Session() as req:
fs = [executor.submit(main, req, date) for date in daterange]
# Collect results in submission order; f.result() blocks until that
# call finishes and re-raises any exception raised inside main().
allin = []
for f in fs:
allin.append(f.result())
# Each collected tuple becomes one row with the three named columns.
df = pd.DataFrame.from_records(
allin, columns=["Date", "Title", "Content"])
print(df)
我想创建一个按时间排列、包含日期、标题和内容三列的数据框。
这段代码应该没问题,但我无法创建一个“干净”的数据框,所以我认为标签有问题。你能看看吗?谢谢
解决方案
我无法完全按原样运行您的代码,但要得到“干净”的数据框,需要注意:每条新闻都包含在一个 .b-plainlist__item 元素中,因此您必须选中所有这些元素,逐个遍历,并从每个元素中取出 .b-plainlist__date、.b-plainlist__title 等字段。
以下对我有用:
import datetime
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from dateutil import rrule
import pandas as pd
import requests
def get_items(session, date):
    """Fetch one day's politics archive page and return its news items.

    The result is built eagerly and returned as a list rather than
    yielded. A generator function does no work when it is called, so
    submitting one to an executor would only create the generator in the
    worker thread; the HTTP request and the parsing would then run later,
    serially, in whichever thread iterates the future's result.

    Args:
        session: Object with a ``get(url)`` method returning a response
            that exposes ``.content`` (e.g. a ``requests.Session``).
        date: Object with ``strftime``; formatted as ``YYYYMMDD`` to
            build the archive URL.

    Returns:
        A list of dicts with keys ``'Date'`` (``datetime.datetime`` parsed
        from the item's ``HH:MM DD.MM.YYYY`` text), ``'Title'`` and
        ``'Content'``, one per ``.b-plainlist__item`` element on the page.
        The list is empty when the page contains no such element.

    Raises:
        AttributeError: If an item lacks a date, title or announce
            element.
        ValueError: If an item's date text does not match
            ``'%H:%M %d.%m.%Y'``.
    """
    url_template = 'https://it.sputniknews.com/politica/{}'
    r = session.get(url_template.format(date.strftime("%Y%m%d")))
    soup = BeautifulSoup(r.content, 'html.parser')

    records = []
    for item in soup.select(".b-plainlist__item"):
        # Use a separate name so the `date` argument is not rebound by
        # the per-item timestamp.
        published = datetime.datetime.strptime(
            item.select_one(".b-plainlist__date").text,
            '%H:%M %d.%m.%Y'
        )
        title = item.select_one(".b-plainlist__title").text
        content = item.select_one(".b-plainlist__announce").text
        records.append({'Date': published, 'Title': title, 'Content': content})
    return records
# Daily recurrence covering 2015-02-25 through 2020-09-16; each occurrence
# is passed to get_items as the day whose archive page is requested.
daterange = rrule.rrule(
    freq=rrule.DAILY,
    dtstart=datetime.datetime(2015, 2, 25),
    until=datetime.datetime(2020, 9, 16),
)

# One task per day is submitted to a 30-thread pool, all sharing a single
# HTTP session. The session is closed first on exit, then the pool is
# shut down.
with ThreadPoolExecutor(max_workers=30) as executor, \
        requests.Session() as session:
    fs = [executor.submit(get_items, session, day) for day in daterange]
    # Flatten the per-day item collections in submission order.
    result = [record for future in fs for record in future.result()]

# Rows are the scraped items, indexed by their parsed publication time.
df = pd.DataFrame.from_records(result, index='Date')
输出
df.head()
Title Content
Date
2015-02-25 18:21:00 Che cosa accadrà dopo la tregua di Minsk? Nonostante il cessate il fuoco entrato nel vig...
2015-02-26 14:12:00 Renzi va a Mosca con un pacchetto misterioso Il viaggio di Matteo Renzi a Mosca – previsto ...
2015-02-26 13:47:00 Turchia all’UE: non aspetteremo all’infinito Lo scopo della Turchia è appunto quello di ade...
2015-02-26 11:24:00 Tutte le portaerei mediatiche sparano ad alzo ... Riassumiamo le ultime vicende della crisi est-...
2015-02-26 11:01:00 Nel dopo Minsk gli ostacoli prevalgono sull'ot... Nella ridda di intepretazioni dei risultati de...
推荐阅读
- python - Tile rendering with opengl
- python - Create virtual environment with all packages shown in `conda list`
- c# - code throws Object synchronization method was called from an unsynchronized block of code
- javascript - 如何修复 JS 中的 insertBefore() 错误
- imagemagick - 将当前日期添加到签名图像并将其添加到现有 pdf
- google-apps-script - Automatically generate a unique sequential ID in Google Sheets
- javascript - Is it possible to filter on multiple map values in Firestore
- python - 如何有效地计算 pandas 中所有后续行的平均值?
- c++ - 无法安装柯南包特征
- java - How to deal with underflow in java?