python - 我用 Python BeautifulSoup 抓取雅虎财经报价数据的爬虫坏了。上周是不是有什么变化?
问题描述
def get_fy(ticker, outFile):
# Fetch the Yahoo Finance quote page for `ticker` and verify that its <title>
# contains 'Stock Price'. When the check fails, print diagnostics and retry
# by calling get_fy again; give up with False once `retry` exceeds 2.
# `outFile` is not used in the portion of the function shown here.
# NOTE: this excerpt is truncated - the `try:` below has no visible
# `except`/`finally`, and the function continues past the last line shown.
global retry  # module-level counter; incremented here, never reset in the visible code
# print ( "get_fy processing", ticker, ", retry =", retry )
try:
# build the quote URL and fetch the page with urlopen (no web driver is involved)
p1 = "http://finance.yahoo.com/quote/"
p2 = "?p="
p3 = "&.tsrc=fin-srch-v1"  # defined but never appended to the URL below
print(p1+ticker+p2+ticker)
r = urlopen(p1+ticker+p2+ticker)
# NOTE(review): the sample output quoted after this snippet shows unreadable
# bytes and 0 td objects - presumably the response body is not plain HTML
# (compressed, or a bot-detection page?); confirm by inspecting the response
# headers.
bsObj = BeautifulSoup(r,"html5lib")
# bsObj = BeautifulSoup(r.read())
titleList = bsObj.findAll("title")
# title = title_re.search(r.data)
# Treat the page as bad when there is no <title> or it lacks 'Stock Price'.
if not titleList or titleList[0].get_text().find('Stock Price')<0:
# Diagnostics describing what was actually received.
print ("found", len(bsObj.get_text()), "bytes" )
print ("found title at", bsObj.get_text().find("<title>"))
print ("found", len(bsObj.findAll("td")), "td objects")
print (bsObj.get_text()[:70])
if titleList: print("found titleList[0]:", titleList[0])
else: print("titleList was not found")
#dump = open('dump.html', 'w')
#dump.write(bsObj.get_text())
#dump.close()
# Stop retrying once the shared counter exceeds 2.
if retry > 2: return False
retry = retry+1
time.sleep(10)  # wait 10 seconds before the next attempt
print ("retry count", retry, "for", ticker )
return get_fy(ticker,outFile)
# Page looks valid: collect every <td> cell for the processing that follows
# (not shown in this excerpt).
elements = bsObj.findAll("td")
处理股票代码 F(http://finance.yahoo.com/quote/F?p=F)时的输出如下:
found 140676 bytes
found title at -1
found 0 td objects
▼ì½ízÓÈÒ(ú⌂]…ð♀`¿XŠ¾-ÛcX! ♥k%„�♦سxyóÈ’∟k!K▲INb2ÞϹ�s{çJNUuKÖ—C↕☻3³ö
titleList was not found
retry count 1 for F
解决方案
该代码似乎首先尝试获取给定股票代码的 HTML 标题标签,然后获取其中一个表格。你可以这样做:
from bs4 import BeautifulSoup
import requests
def get_fy(ticker, outFile):
    """Print the title and first statistics table of a Yahoo Finance page.

    Requests the key-statistics page for ``ticker`` with a browser-like
    User-Agent header, prints the request URL and the page title, then
    prints the first two cells of every row of the first ``<table>`` in
    the parsed document.

    Args:
        ticker: Ticker symbol interpolated into the URL, e.g. ``"LMT"``.
        outFile: Accepted for interface compatibility only; this function
            neither reads nor writes it.

    Returns:
        None. All results are written to standard output.
    """
    url = f'https://finance.yahoo.com/quote/{ticker}/key-statistics?p={ticker}'
    print(url)

    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}
    # Without a timeout, requests may block forever on an unresponsive server.
    r = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(r.content, "html5lib")

    # soup.title / soup.table are None when the tag is absent (e.g. an error
    # or consent page), so guard before dereferencing them.
    title_tag = soup.title
    print(title_tag.text if title_tag is not None else "(no <title> found)")

    table = soup.table
    if table is None:
        print("no <table> found in response")
        return

    for tr in table.find_all('tr'):
        row = [td.text for td in tr.find_all('td')]
        # Header-only or spacer rows have fewer than two <td> cells; skip
        # them instead of failing with IndexError.
        if len(row) < 2:
            continue
        print(f'{row[0]:30} {row[1]}')


get_fy("LMT", "test.txt")
这将显示:
https://finance.yahoo.com/quote/LMT/key-statistics?p=LMT
Lockheed Martin Corporation (LMT) Valuation Measures & Financial Statistics
Market Cap (intraday) 5 106.48B
Enterprise Value 3 115.73B
Trailing P/E 15.46
Forward P/E 1 13.74
PEG Ratio (5 yr expected) 1 2.79
Price/Sales (ttm) 1.61
Price/Book (mrq) 16.87
Enterprise Value/Revenue 3 1.75
Enterprise Value/EBITDA 7 11.33
页面中应该只有一个 <title> 标签,所以不需要调用 .find_all()(它总是只会返回一个单元素列表)。此外,该站点要求请求带有 User-Agent 请求头,才会返回正常的 HTML。
推荐阅读
- amazon-web-services - 在扩展 aws 实例(docker 容器)时运行脚本
- java - 如何部署 JavaFX 14 JDK11 HSQLDB Java 桌面应用程序
- python - 回调错误更新 plot-div.children (Plotly Dash)
- django - 不尊重每个用户的 Django 特定语言
- rails-activestorage - 如何在 Ubuntu 18.04 LTS 上安装 poppler,以便 ActiveStorage 可以预览 PDF?
- amazon-web-services - 网页未加载,在网络选项卡中显示 304 错误。aws ec2 中的托管应用程序
- typescript - 如何使用 Nest CLI 创建我自己的原理图(schematics)
- javascript - 在 ReactJS 中的数组对象后添加空格
- ios - 如何在 Swift 中使用陀螺仪
- three.js - 如何使用 gdal 对 Float32 栅格文件进行显著压缩