python - 使用 BeautifulSoup 抓取不同的 URL 和特定行
问题描述
我试图从这个网站抓取股票代码:https://in.finance.yahoo.com/quote/%5EDJI?p=%5EDJI
然后我尝试从每个代码对应的详情页获取其交易量,例如:https://in.finance.yahoo.com/quote/HON?p=HON
问题是,它获得了“quote/HON?p=HON”,但无法将其添加到网址“https://in.finance.yahoo.com/quote/”
第二个问题是,我不知道如何从详情页获取“成交量”(Volume),例如从这个页面:https://in.finance.yahoo.com/quote/HON?p=HON
我尝试了很多想法,但不知道如何解决这个问题..
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup
urls = ['https://de.finance.yahoo.com/quote/%5EIXIC/components?p=%5EIXIC',
'https://de.finance.yahoo.com/quote/%5EDJI/components?p=%5EDJI',
'https://de.finance.yahoo.com/quote/%5EGSPC/components?p=%5EGSPC']
current_date = datetime.now()
header = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'}
stock_information_yahoo_finance = pd.DataFrame(columns=['Name', 'Volume', 'Volume2'])
stock_symbols = []
for url in urls:
r: requests.Response = requests.get(url, headers=header)
soup = BeautifulSoup(r.content, 'lxml')
for item in soup.select('tr.BdT'):
stock_symbols.append(item.select_one('a').text)
for symbol in sorted(stock_symbols):
r = requests.get("https://in.finance.yahoo.com/quote/" + symbol)
detailed_soup = BeautifulSoup(r.content, "lxml")
summary_info = detailed_soup.find('div', {'id': 'quote-summary'})
name = detailed_soup.find_all("h1", {"class": "D(ib) Fz(18px)"})
try:
for td_tag in summary_info.find('table'):
span_tag = td_tag.findAll('span', {'class': 'Trsdu(0.3s)'})
volume = span_tag[4].text
avg_volume = span_tag[5].text
volume = volume.replace(",", "")
avg_volume = avg_volume.replace(",", "")
volume = int(volume)
avg_volume = int(avg_volume)
volume2 = avg_volume * 2
stock_information_yahoo_finance = stock_information_yahoo_finance.append({ 'Name' : name,
'Volume': volume, 'Volume2': volume2}, ignore_index=True)
except:
volume = '0'
avg_volume = '0'
volume2 = '0'
stock_information_yahoo_finance = stock_information_yahoo_finance.append({
'Volume': volume, 'Volume2': volume2}, ignore_index=True)
stocks2 = stock_information_yahoo_finance.loc[
stock_information_yahoo_finance['Volume'] > stock_information_yahoo_finance['Volume2'], :]
print(stocks2.to_string())
解决方案
我不是 100% 确定你想要做什么。下面的代码会抓取此 URL:https://in.finance.yahoo.com/quote/%5EIXIC/components?p=%5EIXIC 并从表中提取公司代码。这些代码被添加到一个列表中,用于在各个详情页上查询其他数据点。这些数据点可以添加到字典、列表或数据框中。
import requests
from bs4 import BeautifulSoup
url = 'https://in.finance.yahoo.com/quote/%5EIXIC/components?p=%5EIXIC'
header = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'}
stock_symbols = []
r: requests.Response = requests.get(url, headers=header)
soup = BeautifulSoup(r.content, 'lxml')
for item in soup.select('tr.BdT'):
stock_symbols.append(item.select_one('a').text)
for symbol in sorted(stock_symbols):
r = requests.get("https://in.finance.yahoo.com/quote/" + symbol)
detailed_soup = BeautifulSoup(r.content, "lxml")
summary_info = detailed_soup.find('div', {'id': 'quote-summary'})
span_tag = summary_info.findAll('span', {'class': 'Trsdu(0.3s)'})
previous_close = span_tag[0].text
open_price = span_tag[1].text
bid_price = span_tag[2].text
ask_price = span_tag[3].text
volume = span_tag[4].text
avg_volume = span_tag[5].text
这是一个将股票信息保存到 Pandas 数据框的示例。
import requests
import pandas as pd
from time import sleep
from random import randint
from datetime import datetime
from bs4 import BeautifulSoup
url = 'https://in.finance.yahoo.com/quote/%5EIXIC/components?p=%5EIXIC'
current_date = datetime.now()
header = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'}
stock_information_yahoo_finance = pd.DataFrame(columns=['Date', 'Symbol', 'Previous Close Price', 'Open Price',
'Bid Price', 'Ask Price', 'Volume', 'Average Volume'])
stock_symbols = []
r: requests.Response = requests.get(url, headers=header)
soup = BeautifulSoup(r.content, 'lxml')
for item in soup.select('tr.BdT'):
stock_symbols.append(item.select_one('a').text)
for symbol in sorted(stock_symbols):
r = requests.get("https://in.finance.yahoo.com/quote/" + symbol)
detailed_soup = BeautifulSoup(r.content, "lxml")
summary_info = detailed_soup.find('div', {'id': 'quote-summary'})
span_tag = summary_info.findAll('span', {'class': 'Trsdu(0.3s)'})
previous_close = span_tag[0].text
open_price = span_tag[1].text
bid_price = span_tag[2].text
ask_price = span_tag[3].text
volume = span_tag[4].text
avg_volume = span_tag[5].text
stock_information_yahoo_finance = stock_information_yahoo_finance.append({
'Date': current_date.strftime("%m-%d-%Y"), 'Symbol': symbol, 'Previous Close Price': previous_close,
'Open Price': open_price, 'Bid Price': bid_price, 'Ask Price': ask_price, 'Volume': volume,
'Average Volume': avg_volume}, ignore_index=True)
# add random delay to prevent Yahoo Finance from blocking requests
sleep(randint(1, 5))
print(stock_information_yahoo_finance.to_string())
#output
Date Symbol Previous Close Price Open Price Bid Price Ask Price Volume Average Volume
0 02-17-2021 ACCD 53.76 53.69 50.02 x 1000 52.99 x 800 612,444 704,749
1 02-17-2021 AGLE 7.27 7.27 7.35 x 900 7.30 x 1100 204,802 224,822
2 02-17-2021 ATEC 16.67 16.56 16.43 x 1100 16.45 x 3100 437,393 847,203
3 02-17-2021 BBI 1.6000 1.5600 1.5000 x 3200 1.5200 x 1000 2,730,143 4,035,214
4 02-17-2021 BBQ 6.36 6.44 6.00 x 1300 6.39 x 1100 5,701 16,236
5 02-17-2021 CIIC 27.34 27.10 26.85 x 2200 27.50 x 800 1,086,149 4,664,344
6 02-17-2021 CVCO 215.00 213.29 215.87 x 800 252.20 x 2200 21,097 42,231
7 02-17-2021 EXAS 154.28 148.95 142.10 x 1400 145.45 x 1800 5,246,528 1,724,611
8 02-17-2021 FARM 6.59 6.59 6.72 x 800 6.76 x 1000 177,308 197,201
9 02-17-2021 FARO 79.73 78.87 78.25 x 800 77.93 x 800 101,062 87,798
10 02-17-2021 FEYE 21.07 20.98 20.54 x 900 20.70 x 2200 6,727,862 9,450,016
11 02-17-2021 FNKO 13.08 13.14 12.57 x 900 13.48 x 800 444,766 773,173
12 02-17-2021 FNLC 24.89 24.80 24.81 x 800 26.00 x 1000 8,143 15,695
13 02-17-2021 FRSX 10.00 9.75 9.60 x 800 9.66 x 900 4,063,999 14,712,591
14 02-17-2021 FRTA 21.68 21.41 21.76 x 1100 21.78 x 800 233,758 432,391
15 02-17-2021 IFRX 6.22 6.12 6.31 x 800 6.32 x 1000 320,425 329,272
16 02-17-2021 NEBC 10.75 10.88 10.46 x 800 10.87 x 1300 766,067 380,736
17 02-17-2021 NMRD 8.04 7.71 7.35 x 900 7.50 x 1400 289,880 212,036
18 02-17-2021 NMRK 8.65 8.51 8.73 x 2200 8.74 x 1200 706,172 933,298
19 02-17-2021 NVCN 2.2600 2.2500 2.1000 x 3200 2.1100 x 4000 17,213,552 5,553,301
20 02-17-2021 NVCR 185.31 182.65 173.80 x 900 181.30 x 800 524,242 957,014
21 02-17-2021 OLMA 48.55 50.33 47.50 x 1000 54.50 x 1000 152,455 157,635
22 02-17-2021 OPTT 5.0200 4.9300 4.6600 x 1100 4.6900 x 3200 6,398,491 11,193,375
23 02-17-2021 SCOA 10.45 10.49 10.38 x 2200 10.40 x 800 263,090 455,133
24 02-17-2021 SCOR 3.8800 3.8600 3.8100 x 1000 3.8600 x 3200 572,379 724,314
25 02-17-2021 SDC 12.43 12.50 12.23 x 3000 12.24 x 3000 6,077,952 6,864,586
26 02-17-2021 SDH 5.0800 4.9900 4.8100 x 1000 5.0500 x 1000 225,981 960,750
27 02-17-2021 SLAB 153.75 150.87 150.00 x 1100 150.02 x 1000 338,618 270,232
28 02-17-2021 TSLA 796.22 779.09 793.65 x 800 793.71 x 1300 26,078,898 43,111,968
29 02-17-2021 TWST 156.25 155.00 153.00 x 1000 154.61 x 1000 981,909 814,265
推荐阅读
- sql - 统计 SQL 语句中返回的行数
- html - Srcset 似乎找不到正确的图像
- rust - 使用参考枢轴循环时出现可变借用问题
- rust - 如何使用潮汐和glommio将错误传播回调用者?
- python - 使用方法链接分配给 Pandas 数据框
- azure-active-directory - 在 Azure AD 应用程序对象上设置 AppId uri 时获取“HostNameNotOnVerifiedDomain”
- javascript - 如何使用javascript在href中获取标签值
- python - Python3 Pandas 键错误:0
- spss - 比较或组合列中的值
- r - 在字符之间添加逗号