python - 无法使用 Beautiful Soup 从 “a” 标签中提取 href 属性
问题描述
我正在尝试借助以下代码从 youtube 播放列表中抓取链接:
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pyperclip
import time
# Question script: open a YouTube playlist in Firefox via Selenium, then try
# three different ways of reading the href of every matched <a> element.
url = input('Please enter youtube playlist url: ')
driver = webdriver.Firefox()
driver.get(url)
# Send END to the <html> element twice with a pause in between
# (presumably to scroll down so that more playlist entries get rendered).
elem = driver.find_element_by_tag_name('html')
elem.send_keys(Keys.END)
time.sleep(3)
elem.send_keys(Keys.END)
# Take the browser's current body markup and hand it to Beautiful Soup.
innerHTML = driver.execute_script("return document.body.innerHTML")
soup = bs(innerHTML, 'html.parser')
res = soup.select('div#content.style-scope.ytd-playlist-video-renderer a.yt-simple-endpoint.style-scope.ytd-playlist-video-renderer')
whole_list = ''
for i in res:
    # Three spellings of the same attribute lookup. Tag.get() returns None
    # when the attribute is absent; the two subscript forms raise KeyError.
    print(i.get('href'))
    print(i['href'])
    print(i.attrs['href'])
    # whole_list = whole_list + " '" + i.get('href') + "', \n"
print(whole_list)
pyperclip.copy(whole_list)
driver.close()
而 youtube 的播放列表视频组件在 chrome 开发者工具中显示如下:
<a class="yt-simple-endpoint style-scope ytd-playlist-video-renderer" href="/watch?v=QXeEoD0pB3E&list=PLsyeobzWxl7poL9JTVyndKe62ieoN-MZ3&index=2&t=0s">
<ytd-thumbnail id="thumbnail" height="68" width="120" class="style-scope ytd-playlist-video-renderer">
<a id="thumbnail" class="yt-simple-endpoint inline-block style-scope ytd-thumbnail" aria-hidden="true" tabindex="-1" rel="null" href="/watch?v=QXeEoD0pB3E&list=PLsyeobzWxl7poL9JTVyndKe62ieoN-MZ3&index=2&t=0s">
<yt-img-shadow class="style-scope ytd-thumbnail no-transition" style="background-color: transparent;" loaded=""><img id="img" class="style-scope yt-img-shadow" alt="" width="120" src="https://i.ytimg.com/vi/QXeEoD0pB3E/hqdefault.jpg?sqp=-oaymwEZCPYBEIoBSFXyq4qpAwsIARUAAIhCGAFwAQ==&rs=AOn4CLCsnnE_5VNrXFHejH29sP0T7NSSmw"></yt-img-shadow>
<div id="overlays" class="style-scope ytd-thumbnail"><ytd-thumbnail-overlay-resume-playback-renderer class="style-scope ytd-thumbnail"><div id="progress" class="style-scope ytd-thumbnail-overlay-resume-playback-renderer" style="width: 100%;"></div></ytd-thumbnail-overlay-resume-playback-renderer><ytd-thumbnail-overlay-time-status-renderer class="style-scope ytd-thumbnail" overlay-style="DEFAULT"><span class="style-scope ytd-thumbnail-overlay-time-status-renderer" aria-label="66 seconds">
1:06
</span></ytd-thumbnail-overlay-time-status-renderer><ytd-thumbnail-overlay-now-playing-renderer class="style-scope ytd-thumbnail">
<span class="style-scope ytd-thumbnail-overlay-now-playing-renderer">Now playing</span>
</ytd-thumbnail-overlay-now-playing-renderer></div>
<div id="mouseover-overlay" class="style-scope ytd-thumbnail"></div>
<div id="hover-overlays" class="style-scope ytd-thumbnail"></div>
</a>
</ytd-thumbnail>
<div id="meta" class="style-scope ytd-playlist-video-renderer">
<h3 class="style-scope ytd-playlist-video-renderer">
<ytd-badge-supported-renderer class="style-scope ytd-playlist-video-renderer">
<dom-repeat id="repeat" as="badge" class="style-scope ytd-badge-supported-renderer"><template is="dom-repeat"></template></dom-repeat>
</ytd-badge-supported-renderer>
<span id="video-title" class="style-scope ytd-playlist-video-renderer" aria-label="#0 Python Tutorial | Python Programming Tutorial for Beginners | Course Introduction by Telusko 1 year ago 66 seconds 1,108,432 views" title="#0 Python Tutorial | Python Programming Tutorial for Beginners | Course Introduction">
#0 Python Tutorial | Python Programming Tutorial for Beginners | Course Introduction
</span>
</h3>
<ytd-video-meta-block class="playlist style-scope ytd-playlist-video-renderer">
<div id="metadata" class="style-scope ytd-video-meta-block">
<div id="byline-container" class="style-scope ytd-video-meta-block">
<ytd-channel-name id="channel-name" class="style-scope ytd-video-meta-block">
<div id="container" class="style-scope ytd-channel-name">
<div id="text-container" class="style-scope ytd-channel-name">
<yt-formatted-string id="text" class="style-scope ytd-channel-name complex-string" ellipsis-truncate="" title="Telusko" has-link-only_=""><a class="yt-simple-endpoint style-scope yt-formatted-string" spellcheck="false" href="/user/javaboynavin">Telusko</a></yt-formatted-string>
</div>
</div>
<ytd-badge-supported-renderer class="style-scope ytd-channel-name" disable-upgrade="" hidden="">
</ytd-badge-supported-renderer>
</ytd-channel-name>
<div id="separator" class="style-scope ytd-video-meta-block">•</div>
</div>
<div id="metadata-line" class="style-scope ytd-video-meta-block">
<dom-repeat strip-whitespace="" class="style-scope ytd-video-meta-block"><template is="dom-repeat"></template></dom-repeat>
</div>
</div>
<div id="additional-metadata-line" class="style-scope ytd-video-meta-block">
<dom-repeat class="style-scope ytd-video-meta-block"><template is="dom-repeat"></template></dom-repeat>
</div>
</ytd-video-meta-block>
</div>
<ytd-badge-supported-renderer id="badges" class="style-scope ytd-playlist-video-renderer" disable-upgrade="" hidden="">
</ytd-badge-supported-renderer>
<yt-formatted-string id="contributor" class="style-scope ytd-playlist-video-renderer" hidden=""></yt-formatted-string>
</a>
如您所见,我尝试了在网上找到的全部三种写法:使用 i.get('href') 得到的是 None,而其余两种写法则直接报错。从昨天开始我就被困在这个问题上,找不到自己哪里做错了。
解决方案
有时 <a> 可能没有 href,所以我会用 if 跳过它。
# Append each link to whole_list, skipping <a> tags that have no href.
for i in res:
    href = i.get('href')  # None when the tag carries no href attribute
    if href:
        whole_list = whole_list + " '" + href + "', \n"
这段代码可以获取某个播放列表中的所有 href。你会看到第一个 i 的 href 也是 None,但我跳过了这个值。
from bs4 import BeautifulSoup as BS
from selenium import webdriver
import pyperclip
import time
# Answer script: load a playlist page in Firefox, collect the href of every
# playlist-entry <a>, print them, and copy the quoted list to the clipboard.
#url = input('Please enter youtube playlist url: ')
url = 'https://www.youtube.com/playlist?list=PLmNPvQr9Tf-a4MrEG5thq3qzlkrF5NFbC'
driver = webdriver.Firefox()
try:
    driver.get(url)
    time.sleep(3)  # fixed pause before reading the page source
    html = driver.page_source
finally:
    # Always shut the browser session down, even if loading the page failed.
    driver.quit()

soup = BS(html, 'html.parser')
res = soup.select('a.yt-simple-endpoint.style-scope.ytd-playlist-video-renderer')

all_hrefs = []
for i in res:
    href = i.get('href')
    print(href)  # shows the raw value, including None for tags without href
    if href:
        all_hrefs.append(href)

text = ',\n'.join([" '{}'".format(x) for x in all_hrefs])
print(text)
pyperclip.copy(text)
推荐阅读
- python - 使用 Beautiful Soup 获取谷歌新闻标题标签
- python - 使用作为引用第三列复制列值
- python - 将重叠图像拼接在一起后平滑重叠图像的最佳方法?
- haskell - 如何在haskell中更改构造函数的优先级
- python - Heroku 没有检测到语言
- database - 从包含一些丢失文件的数据目录中恢复 PostgreSQL 数据
- raspberry-pi - 使用 Mono 在 Linux (Raspberry Pi) 上运行带网络功能的 .NET 程序集
- javascript - jQuery 在新的触发之前等待一个函数完成
- c++ - 交换链表的最后一个节点将进入无穷大——C++
- javascript - 在构造函数中同步加载图像