json - 一次抓取多个 JS 页面
问题描述
我正在尝试抓取一个网站,该网站具有多个由 Javascript 呈现的页面。我正在使用 BeautifulSoup 和 Selenium。我有一个有效的脚本,但仅适用于网站的第一页。是否可以抓取多个 javascript 呈现的页面,还是我需要单独进行?这是我的脚本:
import time
from bs4 import BeautifulSoup as soup
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import json
# The path to where you have your chrome webdriver stored:
webdriver_path = '/Users/rawlins/Downloads/chromedriver'
# Add arguments telling Selenium to not actually open a window
chrome_options = Options()
chrome_options.add_argument('--headless')
#chrome_options.add_argument('--window-size=1920x1080')
# Fire up the headless browser
browser = webdriver.Chrome(executable_path = webdriver_path,
chrome_options = chrome_options)
# Load webpage
url = "https://cnx.org/search?q=subject:Arts"
browser.get(url)
# to ensure that the page has loaded completely.
time.sleep(3)
data = []
n = 2
for i in range(1, n+1):
if (i == 1):
# handle first page
response = requests.get(url)
response = requests.get(url + "&page=" + str(i))
#response = requests.get(url + "&page=" + str(i),headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})
# Parse HTML, close browser
page_soup = soup(browser.page_source, 'lxml')
containers = page_soup.findAll("tr")
browser.quit()
for container in containers:
item = {}
item['type'] = "Course Material"
if container.find('td', {'class' : 'title'}):
item['title'] = container.find('td', {'class' : 'title'}).h4.text.strip()
else:
item['title'] = ""
if container.find('td', {'class' : 'authors'}):
item['author'] = container.find('td', {'class' : 'authors'}).text.strip()
else:
item['author'] = ""
if container.find('td', {'class' : 'title'}):
item['link'] = "https://cnx.org/" + container.find('td', {'class' : 'title'}).a["href"]
else:
item['link'] = ""
if container.find('td', {'class' : 'title'}):
item['description'] = container.find('td', {'class' : 'title'}).span.text
else:
item['description'] = ""
item['subject'] = "Arts"
item['source'] = "OpenStax CNX"
item['base_url'] = "https://cnx.org/browse"
item['license'] = "Attribution"
data.append(item) # add the item to the list
with open("js-webscrape.json", "w") as writeJSON:
json.dump(data, writeJSON, ensure_ascii=False)
提前致谢。
解决方案
这里有几个问题:
- 你在混进
requests.get()
来browser.get()
。这里根本不需要requests
模块,因为您是通过无头浏览器获取页面的。 - 不需要为第一页设置特殊情况。https://cnx.org/search?q=subject:Arts&page=1工作正常。
time.sleep()
应该在browser.get()
和解析之间,以允许页面在将其提供给 BeautifulSoup 之前完全加载。- 一旦所有页面都被抓取,您应该在循环
data
外写入 JSON 文件。for
- 在循环之外退出浏览器
for
,而不是在单次迭代之后。 - 为避免编码错误,请在写入 JSON 文件时指定编码:with
open("js-webscrape.json", "w", encoding="utf-8")
这是一个抓取所有 7 个页面的有效实现:
import time
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import json
# The path to where you have your chrome webdriver stored:
webdriver_path = '/Users/Gebruiker/Downloads/chromedriver_win32/chromedriver'
# Add arguments telling Selenium to not actually open a window
chrome_options = Options()
chrome_options.add_argument('--headless')
# Fire up the headless browser
browser = webdriver.Chrome(executable_path = webdriver_path, options = chrome_options)
# Load webpage
url = "https://cnx.org/search?q=subject:Arts"
data = []
n = 7
for i in range(1, n+1):
response = browser.get(url + "&page=" + str(i))
time.sleep(5)
# Parse HTML
page_soup = soup(browser.page_source,'lxml')
containers = page_soup.findAll("tr")
for container in containers:
item = dict()
item['type'] = "Course Material"
if container.find('td', {'class' : 'title'}):
item['title'] = container.find('td', {'class' : 'title'}).h4.text.strip()
else:
item['title'] = ""
if container.find('td', {'class' : 'authors'}):
item['author'] = container.find('td', {'class' : 'authors'}).text.strip()
else:
item['author'] = ""
if container.find('td', {'class' : 'title'}):
item['link'] = "https://cnx.org/" + container.find('td', {'class' : 'title'}).a["href"]
else:
item['link'] = ""
if container.find('td', {'class' : 'title'}):
item['description'] = container.find('td', {'class' : 'title'}).span.text
else:
item['description'] = ""
item['subject'] = "Arts"
item['source'] = "OpenStax CNX"
item['base_url'] = "https://cnx.org/browse"
item['license'] = "Attribution"
data.append(item) # add the item to the list
# write data to file and quit browser when done
print(data)
with open("js-webscrape.json", "w", encoding="utf-8") as writeJSON:
json.dump(data, writeJSON, ensure_ascii=False)
browser.quit()
推荐阅读
- css - 使用 Flexbox 获取所有可用高度的组件(在 Quasar/Vue 中)
- javascript - 这些时间戳有什么区别?
- visual-studio - 仅使用 Dotfuscator 社区的 Dotfuscator netstandard 项目
- java - 如何在 AWS Amplify Android SDK 中添加 GoogleSignIn?
- angular - 如何在角度组件中嵌入角度元素
- excel - 从 excel 宏更新数据源 (SAP) 时,有没有办法抑制询问查询参数的提示?
- powershell - 如何转义未知字符串并将其传递给本机命令
- ios - 是否可以从初始 tableview 控制器转换到 UISplitViewController?
- sql - 在sql中使用OUTPUT返回插入的行JAVASCRIPT/NODEJS/PG
- java - Spring boot - 使用配置文件时未读取默认属性文件