python - 我的脚本有问题,无法抓取许多页面的数据
问题描述
我编写了一段代码,用于从多个页面抓取数据(即包含分页)。除了"下一页"的翻页逻辑之外,代码一切正常:它只抓取第一页的数据。当我为翻页设置循环后,程序抓取完第一页的数据,又从第一页重新开始,循环期间一直在重复抓取同一页。我是使用 Python 进行网络抓取的新手,请指导我。你能帮我检查并修复这段代码吗?谢谢!
这是我的代码:
import requests
from bs4 import BeautifulSoup
import csv
def get_page(url):
    """Fetch *url* and return it parsed as BeautifulSoup.

    Returns None (after printing the status code) when the server
    responds with a non-2xx/3xx status.
    """
    resp = requests.get(url)
    if resp.ok:
        # 'html.parser' is the stdlib parser — no extra dependency needed.
        return BeautifulSoup(resp.text, 'html.parser')
    print('server responded:', resp.status_code)
    return None
def get_detail_page(soup):
    """Extract the metadata fields of one item detail page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed detail page (may be None if the fetch failed; every field
        then falls back to its "Empty ..." placeholder).

    Returns
    -------
    dict
        Twelve stripped metadata strings keyed by CSV column name.

    Notes
    -----
    The original version repeated the same bare try/except twelve times;
    it is collapsed into one helper that catches only AttributeError
    (raised when find() returns None, or soup itself is None).
    Also fixes the typo "Realease" in the fallback string.
    """
    def td_text(elem_id, fallback):
        # One <td> cell looked up by id; fallback when the cell is absent.
        try:
            return soup.find('td', id=elem_id).text.strip()
        except AttributeError:
            return fallback

    try:
        title = soup.find('h1', class_="cdm_style", id=False).text.strip()
    except AttributeError:
        title = 'Empty Title'
    try:
        # Collection name is inside a nested <a> link.
        collection = soup.find('td', id="metadata_collec").find('a').text.strip()
    except AttributeError:
        collection = "Empty Collection"

    # NOTE(review): some id -> column mappings look surprising
    # (metadata_contri -> Date_original, metadata_source -> Format,
    # metadata_date -> Date_digital); kept as in the original — confirm
    # against the actual page markup.
    return {
        'Title': title,
        'Collection': collection,
        'Author': td_text("metadata_creato", "Empty Author"),
        'Abstract': td_text("metadata_descri", "Empty Abstract"),
        'Keywords': td_text("metadata_keywor", "Empty Keywords"),
        'Publishers': td_text("metadata_publis", "Empty Publishers"),
        'Date_original': td_text("metadata_contri", "Empty Date original"),
        'Date_digital': td_text("metadata_date", "Empty Date digital"),
        'Format': td_text("metadata_source", "Empty Format"),
        'Release-st': td_text("metadata_rights", "Empty Release Statement"),
        'Library': td_text("metadata_librar", "Empty Library"),
        'Date_created': td_text("metadata_dmcreated", "Empty date Created"),
    }
def get_index_data(soup):
    """Collect the detail-page URLs listed on one search-result page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed index page (may be None if the fetch failed).

    Returns
    -------
    list[str]
        Absolute detail-page URLs. Always a list — the original code
        returned the name ``titles_link_output`` which was only bound in
        the ``else`` branch, so the error path raised NameError.
    """
    urls = []
    try:
        anchors = soup.find_all('a', class_="body_link_11")
    except AttributeError:
        # soup was None (failed fetch) — nothing to collect.
        return urls
    for anchor in anchors:
        attrs = getattr(anchor, 'attrs', {})
        # Only titles with a valid link carry an item_id attribute.
        if attrs.get('item_id'):
            urls.append("{}{}".format("http://cgsc.cdmhost.com",
                                      attrs.get('href', None)))
    return urls
def write_csv(data, url):
    """Append one item's metadata plus its source *url* to the output CSV.

    Parameters
    ----------
    data : dict
        Record produced by get_detail_page().
    url : str
        The detail-page URL, written as the last column.

    Notes
    -----
    newline='' is required by the csv module to avoid blank rows on
    Windows; utf-8 handles non-ASCII metadata safely.
    """
    fields = ['Title', 'Collection', 'Author', 'Abstract', 'Keywords',
              'Publishers', 'Date_original', 'Date_digital', 'Format',
              'Release-st', 'Library', 'Date_created']
    with open('1111_to_5555.csv', 'a', newline='', encoding='utf-8') as csvfile:
        csv.writer(csvfile).writerow([data[f] for f in fields] + [url])
def main():
    """Scrape result pages 2 through 3 and append every item to the CSV.

    Each index page yields a list of detail-page URLs; each detail page
    is fetched, parsed, and written out with its source URL.
    """
    # Loop-invariant base URL hoisted out of the loop (the original
    # rebuilt it on every iteration). The trailing /page/ segment takes
    # the page number appended below.
    mainurl = ("http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/searchterm/1/field/all/mode/all/conn/and/order/nosort/page/")
    for page in range(2, 4):
        print(page)
        url = f"{mainurl}{page}"
        for product in get_index_data(get_page(url)):
            write_csv(get_detail_page(get_page(product)), product)

if __name__ == '__main__':
    main()
解决方案
urls = ['http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/searchterm/1/field/all/mode/all/conn/and/order/nosort/page/' + str(i) for i in range(1,124)]
现在只需遍历 url:
for url in urls:
do_something()
推荐阅读
- javascript - 使用包含 JS 变量的字符串调整对象的大小
- caching - Service Worker 获取缓存资产失败
- python - 使用 imgaug 创建和保存增强图像时遇到问题
- python - Pandas 和 Tkinter,打开和保存文件
- css - 覆盖:父母悬停在孩子身上
- python-3.x - 特定模式的字典到制表符分隔的文本文件
- php - 我需要在 PHP docblock 中使用 splat 运算符,但无法弄清楚如何转义字符
- c# - 在添加新数据之前检查现有列表中是否已存在值
- typescript - 有条件地将输入转换为 Typescript 函数
- python - scikit-learn:FeatureUnion 包含手工制作的功能