python - 如何编写代码来读取输出文件以找出它在抓取网站中的距离,然后从它停止的地方开始
问题描述
我正在编写一个程序来从本网站存档中的每篇文章中抓取文章标题、日期和正文文本并导出到 csv 文件。该网站似乎在某些时候阻止了我,我收到此错误:HTTPError: Service Unavailable。
我相信这是因为我试图在短时间内访问他们的网站太多次。我希望我的代码能够读取错误发生的位置并从中断处继续。
在每抓取 10 篇文章后,我尝试添加 2 秒的延迟;也尝试过每十篇文章后使用随机时长的延迟。我可以把延迟加得更长,但我更希望代码能够自动从中断处恢复,这样才算万无一失。
from bs4 import BeautifulSoup
from urllib.request import urlopen
import csv
from time import sleep
from random import randint

csvfile = "C:/Users/k/Dropbox/granularitygrowth/Politico/pol.csv"

with open(csvfile, mode='w', newline='', encoding='utf-8') as pol:
    csvwriter = csv.writer(pol, delimiter='~', quoting=csv.QUOTE_MINIMAL)
    csvwriter.writerow(["Date", "Title", "Article"])
    # For each page of the Politico archive.
    for page in range(0, 412):
        url = urlopen("https://www.politico.com/newsletters/playbook/archive/%d" % page)
        content = url.read()
        # Parse the article links out of the archive page.
        soup = BeautifulSoup(content, "lxml")
        articleLinks = soup.findAll('article', attrs={'class': 'story-frag format-l'})
        # Visit each article linked from this page.
        for article in articleLinks:
            link = article.find('a', attrs={'target': '_top'}).get('href')
            # Open and read the article page.
            articleURL = urlopen(link)
            articleContent = articleURL.read()
            soupArticle = BeautifulSoup(articleContent, "lxml")
            # Limit to div class="story-text" (where the article text is).
            articleText = soupArticle.findAll('div', attrs={'class': 'story-text'})
            for div in articleText:
                # Find the publication date.
                footer = div.find('footer', attrs={'class': 'meta'})
                date = footer.find('time').get('datetime')
                print(date)
                # Find the title.
                headerSection = div.find('header')
                title = headerSection.find('h1').text
                print(title)
                # Concatenate all body paragraphs.
                textContent = ""
                bodyText = div.findAll('p')
                # BUG FIX: the original used `for p in bodyText`, clobbering
                # the outer page counter `p`; use a distinct name.
                for paragraph in bodyText:
                    textContent += str(paragraph.text) + ' '
                print(textContent)
                # Add this article's data to the CSV file.
                csvwriter.writerow([date, title, textContent])
                # BUG FIX: the original called time.sleep(), but only
                # `sleep` was imported (`from time import sleep`), so the
                # first write raised NameError. Pause between requests to
                # avoid the server's rate limiting (the 503 errors).
                sleep(randint(3, 8))
我希望我的代码仍然有这个错误,但然后从它停止的地方开始并继续打印并将数据导出到 csv 文件。
解决方案
您可以统计已保存在 CSV 中的文章数量,将其除以每页 10 篇,即 page = 1 + records // 10(其中 +1 是因为页码从第 1 页开始),以得出您上次停止时所在的页码。
我已经像这样重构了您的代码:
import csv
import time
from random import randint
from urllib.request import urlopen
from bs4 import BeautifulSoup
HEADERS = ["Date", "Title", "Article"]
def count_rows(csv_path: str) -> int:
    """Return the number of data rows (excluding the header) in the CSV.

    Raises FileNotFoundError when the file does not exist yet; the caller
    uses that to detect a fresh run and initialize the file.
    """
    # newline='' is the documented way to open files for the csv module,
    # and the encoding matches the writers so non-ASCII text round-trips.
    with open(csv_path, newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        # Stream the rows instead of materializing the whole file in memory.
        return sum(1 for _ in reader)
def write_articles(csv_path: str, articles: list):
    """Append *articles* (dicts keyed by HEADERS) to the CSV file.

    Append mode is deliberate: write mode would wipe the previously saved
    rows and break the resume logic.
    """
    with open(csv_path, 'a', encoding='utf-8', newline='') as f:
        csv.DictWriter(
            f,
            fieldnames=HEADERS,
            quoting=csv.QUOTE_MINIMAL,
        ).writerows(articles)
def init_csv(csv_path: str):
    """Create a fresh CSV file containing only the header row."""
    with open(csv_path, 'w', encoding='utf-8', newline='') as f:
        header_writer = csv.DictWriter(f, quoting=csv.QUOTE_MINIMAL,
                                       fieldnames=HEADERS)
        header_writer.writeheader()
def get_page_soup(url: str) -> BeautifulSoup:
    """Fetch *url* over HTTP and return its HTML parsed into a soup tree."""
    raw_html = urlopen(url).read()
    return BeautifulSoup(raw_html, "lxml")
def scrape_article(url: str) -> dict:
    """Download one article page and extract its title, date and body text.

    Returns a dict keyed by the CSV column names (HEADERS).
    """
    # Limit to the .story-text container, where the article content lives.
    story = get_page_soup(url).select_one('.story-text')
    # Publication timestamp from the <time datetime=...> element.
    published = story.select_one('.timestamp time')['datetime']
    headline = story.find('h1').text
    # Join all paragraphs; each paragraph is followed by a single space
    # (matching the original concatenation, including the trailing space).
    body = ''.join(p.text + ' ' for p in story.find_all('p'))
    return {
        'Title': headline,
        'Date': published,
        'Article': body,
    }
def main():
    """Scrape the Politico Playbook archive into a CSV, resuming after errors.

    On startup, counts the rows already saved to infer the last fully
    scraped archive page, then continues from the next page.
    """
    csvfile = "test.csv"
    try:
        record_count = count_rows(csvfile)
    except FileNotFoundError:
        # First run: create the file with its header row.
        init_csv(csvfile)
        print('Initialized CSV file')
        record_count = 0
    article_per_page = 10
    # Each archive page lists 10 articles, so the saved row count tells us
    # how many pages are complete (+1 because archive pages are 1-based).
    page = 1 + record_count // article_per_page
    print('Continuing from page', page)
    for p in range(page, 413):
        url = "https://www.politico.com/newsletters/playbook/archive/%d" % p
        soup = get_page_soup(url)
        article_links = soup.select('article.story-frag.format-l')
        # BUG FIX: this buffer must reset on every page. The original
        # initialized it once before the loop and passed the cumulative
        # list to write_articles each iteration, so every page re-appended
        # all previously written pages' rows — duplicating data and
        # breaking the resume arithmetic above.
        articles = []
        for article in article_links:
            link = article.select_one('a[target=_top]')['href']
            scraped_article = scrape_article(link)
            print(scraped_article)
            articles.append(scraped_article)
        # Write only after the whole page succeeded, so a mid-page failure
        # leaves no partial page in the CSV (which would skew the resume).
        write_articles(csvfile, articles)
        print('Finished page', p)
        # Random pause to avoid the rate limiting that caused the 503s.
        time.sleep(randint(3, 8))


if __name__ == '__main__':
    main()
这会给你这样的输出:
Finished page 48
{'Title': 'Playbook: Scalise takes several Republicans to ...
{'Title': 'Playbook: Four unfolding events that show the ...
{'Title': 'Playbook: Texas kicks off primary season, as D ...
{'Title': 'Playbook: The next gen: McCarthy and Crowley’s ...
{'Title': 'INSIDE THE GRIDIRON DINNER: What Trump said an ...
{'Title': 'DEMS spending millions already to boost vulner ...
{'Title': 'Playbook: Inside the Republican super PAC mone ...
{'Title': 'Playbook: Who would want to be White House com ...
{'Title': "Playbook: Jared Kushner's bad day", 'Date': '2 ...
{'Title': 'Playbook: Gun control quickly stalls in the Se ...
Finished page 49
推荐阅读
- google-cloud-platform - 刷新存储桶中的数据
- typescript - 两个功能“组合”(非组合)的无点样式
- reactjs - 在 React 中使用 setInterval 没有按预期工作
- python - for循环中python内置方法过滤器的神秘行为
- java - 使用 java LocalDateTime 解析 Scala 日期
- angular - 在 Angular 10 上使用 `ng add @ngrx/store` 会出错并添加旧版本的 ngrx
- javascript - 在 Vue 中使用 $refs 触发“click”之前有条件地更新 INPUT 元素的属性
- python - ImportError with scipy.misc cannot import toimage
- c++ - 我在范围内未声明的错误类有问题
- hyperledger-fabric - 如何设置 REST 服务器以与 Azure Kubernetes 服务 (AKS) 上的 Hyperledger Fabric 联盟通信