问题描述
设置:
# --- Crawler setup ---------------------------------------------------------
# Start URL for the crawl.
site = 'https://www.benefits.gov'
# Browser-like User-Agent: some sites refuse requests from the default
# python-requests client string.
headers = {'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/50.0.2661.102 Safari/537.36')}
# Hostname of the start site, used to distinguish internal from external links.
base = urlparse(site).netloc
to_visit = [site]        # frontier of internal URLs still to crawl
outlinks = []            # external links discovered on crawled pages
visited = {}             # internal URL -> HTTP status code (None on request error)
external_visited = {}    # external URL -> HTTP status code (None on request error)
参观地点:
# Crawl every internal page reachable from `site` (stack pop -> DFS order),
# recording HTTP status per page and collecting external links for later.
while to_visit:
    current = to_visit.pop()
    # Frontier entries may be relative paths; resolve against the site root.
    url = urljoin(site, current)
    try:
        r = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Record the failure and skip this page. Without this `continue` the
        # original code fell through and reused `r` from a previous iteration
        # (or hit a NameError on the first one).
        visited[current] = None
        continue
    visited[current] = r.status_code
    if r.status_code != 200:
        continue
    soup = BeautifulSoup(r.content, 'html5lib')
    for anchor in soup.find_all('a', href=True):
        link = anchor['href']
        loc = urlparse(link).netloc
        if loc == '':
            # Relative link: normalize to an absolute URL before queueing so
            # the de-duplication below compares like with like.
            joined_url = urljoin(site, link)
            if joined_url not in to_visit and joined_url not in visited:
                to_visit.append(joined_url)
        elif loc == base:
            # Absolute link to our own host: crawl it.
            if link not in to_visit and link not in visited:
                to_visit.append(link)
        else:
            # Link to another host: record for status checking, don't crawl.
            if link not in outlinks and link not in visited:
                outlinks.append(link)
检查外部链接状态:
# Probe each discovered external link once and record its HTTP status
# (None when the request itself fails, e.g. DNS error or timeout).
while outlinks:
    link = outlinks.pop()
    try:
        # Same headers/timeout as the main crawl for consistent treatment.
        r = requests.get(link, headers=headers, timeout=10)
        external_visited[link] = r.status_code
    except requests.RequestException:
        external_visited[link] = None
标签: python, beautifulsoup, web-crawler
解决方案
推荐阅读