Python crawler: collecting URLs from a web page

alicehome · 2021-06-21 19:03

# coding:utf-8
# Web page URL collector: given a start URL and an output file, collect every
# link found on that page; the file open mode can be chosen by the caller.
import time
from urllib.parse import urljoin

import requests
from lxml import etree

"""
    url:            the URL to fetch
    save_file_name: the file the collected URLs are written to
"""


def Redirect(url):
    # Follow redirects and return the final URL; on error, back off for a
    # second and return the input URL unchanged.
    try:
        res = requests.get(url, timeout=10)
        url = res.url
    except Exception as e:
        print("redirect failed:", e)
        time.sleep(1)
    return url


def requests_for_url(url, save_file_name, file_model):
    # Fetch the page, extract every <a href> value, write the links to
    # save_file_name, and return the de-duplicated set of links.
    headers = {
        'pragma': "no-cache",
        'accept-encoding': "gzip, deflate, br",
        'accept-language': "zh-CN,zh;q=0.8",
        'upgrade-insecure-requests': "1",
        'user-agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        'cache-control': "no-cache",
        'connection': "keep-alive",
    }
    return_set = set()
    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Parse the raw bytes so the parser's encoding setting takes effect.
        selector = etree.HTML(response.content, parser=etree.HTMLParser(encoding='utf-8'))
    except Exception as e:
        print("page load failed:", e)
        return return_set  # nothing to parse, return the empty set

    with open(save_file_name, file_model) as f:
        for href in selector.xpath('//a/@href'):
            try:
                if href.startswith("javascript"):
                    continue  # skip javascript: pseudo-links
                # Resolve relative paths against the page URL.
                link = urljoin(url, href)
                f.write(link + "\n")
                return_set.add(link)
                print(len(return_set), link)
            except Exception as e:
                print("link handling failed:", e)
    return return_set


if __name__ == '__main__':
    # Collect every URL on the given page into url.txt; "a" means append,
    # so repeated runs accumulate results in the same file.
    url = "https://www.ak47s.cn/"
    save_file_name = "url.txt"
    return_set = requests_for_url(url, save_file_name, "a")
    print(len(return_set))
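
Note that the Redirect helper is defined but never called in the script; presumably it is meant to resolve shortened or redirecting links before fetching. One possible hookup (my assumption, not part of the original):

# Resolve any redirect first, so relative links are joined against the
# final URL rather than the one that redirected.
start = Redirect("https://www.ak47s.cn/")
links = requests_for_url(start, "url.txt", "a")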
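
Because requests_for_url returns the set of links it found, it can also be chained into a shallow breadth-first crawl. A minimal sketch; the crawl helper and its max_depth parameter are illustrative additions, not part of the original script:

# Breadth-first crawl built on requests_for_url; crawl() and max_depth
# are illustrative additions, not part of the original script.
def crawl(start_url, save_file_name, max_depth=2):
    seen = set()
    frontier = {start_url}
    for _ in range(max_depth):
        next_frontier = set()
        for page in frontier:
            if page in seen:
                continue
            seen.add(page)
            # "a": append, so links from every page accumulate in one file.
            next_frontier |= requests_for_url(page, save_file_name, "a")
        # Only follow http(s) links that have not been visited yet.
        frontier = {u for u in next_frontier if u.startswith("http")} - seen
    return seen

# crawl("https://www.ak47s.cn/", "url.txt", max_depth=2)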
