Web Scraping Notes

chang2021 2020-11-28 15:38

Memo

import urllib.parse
import urllib.request
# URL-encode the form data with urlencode, then convert it to UTF-8 bytes
data = bytes(urllib.parse.urlencode({"word": "hello"}), encoding="utf-8")
response = urllib.request.urlopen("http://httpbin.org/post", data=data)    # open the target URL (passing data makes it a POST)
html = response.read()    # read the response body
print(html)    # print what was read

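The snippet above sends the form data as a POST body; the GET counterpart just appends the urlencode output to the URL as a query string. A minimal sketch against httpbin's echo endpoint:

import urllib.parse
import urllib.request
# Sketch: send the same parameters as a GET query string instead of a POST body
params = urllib.parse.urlencode({"word": "hello"})
url = "http://httpbin.org/get?" + params    # httpbin echoes the query back as JSON
with urllib.request.urlopen(url) as response:
    print(response.read().decode("utf-8"))
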
import requests
data = {"word": "hello"}    # form parameters
response = requests.post("http://httpbin.org/post", data=data)
print("status code:", response.status_code)
# print("request url:", response.url)
# print("response headers:", response.headers)
# print("cookies:", response.cookies)
# print("page source as text:", response.text)
print("page source as bytes:", response.content)

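For an endpoint that expects a JSON body rather than form fields, requests can serialize the payload itself through the json= argument; a short sketch against the same httpbin service:

import requests
# Sketch: send a JSON body; requests sets the Content-Type: application/json header automatically
payload = {"word": "hello"}
response = requests.post("http://httpbin.org/post", json=payload)
print(response.json()["json"])    # httpbin echoes the parsed JSON back under the "json" key
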
import requests
url = "https://book.douban.com//tag/营销"    # request URL
headers = {"User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit"
                         "/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Mobile Safari/537.36"}
response = requests.get(url, headers=headers)    # send the request
print(response.content.decode("utf-8"))    # decode the byte-stream response and print the page source

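When several requests need the same User-Agent, it can be attached once to a requests.Session instead of being passed on every call; a minimal sketch reusing the header and URL from above:

import requests
# Sketch: store the User-Agent on a Session so every request through it sends the header
session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit"
                                      "/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Mobile Safari/537.36"})
response = session.get("https://book.douban.com/tag/营销")
print(response.status_code)
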
import requests
# import the three exception classes from requests.exceptions
from requests.exceptions import ReadTimeout, HTTPError, RequestException
# send the request 50 times in a loop
for a in range(50):
    try:    # catch exceptions
        url = "https://www.baidu.com/"    # request URL
        response = requests.get(url, timeout=0.06)    # set the timeout to 0.06 s
        print(response.status_code)    # print the status code
    except ReadTimeout:
        print("timeout")
    except HTTPError:
        print("httperror")
    except RequestException:
        print("requesterror")

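If the goal is to survive transient timeouts rather than just log them, retries can be configured once on a Session via urllib3's Retry; a minimal sketch, assuming three attempts with exponential backoff are acceptable:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
# Sketch: retry transient failures automatically instead of handling every exception by hand
retry = Retry(total=3, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
session = requests.Session()
session.mount("http://", HTTPAdapter(max_retries=retry))
session.mount("https://", HTTPAdapter(max_retries=retry))
response = session.get("https://www.baidu.com/", timeout=3)
print(response.status_code)
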
import requests
# set the proxy servers; keys are the scheme names and values include the scheme
proxy = {"http": "http://60.188.90.33:3000",
         "https": "http://183.128.240.228:6666"}
response = requests.get("https://www.baidu.com/", proxies=proxy)
print(response.content.decode("utf-8"))

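The same proxy mapping can also be set once on a Session (or via the HTTP_PROXY / HTTPS_PROXY environment variables) so it applies to every request; a short sketch reusing the addresses above:

import requests
# Sketch: configure the proxies once on a Session instead of per request
session = requests.Session()
session.proxies.update({"http": "http://60.188.90.33:3000",
                        "https": "http://183.128.240.228:6666"})
response = session.get("https://www.baidu.com/", timeout=5)
print(response.status_code)
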
import requests
from bs4 import BeautifulSoup
response = requests.get("http://news.baidu.com/")    # send the request
bs = BeautifulSoup(response.text, "lxml")    # parse the page source text with the lxml parser
print(bs.find("title").text)    # find the <title> tag and print its text

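The same soup object can be queried for more than the <title>; a small sketch that lists the first few links on the page (what find_all returns depends on the page's current markup):

import requests
from bs4 import BeautifulSoup
response = requests.get("http://news.baidu.com/")
bs = BeautifulSoup(response.text, "lxml")
# Sketch: print the text and href of the first ten <a> tags
for a in bs.find_all("a")[:10]:
    print(a.get_text(strip=True), a.get("href"))
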
# Task: write the scraped content to a local file
import requests
url = "https://www.bilibili.com/"
def use_requests(url):
    response = requests.get(url)
    # print(response.text)
    file_path = "E:/PyCharmProjects/SpyderStudy/哔哩哔哩首页.html"
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(response.text)
if __name__ == "__main__":
    use_requests(url)

# Selenium + Chrome version of the same task; the original notes assumed the
# driver and the page source (`data`) were already set up, so those lines are filled in here
from selenium import webdriver

driver = webdriver.Chrome()
driver.get("https://www.bilibili.com/")
data = driver.page_source    # page source after the browser has rendered it

file_path = "E:/PyCharmProjects/SpyderStudy/哔哩哔哩_selenium_chrome.html"
with open(file_path, "w", encoding="utf-8") as f:
    f.write(data)

driver.close()

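For binary resources such as images, the bytes should be written in binary mode rather than going through response.text; a minimal sketch (the URL and output filename here are placeholders, not from the original notes):

import requests
# Sketch: stream a binary resource to disk chunk by chunk
url = "https://www.bilibili.com/favicon.ico"    # placeholder URL
response = requests.get(url, stream=True)
with open("favicon.ico", "wb") as f:    # placeholder output path
    for chunk in response.iter_content(chunk_size=8192):
        f.write(chunk)
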
import urllib.request
import requests
from bs4 import BeautifulSoup
headers = {"User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 "
                         "(KHTML, like Gecko) Chrome/81.0.4044.138 Mobile Safari/537.36"}
key = "小说"    # douban book tag to search for ("小说" = fiction)
key_ASCII = urllib.request.quote(key)    # percent-encode the Chinese keyword for use in the URL
url = "https://book.douban.com/tag/" + str(key_ASCII) + "?start=0&type=T"
response = requests.get(url, headers=headers)
bs = BeautifulSoup(response.text, "lxml")
a = bs.select("#subject_list > ul > li:nth-child(1) > div.info > h2 > a")    # CSS selector for the first book's title link
a_1 = a[0]
print(a_1.get_text().replace("\n", "").replace(" ", ""))    # strip newlines and spaces from the title text

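The selector above only grabs the first list item; dropping the :nth-child(1) part returns every title link on the page. A minimal sketch, assuming douban's markup still uses the #subject_list structure from the snippet above:

import urllib.request
import requests
from bs4 import BeautifulSoup
headers = {"User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 "
                         "(KHTML, like Gecko) Chrome/81.0.4044.138 Mobile Safari/537.36"}
url = "https://book.douban.com/tag/" + urllib.request.quote("小说") + "?start=0&type=T"
bs = BeautifulSoup(requests.get(url, headers=headers).text, "lxml")
# Sketch: same selector without :nth-child(1), so every book title on the page is returned
for a in bs.select("#subject_list > ul > li > div.info > h2 > a"):
    print(a.get_text().replace("\n", "").replace(" ", ""))
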
import urllib.request
import requests

headers = {"User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 "
                         "(KHTML, like Gecko) Chrome/81.0.4044.138 Mobile Safari/537.36"}

key = "华语"    # douban movie tag to search for ("华语" = Chinese-language)
key_ASCII = urllib.request.quote(key)    # percent-encode the keyword
url = "https://movie.douban.com/j/search_subjects?type=movie&tag=" + str(
    key_ASCII) + "&sort=recommend&page_limit=20&page_start=0"
response = requests.get(url, headers=headers)
print(response)    # prints the Response object, e.g. <Response [200]>; the JSON payload is parsed in the sketch below
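
The search_subjects endpoint returns JSON; a minimal sketch of parsing it, assuming the payload has a top-level "subjects" list with "title" and "rate" fields (these field names are an assumption about the API's response shape):

import urllib.request
import requests
headers = {"User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 "
                         "(KHTML, like Gecko) Chrome/81.0.4044.138 Mobile Safari/537.36"}
url = ("https://movie.douban.com/j/search_subjects?type=movie&tag="
       + urllib.request.quote("华语") + "&sort=recommend&page_limit=20&page_start=0")
response = requests.get(url, headers=headers)
# Sketch: iterate over the "subjects" list; .get() keeps it safe if the shape differs
for movie in response.json().get("subjects", []):
    print(movie.get("title"), movie.get("rate"))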

 
