备忘：Python 网络请求与爬虫学习笔记（urllib / requests / BeautifulSoup）
# Study notes: each numbered section is an independent snippet demonstrating
# one HTTP-client technique; the file runs top to bottom as a script.

import urllib.parse
import urllib.request

import requests
from bs4 import BeautifulSoup
# Timeout (parent of ConnectTimeout AND ReadTimeout) is used by section 4;
# the original caught only ReadTimeout, which misses connect timeouts.
from requests.exceptions import HTTPError, RequestException, Timeout

# --- 1. POST with urllib ----------------------------------------------------
# urlencode the form, then encode the string to UTF-8 bytes, because
# urlopen's `data` argument must be bytes.
data = bytes(urllib.parse.urlencode({"word": "hello"}), encoding="utf-8")
response = urllib.request.urlopen("http://httpbin.org/post", data=data)  # send the request
html = response.read()  # read the response body
print(html)

# --- 2. POST with requests --------------------------------------------------
data = {"word": "hello"}  # form parameters
response = requests.post("http://httpbin.org/post", data=data)
print("状态码:", response.status_code)
# print("请求url:", response.url)
# print("头部信息:", response.headers)
# print("cookies:", response.cookies)
# print("文本形式的网页源码:", response.text)
print("字节流形式的网页源码:", response.content)

# --- 3. GET with a custom User-Agent ----------------------------------------
url = "https://book.douban.com/tag/营销"  # request URL (fixed stray "//" before "tag")
headers = {"User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit"
                         "/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Mobile Safari/537.36"}
response = requests.get(url, headers=headers)  # send the request
print(response.content.decode("utf-8"))  # decode the byte stream and print the page source

# --- 4. Timeout / exception handling ----------------------------------------
# Send the request 50 times with a deliberately tiny timeout (0.06 s) so that
# timeouts actually fire.  `Timeout` catches both ConnectTimeout and
# ReadTimeout; HTTPError / RequestException are narrower-to-broader fallbacks.
for a in range(0, 50):
    try:
        url = "https://www.baidu.com/"
        response = requests.get(url, timeout=0.06)
        print(response.status_code)
    except Timeout:
        print("timeout")
    except HTTPError:
        print("httperror")
    except RequestException:
        print("reqeerror")

# --- 5. Proxies --------------------------------------------------------------
# Proxy mapping.  Keys must be the URL *scheme* ("http"/"https"); the original
# used "http://" / "https://" as keys, which requests silently ignores, so the
# proxies were never applied.
proxy = {"http": "60.188.90.33:3000",
         "https": "183.128.240.228:6666"}
response = requests.get("https://www.baidu.com/", proxies=proxy)
print(response.content.decode("utf-8"))

# --- 6. Parsing with BeautifulSoup -------------------------------------------
response = requests.get("http://news.baidu.com/")
bs = BeautifulSoup(response.text, "lxml")  # parse the HTML text with the lxml parser
print(bs.find("title").text)  # locate <title> and print its text

# --- 7. Save a fetched page to a local file -----------------------------------
url = "https://www.bilibili.com/"


def use_requests(url):
    """Fetch *url* and write its HTML to a local file as UTF-8."""
    response = requests.get(url)
    # print(response.text)
    file_path = "E:/PyCharmProjects/SpyderStudy/哔哩哔哩首页.html"
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(response.text)


if __name__ == "__main__":
    use_requests(url)


# NOTE(review): the original file had an orphaned selenium fragment here that
# wrote an undefined page-source `data` to disk (at this point `data` is the
# form dict from section 2) and called close() on an undefined `driver` — it
# would crash with a NameError.  Kept commented out until the missing selenium
# setup (webdriver creation, page_source capture) is restored:
# file_path = "E:/PyCharmProjects/SpyderStudy/哔哩哔哩_selenium_chrome.html"
# with open(file_path, "w", encoding="utf-8") as f:
#     f.write(data)
# driver.close()

# --- 8. Percent-encode a non-ASCII keyword and scrape one result --------------
headers = {"User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 "
                         "(KHTML, like Gecko) Chrome/81.0.4044.138 Mobile Safari/537.36"}
key = "小说"
key_ASCII = urllib.request.quote(key)  # quote() already returns str; no str() wrapper needed
url = "https://book.douban.com/tag/" + key_ASCII + "?start=0&type=T"
response = requests.get(url, headers=headers)
bs = BeautifulSoup(response.text, "lxml")
# CSS selector for the first book entry's title link.
a = bs.select("#subject_list > ul > li:nth-child(1) > div.info > h2 > a")
a_1 = a[0]
print(a_1.get_text().replace("\n", "").replace(" ", ""))

# --- 9. Douban movie JSON API -------------------------------------------------
key = "华语"
key_ASCII = urllib.request.quote(key)
url = ("https://movie.douban.com/j/search_subjects?type=movie&tag="
       + key_ASCII + "&sort=recommend&page_limit=20&page_start=0")
response = requests.get(url, headers=headers)
print(response)