第一个爬虫
# First spider: fetch a page with urllib and inspect the raw response.
from urllib import request

url = r'http://www.baidu.com'

# Send the request and read the raw response body (bytes, not decoded).
response = request.urlopen(url).read()

# 1. Print the raw response bytes.
print(response)
# 2. Print the length of the response.
print(len(response))
中文处理
# Data cleaning with a regular expression: extract the page's <title> text.
from urllib import request
import re  # regular-expression module

url = r'http://www.baidu.com'

# Fetch and decode the response body (decode() is the inverse of encode()).
response = request.urlopen(url).read().decode()

# 1. Capture the contents of the <title> tag.
# NOTE(fix): the pattern had degraded to r'(.*?)', which matches only empty
# strings; the <title>...</title> anchors are required to capture the title.
pat = r'<title>(.*?)</title>'
data = re.findall(pat, response)
print(data)
自定义请求
# Custom request object: Request() lets us attach headers/cookies later.
from urllib import request
import re  # regular-expression module

url = r'http://www.baidu.com'

# Build an explicit Request object (can also carry cookies, headers, ...).
req = request.Request(url)
# urlopen() accepts either a URL string or a Request object.
response = request.urlopen(req).read().decode()

# NOTE(fix): restore the <title> anchors; r'(.*?)' alone matches only empty
# strings and cannot capture the page title.
pat = r'<title>(.*?)</title>'
data = re.findall(pat, response)
print(type(data))  # <class 'list'>
# 2. Pull the first (and only) match out of the list.
print(data[0])
伪装浏览器原理
# Disguise as a browser: defeat User-Agent-based anti-spider checks.
from urllib import request
import re  # regular-expression module

url = r'http://www.baidu.com'

# Desktop (Edge/Chrome) request header.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36 Edg/84.0.522.44"
}
# Mobile (iPhone Safari) request header.
iphoneHeader = {
    "User-Agent": "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5"
}

# Custom Request so headers can be attached. Purpose: the site's anti-spider
# check judges whether the visitor is a real browser; a browser-like
# User-Agent lets the crawl through.
req = request.Request(url, headers=iphoneHeader)
response = request.urlopen(req).read().decode()

# NOTE(fix): the pattern had lost its <title> anchors and matched only empty
# strings; restore them so the page title is captured.
pat = r'<title>(.*?)</title>'
data = re.findall(pat, response)
print(type(data))  # <class 'list'>
# 2. Pull the first match out of the list.
print(data[0])
常用浏览器user-agent
=======================PC浏览器========================
Opera
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60
Opera/8.0 (Windows NT 5.1; U; en)
Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50
Firefox
Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0
Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10
Safari
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2
chrome
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36
Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11
Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16
360
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36
Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko
淘宝浏览器
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11
猎豹浏览器
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER
Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)"
QQ浏览器
Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)
sogou浏览器
Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)
maxthon浏览器
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36
UC浏览器
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36
==================== mobile浏览器====================
IPhone
Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5
IPod
Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5
IPAD
Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5
Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5
Android
Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1
Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1
QQ浏览器 Android版本
MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1
Android Opera Mobile
Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10
Android Pad Moto Xoom
Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13
BlackBerry
Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+
WebOS HP Touchpad
Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0
Nokia N97
Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124
Windows Phone Mango
Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)
UC浏览器
UCWEB7.0.2.37/28/999
NOKIA5700/ UCWEB7.0.2.37/28/999
UCOpenwave
Openwave/ UCWEB7.0.2.37/28/999
UC Opera
Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999
添加多个UserAgent
在我们每次请求的时候随机地生成请求头user-agent,这样我们多次爬取的时候,对方就没有办法去判断是否是爬虫
# Rotate User-Agents: choose a random UA per request so repeated crawls
# cannot be fingerprinted by a fixed request header.
from urllib import request
import re  # regular-expression module
import random

url = r'http://www.baidu.com'

agent1 = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like "
          "Gecko) Chrome/84.0.4147.89 Safari/537.36 Edg/84.0.522.44")
agent2 = ("Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWeb"
          "Kit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5")
agent3 = ("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gec"
          "ko) Chrome/39.0.2171.71 Safari/537.36")
agent4 = ("Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubun"
          "tu/10.10 (maverick) Firefox/3.6.10")
agent5 = ("Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83"
          "D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1")

list1 = [agent1, agent2, agent3, agent4, agent5]
agent = random.choice(list1)
print(agent)

# Request header built from the randomly chosen agent.
header = {
    "User-Agent": agent
}

# NOTE(fix): the request previously sent a fixed iPhone header, which defeated
# the whole point of picking a random agent; send the chosen one instead.
req = request.Request(url, headers=header)
response = request.urlopen(req).read().decode()

# NOTE(fix): restore the <title> anchors; r'(.*?)' matched only empty strings.
pat = r'<title>(.*?)</title>'
data = re.findall(pat, response)
print(type(data))  # <class 'list'>
# 2. Pull the first match out of the list.
print(data[0])
创建自定义Opener
我们之前一直都在使用的urlopen,它是一个特殊的opener(也就是模块帮我们构建好的)
但是基本的urlopen()方法不支持代理、cookie等其他的HTTP/HTTPS高级功能。所以要支持这些功能。
通过request.build_opener()方法创建自定义opener对象
使用自定义的opener对象,调用open()方法发送请求
如果程序里所有的请求都使用自定义的opener,可以使用request.install_opener() 将自定义的opener对象定义为全局opener,表示如果之后凡是调用urlopen,都将使用这个opener(根据自己的需求来选择)
# Build a custom opener: urlopen() is just a preconfigured opener; a
# hand-built one can later be extended with proxy/cookie handlers.
from urllib import request

# HTTP handler: the object that actually performs HTTP requests
# (an HTTPSHandler would do the same for HTTPS).
http_handler = request.HTTPHandler()

# Assemble a custom opener from the handler.
opener = request.build_opener(http_handler)

# Build the request object and send it through the custom opener.
req = request.Request("http://www.baidu.com")
response = opener.open(req).read().decode()
print(response)
设置opener为全局
opener是用来发送请求的对象
# Install the custom opener globally: after install_opener(), every
# request.urlopen() call is routed through it.
from urllib import request

# Handler that performs plain HTTP requests (HTTPS works the same way).
http_handler = request.HTTPHandler()
opener = request.build_opener(http_handler)

req = request.Request("http://www.baidu.com")
# response = opener.open(req).read().decode()

# Register the opener as the global default, then use plain urlopen().
request.install_opener(opener)
response = request.urlopen(req).read().decode()
print(response)
使用代理ip
反爬虫2:判断请求来源的ip地址
措施:使用代理IP
(百度:代理ip)找存活时间长一点的
https://www.kuaidaili.com/free/inha/1/
# Anti-spider countermeasure #2: sites may block by source IP address.
# Remedy: route requests through (rotating) proxy IPs.
from urllib import request
import random

# Free proxies (address:port); availability is not guaranteed.
proxylist = [
    {"http": "125.46.0.62:53281"},
    {"http": "222.223.182.66:8000"},
    {"http": "222.90.110.194:8080"},
    {"http": "49.4.67.31:3128"},
    {"http": "182.18.13.149:53281"},
]

# Pick one proxy at random for this session.
proxy = random.choice(proxylist)
print(proxy)
构建代理处理器对象
# Send the request through a randomly chosen proxy IP.
from urllib import request
import random

url = r'http://www.baidu.com'

# Candidate proxies (address:port); free proxies die quickly.
proxylist = [
    {"http": "171.35.169.76:9999"},
    {"http": "222.223.182.66:8000"},
    {"http": "222.90.110.194:8080"},
    {"http": "49.4.67.31:3128"},
    {"http": "182.18.13.149:53281"}
]
proxy = random.choice(proxylist)

# Proxy handler -> custom opener -> open the request through the proxy.
proxy_handler = request.ProxyHandler(proxy)
opener = request.build_opener(proxy_handler)
req = request.Request(url)
res = opener.open(req).read().decode()
print(res)
处理Get请求
处理get请求
# Handle a GET request: query parameters must be URL-encoded.
from urllib import request, parse

# Target: http://www.baidu.com/s?wd=%E5%8C%97%E4%BA%AC  (wd=北京, URL-encoded)
wd = {"wd": "北京"}
url = 'http://www.baidu.com/s?'

# NOTE(fix): use urllib.parse explicitly instead of relying on `import urllib`
# exposing the `parse` submodule only as a side effect of importing
# urllib.request.
wdd = parse.urlencode(wd)  # -> 'wd=%E5%8C%97%E4%BA%AC'
url = url + wdd

req = request.Request(url)
response = request.urlopen(req).read().decode()
print(response)
贴吧爬虫实战
# Tieba (Baidu forum) spider: download a range of listing pages and save
# each page as a local HTML file.
from urllib import request
import urllib
import time

# Request header: pretend to be a desktop Chrome browser.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36"
}

# URL pattern (pn advances by 50 per page):
# http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=0    # page 1
# http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=50   # page 2
# http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=100  # page 3
# http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=150  # page 4
# NOTE(fix): the original comment mislabelled pn=150 as page 5; it is page 4.


def loadpage(fullurl, filename):
    """Fetch one page and return its raw body bytes.

    The body is deliberately NOT decode()d: it is written to disk in
    binary mode by writepage().
    """
    print("正在下载:", filename)
    req = request.Request(fullurl, headers=header)
    resp = request.urlopen(req).read()
    return resp


def writepage(html, filename):
    """Write downloaded bytes to *filename* in binary ("wb") mode."""
    print("正在保存:", filename)
    with open(filename, "wb") as f:
        f.write(html)
    print("--------------------------------------------")


def tiebaSpider(url, begin, end):
    """Crawl listing pages *begin*..*end* (inclusive) and save each one."""
    for page in range(begin, end + 1):
        pn = (page - 1) * 50
        fullurl = url + "&pn=" + str(pn)  # complete URL of this page
        # Destination file for this page.
        # NOTE(review): hard-coded, user-specific Windows path — consider
        # making the output directory a parameter.
        filename = "C:\\Users\\Administrator\\Desktop\\爬虫\\第" + str(page) + "页.html"
        html = loadpage(fullurl, filename)  # download the page
        writepage(html, filename)           # persist it locally


if __name__ == '__main__':
    kw = input('请输入贴吧名:')
    begin = int(input("请输入起始页:"))
    end = int(input("请输入结束页:"))
    url = "http://tieba.baidu.com/f?"
    key = urllib.parse.urlencode({"kw": kw})
    url = url + key
    tiebaSpider(url, begin, end)
    time.sleep(10)
有道翻译爬虫
# Youdao translator spider: a POST request whose form data carries the text.
from urllib import request
import urllib
import re

# Request header: masquerade as a mobile (Android) browser.
header = {
    "User-Agent": "Mozilla/5.0 (Linux; U; Android 8.1.0; zh-cn; BLA-AL00 Build/HUAWEIBLA-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/8.9 Mobile Safari/537.36"
}

url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"
key = "自学"

# Form fields the POST request must submit (salt/sign/ts captured from the
# site's own requests).
formdata = {
    "i": key,
    "from": "AUTO",
    "to": "AUTO",
    "smartresult": "dict",
    "client": "fanyideskweb",
    "salt": "15503049709404",
    "sign": "3da914b136a37f75501f7f31b11e75fb",
    "ts": "1550304970940",
    "bv": "ab57a166e6a56368c9f95952de6192b5",
    "doctype": "json",
    "version": "2.1",
    "keyfrom": "fanyi.web",
    "action": "FY_BY_REALTIME",
    "typoResult": "false",
}

# A POST body must be bytes: URL-encode the form, then utf-8 encode it.
data = urllib.parse.urlencode(formdata).encode(encoding='utf-8')
req = request.Request(url, data=data, headers=header)
resp = request.urlopen(req).read().decode()

# Extract the translation: everything between "tgt":" and "}]] in the JSON text.
pat = r'"tgt":"(.*?)"}]]'
result = re.findall(pat, resp)
print(result[0])
异常处理
# Exception handling: keep the crawl loop alive when one URL fails.
from urllib import request

# NOTE(fix): renamed `list` -> `urls`; the original shadowed the builtin.
urls = [
    "http://www.baidu.com",
    "http://www.baidu.com",
    "http://jiswiswissnduehduehd.com",  # deliberately unreachable host
    "http://www.baidu.com",
    "http://www.baidu.com"
]

# enumerate() replaces the hand-rolled `i = i + 1` counter.
for i, url in enumerate(urls, start=1):
    try:
        request.urlopen(url)
        print("第{}次请求完成了".format(i))
    except Exception as e:
        # Broad catch is deliberate here: report the failure and move on to
        # the next URL instead of aborting the whole loop.
        print(e)