Web Scraping (2): Data Mining with the urllib Library

zibinchen 2020-08-04 22:20

A first spider

from urllib import request

url = r'http://www.baidu.com'

# Send the request and read the raw response bytes
response = request.urlopen(url).read()

# 1. Print the fetched content
print(response)

# 2. Print the length of the fetched content
print(len(response))
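
urlopen() actually returns an http.client.HTTPResponse object, so there is more to inspect than the body. A small sketch (my addition, same URL as above) that checks the status and headers before reading:

from urllib import request

resp = request.urlopen(r'http://www.baidu.com')

print(resp.status)        # HTTP status code, e.g. 200
print(resp.getheaders())  # list of (name, value) response-header pairs

body = resp.read()        # the body can only be read once
print(len(body))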

Handling Chinese text

# Data cleaning with regular expressions
from urllib import request
import re  # regular-expression module

url = r'http://www.baidu.com'

# Send the request and decode the response bytes to str
response = request.urlopen(url).read().decode()  # decode() is the counterpart of encode()

# 1. Extract the content of the <title> tag
pat = r'<title>(.*?)</title>'

data = re.findall(pat,response)

print(data)
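
Note that decode() with no argument assumes UTF-8. A slightly safer sketch (my addition): read the charset the server declares in its Content-Type header and fall back to UTF-8:

from urllib import request

resp = request.urlopen(r'http://www.baidu.com')
raw = resp.read()

# get_content_charset() parses the Content-Type header, e.g. "text/html; charset=utf-8";
# it returns None when the server declares no charset
charset = resp.headers.get_content_charset() or 'utf-8'

html = raw.decode(charset, errors='replace')  # 'replace' avoids crashing on stray bytes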

 

Custom request objects

# Data cleaning with regular expressions
from urllib import request
import re  # regular-expression module

url = r'http://www.baidu.com'

# Build a custom request object
req = request.Request(url)  # a Request can carry more than the URL: headers, cookies, etc.

# Send the request; urlopen() accepts a Request object directly
response = request.urlopen(req).read().decode()  # decode() is the counterpart of encode()

pat = r'<title>(.*?)</title>'

data = re.findall(pat,response)

print(type(data))
# <class 'list'>

# 2. Index into the list to get the match itself
print(data[0])

 

Masquerading as a browser

# Data cleaning with regular expressions
from urllib import request
import re  # regular-expression module

url = r'http://www.baidu.com'

# Build the request headers
header = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like "
                 "Gecko) Chrome/84.0.4147.89 Safari/537.36 Edg/84.0.522.44"
}

iphoneHeader = {
    "User-Agent":"Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWeb"
                 "Kit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5"
}

# Build a custom request object. Purpose: work around anti-scraping checks.

# A common anti-scraping check: is the visitor a real browser?
# We can get past it by masquerading as one via the User-Agent header.
req = request.Request(url, headers=iphoneHeader)  # a Request can carry headers, cookies, etc.

response = request.urlopen(req).read().decode()  # decode() is the counterpart of encode()

pat = r'<title>(.*?)</title>'

data = re.findall(pat,response)

print(type(data))
# <class 'list'>

# 2. Index into the list to get the match itself
print(data[0])
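
To confirm the masqueraded header actually went out, it helps to hit a service that echoes the request back. A quick check (assuming httpbin.org is reachable; its /user-agent endpoint returns the User-Agent it received):

from urllib import request

req = request.Request("http://httpbin.org/user-agent", headers=iphoneHeader)
print(request.urlopen(req).read().decode())
# {"user-agent": "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) ..."}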

 

Common browser user-agent strings

======================= Desktop browsers ========================
Opera
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60
Opera/8.0 (Windows NT 5.1; U; en)
Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50
Firefox
Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0
Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10
Safari
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2
Chrome

Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36
Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11
Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16
360
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36
Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko

Taobao Browser
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11
Liebao Browser
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER
Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)
QQ Browser
Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)
Sogou Browser
Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)
Maxthon Browser
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36
UC Browser
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36

==================== Mobile browsers ====================
iPhone
Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5
iPod
Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5
iPad
Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5
Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5
Android
Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1
Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1
QQ Browser (Android)
MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1
Android Opera Mobile
Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10
Android Pad Moto Xoom
Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13
BlackBerry
Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+
WebOS HP Touchpad
Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0
Nokia N97
Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124
Windows Phone Mango
Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)
UC Browser
UCWEB7.0.2.37/28/999
NOKIA5700/ UCWEB7.0.2.37/28/999
UCOpenwave
Openwave/ UCWEB7.0.2.37/28/999
UC Opera
Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999

 

Rotating multiple User-Agents

 Generate a random User-Agent header for every request; that way, even when we crawl many times, the target site has no easy way to tell that the requests come from a crawler.

# Data cleaning with regular expressions
from urllib import request
import re  # regular-expression module
import random

url = r'http://www.baidu.com'

agent1 = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like " \
         "Gecko) Chrome/84.0.4147.89 Safari/537.36 Edg/84.0.522.44"

agent2 = "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWeb" \
         "Kit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5"

agent3 = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gec" \
         "ko) Chrome/39.0.2171.71 Safari/537.36"

agent4 = "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubun" \
         "tu/10.10 (maverick) Firefox/3.6.10"

agent5 = "Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83" \
         "D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"

list1 = [agent1,agent2,agent3,agent4,agent5]

agent = random.choice(list1)
print(agent)

# Build the request headers
header = {
    "User-Agent":agent
}

# Build a custom request object. Purpose: work around anti-scraping checks.

# A common anti-scraping check: is the visitor a real browser?
# We get past it by masquerading as one, here with the randomly chosen User-Agent.
req = request.Request(url, headers=header)

response = request.urlopen(req).read().decode()  # decode() is the counterpart of encode()

pat = r'<title>(.*?)</title>'

data = re.findall(pat,response)

print(type(data))
# <class 'list'>

# 2. Index into the list to get the match itself
print(data[0])
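
To avoid repeating this boilerplate, the rotation can live in a small helper that picks a fresh User-Agent on every call. A minimal sketch (my refactor of the code above; the name fetch is made up):

from urllib import request
import random

USER_AGENTS = [agent1, agent2, agent3, agent4, agent5]  # the strings defined above

def fetch(url):
    # choose a different User-Agent for each request
    ua = random.choice(USER_AGENTS)
    req = request.Request(url, headers={"User-Agent": ua})
    return request.urlopen(req).read().decode()

html = fetch(r'http://www.baidu.com')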

 

Creating a custom opener

 The urlopen we have been using all along is itself a special opener, one the module builds for us.

 

The basic urlopen() call, however, does not support proxies, cookies, or other advanced HTTP/HTTPS features. To get those features:

 

Create a custom opener object with the request.build_opener() method.

 

Use the custom opener object and call its open() method to send requests.

 

If all requests in the program should use the custom opener, request.install_opener() installs it as the global opener: afterwards every call to urlopen() goes through it (choose according to your needs).

# Create a custom opener

from urllib import request

# Build an HTTP handler object (it handles plain HTTP requests)
http_handler = request.HTTPHandler()  # request.HTTPSHandler() exists for HTTPS

# Create the custom opener
opener = request.build_opener(http_handler)

# Build a custom request object
req = request.Request("http://www.baidu.com")

# Send the request through the opener and read the response
response = opener.open(req).read().decode()

print(response)

 

Installing the opener globally

 An opener is the object that actually sends the request.

# Create a custom opener
from urllib import request

# Build an HTTP handler object (it handles plain HTTP requests)
http_handler = request.HTTPHandler()  # request.HTTPSHandler() exists for HTTPS

# Create the custom opener
opener = request.build_opener(http_handler)

# Build a custom request object
req = request.Request("http://www.baidu.com")

# Send the request directly through the opener:
# response = opener.open(req).read().decode()

# Install the custom opener globally, so that plain urlopen() calls use it too
request.install_opener(opener)

response = request.urlopen(req).read().decode()

print(response)
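
The same build_opener() mechanism is how cookie support gets added: pass an HTTPCookieProcessor backed by a CookieJar and the opener will remember cookies across requests. A minimal sketch, assuming you want that behaviour:

from urllib import request
from http import cookiejar

# the CookieJar stores cookies the server sets and sends them back automatically
jar = cookiejar.CookieJar()
opener = request.build_opener(request.HTTPCookieProcessor(jar))

resp = opener.open("http://www.baidu.com")
print(len(jar))  # number of cookies the server set on this session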

 

Using proxy IPs

Anti-scraping measure 2: checking the source IP address of requests.

Countermeasure: use proxy IPs.
Search for free proxy lists and pick proxies that have stayed alive longer, e.g.
https://www.kuaidaili.com/free/inha/1/

 

from urllib import request
import random

# Anti-scraping measure 2: checking the source IP address of requests

# Countermeasure: use proxy IPs, e.g.
# 125.46.0.62      53281
# 222.223.182.66   8000
# 222.90.110.194   8080
# 49.4.67.31       3128
# 182.18.13.149    53281

proxylist = [
    {"http":"125.46.0.62:53281"},
    {"http":"222.223.182.66:8000"},
    {"http":"222.90.110.194:8080"},
    {"http":"49.4.67.31:3128"},
    {"http":"182.18.13.149:53281"},
]

proxy = random.choice(proxylist)

print(proxy)

 

 Building the proxy-handler object

from urllib import request
import random

url = r'http://www.baidu.com'

# Anti-scraping measure 2: checking the source IP address of requests

# Countermeasure: use proxy IPs, e.g.
# 125.46.0.62      53281
# 222.223.182.66   8000
# 222.90.110.194   8080
# 49.4.67.31       3128
# 182.18.13.149    53281

proxylist = [
    {"http":"171.35.169.76:9999"},
    {"http":"222.223.182.66:8000"},
    {"http":"222.90.110.194:8080"},
    {"http":"49.4.67.31:3128"},
    {"http":"182.18.13.149:53281"}
]

proxy = random.choice(proxylist)

# Build the proxy-handler object
proxyHandler = request.ProxyHandler(proxy)

# Create the custom opener
opener = request.build_opener(proxyHandler)

# Build the request object
req = request.Request(url)

res = opener.open(req).read().decode()

print(res)
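
Free proxies die constantly, so in practice it pays to set a timeout and fall back to the next proxy when one fails. A sketch of that idea (my addition; fetch_via_proxy is a made-up helper, and the proxies above are long expired):

from urllib import request
import random

def fetch_via_proxy(url, proxies, timeout=5):
    # try the proxies in random order until one answers in time
    for proxy in random.sample(proxies, len(proxies)):
        opener = request.build_opener(request.ProxyHandler(proxy))
        try:
            return opener.open(url, timeout=timeout).read().decode()
        except OSError as e:  # URLError and socket timeouts are both OSError subclasses
            print("proxy failed:", proxy, e)
    raise RuntimeError("all proxies failed")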

 

Handling GET requests


from urllib import request
import urllib.parse

# http://www.baidu.com/s?wd=%E5%8C%97%E4%BA%AC  <- the URL-encoded form

wd = {"wd":"北京"}

url = 'http://www.baidu.com/s?'

# URL-encode the query parameters
wdd = urllib.parse.urlencode(wd)
# wd=%E5%8C%97%E4%BA%AC

url = url + wdd

req = request.Request(url)

response = request.urlopen(req).read().decode()

print(response)
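
urlencode() also handles several parameters at once, quoting each value for you. A quick sketch (the extra parameter names are only for illustration):

import urllib.parse

params = {"wd": "北京", "ie": "utf-8", "pn": 10}
query = urllib.parse.urlencode(params)
# wd=%E5%8C%97%E4%BA%AC&ie=utf-8&pn=10

url = "http://www.baidu.com/s?" + query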


Tieba spider in practice

from urllib import request
import urllib.parse
import time

# Build the request headers
header = {
    "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36"
}

# URL pattern

# http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=0   # page 1
# http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=50  # page 2
# http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=100 # page 3
# http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=150 # page 4

def loadpage(fullurl,filename):
    print("Downloading:",filename)
    req = request.Request(fullurl,headers = header)
    resp = request.urlopen(req).read() # keep raw bytes: we write them to disk, so no decode()
    return resp

def writepage(html,filename):
    print("Saving:",filename)
    with open(filename,"wb") as f: # "wb" = write binary
        f.write(html)
    print("--------------------------------------------")

# Build the URL for each page
def tiebaSpider(url,begin,end):
    for page in range(begin, end+1):
        pn = (page-1)*50
        fullurl = url + "&pn=" + str(pn) # the full URL for this request
        filename = "C:\\Users\\Administrator\\Desktop\\spider\\page_" + str(page) + ".html" # file this page is saved to

        html = loadpage(fullurl,filename) # fetch the page
        writepage(html,filename) # write the fetched page to disk

if __name__ == '__main__':
    kw = input('Enter the tieba name: ')
    begin = int(input("Enter the start page: "))
    end = int(input("Enter the end page: "))

    url = "http://tieba.baidu.com/f?"

    key = urllib.parse.urlencode({"kw":kw})
    url = url + key

    tiebaSpider(url,begin,end)

    time.sleep(10)
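
Note that the time.sleep(10) at the very end only delays program exit; to actually throttle the crawler, the pause belongs inside the page loop. A sketch of the change (my suggestion, reusing loadpage/writepage from above):

import time

def tiebaSpider(url, begin, end, delay=2):
    for page in range(begin, end + 1):
        fullurl = url + "&pn=" + str((page - 1) * 50)
        filename = "page_" + str(page) + ".html"
        writepage(loadpage(fullurl, filename), filename)
        time.sleep(delay)  # pause between requests so we don't hammer the server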


Youdao Translate spider

 Open: http://fanyi.youdao.com/

# Youdao Translate spider (POST request)

from urllib import request
import urllib.parse
import re

# Build the request headers
header = {
    "User-Agent": "Mozilla/5.0 (Linux; U; Android 8.1.0; zh-cn; BLA-AL00 Build/HUAWEIBLA-AL00) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 "
                  "MQQBrowser/8.9 Mobile Safari/537.36"
}

url="http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"

key="自学"

# Form parameters the POST request must submit.
# salt/sign/ts/bv are time-sensitive values captured from the browser's
# developer tools; if the request stops working, capture fresh ones.
formdata={
   "i":key,
   "from":"AUTO",
   "to":"AUTO",
   "smartresult":"dict",
   "client":"fanyideskweb",
   "salt":"15503049709404",
   "sign":"3da914b136a37f75501f7f31b11e75fb",
   "ts":"1550304970940",
   "bv":"ab57a166e6a56368c9f95952de6192b5",
   "doctype":"json",
   "version":"2.1",
   "keyfrom":"fanyi.web",
   "action":"FY_BY_REALTIME",
   "typoResult":"false"
}

data=urllib.parse.urlencode(formdata).encode(encoding='utf-8') # POST data must be bytes

req=request.Request(url,data=data,headers=header)

resp=request.urlopen(req).read().decode()

# Regex: extract whatever sits between "tgt":" and "}]]
pat=r'"tgt":"(.*?)"}]]'

result=re.findall(pat,resp)

print(result[0])
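
Since the form asks for doctype=json, the response body is JSON, and parsing it with the json module is more robust than a regex. A small sketch (assuming resp holds the decoded response from above; the translateResult key matches the "}]]" structure the regex targets):

import json

parsed = json.loads(resp)
# translateResult is a list of lists of {"src": ..., "tgt": ...} dicts
print(parsed["translateResult"][0][0]["tgt"])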

 

Exception handling

from urllib import request

urls = [
    "http://www.baidu.com",
    "http://www.baidu.com",
    "http://jiswiswissnduehduehd.com",  # deliberately unreachable domain
    "http://www.baidu.com",
    "http://www.baidu.com"
]

i = 0
for url in urls:
    i = i+1
    try:
        request.urlopen(url)
        print("Request {} completed".format(i))
    except Exception as e:
        print(e)
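
Catching urllib's specific exceptions gives more useful diagnostics than a blanket Exception: HTTPError carries the HTTP status code, URLError the network-level reason. A minimal sketch:

from urllib import request, error

try:
    request.urlopen("http://jiswiswissnduehduehd.com", timeout=5)
except error.HTTPError as e:  # must come first: HTTPError subclasses URLError
    print("server returned an error:", e.code)    # e.g. 404, 500
except error.URLError as e:
    print("could not reach the server:", e.reason)  # e.g. DNS failure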
