首页 > 技术文章 > 1 urllib库(了解)

sruzzg 原文

urllib 是 Python 的基本库之一,内置四大模块,即 request、error、parse、robotparser。常用的是 request 和 error:一个用于发送 HTTP 请求,一个用于处理请求的错误。parse 用于对 URL 的处理,如拆分、合并等。

1、urllib库之urlopen函数

 1 """urllib库之urlopen函数"""
 2 
 3 
 4 #from urllib import request
 5 import urllib.request
 6 
 7 # 使用urlopen获得网页代码
 8 resp = urllib.request.urlopen('http://www.baidu.com/')
 9 
10 # print(resp.read())   #读取的是编码数据
11 print(resp.read().decode('utf-8'))
12 # <class 'http.client.HTTPResponse'>
13 print(type(resp))
14 
15 # print(resp.readline())    #读取1行
16 #print(resp.readlines())    #读取多行

2、urllib库之urlretrieve下载数据

 1 """urllib库之urlretrieve下载数据"""
 2 
 3 
 4 import urllib.request
 5 
 6 # 使用urlretrieve下载数据
 7 urllib.request.urlretrieve('http://www.baidu.com', 'baidu.html')
 8 
 9 # 使用urlretrieve下载图片
10 urllib.request.urlretrieve('http://img01.tooopen.com/Downs/images/2011/10/30/sy_20111030205827520061.jpg', 'image.jpg')

3、urllib库之解码与编码

 1 """urllib库之解码与编码"""
 2 
 3 
 4 import urllib.parse
 5 import urllib.request
 6 
 7 #urlencode函数的用法
 8 params = {'name':'张三', 'age':18, '年级':'一年级'}
 9 # 编码
10 result = urllib.parse.urlencode(params)
11 #name=%E5%BC%A0%E4%B8%89&age=18&%E5%B9%B4%E7%BA%A7=%E4%B8%80%E5%B9%B4%E7%BA%A7
12 print(result)
13 
14 # url = 'https://www.baidu.com/s?wd=刘德华'
15 url = 'https://www.baidu.com/s?'
16 param = {'wd':'刘德华'}
17 qs = urllib.parse.urlencode(param)
18 url += qs
19 print(url)
20 resp = urllib.request.urlopen(url)
21 print(resp.read().decode('utf-8'))
22 
23 
# Round trip 1: dict -> query string (urlencode) -> dict (parse_qs).
params = {'name': '张三', 'age': 18, '年级': '一年级'}
# Encode
result = urllib.parse.urlencode(params)
# name=%E5%BC%A0%E4%B8%89&age=18&%E5%B9%B4%E7%BA%A7=%E4%B8%80%E5%B9%B4%E7%BA%A7
print(result)
# Decode: every value comes back as a list of strings.
result2 = urllib.parse.parse_qs(result)
# {'name': ['张三'], 'age': ['18'], '年级': ['一年级']}
print(result2)


# Round trip 2: plain string -> percent-encoded (quote) -> plain (unquote).
params2 = "张三李四"
# Encode
rs = urllib.parse.quote(params2)
# Apply the format string with %; passing it and the value as two separate
# print() arguments would output the literal "%s" followed by the value.
rs_line = 'rs=%s' % rs
print(rs_line)
# Decode
rs2 = urllib.parse.unquote(rs)
rs2_line = 'rs2=%s' % rs2
print(rs2_line)

4、urllib库之urlparse

 1 """urllib库之urlparse"""
 2 
 3 
 4 import urllib.parse
 5 
 6 url = 'http://www.google.com/search;hello?hl=en&q=urlparse&btnG=Google+Search#1'
 7 result = urllib.parse.urlparse(url)
 8 print('result = {}'.format(result))
 9 print(result.scheme)    # http
10 print(result.netloc)    # www.google.com
11 print(result.path)      # /search
12 print(result.params)    # hello (用的极少)
13 print(result.query)     # hl=en&q=urlparse&btnG=Google+Search
14 print(result.fragment)  # 1
15 result2 = urllib.parse.urlunparse(result)
16 print('result2 = {}'.format(result2))
17 
18 
19 url2 = 'http://www.google.com/search;hello?hl=en&q=urlparse&btnG=Google+Search#1'
20 result3 = urllib.parse.urlsplit(url2)
21 print('result3 = {}'.format(result3))
22 print(result.scheme)    # http
23 print(result.netloc)    # www.google.com
24 print(result.path)      # /search
25 print(result.query)     # hl=en&q=urlparse&btnG=Google+Search
26 print(result.fragment)  # 1
27 result4 = urllib.parse.urlunsplit(result3)
28 print('result4 = {}'.format(result4))
29 
30 
31 url3 = urllib.parse.urljoin('http://www.google.com/search?', 'hl=en&q=urlparse&btnG=Google+Search#1')
32 print('url3 = {}'.format(url3))
33 url4 = urllib.parse.urljoin('http://www.google.com/search?/', 'hl=en&q=urlparse&btnG=Google+Search#1')
34 print('url4 = {}'.format(url4))
35 url5 = urllib.parse.urljoin('http://www.google.com/search', '?hl=en&q=urlparse&btnG=Google+Search#1')
36 print('url5 = {}'.format(url5))

推荐阅读