1 """豆瓣电影爬虫"""
2
3
4 import requests
5 from lxml import etree
6
# 1. Download the target page from the site.
# A browser-like User-Agent header is sent along with the request.
headers = {
    'User-Agent': 'Mozilla/5.0',
}

url = 'https://movie.douban.com/cinema/nowplaying/shangrao/'

# The timeout keeps the script from blocking forever on a stalled connection;
# raise_for_status() stops right away on an HTTP error status instead of
# letting an error page flow into the parsing step.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()

# response.content holds the raw, undecoded bytes, while response.text is a
# str decoded with whatever encoding requests selected. Decode the bytes once,
# explicitly as UTF-8, so the text that gets parsed later and the copy saved
# to disk are guaranteed to be the same string.
text = response.content.decode('utf-8')
with open('douban.html', 'w', encoding='utf-8') as fp:
    fp.write(text)
22
# 2. Extract the movie fields from the downloaded page.
def _first(values, default=''):
    """Return the first item of an XPath result list, or *default* when it is empty."""
    return values[0] if values else default


movies = []
html = etree.HTML(text)
# xpath() returns a list of matches; only the first <ul class='lists'> is used.
# When the page contains no such list, `movies` is left empty instead of the
# script failing with an IndexError.
ul = _first(html.xpath("//ul[@class='lists']"), None)
lis = ul.xpath("./li") if ul is not None else []
for li in lis:
    # Each field falls back to '' when its attribute or node is missing, so a
    # single incomplete entry does not abort the whole run.
    title = _first(li.xpath("@data-title"))
    region = _first(li.xpath("@data-region"))
    director = _first(li.xpath("@data-director"))
    actors = _first(li.xpath("@data-actors"))
    duration = _first(li.xpath("@data-duration"))
    img = _first(li.xpath(".//img/@src"))
    # strip() removes the whitespace surrounding the text node.
    release_date = _first(li.xpath(".//li[@class='release-date']/text()")).strip()

    movie = {
        'title': title,
        'region': region,
        'director': director,
        'actors': actors,
        'duration': duration,
        'img': img,
        'date': release_date,
    }
    movies.append(movie)
48
# 3. Show every collected movie record, one dict per line.
# The guard keeps the output empty (no blank line) when nothing was collected.
if movies:
    print(*movies, sep='\n')