Lvmama (驴妈妈) Travel Crawler

MC-Curry 2019-03-14 12:50

Scraping links from the overview pages

The first script walks a fixed list of channel search pages, pulls each product's detail-page link out of the HTML with regular expressions, and stores the links in a gly table keyed by an MD5 hash of the URL.

import requests
import re
import pymysql
import hashlib
import datetime


class Demo(object):
    def __init__(self):
        self.host = '127.0.0.1'
        self.db = 'app_mark'
        self.user = 'root'
        self.passwd = '123456'
        self.charset = 'utf8mb4'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
        }
        self.url = 'http://www.lvmama.com/'
        self.channel_link = [
            'http://s.lvmama.com/group/H13K110000?keyword=%E6%99%AE%E5%90%89%E5%B2%9B&k=0#list',  # Islands
            'http://s.lvmama.com/route/H13K310000?keyword=%E6%96%B0%E5%8A%A0%E5%9D%A1&k=0#list',  # Southeast Asia
            'http://s.lvmama.com/route/H13K310000?keyword=%E9%A6%99%E6%B8%AF&k=0#list',  # Hong Kong/Macau/Taiwan
            'http://s.lvmama.com/group/H13K110000?keyword=%E8%BF%AA%E6%8B%9C&k=0#list',  # Dubai
            'http://s.lvmama.com/group/C262H13K110000?keyword=%E4%BF%84%E7%BD%97%E6%96%AF&tabType=group#list',  # Russia
            'http://s.lvmama.com/group/H13K110000Y4?keyword=%E8%B6%8A%E5%8D%97#list',  # Vietnam (duplicated '#list' dropped)
            'http://s.lvmama.com/group/C265H13K110000?keyword=%E6%B3%95%E5%9B%BD&tabType=group#list',  # France/Switzerland/Italy/Germany (stray '%22' dropped)
            'http://s.lvmama.com/group/H13K110000?keyword=%E5%B7%B4%E5%8E%98%E5%B2%9B&k=0#list',  # Bali
            'http://s.lvmama.com/route/H13K310000?keyword=%E6%97%A5%E6%9C%AC&k=0#list',  # Japan
            'http://s.lvmama.com/route/H13K310000?keyword=%E6%AC%A7%E6%B4%B2&k=0#list',  # Europe
            'http://s.lvmama.com/route/H13K440100?keyword=%E6%96%B0%E5%8A%A0%E5%9D%A1&k=0#list',  # Singapore
            'http://s.lvmama.com/route/H13K310000?keyword=%E9%A6%99%E6%B8%AF&k=0#list',  # Hong Kong
            'http://s.lvmama.com/route/H13K310000?keyword=%E6%BE%B3%E6%B4%B2&k=0#list',  # Australia
            'http://s.lvmama.com/route/H13K310000?keyword=%E6%B3%B0%E5%9B%BD&k=0#list',  # Thailand
            'http://s.lvmama.com/route/H13K440300?keyword=%E4%B8%89%E4%BA%9A&k=0#list',  # Sanya
            'http://s.lvmama.com/route/H13K440300P2?keyword=%E4%B8%89%E4%BA%9A&tabType=route350',  # Sanya p2
            'http://s.lvmama.com/route/H13K440300P3?keyword=%E4%B8%89%E4%BA%9A&tabType=route350',  # Sanya p3
            'http://s.lvmama.com/route/H13K440300P4?keyword=%E4%B8%89%E4%BA%9A&tabType=route350',  # Sanya p4
            'http://s.lvmama.com/route/H13K440300?keyword=%E5%8E%A6%E9%97%A8&k=0#list',  # Xiamen
            'http://s.lvmama.com/route/H13K440300?keyword=%E5%B9%BF%E4%B8%9C&k=0#list',  # Guangdong
            'http://s.lvmama.com/route/H13K440300?keyword=%E4%BA%91%E5%8D%97&k=0#list',  # Yunnan
            'http://s.lvmama.com/route/H13K440300?keyword=%E4%B8%8A%E6%B5%B7&k=0#list',  # Shanghai
            'http://s.lvmama.com/route/H13K440300?keyword=%E8%A5%BF%E5%AE%89&k=0#list',  # Xi'an
            'http://s.lvmama.com/route/H13K440300?keyword=%E6%88%90%E9%83%BD&k=0#list',  # Chengdu
            'http://s.lvmama.com/route/H13K440300?keyword=%E5%90%89%E6%9E%97&k=0#list',  # Jilin
            'http://s.lvmama.com/route/H13K440300?keyword=%E8%A5%BF%E5%8C%97&k=0#list',  # Northwest China
            'http://s.lvmama.com/scenictour/K110000?keyword=%E5%8C%97%E4%BA%AC&k=0#list',  # Beijing
            'http://s.lvmama.com/scenictour/K110000?keyword=%E5%B1%B1%E4%B8%9C&k=0#list',  # Shandong
            'http://s.lvmama.com/scenictour/K110000?keyword=%E5%B1%B1%E8%A5%BF&k=0#list',  # Shanxi
            'http://s.lvmama.com/scenictour/K110000?keyword=%E6%B2%B3%E5%8C%97&k=0#list',  # Hebei
            'http://s.lvmama.com/scenictour/K110000?keyword=%E8%BE%BD%E5%AE%81&k=0#list',  # Liaoning
        ]
        # Channel labels, index-aligned with channel_link
        self.channel_name = [
            '海岛',
            '东南亚',
            '中国港澳台',
            '迪拜',
            '俄罗斯',
            '越南',
            '法瑞意德',
            '巴厘岛',
            '日本',
            '欧洲',
            '新加坡',
            '香港',
            '澳洲',
            '泰国',
            '三亚',
            '三亚p2',
            '三亚p3',
            '三亚p4',
            '厦门',
            '广东',
            '云南',
            '上海',
            '西安',
            '成都',
            '吉林',
            '西北',
            '北京',
            '山东',
            '山西',
            '河北',
            '辽宁',
        ]

    def get_html(self, url):
        # A timeout is added so one hung request cannot stall the crawl
        response = requests.get(url, headers=self.headers, timeout=10)
        response.encoding = response.apparent_encoding
        return response.text

    def get_data(self):
        # Home-page scrape, kept for reference:
        # html = self.get_html(self.url)
        # datas = re.findall('<li data-mmurl=.*?<div class="footLink">', html, re.S)[0]
        # lis = re.findall('(<li data-mmurl=.*?</li>)', datas, re.S)
        # for li in lis:
        #     # detail_url = re.findall('<li data-mmurl="(.*?)"', li, re.S)  # app link of the detail page
        #     detail_url = re.findall('href="(.*?)"', li, re.S)[0]  # web link of the detail page
        #     self.save_data(detail_url)

        # Channel scrape: pull every product link out of each listing page
        for index, channel in enumerate(self.channel_link):
            html = self.get_html(channel)
            # Narrow to the product list, then split it into product cards
            blocks = re.findall('<div class="product-left".*<div class="paging orangestyle"', html, re.S)
            if not blocks:
                continue  # layout changed or the request was blocked
            divs = re.findall('<div class="product-section">.*?</div>', blocks[0], re.S)
            for div in divs:
                print(self.channel_name[index])
                links = re.findall('<a href="(.*?)"', div, re.S)
                if links:
                    self.save_data(links[0])

    def save_data(self, url):
        print(url)
        # MD5 of the link doubles as a de-duplication key
        hkey = hashlib.md5(url.encode('utf-8')).hexdigest()
        sitename = '驴妈妈旅游'
        lasttime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        tag = '0'  # '0' = not yet parsed by the detail-page script
        list_sql = [url, hkey, tag, sitename, lasttime]
        con = pymysql.connect(host=self.host, db=self.db, user=self.user, passwd=self.passwd, charset=self.charset)
        cur = con.cursor()
        sql = 'insert into gly(link, hkey, tag, sitename, lasttime) values (%s, %s, %s, %s, %s)'
        try:
            cur.execute(sql, list_sql)
        except Exception as e:
            con.rollback()
            print('error~', e)
        else:
            con.commit()
            print('insert success')
        cur.close()
        con.close()


if __name__ == '__main__':
    demo = Demo()
    demo.get_data()
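
Neither script creates its own tables, and the original post never shows the table definitions. The following is a minimal one-off setup sketch for the gly link table; the column types are assumptions inferred from the INSERT in save_data(), and the UNIQUE key on hkey is an added assumption that makes a repeated link fail the INSERT, which is what de-duplicates links across runs.

import pymysql

# Setup sketch with an ASSUMED schema: the post includes no CREATE TABLE,
# so the column types here are guesses sized to the data being stored.
# The UNIQUE key on hkey makes the INSERT in save_data() raise for a link
# that was already stored, so the duplicate row is skipped.
con = pymysql.connect(host='127.0.0.1', db='app_mark', user='root',
                      passwd='123456', charset='utf8mb4')
cur = con.cursor()
cur.execute('''
    create table if not exists gly (
        link     varchar(512) not null,
        hkey     char(32)     not null,
        tag      char(1)      not null default '0',
        sitename varchar(64)  not null,
        lasttime datetime     not null,
        unique key uk_hkey (hkey)
    ) default charset=utf8mb4
''')
con.commit()
cur.close()
con.close()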

 

Parsing fields from the detail pages

The second script reads the saved links back out of the gly table, fetches each detail page on a 20-thread pool, and regex-parses the title, price, positive-review rate, departure and destination cities, trip length, and product type into an lvmama table.

import pymysql
import re
import requests
from multiprocessing.dummy import Pool as ThreadPool
import datetime


class XLY(object):
    def __init__(self):
        self.host = '127.0.0.1'
        self.db = 'app_mark'
        self.user = 'root'
        self.passwd = '123456'
        self.charset = 'utf8mb4'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
        }
        self.start = datetime.datetime.now()

    def get_data(self):
        # Fetch unparsed links from the gly table, then mark them as taken.
        # Note: the original selected tag = "1", which never matches the rows
        # the overview script just wrote (it inserts tag = '0'); selecting
        # tag = "0" matches that handoff.
        con = pymysql.connect(host=self.host, db=self.db, user=self.user, passwd=self.passwd, charset=self.charset)
        cur = con.cursor()
        sql = 'select link from gly where tag="0" and sitename="驴妈妈旅游"'
        after_sql = 'update gly set tag="1" where tag="0" and sitename="驴妈妈旅游"'
        try:
            cur.execute(sql)
            results = cur.fetchall()
            cur.execute(after_sql)
        except Exception as e:
            con.rollback()
            results = None
            print('error~', e)
        else:
            con.commit()
        cur.close()
        con.close()
        return results

    def parse_data(self, url):
        # Regex-match each field; several alternative patterns are tried
        # because lvmama renders different templates per product type.
        print(url)
        url = url[0]  # fetchall() rows are 1-tuples
        # Product id: last path segment with the query string stripped
        id_num = url.split('/')[-1]
        id_num = re.sub(r'\?.*', '', id_num)
        response = requests.get(url, headers=self.headers, timeout=10)
        html = response.text
        if 'scenic' not in url and 'hotel' not in url:
            # Skip hotel and scenic-spot pages; parse everything else
            # Title
            title = (re.findall('<h.*?tit">(.*?)</h.*?>', html, re.S) or
                     re.findall('<p class="nchtitle">(.*?)</p>', html, re.S))
            if title:
                title = re.sub(r'\n|\r|&nbsp;|自营|<[\s\S]*?>', '', title[0]).strip()
            else:
                title = None
            # Price
            price = (re.findall(r'<dfn.*?>(\d+)</dfn>', html, re.S) or
                     re.findall(r'<span class="product_price">.*?(\d+).*?</span>', html, re.S) or
                     re.findall(r'¥<em>(\d+)</em>', html, re.S) or
                     re.findall(r'<span class="product-price-value">.*?(\d+).*?</span>', html, re.S))
            price = price[0] if price else None
            # Positive-review rate
            praise = (re.findall(r'<p class="product_top_dp">[\s\S]*?<span>([\s\S]*?)</span>[\s\S]*?</p>', html, re.S) or
                      re.findall(r'<a href="#pro_comment".*?<span>([\s\S]*?)</span>', html, re.S) or
                      re.findall(r'<span class="c_f60">([\s\S]*?)</span>', html, re.S) or
                      re.findall(r'<p class="product_top_dp">[\s\S]*?<span>([\s\S]*?)<small>%</small>[\s\S]*?</span>', html, re.S) or
                      re.findall(r'<span class="val">([\s\S]*?)</span>', html, re.S))
            if praise:
                praise = re.sub('<.*?>', '', praise[0]).strip().replace('%', '')
                try:
                    praise = float(praise)
                except ValueError:
                    praise = None
                else:
                    if praise > 100:
                        print('praise-rate scrape error')
                        praise = None
            else:
                praise = None
            # Departure city, destination and trip length
            starting_city = re.findall(r'<dl class="info-city">[\s\S]*?出发城市[\s\S]*?<ii>([\s\S]*?)</ii></dd>', html, re.S)
            target_city = re.findall(r'<dt>目的地[\s\S]*?<dd>([\s\S]*?)</dd>', html, re.S)
            if starting_city and target_city:
                starting_city = re.sub('<.*?>', '', starting_city[0])
                target_city = re.sub('<.*?>', '', target_city[0])
                days = re.findall(r'<dt>出游天数[\s\S]*?<dd>([\s\S]*?)</dd>', html, re.S)
                days_spent = re.sub('<.*?>', '', days[0]) if days else None
            else:
                starting_city = target_city = days_spent = None
            # Product type
            type_ = (re.findall(r'<i class="t-category">([\s\S]*?)</i>', html, re.S) or
                     re.findall(r'<span class="product_top_type product_type_zyx">([\s\S]*?)</span>', html, re.S) or
                     re.findall(r'<span class="dpn_group">([\s\S]*?)</span>', html, re.S))
            type_ = type_[0] if type_ else None
            list_data = [id_num, title, price, praise, starting_city, target_city, days_spent, type_, url]
            self.save_data(list_data)

    def save_data(self, list_data):
        # Write the parsed record into the lvmama table
        con = pymysql.connect(host=self.host, db=self.db, user=self.user, passwd=self.passwd, charset=self.charset)
        cur = con.cursor()
        sql = 'insert into lvmama(id_num, title, price, praise, starting_city, target_city, days_spent, type_, link) values (%s, %s, %s, %s, %s, %s, %s, %s, %s)'
        try:
            cur.execute(sql, list_data)
        except Exception as e:
            con.rollback()
            print('error~', e)
        else:
            con.commit()
            print('insert success')
        cur.close()
        con.close()


if __name__ == '__main__':
    xly = XLY()
    urls = xly.get_data()
    if urls:
        # Parse the detail pages on a 20-thread pool
        pool = ThreadPool(20)
        pool.map(xly.parse_data, urls)
        pool.close()
        pool.join()
    end = datetime.datetime.now()
    print('elapsed:', end - xly.start)
    # Single-threaded alternative, handy for debugging
    # (parse_data unpacks the row tuple itself):
    # for url in urls:
    #     xly.parse_data(url)
    #     break
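
The detail script likewise assumes an lvmama results table. A matching setup sketch follows, again with assumed column types; making id_num the primary key means a product parsed twice fails the INSERT instead of producing a duplicate row.

import pymysql

# Setup sketch with an ASSUMED schema for the results table; the columns
# mirror the list_data built in parse_data(). id_num as the primary key
# rejects duplicate rows when the same product link is parsed again.
con = pymysql.connect(host='127.0.0.1', db='app_mark', user='root',
                      passwd='123456', charset='utf8mb4')
cur = con.cursor()
cur.execute('''
    create table if not exists lvmama (
        id_num        varchar(32) not null,
        title         varchar(255),
        price         int,
        praise        float,
        starting_city varchar(64),
        target_city   varchar(255),
        days_spent    varchar(32),
        type_         varchar(64),
        link          varchar(512),
        primary key (id_num)
    ) default charset=utf8mb4
''')
con.commit()
cur.close()
con.close()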

 
