首页 > 技术文章 > Python 爬虫实例(6)—— 爬取蚂蚁免费代理

xuchunlin 2017-04-27 15:16 原文

 数据库表 SQL 语句:

 

-- Storage table for the proxies scraped by the crawler script below.
-- Every scraped field is stored as text (varchar), exactly as it is read
-- from the page; only the surrogate key is numeric.
-- Column COMMENT strings, in order: primary key, IP address, port,
-- anonymity level, proxy type, response time, geographic location,
-- last verified time, de-duplication key.
CREATE TABLE `free_ip` (
  `free_ip_id` int(11) NOT NULL AUTO_INCREMENT COMMENT '主键',
  `ip` varchar(255) DEFAULT NULL COMMENT 'ip地址',
  `port` varchar(255) DEFAULT NULL COMMENT '端口',
  `yini_class` varchar(255) DEFAULT NULL COMMENT '匿名等级',
  `http_type` varchar(255) DEFAULT NULL COMMENT '代理类型',
  `response_time` varchar(255) DEFAULT NULL COMMENT '响应时间',
  `address` varchar(255) DEFAULT NULL COMMENT '地理位置',
  `validate_time` varchar(255) DEFAULT NULL COMMENT '最近验证时间',
  -- The crawler fills hashcode with the MD5 of "ip:port"; the UNIQUE key
  -- below therefore makes the INSERT of an already-stored proxy fail.
  `hashcode` varchar(255) DEFAULT NULL COMMENT '去重',
  PRIMARY KEY (`free_ip_id`),
  UNIQUE KEY `hashcode` (`hashcode`) USING BTREE
-- NOTE(review): AUTO_INCREMENT=4220 looks like a leftover from exporting an
-- already-populated table -- confirm whether a fresh table should keep it.
) ENGINE=InnoDB AUTO_INCREMENT=4220 DEFAULT CHARSET=utf8;

 

 

 

源代码:

 

# coding:utf-8
import random, re
import sqlite3
import json, time
import uuid
from bs4 import BeautifulSoup
import threading
import requests
import MySQLdb
from lxml import etree

import urllib3
# Silence urllib3's warning output for the whole process.
urllib3.disable_warnings()
import urllib2

import sys
# Python 2 only: reload(sys) makes sys.setdefaultencoding available again so
# the process-wide default codec for implicit str <-> unicode conversion can
# be switched to UTF-8. Neither call works on Python 3.
reload(sys)
sys.setdefaultencoding('utf-8')


# Single requests session shared by every HTTP call in this script.
session = requests.session()

import logging
import logging.handlers
import platform
# Pick the log file location according to the operating system.
sysStr = platform.system()
if sysStr =="Windows":
    LOG_FILE_check = 'H:\\log\\log.txt'
else:
    LOG_FILE_check = '/log/wlb/crawler/cic.log'


handler = logging.handlers.RotatingFileHandler(LOG_FILE_check, maxBytes=128 * 1024 * 1024,backupCount=10)  # rotating handler: rolls over at 128 MiB, keeps at most 10 backup files
fmt = '\n' + '%(asctime)s - %(filename)s:%(lineno)s  - %(message)s'
formatter = logging.Formatter(fmt)  # build the formatter from the pattern above
handler.setFormatter(formatter)  # attach the formatter to the handler
logger = logging.getLogger('check')  # fetch the logger named 'check'
logger.addHandler(handler)  # attach the rotating file handler to the logger
logger.setLevel(logging.DEBUG)


def md5(str):
    """Return the hexadecimal MD5 digest of *str*.

    Byte strings are hashed as given.  Text input is encoded as UTF-8 first,
    so the result no longer depends on the interpreter's default encoding
    (and the function also works on Python 3, where hashing text directly
    raises ``TypeError``).

    The parameter keeps its original name for backward compatibility with
    keyword callers; note that it shadows the ``str`` builtin inside this
    function.
    """
    import hashlib

    if isinstance(str, (bytes, bytearray)):
        data = str
    else:
        data = str.encode('utf-8')
    return hashlib.md5(data).hexdigest()

def _fetch_page(url, headers):
    """Download *url* and return its HTML decoded with the page's own charset."""
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = session.get(url=url, headers=headers, timeout=30)
    # Prefer the charset declared inside the document; fall back to the
    # encoding detected from the body when the page declares none.
    declared = requests.utils.get_encodings_from_content(response.text)
    response.encoding = declared[0] if declared else response.apparent_encoding
    return response.text


def _parse_rows(html):
    """Return the proxy table rows found in *html* as a list of 7-tuples.

    Each tuple holds (ip, port, anonymity level, proxy type, response time,
    location, last verified time).  An empty list is returned when the page
    does not have the expected layout.
    """
    soup = BeautifulSoup(html, 'html.parser')
    containers = soup.find_all("div", attrs={"class": "col-md-12"})
    if len(containers) < 2:
        return []

    # Flatten the markup so the regular expressions below can match the
    # whole table on a single line.
    markup = str(containers[1]).replace('\r\n\t','').replace('\r\n','').replace('\n\t','').replace('\n','').replace(' class="warning"','')

    tables = re.findall('最近验证时间</td></tr>(.*?)</tbody></table><div class="page">共',markup)
    if not tables:
        return []

    # Kept from the original: dump the raw table markup for debugging.
    print(tables[0])
    return re.findall('<tr><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td></tr>',tables[0])


def _save_rows(rows):
    """Insert *rows* into the free_ip table, reporting each row's outcome.

    One connection is opened for the whole batch and always closed afterwards.
    A failing row (for example a proxy that is already stored) is reported and
    skipped without aborting the rest of the batch.
    """
    if not rows:
        return

    sql = ("INSERT INTO free_ip "
           "(ip,port,yini_class,http_type,response_time,address,validate_time,hashcode) "
           "VALUES (%s,%s,%s,%s,%s,%s,%s,%s)")

    try:  # database connection settings: replace with your own database
        conn = MySQLdb.connect(host="110.110.110.717", user="lg", passwd="456", db="369",charset="utf8")
    except Exception as e:
        print("********插入失败********")
        print(e)
        return

    try:
        for item in rows:
            ip, port, yini_class, http_type, response_time, address, validate_time = item

            # The MD5 of "ip:port" is the value stored in the hashcode column.
            proxy = str(ip) + ":" + port
            hashcode = md5(proxy)
            params = (ip,port,yini_class,http_type,response_time,address,validate_time,hashcode)

            try:
                cursor = conn.cursor()
                try:
                    cursor.execute(sql, params)
                    conn.commit()
                finally:
                    cursor.close()
                print("          插入成功      ")
            except Exception as e:
                print("********插入失败********")
                print(e)
    finally:
        conn.close()


def freeIp():
    """Crawl pages 1-999 of the ip181.com proxy list and store every proxy.

    Each page is fetched (with one retry), parsed into table rows and written
    to the free_ip table.  A page that cannot be downloaded or does not have
    the expected layout is reported and skipped, so one bad page no longer
    aborts the whole crawl.  Takes no arguments and returns nothing.
    """
    for i in range(1,1000):
        print("正在爬取的位置是: %s" % i)

        url = "http://www.ip181.com/daili/" + str(i)+ ".html"
        headers = {
            "Host":"www.ip181.com",
            "Connection":"keep-alive",
            "Upgrade-Insecure-Requests":"1",
            "User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36",
            "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Referer":url,
            "Accept-Encoding":"gzip, deflate",
            "Accept-Language":"zh-CN,zh;q=0.8",
            }

        result = None
        for attempt in range(2):  # same policy as before: one retry per page
            try:
                result = _fetch_page(url, headers)
                break
            except Exception as e:
                print("fetch failed (attempt %d) for %s: %s" % (attempt + 1, url, e))
        if result is None:
            continue

        rows = _parse_rows(result)
        if not rows:
            print("no proxy rows found on %s, skipping" % url)
            continue

        _save_rows(rows)

# Script entry point: start the crawl as soon as the module is executed.
freeIp()

 

 

爬取效果:

 

推荐阅读