首页 > 技术文章 > 网络动态代理反反爬

yhc-910 2020-08-20 10:24 原文

前些天写了一篇爬虫的博客，但当时没有实现通过动态代理来应对反爬（即"反反爬"），今天补充一下。如果需要大量爬取数据，建议还是付费购买稳定的代理服务。

pom文件见:https://www.cnblogs.com/yhc-910/p/13440456.html

package com.paic.ocss.fps.client.jsoup;

import java.io.File;
import java.io.FileOutputStream;
import java.util.Arrays;
import java.util.List;
import java.util.Random;

import org.apache.commons.compress.utils.Lists;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * @author yhc
 * @date 2020/8/4
 */
public class AnjukeData {

    private final static Logger log = LoggerFactory.getLogger(AnjukeData.class);

   // 一些已知的免费代理,但是调试后基本都无法使用,下面有实时获取动态代理IP的地址,每天可免费获取十个,需注册账号后生成链接:http://h.etdaili.com/
private final static String[] proxy = { "112.65.53.167:24631", "113.195.171.58:9999", "112.95.22.78:8888", "175.44.109.219:9999", "113.195.18.104:9999", "163.125.30.227:8118", "118.212.105.115:9999", "175.44.109.239:9999", "112.111.77.41:9999", "125.108.84.68:9000", "36.250.156.213:9999", "36.249.53.29:9999", "121.232.148.222:9000", "36.249.109.19:9999", "163.125.31.3:8118", "115.218.209.27:9000", "120.83.106.218:9999", "120.83.109.228:9999" }; private static int proxyIndex = 0;
// 一些agent
static String[] ua = { "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36 OPR/37.0.2178.32", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586", "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko", "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.3 Safari/537.36", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36 Core/1.47.277.400 QQBrowser/9.4.7658.400", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 UBrowser/5.6.12150.8 Safari/537.36", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36 TheWorld 7", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/60.0" }; // 读取的数据 private static List<List<String>> data = Lists.newArrayList(); private static String proxyIp = "0.0.0.0"; private static int proxyPort = 80; private static boolean isFirstCall = true; private static void refreshProxy() { isFirstCall = false; String proxy = OkHttpUtil.doGet( 
"http://47.106.160.121/Index-generate_api_url.html?packid=1&fa=0&qty=1&port=1&format=txt&ss=1&css=&ipport=1&pro=%E4%B8%8A%E6%B5%B7%E7%9B%B4%E8%BE%96%E5%B8%82&city=%E4%B8%8A%E6%B5%B7%E5%B8%82&usertype=13"); log.info("# 获取代理IP:{}", proxy); String[] proxys = proxy.split(":"); proxyIp = proxys[0]; proxyPort = Integer.parseInt(proxys[1]); } public static void getData(String urls) throws Exception { // 读取数据 List<List<String>> pageData = Lists.newArrayList(); if (isFirstCall) { refreshProxy(); } String agent = ua[new Random().nextInt(ua.length - 1)]; Document doc = Jsoup.connect(urls).timeout(600000).proxy(proxyIp, proxyPort).userAgent(agent) .ignoreContentType(true).ignoreHttpErrors(true) // .header("referer", "https://hanchuanshi.anjuke.com/sale/rd1/?kw=&from=sugg") .get(); String html = doc.outerHtml(); if (html.contains("访问验证-安居客")) { log.error("# 请求被拦截,重新设置代理请求"); refreshProxy(); getData(urls); return; } else { log.info("# 请求成功,获取数据"); } Elements els = doc.body().getElementsByClass("list-item"); for (Element el : els) { List<String> rowData = Lists.newArrayList(); Elements titleEls = el.getElementsByClass("house-title"); log.info("# 标题:{}", titleEls.get(0).getElementsByTag("a").text()); rowData.add(titleEls.get(0).getElementsByTag("a").text()); Elements itemEls = el.getElementsByClass("details-item"); Elements itemSpanEls = itemEls.get(0).getElementsByTag("span"); log.info("# 户型:{}", itemSpanEls.get(0).text()); rowData.add(itemSpanEls.get(0).text()); log.info("# 面积:{}", itemSpanEls.get(1).text()); rowData.add(itemSpanEls.get(1).text()); log.info("# 楼层:{}", itemSpanEls.get(2).text()); rowData.add(itemSpanEls.get(2).text()); log.info("# 年限:{}", itemSpanEls.get(3).text()); rowData.add(itemSpanEls.get(3).text()); if (itemEls.size() == 1) { continue; } String[] address = itemEls.get(1).getElementsByTag("span").text().split(" "); log.info("# 楼盘:{}", address[0]); log.info("# 地址:{}", address[1]); rowData.add(address[0]); rowData.add(address[1]); Elements priceEls = 
el.getElementsByClass("pro-price"); Elements priceSpanEls = priceEls.get(0).getElementsByTag("span"); log.info("# 总价:{}", priceSpanEls.get(0).getElementsByTag("strong").text()); rowData.add(priceSpanEls.get(0).getElementsByTag("strong").text()); log.info("# 单价:{}", priceSpanEls.get(1).text()); rowData.add(priceSpanEls.get(1).text()); pageData.add(rowData); } data.addAll(pageData); } public static void writeExcel(List<String> titleList, List<List<String>> dataList) throws Exception { // open file. File excel = new File("C:\\Users\\Administrator\\Desktop\\data.xls"); excel.deleteOnExit(); excel.createNewFile(); FileOutputStream fos = new FileOutputStream(excel); Workbook book = new HSSFWorkbook(); // create Sheet named "Sheet_1". 0 means this is 1st page. Sheet sheet = book.createSheet("安居客房源信息"); // 写入标题 Row titleRow = sheet.createRow(0); for (int x = 0; x < titleList.size(); x++) { Cell cell0 = titleRow.createCell(x); cell0.setCellValue(titleList.get(x)); } // 写入数据 for (int i = 0; i < dataList.size(); i++) { int row = i + 1; Row dataRow = sheet.createRow(row); List<String> rowData = dataList.get(i); for (int j = 0; j < titleList.size(); j++) { Cell dataCell = dataRow.createCell(j); dataCell.setCellValue(rowData.get(j)); } } book.write(fos); book.close(); log.info("# write data success"); } public static void main(String[] args) { try { String url = "file:///D:/Users/YUHUCHENG693/Desktop/test.html"; // url = "https://hanchuanshi.anjuke.com/sale/p1-rd1/#filtersort"; for (int i = 1; i <= 50; i++) { url = "https://hanchuanshi.anjuke.com/sale/p" + i + "-rd1/#filtersort"; getData(url); log.info("# ===>>> 获取{}页数据成功", i); Thread.sleep(10000); } List<String> titleList = Arrays.asList("标题", "户型", "面积", "楼层", "年限", "楼盘", "地址", "总价", "单价"); writeExcel(titleList, data); } catch (Exception e) { e.printStackTrace(); } } }

 

推荐阅读