分析

蚂蚁代理的列表页大致是这样的：

端口字段使用了图片显示，并且在图片上还有各种干扰线，保存一个图片到本地用画图打开观察一下：

仔细观察蓝色的线其实是在黑色的数字下面的，其它的干扰线也是，所以这幅图是先绘制的干扰线又绘制的端口数字，于是就悲剧了，干扰线形同虚设，所以还是有办法识别的。

然后就是ip字段，看了下ip字段很老实没啥猫腻。

注意到这个列表有一个按端口号筛选的功能，很兴奋的试了一下以为可以绕过去，然后：

端口号是不用图片显示了，但是ip地址的最后一部分用图片显示，还是老老实实识别端口号吧。

另外，端口号图片的url一开始是存放在img元素的data-uri属性上的，页面加载完成后才由js把它设置到src上，所以直接请求页面拿到的html里img的src是空的：

还有就是对于图片的访问需要有一个proxy_token的cookie，否则的话访问不了这张图片，这个算是做的比较好的了，其它的站点一般都是对图片访问没有限制。

这个proxy_token是在页面返回的时候设置的，同时设置了图片的src，可以在页面底部找到这段js：

// Inline script found at the bottom of the list page; the callback is handed to jQuery's $().
$(function() {
    // Set the proxy_token cookie for the whole site (path=/). This happens before any
    // image src is assigned below.
    document.cookie = "proxy_token=mcmoveng;path=/";
    // For every img.js-proxy-img element: copy the URL held in data-uri into src,
    // then remove the data-uri attribute.
    $("img.js-proxy-img").each(function(index, item) {
        $(this).attr("src", $(this).attr("data-uri")).removeAttr("data-uri");;
    });
});

在页面返回的时候提取出对应的proxy_token即可。

代码实现

识别端口号的话使用这个库：https://github.com/CC11001100/commons-simple-character-ocr

首先需要收集一些图片来生成标注图片，这里选了它的随机选择5位数端口的列表，这样得到的数字更多可以少下几张。

另外需要注意的是对图片去噪音使用的是SingleColorClean，这种过滤器会将图片上除了指定颜色（未指定的话默认是黑色）之外的颜色统统过滤掉，正好适合这里除了字体的黑色其它干扰线统统过滤掉，当然是有一定几率干扰线是黑色的过滤不掉的，几率大概是1/0XFFFFFF吧…haha

下载一些图片生成标注图片：

package org.cc11001100.t1;

import cc11001100.ocr.OcrUtil;
import cc11001100.ocr.clean.SingleColorFilterClean;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Collects sample images for the port-number OCR.
 *
 * <p>Fetches the first list pages, downloads every port image referenced by a
 * {@code .js-proxy-img} element (sending the page's {@code proxy_token} as a cookie),
 * stores the images as PNG files and finally hands the directory to
 * {@code OcrUtil.init} so the per-character images can be labelled by hand.
 *
 * @author CC11001100
 */
public class AntProxyGrab {

	/** Base URL of the list pages; the page index is appended to it. */
	private static final String LIST_URL = "http://www.mayidaili.com/free/fiveport/";

	/** Number of list pages to fetch. */
	private static final int PAGE_COUNT = 10;

	/** How many times a download is attempted before giving up. */
	private static final int MAX_DOWNLOAD_ATTEMPTS = 3;

	/**
	 * Extracts the token from the page's inline {@code document.cookie = "proxy_token=...;path=/"}
	 * statement. The token may not contain ';' or quotes, so the match cannot run past the
	 * first ";path=/" on the line.
	 */
	private static final Pattern PROXY_TOKEN_PATTERN = Pattern.compile("proxy_token=([^;\"']+);path=/");

	private static OcrUtil ocrUtil;

	/** Running number appended to saved file names so two saves in the same millisecond do not collide. */
	private static int savedImageCount;

	static {
		ocrUtil = new OcrUtil().setImageClean(new SingleColorFilterClean());
	}

	/**
	 * Downloads all port images from the list pages and writes them as PNG files.
	 * Images that cannot be downloaded or decoded are skipped with a message on stderr.
	 *
	 * @param saveBasePath directory the images are written to; created if it does not exist
	 */
	private static void grabImage(String saveBasePath) {
		File saveDir = new File(saveBasePath);
		if (!saveDir.isDirectory() && !saveDir.mkdirs()) {
			System.err.println("cannot create directory " + saveDir);
			return;
		}
		for (int i = 0; i < PAGE_COUNT; i++) {
			String responseContent = getResponseContent(LIST_URL + i);
			String proxyToken = parseProxyToken(responseContent);
			Document doc = Jsoup.parse(responseContent);
			doc.select(".js-proxy-img").forEach(elt -> {
				String imgLink = elt.attr("data-uri");
				if (imgLink.isEmpty()) {
					// Nothing to download for this element.
					return;
				}
				byte[] imgBytes = download(imgLink, proxyToken);
				if (imgBytes.length == 0) {
					System.err.println("skip img, download failed: " + imgLink);
					return;
				}
				try {
					// ImageIO.read returns null (no exception) when the bytes are not a known image format.
					BufferedImage img = ImageIO.read(new ByteArrayInputStream(imgBytes));
					if (img == null) {
						System.err.println("skip img, not a decodable image: " + imgLink);
						return;
					}
					String fileName = System.currentTimeMillis() + "_" + (savedImageCount++) + ".png";
					ImageIO.write(img, "png", new File(saveDir, fileName));
					System.out.println("save img " + imgLink);
				} catch (IOException e) {
					e.printStackTrace();
				}
			});
		}
	}

	/**
	 * Finds the proxy_token value set by the page's inline script.
	 *
	 * @param responseContent HTML of a list page
	 * @return the token, or an empty string if the page does not contain one
	 */
	private static String parseProxyToken(String responseContent) {
		Matcher matcher = PROXY_TOKEN_PATTERN.matcher(responseContent);
		return matcher.find() ? matcher.group(1) : "";
	}

	/**
	 * Downloads a URL and decodes the body as UTF-8 text.
	 *
	 * @param url address to fetch
	 * @return the response body; empty if the download failed
	 */
	private static String getResponseContent(String url) {
		byte[] responseBytes = download(url);
		return new String(responseBytes, StandardCharsets.UTF_8);
	}

	/** Downloads a URL with an empty proxy_token cookie. */
	private static byte[] download(String url) {
		return download(url, "");
	}

	/**
	 * Downloads a URL, sending {@code proxyToken} as the proxy_token cookie.
	 *
	 * @param url        address to fetch
	 * @param proxyToken cookie value to send
	 * @return the raw response body, or an empty array if every attempt failed
	 */
	private static byte[] download(String url, String proxyToken) {
		for (int i = 0; i < MAX_DOWNLOAD_ATTEMPTS; i++) {
			try {
				return Jsoup.connect(url)
						// Without this Jsoup refuses non-text/XML responses such as image/png.
						.ignoreContentType(true)
						.cookie("proxy_token", proxyToken)
						.execute()
						.bodyAsBytes();
			} catch (IOException e) {
				e.printStackTrace();
			} catch (IllegalArgumentException e) {
				// Jsoup reports an empty or malformed URL this way; retrying cannot help.
				e.printStackTrace();
				break;
			}
		}
		return new byte[0];
	}

	/** Grabs the raw images, then lets the OCR utility generate the distinct character images. */
	public static void main(String[] args) {
		String rawImageSaveDir = "E:/test/proxy/ant/raw/";
		String distinctCharImgSaveDir = "E:/test/proxy/ant/char/";
		grabImage(rawImageSaveDir);
		ocrUtil.init(rawImageSaveDir, distinctCharImgSaveDir);
	}

}

现在去E:/test/proxy/ant/char/目录，将每张字符图片的文件名改为它所代表的字符：

上面的标注数据生成完之后，grabImage方法就没用了，在此基础上修改一下，爬取前十页的内容并返回：

package org.cc11001100.t1;

import cc11001100.ocr.OcrUtil;
import cc11001100.ocr.clean.SingleColorClean;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * Scrapes free proxies from the list pages.
 *
 * <p>The IP is read from the first cell of every table row; the port is shown as an
 * image, which is downloaded (sending the page's {@code proxy_token} as a cookie) and
 * recognised with {@code OcrUtil} using the labelled character images loaded in the
 * static initializer.
 *
 * @author CC11001100
 */
public class AntProxyGrab {

	/** Base URL of the list pages; the page index is appended to it. */
	private static final String LIST_URL = "http://www.mayidaili.com/free/fiveport/";

	/** Number of list pages to fetch. */
	private static final int PAGE_COUNT = 10;

	/** How many times a download is attempted before giving up. */
	private static final int MAX_DOWNLOAD_ATTEMPTS = 3;

	/**
	 * Extracts the token from the page's inline {@code document.cookie = "proxy_token=...;path=/"}
	 * statement. The token may not contain ';' or quotes, so the match cannot run past the
	 * first ";path=/" on the line.
	 */
	private static final Pattern PROXY_TOKEN_PATTERN = Pattern.compile("proxy_token=([^;\"']+);path=/");

	private static OcrUtil ocrUtil;

	static {
		ocrUtil = new OcrUtil().setImageClean(new SingleColorClean());
		ocrUtil.loadDictionaryMap("E:/test/proxy/ant/char/");
	}

	/**
	 * Fetches the list pages and returns every proxy whose port image could be recognised.
	 *
	 * @return proxies formatted as {@code ip:port}; rows that fail are left out
	 */
	private static List<String> grabProxyIp() {
		List<String> resultList = new ArrayList<>();
		for (int i = 0; i < PAGE_COUNT; i++) {
			String responseContent = getResponseContent(LIST_URL + i);
			String proxyToken = parseProxyToken(responseContent);
			Document doc = Jsoup.parse(responseContent);
			List<String> ipList = doc.select("tbody tr").stream()
					.map(row -> recognizeProxy(
							row.select("td:eq(0)").text(),
							row.select(".js-proxy-img").attr("data-uri"),
							proxyToken))
					.filter(Objects::nonNull)
					.collect(Collectors.toList());
			resultList.addAll(ipList);
		}
		return resultList;
	}

	/**
	 * Downloads one port image, runs OCR on it and joins the result with the IP.
	 *
	 * @param ip         text of the row's first cell
	 * @param imgLink    URL of the port image; empty if the row has none
	 * @param proxyToken cookie value required to fetch the image
	 * @return {@code ip:port}, or {@code null} if the row has no IP or image link, or the
	 *         image could not be downloaded, decoded or recognised
	 */
	private static String recognizeProxy(String ip, String imgLink, String proxyToken) {
		if (ip.isEmpty() || imgLink.isEmpty()) {
			// Not a data row; Jsoup.connect would reject the empty URL with an unchecked exception.
			return null;
		}
		byte[] imgBytes = download(imgLink, proxyToken);
		if (imgBytes.length == 0) {
			System.err.println("skip " + ip + ", download failed: " + imgLink);
			return null;
		}
		try {
			// ImageIO.read returns null (no exception) when the bytes are not a known image format.
			BufferedImage img = ImageIO.read(new ByteArrayInputStream(imgBytes));
			if (img == null) {
				System.err.println("skip " + ip + ", not a decodable image: " + imgLink);
				return null;
			}
			String port = ocrUtil.ocr(img);
			if (port == null || port.isEmpty()) {
				System.err.println("skip " + ip + ", port not recognised: " + imgLink);
				return null;
			}
			return ip + ":" + port;
		} catch (IOException e) {
			e.printStackTrace();
		}
		return null;
	}

	/**
	 * Finds the proxy_token value set by the page's inline script.
	 *
	 * @param responseContent HTML of a list page
	 * @return the token, or an empty string if the page does not contain one
	 */
	private static String parseProxyToken(String responseContent) {
		Matcher matcher = PROXY_TOKEN_PATTERN.matcher(responseContent);
		return matcher.find() ? matcher.group(1) : "";
	}

	/**
	 * Downloads a URL and decodes the body as UTF-8 text.
	 *
	 * @param url address to fetch
	 * @return the response body; empty if the download failed
	 */
	private static String getResponseContent(String url) {
		byte[] responseBytes = download(url);
		return new String(responseBytes, StandardCharsets.UTF_8);
	}

	/** Downloads a URL with an empty proxy_token cookie. */
	private static byte[] download(String url) {
		return download(url, "");
	}

	/**
	 * Downloads a URL, sending {@code proxyToken} as the proxy_token cookie.
	 *
	 * @param url        address to fetch
	 * @param proxyToken cookie value to send
	 * @return the raw response body, or an empty array if every attempt failed
	 */
	private static byte[] download(String url, String proxyToken) {
		for (int i = 0; i < MAX_DOWNLOAD_ATTEMPTS; i++) {
			try {
				return Jsoup.connect(url)
						// Without this Jsoup refuses non-text/XML responses such as image/png.
						.ignoreContentType(true)
						.cookie("proxy_token", proxyToken)
						.execute()
						.bodyAsBytes();
			} catch (IOException e) {
				e.printStackTrace();
			} catch (IllegalArgumentException e) {
				// Jsoup reports an empty or malformed URL this way; retrying cannot help.
				e.printStackTrace();
				break;
			}
		}
		return new byte[0];
	}

	/** Prints every recognised proxy, one {@code ip:port} per line. */
	public static void main(String[] args) {
		grabProxyIp().forEach(System.out::println);
	}

}

蚂蚁代理免费代理ip爬取（端口图片显示+token检查）

分析

代码实现

推荐阅读