首页 > 技术文章 > java爬虫抓取腾讯漫画评论

SimonHu1993 2017-09-11 14:48 原文

package com.eteclab.wodm.utils;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Executor;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Crawler that walks the Tencent Comics category list pages, finds the links
 * to individual comics on each list page, requests the comment pages of every
 * comic and appends the comment texts to local text files.
 *
 * One task per list page is submitted to a thread pool. All comments found
 * via a given list page are appended to one file whose name is built from the
 * current date string and the zero-padded list page number.
 */
public class MySearchTest2 {
	// Pool that runs one crawl task per list page. Typed as ExecutorService so
	// that main() can shut it down once every task has been submitted.
	private static final ExecutorService executor = Executors.newCachedThreadPool();
	// Base URL of a category listing; the list page number is appended to it.
	private static final String mainUrl = "http://ac.qq.com/Comic/index/state/pink/page/";
	// Prefix which, followed by a relative path such as /Comic/comicInfo/id/11111,
	// forms the URL of one comic's page.
	private static final String url = "http://ac.qq.com/Jump";

	/**
	 * Submits one crawl task per list page (pages 1 to 144) and then shuts the
	 * pool down so the JVM can exit as soon as the submitted tasks are done.
	 *
	 * @param args
	 *            unused
	 */
	public static void main(String[] args) {
		// 144 is a hard-coded number of list pages; it could be derived from
		// the total page count of the category instead.
		for (int i = 1; i <= 144; i++) {
			final int j = i;
			executor.execute(new Runnable() {
				@Override
				public void run() {
					try {
						System.out.println("begin*************第" + j + "页");
						getArticleListFromUrl(mainUrl + j, j);
						System.out.println("end*************第" + j + "页");
					} catch (Exception e) {
						System.err.println("**********************获取漫画错误**********************");
						e.printStackTrace();
					}
				}
			});

		}
		// shutdown() only stops new submissions; tasks already submitted still
		// run to completion. Without it the idle pool threads keep the JVM
		// alive after the crawl has finished.
		executor.shutdown();
	}

	/**
	 * Fetches one list page, picks out the links that point to comics and
	 * requests the first 50 comment pages of each comic found.
	 *
	 * @param listurl
	 *            URL of the list page to scan
	 * @param j
	 *            number of the list page; passed on so that it ends up in the
	 *            name of the output file
	 */
	public static void getArticleListFromUrl(String listurl, int j) {
		Document doc;
		try {
			doc = Jsoup
					.connect(listurl)
					.userAgent(
							"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0")
					.timeout(3000).get();
		} catch (IOException e) {
			System.err.println("**********************获取评论请求错误**********************");
			e.printStackTrace();
			// No document means nothing to scan. Returning here avoids
			// dereferencing a null document further down.
			return;
		}
		Elements elements = doc.getElementsByTag("a");// every <a> element of the page
		for (Element element : elements) {
			String relHref = element.attr("href");
			// Only relative links that contain the comic-info path are comics.
			if (relHref.startsWith("http://")
					|| !relHref.contains("/Comic/comicInfo/id")) {
				continue;
			}
			String detailUrl = url + relHref;
			// The comic id is whatever follows the last '/' of the link.
			String id = detailUrl.substring(detailUrl.lastIndexOf("/") + 1);
			try {
				// Parsed once per comic; a non-numeric tail throws
				// NumberFormatException, which is reported below and makes the
				// crawler move on to the next link.
				Integer comicId = Integer.valueOf(id);
				for (int i = 1; i <= 50; i++) {// 50 comment pages per comic by default
					getArticleFromUrl(detailUrl, comicId, i, j);
				}
			} catch (Exception e) {
				System.err.println("**********************获取评论分页错误**********************");
				e.printStackTrace();
			}
		}

	}

	/**
	 * Requests one page of comments for a comic from the topic-list endpoint
	 * and appends the comment texts to the output file of the given list page.
	 * The request carries fixed User-Agent, Referer and Cookie headers. A page
	 * without any comment element writes nothing.
	 *
	 * @param detailurl
	 *            URL of the comic page; currently not used by this method
	 * @param id
	 *            id of the comic whose comments are requested
	 * @param page
	 *            number of the comment page to request
	 * @param mainIndex
	 *            number of the list page the comic was found on; must have at
	 *            most four digits because it is zero-padded to four characters
	 *            for the file name
	 */
	public static void getArticleFromUrl(String detailurl, Integer id,
			Integer page, Integer mainIndex) {
		try {
			long timestamp = System.currentTimeMillis();// sent as the "_" query parameter
			Connection connect = Jsoup
					.connect("http://ac.qq.com/Community/topicList?targetId="
							+ id + "&page=" + page + "&_=" + timestamp);
			Map<String, String> header = new HashMap<String, String>();
			header.put("User-Agent",
					"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0");
			// NOTE(review): the Referer always names comic 530132, whatever id
			// is requested - confirm that the endpoint does not care.
			header.put(
					"Referer",
					"http://ac.qq.com/Comic/ComicInfo/id/530132?trace_id=1_907_10.194.156.134_1504854317");
			// NOTE(review): this is a cookie string hard-coded into the source,
			// apparently captured from a browser session. It should come from
			// configuration rather than live in the code.
			header.put(
					"Cookie",
					"LW_uid=q19499A3B6c0z0Y4z0k5h18046; pgv_pvid=8070181612; eas_sid=11O4U96326x0V0r4j0I5b2c073; pgv_pvi=623979520"
							+ "; pt2gguin=o0877101804; RK=zfdTLMzqZc; ptcz=264e6df783796823cf379b14e6aef6aa3be6a4e2fb4b6126692ee05c2a0b0c4c"
							+ "; ue_ts=1493600756; ue_uk=a058f8c6bbbe035c75bece7707297348; ue_uid=e5fb4837d184233402086deba8d197aa;"
							+ " ue_skey=0e157906ef4cb8f560768be75c751a72; LW_pid=7813c0ffd4b168e438f4a5a82ad1c993; ts_uid=2015751548"
							+ "; ts_refer=www.baidu.com/link; theme=white; roastState=2; readRecord=%5B%5B505430%2C%22%E8%88%AA%E6%B5"
							+ "%B7%E7%8E%8B%22%2C888%2C%22%E7%AC%AC871%E8%AF%9D%20%E5%8A%A0%E6%B2%B9%E5%95%8A%EF%BC%81%E5%87%AF%E6%92"
							+ "%92%EF%BC%81%EF%BC%81%22%2C871%5D%5D; readLastRecord=%5B%5D; pgv_si=s8053975040; pgv_info=ssid=s26281936"
							+ "; ts_last=ac.qq.com/Comic/ComicInfo/id/530132; girlHideState=1; topicPop=1; pc_userinfo_cookie=; o_cookie"
							+ "=877101804");
			Document document = connect.headers(header).get();
			Elements elements = document.getElementsByAttributeValue("class",
					"comment-content-detail");
			List<String> commList = new ArrayList<String>();
			for (Element element : elements) {
				commList.add(element.text());
			}
			if (commList.isEmpty()) {
				// Nothing to store for this page.
				return;
			}

			// Comments of one page are joined with '\r'. The same separator is
			// appended after the page so that the next page appended to the
			// same file does not run into the last comment of this one.
			String comment = StringUtils.listToString(commList, '\r') + '\r';
			String date = DateUtilsTool.getLongDate(new Date());
			String indexString = formatCode(mainIndex.toString());
			saveArticle(date + indexString, comment);

		} catch (IOException e) {
			System.err.println("**********************获取评论错误**********************");
			e.printStackTrace();
		}

	}

	/**
	 * Left-pads the given text with '0' characters to a width of four.
	 *
	 * @param code
	 *            text to pad, at most four characters long
	 * @return the padded text
	 * @throws RuntimeException
	 *             if code is longer than four characters
	 */
	private static String formatCode(String code) {
		int missing = 4 - code.length();
		if (missing < 0) {
			throw new RuntimeException("formatCode错误 code超过9999");
		}
		StringBuilder sb = new StringBuilder();
		for (int i = 0; i < missing; i++) {
			sb.append("0");
		}
		sb.append(code);
		return sb.toString();
	}

	/**
	 * Appends content to the file d:\MyLoadArticle\<titile>.txt, creating the
	 * directory and the file when they do not exist yet. I/O failures are
	 * reported on standard error and not propagated.
	 *
	 * @param titile
	 *            file name without the .txt extension
	 * @param content
	 *            text to append
	 */
	public static void saveArticle(String titile, String content) {
		String filePath = "d:\\MyLoadArticle\\" + titile + ".txt";// local target path
		File file = new File(filePath);
		if (!file.getParentFile().exists()) {
			file.getParentFile().mkdirs();
		}
		try {
			file.createNewFile();
		} catch (IOException e) {
			System.err.println("*******************读取文件错误*******************");
			e.printStackTrace();

		}
		// NOTE(review): FileWriter encodes with the platform default charset.
		BufferedWriter bw = null;
		try {
			bw = new BufferedWriter(new FileWriter(file, true));// true = append
			bw.write(content);
			bw.flush();
		} catch (IOException e) {
			System.err.println("*******************写入文件错误*******************");
			e.printStackTrace();
		} finally {
			// Close in finally so the file handle is released even when the
			// write fails; closing the BufferedWriter closes the FileWriter.
			if (bw != null) {
				try {
					bw.close();
				} catch (IOException e) {
					System.err.println("*******************写入文件错误*******************");
					e.printStackTrace();
				}
			}
		}

	}

}

  pom.xml 文件中需要添加的依赖(jsoup):

<dependency>
			<groupId>org.jsoup</groupId>
			<artifactId>jsoup</artifactId>
			<version>1.10.3</version>
		</dependency>

  程序中的工具类:

// Date conversion helper
/**
	 * Formats a Date with the pattern yyyyMMdd, i.e. year, month and day
	 * only, without a time-of-day part.
	 * @author: Simon
	 * @date: 2017-09-09 09:40:39
	 * @param date the date to format
	 * @return the formatted date string
	 */
	public static String getLongDate(Date date) {
		return new SimpleDateFormat("yyyyMMdd").format(date);
	}
// Joins the elements of a list into one String using the given separator
/**
	 * Converts a list to a String by concatenating the string form of each
	 * element, with the separator placed between neighbouring elements (not
	 * after the last one). A null element is rendered as "null".
	 * @author: Simon
	 * @date: 2017-09-09 10:24:52
	 * @param list the elements to join; null is treated like an empty list
	 * @param separator the character placed between two elements
	 * @return the joined text, or an empty string for a null or empty list
	 */
	 public static String listToString(List<?> list, char separator) {
		 if (list == null) {
			 return "";
		 }
		 StringBuilder sb = new StringBuilder();
		 boolean first = true;
		 // Iterating instead of calling get(i) keeps the loop linear for
		 // lists without fast random access.
		 for (Object item : list) {
			 if (!first) {
				 sb.append(separator);
			 }
			 sb.append(item);
			 first = false;
		 }
		 return sb.toString();
	 }

  

推荐阅读