首页 > 技术文章 > jsoup简单的爬取网页数据

pwenlee 2016-07-25 16:27 原文

/**
 * Project Name:JavaTest
 * File Name:BankOfChinaExchangeRate.java
 * Package Name:com.lee.javatest
 * Date:2016年7月22日下午1:34:09
 * Copyright (c) 2016年7月22日, Pwenlee All Rights Reserved.
 *
*/

package com.lee.javatest;

import java.io.Serializable;
import java.math.BigDecimal;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * A single Bank of China foreign-exchange quotation, scraped from the bank's
 * public rate-search page.
 *
 * <p>Both constructors perform blocking HTTP requests against
 * {@code srh.bankofchina.com}. The instance field names and
 * {@code serialVersionUID} are part of the serialized form and are kept
 * unchanged on purpose.
 *
 * @author PwenLee
 */
public class BankOfChinaExchangeRate implements Serializable{

    private static final long serialVersionUID = -913877619191789389L;

    private static final java.util.logging.Logger LOGGER =
            java.util.logging.Logger.getLogger(BankOfChinaExchangeRate.class.getName());

    /** Number of records per result page, used to turn the record count into a page count. */
    private static final int DEFAULT_PAGE_SIZE = 20;

    /** Endpoint of the rate-search page. */
    private static final String SEARCH_URL = "http://srh.bankofchina.com/search/whpj/search.jsp";

    /** Text that precedes the total record count inside the returned page source. */
    private static final String RECORD_COUNT_MARKER = "m_nRecordCount = ";

    /** Publication time looked up by the no-argument constructor. */
    private static final String DEFAULT_TIME = "05:30:00";

    /** Connect and read timeout for every request, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 20000;

    /** Tokens required from a table row: currency, six rates, date and time. */
    private static final int EXPECTED_FIELD_COUNT = 9;

    /**
     * Currency name (Simplified Chinese).
     */
    private String currency;

    /**
     * Spot-exchange buying rate.
     */
    private BigDecimal buyingRate;

    /**
     * Cash buying rate.
     */
    private BigDecimal cashBuyingRate;

    /**
     * Spot-exchange selling rate.
     */
    private BigDecimal sellingRate;

    /**
     * Cash selling rate.
     */
    private BigDecimal cashSellingRate;

    /**
     * SAFE (State Administration of Foreign Exchange) middle rate.
     */
    private BigDecimal SAFEMiddleRate;

    /**
     * Bank of China conversion rate.
     */
    private BigDecimal bankConvertRate;

    /**
     * Publication date and time, joined by a single space.
     */
    private String dateTime;

    /** @return the currency name */
    public String getCurrency() {
        return currency;
    }

    /** @param currency the currency name */
    public void setCurrency(String currency) {
        this.currency = currency;
    }

    /** @return the spot-exchange buying rate */
    public BigDecimal getBuyingRate() {
        return buyingRate;
    }

    /** @param buyingRate the spot-exchange buying rate */
    public void setBuyingRate(BigDecimal buyingRate) {
        this.buyingRate = buyingRate;
    }

    /** @return the cash buying rate */
    public BigDecimal getCashBuyingRate() {
        return cashBuyingRate;
    }

    /** @param cashBuyingRate the cash buying rate */
    public void setCashBuyingRate(BigDecimal cashBuyingRate) {
        this.cashBuyingRate = cashBuyingRate;
    }

    /** @return the spot-exchange selling rate */
    public BigDecimal getSellingRate() {
        return sellingRate;
    }

    /** @param sellingRate the spot-exchange selling rate */
    public void setSellingRate(BigDecimal sellingRate) {
        this.sellingRate = sellingRate;
    }

    /** @return the cash selling rate */
    public BigDecimal getCashSellingRate() {
        return cashSellingRate;
    }

    /** @param cashSellingRate the cash selling rate */
    public void setCashSellingRate(BigDecimal cashSellingRate) {
        this.cashSellingRate = cashSellingRate;
    }

    /** @return the SAFE middle rate */
    public BigDecimal getSAFEMiddleRate() {
        return SAFEMiddleRate;
    }

    /** @param sAFEMiddleRate the SAFE middle rate */
    public void setSAFEMiddleRate(BigDecimal sAFEMiddleRate) {
        SAFEMiddleRate = sAFEMiddleRate;
    }

    /** @return the Bank of China conversion rate */
    public BigDecimal getBankConvertRate() {
        return bankConvertRate;
    }

    /** @param bankConvertRate the Bank of China conversion rate */
    public void setBankConvertRate(BigDecimal bankConvertRate) {
        this.bankConvertRate = bankConvertRate;
    }

    /** @return the publication date and time */
    public String getDateTime() {
        return dateTime;
    }

    /** @param dateTime the publication date and time */
    public void setDateTime(String dateTime) {
        this.dateTime = dateTime;
    }

    /**
     * Fetches the quotation published at the given date and time for the given currency.
     *
     * @author PwenLee
     * @param date date to query, e.g. "2016-07-22"
     * @param time publication time to look for in the result rows, e.g. "05:30:00"
     * @param currencyCode currency to query
     * @throws IndexOutOfBoundsException if no row containing {@code time} was found,
     *         or the row has fewer fields than expected
     * @throws NumberFormatException if one of the rate fields is not a decimal number
     */
    public BankOfChinaExchangeRate (String date, String time, BankOfChinaCurrencyCode currencyCode){
        populate(getExchangeRate(date, time, currencyCode), date, time, currencyCode);
    }

    /**
     * Fetches today's USD quotation published at 05:30:00.
     *
     * @throws IndexOutOfBoundsException if no matching row was found
     * @throws NumberFormatException if one of the rate fields is not a decimal number
     */
    public BankOfChinaExchangeRate(){
        this(today(), DEFAULT_TIME, BankOfChinaCurrencyCode.USD);
    }

    /**
     * Copies the tokens of one table row into the fields of this instance.
     * Fails up front with a descriptive message instead of an anonymous
     * index error when the row was not found or is incomplete.
     */
    private void populate(List<String> fields, String date, String time, BankOfChinaCurrencyCode currencyCode){
        if(fields.size() < EXPECTED_FIELD_COUNT){
            throw new IndexOutOfBoundsException("No complete exchange-rate row for date=" + date
                    + ", time=" + time + ", currency=" + currencyCode
                    + ": expected at least " + EXPECTED_FIELD_COUNT + " fields but got " + fields);
        }
        this.currency = fields.get(0);
        this.buyingRate = new BigDecimal(fields.get(1));
        this.cashBuyingRate = new BigDecimal(fields.get(2));
        this.sellingRate = new BigDecimal(fields.get(3));
        this.cashSellingRate = new BigDecimal(fields.get(4));
        this.SAFEMiddleRate = new BigDecimal(fields.get(5));
        this.bankConvertRate = new BigDecimal(fields.get(6));
        this.dateTime = fields.get(7) + " " + fields.get(8);
    }

    /** Returns the current date formatted as {@code yyyy-MM-dd}. */
    private static String today(){
        // A fresh formatter per call: SimpleDateFormat must not be shared between threads.
        return new SimpleDateFormat("yyyy-MM-dd").format(new Date());
    }

    /** Builds the search URL for one date and currency, without the page parameter. */
    private static String buildSearchUrl(String date, BankOfChinaCurrencyCode currencyCode){
        return SEARCH_URL + "?erectDate=" + date + "&nothing=" + date + "&pjname=" + currencyCode.getCode();
    }

    /**
     * Requests the given URL and returns the response body decoded as UTF-8.
     *
     * @author PwenLee
     * @param url address to request
     * @return the page source, or {@code null} when the request failed, the status
     *         was not 200, or the response had no entity
     */
    private static String fetchHtml(String url) {
        String html = null;
        HttpClient httpClient = new DefaultHttpClient();
        httpClient.getParams().setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, TIMEOUT_MILLIS);
        // Without a socket timeout a stalled response would block the caller indefinitely.
        httpClient.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT, TIMEOUT_MILLIS);
        HttpGet httpGet = new HttpGet(url);
        try {
            HttpResponse httpResponse = httpClient.execute(httpGet);
            int statusCode = httpResponse.getStatusLine().getStatusCode();
            if (statusCode == HttpStatus.SC_OK) {
                HttpEntity entity = httpResponse.getEntity();
                if (entity != null) {
                    html = EntityUtils.toString(entity, "utf-8");
                }
            } else {
                LOGGER.warning("Unexpected HTTP status " + statusCode + " for " + url);
            }
        } catch (Exception e) {
            // Boundary catch: callers treat a null result as "page unavailable".
            LOGGER.log(java.util.logging.Level.WARNING, "Connect " + url + " error", e);
        } finally {
            httpClient.getConnectionManager().shutdown();
        }
        return html;
    }

    /**
     * Walks the result pages from the last one downwards and returns the
     * space-separated tokens of the first table row whose text contains {@code time}.
     *
     * @return the row tokens, or an empty list when nothing matched
     */
    private static List<String> getExchangeRate(String date, String time, BankOfChinaCurrencyCode currencyCode){
        Integer totalPage = totalPage(date, time, currencyCode);
        if(totalPage <= 0){
            LOGGER.warning("No result pages for date=" + date + ", currency=" + currencyCode);
            return new ArrayList<String>();
        }

        String baseUrl = buildSearchUrl(date, currencyCode);
        // NOTE(review): the loop also requests page 0, exactly as before; the page
        // count computed above suggests pages are 1-based - confirm whether page 0 is needed.
        for(int page = totalPage; page >= 0; page--){
            String rowText = findRowText(fetchHtml(baseUrl + "&page=" + page), time);
            if(rowText != null){
                return Arrays.asList(rowText.split(" "));
            }
        }

        LOGGER.warning("No row containing time=" + time + " for date=" + date + ", currency=" + currencyCode);
        return new ArrayList<String>();
    }

    /**
     * Returns the text of the first {@code tr} inside the first {@code BOC_main}
     * element that contains {@code time}, or {@code null} when the page is
     * unavailable, has no such element, or no row matches.
     */
    private static String findRowText(String html, String time){
        if(html == null){
            // The fetch failed; skip this page instead of letting the parser reject null.
            return null;
        }
        Document doc = Jsoup.parse(html);
        Elements containers = doc.getElementsByClass("BOC_main");
        if(containers.isEmpty()){
            return null;
        }
        for (Element row : containers.get(0).getElementsByTag("tr")) {
            String text = row.text();
            if(text.contains(time)){
                return text;
            }
        }
        return null;
    }

    /**
     * Returns the number of result pages for today's USD quotations.
     *
     * @return the page count, or 0 when it could not be determined
     */
    public static Integer totalPage(){
        return totalPage(today(), DEFAULT_TIME, BankOfChinaCurrencyCode.USD);
    }

    /**
     * Returns the number of result pages for the given date and currency, derived
     * from the record count embedded in the page source.
     *
     * @param date date to query, e.g. "2016-07-22"
     * @param time not used by the lookup
     * @param currencyCode currency to query
     * @return the page count, or 0 when the page could not be fetched or parsed
     */
    public static Integer totalPage(String date, String time, BankOfChinaCurrencyCode currencyCode){
        try{
            String url = buildSearchUrl(date, currencyCode);
            String html = fetchHtml(url);
            if(html == null){
                LOGGER.warning("No page source returned for " + url);
                return 0;
            }

            int markerStart = html.indexOf(RECORD_COUNT_MARKER);
            if(markerStart < 0){
                LOGGER.warning("Record count marker not found in " + url);
                return 0;
            }
            int valueStart = markerStart + RECORD_COUNT_MARKER.length();
            int valueEnd = html.indexOf(';', valueStart);
            if(valueEnd < 0){
                LOGGER.warning("Record count terminator not found in " + url);
                return 0;
            }

            int totalRecords = Integer.parseInt(html.substring(valueStart, valueEnd).trim());
            int pages = totalRecords / DEFAULT_PAGE_SIZE;
            if(totalRecords % DEFAULT_PAGE_SIZE != 0){
                // A partially filled last page still counts as a page.
                pages++;
            }
            return pages;
        }catch(RuntimeException e){
            // Keep the "0 on any failure" contract, but leave a trace of what went wrong.
            LOGGER.log(java.util.logging.Level.WARNING,
                    "Could not determine page count for date=" + date + ", currency=" + currencyCode, e);
            return 0;
        }
    }

    @Override
    public String toString() {
        return "BankOfChinaExchangeRate [currency=" + currency
                + ", buyingRate=" + buyingRate + ", cashBuyingRate="
                + cashBuyingRate + ", sellingRate=" + sellingRate
                + ", cashSellingRate=" + cashSellingRate + ", SAFEMiddleRate="
                + SAFEMiddleRate + ", bankConvertRate=" + bankConvertRate
                + ", dateTime=" + dateTime + "]";
    }


}
View Code

 

推荐阅读