首页 > 技术文章 > 基于jsoup的Java服务端http(s)代理程序-代理服务器Demo

dreamzhiya 2016-10-08 16:19 原文

 亲爱的开发者朋友们,知道百度网址翻译吗?它为什么能够翻译源网页呢?iframe 是不能跨域操作的,所以需要通过服务端代理把目标网页转成同域内容来实现。直接上代码:

本 Demo 基于 MVC 模式编写,非常简单,复制过去稍作修改即可使用。

//Action层
/**
 * 网址翻译代理服务器接口层
 * @Description: 此接口层可完成对所请求网址的代理,实现同域访问
 * @author zhanglongping
 * @CreateDate:   2016-8-23 上午10:52:49  
 */

@At("/proxy")
public class ProxyModule {

    /**
     * Fetches the page at the given address through the proxy and writes the
     * processed HTML to the servlet response, so the browser receives the page
     * from this application's origin.
     *
     * <p>The HTML is produced by {@code ProxyUtils.getUrlMap(url, basePath)};
     * when that call returns {@code null} (its signal for a failed fetch) the
     * client receives a 502 instead of an empty 200 response.
     *
     * <p>NOTE(review): this endpoint fetches whatever URL the caller supplies
     * and is mapped with {@code @Authority("")}; confirm that it is not
     * reachable by untrusted clients, otherwise it can be abused to reach
     * internal hosts.
     *
     * @param url      address of the page to proxy (request parameter {@code yeekit_proxy_url})
     * @param request  current request; used to build this application's base URL
     * @param response response the HTML is written to
     * @return always {@code null}; the body is written directly to {@code response}
     * @author zhanglongping
     * @date 2016-8-23 10:54:13 AM
     */
    @At("/gethtml")
    @Ok("Raw")
    @Authority("")
    public Object gethtml(@Param("yeekit_proxy_url") String url,HttpServletRequest request, HttpServletResponse response){
        try {
            if (url == null || url.trim().length() == 0) {
                response.sendError(HttpServletResponse.SC_BAD_REQUEST);
                return null;
            }

            String basePath = request.getScheme() + "://" + request.getServerName() + ":"
                    + request.getServerPort() + request.getContextPath() + "/";

            String html = new ProxyUtils().getUrlMap(url, basePath);
            if (html == null) {
                response.sendError(HttpServletResponse.SC_BAD_GATEWAY);
                return null;
            }

            // The encoding must be declared before getWriter() is called; otherwise the
            // writer falls back to the servlet default (ISO-8859-1) and mangles non-Latin text.
            response.setContentType("text/html;charset=UTF-8");
            PrintWriter out = response.getWriter();
            out.write(html);
            out.flush();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

     /**
     * Forwards a GET request for one resource of a proxied page to the target server.
     *
     * <p>Pages ({@code .htm}, {@code .html}, {@code .shtml}) are fetched through
     * {@code ProxyUtils.getUrlMap} and written back as HTML. Static resources
     * ({@code .css}, {@code .js}, {@code .jpg}, {@code .png}, {@code .svg},
     * {@code .gif}) are downloaded and streamed back unchanged. Any other address
     * produces an empty response. The extension is matched case-insensitively and
     * a query string or fragment after the path is ignored.
     *
     * <p>NOTE(review): this endpoint fetches whatever URL the caller supplies
     * and is mapped with {@code @Authority("")}; confirm that it is not
     * reachable by untrusted clients, otherwise it can be abused to reach
     * internal hosts.
     *
     * @param targetUrl address of the resource to fetch (request parameter {@code yeekit_proxy_url})
     * @param request   current request
     * @param response  response the fetched content is written to
     * @return always {@code null}; the body is written directly to {@code response}
     * @throws IOException if an error status cannot be sent to the client
     */
    @At("/forward")
    @Ok("Raw")
    @Authority("")
    public Object urlRedirect(@Param("yeekit_proxy_url") String targetUrl,HttpServletRequest request, HttpServletResponse response) throws IOException {
        if (targetUrl == null || targetUrl.trim().length() == 0) {
            response.sendError(HttpServletResponse.SC_BAD_REQUEST);
            return null;
        }

        String[] pageTypes = {".htm", ".html", ".shtml"};
        String[] resourceTypes = {".css", ".js", ".jpg", ".png", ".svg", ".gif"};

        String lowerUrl = targetUrl.toLowerCase(java.util.Locale.ENGLISH);
        String lowerPath = stripQueryAndFragment(lowerUrl);
        // Prefer the path without query/fragment ("style.css?v=3"); fall back to the raw
        // address so that every URL accepted before this change is still accepted.
        boolean pathIsTyped = endsWithAny(lowerPath, pageTypes) || endsWithAny(lowerPath, resourceTypes);
        String typed = pathIsTyped ? lowerPath : lowerUrl;

        if (endsWithAny(typed, pageTypes)) {
            forwardPage(targetUrl, request, response);
        } else if (endsWithAny(typed, resourceTypes)) {
            forwardResource(targetUrl, typed.endsWith(".css"), request, response);
        }
        return null;
    }

    /** Fetches an HTML page through ProxyUtils and writes it to the response as UTF-8. */
    private void forwardPage(String targetUrl, HttpServletRequest request, HttpServletResponse response) {
        try {
            String basePath = request.getScheme() + "://" + request.getServerName() + ":"
                    + request.getServerPort() + request.getContextPath() + "/";

            String html = new ProxyUtils().getUrlMap(targetUrl, basePath);
            if (html == null) {
                response.sendError(HttpServletResponse.SC_BAD_GATEWAY);
                return;
            }

            // Declare the encoding before getWriter(); the servlet default is ISO-8859-1.
            response.setContentType("text/html;charset=UTF-8");
            PrintWriter out = response.getWriter();
            out.write(html);
            out.flush();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Downloads a static resource with HttpURLConnection and streams its bytes to the response. */
    private void forwardResource(String targetUrl, boolean css, HttpServletRequest request,
            HttpServletResponse response) throws IOException {
        // File name = last path segment, without query string or fragment.
        String path = stripQueryAndFragment(targetUrl);
        String fileName = path.substring(path.lastIndexOf('/') + 1);
        if (fileName.length() == 0) {
            fileName = targetUrl.substring(targetUrl.lastIndexOf('/') + 1);
        }
        // Re-decode the name to avoid garbled Chinese characters.
        // NOTE(review): presumably the container decoded the parameter as ISO-8859-1 - confirm.
        String imgName = new String(fileName.getBytes("iso8859-1"), "UTF-8");

        InputStream ins = null;
        HttpURLConnection httpURL = null;
        try {
            httpURL = (HttpURLConnection) new URL(targetUrl).openConnection();
            httpURL.setConnectTimeout(3000);
            // Without a read timeout a stalled upstream would block this request thread forever.
            httpURL.setReadTimeout(10000);
            httpURL.setDoInput(true);
            httpURL.setRequestMethod("GET");

            if (httpURL.getResponseCode() != HttpURLConnection.HTTP_OK) {
                response.sendError(HttpServletResponse.SC_BAD_GATEWAY);
                return;
            }

            ins = httpURL.getInputStream();
            if (css) {
                response.setContentType("text/css");
            } else if (httpURL.getContentType() != null) {
                // Pass the upstream type through so scripts and images are not left untyped.
                response.setContentType(httpURL.getContentType());
            }
            response.setHeader("content-disposition", "attachment;filename="+ ProxyUtils.encodeChineseDownloadFileName(request,imgName));

            byte[] data = new byte[1024];
            int len;
            OutputStream out = response.getOutputStream();
            while ((len = ins.read(data)) != -1) {
                out.write(data, 0, len);
            }
            out.flush();
        } catch (Exception e) {
            System.out.println("下载附件图片出错!"+targetUrl);
            e.printStackTrace();
            if (!response.isCommitted()) {
                response.sendError(HttpServletResponse.SC_BAD_GATEWAY);
            }
        } finally {
            if (ins != null) {
                try {
                    ins.close();
                } catch (IOException ignored) {
                    // nothing useful can be done if closing the upstream stream fails
                }
            }
            if (httpURL != null) {
                httpURL.disconnect();
            }
        }
    }

    /** Returns the text up to, but not including, the first '?' or '#'. */
    private static String stripQueryAndFragment(String address) {
        int end = address.length();
        int query = address.indexOf('?');
        if (query >= 0) {
            end = query;
        }
        int fragment = address.indexOf('#');
        if (fragment >= 0 && fragment < end) {
            end = fragment;
        }
        return address.substring(0, end);
    }

    /** Tells whether the text ends with at least one of the given suffixes. */
    private static boolean endsWithAny(String text, String[] suffixes) {
        for (int i = 0; i < suffixes.length; i++) {
            if (text.endsWith(suffixes[i])) {
                return true;
            }
        }
        return false;
    }

工具类

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.servlet.http.HttpServletRequest;

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

/**
 * Utility class for the web-page translation proxy service.
 *
 * <p>It fetches remote pages with jsoup, injects a {@code <base>} element (and,
 * for {@link #getUrlMap(String, String)}, the hover-translation scripts) so the
 * page can be served from the proxy's own origin, and offers helpers for
 * download file names and for extracting {@code <img>} sources.
 *
 * @author zhanglongping
 * @CreateDate: 2016-8-23 10:15:08 AM
 * @UpdateUser: zhanglongping
 * @UpdateDate: 2016-8-23 10:15:08 AM
 */
public class ProxyUtils {

    /** Matches one {@code <img ...>} tag in any letter case, also across line breaks; group 1 is the attribute text. */
    private static final Pattern IMG_TAG_PATTERN =
            Pattern.compile("<img\\b(.*?)(?:/>|></img>|>)", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** Matches a {@code src} attribute whose value is closed by the quote that opened it; group 2 is the value. */
    private static final Pattern SRC_ATTR_PATTERN =
            Pattern.compile("(?:^|[\\s\"'])src\\s*=\\s*([\"'])(.*?)\\1", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /**
     * Downloads the page at {@code url} with a GET request and prepares it for
     * being served by the proxy: a {@code <base>} element pointing at the
     * fetched page is inserted so relative links keep resolving against the
     * original site, and the jQuery and hover-translation scripts are injected.
     *
     * <p>The complete address is requested, including port and path.
     *
     * @param url      address of the page to fetch
     * @param basePath base URL of this application, used to reference the hover-translation script
     * @return the processed HTML, or {@code null} when {@code url} is {@code null},
     *         malformed, or the page cannot be fetched
     * @author zhanglongping
     * @date 2016-8-23 10:42:41 AM
     */
    public String getUrlMap(String url,String basePath){
        try {
            // Special-case rewriting of known problematic addresses.
            url = transformation(url);
            if (url == null) {
                return null;
            }
            // Validate up front: a malformed address surfaces as an IOException and yields null.
            new URL(url);

            Connection conn = Jsoup.connect(url);
            Document doc_one = conn.get();

            // Scripts injected for the mouse-hover translation feature.
            String hover_js = "<script src=\""+basePath+"/yeekit_translate_url/js/yeekit_hover_trans.js\" type=\"text/javascript\"></script>";
            String jquery_js = "<script src=\"http://cdn.bootcss.com/jquery/3.1.0/jquery.min.js\" type=\"text/javascript\"></script>";

            injectHeadMarkup(doc_one, baseTagFor(doc_one, url), jquery_js + hover_js);
            return doc_one.html();
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Downloads the page at {@code url} with a POST request and inserts a
     * {@code <base>} element pointing at the fetched page.
     *
     * @param url address of the page to fetch
     * @return the processed HTML
     * @author zhanglongping
     * @throws IOException if the address is malformed or the page cannot be fetched
     * @date 2016-8-30 5:44:31 PM
     */
    public String get_https_html(String url) throws IOException{
        // Validate up front so a malformed address is reported as an IOException.
        new URL(url);

        Connection conn = Jsoup.connect(url);
        Document doc_one = conn.post();

        injectHeadMarkup(doc_one, baseTagFor(doc_one, url), "");
        return doc_one.html();
    }

    /** Builds the {@code <base>} element for a fetched document, escaping the URL for use in an attribute. */
    private static String baseTagFor(Document doc, String requestedUrl) {
        // location() is the address the document was actually loaded from (after redirects).
        String pageUrl = doc.location();
        if (pageUrl == null || pageUrl.length() == 0) {
            pageUrl = requestedUrl;
        }
        String escaped = pageUrl.replace("&", "&amp;").replace("\"", "&quot;");
        return "<base href=\"" + escaped + "\" />";
    }

    /**
     * Inserts markup around the first {@code <meta>} element, or at the top of
     * {@code <head>} when the page has no {@code <meta>} at all.
     */
    private static void injectHeadMarkup(Document doc, String beforeHtml, String afterHtml) {
        Elements metas = doc.select("meta");
        if (!metas.isEmpty()) {
            metas.get(0).before(beforeHtml);
            if (afterHtml.length() > 0) {
                metas.get(0).after(afterHtml);
            }
        } else {
            doc.head().prepend(beforeHtml + afterHtml);
        }
    }

    /**
     * Encodes a download file name for the {@code Content-Disposition} header
     * according to the requesting browser, so that non-ASCII (e.g. Chinese)
     * names are not garbled.
     *
     * @param request   current request; its {@code USER-AGENT} header selects the encoding
     * @param pFileName file name to encode; may be {@code null}
     * @return the encoded name; the name unchanged when the request carries no
     *         user agent; {@code null} when {@code pFileName} is {@code null}
     * @throws UnsupportedEncodingException declared by the charset-by-name APIs used here
     */
    public static String encodeChineseDownloadFileName(HttpServletRequest request, String pFileName) throws UnsupportedEncodingException{
        if (pFileName == null) {
            return null;
        }
        String agent = request.getHeader("USER-AGENT");
        if (agent == null) {
            return pFileName;
        }

        String filename;
        if (agent.indexOf("Firefox") != -1) {
            // Firefox: Base64 "encoded-word" form.
            filename = "=?UTF-8?B?"
                    + new String(Base64.encodeBase64(pFileName.getBytes("UTF-8")), "UTF-8") + "?=";
        } else if (agent.indexOf("Chrome") != -1) {
            // Chrome: raw UTF-8 bytes carried through the ISO-8859-1 header encoding.
            // The charset is named explicitly so the result does not depend on the platform default.
            filename = new String(pFileName.getBytes("UTF-8"), "ISO8859-1");
        } else {
            // IE7+: percent-encoding, with spaces as %20 instead of '+'.
            filename = URLEncoder.encode(pFileName, "UTF-8");
            filename = StringUtils.replace(filename, "+", "%20");
        }
        return filename;
    }

    /**
     * Collects the {@code src} value of every {@code <img>} tag in the given HTML.
     *
     * <p>Tags are recognised in any letter case and may span several lines;
     * {@code <img .../>}, {@code <img ...></img>} and {@code <img ...>} are all
     * accepted. A tag without a quoted {@code src} attribute contributes nothing.
     *
     * @param content HTML text to scan; may be {@code null}
     * @return the {@code src} values in document order; empty when there are none
     */
    public  List<String> getImgSrc(String content){
        List<String> sources = new ArrayList<String>();
        if (content == null) {
            return sources;
        }
        Matcher imgMatcher = IMG_TAG_PATTERN.matcher(content);
        while (imgMatcher.find()) {
            Matcher srcMatcher = SRC_ATTR_PATTERN.matcher(imgMatcher.group(1));
            if (srcMatcher.find()) {
                sources.add(srcMatcher.group(2));
            }
        }
        return sources;
    }

    /**
     * Rewrites addresses that are known to need special handling.
     *
     * <p>Currently only {@code http://www.baidu.com} (with or without a trailing
     * slash) is rewritten, to {@code http://baidu.com}, because redirection for
     * the {@code www} host was found to be problematic.
     *
     * @param url address to check; may be {@code null}
     * @return the rewritten address, or {@code url} unchanged (including {@code null})
     * @author zhanglongping
     * @date 2016-8-30 6:18:48 PM
     */
    public String transformation(String url){
        if ("http://www.baidu.com".equals(url) || "http://www.baidu.com/".equals(url)) {
            return "http://baidu.com";
        }
        return url;
    }
}

 

推荐阅读