首页 > 解决方案 > Java 模式匹配找不到带有正则表达式 [A-Z0-9._%+-]+@[A-Z0-9.-]{3,65}\.[AZ]{2,4} 的电子邮件

问题描述

我正在尝试从网页中提取电子邮件;我有 60 个随机网站 url 并试图从中提取电子邮件用于测试目的,我正在使用这个[A-Z0-9._%+-]+@[A-Z0-9.-]{3,65}。[ AZ]{2,4}正则表达式在页面中查找电子邮件并使用 JSoup 解析网站。

在一个工作源中编辑代码

import java.io.IOException;
import java.net.MalformedURLException;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.X509Certificate;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSession;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class TestingMail {
    // HERE WE CONSTRUCT CLASS
    public TestingMail(){}


    /****************** SETTING MAIN METHOD TO TEST CLASS *************************/
    public static void main(String[] args){
        // Setting initiator
        String Terms="Trending Bitcoin Investment Chat in NETHERLANDS";
        TestingMail extractor=new TestingMail();
        extractor.extract(Terms, extractor);
    }


    /****************** HERE WE CONSTRUCT THE EXTRACT METHOD **********************/
    public void extract(String terms, TestingMail extractor){
        // HERE WE START CONSTRUCTING THE EXTRACT PROCESSES
        int NUM_THREADS=10;
        int limit=10;
        String[] parseURL={};
        String[] crawedURL={};
        int istype=0;
        int start=0;
        // HERE WE START PROCESSING
        if(terms!=null && terms.length()>0){
            SSLContext sc = null;

            // LETS DISABLE SSL CERTIFICATE
            // Create a trust manager that does not validate certificate chains
            TrustManager[] trustAllCerts = new TrustManager[] {new X509TrustManager() {
                    public java.security.cert.X509Certificate[] getAcceptedIssuers() {
                        return null;
                    }
                    public void checkClientTrusted(X509Certificate[] certs, String authType) {
                    }
                    public void checkServerTrusted(X509Certificate[] certs, String authType) {
                    }
                }
            };

            try {
                sc = SSLContext.getInstance("SSL");
            } catch (NoSuchAlgorithmException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
            try {
                sc.init(null, trustAllCerts, new java.security.SecureRandom());
            } catch (KeyManagementException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
            HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());

            // Create all-trusting host name verifier
            HostnameVerifier allHostsValid = new HostnameVerifier() {
                public boolean verify(String hostname, SSLSession session) {
                    return true;
                }
            };

            // Install the all-trusting host verifier
            HttpsURLConnection.setDefaultHostnameVerifier(allHostsValid);

            // HERE LETS CRAW DATA FROM GOOGLE
            crawedURL=new String[]{"https://www.globfinances.com", "https://napoleoninvestment.net", "https://www.meetup.com/BitcoinWednesday/?_cookie-check=PXZ_aLyoOMcdpbrs"};
            if(crawedURL!=null && crawedURL.length>0){
                // Here we loop mails to store send mails
                if(crawedURL.length<limit){
                    limit=crawedURL.length;
                    istype=1;
                }

                // Here we set the mails length
                parseURL=new String[limit];
                // HERE WE START THREAD POOL
                ExecutorService es = Executors.newFixedThreadPool(NUM_THREADS);
                List<Future<Integer>> futures = new ArrayList<>(NUM_THREADS);

                // Submit task to every thread:
                for (int i = 0; i < NUM_THREADS; i++) {
                    // Here we loop to get mails
                    if(start<crawedURL.length){
                        for(int k=start, j=0; j<crawedURL.length; k++, j++){
                            if(k<(limit-1)){
                                System.out.println(i+"=="+j);
                                // System.out.println(mails[k]);
                                parseURL[j]=crawedURL[k];
                            }
                            else{
                                start+=limit+1;
                                break;
                            }
                        }
                        // Here we thread task
                        futures.add(i, es.submit((Callable<Integer>) new Extractor(parseURL, extractor)));
                    }
                    else{
                        istype=1;
                        break;
                    }

                    // Checking thread type to prevent multiple run
                    if(istype==1){
                        break;
                    }
                } // end of loop

                // Shutdown thread pool
                es.shutdown();
                System.out.println("Thread: "+futures.size());
            }
        }
    }


    /******************* HERE WE CONSTRUCT THE EXTRACT METHOD *******************/
    private Integer mailExtract(String[] urls) throws MalformedURLException{
        // HERE WE START CONSTRUCTING THE MAIL EXTRACTED PROCESS
        String pattern = "\\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b";
        Set<String> emails = new HashSet<>();
        String[][] extracted={};
        int totalMails=0;
        // HERE WE START PROCESSING
        if(urls!=null && urls.length>0){
            extracted=new String[urls.length][];
            // Now lets extract mails
            Pattern pat = Pattern.compile(pattern);
            // Now lets loop
            for(int i=0; i<urls.length; i++){
                emails=parse(urls[i], pat);
                int key=0;
                if(emails.size()>0){
                    for(String email:emails){
                        extracted[i][key]=email;
                        key++;
                    } // end of loop
                }
            } // end of loop

            // HERE WE CHECK EXTRACTED LENGTH
            for(int j=0; j<extracted.length; j++){
                totalMails=totalMails+extracted[j].length;
            } // end of loop

            System.out.println(totalMails);
        }

        // Here we return
        return Integer.valueOf(totalMails);
    }


    /********* HERE WE START CONSTRUCTING THE PARSE FUNCTIONS **********/
    public Set<String> parse(String url, Pattern pat){
        // HERE WE CONSTRUCT THE EMAIL PARSER PROCESS
        Set<String> emailAddresses = new HashSet<>();
        boolean found=false;
        String contents="";
        // HERE WE START PROCESSING
        if(url!=null){
            contents=urlContent(url);
            if(contents.length()>0 && contents.indexOf("body")>=0){
                // Pattern pat = Pattern.compile(pattern);
                //Matches contents against the given Email Address Pattern
                Matcher match = pat.matcher(contents);
                found=match.find();
                //If match found, append to emailAddresses
                System.out.println("I found this: "+found);
                while(found) {
                    emailAddresses.add(match.group());
                } // end of while loop
            }
        }

        // Here we return
        return emailAddresses;
    }


    // HERE WE READ URL CONTENT TO STRING
        private String urlContent(String url){
            // HERE WE CONSTRUCT THE URL CONTENT RETURNER
            String content="";
            Document doc=null;
            String sUrl="";
            // HERE WE START PROCESSING
            try {
                SSLContext sc = null;

                // LETS DISABLE SSL CERTIFICATE
                // Create a trust manager that does not validate certificate chains
                TrustManager[] trustAllCerts = new TrustManager[] {new X509TrustManager() {
                        public java.security.cert.X509Certificate[] getAcceptedIssuers() {
                            return null;
                        }
                        public void checkClientTrusted(X509Certificate[] certs, String authType) {
                        }
                        public void checkServerTrusted(X509Certificate[] certs, String authType) {
                        }
                    }
                };

                try {
                    sc = SSLContext.getInstance("SSL");
                } catch (NoSuchAlgorithmException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
                try {
                    sc.init(null, trustAllCerts, new java.security.SecureRandom());
                } catch (KeyManagementException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
                HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());

                // Create all-trusting host name verifier
                HostnameVerifier allHostsValid = new HostnameVerifier() {
                    public boolean verify(String hostname, SSLSession session) {
                        return true;
                    }
                };

                // Install the all-trusting host verifier
                HttpsURLConnection.setDefaultHostnameVerifier(allHostsValid);

                // HERE WE START CRAWLING
                if(url.startsWith("http")){
                    Connection con=Jsoup.connect(url).timeout(100000).ignoreHttpErrors(true).followRedirects(true).userAgent("Mozilla/5.0(compactible;Googlebot/2.1;+http://www.google.com/bot.html)");
                    Connection.Response resp = con.execute();
                    // HERE WE CHECK RESPONSE CODE
                    if (resp.statusCode() == 200) {
                        doc = con.get();
                        // Now lets get the text document
                        content=doc.html();
                    } // End of status check
                    else if(resp.statusCode() == 307){
                        String sNewUrl = resp.header("Location");
                        if (sNewUrl != null && sNewUrl.length() > 7)
                            sUrl = sNewUrl;
                        resp = Jsoup.connect(sUrl).timeout(100000).ignoreHttpErrors(true).userAgent("Mozilla/5.0(compactible;Googlebot/2.1;+http://www.google.com/bot.html)").execute();
                        doc =resp.parse();
                        // Now lets get the text document
                        content=doc.html();
                    } // End of status 307 check
                } // end of start with check
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
            // Here we return
            return content;
        }



        /************* HERE WE CONSTRUCT INNER CLASS TO HANDLE THREAD *****************/
        public static final class Extractor implements Callable<Integer>{
            // HERE WE CONSTRUCT CLASS
            String[] Urls;
            TestingMail Extract;
            public Extractor(String[] urls, TestingMail extract){
                Urls=urls;
                Extract=extract;
            }

            /*********** HERE WE CALL THE CALLABLE ***********/
            @Override
            public Integer call() throws Exception {
                try {
                    return Extract.mailExtract(Urls);
                } catch (Throwable t) {
                    t.printStackTrace();
                    throw new RuntimeException(t);
                }
            }

            // END OF CLASS
        }

    // END OF CLASS
}

我添加了一些打印语句来监控过程,我一直得到的都是错误的 java 模式匹配

这是我的控制台中的内容

52
0==0
0==1
0==2
0==3
0==4
0==5
0==6
0==7
0==8
Thread: 5 Extracted Mails: 0
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
java.lang.NullPointerException
    at system.soft.processor.MailExtractor.mailExtract(MailExtractor.java:202)
    at system.soft.processor.MailExtractor.access$0(MailExtractor.java:172)
    at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:239)
    at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:1)
    at java.util.concurrent.FutureTask.run(Unknown Source)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
    at java.lang.Thread.run(Unknown Source)
java.lang.NullPointerException
    at system.soft.processor.MailExtractor.mailExtract(MailExtractor.java:202)
    at system.soft.processor.MailExtractor.access$0(MailExtractor.java:172)
    at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:239)
    at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:1)
    at java.util.concurrent.FutureTask.run(Unknown Source)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
    at java.lang.Thread.run(Unknown Source)
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
java.lang.NullPointerException
    at system.soft.processor.MailExtractor.mailExtract(MailExtractor.java:202)
    at system.soft.processor.MailExtractor.access$0(MailExtractor.java:172)
    at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:239)
    at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:1)
    at java.util.concurrent.FutureTask.run(Unknown Source)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
    at java.lang.Thread.run(Unknown Source)
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
java.lang.NullPointerException
    at system.soft.processor.MailExtractor.mailExtract(MailExtractor.java:202)
    at system.soft.processor.MailExtractor.access$0(MailExtractor.java:172)
    at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:239)
    at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:1)
    at java.util.concurrent.FutureTask.run(Unknown Source)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
    at java.lang.Thread.run(Unknown Source)
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
java.lang.NullPointerException
    at system.soft.processor.MailExtractor.mailExtract(MailExtractor.java:202)
    at system.soft.processor.MailExtractor.access$0(MailExtractor.java:172)
    at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:239)
    at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:1)
    at java.util.concurrent.FutureTask.run(Unknown Source)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
    at java.lang.Thread.run(Unknown Source)

我似乎无法理解为什么没有收到电子邮件,上述网站中至少有一个在页脚中包含了一封支持电子邮件,但我的代码似乎无法收到它。我什至将我的表达改为:\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\.[a-zA-Z0-9.-]+\ b但结果相同。我不知道缺少什么或为什么 exprssion 不起作用。

任何帮助将不胜感激

标签: javaregexpattern-matchingjsoup

解决方案


对于任何编程问题,最有帮助的一个特性是最小、完整和可验证的示例。这是一个解决您的问题的方法:

import java.util.regex.*;
class Test {
  public static void main(String[] args) {
    Pattern pat = Pattern.compile("\\\\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\\\.[a-zA-Z0-9.-]+\\\\b");
    Matcher match = pat.matcher("<li>email@example.com</li>");
    System.out.println("I found this: "+ match.find() + " with expression: " + pat);
  }
}

它更短,但会产生与您的代码相同的输出:

I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b

它的问题是反斜杠是双转义的。这是没有额外转义的版本:

import java.util.regex.*;
class Test {
  public static void main(String[] args) {
    Pattern pat = Pattern.compile("\\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b");
    Matcher match = pat.matcher("<li>info@napoleoninvestment.net</li>");
    System.out.println("I found this: "+ match.find() + " with expression: " + pat);
  }
}

这是输出,现在显示匹配:

I found this: true with expression: \b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\.[a-zA-Z0-9.-]+\b

不幸的是,我不知道如何将其应用于您的代码,因为您没有包含您定义的部分pattern。很可能是因为混淆了代码的哪一层需要转义。例如,将 Java 字符串文字复制粘贴到文件不会在读回时产生相同的字符串文字,因为一个是 Java 语法,一个是原始数据,而后者不需要或允许转义。


推荐阅读