java - Java 模式匹配找不到带有正则表达式 [A-Z0-9._%+-]+@[A-Z0-9.-]{3,65}\.[AZ]{2,4} 的电子邮件
问题描述
我正在尝试从网页中提取电子邮件;我有 60 个随机网站 url 并试图从中提取电子邮件用于测试目的,我正在使用这个[A-Z0-9._%+-]+@[A-Z0-9.-]{3,65}。[ AZ]{2,4}正则表达式在页面中查找电子邮件并使用 JSoup 解析网站。
在一个工作源中编辑代码
import java.io.IOException;
import java.net.MalformedURLException;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.X509Certificate;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSession;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
public class TestingMail {
// HERE WE CONSTRUCT CLASS
public TestingMail(){}
/****************** SETTING MAIN METHOD TO TEST CLASS *************************/
public static void main(String[] args){
// Setting initiator
String Terms="Trending Bitcoin Investment Chat in NETHERLANDS";
TestingMail extractor=new TestingMail();
extractor.extract(Terms, extractor);
}
/****************** HERE WE CONSTRUCT THE EXTRACT METHOD **********************/
public void extract(String terms, TestingMail extractor){
// HERE WE START CONSTRUCTING THE EXTRACT PROCESSES
int NUM_THREADS=10;
int limit=10;
String[] parseURL={};
String[] crawedURL={};
int istype=0;
int start=0;
// HERE WE START PROCESSING
if(terms!=null && terms.length()>0){
SSLContext sc = null;
// LETS DISABLE SSL CERTIFICATE
// Create a trust manager that does not validate certificate chains
TrustManager[] trustAllCerts = new TrustManager[] {new X509TrustManager() {
public java.security.cert.X509Certificate[] getAcceptedIssuers() {
return null;
}
public void checkClientTrusted(X509Certificate[] certs, String authType) {
}
public void checkServerTrusted(X509Certificate[] certs, String authType) {
}
}
};
try {
sc = SSLContext.getInstance("SSL");
} catch (NoSuchAlgorithmException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
try {
sc.init(null, trustAllCerts, new java.security.SecureRandom());
} catch (KeyManagementException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
// Create all-trusting host name verifier
HostnameVerifier allHostsValid = new HostnameVerifier() {
public boolean verify(String hostname, SSLSession session) {
return true;
}
};
// Install the all-trusting host verifier
HttpsURLConnection.setDefaultHostnameVerifier(allHostsValid);
// HERE LETS CRAW DATA FROM GOOGLE
crawedURL=new String[]{"https://www.globfinances.com", "https://napoleoninvestment.net", "https://www.meetup.com/BitcoinWednesday/?_cookie-check=PXZ_aLyoOMcdpbrs"};
if(crawedURL!=null && crawedURL.length>0){
// Here we loop mails to store send mails
if(crawedURL.length<limit){
limit=crawedURL.length;
istype=1;
}
// Here we set the mails length
parseURL=new String[limit];
// HERE WE START THREAD POOL
ExecutorService es = Executors.newFixedThreadPool(NUM_THREADS);
List<Future<Integer>> futures = new ArrayList<>(NUM_THREADS);
// Submit task to every thread:
for (int i = 0; i < NUM_THREADS; i++) {
// Here we loop to get mails
if(start<crawedURL.length){
for(int k=start, j=0; j<crawedURL.length; k++, j++){
if(k<(limit-1)){
System.out.println(i+"=="+j);
// System.out.println(mails[k]);
parseURL[j]=crawedURL[k];
}
else{
start+=limit+1;
break;
}
}
// Here we thread task
futures.add(i, es.submit((Callable<Integer>) new Extractor(parseURL, extractor)));
}
else{
istype=1;
break;
}
// Checking thread type to prevent multiple run
if(istype==1){
break;
}
} // end of loop
// Shutdown thread pool
es.shutdown();
System.out.println("Thread: "+futures.size());
}
}
}
/******************* HERE WE CONSTRUCT THE EXTRACT METHOD *******************/
private Integer mailExtract(String[] urls) throws MalformedURLException{
// HERE WE START CONSTRUCTING THE MAIL EXTRACTED PROCESS
String pattern = "\\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b";
Set<String> emails = new HashSet<>();
String[][] extracted={};
int totalMails=0;
// HERE WE START PROCESSING
if(urls!=null && urls.length>0){
extracted=new String[urls.length][];
// Now lets extract mails
Pattern pat = Pattern.compile(pattern);
// Now lets loop
for(int i=0; i<urls.length; i++){
emails=parse(urls[i], pat);
int key=0;
if(emails.size()>0){
for(String email:emails){
extracted[i][key]=email;
key++;
} // end of loop
}
} // end of loop
// HERE WE CHECK EXTRACTED LENGTH
for(int j=0; j<extracted.length; j++){
totalMails=totalMails+extracted[j].length;
} // end of loop
System.out.println(totalMails);
}
// Here we return
return Integer.valueOf(totalMails);
}
/********* HERE WE START CONSTRUCTING THE PARSE FUNCTIONS **********/
public Set<String> parse(String url, Pattern pat){
// HERE WE CONSTRUCT THE EMAIL PARSER PROCESS
Set<String> emailAddresses = new HashSet<>();
boolean found=false;
String contents="";
// HERE WE START PROCESSING
if(url!=null){
contents=urlContent(url);
if(contents.length()>0 && contents.indexOf("body")>=0){
// Pattern pat = Pattern.compile(pattern);
//Matches contents against the given Email Address Pattern
Matcher match = pat.matcher(contents);
found=match.find();
//If match found, append to emailAddresses
System.out.println("I found this: "+found);
while(found) {
emailAddresses.add(match.group());
} // end of while loop
}
}
// Here we return
return emailAddresses;
}
// HERE WE READ URL CONTENT TO STRING
private String urlContent(String url){
// HERE WE CONSTRUCT THE URL CONTENT RETURNER
String content="";
Document doc=null;
String sUrl="";
// HERE WE START PROCESSING
try {
SSLContext sc = null;
// LETS DISABLE SSL CERTIFICATE
// Create a trust manager that does not validate certificate chains
TrustManager[] trustAllCerts = new TrustManager[] {new X509TrustManager() {
public java.security.cert.X509Certificate[] getAcceptedIssuers() {
return null;
}
public void checkClientTrusted(X509Certificate[] certs, String authType) {
}
public void checkServerTrusted(X509Certificate[] certs, String authType) {
}
}
};
try {
sc = SSLContext.getInstance("SSL");
} catch (NoSuchAlgorithmException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
try {
sc.init(null, trustAllCerts, new java.security.SecureRandom());
} catch (KeyManagementException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
// Create all-trusting host name verifier
HostnameVerifier allHostsValid = new HostnameVerifier() {
public boolean verify(String hostname, SSLSession session) {
return true;
}
};
// Install the all-trusting host verifier
HttpsURLConnection.setDefaultHostnameVerifier(allHostsValid);
// HERE WE START CRAWLING
if(url.startsWith("http")){
Connection con=Jsoup.connect(url).timeout(100000).ignoreHttpErrors(true).followRedirects(true).userAgent("Mozilla/5.0(compactible;Googlebot/2.1;+http://www.google.com/bot.html)");
Connection.Response resp = con.execute();
// HERE WE CHECK RESPONSE CODE
if (resp.statusCode() == 200) {
doc = con.get();
// Now lets get the text document
content=doc.html();
} // End of status check
else if(resp.statusCode() == 307){
String sNewUrl = resp.header("Location");
if (sNewUrl != null && sNewUrl.length() > 7)
sUrl = sNewUrl;
resp = Jsoup.connect(sUrl).timeout(100000).ignoreHttpErrors(true).userAgent("Mozilla/5.0(compactible;Googlebot/2.1;+http://www.google.com/bot.html)").execute();
doc =resp.parse();
// Now lets get the text document
content=doc.html();
} // End of status 307 check
} // end of start with check
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
// Here we return
return content;
}
/************* HERE WE CONSTRUCT INNER CLASS TO HANDLE THREAD *****************/
public static final class Extractor implements Callable<Integer>{
// HERE WE CONSTRUCT CLASS
String[] Urls;
TestingMail Extract;
public Extractor(String[] urls, TestingMail extract){
Urls=urls;
Extract=extract;
}
/*********** HERE WE CALL THE CALLABLE ***********/
@Override
public Integer call() throws Exception {
try {
return Extract.mailExtract(Urls);
} catch (Throwable t) {
t.printStackTrace();
throw new RuntimeException(t);
}
}
// END OF CLASS
}
// END OF CLASS
}
我添加了一些打印语句来监控过程,我一直得到的都是错误的 java 模式匹配
这是我的控制台中的内容
52
0==0
0==1
0==2
0==3
0==4
0==5
0==6
0==7
0==8
Thread: 5 Extracted Mails: 0
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
java.lang.NullPointerException
at system.soft.processor.MailExtractor.mailExtract(MailExtractor.java:202)
at system.soft.processor.MailExtractor.access$0(MailExtractor.java:172)
at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:239)
at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:1)
at java.util.concurrent.FutureTask.run(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
java.lang.NullPointerException
at system.soft.processor.MailExtractor.mailExtract(MailExtractor.java:202)
at system.soft.processor.MailExtractor.access$0(MailExtractor.java:172)
at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:239)
at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:1)
at java.util.concurrent.FutureTask.run(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
java.lang.NullPointerException
at system.soft.processor.MailExtractor.mailExtract(MailExtractor.java:202)
at system.soft.processor.MailExtractor.access$0(MailExtractor.java:172)
at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:239)
at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:1)
at java.util.concurrent.FutureTask.run(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
java.lang.NullPointerException
at system.soft.processor.MailExtractor.mailExtract(MailExtractor.java:202)
at system.soft.processor.MailExtractor.access$0(MailExtractor.java:172)
at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:239)
at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:1)
at java.util.concurrent.FutureTask.run(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
java.lang.NullPointerException
at system.soft.processor.MailExtractor.mailExtract(MailExtractor.java:202)
at system.soft.processor.MailExtractor.access$0(MailExtractor.java:172)
at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:239)
at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:1)
at java.util.concurrent.FutureTask.run(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
我似乎无法理解为什么没有收到电子邮件,上述网站中至少有一个在页脚中包含了一封支持电子邮件,但我的代码似乎无法收到它。我什至将我的表达改为:\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\.[a-zA-Z0-9.-]+\ b但结果相同。我不知道缺少什么或为什么 exprssion 不起作用。
任何帮助将不胜感激
解决方案
对于任何编程问题,最有帮助的一个特性是最小、完整和可验证的示例。这是一个解决您的问题的方法:
import java.util.regex.*;
class Test {
public static void main(String[] args) {
Pattern pat = Pattern.compile("\\\\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\\\.[a-zA-Z0-9.-]+\\\\b");
Matcher match = pat.matcher("<li>email@example.com</li>");
System.out.println("I found this: "+ match.find() + " with expression: " + pat);
}
}
它更短,但会产生与您的代码相同的输出:
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
它的问题是反斜杠是双转义的。这是没有额外转义的版本:
import java.util.regex.*;
class Test {
public static void main(String[] args) {
Pattern pat = Pattern.compile("\\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b");
Matcher match = pat.matcher("<li>info@napoleoninvestment.net</li>");
System.out.println("I found this: "+ match.find() + " with expression: " + pat);
}
}
这是输出,现在显示匹配:
I found this: true with expression: \b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\.[a-zA-Z0-9.-]+\b
不幸的是,我不知道如何将其应用于您的代码,因为您没有包含您定义的部分pattern
。很可能是因为混淆了代码的哪一层需要转义。例如,将 Java 字符串文字复制粘贴到文件不会在读回时产生相同的字符串文字,因为一个是 Java 语法,一个是原始数据,而后者不需要或允许转义。
推荐阅读
- django - 如何在 django 上上传和显示视频
- tensorflow - 通过在完全成对比较中预先计算编码输出,加速比预期小
- node.js - 如何从 NestJS 中的 Query 中获取参数
- ruby-on-rails - Rails 3.2 应用程序的 heroku 推送突然失败,并显示“无法检测到 rake 任务”(没有其他错误)
- node.js - Heroku 没有检测到 Node.js 构建包
- java - 如何从 Windows git shell/gnu makefile 调用 javac
- javascript - Chart.js 折线图大部分时间不显示
- python - 如何解决错误无法导入名称样式?
- c# - 阅读 msix 包 appxmanifest 版本
- javascript - 如何正确使用 v-show 渲染我的元素?