python - 改进请求 - Python 脚本
问题描述
我需要帮助,我有一个 Python 脚本。它从给定的 URL 搜索电子邮件地址。并以 CSV 格式输出结果,我需要有人帮助添加一些行以在 CSV 中添加另一列,该列将显示在哪个 url 地址中找到。谢谢
import csv
import re
import sys
from typing import Dict, Set
from urllib.parse import urljoin, urlsplit

import requests
import requests.exceptions
from lxml import html
class EmailCrawler:
    """Crawl one website and collect the email addresses found on its pages.

    Results are written to a CSV file named after the site's host
    (dots replaced by underscores).
    """

    def __init__(self, website: str):
        """Initialise crawler state for *website* (the start URL).

        :param website: absolute URL where crawling begins.
        """
        # Per-instance state. The original kept these as *class* attributes,
        # so every EmailCrawler instance silently shared one queue and one
        # result set — a classic shared-mutable-state bug.
        self.processed_urls = set()
        self.unprocessed_urls = set()
        self.emails = set()
        self.website = website
        self.unprocessed_urls.add(website)
        self.headers = {
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/78.0.3904.70 Chrome/78.0.3904.70 Safari/537.36',
        }
        self.base_url = urlsplit(self.website).netloc
        self.outputfile = self.base_url.replace('.', '_') + '.csv'
        # We skip urls that end with one of these extensions. This saves a lot
        # of bandwidth and speeds up crawling: e.g. www.example.com/image.png
        # is useless to us — we cannot parse emails out of images or other
        # binary file types.
        self.garbage_extensions = ['.aif','.webp','.cda','.mid','.midi','.mp3','.mpa','.ogg','.wav','.wma','.wpl','.7z','.arj','.deb','.pkg','.rar','.rpm','.tar.gz','.z','.zip','.bin','.dmg','.iso','.toast','.vcd','.csv','.dat','.db','.dbf','.log','.mdb','.sav','.sql','.tar','.apk','.bat','.bin','.cgi','.pl','.exe','.gadget','.jar','.py','.wsf','.fnt','.fon','.otf','.ttf','.ai','.bmp','.gif','.ico','.jpeg','.jpg','.png','.ps','.psd','.svg','.tif','.tiff','.asp','.cer','.cfm','.cgi','.pl','.part','.py','.rss','.key','.odp','.pps','.ppt','.pptx','.c','.class','.cpp','.cs','.h','.java','.sh','.swift','.vb','.ods','.xlr','.xls','.xlsx','.bak','.cab','.cfg','.cpl','.cur','.dll','.dmp','.drv','.icns','.ico','.ini','.lnk','.msi','.sys','.tmp','.3g2','.3gp','.avi','.flv','.h264','.m4v','.mkv','.mov','.mp4','.mpg','.mpeg','.rm','.swf','.vob','.wmv','.doc','.docx','.odt','.pdf','.rtf','.tex','.txt','.wks','.wps','.wpd','.gif','.webp']
        self.email_count = 0

    def crawl(self):
        """Process queued URLs until the queue is empty, then print a summary.

        Rewritten iteratively: the original called itself recursively once per
        URL, so a site with more than ~1000 pages would hit Python's recursion
        limit. The loop also makes an initially-empty queue a no-op instead of
        a KeyError from set.pop().
        """
        while self.unprocessed_urls:
            url = self.unprocessed_urls.pop()
            print("CRAWL : {}".format(url))
            self.parse_url(url)
        print('End of crawling for {} '.format(self.website))
        print('Total urls visited {}'.format(len(self.processed_urls)))
        print('Total Emails found {}'.format(self.email_count))
        print('Dumping processed urls to {}'.format(self.base_url.replace('.', '_') + '.txt'))
解决方案
您的 crawl 方法中绝对不需要递归,它可以写得很简单:
class EmailCrawler:
    ...
    def crawl(self):
        """Visit each queued URL once, logging it before handing it to the parser."""
        for pending_url in self.unprocessed_urls:
            print("CRAWL : {}".format(pending_url))
            self.parse_url(pending_url)
您的 parse_emails 方法可以返回在给定文本中找到的一组电子邮件:
class EmailCrawler:
    ...
    def parse_emails(self, text: str) -> Set[str]:
        """Extract email addresses found in *text*.

        :param text: raw page text (e.g. an HTTP response body).
        :return: set of matched addresses, minus image-filename false
            positives (the regex also matches strings like ``logo@2x.png``
            that appear in HTML/CSS).

        Note: ``Set`` must be imported from ``typing`` — the original snippet
        referenced it without an import, which raises NameError as soon as
        the class body executes.
        """
        candidates = set(re.findall(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', text, re.I))
        # Set comprehension (the original comment mislabelled it a list
        # comprehension): keep only matches that are not image filenames.
        return {email for email in candidates
                if not email.endswith(('jpg', 'jpeg', 'png', 'webp', 'gif'))}
重构您的 parse_url 方法,使其返回由 parse_emails 找到的电子邮件(递归在这里更有意义):
class EmailCrawler:
    ...
    def parse_url(self, current_url: str) -> Dict[str, Set[str]]:
        """Fetch *current_url*, recurse into same-site links, and return a
        mapping of each visited page URL to the set of emails found on it.

        :param current_url: absolute URL of the page to fetch.
        :return: dict mapping page URL -> set of email addresses.

        Two fixes over the draft version:
        * cycle guard via ``self.processed_urls`` — without it, any two pages
          that link to each other recurse forever;
        * the garbage-extension filter: the original ``continue`` only skipped
          one iteration of the *inner* extension loop, so every URL — garbage
          or not — ended up appended to ``children_urls``.
        """
        # Skip pages we have already visited (breaks link cycles).
        if current_url in self.processed_urls:
            return {}
        self.processed_urls.add(current_url)
        response = requests.get(current_url, headers=self.headers)
        tree = html.fromstring(response.content)
        # Absolutize every href, then keep only links on the same host.
        links = {urljoin(self.website, href) for href in tree.xpath('//a/@href')}
        links = {link for link in links if urlsplit(link).netloc == self.base_url}
        # Drop links to binary/media files (see self.garbage_extensions).
        children_urls = [
            link for link in links
            if not any(link.endswith(ext) or link.endswith(ext + '/')
                       for ext in self.garbage_extensions)
        ]
        email_mapping = {}
        # Recurse into the child pages first ...
        for child_url in children_urls:
            email_mapping.update(self.parse_url(child_url))
        # ... then record this page's own emails.
        email_mapping[current_url] = self.parse_emails(response.text)
        return email_mapping
回到您的 crawl 方法,将结果写入 CSV 文件:
class EmailCrawler:
    ...
    def crawl(self):
        """Crawl every queued start URL and dump ``email,url`` rows to CSV.

        Completes the placeholder body of the snippet: opens
        ``self.outputfile`` once, writes a header, then one row per
        (email, page URL) pair returned by ``parse_url``. The inner loop
        variable is renamed so it no longer shadows the outer ``url``.
        """
        # newline='' is the csv-module requirement to avoid blank rows on Windows.
        with open(self.outputfile, 'w', newline='') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(['email', 'url'])
            for start_url in self.unprocessed_urls:
                print("CRAWL : {}".format(start_url))
                email_mapping = self.parse_url(start_url)
                for page_url, emails in email_mapping.items():
                    for email in emails:
                        # Second column records the page the address was found on.
                        writer.writerow([email, page_url])
推荐阅读
- asp.net - ASP.NET MVC CORS 问题
- javascript - JavaScript 错误:“ReferenceError:sprLib 未定义”
- azure-functions - Azure Function ServiceBusTrigger 绑定未注册
- amazon-web-services - AWS Lambda RDS 代理 - 无法选择 RDS 实例
- python - 在python中将多个函数输出分配给pandas数据框中的单独列
- javascript - Mongoose 相当于 JOIN ... WHERE
- javascript - 一旦我们向其他对等点发送冰候选信号,如何移除监听器?
- c# - 在 WPF 中 dataGridTemplateColumn.CellTemplate 的同一按钮单击事件后面调用不同的函数
- laravel - 如何检查laravel登录中是否存在电子邮件
- jquery - Ajax Post 调用响应为空