Improvement request - Python script

Problem description

I need some help. I have a Python script that searches for email addresses starting from a given URL and outputs the results in CSV format. I need someone to help add a few lines so the CSV gets another column showing which url address each email was found at. Thanks

import re
import requests
import requests.exceptions
from urllib.parse import urlsplit, urljoin
from lxml import html
import sys
import csv


class EmailCrawler:

    processed_urls = set()
    unprocessed_urls = set()
    emails = set()

    def __init__(self, website: str):
        self.website = website
        self.unprocessed_urls.add(website)
        self.headers = {
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/78.0.3904.70 Chrome/78.0.3904.70 Safari/537.36',
        }
        self.base_url = urlsplit(self.website).netloc
        self.outputfile = self.base_url.replace('.','_')+'.csv'
        # we will use this list to skip urls that end with one of these extensions. This saves a lot of bandwidth and speeds up the crawling process
        # for example: www.example.com/image.png --> this url is useless to us; we cannot parse emails from images or other non-HTML files.
        self.garbage_extensions = ['.aif','.webp','.cda','.mid','.midi','.mp3','.mpa','.ogg','.wav','.wma','.wpl','.7z','.arj','.deb','.pkg','.rar','.rpm','.tar.gz','.z','.zip','.bin','.dmg','.iso','.toast','.vcd','.csv','.dat','.db','.dbf','.log','.mdb','.sav','.sql','.tar','.apk','.bat','.bin','.cgi','.pl','.exe','.gadget','.jar','.py','.wsf','.fnt','.fon','.otf','.ttf','.ai','.bmp','.gif','.ico','.jpeg','.jpg','.png','.ps','.psd','.svg','.tif','.tiff','.asp','.cer','.cfm','.cgi','.pl','.part','.py','.rss','.key','.odp','.pps','.ppt','.pptx','.c','.class','.cpp','.cs','.h','.java','.sh','.swift','.vb','.ods','.xlr','.xls','.xlsx','.bak','.cab','.cfg','.cpl','.cur','.dll','.dmp','.drv','.icns','.ico','.ini','.lnk','.msi','.sys','.tmp','.3g2','.3gp','.avi','.flv','.h264','.m4v','.mkv','.mov','.mp4','.mpg','.mpeg','.rm','.swf','.vob','.wmv','.doc','.docx','.odt','.pdf','.rtf','.tex','.txt','.wks','.wps','.wpd','.gif','.webp']
        self.email_count = 0

    def crawl(self):
        """
        It will continue crawling until the unprocessed urls list is empty
        """

        url = self.unprocessed_urls.pop()
        print("CRAWL : {}".format(url))
        self.parse_url(url)


        if len(self.unprocessed_urls)!=0:
            self.crawl()
        else:
            print('End of crawling for {} '.format(self.website))
            print('Total urls visited {}'.format(len(self.processed_urls)))
            print('Total Emails found {}'.format(self.email_count))
            print('Dumping processed urls to {}'.format(self.base_url.replace('.','_')+'.txt'))
      

Tags: python

Solution


There is absolutely no need for recursion in your crawl method; it can be as simple as this:

class EmailCrawler:
    ...

    def crawl(self):
        for url in self.unprocessed_urls:
            print("CRAWL : {}".format(url))
            self.parse_url(url)
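
If you would rather keep the original structure, where parse_url pushes newly discovered links onto self.unprocessed_urls, the same idea works as a plain worklist loop instead of recursion. This is only a sketch and assumes parse_url keeps filling self.unprocessed_urls the way your current script appears to:

class EmailCrawler:
    ...

    def crawl(self):
        # keep popping urls until the worklist is drained
        while self.unprocessed_urls:
            url = self.unprocessed_urls.pop()
            if url in self.processed_urls:
                continue
            self.processed_urls.add(url)
            print("CRAWL : {}".format(url))
            self.parse_url(url)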

Your parse_emails method can return the set of emails found in the given text (the Set and Dict annotations used below assume a from typing import Dict, Set at the top of the script):

class EmailCrawler:
    ...

    def parse_emails(self, text: str) -> Set[str]:
        emails = set(re.findall(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', text, re.I))
          
        # Use a set comprehension to filter out matches that end with image extensions
        filtered_emails = {email for email in emails if not email.endswith(('jpg', 'jpeg', 'png', 'webp', 'gif'))}
        return filtered_emails
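
As a quick check, parse_emails can be exercised on an in-memory string, assuming the class now contains the version shown above; no network access is involved because __init__ only stores settings. The address and the logo@2x.png file name are made-up examples:

crawler = EmailCrawler('https://example.com')
sample = 'Contact us at info@example.com or download /static/logo@2x.png'
print(crawler.parse_emails(sample))
# prints {'info@example.com'} -- logo@2x.png also matches the regex,
# but the extension filter drops it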

Refactor your parse_url to return the emails found by parse_emails, keyed by the url they were found at (recursion makes more sense here):

class EmailCrawler:
    ...

    def parse_url(self, current_url: str) -> Dict[str, Set[str]]:
        # skip pages we have already crawled; without this guard the recursion
        # below never terminates on sites whose pages link back to each other
        if current_url in self.processed_urls:
            return {}
        self.processed_urls.add(current_url)

        response = requests.get(current_url, headers=self.headers)
        tree = html.fromstring(response.content)
        urls = tree.xpath('//a/@href')

        urls = [urljoin(self.website,url) for url in urls]
        urls = [url for url in urls if self.base_url == urlsplit(url).netloc]
        urls = list(set(urls))

        # keep only child urls that do not end with one of the garbage extensions
        children_urls = []
        for url in urls:
            if any(url.endswith(extension) or url.endswith(extension + '/')
                   for extension in self.garbage_extensions):
                continue
            children_urls.append(url)

        email_mapping = {}

        # Add to the result the child URLs and their emails
        for child_url in children_urls:
            email_mapping.update(self.parse_url(child_url))

        # Add to the result the parent URL and its emails
        email_mapping[current_url] = self.parse_emails(response.text)

        return email_mapping
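
For illustration, the mapping returned by parse_url pairs each crawled url with the set of addresses found on that page, so a hypothetical result for a small site could look like this (all urls and addresses are made up):

email_mapping = {
    'https://example.com/': {'info@example.com'},
    'https://example.com/contact': {'info@example.com', 'sales@example.com'},
}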

Back in your crawl method, write the results to a CSV file:

class EmailCrawler:
    ...

    def crawl(self):
        for url in self.unprocessed_urls:
            print("CRAWL : {}".format(url))

            email_mapping = self.parse_url(url)
            for found_url, emails in email_mapping.items():
                for email in emails:
                    ...
                    # Write to your CSV file the email and its url
                    # print(','.join((found_url, email)), file=your_csv_file)
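
Filling in that last comment, one way to write the file is with csv.writer from the standard library (csv is already imported at the top of your script). This is only a sketch: it reuses self.outputfile from your __init__, and the 'url'/'email' header names are just a suggestion:

class EmailCrawler:
    ...

    def crawl(self):
        with open(self.outputfile, 'w', newline='') as csv_file:
            writer = csv.writer(csv_file)
            # one header row: where the address was found, then the address itself
            writer.writerow(['url', 'email'])

            for url in self.unprocessed_urls:
                print("CRAWL : {}".format(url))

                email_mapping = self.parse_url(url)
                for found_url, emails in email_mapping.items():
                    for email in emails:
                        writer.writerow([found_url, email])
                        self.email_count += 1

Opening the file once around the whole loop produces a single CSV per crawl, and newline='' is what the csv module recommends when handing a file object to csv.writer.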
