How do I make my web crawler crawl URLs that contain Unicode characters?

Problem description

I have built a web crawler that crawls certain Bengali news portals and collects their links, so that the content can later be scraped to build a web corpus.

The code for my crawler is given below.

I recently asked a related question about web scraping: How do I crawl and scrape this particular website with Python and save the data in a text file?

Here it is (edited):


import requests
import urllib.parse
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama
from urllib.request import urlopen
from urllib.request import Request

# init the colorama module
colorama.init()
GREEN = colorama.Fore.GREEN
GRAY = colorama.Fore.LIGHTBLACK_EX
RESET = colorama.Fore.RESET
YELLOW = colorama.Fore.YELLOW

# initialize the list of links (unique links)
internal_urls = set() #Set of All internal links
external_urls = set() #Set of All external links
old_internals = set() #Keeps track of internal links before including another


def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)


"""
Returns all URLs that are found on `url` in which it belongs to the same website
"""
# all URLs of `url
def get_all_website_links(url):
    global old_internals
    try:
        urls = set()
        # domain name of the URL without the protocol
        domain_name = urlparse(url).netloc
        user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
        req = Request(url, headers={'User-Agent': user_agent})
        article = urlopen(req).read()
        soup = BeautifulSoup(article, "lxml")

        old_internals = internal_urls.copy() #Copies old set of internal links

        for a_tag in soup.findAll("a"): #Links under  <a> tag
            href = a_tag.attrs.get("href")
            if href == "" or href is None:
                # href empty tag
                continue
            # join the URL if it's relative (not absolute link)
            href = urljoin(url, href)

            parsed_href = urlparse(href)
            # remove URL GET parameters, URL fragments, etc.
            href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
            if not is_valid(href):
                # not a valid URL
                continue
            if href in internal_urls:
                # already in the set
                continue
            if domain_name not in href:
                # external link
                if href not in external_urls:
                    print(f"{GRAY}[!] External link: {href}{RESET} \n")
                    external_urls.add(href)
                continue
            print(f"{GREEN}[*] Internal link: {href}{RESET} \n")
            urls.add(href)
            internal_urls.add(href)

        #I could definitely have done this as a function
        #instead of writing the whole code again, but well...
        #(I will change it)
        for link_tag in soup.findAll("link"): #Links under <link> tag
            href = link_tag.attrs.get("href")
            if href == "" or href is None:
                # href empty tag
                continue
            # join the URL if it's relative (not absolute link)
            href = urljoin(url, href)
            parsed_href = urlparse(href)
            # remove URL GET parameters, URL fragments, etc.
            href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
            if not is_valid(href):
                # not a valid URL
                continue
            if href in internal_urls:
                # already in the set
                continue
            if domain_name not in href:
                # external link
                if href not in external_urls:
                    print(f"{GRAY}[!] External link: {href}{RESET} \n")
                    external_urls.add(href)
                continue
            print(f"{GREEN}[*] Internal link: {href}{RESET} \n")
            urls.add(href)            
            internal_urls.add(href)
        return urls
    except Exception as e: 
        #If the link to be added were problematic, just return the list of
        #old internal links. The function was returning an error and stopped 
        #crawling because of certain internal links midway when max count was
        #large, so...
        print("\n")
        print(e)
        print("\nNone returned\n")
        #print(internal_urls, "\n\n")
        return old_internals

# number of URLs visited so far will be stored here
total_urls_visited = 0

def crawl(url, max_urls=30):
    """
    Crawls a web page and extracts all links.
    You'll find all links in `external_urls` and `internal_urls` global set variables.
    params:
        max_urls (int): number of max URLs to crawl, default is 30.
    """
    global total_urls_visited
    total_urls_visited += 1
    #print(url)
    print(f"{YELLOW}[*] Crawling: {url}{RESET} \n")
    links = get_all_website_links(url)
    loop=links.copy() #Since returning old internal links may change loop size
    for link in loop:
        if total_urls_visited > max_urls:
            break
        crawl(link, max_urls)

def extract_name(link_url): #Names the output file for a crawled site
    link_name = link_url[link_url.index(":")+3:] #skips the "https://" part
    link_name = link_name.replace('/', '_')
    link_name = link_name.replace('.', '_')
    link_name = link_name.replace(' ', '_')
    link_name = link_name.replace('-', '_')
    return link_name + ".txt"

def fileWrite(fname, lst):
    a_file = open(fname, "wb")
    for element in lst:
      l = len(element)
      if l == 0:
        continue
      a_file.write(element.encode() + "\n".encode())
    a_file.close()

#Runtime
if __name__ == "__main__":
    max_urls = 50  #maximum number of URLs to crawl (50 in the run shown below)
    #Arbitrary list of links of Bengali sites
    web_links=["https://www.anandabazar.com/",
               "https://www.prothomalo.com/",
               "https://www.littlemag.org/2019/05/blog-post_60.html"]
    
    #Index of weblink in list
    index=1

    crawl(web_links[index], max_urls)
    fname=extract_name(web_links[index])
    fileWrite(fname, internal_urls)

    print("[+] Total Internal links:", len(internal_urls))
    print("[+] Total External links:", len(external_urls))
    print("[+] Total URLs:", len(external_urls) + len(internal_urls))
    print("[+] Total crawled URLs:", max_urls)

(Edit) Stack Overflow wouldn't let me paste it here again; something about spam filtering.

My code fetches and crawls links just fine as long as a URL contains only ASCII characters, but if the URL contains any Bengali characters it fails to request it. With web_links[index=1], i.e. https://www.prothomalo.com/, my code gives the following output...

[*] Crawling: https://www.prothomalo.com/sports/football/ছবিতে-ছবিতে-রোনালদোর-রেকর্ডের-গল্প 

'ASCII codec can't encode characters in position 21-25: ordinal not in range(128)

None returned

(the same URL and error are printed twice more)

[+] Total Internal links: 34
[+] Total External links: 11
[+] Total URLs: 45
[+] Total crawled URLs: 50


For the links that are not accepted, I keep getting this error...

'ASCII codec can't encode characters in position 16-21: ordinal not in range(128)
None returned

What changes should I make to my code to handle this error, so that URLs with Bengali characters are accepted and their links crawled?
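For reference, the failure can be reproduced on its own; this is a minimal sketch, separate from the crawler, using one of the Bengali URLs printed above:

from urllib.request import Request, urlopen

# one of the Bengali-character URLs printed by the crawler above
url = "https://www.prothomalo.com/sports/football/ছবিতে-ছবিতে-রোনালদোর-রেকর্ডের-গল্প"

try:
    urlopen(Request(url)).read()
except UnicodeEncodeError as e:
    # http.client builds the request line as ASCII, so the raw Bengali
    # path fails before anything is sent over the network
    print(e)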


Update:

OK. So I followed the example in @QHarr's answer and changed these lines of my code, which does make the Bengali URLs reachable.

        domain_name = urlparse(url).netloc
        user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
        req = Request(urllib.parse.quote(url, safe = ':/', encoding= 'utf-8'), headers={'User-Agent': user_agent})
        article = urlopen(req).read()
        soup = BeautifulSoup(article, "html.parser")

The rest of the code is the same... but this brings up another problem. The other two links still crawl fine, but with https://www.prothomalo.com/, no matter how large a max_urls value I pass in, the number of crawled sites stays the same, because many of the URLs return errors.

Most of the output looks like this:


[*] Crawling:
https://www.prothomalo.com/bangladesh/district/গুরুদাসপুর-হাসপাতালে-ভাঙচুর-মামলায়-গ্রেপ্তার-৪


[*] Crawling:
https://www.prothomalo.com/amp/story/feature/holiday/%E0%A6%AC%E0%A6%BF%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%89%E0%A6%9A%E0%A7%8D%E0%A6%9A%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE-%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%AB%E0%A6%BF%E0%A6%B0%E0%A7%87-%E0%A6%A6%E0%A7%81%E0%A6%97%E0%A7%8D%E0%A6%A7%E0%A6%96%E0%A6%BE%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A6%BF

HTTP Error 404: Not Found

None returned

(these same two URLs are crawled over and over, with the same 404 error, for the rest of the run)

[+] Total Internal links: 260
[+] Total External links: 24
[+] Total URLs: 284
[+] Total crawled URLs: 100

So I wanted to check some of the problematic links separately and print all the internal links they contain...

import requests
from bs4 import BeautifulSoup
import urllib
from urllib.parse import urlparse

user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
url = 'https://www.prothomalo.com/amp/story/feature/holiday/%E0%A6%AC%E0%A6%BF%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%89%E0%A6%9A%E0%A7%8D%E0%A6%9A%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE-%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%AB%E0%A6%BF%E0%A6%B0%E0%A7%87-%E0%A6%A6%E0%A7%81%E0%A6%97%E0%A7%8D%E0%A6%A7%E0%A6%96%E0%A6%BE%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A6%BF'

domain_name=urlparse(url).netloc
print("\n\n"+domain_name+"\n\n")

r = requests.get(url, headers={'User-Agent': user_agent})
soup = BeautifulSoup(r.content, 'html.parser')

links= set()
for link in soup.find_all(['a', 'link'], href=True):
    if(link['href'].find(domain_name) != -1):
        links.add(link['href'])
print(links)

They work fine:


www.prothomalo.com


{'https://www.prothomalo.com/business',
'https://www.prothomalo.com/feature/holiday/বিদেশে-উচ্চশিক্ষা-দেশে-ফিরে-দুগ্ধখামারি',
'https://www.prothomalo.com/world',
'https://www.prothomalo.com/world/india/ভারতে-এবার-গ্রিন-ফাঙ্গাসের-সংক্রমণ',
'https://www.prothomalo.com/sports',
'https://www.prothomalo.com/business/economics/সুদানের-৬৫-কোটি-টাকাঋণের-দায়-নিল-বাংলাদেশ',
'https://www.prothomalo.com/bangladesh/অল-কমিউনিটি-ক্লাবে-ভাঙচুরের-অভিযোগ-পরীমনির-বিরুদ্ধে',
'https://www.prothomalo.com/bangladesh',
'https://www.prothomalo.com/video',
'https://www.prothomalo.com/entertainment',
'https://www.prothomalo.com/', 'https://www.prothomalo.com/opinion',
'https://www.prothomalo.com/life',
'https://www.prothomalo.com/world/europe/কোভিড-১৯-জীবন-বাঁচানোর-আরও-এক-চিকিৎসা',
'https://www.prothomalo.com/bangladesh/district/আলীগের-দুই-নেতাকে-কারণ-দর্শাও-নোটিশ'}

What is wrong with my original crawling code?

And what is this HTTP Error 404: Not Found about?

Tags: python, web-scraping, encoding, utf-8, web-crawler

Solution


You can use requests rather than urllib when you go to make the request, and add the characters you want excluded from quoting to the safe parameter of parse.quote.

You could pull urllib.parse.quote(url, safe=':/', encoding='utf-8') out into its own wrapper function that you pass a URL, or a list of URLs, to (see the sketch after the example below).

import requests
from bs4 import BeautifulSoup
import urllib

url = 'https://www.prothomalo.com/sports/football/ছবিতে-ছবিতে-রোনালদোর-রেকর্ডের-গল্প'
r = requests.get(urllib.parse.quote(url, safe = ':/', encoding= 'utf-8'))
soup = BeautifulSoup(r.content, 'html.parser')
print(soup.select_one('title').text)
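A minimal sketch of such a wrapper (the name quote_url is mine, not from the answer itself): it percent-encodes a single URL, or each URL in a list, while leaving the ':' and '/' separators intact.

import urllib.parse

def quote_url(urls):
    """Percent-encode one URL or a list of URLs, keeping ':' and '/' unescaped."""
    if isinstance(urls, str):
        return urllib.parse.quote(urls, safe=':/', encoding='utf-8')
    return [urllib.parse.quote(u, safe=':/', encoding='utf-8') for u in urls]

# usage: quote first, then hand the result to requests or urlopen
# quoted = quote_url('https://www.prothomalo.com/sports/football/ছবিতে-ছবিতে-রোনালদোর-রেকর্ডের-গল্প')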
