python - 如何让我的网络爬虫抓取包含 Unicode 字符的网址?
问题描述
我制作了一个网络爬虫,它将爬取某些孟加拉语新闻门户并接收链接,然后可以抓取内容以制作网络语料库。
我的爬虫的代码在这里给出:
我最近问了一个关于网络抓取的问题: 如何使用 Python 爬取和抓取这个特定的网站并将数据保存在文本文件中?
这里:(已编辑)
import requests
import urllib.parse
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama
from urllib.request import urlopen
from urllib.request import Request
# init the colorama module (enables ANSI colour escape codes, incl. on Windows consoles)
colorama.init()
GREEN = colorama.Fore.GREEN  # colour used for internal-link messages
GRAY = colorama.Fore.LIGHTBLACK_EX  # colour used for external-link messages
RESET = colorama.Fore.RESET  # restores the terminal's default colour
YELLOW = colorama.Fore.YELLOW  # colour used for "Crawling:" status messages
# initialize the list of links (unique links)
internal_urls = set() #Set of All internal links
external_urls = set() #Set of All external links
old_internals = set() #Keeps track of internal links before including another
def is_valid(url):
    """Return True when `url` parses to both a scheme and a network location."""
    parts = urlparse(url)
    # A usable absolute URL needs a non-empty scheme AND netloc.
    return all((parts.netloc, parts.scheme))
"""
Returns all URLs that are found on `url` in which it belongs to the same website
"""
# all URLs of `url
def get_all_website_links(url):
global old_internals
try:
urls = set()
# domain name of the URL without the protocol
domain_name = urlparse(url).netloc
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
req = Request(url, headers={'User-Agent': user_agent})
article = urlopen(req).read()
soup = BeautifulSoup(article, "lxml")
old_internals = internal_urls.copy() #Copies old set of internal links
for a_tag in soup.findAll("a"): #Links under <a> tag
href = a_tag.attrs.get("href")
if href == "" or href is None:
# href empty tag
continue
# join the URL if it's relative (not absolute link)
href = urljoin(url, href)
parsed_href = urlparse(href)
# remove URL GET parameters, URL fragments, etc.
href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
if not is_valid(href):
# not a valid URL
continue
if href in internal_urls:
# already in the set
continue
if domain_name not in href:
# external link
if href not in external_urls:
print(f"{GRAY}[!] External link: {href}{RESET} \n")
external_urls.add(href)
continue
print(f"{GREEN}[*] Internal link: {href}{RESET} \n")
urls.add(href)
internal_urls.add(href)
#I could definitely have done this as a function
#instead of writing the whole code again, but well...
#(I will change it)
for link_tag in soup.findAll("link"): #Links under <link> tag
href = link_tag.attrs.get("href")
if href == "" or href is None:
# href empty tag
continue
# join the URL if it's relative (not absolute link)
href = urljoin(url, href)
parsed_href = urlparse(href)
# remove URL GET parameters, URL fragments, etc.
href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
if not is_valid(href):
# not a valid URL
continue
if href in internal_urls:
# already in the set
continue
if domain_name not in href:
# external link
if href not in external_urls:
print(f"{GRAY}[!] External link: {href}{RESET} \n")
external_urls.add(href)
continue
print(f"{GREEN}[*] Internal link: {href}{RESET} \n")
urls.add(href)
internal_urls.add(href)
return urls
except Exception as e:
#If the link to be added were problematic, just return the list of
#old internal links. The function was returning an error and stopped
#crawling because of certain internal links midway when max count was
#large, so...
print("\n")
print(e)
print("\nNone returned\n")
#print(internal_urls, "\n\n")
return old_internals
# number of URLs visited so far will be stored here
total_urls_visited = 0
def crawl(url, max_urls=30):
    """
    Crawl a web page recursively and extract all of its links.

    All discovered links end up in the `external_urls` and `internal_urls`
    global set variables.
    params:
        max_urls (int): number of max URLs to crawl, default is 30.
    """
    global total_urls_visited
    total_urls_visited += 1
    print(f"{YELLOW}[*] Crawling: {url}{RESET} \n")
    # Snapshot the returned set: get_all_website_links may hand back the
    # shared `old_internals` set, whose size can change during recursion.
    for child in set(get_all_website_links(url)):
        if total_urls_visited > max_urls:
            break
        crawl(child, max_urls)
def extract_name(link_url): #Program to name the file
    """Build a filesystem-friendly ``.txt`` filename from a URL.

    The scheme (``https://``, ``http://``, ...) is dropped, and characters
    that are awkward in filenames ('/', '.', ' ', '-') become underscores.

    Raises ValueError if `link_url` contains no ':' (i.e. no scheme).
    """
    # index(':') + 3 skips past "://" — assumes a scheme is present.
    bare = link_url[link_url.index(":") + 3:]
    # Single-pass translation replaces the four chained str.replace() calls;
    # the unused `name` variable from the original is dropped.
    return bare.translate(str.maketrans("/. -", "____")) + ".txt"
def fileWrite(fname, lst):
    """Write every non-empty element of `lst` to `fname`, one per line.

    The file is opened in binary mode and each element is UTF-8 encoded,
    so Bengali (or any Unicode) URLs are stored correctly.
    """
    # `with` guarantees the handle is closed even if encode() raises,
    # fixing the leak in the original open()/close() pair.
    with open(fname, "wb") as out:
        for element in lst:
            if not element:
                # skip empty strings
                continue
            out.write(element.encode() + b"\n")
#Runtime
if __name__ == "__main__":
    # Bug fix: the original line read `max_urls =` with no right-hand side
    # (the value was lost when the question was pasted), which is a
    # SyntaxError.  Restore a concrete crawl budget.
    max_urls = 50
    #Arbitrary list of links of Bengali sites
    web_links = ["https://www.anandabazar.com/",
                 "https://www.prothomalo.com/",
                 "https://www.littlemag.org/2019/05/blog-post_60.html"]
    #Index of weblink in list
    index = 1
    crawl(web_links[index], max_urls)
    # Persist all collected internal links under a name derived from the seed URL.
    fname = extract_name(web_links[index])
    fileWrite(fname, internal_urls)
    print("[+] Total Internal links:", len(internal_urls))
    print("[+] Total External links:", len(external_urls))
    print("[+] Total URLs:", len(external_urls) + len(internal_urls))
    print("[+] Total crawled URLs:", max_urls)
(编辑)堆栈溢出不允许我再次将其粘贴到此处。关于垃圾邮件的一些事情。
只要链接的 url 只包含 ASCII 字符,我的代码就可以很好地获取和抓取链接。但如果 url 有任何孟加拉语字符,它可能无法请求它。使用web_links[index=1]
,即https://www.prothomalo.com/,我的代码给出以下输出......
[*] Crawling: https://www.prothomalo.com/sports/football/ছবিতে-ছবিতে-রোনালদোর-রেকর্ডের-গল্প
'ASCII codec can't encode characters in position 21-25: ordinal not in range(128)
None returned
[*] Crawling: https://www.prothomalo.com/sports/football/ছবিতে-ছবিতে-রোনালদোর-রেকর্ডের-গল্প
'ASCII codec can't encode characters in position 21-25: ordinal not in range(128)
None returned
[*] Crawling: https://www.prothomalo.com/sports/football/ছবিতে-ছবিতে-রোনালদোর-রেকর্ডের-গল্প
'ASCII codec can't encode characters in position 21-25: ordinal not in range(128)
None returned
[+] Total Internal links: 34
[+] Total External links: 11
[+] Total URLs: 45
[+] Total crawled URLs: 50
对于那些无法成功请求的链接,我一直收到如下错误……
'ASCII codec can't encode characters in position 16-21: ordinal not in range(128)
None returned
我在我的代码中进行哪些更改以处理此错误并使带有孟加拉语字符的网址可接受并抓取链接?
更新:
好的。所以我按照@QHarr 的答案中的示例...并更改了我的代码行,这实际上使孟加拉语网址可以访问。
# Excerpt of the changed lines inside get_all_website_links():
domain_name = urlparse(url).netloc
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
# Percent-encode the URL (keeping ':' and '/') so non-ASCII characters survive urlopen.
req = Request(urllib.parse.quote(url, safe = ':/', encoding= 'utf-8'), headers={'User-Agent': user_agent})
article = urlopen(req).read()
soup = BeautifulSoup(article, "html.parser")
其余的代码是一样的......但这又带来了另一个问题......虽然其他两个链接在爬行时仍然可以正常工作https://www.prothomalo.com/
。无论最大值有多大,即max_urls
我输入,爬取的站点数量都保持不变,因为很多站点都返回错误。
大部分输出如下所示:
[*] Crawling: https://www.prothomalo.com/bangladesh/district/গুরুদাসপুর-হাসপাতালে-ভাঙচুর-মামলায়-গ্রেপ্তার-৪ [*] Crawling: https://www.prothomalo.com/amp/story/feature/holiday/%E0%A6%AC%E0%A6%BF%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%89%E0%A6%9A%E0%A7%8D%E0%A6%9A%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE-%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%AB%E0%A6%BF%E0%A6%B0%E0%A7%87-%E0%A6%A6%E0%A7%81%E0%A6%97%E0%A7%8D%E0%A6%A7%E0%A6%96%E0%A6%BE%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A6%BF HTTP Error 404: Not Found None returned [*] Crawling: https://www.prothomalo.com/bangladesh/district/গুরুদাসপুর-হাসপাতালে-ভাঙচুর-মামলায়-গ্রেপ্তার-৪ [*] Crawling: https://www.prothomalo.com/amp/story/feature/holiday/%E0%A6%AC%E0%A6%BF%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%89%E0%A6%9A%E0%A7%8D%E0%A6%9A%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE-%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%AB%E0%A6%BF%E0%A6%B0%E0%A7%87-%E0%A6%A6%E0%A7%81%E0%A6%97%E0%A7%8D%E0%A6%A7%E0%A6%96%E0%A6%BE%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A6%BF HTTP Error 404: Not Found None returned [*] Crawling: https://www.prothomalo.com/bangladesh/district/গুরুদাসপুর-হাসপাতালে-ভাঙচুর-মামলায়-গ্রেপ্তার-৪ [*] Crawling: https://www.prothomalo.com/amp/story/feature/holiday/%E0%A6%AC%E0%A6%BF%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%89%E0%A6%9A%E0%A7%8D%E0%A6%9A%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE-%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%AB%E0%A6%BF%E0%A6%B0%E0%A7%87-%E0%A6%A6%E0%A7%81%E0%A6%97%E0%A7%8D%E0%A6%A7%E0%A6%96%E0%A6%BE%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A6%BF HTTP Error 404: Not Found None returned [*] Crawling: https://www.prothomalo.com/bangladesh/district/গুরুদাসপুর-হাসপাতালে-ভাঙচুর-মামলায়-গ্রেপ্তার-৪ [*] Crawling: 
https://www.prothomalo.com/amp/story/feature/holiday/%E0%A6%AC%E0%A6%BF%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%89%E0%A6%9A%E0%A7%8D%E0%A6%9A%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE-%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%AB%E0%A6%BF%E0%A6%B0%E0%A7%87-%E0%A6%A6%E0%A7%81%E0%A6%97%E0%A7%8D%E0%A6%A7%E0%A6%96%E0%A6%BE%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A6%BF HTTP Error 404: Not Found None returned [*] Crawling: https://www.prothomalo.com/bangladesh/district/গুরুদাসপুর-হাসপাতালে-ভাঙচুর-মামলায়-গ্রেপ্তার-৪ [*] Crawling: https://www.prothomalo.com/amp/story/feature/holiday/%E0%A6%AC%E0%A6%BF%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%89%E0%A6%9A%E0%A7%8D%E0%A6%9A%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE-%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%AB%E0%A6%BF%E0%A6%B0%E0%A7%87-%E0%A6%A6%E0%A7%81%E0%A6%97%E0%A7%8D%E0%A6%A7%E0%A6%96%E0%A6%BE%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A6%BF HTTP Error 404: Not Found None returned [*] Crawling: https://www.prothomalo.com/bangladesh/district/গুরুদাসপুর-হাসপাতালে-ভাঙচুর-মামলায়-গ্রেপ্তার-৪ [*] Crawling: https://www.prothomalo.com/amp/story/feature/holiday/%E0%A6%AC%E0%A6%BF%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%89%E0%A6%9A%E0%A7%8D%E0%A6%9A%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE-%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%AB%E0%A6%BF%E0%A6%B0%E0%A7%87-%E0%A6%A6%E0%A7%81%E0%A6%97%E0%A7%8D%E0%A6%A7%E0%A6%96%E0%A6%BE%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A6%BF HTTP Error 404: Not Found None returned [*] Crawling: https://www.prothomalo.com/bangladesh/district/গুরুদাসপুর-হাসপাতালে-ভাঙচুর-মামলায়-গ্রেপ্তার-৪ [*] Crawling: https://www.prothomalo.com/amp/story/feature/holiday/%E0%A6%AC%E0%A6%BF%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%89%E0%A6%9A%E0%A7%8D%E0%A6%9A%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE-%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%AB%E0%A6%BF%E0%A6%B0%E0%A7%87-%E0%A6%A6%E0%A7%81%E0%A6%97%E0%A7%8D%E0%A6%A7%E0%A6%96%E0%A6%BE%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A6%BF HTTP Error 404: 
Not Found None returned [*] Crawling: https://www.prothomalo.com/bangladesh/district/গুরুদাসপুর-হাসপাতালে-ভাঙচুর-মামলায়-গ্রেপ্তার-৪ [*] Crawling: https://www.prothomalo.com/amp/story/feature/holiday/%E0%A6%AC%E0%A6%BF%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%89%E0%A6%9A%E0%A7%8D%E0%A6%9A%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE-%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%AB%E0%A6%BF%E0%A6%B0%E0%A7%87-%E0%A6%A6%E0%A7%81%E0%A6%97%E0%A7%8D%E0%A6%A7%E0%A6%96%E0%A6%BE%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A6%BF HTTP Error 404: Not Found None returned [*] Crawling: https://www.prothomalo.com/bangladesh/district/গুরুদাসপুর-হাসপাতালে-ভাঙচুর-মামলায়-গ্রেপ্তার-৪ [*] Crawling: https://www.prothomalo.com/amp/story/feature/holiday/%E0%A6%AC%E0%A6%BF%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%89%E0%A6%9A%E0%A7%8D%E0%A6%9A%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE-%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%AB%E0%A6%BF%E0%A6%B0%E0%A7%87-%E0%A6%A6%E0%A7%81%E0%A6%97%E0%A7%8D%E0%A6%A7%E0%A6%96%E0%A6%BE%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A6%BF HTTP Error 404: Not Found None returned [*] Crawling: https://www.prothomalo.com/bangladesh/district/গুরুদাসপুর-হাসপাতালে-ভাঙচুর-মামলায়-গ্রেপ্তার-৪ [*] Crawling: https://www.prothomalo.com/amp/story/feature/holiday/%E0%A6%AC%E0%A6%BF%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%89%E0%A6%9A%E0%A7%8D%E0%A6%9A%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE-%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%AB%E0%A6%BF%E0%A6%B0%E0%A7%87-%E0%A6%A6%E0%A7%81%E0%A6%97%E0%A7%8D%E0%A6%A7%E0%A6%96%E0%A6%BE%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A6%BF HTTP Error 404: Not Found None returned [*] Crawling: https://www.prothomalo.com/bangladesh/district/গুরুদাসপুর-হাসপাতালে-ভাঙচুর-মামলায়-গ্রেপ্তার-৪ [*] Crawling: 
https://www.prothomalo.com/amp/story/feature/holiday/%E0%A6%AC%E0%A6%BF%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%89%E0%A6%9A%E0%A7%8D%E0%A6%9A%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE-%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%AB%E0%A6%BF%E0%A6%B0%E0%A7%87-%E0%A6%A6%E0%A7%81%E0%A6%97%E0%A7%8D%E0%A6%A7%E0%A6%96%E0%A6%BE%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A6%BF HTTP Error 404: Not Found None returned [*] Crawling: https://www.prothomalo.com/bangladesh/district/গুরুদাসপুর-হাসপাতালে-ভাঙচুর-মামলায়-গ্রেপ্তার-৪ [*] Crawling: https://www.prothomalo.com/amp/story/feature/holiday/%E0%A6%AC%E0%A6%BF%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%89%E0%A6%9A%E0%A7%8D%E0%A6%9A%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE-%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%AB%E0%A6%BF%E0%A6%B0%E0%A7%87-%E0%A6%A6%E0%A7%81%E0%A6%97%E0%A7%8D%E0%A6%A7%E0%A6%96%E0%A6%BE%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A6%BF HTTP Error 404: Not Found None returned [*] Crawling: https://www.prothomalo.com/bangladesh/district/গুরুদাসপুর-হাসপাতালে-ভাঙচুর-মামলায়-গ্রেপ্তার-৪ [*] Crawling: https://www.prothomalo.com/amp/story/feature/holiday/%E0%A6%AC%E0%A6%BF%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%89%E0%A6%9A%E0%A7%8D%E0%A6%9A%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE-%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%AB%E0%A6%BF%E0%A6%B0%E0%A7%87-%E0%A6%A6%E0%A7%81%E0%A6%97%E0%A7%8D%E0%A6%A7%E0%A6%96%E0%A6%BE%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A6%BF HTTP Error 404: Not Found None returned [*] Crawling: https://www.prothomalo.com/bangladesh/district/গুরুদাসপুর-হাসপাতালে-ভাঙচুর-মামলায়-গ্রেপ্তার-৪ [*] Crawling: https://www.prothomalo.com/amp/story/feature/holiday/%E0%A6%AC%E0%A6%BF%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%89%E0%A6%9A%E0%A7%8D%E0%A6%9A%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE-%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%AB%E0%A6%BF%E0%A6%B0%E0%A7%87-%E0%A6%A6%E0%A7%81%E0%A6%97%E0%A7%8D%E0%A6%A7%E0%A6%96%E0%A6%BE%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A6%BF HTTP Error 404: 
Not Found None returned [*] Crawling: https://www.prothomalo.com/bangladesh/district/গুরুদাসপুর-হাসপাতালে-ভাঙচুর-মামলায়-গ্রেপ্তার-৪ [*] Crawling: https://www.prothomalo.com/amp/story/feature/holiday/%E0%A6%AC%E0%A6%BF%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%89%E0%A6%9A%E0%A7%8D%E0%A6%9A%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE-%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%AB%E0%A6%BF%E0%A6%B0%E0%A7%87-%E0%A6%A6%E0%A7%81%E0%A6%97%E0%A7%8D%E0%A6%A7%E0%A6%96%E0%A6%BE%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A6%BF HTTP Error 404: Not Found None returned [*] Crawling: https://www.prothomalo.com/bangladesh/district/গুরুদাসপুর-হাসপাতালে-ভাঙচুর-মামলায়-গ্রেপ্তার-৪ [+] Total Internal links: 260 [+] Total External links: 24 [+] Total URLs: 284 [+] Total crawled URLs: 100
所以我想单独检查一些有问题的链接,并打印它们包含的所有内部链接......
import requests
from bs4 import BeautifulSoup
import urllib
# Bug fix: `urlparse` is used below but was never imported in this snippet.
from urllib.parse import urlparse

user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
url = 'https://www.prothomalo.com/amp/story/feature/holiday/%E0%A6%AC%E0%A6%BF%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%89%E0%A6%9A%E0%A7%8D%E0%A6%9A%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A7%8D%E0%A6%B7%E0%A6%BE-%E0%A6%A6%E0%A7%87%E0%A6%B6%E0%A7%87-%E0%A6%AB%E0%A6%BF%E0%A6%B0%E0%A7%87-%E0%A6%A6%E0%A7%81%E0%A6%97%E0%A7%8D%E0%A6%A7%E0%A6%96%E0%A6%BE%E0%A6%AE%E0%A6%BE%E0%A6%B0%E0%A6%BF'
domain_name = urlparse(url).netloc
print("\n\n"+domain_name+"\n\n")
r = requests.get(url, headers={'User-Agent': user_agent})
soup = BeautifulSoup(r.content, 'html.parser')
links = set()
# Collect every <a>/<link> href that mentions the site's own domain.
for link in soup.find_all(['a', 'link'], href=True):
    if(link['href'].find(domain_name) != -1):
        links.add(link['href'])
print(links)
他们工作正常:
www.prothomalo.com {'https://www.prothomalo.com/business', 'https://www.prothomalo.com/feature/holiday/বিদেশে-উচ্চশিক্ষা-দেশে-ফিরে-দুগ্ধখামারি', 'https://www.prothomalo.com/world', 'https://www.prothomalo.com/world/india/ভারতে-এবার-গ্রিন-ফাঙ্গাসের-সংক্রমণ', 'https://www.prothomalo.com/sports', 'https://www.prothomalo.com/business/economics/সুদানের-৬৫-কোটি-টাকাঋণের-দায়-নিল-বাংলাদেশ', 'https://www.prothomalo.com/bangladesh/অল-কমিউনিটি-ক্লাবে-ভাঙচুরের-অভিযোগ-পরীমনির-বিরুদ্ধে', 'https://www.prothomalo.com/bangladesh', 'https://www.prothomalo.com/video', 'https://www.prothomalo.com/entertainment', 'https://www.prothomalo.com/', 'https://www.prothomalo.com/opinion', 'https://www.prothomalo.com/life', 'https://www.prothomalo.com/world/europe/কোভিড-১৯-জীবন-বাঁচানোর-আরও-এক-চিকিৎসা', 'https://www.prothomalo.com/bangladesh/district/আলীগের-দুই-নেতাকে-কারণ-দর্শাও-নোটিশ'}
我原来的爬取代码有什么问题?
这个HTTP Error 404: Not Found
错误是怎么回事?
解决方案
在发出请求时,你可以尝试使用 requests 而不是 urllib。你可以把不希望被编码(需要排除)的字符添加到 parse.quote 的 safe 参数中。
你可以把 urllib.parse.quote(url, safe = ':/', encoding= 'utf-8') 封装成一个独立的包装函数,向它传入一个 URL 或一个 URL 列表。
import requests
from bs4 import BeautifulSoup
import urllib
url = 'https://www.prothomalo.com/sports/football/ছবিতে-ছবিতে-রোনালদোর-রেকর্ডের-গল্প'
# Percent-encode everything except ':' and '/' so the Bengali path segment
# becomes valid ASCII for the HTTP request.
r = requests.get(urllib.parse.quote(url, safe = ':/', encoding= 'utf-8'))
soup = BeautifulSoup(r.content, 'html.parser')
print(soup.select_one('title').text)
推荐阅读
- android - Firebase 版本与支持库冲突
- android - ValueError:找到了无效的张量“输入”
- c# - 在 Razor 视图中访问 SQL DB -> 无法将字符串转换为 Generic.List
- apache-kafka - 如何在 kafka 中处理 .png 或 .jpg 文件?
- python - 如何使用 quantization-aware-training 完成神经网络的 4-bit 量化
- xml - 如何根据元素的属性值来限制元素的值
- java - Hazelcast中单线程分区的工作,用于同一键上的并发更新请求
- google-cloud-pubsub - 如何使用 MQTT 和 C 代码订阅遥测事件?
- ios - 自 Lottie 3.0 更新以来 setAnimation 发生了变化?
- php - yii 命令在 yii2 应用程序中不起作用