python - 需要帮助在 Python 脚本中实现 If/Else 或 Case 语句
问题描述
该代码爬取网站,然后打印出内部和外部链接,并将其存储到 txt、json、xml 和 csv 文件中。在实现 if/else 或 case 语句时需要帮助,希望可以借此选择获得结果(内部和外部链接)的输出文件格式。此外,如果可能的话,需要帮助使其比当前更优化或更好。如何执行程序:python 文件名 url 我的输出:内部链接总数:...。外部链接总数:...。链接总数:...然后将这些链接导出到 txt、json、csv 和 XML 文件。
import requests
import argparse
import time
import json
import random
import pandas as pd
import os
import xml.etree.ElementTree as xml
from urllib.request import urlparse, urljoin
from bs4 import BeautifulSoup
# Crawl state shared across functions (module-level globals).
internal_links = set()  # unique links on the same domain as the start URL
external_links = set()  # unique links pointing at other domains
urls = []  # ordered queue of internal links still to be crawled
total_links_visited = 0  # counter used to enforce the max-URL limit
#check if url is valid
# A URL is usable for crawling only if it parses into a scheme and a host.
def is_valid(url):
    """Return True when *url* has both a scheme and a network location."""
    parts = urlparse(url)
    return all((parts.scheme, parts.netloc))
#this function finds and prints out the internal and external links
def get_all_website_links(url):
global urls
domain_name = urlparse(url).netloc
res1 = requests.get(url)
soup = BeautifulSoup(res1.content, "html.parser")
for a_tag in soup.findAll("a"):
href_tag = a_tag.attrs.get("href")
if href_tag:
href_tag = urljoin(url, href_tag)
parsed_href = urlparse(href_tag)
href_tag = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
if is_valid(href_tag):
if domain_name not in urlparse(href_tag).netloc and href_tag not in external_links:
print(f"External link: {href_tag}")
external_links.add(href_tag)
continue
elif href_tag not in urls:
print(f"Internal link: {href_tag}")
urls.append(href_tag)
internal_links.add(href_tag)
#this function crawls a web page and extracts all links
# This function crawls a web page and, recursively, the internal links queued so far.
def crawl(url, max_urls=50):
    """Crawl *url* and recursively crawl each internal link in ``urls``.

    Recursion stops once ``total_links_visited`` exceeds *max_urls*.
    NOTE(review): ``urls`` keeps growing while this loop iterates it, and the
    same link may be re-crawled by nested calls — only the max-URL counter
    bounds the total amount of work. Verify this is the intended strategy.
    """
    global total_links_visited, urls
    total_links_visited += 1
    get_all_website_links(url)
    for link in urls:
        if total_links_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)
#main function
def main():
parser = argparse.ArgumentParser(description="Link Extractor Tool with Python")
parser.add_argument("url", help="The URL to extract links from.")
parser.add_argument("-m", "--max-urls", help="Number of max URLs to crawl, default is 30.", default=30, type=int)
args = parser.parse_args()
url = args.url
max_urls = args.max_urls
domain_name = urlparse(url).netloc
res = requests.get(url)
statuscode = res.status_code
print("Status Code:", statuscode)
if statuscode == 200:
crawl(url, max_urls=max_urls)
else:
print("Failed to get a request response back.")
print("Total Internal Links:", len(internal_links))
print("Total External Links:", len(external_links))
print("Total Links:", len(external_links) + len(internal_links))
with open(f"{domain_name}_internal_links.txt", "w") as f:
for internal_link in internal_links:
print(internal_link.strip(), file=f)
with open(f"{domain_name}_external_links.txt", "w") as f:
for external_link in external_links:
print(external_link.strip(), file=f)
#writing to json files
f = open(f"{domain_name}_internal_links.json","w")
json.dump({'internal_links':list(internal_links)}, f, indent=6)
f.close()
f = open(f"{domain_name}_external_links.json","w")
json.dump({'external_links':list(external_links)}, f, indent=6)
f.close()
#writing to csv
df = pd.DataFrame(list(internal_links))
df.to_csv(f"{domain_name}_internal_links.csv", index=False, header=False)
df = pd.DataFrame(list(external_links))
df.to_csv(f"{domain_name}_external_links.csv", index=False, header=False)
#writing to xml
xmlformat = xml.Element("internal_links")
xmlformat_1 = xml.SubElement(xmlformat, "link")
for l in list(internal_links):
xmlformat_1.text = str(l)
xmlformat.append(xmlformat_1)
tree = xml.ElementTree(xmlformat)
tree.write(f"{domain_name}_internal_links.xml")
xmlformat = xml.Element("external_links")
xmlformat_1 = xml.SubElement(xmlformat, "link")
for l in list(external_links):
xmlformat_1.text = str(l)
xmlformat.append(xmlformat_1)
tree = xml.ElementTree(xmlformat)
tree.write(f"{domain_name}_external_links.xml")
# Execute the script only when run directly (not when imported as a module).
if __name__ == "__main__":
    main()
解决方案
您可以再添加一个命令行参数 output-file-format,用于指定输出文件的格式:
import requests
import argparse
import time
import json
import random
import pandas as pd
import os
import xml.etree.ElementTree as xml
from urllib.request import urlparse, urljoin
from bs4 import BeautifulSoup
# Module-level crawl state shared by the functions below.
internal_links = set()  # unique same-domain links found so far
external_links = set()  # unique links pointing at other domains
urls = []  # ordered queue of internal links still to be crawled
total_links_visited = 0  # counter used to enforce the max-URL limit
#check if url is valid
# Check whether a URL is well-formed enough to crawl.
def is_valid(url):
    """A URL counts as valid when it parses into a non-empty scheme and netloc."""
    pieces = urlparse(url)
    if not pieces.scheme:
        return False
    return bool(pieces.netloc)
#this function finds and prints out the internal and external links
# This function finds, prints, and records the internal and external links on one page.
def get_all_website_links(url):
    """Fetch *url*, extract every ``<a href>``, and sort each link into the
    module-level ``internal_links`` / ``external_links`` sets.

    Internal links (same domain as *url*) are also appended to the global
    ``urls`` crawl queue. Query strings and fragments are stripped first.
    """
    global urls
    domain_name = urlparse(url).netloc
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    for a_tag in soup.findAll("a"):
        href_tag = a_tag.attrs.get("href")
        if not href_tag:
            continue
        # Resolve relative links, then drop query string and fragment.
        href_tag = urljoin(url, href_tag)
        parsed_href = urlparse(href_tag)
        href_tag = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if not is_valid(href_tag):
            continue
        if domain_name not in urlparse(href_tag).netloc:
            # External domain. BUG FIX: the original combined the domain test
            # and the dedup test in one condition, so an external link that was
            # already known fell through to the elif and was misclassified as
            # internal (see the duplicated hosts in the sample output).
            if href_tag not in external_links:
                print(f"External link: {href_tag}")
                external_links.add(href_tag)
            continue
        if href_tag not in urls:
            print(f"Internal link: {href_tag}")
            urls.append(href_tag)
            internal_links.add(href_tag)
#this function crawls a web page and extracts all links
def crawl(url, max_urls=50):
global total_links_visited, urls
total_links_visited += 1
get_all_website_links(url)
for link in urls:
if total_links_visited > max_urls:
break
crawl(link, max_urls=max_urls)
def _write_links_xml(root_tag, links, path):
    """Serialize *links* as ``<root_tag>`` containing one ``<link>`` per URL."""
    root = xml.Element(root_tag)
    for link in links:
        # BUG FIX: the original created a single SubElement before the loop and
        # re-appended the same object while overwriting its text, so the file
        # ended up holding duplicates of only the last URL.
        xml.SubElement(root, "link").text = str(link)
    xml.ElementTree(root).write(path)

def save(output_file_format, domain_name, internal_links, external_links):
    """Export the collected links in the requested file format.

    Parameters
    ----------
    output_file_format : str
        One of "json", "csv", "xml"; any other value falls back to plain text.
    domain_name : str
        Prefix for the generated file names.
    internal_links, external_links : iterable of str
        The link collections to export.
    """
    if output_file_format == "json":
        # JSON export. BUG FIX: context managers guarantee the files are
        # closed even if json.dump raises (original used bare open()/close()).
        with open(f"{domain_name}_internal_links.json", "w") as f:
            json.dump({'internal_links': list(internal_links)}, f, indent=6)
        with open(f"{domain_name}_external_links.json", "w") as f:
            json.dump({'external_links': list(external_links)}, f, indent=6)
    elif output_file_format == "csv":
        # CSV export via pandas: one link per row, no header.
        pd.DataFrame(list(internal_links)).to_csv(f"{domain_name}_internal_links.csv", index=False, header=False)
        pd.DataFrame(list(external_links)).to_csv(f"{domain_name}_external_links.csv", index=False, header=False)
    elif output_file_format == "xml":
        _write_links_xml("internal_links", internal_links, f"{domain_name}_internal_links.xml")
        _write_links_xml("external_links", external_links, f"{domain_name}_external_links.xml")
    else:
        # Default: plain text, one link per line.
        with open(f"{domain_name}_internal_links.txt", "w") as f:
            for internal_link in internal_links:
                print(internal_link.strip(), file=f)
        with open(f"{domain_name}_external_links.txt", "w") as f:
            for external_link in external_links:
                print(external_link.strip(), file=f)
#main function
# Main function: parse the CLI, crawl the site, report totals, export the links.
def main():
    """Entry point: crawl the given URL and save the links in the chosen format."""
    arg_parser = argparse.ArgumentParser(description="Link Extractor Tool with Python")
    arg_parser.add_argument("url", help="The URL to extract links from.")
    arg_parser.add_argument("-m", "--max-urls", help="Number of max URLs to crawl, default is 30.", default=30, type=int)
    arg_parser.add_argument("-t", "--output-file-format", help="Output file format to store the data. Default text", default="txt")
    options = arg_parser.parse_args()
    target_url = options.url
    site_domain = urlparse(target_url).netloc
    response = requests.get(target_url)
    status = response.status_code
    print("Status Code:", status)
    if status == 200:
        crawl(target_url, max_urls=options.max_urls)
    else:
        print("Failed to get a request response back.")
    internal_count = len(internal_links)
    external_count = len(external_links)
    print("Total Internal Links:", internal_count)
    print("Total External Links:", external_count)
    print("Total Links:", external_count + internal_count)
    save(options.output_file_format, site_domain, internal_links, external_links)
# Execute the script only when run directly (not when imported as a module).
if __name__ == "__main__":
    main()
用法:
usage: a.py [-h] [-m MAX_URLS] [-t OUTPUT_FILE_FORMAT] url
a.py: error: the following arguments are required: url
运行文件:
python pyfile3.py -m 1 -t csv https://www.youtube.com
输出:
Status Code: 200
Internal link: https://www.youtube.com/about/
Internal link: https://www.youtube.com/about/press/
Internal link: https://www.youtube.com/about/copyright/
Internal link: https://www.youtube.com/t/contact_us
Internal link: https://www.youtube.com/creators/
Internal link: https://www.youtube.com/ads/
External link: https://developers.google.com/youtube
Internal link: https://www.youtube.com/t/terms
External link: https://www.google.co.jp/intl/ja/policies/privacy/
Internal link: https://www.youtube.com/about/policies/
Internal link: https://www.youtube.com/howyoutubeworks
Internal link: https://www.youtube.com/new
Internal link: https://www.youtube.com/about/experiences/
Internal link: https://www.youtube.com/about/brand-resources/
External link: https://youtube.googleblog.com/
Internal link: https://www.youtube.com/trends/
External link: https://twitter.com/YouTube
External link: https://www.instagram.com/youtube/
External link: https://www.facebook.com/youtube/
Internal link: https://youtube.googleblog.com/
Internal link: https://www.youtube.com/jobs/
Internal link: https://www.youtube.com/howyoutubeworks/
External link: https://www.youtubego.com/
Internal link: https://www.youtube.com/kids/
Internal link: https://www.youtube.com/musicpremium
Internal link: https://www.youtube.com/channel/UCqVDpXKLmKeBU_yyt_QkItQ
Internal link: https://www.youtube.com/premium/
External link: https://studio.youtube.com/
External link: https://tv.youtube.com/
Internal link: https://www.youtube.com/yt/dev/
External link: https://artists.youtube.com/
External link: https://creatoracademy.youtube.com/page/education
Internal link: https://www.youtube.com/yt/family/
External link: https://youtube.com/creatorresearch/
External link: https://servicesdirectory.withyoutube.com/
Internal link: https://www.youtube.com/nextup
Internal link: https://www.youtube.com/space/
External link: https://vr.youtube.com/
Internal link: https://www.youtube.com/creators-for-change/
External link: https://youtube.com/csai-match/
External link: https://socialimpact.youtube.com/
Internal link: https://www.youtubego.com/
Internal link: https://studio.youtube.com/
Internal link: https://tv.youtube.com/
Internal link: https://artists.youtube.com/
Internal link: https://creatoracademy.youtube.com/page/education
Internal link: https://youtube.com/creatorresearch/
Internal link: https://servicesdirectory.withyoutube.com/
Internal link: https://vr.youtube.com/
Internal link: https://youtube.com/csai-match/
Internal link: https://socialimpact.youtube.com/
Internal link: https://www.youtube.com
External link: https://www.google.com/policies/privacy/
External link: https://support.google.com/youtube/
Total Internal Links: 36
Total External Links: 18
Total Links: 54
推荐阅读
- c++ - Qt,将数据传递给另一个类
- c - 使用指针对数组进行排序
- java - 定义具有多列的 tableview 数据的更简单方法?
- iis - 如何为不指向 IIS web.config 中任何文件的任何 Web 请求返回特定页面?
- postgresql - 双精度到 varchar postgresql
- python - 如何修复 Pygame 中不可靠的命中框
- react-native - 来自调试版本的红屏错误出现在发布版本中
- c# - 正则表达式词性能:\w vs [a-zA-Z0-9_]
- vb.net - 在旧的 .Net Web 表单 (aspx) 站点中使用 ASP.Net Core 2.2 Auth Cookie 进行 SSO
- mysql - 使用 auto_increment 的正确语法