python - 关于如何使用python进行多处理以爬取网站页面的问题
问题描述
我有一个关于如何用 Python 的 multiprocessing 编写多进程爬虫的问题。下图是我设想的工作流程。问题在于:传给子进程执行的函数无法正确接收 URL 列表。请告诉我您认为最好的解决方案。
*(原文此处有一张流程示意图,图片链接在转载时已丢失。)*
import csv
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from multiprocessing import Pool
# --- Selenium session setup -------------------------------------------------
start_time = time.time()

driver = webdriver.Chrome(executable_path='chromedriver')

# Log in first: the board pages require an authenticated session.
# NOTE(review): this runs at module level, so under multiprocessing every
# spawned worker re-executes it and opens/logs-in its own browser — confirm
# the site tolerates parallel logins.
driver.get('https://quasarzone.com/login?nextUrl=https://quasarzone.com/')
driver.find_element_by_name("login_id").send_keys("ID")
driver.find_element_by_name("password").send_keys("PW")
driver.find_element_by_xpath('//*[@id="frm"]/div/div[1]/p/a').click()
time.sleep(0.1)

# Board list pages 1..200 to crawl.
all_urls = ['https://quasarzone.com/bbs/qf_cmr?page={}'.format(page_no)
            for page_no in range(1, 201)]

result = []
def next_page(urls):
    """Fetch one board list page with the shared Selenium driver and parse it.

    Fix for the asker's problem: appending to the module-level ``result``
    list from inside a ``pool.map`` worker does NOT propagate back to the
    parent — each process mutates its own copy. The rows are therefore
    *returned*, so the parent can collect them from ``pool.map``'s return
    value. The old side effect on ``result`` is kept for single-process use.

    Parameters:
        urls: absolute URL of one board list page (a single string,
              despite the plural name kept for backward compatibility).

    Returns:
        list of ``[name, date, title, view]`` string rows for that page.
    """
    driver.get(urls)
    soup = BeautifulSoup(driver.page_source, "html.parser", from_encoding='utf-8')

    data_name = soup.select('td:nth-child(4) > div > div')
    data_date = soup.select('td:nth-child(6) > span')
    data_title = soup.select('td:nth-child(3) > p > a')
    data_view = soup.select('td:nth-child(5) > span')

    rows = [[name.get_text(), date.get_text(), title.get_text(), view.get_text()]
            for name, date, title, view in zip(data_name, data_date, data_title, data_view)]

    # Preserved side effect: only meaningful when called in-process.
    result.extend(rows)
    return rows
# Problem point!!  Fixed: collect rows from pool.map's return value — the
# module-level `result` list is NOT shared between processes, so the parent's
# copy stays empty no matter what the workers append.
if __name__ == '__main__':
    with Pool(processes=4) as pool:
        # map() blocks until every URL is processed; no explicit join needed.
        # (The original `pool.join()` without a prior `close()` raises
        # ValueError inside a `with Pool(...)` block.)
        page_batches = pool.map(next_page, all_urls)

    # Flatten per-page batches; `if batch` skips workers that returned nothing.
    rows = [row for batch in page_batches if batch for row in batch]

    with open('crawling_review_quasarzone.csv', 'w', newline='', encoding='utf-8') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(['name', 'date', 'title', 'view'])
        csv_writer.writerows(rows)

    # Record total wall-clock time of the crawl.
    spend_time = time.time() - start_time
    with open('spending_time.txt', 'w') as t:
        t.write('total spending time: {} sec'.format(spend_time))

    driver.quit()
解决方案
我自己解决了这个问题,但我认为这还不是最优方案——也许可以把多线程和多进程结合使用。无论如何,下面是我最终写出的代码。
import csv
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from concurrent.futures import ProcessPoolExecutor
# Board identifier, used both in the crawl URL and in output file names.
board_name = 'cmr'

start_time = time.time()

# Headless Chrome so workers can run without a visible browser window.
options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_argument("disable-gpu")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")

# NOTE(review): created at module level, so every spawned worker process
# re-runs this section and gets its own driver + login — confirm the site
# tolerates several parallel sessions.
driver = webdriver.Chrome(executable_path='chromedriver', options=options)

# Log in before crawling; the short sleep lets the login redirect settle.
driver.get('https://quasarzone.com/login?nextUrl=https://quasarzone.com/')
driver.find_element_by_name("login_id").send_keys("id")
driver.find_element_by_name("password").send_keys("pw")
driver.find_element_by_xpath('//*[@id="frm"]/div/div[1]/p/a').click()
time.sleep(0.1)
def next_page(pages):
    """Crawl one chunk of board pages and write the rows to its own CSV file.

    Runs inside a worker process and uses that process's module-level
    ``driver``. Bug fixed: the original had every worker open the SAME
    ``quasarzone_cmr.csv`` in ``'w'`` mode, so parallel workers clobbered
    each other and only the last writer's data survived. Each chunk now
    writes to a file tagged with its first page number.

    Parameters:
        pages: iterable of page numbers to fetch.

    Returns:
        list of ``[name, date, title, view]`` rows (also written to disk).
    """
    pages = list(pages)
    result = []
    for page_no in pages:
        driver.get('https://quasarzone.com/bbs/qf_{}?page={}'.format(board_name, page_no))
        time.sleep(5)  # crude rate limit / wait for the page to render
        soup = BeautifulSoup(driver.page_source, "html.parser")
        data_name = soup.select('td:nth-child(4) > div > div')
        data_date = soup.select('td:nth-child(6) > span')
        data_title = soup.select('td:nth-child(3) > p > a')
        data_view = soup.select('td:nth-child(5) > span')
        for name, date, title, view in zip(data_name, data_date, data_title, data_view):
            result.append([name.get_text(), date.get_text(),
                           title.get_text(), view.get_text()])

    # One output file per chunk so concurrent workers never share a path.
    first_page = pages[0] if pages else 0
    out_path = 'quasarzone_{}_p{}.csv'.format(board_name, first_page)
    with open(out_path, 'w', newline='', encoding='utf-8') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(['name', 'date', 'title', 'view'])
        csv_writer.writerows(result)
    return result
def multiProcessing():
    """Split the page range into one chunk per worker and crawl in parallel.

    Bugs fixed relative to the original:
      * ``range(number_process + 1)`` produced 5 chunks for 4 workers, and
        the first chunk started at page 0 (board pages start at 1). Chunks
        now cover pages 1..page_threshold*number_process exactly once.
      * ``Executor.map`` returns a lazy iterator and the executor was never
        shut down, so this function returned while workers were still
        running. The ``with`` block plus ``list(...)`` now waits for every
        chunk to finish (and surfaces worker exceptions).
    """
    page_threshold = 100   # pages per worker
    number_process = 4

    chunks = [range(page_threshold * i + 1, page_threshold * (i + 1) + 1)
              for i in range(number_process)]

    with ProcessPoolExecutor(max_workers=number_process) as pool:
        # Drain the iterator so all results (and exceptions) are realized
        # before the executor shuts down.
        list(pool.map(next_page, chunks))
if __name__ == '__main__':
    multiProcessing()

    # Record total wall-clock time for this board's crawl; `with` guarantees
    # the file is closed even if the write fails (original used bare open/close).
    spend_time = time.time() - start_time
    with open('spending_time_{}.txt'.format(board_name), 'w') as t:
        t.write('total spending time of {}: {:.2f} sec'.format(board_name, spend_time))
推荐阅读
- spring-boot - 无法加载配置类:org.springframework.cloud.netflix.eureka.server.EurekaServerAutoConfiguration
- c# - 从 2D 分形噪声生成点
- linux - 打印有关文本文件的统计信息的 Bash 脚本
- ios - 如何在 videoCompositionWithAsset:applyingCIFiltersWithHandler 方法中使用层指令
- javascript - jQuery Animate 增加数字效果减慢接近尾声
- javascript - 如何使用 javascript 从 gcloud 服务器 HTTP GET 文本文件?
- swift - 纹理(AsyncDisplayKit)
- python - Selenium DevTools Capture 节点截图
- amazon-web-services - 将 AWS Dynamodb 查询结果作为内部服务器错误代码:502
- clang - 如何使用 clang 工具仅解析用户定义的源文件