python - 从网站获取数据并生成摘要
问题描述
嗨,我正在编写两段脚本:一段通过 Selenium 从网站抓取数据,另一段对抓取到的数据生成摘要。抓取数据的部分工作正常,但是当我把这些数据交给摘要部分处理时,数据并没有被传入我的摘要。请告诉我哪里出错了,以及如何解决这个问题。我是 Python 和 Selenium 的新手。
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
"""
Taking input from user
"""
# Read the search keyword and make it usable as a URL query value by
# replacing spaces with '+'.
search_input = input("Input the keyword you want to search for:")
search_input = search_input.replace(' ', '+')

# Raw string: the Windows path contains backslashes that must not be read
# as escape sequences.
driver = webdriver.Chrome(executable_path=r"E:\chromedriver\chromedriver.exe")

for i in range(1):
    # Open the Google results page for the keyword; `start` is the result
    # offset taken from the loop index.
    matched_elements = driver.get("https://www.google.com/search?q=" +
                                  search_input + "&start=" + str(i))
    print(driver.title)
    driver.maximize_window()
    time.sleep(5)  # fixed pause before querying the page for result links

    # Collect the href of every result anchor before navigating away from
    # the results page.
    links_url = driver.find_elements(By.XPATH, "//div[@class='yuRUbf']/a[@href]")
    links = []
    for x in links_url:
        links.append(x.get_attribute('href'))

    link_data = []
    for new_url in links:
        # print('\nnew url : ', new_url)
        driver.get(new_url)
        # Getting the data from the site: read the text of each <p> element.
        try:
            link = driver.find_elements(By.TAG_NAME, "p")
            for p in link:
                datas = p.get_attribute("innerText")
                print(datas)
        except Exception:
            # Best effort: skip a page whose paragraphs cannot be read and
            # move on to the next link.
            continue

driver.quit()
# Getting summary of data: frequency-based extractive summary of `datas`.
print("\nOriginal text:")
print(datas)
textWordCount = len(datas.split())
print("The number of words in Original text are : " + str(textWordCount))

# Count how often each lower-cased token that is not an English stop word
# occurs in the text.
stopWords = set(stopwords.words("english"))
words = word_tokenize(datas)
freqTable = dict()
for word in words:
    word = word.lower()
    if word in stopWords:
        continue
    if word in freqTable:
        freqTable[word] += 1
    else:
        freqTable[word] = 1

# Score each sentence by summing the frequency of every counted word that
# appears in it (substring match against the lower-cased sentence).
sentences = sent_tokenize(datas)
sentenceValue = dict()
for sentence in sentences:
    for word, freq in freqTable.items():
        if word in sentence.lower():
            if sentence in sentenceValue:
                sentenceValue[sentence] += freq
            else:
                sentenceValue[sentence] = freq

sumValues = 0
for sentence in sentenceValue:
    sumValues += sentenceValue[sentence]

# Average sentence score; fall back to 0 when no sentence was scored (for
# example when the text is empty) instead of dividing by zero.
average = int(sumValues / len(sentenceValue)) if sentenceValue else 0

# Keep the sentences whose score is more than 1.2 times the average.
summary = ''
for sentence in sentences:
    if (sentence in sentenceValue) and (sentenceValue[sentence] > (1.2 * average)):
        summary += " " + sentence

print("\nSummary:")
print(summary)
summaryWordCount = len(summary.split())
print("\nThe number of words in summary are : " + str(summaryWordCount))
解决方案
问题出在这一行:
datas = p.get_attribute("innerText")
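这一行在循环的每次迭代中都会覆盖 datas 的值,因此循环结束后 datas 里只剩下最后一个 <p> 元素的文本。我猜你其实是想把每段文本追加到一个列表中,或者把它们用空格拼接成一个字符串?例如:在循环之前初始化 paragraphs = [],在循环内执行 paragraphs.append(p.get_attribute("innerText")),循环结束后再用 datas = " ".join(paragraphs) 得到完整的文本。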
推荐阅读
- server - 我有超过 500 个成员,但 Server Insights 仍然没有显示任何数据
- go - 如何处理 Go 语言中的字符串错误
- c# - C# MongoDB Upsert - 'BsonValue(Guid) 已过时:'使用 BsonBinaryData 构造函数并指定 Guid 表示'
- python - 如何将函数应用于列表中的列表?
- git - git clone 错误 [错误:inflate:数据流错误(数据检查不正确)]
- ios - UITextView 在 iOS 12 及更低版本上的高度为 8192 时似乎没有
- java - 以编程方式将java代码添加到现有java文件的任何可能方法?
- java - 我正在尝试质疑正则表达式匹配
- flutter - 为 Lottie 动画设置 BoxFit Cover
- postgresql - 在 PostgreSQL 11.0 中将列类型从“字符变化”更改为“日期”