python-3.x - 运行 Python 爬虫时出现“TypeError:字符串索引必须是整数”
问题描述
按照教程,我为一个从特定网站检索特定单词的搜索结果的爬虫编写了这段代码。这是代码:
import requests
from bs4 import BeautifulSoup
class Content:
    """Common base class for all articles/pages.

    Holds the topic that was searched for together with the URL, title and
    body text of one article found for that topic.
    """

    def __init__(self, topic, url, title, body):
        """Store the search topic and the article's URL, title and body."""
        self.topic = topic
        self.url = url
        self.title = title
        self.body = body

    def print(self):
        """Write the article to standard output, one labelled field per line."""
        # Inside a method the bare name ``print`` still resolves to the
        # builtin; the class attribute is only reachable as ``self.print``.
        print('New article found for topic: {}'.format(self.topic))
        print('URL: {}'.format(self.url))
        print('TITLE: {}'.format(self.title))
        print('BODY:\n{}'.format(self.body))
class Website:
    """Contains information about website structure.

    Every attribute is a plain value supplied by the caller; the class does
    no validation of its own.
    """

    def __init__(self, name, url, searchUrl, resultListing, resultUrl,
                 absoluteUrl, titleTag, bodyTag):
        """Record how to search one site and where its content lives.

        name          -- label for the site
        url           -- base URL, prefixed to result links when they are
                         relative
        searchUrl     -- URL prefix the search topic is appended to
        resultListing -- CSS selector matching each search result
        resultUrl     -- CSS selector, applied inside a result, matching the
                         link to the article
        absoluteUrl   -- truthy when result links are already absolute
        titleTag      -- CSS selector for the article title
        bodyTag       -- CSS selector for the article body
        """
        self.name = name
        self.url = url
        self.searchUrl = searchUrl
        self.resultListing = resultListing
        self.resultUrl = resultUrl
        self.absoluteUrl = absoluteUrl
        self.titleTag = titleTag
        self.bodyTag = bodyTag
class Crawler:
    """Search a website for a topic and print every article that is found."""

    def getPage(self, url):
        """Download *url* and return it parsed as a BeautifulSoup document.

        Returns None when the request raises a ``requests`` exception
        (connection failure, invalid URL, timeout, ...).
        """
        try:
            # Without a timeout a single unresponsive host blocks forever.
            req = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')

    def safeGet(self, pageObj, selector):
        """Return the text of the first element matching *selector*.

        Returns an empty string when nothing on *pageObj* matches.
        """
        childObj = pageObj.select(selector)
        if childObj:
            return childObj[0].get_text()
        return ''

    def search(self, topic, site):
        """Searches a given website for a given topic and prints all pages found.

        *site* supplies the search URL prefix and the CSS selectors (see the
        attributes read below). Results without a usable link, and result
        pages that cannot be fetched, are skipped so the remaining results
        are still processed.
        """
        bs = self.getPage(site.searchUrl + topic)
        if bs is None:
            # The search page itself could not be fetched: nothing to iterate.
            print('Something was wrong with that page or URL. Skipping!')
            return
        for result in bs.select(site.resultListing):
            links = result.select(site.resultUrl)
            if not links or 'href' not in links[0].attrs:
                # This result has no link to follow.
                continue
            url = links[0].attrs['href']
            # Check to see whether it's a relative or an absolute URL
            if site.absoluteUrl:
                page = self.getPage(url)
            else:
                page = self.getPage(site.url + url)
            if page is None:
                print('Something was wrong with that page or URL. Skipping!')
                continue
            title = self.safeGet(page, site.titleTag)
            body = self.safeGet(page, site.bodyTag)
            if title != '' and body != '':
                # Argument order must match Content(topic, url, title, body).
                content = Content(topic, url, title, body)
                content.print()
crawler = Crawler()

# One row per site, in the positional order of Website.__init__:
# name, url, searchUrl, resultListing, resultUrl, absoluteUrl, titleTag, bodyTag
siteData = [
    ['Brookings', 'http://www.brookings.edu', 'https://www.brookings.edu/search/?s=',
     'div.list-content article', 'h4.title a', True, 'h1', 'div.post-body'],
    # The traceback quoted in the question points at this row.
    ['Corriere', 'http://www.corriere.it', 'https://sitesearch.corriere.it/forward.jsp?q=',
     'div#ris-ricerca', 'p.article a', True, 'h1', 'h2'],
]

sites = []
for row in siteData:
    sites.append(Website(*row))

topics = ['python', 'data science']
for topic in topics:
    print('GETTING INFO ABOUT: ' + topic)
    # Run every configured site for the current topic.
    for targetSite in sites:
        crawler.search(topic, targetSite)
但是,当我运行它时,我收到以下错误消息:
File "crawlSearch.py", line 74, in <module>
['Corriere', 'http://www.corriere.it', 'https://sitesearch.corriere.it/forward.jsp?q=',
TypeError: string indices must be integers
我不明白为什么只有其中一个网站会报错。请指教。
解决方案
推荐阅读
- regex - 需要从bash中的字符串获取子字符串
- c# - .NET Core 3.0:Razor 视图不会在更改时自动重新编译
- swift - 我正在尝试连接我的用户的名字和姓氏,以将其作为项目添加到 Firebase 数据库中的个人资料中
- javascript - 网页性能分析
- reactjs - 制作由 Formik 控制的 DraftJS 编辑器:字符串值不能很好地转换为 EditorState
- swift - 如何以编程方式从 TableView 执行 segue
- inversion-of-control - 使用类作为 Haxe 中的 Map 键
- javascript - 如何使用默认导出在 TypeScript 类型中描述 JavaScript 类静态工厂方法?
- node.js - Node/Express:使用异步等待的 API
- python - 为什么 pygame 需要 time.sleep 在退出之前播放声音?