首页 > 解决方案 > 运行 Python 爬虫时出现“TypeError:字符串索引必须是整数”

问题描述

按照教程,我编写了一个爬虫,用于在指定网站上检索特定关键词的搜索结果。代码如下:

import requests
from bs4 import BeautifulSoup

class Content:
    """Holds one scraped article/page together with the topic it matched."""

    def __init__(self, topic, url, title, body):
        # Stored verbatim; no validation or normalisation is applied.
        self.topic, self.url = topic, url
        self.title, self.body = title, body

    def print(self):
        """Write a four-part summary (topic, URL, title, body) to stdout."""
        report = (
            'New article found for topic: {}'.format(self.topic),
            'URL: {}'.format(self.url),
            'TITLE: {}'.format(self.title),
            'BODY:\n{}'.format(self.body),
        )
        # Inside the method body, ``print`` resolves to the builtin, not to
        # this method, because class scope is not searched from functions.
        for line in report:
            print(line)

class Website:
    """Describes how to search one website and where its content lives.

    Attributes:
        name: Label for the site.
        url: Base URL, prepended to result links when they are relative.
        searchUrl: Search URL prefix; the topic is appended to it.
        resultListing: Selector matching each entry on the search page.
        resultUrl: Selector matching the link inside one search entry.
        absoluteUrl: True when result links are already absolute URLs.
        titleTag: Selector for the title on an article page.
        bodyTag: Selector for the body text on an article page.
    """

    def __init__(self, name, url, searchUrl, resultListing, resultUrl, absoluteUrl, titleTag, bodyTag):
        # Site identity and entry points.
        self.name, self.url, self.searchUrl = name, url, searchUrl
        # How to walk the search-results page.
        self.resultListing, self.resultUrl = resultListing, resultUrl
        self.absoluteUrl = absoluteUrl
        # Where to find the content on each article page.
        self.titleTag, self.bodyTag = titleTag, bodyTag

class Crawler:
    """Runs a site's search for a topic and prints every article it finds."""

    def getPage(self, url):
        """Download ``url`` and parse the response as HTML.

        Returns:
            A BeautifulSoup object, or None when the request raises a
            ``requests`` exception.
        """
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')

    def safeGet(self, pageObj, selector):
        """Return the text of the first element matching ``selector``.

        Returns an empty string when nothing on ``pageObj`` matches.
        """
        childObj = pageObj.select(selector)
        if childObj:
            return childObj[0].get_text()
        return ''

    def search(self, topic, site):
        """Search a given website for a given topic and print all pages found.

        Args:
            topic: Search term; appended as-is to ``site.searchUrl``.
            site: Object exposing the ``Website`` attributes (``url``,
                ``searchUrl``, ``resultListing``, ``resultUrl``,
                ``absoluteUrl``, ``titleTag``, ``bodyTag``).

        Returns:
            None. Articles with both a title and a body are printed.
        """
        bs = self.getPage(site.searchUrl + topic)
        if bs is None:
            # The search page itself could not be fetched; nothing to iterate.
            print('Something was wrong with that page or URL. Skipping!')
            return

        for result in bs.select(site.resultListing):
            links = result.select(site.resultUrl)
            href = links[0].attrs.get('href') if links else None
            if not href:
                # A listing entry without a usable link is not an article.
                continue

            # Resolve relative links against the site's base URL.
            pageUrl = href if site.absoluteUrl else site.url + href
            page = self.getPage(pageUrl)
            if page is None:
                # Skip only this result; later results may still be fine.
                print('Something was wrong with that page or URL. Skipping!')
                continue

            title = self.safeGet(page, site.titleTag)
            body = self.safeGet(page, site.bodyTag)
            if title != '' and body != '':
                # Argument order must match Content(topic, url, title, body).
                content = Content(topic, pageUrl, title, body)
                content.print()

crawler = Crawler()

# One row per site, in the positional order Website.__init__ expects:
# name, url, searchUrl, resultListing, resultUrl, absoluteUrl, titleTag, bodyTag
siteData = [
    ['Brookings', 'http://www.brookings.edu', 'https://www.brookings.edu/search/?s=',
        'div.list-content article', 'h4.title a', True, 'h1', 'div.post-body'],
    # ---> The original post marked the following row with a literal '--->'
    # to flag the line named in the traceback. That marker is not Python, so
    # it is kept here only as a comment.
    ['Corriere', 'http://www.corriere.it', 'https://sitesearch.corriere.it/forward.jsp?q=',
        'div#ris-ricerca', 'p.article a', True, 'h1', 'h2'],
]

# Each row holds exactly the eight positional arguments of Website.
sites = [Website(*row) for row in siteData]

topics = ['python', 'data science']
for topic in topics:
    print('GETTING INFO ABOUT: ' + topic)
    for targetSite in sites:
        crawler.search(topic, targetSite)

但是,当我运行它时,我收到以下错误消息:

File "crawlSearch.py", line 74, in <module>
    ['Corriere', 'http://www.corriere.it', 'https://sitesearch.corriere.it/forward.jsp?q=',
TypeError: string indices must be integers

我不明白为什么只有其中一个网站会报错。请指教。

标签: python-3.x web-crawler

解决方案


推荐阅读