首页 > 解决方案 > 使用 BeautifulSoup,在两种不同的情况下调用相同的函数时,我得到了两种不同的结果

问题描述

我定义了 12 个函数,目的是从评论网站检索特定数据点。

我正在检索的信息包括餐厅名称、评级、电话、地址和餐厅链接。

然后我遍历包含餐厅链接的列表,并使用 Beautiful Soup 打开它。通过这样做,我将可以访问我想要检索的其他数据点,例如评论者的姓名、他或她的位置、他或她所做的评论数量、评论的日期和评论他或她写的。

我将与新链接列表一起使用的第一个函数称为 getReviewerName,它接受两个参数(链接和带有新用户代理的标头。

问题来了。当getReviewerName浏览新链接列表以获取审阅者的姓名时,我得到"TypeError: 'NoneType' object is not iterable".

但是,当我getReviewerName只使用一个 url 时,它可以完美地工作。

这是代码:

def getReviewerName (restaurantLink, header): 
    time.sleep(4) 
    session = requests.Session() 
    req = session.get (url, headers = header) 
    bs = BeautifulSoup (req.text, "html.parser") 
    comments = bs.find ("div", {"class":"review-list"}) 
    authorDiv = comments.find_all ("div", {"class":"media-story"}) 
    nameTag = [a.find ("li",{"user-name"}) for a in authorDiv] 
    name = [name.text for name in nameTag] 
    names = [n.replace("\n","") for n in name] 
    return names


def getRestaurantLink(bs):   
    time.sleep(4) 
    listLinks = [] 
    restGrid = bs.find_all ("ul", {"class": "lemon--ul__373c0__1_cxsundefined list__373c0__2G8oH"}) 
    for i in restGrid: 
        link = i.find_all("a", {"href": re.compile ("/biz/.*")}) 
        for l in link: 
            if "target" in l.attrs and "name" in l.attrs and "rel" in l.attrs: 
               listLinks.append (l.attrs["href"])
    restaurantLink = []           
    for link in listLinks[0::3]:
        newLink = ("https://www.yelp.com"+str(link))
        restaurantLink.append (newLink)
    return restaurantLink




def startScraping (url, header): 
    time.sleep(4) 
    session = requests.Session() 
    req = session.get (url, headers = header) 
    bs = BeautifulSoup (req.text, "html.parser")


    restaurantName = getNames(bs)
    restaurantRating = getRating (bs)
    restaurantPhone = getPhone (bs)
    restaurantAddress = getAddress (bs)
    restaurantLink = getRestaurantLink (bs)


    print (restaurantName)
    print (restaurantRating)  
    print (restaurantPhone)
    print (restaurantAddress)
    print (restaurantLink)

    for link in restaurantLink:

    # THIS IS THE FUNCTION I AM HAVING TROUBLE WITH:

        reviewerName = getReviewerName (link, header)

        reviewerLocation = getReviewerLocation (link, header )
        reviewerTotalReviews = getReviewerTotalReviews(link, header )
        reviewerRating = getReviewerRating(link, header )
        reviewDate = getReviewDate(link, header)
        review = getReview (link, header)


        print (reviewerName)  
        print (reviewerLocation) 
        print (reviewerTotalReviews)      
        print (reviewerRating)
        print (reviewDate)     
        print (review)



headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64;  rv:64.0) Gecko/20100101 Firefox/64.0"}
url = "https://www.yelp.com/search?  find_desc=&find_loc=Hunts+Point%2C+Bronx%2C+NY&ns=1"

startScraping (url, headers)



OUTPUT:

-----------------------------------------------------------------     ----------
TypeError                                 Traceback (most recent   call last)
<ipython-input-142-ddbec6805e6f> in <module>()
  2 url = "https://www.yelp.com/search?  find_desc=&find_loc=Hunts+Point%2C+Bronx%2C+NY&ns=1"
      3  
----> 4 startScraping (url, headers)

<ipython-input-141-2e3b65c73e53> in startScraping(url, header)
    166 
    167     for link in restaurantLink:
 --> 168         reviewerName = getReviewerName (link, header)
    169         reviewerLocation = getReviewerLocation (link,  header )
    170         reviewerTotalReviews = getReviewerTotalReviews(link, header )

 <ipython-input-141-2e3b65c73e53> in  getReviewerName(restaurantLink, header)
    70     bs = BeautifulSoup (req.text, "html.parser")
    71     comments = bs.find ("div", {"class":"review-list"})
---> 72     for comment in comments:
     73         if comment is not None:
     74             authorDiv = [comment.find ("div",     {"class":"media-story"}) for comment in comments]

 TypeError: 'NoneType' object is not iterable



# HOWEVER, WHEN I CALL THE getReviewerName (link, header) WITH  JUST ONE URL I GET THE DESIRED RESULT:

def getReviewerName (restaurantLink, header): 
    time.sleep(4) 
    session = requests.Session() 
    req = session.get (url, headers = header) 
    bs = BeautifulSoup (req.text, "html.parser") 
    comments = bs.find ("div", {"class":"review-list"})  
    authorDiv = comments.find_all ("div", {"class":"media-story"}) 
    nameTag = [a.find ("li",{"user-name"}) for a in authorDiv] 
    name = [name.text for name in nameTag] 
    names = [n.replace("\n","") for n in name] 
    return names

url = "https://www.yelp.com/biz/the-boogie-down-grind-café-bronx"
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64;  rv:64.0) Gecko/20100101 Firefox/64.0"}
getReviewerName (url, headers)

输出:

['Bumble B.',
 'Mel G.',
 'Hadassah B.',
 'Jackeline F.',
 'Annie Boom F.',
 'Elvis A.',
 'Anjel P.',
 'Tashalee C.',
 'Deepakie S.',
 'Lidia D.',
 'Mercedes C.',
 'Rachel N.',
 'Jessica M.',
 'Nirva A.',
 'Julissa A.',
 'Samantha T.',
 'Sean B.',
 'Chad J.',
 'Angel R.',
 'Riley R.']

标签: pythonbeautifulsoup

解决方案


推荐阅读