首页 > 解决方案 > 触发 Selenium 点击功能后无法获取标签文本

问题描述

http://211.21.63.217:81/ticket_online.aspx

我爬取网站电影列表数据,可以成功获取电影名称、电影时间。

然后我想在明天<span />的电影时间抓取每部电影,所以我明天使用硒点击。

我确保 Selenium 也单击了<Span />我设置time.sleep的每个,并检查标签根是否正确,但明天movieTime的结果是空的。

我不知道为什么。

这是我的代码:

# -*- coding: utf-8 -*-
import scrapy
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from scrapy.http import HtmlResponse
from datetime import datetime
from datetime import timedelta  
import time

# like 07/02
tomorrowDate = datetime.now() + timedelta(days=1)
tomorrow = tomorrowDate.strftime('%m/%d')
afterTomorrowDate = datetime.now() + timedelta(days=2)
afterTomorrow = afterTomorrowDate.strftime('%m/%d')
print(tomorrow)
print(afterTomorrow)

class CenturyasiaxinbeiSpider(scrapy.Spider):
    name = 'CenturyasiaXinbei'
    allowed_domains = ['www.centuryasia.com.tw', '211.21.63.217:81']
    start_urls = ['http://211.21.63.217:81/ticket_online.aspx']

    def __init__(self):
        print('__init__')
        self.driver = webdriver.Chrome('/Users/motogod19/chromedriver')
        self.driver.get('http://211.21.63.217:81/ticket_online.aspx')

    def parse(self, response):
        firstHtmlNodes = response.xpath('//article[@id="ContentPlaceHolder1_Time_box"]//section[@class="tickets_movie_time_box"]')
        for firstHtmlNode in firstHtmlNodes:
            # print(firstHtmlNode)
            # Today
            movieCnName = firstHtmlNode.xpath('./div[@class="tickets_movie_time"]/div[@class="tickets_times"]/div[@class="tickets_times_t"]/div[@class="times_title"]/text()').get()
            movieEnName = firstHtmlNode.xpath('./div[@class="tickets_movie_time"]/div[@class="tickets_times"]/div[@class="tickets_times_t"]/div[@class="times_title_en"]/text()').get()
            movieTime = firstHtmlNode.xpath('./div[@class="tickets_movie_time"]/div[@class="tickets_times"]/div[@class="tickets_date"]/div[3]/div/div[@class="tickets_date_csbox"]/div/ul/li/text()').extract()

            print(movieCnName)
            print(movieEnName)
            # sort the movie time
            print(sorted(movieTime))
            print('\n')

        time.sleep(2)
        allSpanDate = self.driver.find_elements_by_xpath("//span[text()='{}']".format(tomorrow))
        for spanDate in allSpanDate:
            ActionChains(self.driver).move_to_element(spanDate).perform()
            spanDate.click()
            time.sleep(1)

        tomorrowResponse = HtmlResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
        secondHtmlNodes = tomorrowResponse.xpath('//article[@id="ContentPlaceHolder1_Time_box"]//section[@class="tickets_movie_time_box"]')

        for secondHtmlNode in secondHtmlNodes:
            # Tomorrow
            movieCnName = secondHtmlNode.xpath('./div[@class="tickets_movie_time"]/div[@class="tickets_times"]/div[@class="tickets_times_t"]/div[@class="times_title"]/text()').get()
            movieEnName = secondHtmlNode.xpath('./div[@class="tickets_movie_time"]/div[@class="tickets_times"]/div[@class="tickets_times_t"]/div[@class="times_title_en"]/text()').get()
            # movieTime = secondHtmlNode.xpath('./div[@class="tickets_movie_time"]/div[@class="tickets_times"]/div[@class="tickets_date"]/div[3]/div/div[@class="tickets_date_csbox"]/div/ul/li/text()').extract()
            movieTime = secondHtmlNode.xpath('./div[@class="tickets_movie_time"]/div[@class="tickets_times"]/div[@class="tickets_date"]/div[3]/div[@class="tickets_date_cts"]/ul')
            # tickets_date_cts
            print(movieCnName)
            print(movieEnName)
            # sort movie time
            print(sorted(movieTime)) # Empty is here
            print('\n')
        time.sleep(60)

标签: pythonseleniumscrapy

解决方案


试试这个 xpath,它适用于movieTime = firstHtmlNode.xpath()movieTime = secondHtmlNode.xpath.

.//div[@class='tickets_date_cts']/ul/li

推荐阅读