首页 > 解决方案 > Scraping with rvest - 按 div id 搜索

问题描述

我是新手,正在尝试从以下网站提取各款汽车的信息: https://www.plugndrive.ca/electric-cars-available-in-canada/

library(rvest);library(tidyverse)

# Load the listing page once; `elec_url` holds the parsed document.
elec_url <- read_html(
  "https://www.plugndrive.ca/electric-cars-available-in-canada/"
)

# Build URL slugs from the car titles: lower-case, spaces turned into hyphens.
car_list <- elec_url %>%
  html_nodes(".car-title") %>%
  html_text() %>%
  tolower() %>%
  gsub(" ", "-", .)

# Read the first car's detail page and convert its starting price to a number:
# cut everything from the first "." onward, then strip the "$" and the commas.
price <- paste0(
  "https://www.plugndrive.ca/pnd_evcar_cat/", car_list[[1]], "/"
) %>%
  read_html() %>%
  html_nodes(".starting-price .value") %>%
  html_text() %>%
  sub("\\..*", "", .) %>%
  gsub("^\\$|\\,", "", .) %>%
  as.numeric()

我该如何抓取电动续航里程?可以从页面上的 Electric Range 元素抓取

(XPath = //*[@id="content"]/section[1]/div[2]/p[2]/strong/span)

获得 42 公里。

或者从页面底部附近“性能”选项卡中的电动续航里程(Electric Range)元素抓取

(XPath = //*[@id="performance-container"]/ul/li[3]/span[2]/text())

获得 35 公里(不要问我为什么这两个续航里程数值不相等!)

我做了最基本的尝试,但运行时出现以下错误:

# NOTE(review): the XPath string is passed as the first selector argument, so
# html_nodes() tries to parse it as a CSS selector -- this is what triggers the
# tokenizer error quoted below. Passing it as `xpath = "..."` should avoid that.
read_html(paste0('https://www.plugndrive.ca/pnd_evcar_cat/',car_list[[1]],'/')) %>% 
html_nodes('//*[@id="performance-container"]/ul/li[3]/span[2]/text()') %>% html_text()

tokenize(css) 报错:Unexpected character '/' found at position 1(在位置 1 发现意外字符“/”)

标签: r web-scraping rvest

解决方案


您可以使用以下方式获取价格:

library(rvest)

# Collect the starting price shown for every car on the listing page
# (one string per matching <span>).
price <- html_text(
  html_nodes(elec_url, "div.inner p.price-container span.starting-price")
)

# Print the raw price strings.
price
# [1] "$32,930"  "$32,990"  "$33,749"  "$33,965"  "$37,895"  "$39,990" 
# [7] "$41,499"  "$42,495"  "$42,595"  "$42,760"  "$43,998"  "$44,590" 
#[13] "$44,898"  "$44,950"  "$44,995"  "$44,998"  "$44,999"  "$45,371" 
#[19] "$55,990"  "$56,469"  "$66,400"  "$68,550"  "$69,400"  "$69,900" 
#[25] "$72,200"  "$72,390"  "$74,950"  "$74,950"  "$80,500"  "$89,800" 
#[31] "$90,000"  "$109,090" "$116,090" "$122,800" "$149,900" "$173,900"

如果要将其转换为数字,可以使用 readr 包中的 parse_number:

# parse_number() drops the "$" and the thousands separators and returns a
# numeric vector (see the printed result below).
readr::parse_number(price)
# [1]  32930  32990  33749  33965  37895  39990  41499  42495  42595  42760
#[11]  43998  44590  44898  44950  44995  44998  44999  45371  55990  56469
#[21]  66400  68550  69400  69900  72200  72390  74950  74950  80500  89800
#[31]  90000 109090 116090 122800 149900 173900

编辑

也许我之前没有理解您真正想要的内容。如果您想从每辆汽车各自的页面获取电动续航里程和汽油续航里程,可以先提取所有 URL,然后再从各个页面中获取数值。

library(tidyverse)

# Collect the detail-page URL of every car card on the listing page.
all_urls <- elec_url %>%
  html_nodes("div.evCar a") %>%
  html_attr("href")

# For each detail page, read the text of the first <strong> inside
# `div.info p`; the regular expressions below search that text for both
# range figures.
# html_node() (singular) returns the first match and yields NA rather than a
# zero-length result when a page has no such element, so map_chr() does not
# abort on such a page.
all_ranges <- map_chr(all_urls, ~ .x %>%
  read_html() %>%
  html_node("div.info p strong") %>%
  html_text())

# Extract the digits that follow each label; a label that is absent gives NA.
# NOTE(review): `car_list` is assumed to be in the same order (and of the same
# length) as `all_urls` -- TODO confirm.
tibble(all_ranges, car_list) %>%
  mutate(
    electric_range = str_extract(all_ranges, "(?<=Electric Range:\\s)\\d+"),
    gasoline_range = str_extract(all_ranges, "(?<=Gasoline Range:\\s)\\d+")
  ) %>%
  select(-all_ranges)

# A tibble: 36 x 3
#   car_list               electric_range gasoline_range
#   <chr>                  <chr>          <chr>         
# 1 ford-fusion-energi     42             940           
# 2 toyota-prius-prime     40             995           
# 3 hyundai-ioniq-phev     47             961           
# 4 kia-niro-phev          42             853           
# 5 volkswagen-e-golf      198            NA            
# 6 mini-cooper-se         177            NA            
# 7 hyundai-ioniq-electric 274            NA            
# 8 subaru-crosstrek-phev  27             747           
# 9 kia-soul-electric      383            NA            
#10 honda-clarity-phev     77             475           
# … with 26 more rows

推荐阅读