首页 > 解决方案 > 使用 rvest 读取多个页面

问题描述

我正在尝试使用 rvest 读取多个结果页面以收集页面上的所有数据。

我已经尝试从这里调整代码:

跨多个页面的 R 网页抓取

但我只得到结果的第一页。

library(httr)
library(rvest)
library(dplyr)
library(magrittr)
library(stringr)
library(lubridate)
library(purrr)
library(jsonlite)
library(rjson)
library(tidyverse)

body_tags_1 <- lapply(paste0('https://www.eventbrite.com/d/ny--new-york/conference/?page=', 1:49),
                function(url){
                  url %>% read_html() %>% 
                    html_nodes("body") %>% 
                    html_text() %>% 
                    toString() # to produce a single character string describing an R object.
                })

tmp <- str_match_all(body_tags_1,'window.__SERVER_DATA__ = (.*);')  


# Convert R objects from JSON - output - list and flatten the JSON object
json <- jsonlite::fromJSON(tmp[[1]][,2], flatten=TRUE)

# get Event name from json
Event_Name <- json$suggestions$events$name
# convert to data frame
Event_Name <- as.data.frame(Event_Name)
# convert column variable to character from factor
Event_Name <- data.frame(lapply(Event_Name, as.character), stringsAsFactors=FALSE)

# get event date from dates 
Event_Date <- json$suggestions$events$start_date

# convert dates to data frame
Event_Date <- as.data.frame(Event_Date)

# Convert dates to character
Event_Date <- data.frame(lapply(Event_Date, as.character), stringsAsFactors=FALSE)

#`## convert Date from chr format to Date format using lubridate
Event_Date$Event_Date <- ymd(Event_Date$Event_Date)
# get Location from json
Location <- json$suggestions$events$primary_venue.address.city 

# convert to data frame
Location <- as.data.frame(Location)
# convert column variable to character from factor
Location <- data.frame(lapply(Location, as.character), stringsAsFactors=FALSE)

Tickets <- json$suggestions$events$ticket_availability.minimum_ticket_price.major_value  

# transform a vector into data frame with fixed dimension
# by converting to a matrix, specify the ncol
m1 <- matrix(Tickets, ncol=1, byrow=TRUE)

# convert to data frame
Tickets <- as.data.frame(m1, stringsAsFactors=FALSE)
Tickets <- as.data.frame(Tickets)

# get Currency from json
Currency <- json$suggestions$events$ticket_availability.minimum_ticket_price.currency
Currency
# convert to data frame
Currency <- as.data.frame(Currency)
str(Currency)
# convert column variable to character from factor
Currency <- data.frame(lapply(Currency, as.character), stringsAsFactors=FALSE)

# bind all the data together by columns
all_data_bind <- cbind.data.frame(Event_Name, Event_Date, Location, Tickets, Currency)

# rename V1 as Mininum Price
all_data <- all_data_bind %>% 
  rename(Min_Price = V1)

all_data$Min_Price <- as.numeric(all_data$Min_Price)

# remove rows with na
all_data_1 <- all_data %>% drop_na()
all_data_1
str(all_data_1)
# keep rows with price > 200
all_data_filter_Price <- filter(all_data_1, Min_Price > 200)
all_data_filter_Price

我期望所有页面,但这些是我得到的结果: dput(all_data_1)

structure(list(Event_Name = c("AFROPUNK FEST BROOKLYN 2019", 
"New York: The Wizard's Brunch & Dinner ", "ROOFTOP PARTY | SATURDAY NIGHT | Sky Room NYC Tallest Rooftop Bar Lounge  Times Square ", 
"2019 Tunnel to Towers 5K Run & Walk - NEW YORK CITY", "CIRCLE OF SISTERS 2019", 
"RuPaul's DragCon NYC 2019", "Caribbean Concerts at Six Flags 2019", 
"NYC Ravel Penthouse 808 Rooftop Saturdays Everyone FREE onlist (Gametight)", 
"Comic Con For Kids (Philadelphia, PA)", "AFROBEATS & BRUNCH "
), Event_Date = structure(c(18132, 18124, 18111, 18168, 18146, 
18145, 18126, 18111, 18181, 18112), class = "Date"), Location = c("Brooklyn", 
"New York City", "New York", "Brooklyn", "New York", "New York", 
"Jackson", "Queens", "Oaks", "New York"), Min_Price = c(60, 45, 
0, 0, 22.99, 0, 0, 0, 14.99, 0), Currency = c("USD", "USD", "USD", 
"USD", "USD", "USD", "USD", "USD", "USD", "USD")), row.names = c(NA, 
10L), class = "data.frame")

标签: rjsonrvest

解决方案


您正在获取所有页面,但您没有处理正则表达式中的所有返回项目。

问题是我认为这一行:

json <- jsonlite::fromJSON(tmp[[1]][,2], flatten=TRUE)

您只使用您感兴趣的所有匹配组中的第一个正则表达式匹配组。您需要使用所有 49 个 ie length(tmp)。您可以通过以下方式检查:

> json1 <- jsonlite::fromJSON(tmp[[1]][,2], flatten=TRUE)
> json1$page_number
[1] 1
> json2 <- jsonlite::fromJSON(tmp[[2]][,2], flatten=TRUE)
> json2$page_number
[1] 2
> 

你明白了。您可以编写一个函数,从返回感兴趣信息的任何给定页面中提取并将其应用于所有返回的感兴趣的正则表达式组。


推荐阅读