r - 使用 rvest 读取多个页面
问题描述
我正在尝试使用 rvest 读取多个结果页面以收集页面上的所有数据。
我已经尝试从这里调整代码:
但我只得到结果的第一页。
library(httr)
library(rvest)
library(dplyr)
library(magrittr)
library(stringr)
library(lubridate)
library(purrr)
library(jsonlite)
library(rjson)
library(tidyverse)
# Scrape Eventbrite conference search results and tabulate name, date, city,
# minimum ticket price and currency for the events found.

# Fetch search-result pages 1 through 49 and keep the text content of each
# page's <body>; lapply() returns a list with one element per requested URL.
body_tags_1 <- lapply(paste0('https://www.eventbrite.com/d/ny--new-york/conference/?page=', 1:49),
function(url){
url %>% read_html() %>%
html_nodes("body") %>%
html_text() %>%
toString() # collapse the extracted text into a single character string
})
# Capture whatever follows `window.__SERVER_DATA__ = ` up to a `;` in each page.
# str_match_all() returns a list with one match matrix per input element;
# column 2 of each matrix holds the captured group.
# NOTE(review): the dots in `window.__SERVER_DATA__` are unescaped, so they
# match any character rather than a literal dot.
tmp <- str_match_all(body_tags_1,'window.__SERVER_DATA__ = (.*);')
# Parse the captured JSON text into R objects, flattening nested data frames.
# NOTE(review): only tmp[[1]] (the match for the first page) is parsed here, so
# every value derived from `json` below comes from page 1 only; the remaining
# elements of `tmp` are never used.
json <- jsonlite::fromJSON(tmp[[1]][,2], flatten=TRUE)
# Extract the event names from the parsed JSON
Event_Name <- json$suggestions$events$name
# Wrap the names in a one-column data frame
Event_Name <- as.data.frame(Event_Name)
# Force the column to character (it may have been created as a factor)
Event_Name <- data.frame(lapply(Event_Name, as.character), stringsAsFactors=FALSE)
# Extract the event start dates from the parsed JSON
Event_Date <- json$suggestions$events$start_date
# Wrap the dates in a one-column data frame
Event_Date <- as.data.frame(Event_Date)
# Force the column to character so it can be parsed below
Event_Date <- data.frame(lapply(Event_Date, as.character), stringsAsFactors=FALSE)
# Parse the character dates as year-month-day with lubridate::ymd()
Event_Date$Event_Date <- ymd(Event_Date$Event_Date)
# Extract the venue city from the flattened JSON
Location <- json$suggestions$events$primary_venue.address.city
# Wrap the cities in a one-column data frame
Location <- as.data.frame(Location)
# Force the column to character (it may have been created as a factor)
Location <- data.frame(lapply(Location, as.character), stringsAsFactors=FALSE)
# Extract the minimum ticket price (major currency units) from the flattened JSON
Tickets <- json$suggestions$events$ticket_availability.minimum_ticket_price.major_value
# Reshape the price vector into a single-column matrix so that the resulting
# data frame has a fixed shape
m1 <- matrix(Tickets, ncol=1, byrow=TRUE)
# Convert the matrix to a data frame
Tickets <- as.data.frame(m1, stringsAsFactors=FALSE)
# NOTE(review): Tickets is already a data frame at this point, so this second
# conversion appears to be redundant.
Tickets <- as.data.frame(Tickets)
# Extract the ticket currency from the flattened JSON
Currency <- json$suggestions$events$ticket_availability.minimum_ticket_price.currency
# Print the raw currency values for inspection
Currency
# Wrap the currencies in a one-column data frame
Currency <- as.data.frame(Currency)
# Print the structure of the currency data frame for inspection
str(Currency)
# Force the column to character (it may have been created as a factor)
Currency <- data.frame(lapply(Currency, as.character), stringsAsFactors=FALSE)
# Bind the five single-column data frames together column-wise
all_data_bind <- cbind.data.frame(Event_Name, Event_Date, Location, Tickets, Currency)
# Rename V1 (the price column that came from the matrix) to Min_Price
all_data <- all_data_bind %>%
rename(Min_Price = V1)
# Convert the price column to numeric so it can be compared below
all_data$Min_Price <- as.numeric(all_data$Min_Price)
# Drop every row that contains an NA in any column
all_data_1 <- all_data %>% drop_na()
# Print the cleaned data and its structure for inspection
all_data_1
str(all_data_1)
# Keep only the rows whose minimum price is greater than 200
all_data_filter_Price <- filter(all_data_1, Min_Price > 200)
all_data_filter_Price
我期望得到所有页面的数据,但以下是我实际得到的结果(`dput(all_data_1)` 的输出):
structure(list(Event_Name = c("AFROPUNK FEST BROOKLYN 2019",
"New York: The Wizard's Brunch & Dinner ", "ROOFTOP PARTY | SATURDAY NIGHT | Sky Room NYC Tallest Rooftop Bar Lounge Times Square ",
"2019 Tunnel to Towers 5K Run & Walk - NEW YORK CITY", "CIRCLE OF SISTERS 2019",
"RuPaul's DragCon NYC 2019", "Caribbean Concerts at Six Flags 2019",
"NYC Ravel Penthouse 808 Rooftop Saturdays Everyone FREE onlist (Gametight)",
"Comic Con For Kids (Philadelphia, PA)", "AFROBEATS & BRUNCH "
), Event_Date = structure(c(18132, 18124, 18111, 18168, 18146,
18145, 18126, 18111, 18181, 18112), class = "Date"), Location = c("Brooklyn",
"New York City", "New York", "Brooklyn", "New York", "New York",
"Jackson", "Queens", "Oaks", "New York"), Min_Price = c(60, 45,
0, 0, 22.99, 0, 0, 0, 14.99, 0), Currency = c("USD", "USD", "USD",
"USD", "USD", "USD", "USD", "USD", "USD", "USD")), row.names = c(NA,
10L), class = "data.frame")
解决方案
您确实获取了所有页面,但没有处理正则表达式返回的所有匹配结果。
我认为问题出在这一行:
json <- jsonlite::fromJSON(tmp[[1]][,2], flatten=TRUE)
在所有您感兴趣的正则表达式匹配组中,您只使用了第一个。您需要使用全部 49 个,即 length(tmp) 个。您可以通过以下方式验证:
> json1 <- jsonlite::fromJSON(tmp[[1]][,2], flatten=TRUE)
> json1$page_number
[1] 1
> json2 <- jsonlite::fromJSON(tmp[[2]][,2], flatten=TRUE)
> json2$page_number
[1] 2
>
您应该明白意思了。您可以编写一个函数,从任意给定页面的匹配结果中提取感兴趣的信息,然后将它应用于正则表达式返回的所有匹配组。例如,`lapply(tmp, function(m) jsonlite::fromJSON(m[, 2], flatten = TRUE))` 可以解析每一页的 JSON。
推荐阅读
- c++11 - 无法理解 C++ stl 中的向量函数及其差异
- ios - 创建帖子后发送自动聊天消息(Firebase)
- user-interface - 如何在 TButton 中显示图标?
- c# - TryInvokeMember 上的异步任务(DynamicObject)
- javascript - 尝试学习 React 和 Javascript,但坚持使用这种奇怪的 map 语法并将匿名函数传递给它
- django - Django:将表单添加到我的扩展用户模型
- python - Python 循环一直在 Window 的解释器中停止
- python - 将 pdf 转换为 excel(使用 Camelot 获取特定表格)
- cloudera - Errno 14 PYCURL 错误 6 ;在 Cloudera Manager 7.x 升级中无法解析主机
- javascript - 页面加载到容器中时 UI 滑块不可见