r - 使用 jsonlite 抓取多个 url - 错误:词法错误:json 文本中存在无效字符
问题描述
我在向量中收集了以下网址
# Department staff/contact pages to scrape, one URL per entry (28 in total).
departments <- c(
  "https://www.jurinst.su.se/english/about-us/contact/researchers-teachers",
  "https://www.jurinst.su.se/english/about-us/contact/doctoral-students",
  "https://www.buv.su.se/english/research/our-researchers/researchers-child-and-youth-studies",
  "https://www.buv.su.se/english/research/our-researchers/researchers-children-s-culture",
  "https://www.buv.su.se/english/research/our-researchers/researchers-early-childhood-education",
  "https://www.buv.su.se/english/research/our-researchers/researchers-schoolage-educare",
  "https://www.edu.su.se/english/about-us/organisation/researchers-faculty-members",
  "https://www.edu.su.se/english/about-us/organisation/phd-students",
  "https://www.psychology.su.se/english/about-us/contact/staff-a-z",
  "https://www.su.se/publichealth/english/about-us/our-staff",
  "https://www.sbs.su.se/english/research/research-sections/accounting/faculty",
  "https://www.sbs.su.se/english/research/research-sections/finance/people",
  "https://www.sbs.su.se/english/research/research-sections/management/faculty",
  "https://www.sbs.su.se/english/research/research-sections/marketing/faculty",
  "https://www.sofi.su.se/english/staff/all-staff",
  "https://www.astro.su.se/english/about-us/contact/2.16629",
  "https://www.mnd.su.se/english/research/mathematics-education/researchers",
  "https://www.mnd.su.se/english/research/science-education/researchers",
  "https://www.mnd.su.se/english/research/mathematics-education/graduate-students",
  "https://www.mnd.su.se/english/research/science-education/graduate-students",
  "https://www.fysik.su.se/english/about-us/contact/contact-list-alphabetical",
  "https://www.dbb.su.se/about-us/contact",
  "https://www.mmk.su.se/about-us/units-and-staff/people-at-mmk",
  "https://www.su.se/mbw/about-us/staff/all-contacts",
  "https://www.aces.su.se/staff/",
  "https://www.su.se/geo/english/about-us/contact/staff",
  "http://www.bergianska.se/english/about-us/contact-us/staff",
  "https://www.nordita.org/people/zebra/index.php"
)
就 xpath 而言,这些 url 的页面结构相似但并不完全相同。我正在尝试使用 jsonlite 编写一个循环,下载所有人员的姓名和电子邮件地址。但是,如下例所示,即使只处理单个 url 也会报错。你有更好的代码思路吗?谢谢
# Single-URL attempt from the question: the text of the page <body> is handed
# to jsonlite::fromJSON(). The question reports that this fails with
# "lexical error: invalid char in json text" (presumably because the body is
# HTML rather than JSON).
url.1 <- departments[1]
body_text <- html_text(html_node(read_html(url.1), "body"))
json.content <- jsonlite::fromJSON(body_text, simplifyVector = FALSE)
解决方案
一种蛮力(暴力)的解决办法如下:
# Brute-force scraper: collects staff names and e-mail addresses from the
# pages listed in `departments` into the data frame `df`
# (columns: people_name, emails, university).
# rvest supplies read_html(), html_nodes(), html_text(), html_attr() and %>%.
library(rvest)

# Eliminate departments that don't have emails.
departments <- departments[-c(18, 24, 25)]

# Pair names with e-mail addresses in a data frame.
#
# people_name, emails: character vectors expected to be the same length.
# source: URL(s) the vectors came from; only used in the error message.
# university: constant value for the `university` column.
#
# Fails loudly on a length mismatch instead of letting data.frame() recycle
# the shorter vector against the wrong people. The constant column is
# repeated explicitly so that a page with no matches yields a zero-row frame
# rather than an error.
pair_contacts <- function(people_name, emails, source,
                          university = "Stockholm University") {
  if (length(people_name) != length(emails)) {
    stop(
      "Found ", length(people_name), " names but ", length(emails),
      " e-mail links in ", paste(source, collapse = ", "),
      call. = FALSE
    )
  }
  data.frame(
    people_name = people_name,
    emails = emails,
    university = rep(university, length(people_name)),
    stringsAsFactors = FALSE
  )
}

# Scrape one page that lists each person under an <h3> heading with a
# "profiles-mail" element carrying the mailto link.
# Returns a data frame as built by pair_contacts().
scrape_profile_page <- function(url) {
  page <- read_html(url)
  people_name <- page %>%
    html_nodes(xpath = "//h3") %>%
    html_text()
  # Read the href directly rather than deparsing the whole attribute list and
  # splitting on quotes, which only worked when href was the first attribute.
  emails <- page %>%
    html_nodes(xpath = '//*[contains(concat( " ", @class, " " ), concat( " ", "profiles-mail", " " ))]') %>%
    html_attr("href")
  pair_contacts(
    people_name,
    gsub("mailto:", "", emails, fixed = TRUE),
    source = url
  )
}

# Scrape pages that list people in a table: names are the links in the first
# cell of a row, addresses are the links in the following cells.
# Names and addresses are pooled over all `urls` before being paired.
# Returns a data frame as built by pair_contacts().
scrape_table_pages <- function(urls) {
  pages <- lapply(urls, read_html)
  people_name <- as.character(unlist(lapply(pages, function(page) {
    page %>%
      html_nodes(xpath = "//td[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]//a") %>%
      html_text()
  })))
  emails <- as.character(unlist(lapply(pages, function(page) {
    page %>%
      html_nodes(css = "td+ td a") %>%
      html_attr("href")
  })))
  emails <- gsub("mailto:", "", emails, fixed = TRUE)
  emails <- gsub("http://", "", emails, fixed = TRUE)
  # Keep only the part before the first comma, as the original comma split did.
  emails <- sub(",.*$", "", emails)
  pair_contacts(people_name, emails, source = urls)
}

# Positions refer to the filtered `departments` vector. Collect one data frame
# per step and bind them once instead of growing `df` with rbind() in a loop.
# NOTE(review): with the 28 URLs defined for `departments`, 25 remain after
# filtering, so entries 23:25 are not visited by any step, and the original
# trailing comment mentioned "20:23" while the loop ran 20:22 — confirm the
# intended ranges.
contacts <- c(
  # Departments 1:16 (original author's run reported 1168 people).
  lapply(departments[1:16], scrape_profile_page),
  # Departments 17:19, table layout (reported 36 people).
  list(scrape_table_pages(departments[17:19])),
  # Departments 20:22, same layout as 1:16 (reported 1147 people).
  lapply(departments[20:22], scrape_profile_page)
)
df <- do.call(rbind, contacts)
# save(df, file = "Sweden4_2.RData")
推荐阅读
- nginx - K8s入口响应多个服务器名称而不配置它们?
- javascript - 使用 Alpine JS 有条件地添加 CSS 类
- django - 优化重复查询集
- c# - .NET Core 中的并行程序集
- java - 如何检查 Cloud Firestore 中任何文档的集合中是否存在值(例如名称)?
- reactjs - 反应:双地图循环中的自定义组件不呈现
- python - python请求只打开url,不需要在浏览器中显示或打开
- oauth-2.0 - 授权期间在 Swagger UI 中验证了错误的 OAuth
- java - 在 SpringBoot 中使用 Testcontainers 进行 Spring Data Elasticsearch 集成测试
- php - 如何在 PHPSpreadsheet 中显示数组中的项目?