首页 > 解决方案 > 使用 jsonlite 抓取多个 URL - 错误:词法错误:JSON 文本中存在无效字符

问题描述

我在向量中收集了以下网址

# Staff-listing pages to scrape, one URL per department/unit.
# The order matters: later code addresses these entries by position.
departments <- c(
  "https://www.jurinst.su.se/english/about-us/contact/researchers-teachers",
  "https://www.jurinst.su.se/english/about-us/contact/doctoral-students",
  "https://www.buv.su.se/english/research/our-researchers/researchers-child-and-youth-studies",
  "https://www.buv.su.se/english/research/our-researchers/researchers-children-s-culture",
  "https://www.buv.su.se/english/research/our-researchers/researchers-early-childhood-education",
  "https://www.buv.su.se/english/research/our-researchers/researchers-schoolage-educare",
  "https://www.edu.su.se/english/about-us/organisation/researchers-faculty-members",
  "https://www.edu.su.se/english/about-us/organisation/phd-students",
  "https://www.psychology.su.se/english/about-us/contact/staff-a-z",
  "https://www.su.se/publichealth/english/about-us/our-staff",
  "https://www.sbs.su.se/english/research/research-sections/accounting/faculty",
  "https://www.sbs.su.se/english/research/research-sections/finance/people",
  "https://www.sbs.su.se/english/research/research-sections/management/faculty",
  "https://www.sbs.su.se/english/research/research-sections/marketing/faculty",
  "https://www.sofi.su.se/english/staff/all-staff",
  "https://www.astro.su.se/english/about-us/contact/2.16629",
  "https://www.mnd.su.se/english/research/mathematics-education/researchers",
  "https://www.mnd.su.se/english/research/science-education/researchers",
  "https://www.mnd.su.se/english/research/mathematics-education/graduate-students",
  "https://www.mnd.su.se/english/research/science-education/graduate-students",
  "https://www.fysik.su.se/english/about-us/contact/contact-list-alphabetical",
  "https://www.dbb.su.se/about-us/contact",
  "https://www.mmk.su.se/about-us/units-and-staff/people-at-mmk",
  "https://www.su.se/mbw/about-us/staff/all-contacts",
  "https://www.aces.su.se/staff/",
  "https://www.su.se/geo/english/about-us/contact/staff",
  "http://www.bergianska.se/english/about-us/contact-us/staff",
  "https://www.nordita.org/people/zebra/index.php"
)

就 XPath 而言,这些 URL 相似但不完全相同。我正在尝试使用 jsonlite 编写一个循环,下载所有人员的姓名和电子邮件地址。但是,如下例所示,即使只处理单个 URL,我也会遇到错误。你有更好的实现思路吗?谢谢

# Single-URL reproduction of the error: use the first department page.
url.1=departments[1]

# Read the page, take the text of its <body> node, and pass that text to
# jsonlite::fromJSON() (simplifyVector = FALSE keeps the result as nested lists).
# NOTE(review): the URL looks like an ordinary HTML staff page rather than a
# JSON endpoint, which would explain the reported "lexical error: invalid char
# in json text" -- confirm by inspecting the response.
json.content <- read_html(url.1) %>% html_node('body') %>% html_text() %>% 
  jsonlite::fromJSON(simplifyVector = FALSE)

标签: r, web-scraping, rvest, jsonlite

解决方案


一种蛮力的解决方法可能如下:

# Brute-force scraper: collects staff names and e-mail addresses from the pages
# listed in `departments` into the data frame `df`
# (columns: people_name, emails, university).
library(rvest) # read_html(), html_nodes(), html_text(), html_attr(), %>%

# Drop the departments whose pages do not list e-mail addresses.
# NOTE: this re-indexes `departments`, so run it only once per session; every
# index range below refers to the filtered vector.
departments <- departments[-c(18, 24, 25)]

# XPath matching every element whose class list contains "profiles-mail".
profile_mail_xpath <- '//*[contains(concat( " ", @class, " " ), concat( " ", "profiles-mail", " " ))]'

# XPath matching the links inside the first cell of each table row.
first_cell_link_xpath <- '//td[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]//a'

# Assemble one page's result.
#
# people_name, emails: character vectors scraped from the same page.
# url: the page address, used only in the diagnostic message.
# Returns a data frame with columns people_name, emails, university.
build_people <- function(people_name, emails, url) {
  if (length(people_name) != length(emails)) {
    # data.frame() silently recycles the shorter vector when the lengths
    # divide evenly, which would pair names with the wrong addresses.
    warning(
      sprintf(
        "%s: %d names but %d e-mail addresses; rows may be misaligned",
        url, length(people_name), length(emails)
      ),
      call. = FALSE
    )
  }
  n_rows <- max(length(people_name), length(emails))
  data.frame(
    people_name = people_name,
    emails = emails,
    # rep() rather than a bare scalar so a page with no matches yields a
    # zero-row data frame instead of an error.
    university = rep("Stockholm University", n_rows),
    stringsAsFactors = FALSE
  )
}

# Scrape a page whose names are <h3> headings and whose addresses are the href
# of the "profiles-mail" elements.
scrape_profile_page <- function(url) {
  page <- read_html(url)
  people_name <- page %>%
    html_nodes(xpath = "//h3") %>%
    html_text()
  emails <- page %>%
    html_nodes(xpath = profile_mail_xpath) %>%
    html_attr("href")
  emails <- gsub("mailto:", "", emails, fixed = TRUE)
  build_people(people_name, emails, url)
}

# Scrape a page laid out as a table: names are the links in the first cell,
# addresses are the href of the links in the following cells.
scrape_table_page <- function(url) {
  page <- read_html(url)
  people_name <- page %>%
    html_nodes(xpath = first_cell_link_xpath) %>%
    html_text()
  emails <- page %>%
    html_nodes(css = "td+ td a") %>%
    html_attr("href")
  emails <- gsub("mailto:", "", emails, fixed = TRUE)
  emails <- gsub("http://", "", emails, fixed = TRUE)
  # Keep only the part before the first comma, as the original clean-up did.
  emails <- sub(",.*$", "", emails)
  build_people(people_name, emails, url)
}

# Scrape each group with the matching extractor, preserving the original row
# order (entries 1-16, then 17-19, then 20-22). Entries 23 and above of the
# filtered vector are not scraped, as in the original script.
# Original author's notes: entries 1-16 gave 1168 people, 17-19 gave 36 people.
page_results <- c(
  lapply(departments[1:16], scrape_profile_page),
  lapply(departments[17:19], scrape_table_page),
  lapply(departments[20:22], scrape_profile_page)
)

# Bind once at the end instead of growing `df` with rbind() inside the loops.
df <- do.call(rbind, page_results)

#save(df, file="Sweden4_2.RData")

推荐阅读