首页 > 解决方案 > 使用 RSelenium 抓取网页并保存为数据框时如何编写 “for” 循环

问题描述

我最近在尝试使用 RSelenium,从一个 URL 列表中抓取网页数据。我希望把从每个 URL 抓取到的数据合并到同一个数据框中。但实际得到的数据框除了包含所需数据外,还混入了诸如“checkStatus”、“statusClass”之类的无关内容。这个问题用文字不太好描述,希望下面的代码能更清楚地说明。

# Packages the script depends on: RSelenium supplies remoteDriver(), and
# dplyr supplies bind_rows(), which the scraping loop further down uses.
library(RSelenium)
library(dplyr)

# Client for a Selenium server expected on localhost:4444, driving Chrome.
remDr <- remoteDriver(
  remoteServerAddr = "localhost",
  port = 4444,
  browserName = "chrome"
)

# Start the browser session; every later remDr$... call goes through it.
remDr$open()

# Player stats pages to scrape, one URL per player.
URL_list <- c("https://www.premierleague.com/players/4183/Ahmed-El-Mohamady/stats?co=1&se=363")

# Webscrape function
#
# Reads every stat shown on the player page that is currently loaded in the
# browser and returns them as a one-row data.frame.
#
# Args:
#   link_element: Ignored. It is kept only so that existing calls such as
#     lapply(x, ScrapeDF) keep working; every element is looked up afresh
#     through `remDr`.
#
# Returns:
#   A one-row data.frame whose first column, "Position", holds the value of
#   `Text`, followed by one column per stat (38 stat columns), each holding
#   the element's text as scraped.
#
# Relies on two variables defined outside the function: `remDr` (the browser
# session) and `Text` (the player's position, set by the calling loop).
# Any failed element lookup propagates as an error from remDr$findElement().
ScrapeDF <- function(link_element){
  # Output column name -> CSS selector of the element holding that stat.
  # The order here is the column order of the returned data.frame.
  stat_selectors <- c(
    # General
    Appearance       = ".statappearances",
    Wins             = ".statwins",
    Losses           = ".statlosses",
    # Attack
    Goals            = ".statgoals",
    HeadedGoals      = ".statatt_hd_goal",
    RightFootGoal    = ".statatt_rf_goal",
    LeftFootGoal     = ".statatt_lf_goal",
    Woodwork         = ".stathit_woodwork",
    # Discipline
    YellowCard       = ".statyellow_card",
    RedCard          = ".statred_card",
    Fouls            = ".statfouls",
    Offside          = ".stattotal_offside",
    # Team play
    Assist           = ".statgoal_assist",
    Passes           = ".stattotal_pass",
    PassperMatch     = ".stattotal_pass_per_game",
    BigChanceCreated = ".statbig_chance_created",
    Crosses          = ".stattotal_cross",
    CrossAcc         = ".statcross_accuracy",
    ThroughBall      = ".stattotal_through_ball",
    AccLongBall      = ".stataccurate_long_balls",
    # Defence
    CleanSheet       = ".statclean_sheet",
    Conceded         = ".statgoals_conceded",
    Tackles          = ".stattotal_tackle",
    SuccessTackle    = ".stattackle_success",
    LastManTackle    = ".statlast_man_tackle",
    BlockedShots     = ".statblocked_scoring_att",
    Interceptions    = ".statinterception",
    Clearances       = ".stattotal_clearance",
    HeadedClearance  = ".stateffective_head_clearance",
    OffLineClearance = ".statclearance_off_line",
    Recoveries       = ".statball_recovery",
    DuelsWon         = ".statduel_won",
    DuelsLost        = ".statduel_lost",
    Successful50_50  = ".statwon_contest",
    AerialWon        = ".stataerial_won",
    AerialLost       = ".stataerial_lost",
    OwnGoal          = ".statown_goals",
    ErrorsToGoal     = ".staterror_lead_to_goal"
  )

  # Find one element by CSS selector and return its text as character.
  read_stat <- function(selector) {
    element <- remDr$findElement(using = "css selector", selector)
    as.character(element$getElementText())
  }

  # lapply() keeps the names of `stat_selectors`, so they become the column
  # names when the list is spliced into data.frame().
  stats <- lapply(stat_selectors, read_stat)
  do.call(data.frame, c(list(Position = Text), stats))
}
 

## For loop to webscrape
# Accumulator: starts as an empty data frame and is extended with bind_rows()
# at the end of every iteration.
CompletePlayerData <- data.frame(matrix(nrow = 0,ncol = 0))

# Visit each player page and scrape its stats when the player is a "Defender"
 for (url in URL_list) {
   remDr$navigate(url)
   # Fixed 4-second pause after navigating, presumably to let the page load.
   Sys.sleep(4)
   
   # Text of the first ".info" element. `Text` is a top-level variable, and
   # ScrapeDF() reads it to fill its "Position" column.
   Position <- remDr$findElement(using = "css selector",".info")
   Text <- as.character(Position$getElementText())
   
    if(Text == "Defender"){
     # NOTE(review): `Position` is the single element object returned by
     # findElement(), not a collection of players. lapply() appears to walk
     # over that object's own members and call ScrapeDF() once for each; the
     # reported column names (checkError.Position, checkStatus.Position, ...)
     # point the same way. A single ScrapeDF(Position) call is presumably
     # what was intended -- confirm.
     saved_list <- lapply(Position, ScrapeDF)    
               
   } else {
     # Fallback: try the ".info" element that follows another ".info" sibling.
     Position <- remDr$findElement(using = "css selector",".info~ .info")
     Text <- as.character(Position$getElementText()) 
     
     if(Text == "Defender"){
       saved_list <- lapply(Position, ScrapeDF)    
       
     } 
     }
 
   # NOTE(review): when neither lookup yields "Defender", `saved_list` is not
   # assigned in this iteration, so this line uses whatever an earlier
   # iteration left behind (or fails if nothing has assigned it yet).
   CompletePlayerData <- bind_rows(CompletePlayerData, saved_list)
 }

这段代码会返回一个包含 900 列的数据框,如下所示:

checkError.Position ...  checkStatus.Position ... nativeEvents.Appearance ...
 Defender          ...         Defender          ...      14                ...

所以我的问题是:

  1. 为什么它会返回这么多列?
  2. 有没有办法在合并数据时,让结果中的列只对应 “Position”、“Appearance”、“Goals” 等?

我想提前为长代码道歉

标签: r, web-scraping, rselenium

解决方案


library(RSelenium)
library(tidyverse)

# Player stats pages to scrape, one URL per player.
URL_list <- "https://www.premierleague.com/players/4183/Ahmed-El-Mohamady/stats?co=1&se=363"

# Client for a Selenium server expected on localhost:4444, driving Chrome.
remDr <- remoteDriver(
  browserName = "chrome",
  remoteServerAddr = "localhost",
  port = 4444
)

# Start the browser session; every later remDr$... call goes through it.
remDr$open()

# Webscrape function
#
# Reads every stat shown on the player page that is currently loaded in the
# browser and returns them as a one-row data.frame.
#
# Args:
#   link_element: Ignored. It is kept only so that existing calls such as
#     lapply(x, ScrapeDF) keep working; every element is looked up afresh
#     through `remDr`.
#
# Returns:
#   A one-row data.frame whose first column, "Position", holds the value of
#   `Text`, followed by one column per stat (38 stat columns), each holding
#   the element's text as scraped.
#
# Relies on two variables defined outside the function: `remDr` (the browser
# session) and `Text` (the player's position, set by the calling loop).
# Any failed element lookup propagates as an error from remDr$findElement().
ScrapeDF <- function(link_element){
  # Output column name -> CSS selector of the element holding that stat.
  # The order here is the column order of the returned data.frame.
  stat_selectors <- c(
    # General
    Appearance       = ".statappearances",
    Wins             = ".statwins",
    Losses           = ".statlosses",
    # Attack
    Goals            = ".statgoals",
    HeadedGoals      = ".statatt_hd_goal",
    RightFootGoal    = ".statatt_rf_goal",
    LeftFootGoal     = ".statatt_lf_goal",
    Woodwork         = ".stathit_woodwork",
    # Discipline
    YellowCard       = ".statyellow_card",
    RedCard          = ".statred_card",
    Fouls            = ".statfouls",
    Offside          = ".stattotal_offside",
    # Team play
    Assist           = ".statgoal_assist",
    Passes           = ".stattotal_pass",
    PassperMatch     = ".stattotal_pass_per_game",
    BigChanceCreated = ".statbig_chance_created",
    Crosses          = ".stattotal_cross",
    CrossAcc         = ".statcross_accuracy",
    ThroughBall      = ".stattotal_through_ball",
    AccLongBall      = ".stataccurate_long_balls",
    # Defence
    CleanSheet       = ".statclean_sheet",
    Conceded         = ".statgoals_conceded",
    Tackles          = ".stattotal_tackle",
    SuccessTackle    = ".stattackle_success",
    LastManTackle    = ".statlast_man_tackle",
    BlockedShots     = ".statblocked_scoring_att",
    Interceptions    = ".statinterception",
    Clearances       = ".stattotal_clearance",
    HeadedClearance  = ".stateffective_head_clearance",
    OffLineClearance = ".statclearance_off_line",
    Recoveries       = ".statball_recovery",
    DuelsWon         = ".statduel_won",
    DuelsLost        = ".statduel_lost",
    Successful50_50  = ".statwon_contest",
    AerialWon        = ".stataerial_won",
    AerialLost       = ".stataerial_lost",
    OwnGoal          = ".statown_goals",
    ErrorsToGoal     = ".staterror_lead_to_goal"
  )

  # Find one element by CSS selector and return its text as character.
  read_stat <- function(selector) {
    element <- remDr$findElement(using = "css selector", selector)
    as.character(element$getElementText())
  }

  # lapply() keeps the names of `stat_selectors`, so they become the column
  # names when the list is spliced into data.frame().
  stats <- lapply(stat_selectors, read_stat)
  do.call(data.frame, c(list(Position = Text), stats))
}

## For loop to webscrape

# ScrapeDF() wrapped so that a page with a missing stat yields an empty
# tibble (with the error message printed) instead of aborting the whole run.
safe_scrape <- possibly(ScrapeDF, otherwise = tibble(), quiet = FALSE)

# One slot per URL, filled inside the loop and combined once afterwards.
# Slots of skipped players stay NULL, which bind_rows() ignores.
player_rows <- vector("list", length(URL_list))

# Scrape the stats of every player whose position is "Defender"
for (i in seq_along(URL_list)) {
  remDr$navigate(URL_list[[i]])
  Sys.sleep(4)  # fixed pause, presumably so the page can finish loading

  Position <- remDr$findElement(using = "css selector", ".info")
  # `Text` has to stay a top-level variable: ScrapeDF() reads it to fill its
  # "Position" column.
  Text <- as.character(Position$getElementText())

  # Skip anyone who is not a defender. Nothing is stored for them, so data
  # from a previously scraped player can never be reused by accident.
  if (!identical(Text, "Defender")) {
    next
  }

  # Scrape the page exactly once. Using lapply(Position, ...) here would call
  # the scraper once per member of the element object, which appears to be
  # where the duplicated, prefixed columns came from.
  player_rows[[i]] <- safe_scrape(Position)
}

# One row per scraped defender, one column per stat.
CompletePlayerData <- as_tibble(bind_rows(player_rows))

# Each player is already a single row, so no de-duplication is needed.
# Note: these columns only exist once at least one defender was scraped.
CompletePlayerData %>%
  select(Position, Appearance, Goals)

输出:

# A tibble: 1 x 3
  Position Appearance Goals
  <fct>    <fct>      <fct>
1 Defender 14         0   

推荐阅读