首页 > 解决方案 > 使用 R 抓取网站的 Tableau 仪表板

问题描述

我正在尝试从该站点的密苏里州 COVID 人口统计数据中抓取数据并将其放入 RStudio。另外,这是我从 HTML 中找到的 Tableau URL:https ://results.mo.gov/t/COVID19/views/Demographics/Public-Demographics?:embed=y&:showVizHome=no&:host_url=https%3A% 2F%2Fresults.mo.gov%2F&:embed_code_version=3&:tabs=no&:toolbar=no&:showAppBanner=false&:%E2%80%98iframeSizedToWindow%E2%80%99=%E2%80%98true%E2%80% 99&:dataDetails=no&:display_spinner=no&:loadOrderID=0

我一直在这里使用这个论坛指导如何将公共画面仪表板刮到 R 中,但它似乎没有为我的数据框提供任何行或列。

这是我尝试重新创建的代码:

library(rvest)
library(rjson)
library(httr)
library(stringr)

tableauHost <- "https://results.mo.gov"

url <- httr::modify_url(tableauHost,
                        path = "/t/COVID19/views/Demographics/Public-Demographics",
                        query = list(":embed" = "y",
                                     ":showVizHome" = "no",
                                     ":host_url" = "https://results.mo.gov/",
                                     ":embed_code_version" = 3,
                                     ":tabs" = "no",
                                     ":toolbar" = "no",
                                     ":showAppBanner" = "false",
                                     ":'iframeSizedToWindow'" = "'true'",
                                     ":dataDetails" = "no",
                                     ":display_spinner" = "no",
                                     ":loadOrderID" = 0)
                        )

body <- read_html(url)
data <- body %>% 
  html_nodes("textarea#tsConfigContainer") %>% 
  html_text()
json <- fromJSON(data)

url <- modify_url(tableauHost, path = paste(json$vizql_root, "/bootstrapSession/sessions/", json$sessionid, sep =""))

resp <- POST(url, body = list(sheet_id = json$sheetId), encode = "form")
data <- content(resp, "text")

extract <- str_match(data, "\\d+;(\\{.*\\})\\d+;(\\{.*\\})")
info <- fromJSON(extract[1,1])
data <- fromJSON(extract[1,3])

worksheet <- "+ PCR by age"

columnsData <- data$secondaryInfo$presModelMap$vizData$presModelHolder$genPresModelMapPresModel$presModelMap[[worksheet]]$presModelHolder$genVizDataPresModel$paneColumnsData

i <- 1
result <- list();
for(t in columnsData$vizDataColumns){
  if (is.null(t[["localBaseColumnName"]]) == FALSE) {
    result[[i]] <- list(
      localBaseColumnName = t[["localBaseColumnName"]], 
      valueIndices = columnsData$paneColumnsList[[t$paneIndices + 1]]$vizPaneColumns[[t$columnIndices + 1]]$valueIndices,
      aliasIndices = columnsData$paneColumnsList[[t$paneIndices + 1]]$vizPaneColumns[[t$columnIndices + 1]]$aliasIndices, 
      dataType = t[["dataType"]],
      stringsAsFactors = FALSE
    )
    i <- i + 1
  }
}
dataFull = data$secondaryInfo$presModelMap$dataDictionary$presModelHolder$genDataDictionaryPresModel$dataSegments[["0"]]$dataColumns

buildOffset = function(x){
  return(if(x>=0) 0 else -1)
}

data_index <- 1
name_index <- 1
frameData <-  list();
frameNames <- c()
for(t in dataFull) {
  for(index in result) {
    if (t$dataType == "cstring"){
      if (length(index$valueIndices) > 0) {
        j <- 1
        vector <- character(length(index$valueIndices))
        for (it in index$valueIndices){
          vector[j] <- t$dataValues[abs(it)+1]
          j <- j + 1
        }
        frameData[[data_index]] <- vector
        frameNames[[name_index]] <- paste(index$localBaseColumnName, "value", sep="-")
        data_index <- data_index + 1
        name_index <- name_index + 1
      }
      if (length(index$aliasIndices) > 0) {
        j <- 1
        vector <- character(length(index$aliasIndices))
        for (it in index$aliasIndices){
          vector[j] <- t$dataValues[abs(it) + buildOffset(it) + 1]
          j <- j + 1
        }
        frameData[[data_index]] <- vector
        frameNames[[name_index]] <- paste(index$localBaseColumnName, "alias", sep="-")
        data_index <- data_index + 1
        name_index <- name_index + 1
      }
    }
  }
}

columnToKeep = c('[Student Aid Program Type]-value','[Student Aid Program]-value', '[:Measure Names]-alias', '[Multiple Values]-alias')
df <- NULL
for(i in 1:length(frameNames)){
  if (frameNames[i] %in% columnToKeep){
    df[frameNames[i]] <- frameData[i]
  }
}
options(width = 1200) #for readability
df <- as.data.frame(df, stringsAsFactors = FALSE)
print(df)

但它只是给我这个作为输出

print(df)
data frame with 0 columns and 0 rows

我知道columnToKeep我们为数据框定义列的部分与我正在寻找的数据不匹配,那么不正确的列数会影响这一点吗?我一直在尝试破译原始代码并使用密苏里 COVID 的仪表板实现它,但每次看到它都让我非常困惑。对此的任何帮助将不胜感激!

标签: rweb-scrapingtableau-api

解决方案


问题是它需要以aliasIndices不同的方式处理。流程如下:

  • 选择下面的工作表

    data$secondaryInfo$presModelMap$vizData$presModelHolder$genPresModelMapPresModel$presModelMap
    

presModelHolder$genVizDataPresModel$paneColumnsData

  • 获取下的所有列vizDataColumns,注意fieldCaption作为列名
  • 注意所有列的paneIndicesandcolumnIndices
  • 对于每一列,在 下paneColumnsList,将行分配给指定的 columnIndices 以及在vizPaneColumnswhich 下给出的值valueIndices以及aliasIndices哪些是字典中实际数据的索引

在字典中(在 下data$secondaryInfo$presModelMap$dataDictionary$presModelHolder$genDataDictionaryPresModel$dataSegments),您拥有完整的数据。您只需要匹配您之前提取的索引。为了匹配这些索引,您需要让dataType列的 知道要在字典中选择哪个对象,然后:

  • valueIndices仅在列表中分配索引
  • 对于aliasIndices,如果值 > 0 分配索引,如果它是负数,则将列表中的索引与dataType“cstring”匹配

我不确定这个算法是否适用于所有画面数据,但对于我测试过的一些数据来说效果很好:

library(rvest)
library(rjson)
library(httr)
library(stringr)

#replace the hostname and the path if necessary
host_url <- "https://results.mo.gov"
path <- "/t/COVID19/views/Demographics/Public-Demographics"

body <- read_html(modify_url(host_url, 
                             path = path, 
                             query = list(":embed" = "y",":showVizHome" = "no")
))

data <- body %>% 
  html_nodes("textarea#tsConfigContainer") %>% 
  html_text()
json <- fromJSON(data)

url <- modify_url(host_url, path = paste(json$vizql_root, "/bootstrapSession/sessions/", json$sessionid, sep =""))

resp <- POST(url, body = list(sheet_id = json$sheetId), encode = "form")
data <- content(resp, "text")

extract <- str_match(data, "\\d+;(\\{.*\\})\\d+;(\\{.*\\})")
info <- fromJSON(extract[1,1])
data <- fromJSON(extract[1,3])

worksheets = names(data$secondaryInfo$presModelMap$vizData$presModelHolder$genPresModelMapPresModel$presModelMap)

for(i in 1:length(worksheets)){
  print(paste("[",i,"] ",worksheets[i], sep=""))
}
selected <-  readline(prompt="select worksheet by index: ");
worksheet <- worksheets[as.integer(selected)]
print(paste("you selected :", worksheet, sep=" "))

columnsData <- data$secondaryInfo$presModelMap$vizData$presModelHolder$genPresModelMapPresModel$presModelMap[[worksheet]]$presModelHolder$genVizDataPresModel$paneColumnsData

i <- 1
result <- list();
for(t in columnsData$vizDataColumns){
  if (is.null(t[["fieldCaption"]]) == FALSE) {
    paneIndex <- t$paneIndices
    columnIndex <- t$columnIndices
    if (length(t$paneIndices) > 1){
      paneIndex <- t$paneIndices[1]
    }
    if (length(t$columnIndices) > 1){
      columnIndex <- t$columnIndices[1]
    }
    result[[i]] <- list(
      fieldCaption = t[["fieldCaption"]], 
      valueIndices = columnsData$paneColumnsList[[paneIndex + 1]]$vizPaneColumns[[columnIndex + 1]]$valueIndices,
      aliasIndices = columnsData$paneColumnsList[[paneIndex + 1]]$vizPaneColumns[[columnIndex + 1]]$aliasIndices, 
      dataType = t[["dataType"]],
      stringsAsFactors = FALSE
    )
    i <- i + 1
  }
}
dataFull = data$secondaryInfo$presModelMap$dataDictionary$presModelHolder$genDataDictionaryPresModel$dataSegments[["0"]]$dataColumns

cstring <- list();
for(t in dataFull) {
  if(t$dataType == "cstring"){
    cstring <- t
    break
  }
}
data_index <- 1
name_index <- 1
frameData <-  list()
frameNames <- c()
for(t in dataFull) {
  for(index in result) {
    if (t$dataType == index["dataType"]){
      if (length(index$valueIndices) > 0) {
        j <- 1
        vector <- character(length(index$valueIndices))
        for (it in index$valueIndices){
          vector[j] <- t$dataValues[it+1]
          j <- j + 1
        }
        frameData[[data_index]] <- vector
        frameNames[[name_index]] <- paste(index$fieldCaption, "value", sep="-")
        data_index <- data_index + 1
        name_index <- name_index + 1
      }
      if (length(index$aliasIndices) > 0) {
        j <- 1
        vector <- character(length(index$aliasIndices))
        for (it in index$aliasIndices){
          if (it >= 0){
            vector[j] <- t$dataValues[it+1]
          } else {
            vector[j] <- cstring$dataValues[abs(it)]
          }
          j <- j + 1
        }
        frameData[[data_index]] <- vector
        frameNames[[name_index]] <- paste(index$fieldCaption, "alias", sep="-")
        data_index <- data_index + 1
        name_index <- name_index + 1
      }
    }
  }
}

df <- NULL
lengthList <- c()
for(i in 1:length(frameNames)){
  lengthList[i] <- length(frameData[[i]])
}
max <- max(lengthList)
for(i in 1:length(frameNames)){
  if (length(frameData[[i]]) < max){
    len <- length(frameData[[i]])
    frameData[[i]][(len+1):max]<-""
  }
  df[frameNames[[i]]] <- frameData[i]
}
options(width = 1200)
df <- as.data.frame(df, stringsAsFactors = FALSE)
print(df)

我在这里创建了一个包含 R 和 Python 脚本的存储库


推荐阅读