首页 > 解决方案 > 如何加快我的功能,特别是 ggplot 命令?

问题描述

我整理了一个函数来识别异常值。它需要一个数据框,然后用线条显示数据图,以指示潜在的异常值。它也会给出一个标有异常值的表格。

但是,它很慢。问题是绘图需要很长时间才能加载。

我很好奇你是否有关于如何加快速度的建议。

相关:默认绘图系统是否比 ggplot 更快?

我将从依赖项开始

#These next four functions are not mine. They're used in GetOutliers()

ExtractDetails <- function(x, down, up){

  outClass <- rep("N", length(x))
  indexLo <- which(x < down)
  indexHi <- which(x > up)
  outClass[indexLo] <- "L"
  outClass[indexHi] <- "U"
  index <- union(indexLo, indexHi)
  values <- x[index]
  outClass <- outClass[index]
  nOut <- length(index)
  maxNom <- max(x[which(x <= up)])
  minNom <- min(x[which(x >= down)])
  outList <- list(nOut = nOut, lowLim = down,
                  upLim = up, minNom = minNom,
                  maxNom = maxNom, index = index,
                  values = values,
                  outClass = outClass)
  return(outList)
}

Hampel <- function(x, t = 3){
  #
  mu <- median(x, na.rm = TRUE)
  sig <- mad(x, na.rm = TRUE)
  if (sig == 0){
    message("Hampel identifer implosion: MAD scale estimate is zero")
  }
  up<-mu+t*sig
  down<-mu-t*sig
  out <- list(up = up, down = down)
  return(out)
}


ThreeSigma <- function(x, t = 3){
  #
  mu <- mean(x, na.rm = TRUE)
  sig <- sd(x, na.rm = TRUE)
  if (sig == 0){
    message("All non-missing x-values are identical")
  }
  up<-mu+t* sig
  down<-mu-t * sig
  out <- list(up = up, down = down)
  return(out)
}

BoxplotRule <- function(x, t = 1.5){
  #
  xL <- quantile(x, na.rm = TRUE, probs = 0.25, names = FALSE)
  xU <- quantile(x, na.rm = TRUE, probs = 0.75, names = FALSE)
  Q<-xU-xL
  if(Q==0){
    message("Boxplot rule implosion: interquartile distance is zero")
  }
  up<-xU+t*Q
  down<-xU-t*Q
  out <- list(up = up, down = down)
  return(out)
}

FindOutliers <- function(x, t3 = 3, tH = 3, tb = 1.5){
  threeLims <- ThreeSigma(x, t = t3)
  HampLims <- Hampel(x, t = tH)
  boxLims <- BoxplotRule(x, t = tb)

  n <- length(x)
  nMiss <- length(which(is.na(x)))

  threeList <- ExtractDetails(x, threeLims$down, threeLims$up)
  HampList <- ExtractDetails(x, HampLims$down, HampLims$up)
  boxList <- ExtractDetails(x, boxLims$down, boxLims$up)

  sumFrame <- data.frame(method = "ThreeSigma", n = n,
                         nMiss = nMiss, nOut = threeList$nOut,
                         lowLim = threeList$lowLim,
                         upLim = threeList$upLim,
                         minNom = threeList$minNom,
                         maxNom = threeList$maxNom)
  upFrame <- data.frame(method = "Hampel", n = n,
                         nMiss = nMiss, nOut = HampList$nOut,
                         lowLim = HampList$lowLim,
                         upLim = HampList$upLim,
                         minNom = HampList$minNom,
                         maxNom = HampList$maxNom)
  sumFrame <- rbind.data.frame(sumFrame, upFrame)
  upFrame <- data.frame(method = "BoxplotRule", n = n,
                         nMiss = nMiss, nOut = boxList$nOut,
                         lowLim = boxList$lowLim,
                         upLim = boxList$upLim,
                         minNom = boxList$minNom,
                         maxNom = boxList$maxNom)
  sumFrame <- rbind.data.frame(sumFrame, upFrame)

  threeFrame <- data.frame(index = threeList$index,
                         values = threeList$values,
                         type = threeList$outClass)
  HampFrame <- data.frame(index = HampList$index,
                        values = HampList$values,
                        type = HampList$outClass)
  boxFrame <- data.frame(index = boxList$index,
                       values = boxList$values,
                       type = boxList$outClass)
  outList <- list(summary = sumFrame, threeSigma = threeFrame,
                Hampel = HampFrame, boxplotRule = boxFrame)
  return(outList)
}

#strip non-numeric variables out of a dataframe
num_vars <- function(df){
  X <- which(sapply(df, is.numeric))
  num_vars <- df[names(X)]
  return(num_vars)
}

这是功能

GetOutliers <- function(df){
  library('dplyr')
  library('ggplot2')

  #strip out the non-numeric columns
  df_out <- num_vars(df)

  #initialize the data frame
  df_out$Hampel <- NA
  df_out$threeSigma <- NA
  df_out$boxplotRule <- NA
  df_out_id <- df_out

  #identify outliers for each column
  for (i in 1:length(names(num_vars(df)))){

    #find the outliers
    Outs <- FindOutliers(df_out[[i]])
    OutsSum <- Outs$summary

    #re-enter the outlier status
    df_out$Hampel <- NA
    df_out$threeSigma <- NA
    df_out$boxplotRule <- NA
    ifelse(is.na(Outs$Hampel), print(), df_out[unlist(Outs$Hampel[1]),]$Hampel <- TRUE)
    ifelse(is.na(Outs$threeSigma), print(), df_out[unlist(Outs$threeSigma[1]),]$threeSigma <- TRUE)
    ifelse(is.na(Outs$boxplotRule), print(), df_out[unlist(Outs$boxplotRule[1]),]$boxplotRule <- TRUE)

    #visualize the outliers and print outlier information
    Temp <- df_out
    A <- colnames(Temp)[i]
    AA <- paste(A,"Index")
    colnames(Temp)[i] <- 'curr_column'

    #table with outlier status
    X <- arrange(subset(Temp,Hampel == TRUE | boxplotRule == TRUE | threeSigma == TRUE), desc(curr_column))

    #scatterplot with labels
    Y <- ggplot(Temp,aes(seq_along(curr_column),curr_column)) + geom_point() +
      geom_hline(yintercept=OutsSum$lowLim[1],linetype = 'dashed') +
      geom_hline(yintercept=OutsSum$lowLim[2],linetype = 'dashed') +
      geom_hline(yintercept=OutsSum$lowLim[3],linetype = 'dashed') +
      geom_hline(yintercept=OutsSum$upLim[1],linetype = 'dashed') +
      geom_hline(yintercept=OutsSum$upLim[2],linetype = 'dashed') +
      geom_hline(yintercept=OutsSum$upLim[3],linetype = 'dashed') +
      geom_text(aes(40,OutsSum$lowLim[1],label="ThreeSigma Lower",vjust=-1)) +
      geom_text(aes(40,OutsSum$lowLim[2],label="Hampel Lower",vjust=-1)) +
      geom_text(aes(40,OutsSum$lowLim[3],label="Boxplot Lower",vjust=-1)) +
      geom_text(aes(40,OutsSum$upLim[1],label="ThreeSigma Upper",vjust=-1)) +
      geom_text(aes(40,OutsSum$upLim[2],label="Hampel Upper",vjust=-1)) +
      geom_text(aes(40,OutsSum$upLim[3],label="Boxplot Upper",vjust=-1)) +
      xlab(AA) + ylab(A)

    #scatterplot without labels
    Z <- ggplot(Temp,aes(seq_along(curr_column),curr_column)) + geom_point() +
      geom_hline(yintercept=OutsSum$lowLim[1],linetype = 'dashed') +
      geom_hline(yintercept=OutsSum$lowLim[2],linetype = 'dashed') +
      geom_hline(yintercept=OutsSum$lowLim[3],linetype = 'dashed') +
      geom_hline(yintercept=OutsSum$upLim[1],linetype = 'dashed') +
      geom_hline(yintercept=OutsSum$upLim[2],linetype = 'dashed') +
      geom_hline(yintercept=OutsSum$upLim[3],linetype = 'dashed') +
      xlab(AA) + ylab(A)

    U <- ggplot(Temp,aes(curr_column)) + geom_density() + xlab(A)

    print(A)
    print(X)
    print(OutsSum)
    print(Z)
    print(Y)
    print(U)

    #mark the extreme outliers, the rest are reasonable outliers
    A <- colnames(df_out_id[i])
    Q <- as.numeric(readline(prompt="Enter the index for final Extreme value on the upper limit (if none, enter 0): "))
    W <- as.numeric(readline(prompt="Enter the index for first Extreme value on the lower limit (if none, enter 0): "))
    col <- df_out_id[i]
    df_out_id[i] <- sapply(col[[1]], function(x){
      if(Q>1 & x %in% X$curr_column[1:Q]) return('Extreme')
      if(W>1 & x %in% X$curr_column[W:length(X$curr_column)]) return('Extreme')
      else if (x %in% X$curr_column[Q+1:length(X$curr_column)]) return('Reasonable')
      else return('Non-Outlier')
    })

  }

  #return a dataframe with outlier status, excluding the outlier ID columns
  summary(df_out_id)
  return(df_out_id[1:(length(names(df_out_id))-3)])
}

例子

library('ISLR')
data(Carseats)

GetOutliers(Carseats)

它会显示每个数值变量的异常值。

在此处输入图像描述

它将绘制变量密度,然后绘制带有标识符线的散点图

在此处输入图像描述

它还将接受输入,因此您可以将一些异常值标记为合理,将其他异常值标记为极端

标签: rggplot2

解决方案


推荐阅读