首页 > 解决方案 > 如何在 R 中进行连续子集化并在每一步绘制计数?

问题描述

我有一个包含 16 个变量的 1401 个观察值的数据框。对于每一列(第一列除外),我有 1(如果满足条件)或 0(如果不满足条件)。我根据所有条件(所有列)以不同的顺序对我的数据进行子集化(例如,第 1 列、第 2 列、第 3 列等或第 3 列、第 12 列、第 1 列)。为了比较不同的场景,我绘制了子集的每个步骤之后的观察次数。

我编写了我的代码并且它可以工作,但是它非常混乱且冗长,它肯定会从您的建议中受益。

dput(droplevels(head(data,20)))
structure(list(Substance = structure(c(13L, 9L, 10L, 12L, 1L, 
19L, 16L, 17L, 5L, 2L, 14L, 7L, 4L, 6L, 20L, 18L, 15L, 3L, 11L, 
8L), .Label = c("104653-34-1", "107-02-8", "111-30-8", "12057-74-8", 
"122454-29-9", "14915-37-8", "20859-73-8", "27083-27-8", "28772-56-7", 
"3691-35-8", "55965-84-9", "56073-07-5", "56073-10-0", "5836-29-3", 
"71751-41-2", "74-90-8", "81-81-2", "86347-14-0", "90035-08-8", 
"91465-08-6"), class = "factor"), colA = c(1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
    colB = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L), colC = c(1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L), colD = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L), colE = c(0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 
    1L, 1L), colF = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L), colG = c(0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 
    1L), colH = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), colI = c(0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L
    ), colK = c(1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 
    0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L), colJ = c(0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 
    0L, 0L), colL = c(1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 
    0L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 1L), colM = c(NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_), colN = c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L), colO = c(1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L)), .Names = c("Substance", 
"Oral", "Dermal", "Inhalation", "SC", "SED", "RS", "SS", "M", 
"C", "R", "STOT.SE", "STOT.RE", "AT", "Eco.Acute", "Eco.Chronic"
), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 9L, 10L, 12L, 13L, 
14L, 17L, 18L, 19L, 20L, 21L, 22L, 28L, 34L), class = "data.frame")
#scenario A
#I count the number of observations for each condition
count_0 <- count(data)
count_1 <- sum(data$colA == 1)
count_2 <- sum(data$colA == 1 & data$colB == 1)
count_3 <- sum(data$colA == 1 & data$colB == 1 & data$colC == 1)
count_4 <- sum(data$colA == 1 & data$colB == 1 & data$colC == 1 & data$colD == 1)
count_5 <- sum(data$colA == 1 & data$colB == 1 & data$colC == 1& data$colD == 1 & data$colE == 1)
count_6 <- sum(data$colA == 1 & data$colB == 1 & data$colC == 1 & data$colD == 1 & data$colE == 1 & data$colF == 1)
count_7 <- sum(data$colA == 1 & data$colB == 1 & data$colC == 1& data$colD == 1 & data$colE == 1 & data$colF == 1 & data$colG == 1)
count_8 <- sum(data$colA == 1 & data$colB == 1 & data$colC == 1 & data$colD == 1 & data$colE == 1 & data$colF == 1 & data$colG == 1 & data$colH == 1)
count_9 <- sum(data$colA == 1 & data$colB == 1 & data$colC == 1 & data$colD == 1 & data$colE == 1 & data$colF == 1 & data$colG == 1& data$colH == 1 &                 data$colI == 1)
count_10 <- sum(data$colA == 1 & data$colB == 1 & data$colC == 1 & data$colD == 1 & data$colE == 1 & data$colF == 1 & data$colG == 1& data$colH == 1 & data$colI == 1 & data$colJ == 1)
count_11 <- sum(data$colA == 1 & data$colB == 1 & data$colC == 1 & data$colD == 1 & data$colE == 1 & data$colF == 1 & data$colG == 1 & data$colH == 1 &                  data$colI == 1 & data$colJ == 1 &  data$colK == 1)
count_12 <- sum(data$colA == 1 & data$colB == 1 & data$colC == 1 & data$colD == 1 & data$colE == 1 & data$colF == 1 & data$colG == 1 & data$colH == 1 &                  data$colI == 1 & data$colJ == 1 &  data$colK == 1& data$colL == 1)
count_13 <- sum(data$colA == 1 & data$colB == 1 &  data$colC == 1 & data$colD == 1 & data$colE == 1 & data$colF == 1 &  data$colG == 1& data$colH == 1 &                  data$colI == 1& data$colJ == 1 & data$colK == 1 & data$colL == 1 & data$colM == 1)
count_14 <- sum(data$colA == 1 & data$colB == 1 & data$colC == 1 & data$colD == 1 & data$colE == 1 & data$colF == 1 & data$colG == 1 & data$colH == 1 & data$colI == 1 & data$colJ == 1 & data$colK == 1 & data$colL == 1 & data$colM == 1 & data$colN == 1)
count_15 <- sum(data$colA == 1 & data$colB == 1 & data$colC == 1& data$colD == 1 & data$colE == 1& data$colF == 1 & data$colG == 1 & data$colH == 1 &               data$colI == 1 & data$colJ == 1 & data$colK == 1 & data$colL == 1 & data$colM == 1 & data$colN == 1 & data$colO == 1)
scenarioA <- rbind(count_0,count_1,count_2,count_3,count_4,count_5,count_6,count_7,count_8,count_9,count_10,count_11,count_12,count_13, count_14,count_15)
#I add a column to the new dataframe "scenarioA" to indicate that the results correspond to scenario A (for later use for ggplot)
scenario <- c("scenarioA","scenarioA","scenarioA","scenarioA","scenarioA","scenarioA","scenarioA","scenarioA","scenarioA",            "scenarioA","scenarioA","scenarioA","scenarioA","scenarioA","scenarioA","scenarioA")
scenarioA <- cbind(rownames(scenarioA),scenarioA, scenario)
rownames(scenarioA) <- NULL
colnames(scenarioA) <- c("endpoint","hits","scenario")
#I repeat the same for scenario B but with different order in the subsetting
count_1 <- sum(data$colM == 1)
count_2 <- sum(data$colM == 1 & data$colC == 1)
count_3 <- sum(data$colM == 1 & data$colC == 1 & data$colE == 1)
count_4 <- sum(data$colM == 1 & data$colC == 1 & data$colE == 1 & data$colH == 1)
count_5 <- sum(data$colM == 1 & data$colC == 1 & data$colE == 1 & data$colH == 1 & data$colF == 1)
count_6 <- sum(data$colM == 1 & data$colC == 1 & data$colE == 1 & data$colH == 1 & data$colF == 1 & data$colA == 1)
count_7 <- sum(data$colM == 1 & data$colC == 1 & data$colE == 1 & data$colH == 1 & data$colF == 1 & data$colA == 1 &  data$colD == 1)
count_8 <- sum(data$colM == 1 & data$colC == 1 & data$colE == 1 & data$colH == 1 & data$colF == 1 & data$colA == 1 & data$colD == 1 & data$colO == 1)
count_9 <- sum(data$colM == 1 & data$colC == 1 & data$colE == 1 & data$colH == 1 & data$colF == 1 & data$colA == 1 & data$colD == 1 & data$colO == 1 & data$colI == 1)
count_10 <- sum(data$colM == 1 & data$colC == 1 & data$colE == 1 & data$colH == 1 & data$colF == 1 & data$colA == 1 & data$colD == 1 & data$colO == 1 & data$colI == 1 & data$colG == 1)
count_11 <- sum(data$colM == 1 & data$colC == 1 & data$colE == 1 & data$colH == 1 & data$colF == 1 & data$colA == 1 & data$colD == 1 & data$colO == 1 & data$colI == 1 & data$colG == 1 & data$colB == 1)
count_12 <- sum(data$colM == 1 & data$colC == 1 & data$colE == 1 & data$colH == 1 & data$colF == 1 & data$colA == 1 & data$colD == 1 & data$colO == 1 & data$colI == 1 & data$colG == 1 & data$colB == 1& data$colJ == 1)
count_13 <- sum(data$colM == 1 & data$colC == 1 & data$colE == 1 & data$colH == 1 & data$colF == 1 & data$colA == 1 & data$colD == 1 & data$colO == 1 & data$colI == 1 & data$colG == 1 & data$colB == 1& data$colJ == 1 & data$colL == 1)
count_14 <- sum(data$colM == 1 & data$colC == 1 & data$colE == 1 & data$colH == 1 & data$colF == 1 & data$colA == 1 & data$colD == 1 & data$colO == 1 & data$colI == 1 & data$colG == 1 & data$colB == 1& data$colJ == 1 & data$colL == 1 & data$colN == 1)
count_15 <- sum(data$colM == 1 & data$colC == 1 & data$colE == 1 & data$colH == 1 & data$colF == 1 & data$colA == 1 & data$colD == 1 & data$colO == 1 & data$colI == 1 & data$colG == 1 & data$colB == 1& data$colJ == 1 & data$colL == 1 & data$colN == 1 & data$colK == 1)
#I create a new data frame with the results from the scenario B
scenarioB <- rbind(count_0, count_1,count_2,count_3,count_4,count_5,count_6,count_7,                                count_8,count_9,count_10,count_11,count_12,count_13, count_14,count_15)
scenario <- c("scenarioB","scenarioB","scenarioB","scenarioB","scenarioB","scenarioB","scenarioB","scenarioB","scenarioB",       "scenarioB","scenarioB","scenarioB","scenarioB","scenarioB","scenarioB","scenarioB")
scenarioB <- cbind(rownames(scenarioB),scenarioB, scenario)
rownames(scenarioB) <- NULL
colnames(scenarioB) <- c("endpoint","hits","scenario")
#I create a new data frame with scenarios A & B
scenarios <- as.data.frame(rbind(scenarioA, scenarioB))
scenarios$endpoint <- as.character(c("count_0", "count_1","count_2","count_3","count_4","count_5","count_6","count_7",
           "count_8","count_9","count_10","count_11","count_12","count_13","count_14","count_15"))
scenarios$endpoint <- factor(scenarios$endpoint, levels=unique(scenarios$endpoint))
scenarios$hits <- as.numeric(as.character(scenarios$hits))
#I plot the results in one single graph to compare the 2 scenarios
library(ggplot2)
ggplot(scenarios, aes(x=endpoint, y=hits, color=scenario))+
  geom_point()+
  theme_minimal()+
  theme(axis.text.x = element_text(hjust=.5))

代码运行但并不简单。最终我将使用超过 2 个场景,所以我想找到一种更好的方法来编写这个。我想到了循环和/或嵌套。

标签: rloopsoptimizationnestedsubset

解决方案


好的,所以场景在列的顺序上有所不同。

colA <- colnames(data)[2:16]
colB <- colnames(data)[c(14, 4, 6, 9, 7, 2, 5, 16, 10, 8, 3, 11, 13, 15, 12)]

我们再做一件事来简化我们的工作,即用 NA 替换所有 0。如果 0 和 NA 之间的区别对您很重要,那么您的问题并不明显。我们将在下面使用 dplyr。

library(dplyr)
data <- data %>% mutate_all(~ ifelse(. == 0, NA, .))

现在,来计算一下。有很多选择,我选择了一个相对容易理解的:

count_runs <- function(data, cols) {
  data %>% select(cols) %>% drop_na %>% count %>% pull(n)
}

备择方案:

# without removing NA's
count_runs <- function(data, cols) {
  data %>% select(cols) %>% drop_na %>% 
     filter_all(all_vars(. == 1)) %>%    # important notice below!
     count %>% pull(n)
}

# without dplyr
# and without converting 0's to NA's
count_runs <- function(data, cols) {
    data <- data[, cols]
    sum(apply(data, 1, function(x) all(!is.na(x) & x == 1)))
}

所有这些函数都返回完全相同的结果。

此函数删除所有包含 NA 的行(即值不全部等于 1 的行)并计算剩余的行数。运行的结果count_runs(data, colA[1:3])例如是 84。

现在我们可以制作场景A和场景B:

 endpoints <- 0:15
 counts <- sapply(1:15, function(i) count_runs(data, colA[1:i]))
 counts <- c(nrow(data), counts)
 scenarioA <- data.frame(endpoint=endpoints, hits=counts, scenario="scenarioA")
 counts <- sapply(1:15, function(i) count_runs(data, colB[1:i]))
 counts <- c(nrow(data), counts)
 scenarioB <- data.frame(endpoint=endpoints, hits=counts, scenario="scenarioB")
 scenarios <- rbind(scenarioA, scenarioB)

最后,剧情:

 ggplot(scenarios, aes(x=endpoint, y=hits, color=scenario, group=scenario)) + 
       geom_point() + geom_line() + theme_minimal()

在此处输入图像描述

现在,有几点意见。

  • 您提供的代码不起作用。首先,在一行中你有“M”、“C”等而不是“colM”、“colC”等。然后,计数是 NA,因为你做sum(...)了而不是sum(..., na.rm=T)并且一些列填充了NA的。
  • as.numeric(as.character(hits))– 我假设你曾被一个看起来像数字的因素伤害过。这不是解决问题的方法;sum()将始终返回一个数字,并且您始终可以检查数据类型是否正确。
  • 为什么要转换endpoint为字符值?它在带有所有这些标签的情节上看起来并不好,它实际上对应于一系列连续的数字。
  • 重要提示:您的数据是double类型(浮点)。这意味着您永远不应该使用这些数据==%in%对这些数据做任何事情。要么使用all_equal,或者更好的是,将数据类型更改为整数。
  • 下次您发布此类问题时,请帮助那些想帮助您的人:选择您的数据子集并使用dput它直接将其嵌入您的帖子中。实际上,即使我已经回答了您的问题,您也应该这样做,以便其他可能对您的问题感兴趣的人。

推荐阅读