首页 > 解决方案 > 如果在 R 中使用超过 1 个组,na.rm 函数将不起作用

问题描述

在这篇文章中(在 R 中按分组变量选取某些观察值之前的组,并对 NA 进行控制),当只使用一个分组变量时,添加 na.rm=T 解决了问题。但在新数据中有三个分组变量(add、add1、add2):

# Example data from the question, as a dput() literal: a data.frame with
# 52 rows (row.names = c(NA, -52L)) and five columns.
#   add   - factor with the single level "x"
#   x1    - integer values
#   add1  - integer, 514 in every row
#   group - factor with levels "female" / "male"; almost all rows are "male",
#           with a short "female" run in between
#   add2  - integer, 2018 in every row
data=structure(list(add = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "x", class = "factor"), 
    x1 = c(0L, 2L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 
    1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 3L, 0L, 0L, 
    0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L), add1 = c(514L, 514L, 
    514L, 514L, 514L, 514L, 514L, 514L, 514L, 514L, 514L, 514L, 
    514L, 514L, 514L, 514L, 514L, 514L, 514L, 514L, 514L, 514L, 
    514L, 514L, 514L, 514L, 514L, 514L, 514L, 514L, 514L, 514L, 
    514L, 514L, 514L, 514L, 514L, 514L, 514L, 514L, 514L, 514L, 
    514L, 514L, 514L, 514L, 514L, 514L, 514L, 514L, 514L, 514L
    ), group = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("female", 
    "male"), class = "factor"), add2 = c(2018L, 2018L, 2018L, 
    2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
    2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
    2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
    2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
    2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
    2018L, 2018L, 2018L, 2018L)), .Names = c("add", "x1", "add1", 
"group", "add2"), class = "data.frame", row.names = c(NA, -52L
))

所以当我运行代码时

library(tidyverse)
library( data.table)
# Question's pipeline (kept exactly as posted, because it is the code that
# raises the reported error). Within each add/add1/add2 combination it takes
# the mean and the 25% quantile of x1 over the first "male" run and uses them
# to overwrite x1 in the "male" run with run id 3 wherever x1 exceeds that
# quantile.
data %>%  
  group_by(add,add1,add2) %>%                                          
  # rleid() comes from data.table: it numbers consecutive runs of `group`.
  mutate(group2 = rleid(group)) %>% 
  group_by(add,add1,add2, group, group2) %>%
  # Only the sub-group (male, run 1) has a non-empty subset here; in every
  # other sub-group the subset is empty, so MEAN is NaN and Q25 is NA.
  mutate(MEAN = mean(x1[group=="male" & group2==1], na.rm = T),      ## extra code here ##    
         Q25 = quantile(x1[group=="male" & group2==1], 0.25, na.rm = T)) %>%  ## extra code here ##
  group_by(add,add1,add2) %>%                                           
  # unique(...[!is.na(...)]) picks the non-missing statistic of the combination.
  # NOTE(review): if the first male run holds no non-NA x1, both vectors are
  # zero-length, ifelse() then returns a zero-length result, and mutate()
  # fails with the "must be length ... or one, not 0" error.
  mutate(x1 = ifelse(group=="male" & group2==3 & x1 > unique(Q25[!is.na(Q25)]), unique(MEAN[!is.na(MEAN)]), x1))%>%
  ungroup() %>%
  select(-group2) %>%
  data.frame()

我得到错误

Error in mutate_impl(.data, dots) : 
  Column `x1` must be length 24 (the group size) or one, not 0

PS。我只是提供了一个示例来给出数据结构,因为有 1000 个组。我找不到有错误的组

如何解决此错误。

标签: r, dplyr, data.table, lapply

解决方案


如果我理解正确,错误是由这样的组引起的:在第一个男性(male)段(`group2 == 1L`)中,`x1` 的所有值都是 `NA`。

恕我直言,一种更简洁的方法是首先按照此处的建议计算所有组的统计信息,然后按照此处的建议使用非等值连接来更新第二个男性组中受影响的行。

library( data.table)
# Turn `data` into a data.table by reference and tag every consecutive run of
# `group` inside an add/add1/add2 combination with a run id (group2).
setDT(data)
data[, group2 := rleid(group), by = .(add, add1, add2)]

# Per combination: mean and 25% quantile of x1 over the first male run.
# The constant group2 = 3L labels each statistics row with the run id it is
# joined against afterwards.
grp_stats <- data[
  group == "male" & group2 == 1L,
  .(
    group2 = 3L,
    mean = mean(x1, na.rm = TRUE),
    q25 = quantile(x1, 0.25, na.rm = TRUE)
  ),
  by = .(add, add1, add2)
]
grp_stats
   add add1 add2 group2 mean  q25
1:   x  514 2018      3  1.5 1.25
2:   y  515 2018      3  NaN   NA
3:   z  516 2018      3  2.0 2.00

可以清楚地识别出产生错误统计数据的组(上面统计值为 NaN / NA 的那一行)。是否从数据集中删除受影响的组,由 OP 自行决定。

但是,对于随后的连接(join)操作,我们可以将它们保留在数据中,因为它们不会产生任何影响。

组统计信息中添加了取常量值 3 的 `group2` 列,以简化后续在非等值连接中进行的更新(update in a non-equi join):

# mean() yields a double, so x1 is cast to numeric before it is overwritten.
data[, x1 := as.numeric(x1)]
# Update in a non-equi join: rows of `data` that match a grp_stats row on
# group2, add, add1, add2 and have x1 > q25 get x1 replaced by that row's mean.
data[grp_stats, on = .(group2, add, add1, add2, x1 > q25), x1 := mean]
# The empty [] prints the table right after the by-reference update.
data[]
data
    add  x1 add1  group add2 group2
 1:   x 1.0  514   male 2018      1
 2:   x 2.0  514   male 2018      1
 3:   x  NA  514 female 2018      2
 4:   x  NA  514 female 2018      2
 5:   x 1.5  514   male 2018      3
 6:   x 1.0  514   male 2018      3
 7:   y  NA  515   male 2018      1
 8:   y  NA  515   male 2018      1
 9:   y  NA  515 female 2018      2
10:   y  NA  515 female 2018      2
11:   y 7.0  515   male 2018      3
12:   y 1.0  515   male 2018      3
13:   z 2.0  516   male 2018      1
14:   z  NA  516   male 2018      1
15:   z  NA  516 female 2018      2
16:   z  NA  516 female 2018      2
17:   z 2.0  516   male 2018      3
18:   z 1.0  516   male 2018      3

请注意,第 5 行和第 17 行已更新,而第二组中产生错误统计数据的行尚未被触及。

`x1` 在连接之前被强制转换为 numeric 类型,以匹配 `mean()` 返回结果的类型。

样本数据

这是一个由三组组成的样本数据。在第二组(y)中,第一个男性段的 `x1` 的所有值都是 `NA`。

# Sample data for the answer, read from an inline text table with fread().
# Three add/add1/add2 combinations (x/514, y/515, z/516; add2 is always 2018),
# each made of a male run, a female run and a second male run of two rows each.
# In combination y the first male run has only NA in x1.
data <- data.table::fread("
add x1 add1  group add2
x    1  514   male 2018
x    2  514   male 2018
x   NA  514 female 2018
x   NA  514 female 2018
x    7  514   male 2018
x    1  514   male 2018
y   NA  515   male 2018
y   NA  515   male 2018
y   NA  515 female 2018
y   NA  515 female 2018
y    7  515   male 2018
y    1  515   male 2018
z    2  516   male 2018
z   NA  516   male 2018
z   NA  516 female 2018
z   NA  516 female 2018
z    7  516   male 2018
z    1  516   male 2018
")

验证错误消息是由全 NA 第一男组引起的

当上述示例数据集通过管道传输到 OP 的代码中时,我们可以重现错误消息:

library(dplyr)
# Same pipeline as in the question, run on the sample data. It is kept
# unchanged on purpose: this snippet exists to reproduce the error message.
data %>% 
  group_by(add,add1,add2) %>%                                          
  # rleid() comes from data.table: it numbers consecutive runs of `group`.
  mutate(group2 = rleid(group)) %>% 
  group_by(add,add1,add2, group, group2) %>%
  # Only the sub-group (male, run 1) has a non-empty subset here; in every
  # other sub-group the subset is empty, so MEAN is NaN and Q25 is NA.
  mutate(MEAN = mean(x1[group=="male" & group2==1], na.rm = T),      ## extra code here ##    
         Q25 = quantile(x1[group=="male" & group2==1], 0.25, na.rm = T)) %>%  ## extra code here ##
  group_by(add,add1,add2) %>%                                           
  # NOTE(review): for a combination whose first male run is all NA, both
  # unique(...[!is.na(...)]) results are zero-length, so ifelse() returns a
  # zero-length vector and mutate() raises the length error shown below.
  mutate(x1 = ifelse(group=="male" & group2==3 & x1 > unique(Q25[!is.na(Q25)]), unique(MEAN[!is.na(MEAN)]), x1))%>%
  ungroup() %>%
  select(-group2) %>%
  data.frame()

mutate_impl(.data, dots) 中的错误:
x1的长度必须为 6(组大小)或 1,而不是 0


推荐阅读