首页 > 解决方案 > 对执行因子的函数给出条件

问题描述

这个问题与这里有关,在@Akruns 要求下,我要求类似的东西。

本质上,如果我在以下条件中插入数据框:

if(length(weight) > 0) {weight %>% 
    select(where(negate(is.numeric))) %>% 
    map_dfc(~ model.matrix(~ .x -1) %>% 
              as_tibble) %>% 
    rename_all(~ str_remove(., "\\.x")) %>%
    bind_rows(weight, .)  
}

任务:

#Following @Akruns mention for turning numeric into factor:
i1 <- sapply(weight, is.numeric); df[i1] <- lapply(weight[i1], factor) and then use the Filter(function(x) is.factor(x)|is.character(x), weight)

test = function(data) {
  x = as.data.frame(Reduce(cbind, lapply(x, function(col) model.matrix(~ . -1, data = data.frame(col)))))
  setNames(x, sub(pattern = "^col", replacement = "", names(x)))
  
}
  
test(weight)
#Missing column names                     
1  64 57  8 1 0 0 1 0
2  71 59 10 1 0 0 1 0
3  53 49  6 1 0 0 1 0
4  67 62 11 1 0 0 1 0
5  55 51  8 0 0 1 1 0
6  58 50  7 0 0 1 1 0
7  77 55 10 0 0 1 0 1
8  57 48  9 0 0 1 0 1
9  56 42 10 0 1 0 0 1
10 51 42  6 0 1 0 0 1
11 76 61 12 0 1 0 0 1
12 68 57  9 0 1 0 0 1

然后,如果weight有因子,它将作为因子的列拆分为列,并为其分配值,并1在其之前和0其他地方出现。

但是,如果我输入一个numeric唯一的数据框,它会返回character(0). 问题是,如何给以下函数一个条件,例如数据帧是否x为数字,然后按原样返回数据帧。如果它是一个因素,则返回请求的输出。

我要求这样做的原因是因为我希望在另一个函数中实现它,这将包括许多数据框,其中一些只有数字,而另一些则包括因子。在这种情况下,我可以将数据框表示为x函数内。

我对功能的编辑:

fact_col <- function(x){
if(length(x) > 0) {
  weight_sub <- x %>% 
    select(where(is.factor)) 
  weight_sub %>%
    map_dfc(~ model.matrix(~ .x -1) %>% 
              as_tibble) %>% 
    rename_all(~ str_remove(., "\\.x")) %>%
    bind_cols(weight_sub, .) -> x
 x<- x%>% select(!where(is.factor))
 x<- data.frame(sapply(x, as.numeric))
}}

预期输出:

#when x is numeric
function(x) { ... }
 Richness pat
1        20   1
2        17   2
3        18   3
4        19   4
5        11   5
6        15   6
7        17   7
8        15   8
9        15   9
10        9  10
11       13  11
12       14  12

#when x is a factor
function(x) { ... }

 wgt hgt age    id    sex black brown white female male
1   64  57   8 black female     1     0     0      1    0
2   71  59  10 black female     1     0     0      1    0
3   53  49   6 black female     1     0     0      1    0
4   67  62  11 black female     1     0     0      1    0
5   55  51   8 white female     0     0     1      1    0
6   58  50   7 white female     0     0     1      1    0
7   77  55  10 white   male     0     0     1      0    1
8   57  48   9 white   male     0     0     1      0    1
9   56  42  10 brown   male     0     1     0      0    1
10  51  42   6 brown   male     0     1     0      0    1
11  76  61  12 brown   male     0     1     0      0    1
12  68  57   9 brown   male     0     1     0      0    1

可重现的代码:

structure(list(wgt = c(64L, 71L, 53L, 67L, 55L, 58L, 77L, 57L, 
56L, 51L, 76L, 68L), hgt = c(57L, 59L, 49L, 62L, 51L, 50L, 55L, 
48L, 42L, 42L, 61L, 57L), age = c(8L, 10L, 6L, 11L, 8L, 7L, 10L, 
9L, 10L, 6L, 12L, 9L), id = structure(c(1L, 1L, 1L, 1L, 3L, 3L, 
3L, 3L, 2L, 2L, 2L, 2L), .Label = c("black", "brown", "white"
), class = "factor"), sex = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 
2L, 2L, 2L, 2L, 2L, 2L), .Label = c("female", "male"), class = "factor")), class = "data.frame", row.names = c(NA, 
-12L))

标签: rfunction

解决方案


一个选项是在我们使用之前拆分代码,ifselectfactor并创建一个新对象('weight_sub'),然后检查length'weight_sub',if它是否大于0,完成其余部分model.matrix并将其分配回去去“重”

weight_sub <- weight %>% 
 select(where(is.factor)) 
 
if(length(weight_sub) > 0) {
  weight_sub %>%
   map_dfc(~ model.matrix(~ .x -1) %>% 
             as_tibble) %>% 
     rename_all(~ str_remove(., "\\.x")) %>%
      bind_cols(weight, .) -> weight

  }

-输出

#   wgt hgt age    id    sex black brown white female male
#1   64  57   8 black female     1     0     0      1    0
#2   71  59  10 black female     1     0     0      1    0
#3   53  49   6 black female     1     0     0      1    0
#4   67  62  11 black female     1     0     0      1    0
#5   55  51   8 white female     0     0     1      1    0
#6   58  50   7 white female     0     0     1      1    0
#7   77  55  10 white   male     0     0     1      0    1
#8   57  48   9 white   male     0     0     1      0    1
#9   56  42  10 brown   male     0     1     0      0    1
#10  51  42   6 brown   male     0     1     0      0    1
#11  76  61  12 brown   male     0     1     0      0    1
#12  68  57   9 brown   male     0     1     0      0    1

作为否定测试,通过检查它是否是character类列来执行此操作

weight_sub <- weight %>% 
 select(where(is.character)) 
 
if(length(weight_sub) > 0) {
  weight_sub %>%
   map_dfc(~ model.matrix(~ .x -1) %>% 
             as_tibble) %>% 
     rename_all(~ str_remove(., "\\.x")) %>%
      bind_cols(weight, .) -> weight

  }

if条件返回时没有输出FALSE,因此“权重”数据集保持不变,无需添加任何新列


在更新中,如果 OP 还使用numeric要传递到 的列model.matrix,它只返回同一列,即一列(因为我们正在使用 循环遍历列map),列名称为.x(来自model.matrix公式)。当我们使用 时,这个.x列名被删除,留下一个空白的列名,默认情况下填充一个从 分配为 'col' 的列名。为了防止这种情况,我们可以在执行此操作之前使用条件将原始列名附加为具有一列输出并且是数字的列的后缀rename_allstr_remove_dfcif/else

weight %>%
    imap_dfc(~ {
        nm1 <- .y
        tmp <- model.matrix(~ .x - 1) %>%
           as_tibble
       if(ncol(tmp) == 1 && class(tmp[[1]]) == 'numeric') {
          names(tmp) <- paste0(names(tmp), nm1)
       }
      tmp
     }) %>% 
      rename_all(~ str_remove(., "\\.x"))

-输出

# A tibble: 12 x 8
#     wgt   hgt   age black brown white female  male
#   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl> <dbl>
# 1    64    57     8     1     0     0      1     0
# 2    71    59    10     1     0     0      1     0
# 3    53    49     6     1     0     0      1     0
# 4    67    62    11     1     0     0      1     0
# 5    55    51     8     0     0     1      1     0
# 6    58    50     7     0     0     1      1     0
# 7    77    55    10     0     0     1      0     1
# 8    57    48     9     0     0     1      0     1
# 9    56    42    10     0     1     0      0     1
#10    51    42     6     0     1     0      0     1
#11    76    61    12     0     1     0      0     1
#12    68    57     9     0     1     0      0     1

或者我们用Mapin来做这个base R

 out <- do.call(cbind, unname(Map(function(x, y) {
      tmp <- as.data.frame(model.matrix(~x -1))
      if(ncol(tmp) == 1 & class(tmp[[1]]) == 'numeric') {
          names(tmp) <- paste0(names(tmp), y)}
          tmp
          }, weight, names(weight))))
names(out) <- sub('^x', '', names(out))
out
#   wgt hgt age black brown white female male
#1   64  57   8     1     0     0      1    0
#2   71  59  10     1     0     0      1    0
#3   53  49   6     1     0     0      1    0
#4   67  62  11     1     0     0      1    0
#5   55  51   8     0     0     1      1    0
#6   58  50   7     0     0     1      1    0
#7   77  55  10     0     0     1      0    1
#8   57  48   9     0     0     1      0    1
#9   56  42  10     0     1     0      0    1
#10  51  42   6     0     1     0      0    1
#11  76  61  12     0     1     0      0    1
#12  68  57   9     0     1     0      0    1

推荐阅读