首页 > 解决方案 > 使用列名模式在列表中创建新变量

问题描述

您好我正在尝试使用以下代码将 33 个变量减少为单个指标(我知道它的效率极低:

data_indicator <- data %>%
  mutate(plot_1=(farm_sell_1+farm_lease_1+farm_bequeath_1)/3, na.rm=T) %>%
  mutate(plot_sec_1=ifelse(plot_1>.5, 1, 0)) %>%
  mutate(plot_2=(farm_sell_2+farm_lease_2+farm_bequeath_2)/3, na.rm=T) %>%
  mutate(plot_sec_2=ifelse(plot_2>.5, 1, 0)) %>%
  mutate(plot_3=(farm_sell_3+farm_lease_3+farm_bequeath_3)/3, na.rm=T) %>%
  mutate(plot_sec_3=ifelse(plot_3>.5, 1, 0)) %>%
  mutate(plot_4=(farm_sell_4+farm_lease_4+farm_bequeath_4)/3, na.rm=T) %>%
  mutate(plot_sec_4=ifelse(plot_4>.5, 1, 0)) %>%
  mutate(plot_5=(farm_sell_5+farm_lease_5+farm_bequeath_5)/3, na.rm=T) %>%
  mutate(plot_sec_5=ifelse(plot_5>.5, 1, 0)) %>%
  mutate(plot_6=(farm_sell_6+farm_lease_6+farm_bequeath_6)/3, na.rm=T) %>%
  mutate(plot_sec_6=ifelse(plot_6>.5, 1, 0)) %>%
  mutate(plot_7=(farm_sell_7+farm_lease_7+farm_bequeath_7)/3, na.rm=T) %>%
  mutate(plot_sec_7=ifelse(plot_7>.5, 1, 0)) %>%
  mutate(plot_8=(farm_sell_8+farm_lease_8+farm_bequeath_8)/3, na.rm=T) %>%
  mutate(plot_sec_8=ifelse(plot_8>.5, 1, 0)) %>%
  mutate(plot_9=(farm_sell_9+farm_lease_9+farm_bequeath_9)/3, na.rm=T) %>%
  mutate(plot_sec_9=ifelse(plot_9>.5, 1, 0)) %>%
  mutate(plot_10=(farm_sell_10+farm_lease_10+farm_bequeath_10)/3, na.rm=T) %>%
  mutate(plot_sec_10=ifelse(plot_10>.5, 1, 0)) %>%
  mutate(plot_11=(farm_sell_11+farm_lease_11+farm_bequeath_11)/3, na.rm=T) %>%
  mutate(plot_sec_11=ifelse(plot_11>.5, 1, 0)) %>%
  mutate(num_plots_sec = plot_sec_1+plot_sec_2+plot_sec_3+plot_sec_4+plot_sec_5+plot_sec_6+plot_sec_7+plot_sec_8+plot_sec_9+plot_sec_10+plot_sec_11, na.rm=T) 

我如何循环遍历这 11 个变量,以便为每个以特定数字“_1”结尾的变量获得一个度量,然后从 1:11 对这些数字求和以获得 1 个指标?

如何遍历所有以数字结尾的变量,将它们加在一起以创建以数字结尾的新变量,然后将所有 11 个变量加在一起成为一个指标?

当前数据框示例:

$ farm_sell_1      <dbl+lbl> 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0,...
$ farm_sell_2      <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, NA, 1, 0, 0, 0, 0, 0, 0, NA, N...
$ farm_sell_3      <dbl> 0, 1, 0, 0, NA, 0, NA, NA, NA, 0, 0, NA, 1, NA, 1, 0, NA, NA, NA, NA,...
$ farm_sell_4      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, NA, NA, 1, NA, 1, NA, NA, NA, ...
$ farm_sell_5      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, NA...
$ farm_sell_6      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, NA...
$ farm_sell_7      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, NA, NA, NA...
$ farm_sell_8      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
$ farm_sell_9      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
$ farm_sell_10     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
$ farm_sell_11     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
$ farm_lease_1     <dbl+lbl> 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0,...
$ farm_lease_2     <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, NA, 1, 0, 0, 0, 0, 0, 0, NA, N...
$ farm_lease_3     <dbl> 1, 1, 0, 0, NA, 0, NA, NA, NA, 0, 0, NA, 1, NA, 1, 0, NA, NA, NA, NA,...
$ farm_lease_4     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, NA, NA, 1, NA, 1, NA, NA, NA, ...
$ farm_lease_5     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, NA...
$ farm_lease_6     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, NA...
$ farm_lease_7     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, NA, NA, NA...
$ farm_lease_8     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
$ farm_lease_9     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
$ farm_lease_10    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
$ farm_lease_11    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
$ farm_bequeath_1  <dbl+lbl> 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0,...
$ farm_bequeath_2  <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, NA, 1, 1, 0, 0, 0, 0, 0, NA, N...
$ farm_bequeath_3  <dbl> 1, 1, 1, 1, NA, 0, NA, NA, NA, 0, 0, NA, 1, NA, 1, 1, NA, NA, NA, NA,...
$ farm_bequeath_4  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, NA, NA, 1, NA, 1, NA, NA, NA, ...
$ farm_bequeath_5  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, NA...
$ farm_bequeath_6  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, NA...
$ farm_bequeath_7  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, NA, NA, NA...
$ farm_bequeath_8  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
$ farm_bequeath_9  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
$ farm_bequeath_10 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
$ farm_bequeath_11 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...

标签: rdplyr

解决方案


使这更清晰的一种方法是将此数据转换为长数据。有关宽与长的更多信息,请查看此站点:https ://uc-r.github.io/tidyr

我将逐步完成此操作,以便您了解它是如何工作的,然后包含最后一次完成所有操作的代码。

首先使用一些假数据:

fake.data <- data.frame(matrix(data = rbinom(1650, 1, 0.5), nrow = 50, ncol = 33))
colnames(fake.data) <- c(paste0("farm_sell_", 1:11), paste0("farm_lease_", 1:11),
                          paste0("farm_bequeath_", 1:11))

它看起来像你上面的:

'data.frame':   50 obs. of  33 variables:
 $ farm_sell_1     : int  0 0 0 1 0 1 0 0 1 1 ...
 $ farm_sell_2     : int  0 0 1 1 1 1 1 1 0 0 ...
 $ farm_sell_3     : int  1 0 0 0 1 1 0 1 0 0 ...
 $ farm_sell_4     : int  1 1 1 0 1 0 0 0 1 1 ...
 $ farm_sell_5     : int  1 1 0 0 1 0 0 0 1 1 ...
 $ farm_sell_6     : int  0 1 0 0 0 0 0 0 0 0 ...
 $ farm_sell_7     : int  1 0 1 1 0 0 0 1 0 1 ...
 $ farm_sell_8     : int  0 0 1 0 0 1 1 0 1 0 ...
 $ farm_sell_9     : int  1 1 1 0 0 0 1 1 1 1 ...
 $ farm_sell_10    : int  1 1 0 0 1 0 1 1 0 0 ...
 $ farm_sell_11    : int  0 0 0 0 1 1 1 0 0 0 ...
 $ farm_lease_1    : int  0 0 1 1 0 0 1 0 1 0 ...
 $ farm_lease_2    : int  0 0 0 1 1 1 1 1 1 0 ...
 $ farm_lease_3    : int  0 1 1 1 0 1 1 1 0 0 ...
 $ farm_lease_4    : int  1 0 1 1 0 1 1 1 1 1 ...
 $ farm_lease_5    : int  0 0 0 0 1 1 0 1 0 1 ...
 $ farm_lease_6    : int  0 1 1 0 1 1 0 0 1 1 ...
 $ farm_lease_7    : int  0 0 0 1 1 1 0 1 1 1 ...
 $ farm_lease_8    : int  0 1 0 1 0 0 1 0 1 0 ...
 $ farm_lease_9    : int  0 0 1 0 0 1 0 0 1 1 ...
 $ farm_lease_10   : int  1 1 1 1 0 1 1 1 0 1 ...
 $ farm_lease_11   : int  1 0 0 1 1 0 0 0 1 1 ...
 $ farm_bequeath_1 : int  1 1 1 0 0 1 1 1 0 0 ...
 $ farm_bequeath_2 : int  0 1 1 0 0 1 1 1 1 1 ...
 $ farm_bequeath_3 : int  1 0 0 1 1 0 1 0 0 1 ...
 $ farm_bequeath_4 : int  0 0 1 1 1 0 0 0 1 0 ...
 $ farm_bequeath_5 : int  1 1 0 0 0 0 0 1 1 0 ...
 $ farm_bequeath_6 : int  0 1 0 0 0 0 1 0 1 1 ...
 $ farm_bequeath_7 : int  0 0 0 0 1 0 1 0 0 1 ...
 $ farm_bequeath_8 : int  0 1 0 1 1 0 0 1 1 1 ...
 $ farm_bequeath_9 : int  0 0 1 1 0 1 1 0 0 1 ...
 $ farm_bequeath_10: int  0 0 0 1 1 1 0 0 1 1 ...
 $ farm_bequeath_11: int  0 0 0 1 0 0 0 0 0 0 ...

您将需要dplyrandtidyr包来完成所有这些工作。

library(dplyr)
library(tidyr)

pivot_longer然后我们使用from让它变得很长tidyr。我在此处添加了一个键来参考每个指标适用于哪个农场。我们稍后将需要它进行分组,但基本上与原始数据中的行号匹配。

data.long <- fake.data %>%
  #add a key to keep track of stuff
  mutate(farm_key = 1:n()) %>%
  pivot_longer(farm_sell_1:farm_bequeath_11, names_to = "variable", values_to = "value")

这看起来像这样

# A tibble: 6 x 3
  farm_key variable    value
     <int> <chr>       <int>
1        1 farm_sell_1     0
2        1 farm_sell_2     0
3        1 farm_sell_3     1
4        1 farm_sell_4     1

接下来,我们使用separate将您的 farm_sell_1 等变量拆分为更机器可读的内容:

data.long2 <- data.long %>%
  tidyr::separate(col = variable, into = c("farm", "var", "var_num"), sep = "_")

结果数据如下:

# A tibble: 6 x 5
  farm_key farm  var   var_num value
     <int> <chr> <chr> <chr>   <int>
1        1 farm  sell  1           0
2        1 farm  sell  2           0
3        1 farm  sell  3           1
4        1 farm  sell  4           1

然后我们完成您在上面所做的所有添加。首先,我们按 var_num 分组并为每个农场添加这些变量。这与添加 farm_sell_1 + farm_lease_1 + farm_bequeath_1 并除以 3 相同,就像您在上面所做的那样。ifelse然后我们通过语句计算 plot_sec 。最后,我们可以将每个农场的这 11 个索引(_1、_2、_3 各一个)相加,得到每个农场的一个索引值。

data.long3 <- data.long2 %>%
  group_by(farm_key, var_num) %>%
  summarise(plot_val = sum(value, na.rm = T)/3) %>% #same as plot_1, plot_2, etc.
  ungroup() %>%
  mutate(plot_sec = ifelse(plot_val>0.5,1,0)) %>%
  #sum together to get one value for each farm_key
  group_by(farm_key) %>%
  summarise(num_plots_sec = sum(plot_sec)) %>%
  ungroup() 

然后数据看起来像这样:

# A tibble: 6 x 2
  farm_key num_plots_sec
     <int>         <dbl>
1        1             4
2        2             4
3        3             4
4        4             8
5        5             7

正如所承诺的那样,一段代码可以一次完成所有操作:

data.one.ind <- fake.data %>%
  #add a key to keep track of stuff
  mutate(farm_key = 1:n()) %>%
  pivot_longer(farm_sell_1:farm_bequeath_11, names_to = "variable", values_to = "value") %>%
  tidyr::separate(col = variable, into = c("farm", "var", "var_num"), sep = "_") %>%
  group_by(farm_key, var_num) %>%
  summarise(plot_val = sum(value, na.rm = T)/3) %>% #same as plot_1, plot_2, etc.
  ungroup() %>%
  mutate(plot_sec = ifelse(plot_val>0.5,1,0)) %>%
  #sum together to get one value for each farm_key
  group_by(farm_key) %>%
  summarise(num_plots_sec = sum(plot_sec)) %>%
  ungroup()

总而言之,它实际上可能不会为您节省太多的打字时间。但它更能适应变量的变化。


推荐阅读