首页 > 解决方案 > 计算名称几乎相同的列对的差异(最后一个字符串不同)

问题描述

我想自动计算多列对的差异。在我的示例数据框中,我有 10 列,其中 5 列:PSA1.x、PSA2.x、PSA3.x、PSA4.x、PSA5.x

和五列

PSA1.y、PSA2.y、PSA3.y、PSA4.y PSA5.y。

区别应该是 PSA1.x - PSA1.y、PSA2.x -PSA2.y 等。我尝试过的工作有效,但是 40 列太笨重了:

df <- structure(list(PSA1.x = c(1677.7, 1016.2, 1309.4, 1179.9, 1234.4
), PSA2.x = c(693.7, 420.2, 541.4, 487.9, 510.4), PSA3.x = c(531.4, 
321.8, 414.7, 373.7, 391), PSA4.x = c(305, 184.8, 238.1, 214.5, 
224.4), PSA5.x = c(282.9, 171.3, 220.8, 198.9, 208.2), PSA1.y = c(1593.8, 
1016.2, 1309.4, 1179.9, 1234.4), PSA2.y = c(624.3, 420.2, 0, 
487.9, 510.4), PSA3.y = c(531.4, 321.8, 0, 373.7, 391), PSA4.y = c(305, 
184.8, 238.1, 214.5, 224.4), PSA5.y = c(254.6, 0, 165.6, 198.9, 
208.2)), row.names = c(NA, -5L), class = c("tbl_df", "tbl", "data.frame"
))

df <- df %>% 
  mutate(diff_PSA1 = PSA1.x - PSA1.y,
         diff_PSA2 = PSA2.x - PSA2.y,
         diff_PSA3 = PSA3.x - PSA3.y,
         diff_PSA4 = PSA4.x - PSA4.y,
         diff_PSA5 = PSA5.x - PSA5.y
         )

我怎样才能自动化这个过程?非常感谢

标签: r

解决方案


我们可以使用across循环遍历具有 suffix 的列,从列名 ( ) 中.x删除,用 'y' 替换值,减去并通过修改添加前缀 'diff_'来创建新列xcur_column()str_replaceget.names

library(dplyr)
library(stringr)
df %>%
   mutate(across(ends_with('.x'), ~ . -
       get(str_replace(cur_column(), "x$", "y")), .names = "diff_{.col}")) %>%
   rename_at(vars(starts_with('diff')), ~ str_remove(., "\\.x"))

-输出

# A tibble: 5 x 15
# PSA1.x PSA2.x PSA3.x PSA4.x PSA5.x PSA1.y PSA2.y PSA3.y PSA4.y PSA5.y diff_PSA1 diff_PSA2 diff_PSA3 diff_PSA4 diff_PSA5
#   <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
#1  1678.   694.   531.   305    283.  1594.   624.   531.   305    255.      83.9      69.4        0          0      28.3
#2  1016.   420.   322.   185.   171.  1016.   420.   322.   185.     0        0         0          0          0     171. 
#3  1309.   541.   415.   238.   221.  1309.     0      0    238.   166.       0       541.       415.         0      55.2
#4  1180.   488.   374.   214.   199.  1180.   488.   374.   214.   199.       0         0          0          0       0  
#5  1234.   510.   391    224.   208.  1234.   510.   391    224.   208.       0         0          0          0      

或者另一种选择是split.default按列名模式拆分,即删除后缀,然后使用map/reduce,获取差异并将列与原始数据绑定

library(purrr)
split.default(df, sub("\\..*", "", names(df))) %>% 
      map_dfr(reduce, `-`) %>% 
      rename_all(~ str_c('diff_', .)) %>% 
      bind_cols(df, .)
# A tibble: 5 x 15
#  PSA1.x PSA2.x PSA3.x PSA4.x PSA5.x PSA1.y PSA2.y PSA3.y PSA4.y PSA5.y diff_PSA1 diff_PSA2 diff_PSA3 diff_PSA4 diff_PSA5
#   <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
#1  1678.   694.   531.   305    283.  1594.   624.   531.   305    255.      83.9      69.4        0          0      28.3
#2  1016.   420.   322.   185.   171.  1016.   420.   322.   185.     0        0         0          0          0     171. 
#3  1309.   541.   415.   238.   221.  1309.     0      0    238.   166.       0       541.       415.         0      55.2
#4  1180.   488.   374.   214.   199.  1180.   488.   374.   214.   199.       0         0          0          0       0  
#5  1234.   510.   391    224.   208.  1234.   510.   391    224.   208.       0         0          0          0       0  

或先重塑为“长”格式,然后再做差异

library(tidyr)
df %>%
    mutate(rn = row_number()) %>%
    pivot_longer(cols = -rn, names_to = c( "grp", '.value'), names_sep = "\\.") %>%
    transmute(rn, grp = str_c('diff_', grp), diff = x- y) %>% 
    pivot_wider(names_from = grp, values_from = diff) %>%
    bind_cols(df, .)

或使用base R, 子集.x.ygrep并获得差异

xnm <- grep('\\.x$', names(df), value = TRUE)
ynm <- grep("\\.y$", names(df), value = TRUE)
df[paste0("diff_, sub("\\.x$", "", xnm))] <- df[xnm] - df[ynm]

推荐阅读