首页 > 解决方案 > 按列减去数据帧的行,保留多因子列

问题描述

我有两个行数和列数都不相同的数据框。我想按列(即按样本)从 df1 每一行的强度值中减去 df2 中对应基因那一行的强度值。我的条件是:

  1. 在 df1 中,每个基因 (gene_nm) 对应多条肽序列 (pep_seq),每条肽序列各占一行,并带有它在每个样本中的强度值 (int_samp)。因此同一个基因会出现多次,即占据多行。
  2. 在 df2 中,每个基因仅出现一次(占一行),并带有相应的强度值
  3. 因此,df1 比 df2 长得多(例如,55000 行与 6000 行)
  4. 强度列 (int_samp) 的数量可以很多。在这个例子中我有 3 个

数据框 1

# Example data frame 1: one row per peptide, several peptides per gene.
# Intensities are kept as text on purpose, with missing readings spelled "NA".
pep_seq <- c(
  "aaaaaaaaa", "ababababba", "dfsfsfsfds", "xbbcbcncncc",
  "fbbdsgffhhh", "dggdgdgegeggerr", "dfgthrgfgf", "wegregegg",
  "egegegergewge", "sfngegebser", "qegqeefbew", "qegqetegqt",
  "qwtqtewr", "etghsfrgf", "sfsdfbdfbergeagaegr", "wasfqertsdfaefwe"
)
int_samp_1 <- c(
  "2421432", "24242424", "NA", "4684757849",
  "NA", "10485040", "NA", "6849400",
  "40300", "NA", "NA", "NA",
  "556456466", "4646456466", "246464266", "4564242646"
)
int_samp_2 <- c(
  "NA", "5342353", "14532556", "43566",
  "46367367", "768769769", "797899", "NA",
  "NA", "NA", "686899", "7898979",
  "678568", "NA", "68886", "488"
)
int_samp_3 <- c(
  "11351", "NA", "NA", "NA",
  "1354151345", "1351351354", "314534", "1535",
  "3145354", "4353455", "324535", "3543445",
  "34535", "34535534", "NA", "NA"
)
# Gene label per peptide row: six rows of A, three of B, seven of C.
gene_nm <- rep(c("A", "B", "C"), times = c(6L, 3L, 7L))
# Column names are taken from the variable names, left unmangled.
df_1 <- data.frame(
  pep_seq, int_samp_1, int_samp_2, int_samp_3, gene_nm,
  check.names = FALSE
)

数据框 2

# Example data frame 2: exactly one row of baseline intensities per gene.
# Values are text, with missing readings spelled "NA", as in data frame 1.
gene_nm.a <- c("A", "B", "C")
int_samp_1a <- c("2421432", "24242424", "NA")
int_samp_2a <- c("NA", "5342353", "14532556")
int_samp_3a <- c("11351", "NA", "NA")
# Column names are taken from the variable names, left unmangled.
df_2 <- data.frame(
  gene_nm.a, int_samp_1a, int_samp_2a, int_samp_3a,
  check.names = FALSE
)

请建议。

标签: r dataframe dplyr purrr

解决方案


一种做法是使用 dplyr 将 df_1 与 df_2 按基因连接 (join),然后执行简单的矩阵减法。

注意:数据框中的强度读数是以因子 (factor) 类型存储的。要做减法时,让测量值保持因子类型并不合适,因此我先将它们转换为数值类型。

library(dplyr)

# Parse one intensity column to numeric.
# The readings arrive as text (or as factors on R < 4.0), with missing values
# spelled "NA". Doubles are used rather than integers because several
# intensities exceed .Machine$integer.max (2147483647); as.integer() would
# silently turn those into NA.
to_intensity <- function(x) {
  x <- as.character(x)
  x[x %in% "NA"] <- NA_character_
  as.numeric(x)
}

# Pair each baseline column of df_2 (e.g. "int_samp_1a") with the df_1 sample
# column of the same stem ("int_samp_1"), so the subtraction below matches
# columns by name instead of relying on column order.
base_cols <- grep("^int_samp_\\d+[a-z]+$", names(df_2), value = TRUE)
samp_cols <- sub("[a-z]+$", "", base_cols)
stopifnot(
  all(samp_cols %in% names(df_1)),
  # A gene listed twice in df_2 would duplicate df_1 rows in the join.
  anyDuplicated(df_2$gene_nm.a) == 0L
)

# Attach the per-gene baseline to every peptide row and make all intensity
# columns numeric. A missing baseline is treated as 0: keeping it as NA would
# turn otherwise valid df_1 readings into NA for no reason.
mod <- df_1 %>%
  left_join(df_2, by = c("gene_nm" = "gene_nm.a")) %>%
  mutate(across(all_of(c(samp_cols, base_cols)), to_intensity)) %>%
  mutate(across(all_of(base_cols), ~ coalesce(.x, 0)))

# Subtract each baseline column from its matching sample column.
mod[samp_cols] <- Map(`-`, mod[samp_cols], mod[base_cols])

# Keep only the columns that came from df_1.
mod[names(df_1)]
#                pep_seq int_samp_1 int_samp_2 int_samp_3 gene_nm
# 1            aaaaaaaaa          0         NA          0       A
# 2           ababababba   21820992    5342353         NA       A
# 3           dfsfsfsfds         NA   14532556         NA       A
# 4          xbbcbcncncc 4682336417      43566         NA       A
# 5          fbbdsgffhhh         NA   46367367 1354139994       A
# 6      dggdgdgegeggerr    8063608  768769769 1351340003       A
# 7           dfgthrgfgf         NA   -4544454     314534       B
# 8            wegregegg  -17393024         NA       1535       B
# 9        egegegergewge  -24202124         NA    3145354       B
# 10         sfngegebser         NA         NA    4353455       C
# 11          qegqeefbew         NA  -13845657     324535       C
# 12          qegqetegqt         NA   -6633577    3543445       C
# 13            qwtqtewr  556456466  -13853988      34535       C
# 14           etghsfrgf 4646456466         NA   34535534       C
# 15 sfsdfbdfbergeagaegr  246464266  -14463670         NA       C
# 16    wasfqertsdfaefwe 4564242646  -14532068         NA       C

推荐阅读