r - 如何根据值条件合并字符串
问题描述
我有如下数据
# Example input: 18 rows split into four sections. Each section opens with a
# header row that repeats the section name (e.g. "TP<AMT55") in all three
# columns, followed by data rows. All columns are factors, as produced by
# dput(); `newcol` therefore holds numbers stored as factor labels.
df <- structure(
  list(
    # Comma-separated residue positions, or the section name on header rows.
    position = structure(
      c(
        6L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 7L,
        1L, 2L, 3L, 4L, 8L, 1L, 2L, 3L, 4L
      ),
      .Label = c(
        "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15",
        "2,3,4,5,6,7,8,9,10,11,12,13,14,15,16",
        "3,4,5,6,7,8,9,10,11,12,13,14,15,16,17",
        "4,5,6,7,8,9,10,11,12,13,14,15,16,17,18",
        "TP<AMB88",
        "TP<AMT55",
        "TP<ELANE",
        "TP<RACK1"
      ),
      class = "factor"
    ),
    # 15-letter strings, or the section name on header rows.
    col = structure(
      c(
        15L, 6L, 3L, 11L, 5L, 14L, 9L, 18L, 16L,
        8L, 13L, 4L, 2L, 17L, 7L, 12L, 1L, 10L
      ),
      .Label = c(
        "EQMTLRGTLKGHNGW",
        "GRRLACLFLACVLPA",
        "GSLSNYALLQLTLTA",
        "LGRRLACLFLACVLP",
        "LSNYALLQLTLTAFL",
        "MGSLSNYALLQLTLT",
        "MTEQMTLRGTLKGHN",
        "MTLGRRLACLFLACV",
        "MVKETTYYDVLGVKP",
        "QMTLRGTLKGHNGWV",
        "SLSNYALLQLTLTAF",
        "TEQMTLRGTLKGHNG",
        "TLGRRLACLFLACVL",
        "TP<AMB88",
        "TP<AMT55",
        "TP<ELANE",
        "TP<RACK1",
        "VKETTYYDVLGVKPN"
      ),
      class = "factor"
    ),
    # Numeric score stored as text, or the section name on header rows.
    newcol = structure(
      c(
        13L, 5L, 3L, 6L, 11L, 12L, 9L, 9L, 14L,
        7L, 3L, 6L, 4L, 15L, 1L, 8L, 2L, 10L
      ),
      .Label = c(
        "1.189898095",
        "1.323231429",
        "1.732914564",
        "1.789898095",
        "1.866247897",
        "2.732914564",
        "2.973557262",
        "3.139572262",
        "3.189898095",
        "3.323231429",
        "3.87645",
        "TP<AMB88",
        "TP<AMT55",
        "TP<ELANE",
        "TP<RACK1"
      ),
      class = "factor"
    )
  ),
  class = "data.frame",
  row.names = c(NA, -18L)
)
我想在每个部分(section)内合并满足条件的行:如果同一个部分中有多行的 `newcol` 值低于 2,就把这些行的 `position` 和 `col` 合并成一个字符串。
例如,`TP<AMT55` 是一个部分。查看该部分的 `newcol` 列,可以看到其中有两行的值小于 2:
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 MGSLSNYALLQLTLT 1.866247897
2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 GSLSNYALLQLTLTA 1.732914564
然后我想根据第一列把这两行合并:第一行的位置是 1,2,3,…,15,第二行的位置是 2,3,4,…,16,所以合并后应变成这样:
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 MGSLSNYALLQLTLTA
所以我想要这样的输出
# Expected result: the three input columns unchanged, plus `Newposition` and
# `newcol2`. The two new columns are empty strings everywhere except rows
# 2, 11 and 15, which carry the merged positions / merged strings.
out <- structure(
  list(
    # Same as the input `position` column.
    position = structure(
      c(
        6L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 7L,
        1L, 2L, 3L, 4L, 8L, 1L, 2L, 3L, 4L
      ),
      .Label = c(
        "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15",
        "2,3,4,5,6,7,8,9,10,11,12,13,14,15,16",
        "3,4,5,6,7,8,9,10,11,12,13,14,15,16,17",
        "4,5,6,7,8,9,10,11,12,13,14,15,16,17,18",
        "TP<AMB88",
        "TP<AMT55",
        "TP<ELANE",
        "TP<RACK1"
      ),
      class = "factor"
    ),
    # Same as the input `col` column.
    col = structure(
      c(
        15L, 6L, 3L, 11L, 5L, 14L, 9L, 18L, 16L,
        8L, 13L, 4L, 2L, 17L, 7L, 12L, 1L, 10L
      ),
      .Label = c(
        "EQMTLRGTLKGHNGW",
        "GRRLACLFLACVLPA",
        "GSLSNYALLQLTLTA",
        "LGRRLACLFLACVLP",
        "LSNYALLQLTLTAFL",
        "MGSLSNYALLQLTLT",
        "MTEQMTLRGTLKGHN",
        "MTLGRRLACLFLACV",
        "MVKETTYYDVLGVKP",
        "QMTLRGTLKGHNGWV",
        "SLSNYALLQLTLTAF",
        "TEQMTLRGTLKGHNG",
        "TLGRRLACLFLACVL",
        "TP<AMB88",
        "TP<AMT55",
        "TP<ELANE",
        "TP<RACK1",
        "VKETTYYDVLGVKPN"
      ),
      class = "factor"
    ),
    # Same as the input `newcol` column.
    newcol = structure(
      c(
        13L, 5L, 3L, 6L, 11L, 12L, 9L, 9L, 14L,
        7L, 3L, 6L, 4L, 15L, 1L, 8L, 2L, 10L
      ),
      .Label = c(
        "1.189898095",
        "1.323231429",
        "1.732914564",
        "1.789898095",
        "1.866247897",
        "2.732914564",
        "2.973557262",
        "3.139572262",
        "3.189898095",
        "3.323231429",
        "3.87645",
        "TP<AMB88",
        "TP<AMT55",
        "TP<ELANE",
        "TP<RACK1"
      ),
      class = "factor"
    ),
    # Merged position list, placed on the first below-2 row of a section.
    Newposition = structure(
      c(
        1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
        1L, 4L, 1L, 1L, 1L, 3L, 1L, 1L, 1L
      ),
      .Label = c(
        "",
        "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16",
        "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17",
        "2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18"
      ),
      class = "factor"
    ),
    # Merged string, placed on the same rows as `Newposition`.
    newcol2 = structure(
      c(
        1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
        1L, 4L, 1L, 1L, 1L, 3L, 1L, 1L, 1L
      ),
      .Label = c(
        "",
        "MGSLSNYALLQLTLTA",
        "MTEQMTLRGTLKGHNGW",
        "TLGRRLACLFLACVLPA"
      ),
      class = "factor"
    )
  ),
  class = "data.frame",
  row.names = c(NA, -18L)
)
解决方案
可以尝试如下做法:先提取每一行末尾的整数/字母,用 `purrr::accumulate` 把它们逐行累积起来,然后拼接到每个部分中第一个满足 `as.double(newcol) < 2` 的行上。为了让正确的累积结尾与每个部分中第一个 <2 的行对应,我先按部分和是否 <2 分组,再用 `dplyr::lead` 把累积结果整体上移一行:
# Packages used below: dplyr (mutate, group_by, lead, select), tidyr
# (replace_na), stringr (str_*), purrr (accumulate, map_chr).
library(dplyr)
library(tidyr)
library(stringr)
library(purrr)

# For every section, merge the rows whose `newcol` is below 2 and write the
# merged position list / merged string onto the first such row.
# Adds two character columns, `Newposition` and `newcol2`; both are "" on
# every row that does not receive a merged value.
df %>%
  # Factors -> character so the string helpers and as.double() see the labels.
  mutate_all(as.character) %>%
  mutate(
    # A section header row repeats the section name in `col` and `newcol`.
    sect = col == newcol,
    # Running count of header rows = section id.
    group = cumsum(sect),
    # Header rows are not numeric: as.double() turns them into NA (and emits
    # the "NAs introduced by coercion" warning); they count as "not below 2".
    less_2 = replace_na(as.double(newcol) < 2, FALSE)
  ) %>%
  group_by(group, sect) %>%
  mutate(
    # Running concatenation, within the section, of each row's last position
    # number, e.g. "15", "15,16", "15,16,17", ...
    pos_acc = str_extract(position, "\\d+$") %>%
      accumulate(c) %>%
      map_chr(str_c, collapse = ","),
    # Same for each row's last character of `col`.
    col_acc = str_extract(col, ".$") %>%
      accumulate(c) %>%
      map_chr(str_c, collapse = "")
  ) %>%
  # Regroup so that lead() jumps to the next below-2 row of the same section.
  # The grouping is spelled out instead of using group_by(add = ) because
  # that argument was renamed to `.add` in dplyr 1.0.
  group_by(group, sect, less_2) %>%
  mutate(
    # A row's own accumulation is always a prefix of the next row's, so the
    # text beyond that prefix is exactly what must be appended to the row's
    # full value. (Swapping only the first accumulated element for the full
    # value duplicates entries whenever the first below-2 row is not the
    # first row of its section.)
    Newposition = str_c(
      position,
      str_sub(lead(pos_acc), str_length(pos_acc) + 1L)
    ),
    # Rows without a following below-2 row are NA here and become "".
    Newposition = ifelse(less_2, Newposition, "") %>% replace_na(""),
    newcol2 = str_c(
      col,
      str_sub(lead(col_acc), str_length(col_acc) + 1L)
    ),
    newcol2 = ifelse(less_2, newcol2, "") %>% replace_na("")
  ) %>%
  ungroup() %>%
  # Drop the helper columns; only the original three plus the two new remain.
  select(-c(sect, group, less_2, pos_acc, col_acc))
这段代码应生成如下数据框。运行时会出现警告(`as.double()` 无法把各部分的标题行转换为数字,因此得到 NA),但不影响输出:
position col newcol newcol2 Newposition
1 TP<AMT55 TP<AMT55 TP<AMT55
2 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 MGSLSNYALLQLTLT 1.866247897 MGSLSNYALLQLTLTA 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
3 2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 GSLSNYALLQLTLTA 1.732914564
4 3,4,5,6,7,8,9,10,11,12,13,14,15,16,17 SLSNYALLQLTLTAF 2.732914564
5 4,5,6,7,8,9,10,11,12,13,14,15,16,17,18 LSNYALLQLTLTAFL 3.87645
6 TP<AMB88 TP<AMB88 TP<AMB88
7 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 MVKETTYYDVLGVKP 3.189898095
8 2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 VKETTYYDVLGVKPN 3.189898095
9 TP<ELANE TP<ELANE TP<ELANE
10 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 MTLGRRLACLFLACV 2.973557262
11 2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 TLGRRLACLFLACVL 1.732914564 TLGRRLACLFLACVLPA 2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
12 3,4,5,6,7,8,9,10,11,12,13,14,15,16,17 LGRRLACLFLACVLP 2.732914564
13 4,5,6,7,8,9,10,11,12,13,14,15,16,17,18 GRRLACLFLACVLPA 1.789898095
14 TP<RACK1 TP<RACK1 TP<RACK1
15 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 MTEQMTLRGTLKGHN 1.189898095 MTEQMTLRGTLKGHNGW 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
16 2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 TEQMTLRGTLKGHNG 3.139572262
17 3,4,5,6,7,8,9,10,11,12,13,14,15,16,17 EQMTLRGTLKGHNGW 1.323231429
18 4,5,6,7,8,9,10,11,12,13,14,15,16,17,18 QMTLRGTLKGHNGWV 3.323231429
推荐阅读
- reactjs - 如何在事件处理程序中访问 useSelector 值
- keil - 从 .s 文件启动调试器
- docker - 无法在主机上卷曲 docker 端口 - “(56) Recv failure: Connection reset by peer”
- excel - 根据您在多个下拉菜单中选择的内容更改行的颜色和单元格中的值
- swift - 从领域数据库中删除对象时,出现错误作为原因:“只能从它所属的领域中删除一个对象。”
- python - 如何从 Python 3 中的 ObservedList 中删除列表对象?
- swift - 如何快速计算 JSON 文件中的元素?
- .net - 将 dotnet 从 2.1 更新到 3.1 后的 Automapper 问题
- ios - 当我将它修补到 json 服务器 swift 时,照片没有更新
- laravel - 如何将参数传递给 eloquent 模型类以获取自定义关系