首页 > 解决方案 > 如何用 R 连接数据框中的行?

问题描述

我想把 ScanNum 值重复的那些行合并(连接)成一行。

这是我的数据框的一部分。

structure(list(UniprotID = c("P06493", "P06493", "P06493", "P06493", 
"P16591", "Q7Z460", "Q7Z460", "Q7Z460", "Q7Z460", "Q7Z460", "P16591", 
"P11802", "P09651", "P09651", "P22830", "P46734", "Q00535", "P09651", 
"P63261", "P68032", "P06493", "Q9UKI8", "P63261", "P68032", "Q9NVU7", 
"P06239", "P06239", "Q00535", "P06239", "P11802", "Q13164", "P06493", 
"Q9UKI8", "P06239", "Q00535", "P06239", "Q09428", "O96017", "Q14289", 
"O96017", "P06493", "Q9UKI8", "Q9NWZ3", "P06239", "O43318", "O43318", 
"P06493", "P50613", "Q9BQI3", "Q86UE8"), Description = c("Cyclin-dependent kinase 1 OS=Homo sapiens OX=9606 GN=CDK1 PE=1 SV=3", 
"Cyclin-dependent kinase 1 OS=Homo sapiens OX=9606 GN=CDK1 PE=1 SV=3", 
"Cyclin-dependent kinase 1 OS=Homo sapiens OX=9606 GN=CDK1 PE=1 SV=3", 
"Cyclin-dependent kinase 1 OS=Homo sapiens OX=9606 GN=CDK1 PE=1 SV=3", 
"Tyrosine-protein kinase Fer OS=Homo sapiens OX=9606 GN=FER PE=1 SV=2", 
"CLIP-associating protein 1 OS=Homo sapiens OX=9606 GN=CLASP1 PE=1 SV=1", 
"CLIP-associating protein 1 OS=Homo sapiens OX=9606 GN=CLASP1 PE=1 SV=1", 
"CLIP-associating protein 1 OS=Homo sapiens OX=9606 GN=CLASP1 PE=1 SV=1", 
"CLIP-associating protein 1 OS=Homo sapiens OX=9606 GN=CLASP1 PE=1 SV=1", 
"CLIP-associating protein 1 OS=Homo sapiens OX=9606 GN=CLASP1 PE=1 SV=1", 
"Tyrosine-protein kinase Fer OS=Homo sapiens OX=9606 GN=FER PE=1 SV=2", 
"Cyclin-dependent kinase 4 OS=Homo sapiens OX=9606 GN=CDK4 PE=1 SV=2", 
"Heterogeneous nuclear ribonucleoprotein A1 OS=Homo sapiens OX=9606 GN=HNRNPA1 PE=1 SV=5", 
"Heterogeneous nuclear ribonucleoprotein A1 OS=Homo sapiens OX=9606 GN=HNRNPA1 PE=1 SV=5", 
"Ferrochelatase, mitochondrial OS=Homo sapiens OX=9606 GN=FECH PE=1 SV=2", 
"Dual specificity mitogen-activated protein kinase kinase 3 OS=Homo sapiens OX=9606 GN=MAP2K3 PE=1 SV=2", 
"Cyclin-dependent-like kinase 5 OS=Homo sapiens OX=9606 GN=CDK5 PE=1 SV=3", 
"Heterogeneous nuclear ribonucleoprotein A1 OS=Homo sapiens OX=9606 GN=HNRNPA1 PE=1 SV=5", 
"Actin, cytoplasmic 2 OS=Homo sapiens OX=9606 GN=ACTG1 PE=1 SV=1", 
"Actin, alpha cardiac muscle 1 OS=Homo sapiens OX=9606 GN=ACTC1 PE=1 SV=1", 
"Cyclin-dependent kinase 1 OS=Homo sapiens OX=9606 GN=CDK1 PE=1 SV=3", 
"Serine/threonine-protein kinase tousled-like 1 OS=Homo sapiens OX=9606 GN=TLK1 PE=1 SV=2", 
"Actin, cytoplasmic 2 OS=Homo sapiens OX=9606 GN=ACTG1 PE=1 SV=1", 
"Actin, alpha cardiac muscle 1 OS=Homo sapiens OX=9606 GN=ACTC1 PE=1 SV=1", 
"Protein SDA1 homolog OS=Homo sapiens OX=9606 GN=SDAD1 PE=1 SV=3", 
"Tyrosine-protein kinase Lck OS=Homo sapiens OX=9606 GN=LCK PE=1 SV=6", 
"Tyrosine-protein kinase Lck OS=Homo sapiens OX=9606 GN=LCK PE=1 SV=6", 
"Cyclin-dependent-like kinase 5 OS=Homo sapiens OX=9606 GN=CDK5 PE=1 SV=3", 
"Tyrosine-protein kinase Lck OS=Homo sapiens OX=9606 GN=LCK PE=1 SV=6", 
"Cyclin-dependent kinase 4 OS=Homo sapiens OX=9606 GN=CDK4 PE=1 SV=2", 
"Mitogen-activated protein kinase 7 OS=Homo sapiens OX=9606 GN=MAPK7 PE=1 SV=2", 
"Cyclin-dependent kinase 1 OS=Homo sapiens OX=9606 GN=CDK1 PE=1 SV=3", 
"Serine/threonine-protein kinase tousled-like 1 OS=Homo sapiens OX=9606 GN=TLK1 PE=1 SV=2", 
"Tyrosine-protein kinase Lck OS=Homo sapiens OX=9606 GN=LCK PE=1 SV=6", 
"Cyclin-dependent-like kinase 5 OS=Homo sapiens OX=9606 GN=CDK5 PE=1 SV=3", 
"Tyrosine-protein kinase Lck OS=Homo sapiens OX=9606 GN=LCK PE=1 SV=6", 
"ATP-binding cassette sub-family C member 8 OS=Homo sapiens OX=9606 GN=ABCC8 PE=1 SV=6", 
"Serine/threonine-protein kinase Chk2 OS=Homo sapiens OX=9606 GN=CHEK2 PE=1 SV=1", 
"Protein-tyrosine kinase 2-beta OS=Homo sapiens OX=9606 GN=PTK2B PE=1 SV=2", 
"Serine/threonine-protein kinase Chk2 OS=Homo sapiens OX=9606 GN=CHEK2 PE=1 SV=1", 
"Cyclin-dependent kinase 1 OS=Homo sapiens OX=9606 GN=CDK1 PE=1 SV=3", 
"Serine/threonine-protein kinase tousled-like 1 OS=Homo sapiens OX=9606 GN=TLK1 PE=1 SV=2", 
"Interleukin-1 receptor-associated kinase 4 OS=Homo sapiens OX=9606 GN=IRAK4 PE=1 SV=1", 
"Tyrosine-protein kinase Lck OS=Homo sapiens OX=9606 GN=LCK PE=1 SV=6", 
"Mitogen-activated protein kinase kinase kinase 7 OS=Homo sapiens OX=9606 GN=MAP3K7 PE=1 SV=1", 
"Mitogen-activated protein kinase kinase kinase 7 OS=Homo sapiens OX=9606 GN=MAP3K7 PE=1 SV=1", 
"Cyclin-dependent kinase 1 OS=Homo sapiens OX=9606 GN=CDK1 PE=1 SV=3", 
"Cyclin-dependent kinase 7 OS=Homo sapiens OX=9606 GN=CDK7 PE=1 SV=1", 
"Eukaryotic translation initiation factor 2-alpha kinase 1 OS=Homo sapiens OX=9606 GN=EIF2AK1 PE=1 SV=2", 
"Serine/threonine-protein kinase tousled-like 2 OS=Homo sapiens OX=9606 GN=TLK2 PE=1 SV=2"
), Gene.name = c("CDK1", "CDK1", "CDK1", "CDK1", "FER", "CLASP1", 
"CLASP1", "CLASP1", "CLASP1", "CLASP1", "FER", "CDK4", "HNRNPA1", 
"HNRNPA1", "FECH", "MAP2K3", "CDK5", "HNRNPA1", "ACTG1", "ACTC1", 
"CDK1", "TLK1", "ACTG1", "ACTC1", "SDAD1", "LCK", "LCK", "CDK5", 
"LCK", "CDK4", "MAPK7", "CDK1", "TLK1", "LCK", "CDK5", "LCK", 
"ABCC8", "CHEK2", "PTK2B", "CHEK2", "CDK1", "TLK1", "IRAK4", 
"LCK", "MAP3K7", "MAP3K7", "CDK1", "CDK7", "EIF2AK1", "TLK2"), 
    Sequence = c("R.HKTTGQVVAMK(982.466)K.I", "R.HKTTGQVVAMK(982.466)K.I", 
    "R.HKTTGQVVAMK(982.466)K.I", "R.HKTTGQVVAMK(982.466)K.I", 
    "K.TSVAVK(982.466)TCK.E", "R.VNALKK(982.466).I", "R.VNALKK(982.466).I", 
    "R.VNALK(982.466)K.I", "R.VNALKK(982.466).I", "R.VNALKK(982.466).I", 
    "K.TSVAVK(982.466)TCK.E", "K.ARDPHSGHFVALK(982.466)SVR.V", 
    "R.NQGGY(982.466)GGSSSSSSYGSGR.R", "R.NQGGY(982.466)GGSSSSSSYGSGR.R", 
    "R.TPK(982.466)IQEQYR.R", "R.HAQSGTIMAVK(982.466)R.I", "K.NRETHEIVALK(982.466)R.V", 
    "R.NQGGY(982.466)GGSSSSSSYGSGR.R", "K.DSY(982.466)VGDEAQSKR.G", 
    "K.DSY(982.466)VGDEAQSKR.G", "K.TTGQVVAMKK(982.466).I", "R.YAAVK(982.466)IHQLNK.S", 
    "K.DSY(982.466)VGDEAQSKR.G", "K.DSY(982.466)VGDEAQSKR.G", 
    "K.AMK(982.466)VLK.K", "K.VAVK(982.466)SLK.Q", "K.VAVK(982.466)SLK.Q", 
    "K.NRETHEIVALK(982.466)R.V", "K.VAVK(982.466)SLK.Q", "K.ARDPHSGHFVALK(982.466)SVR.V", 
    "R.LTGQQVAIKK(982.466).I", "K.TTGQVVAMKK(982.466).I", "R.YAAVK(982.466)IHQLNK.S", 
    "K.VAVK(982.466)SLK.Q", "K.NRETHEIVALK(982.466)R.V", "K.VAVK(982.466)SLK.Q", 
    "K.GIK(982.466)LLK.L", "K.KVAIK(982.466)IISK.R", "K.INVAVK(982.466)TCK.K", 
    "K.KVAIK(982.466)IISK.R", "K.TTGQVVAMKK(982.466).I", "R.YAAVK(982.466)IHQLNK.S", 
    "K.GYVNNTTVAVKK(982.466).L", "K.VAVK(982.466)SLK.Q", "R.AKDVAIK(982.466)QIESESER.K", 
    "R.AKDVAIK(982.466)QIESESER.K", "K.TTGQVVAMK(982.466)K.I", 
    "R.DKNTNQIVAIK(982.466)K.I", "R.NKLDGQYYAIK(982.466)K.I", 
    "R.YVAVK(982.466)IHQLNK.N"), `m/z_126.127725_int` = c(7328, 
    1431, 0, 0, 0, 1534, 1208, 0, 0, 0, 0, 5472, 0, 0, 0, 0, 
    3059, 0, 0, 0, 14694, 0, 0, 0, 767, 15399, 8508, 5963, 3329, 
    2850, 866, 15159, 0, 12952, 3607, 61261, 1594, 0, 0, 0, 9174, 
    0, 0, 4064, 0, 0, 4193, 3903, 0, 0), `m/z_127.12476_int` = c(22305, 
    4867, 2166, 3183, 1615, 9900, 6436, 1924, 4641, 3176, 568, 
    10705, 0, 0, 1373, 689, 11166, 0, 0, 0, 35789, 2580, 0, 0, 
    5881, 63064, 30110, 18335, 10285, 5732, 1249, 42999, 1566, 
    39681, 9785, 309388, 5557, 2419, 1038, 3424, 29050, 722, 
    1554, 12719, 1443, 0, 12181, 10057, 0, 2435), `m/z_128.134433_int` = c(38137, 
    8048, 5042, 5280, 5324, 22723, 20533, 5320, 5032, 4471, 813, 
    12294, 863, 0, 2362, 1407, 13618, 648, 733, 733, 82205, 11746, 
    1359, 1359, 7196, 166646, 75239, 23451, 30788, 6175, 4969, 
    70456, 5618, 106209, 11896, 829224, 8316, 2921, 3481, 5204, 
    51919, 1575, 6209, 39754, 4444, 3658, 24940, 32154, 1757, 
    6020), `m/z_129.131468_int` = c(44762, 7626, 6014, 8076, 
    11264, 52091, 63456, 13223, 11973, 10541, 2274, 12982, 3369, 
    1938, 5093, 7320, 16850, 3051, 4353, 4353, 83011, 25283, 
    2897, 2897, 15137, 176041, 83912, 24140, 30193, 9100, 13435, 
    81335, 9670, 105429, 15821, 819311, 12094, 7961, 5593, 13966, 
    54175, 4243, 11926, 49495, 8842, 7331, 24976, 28836, 5722, 
    14175), `m/z_130.141141_int` = c(46636, 10425, 7086, 8641, 
    11370, 85939, 81372, 18722, 22222, 17278, 2397, 16696, 4024, 
    4826, 8287, 18216, 13907, 5872, 4442, 4442, 82328, 38189, 
    4520, 4520, 22714, 182513, 80678, 25336, 33127, 10046, 25467, 
    77154, 14168, 129888, 17157, 880050, 13502, 14193, 7167, 
    20157, 48899, 7369, 16091, 46048, 12467, 10887, 27694, 21979, 
    8712, 19013), `m/z_131.138176_int` = c(49103, 9367, 9452, 
    11609, 9746, 85046, 99942, 27284, 27647, 22801, 5214, 15570, 
    13161, 12293, 17222, 38651, 16360, 15486, 11286, 11286, 80727, 
    37110, 10795, 10795, 30313, 194256, 87209, 26696, 36470, 
    13323, 36787, 70568, 13075, 128171, 16578, 805814, 18556, 
    25095, 10181, 31390, 54114, 9680, 15058, 56991, 18002, 11603, 
    26753, 17995, 17081, 22651), TMT_purity = c("0.98141234715268", 
    "0.71134850965744001", "0.76128382110317905", "0.76128382110317905", 
    "1", "0.78702255963842904", "0.78170688482709405", "0.776974521760607", 
    "1", "0.76160370785582798", "1", "0.64272765210635596", "0.90646438991621103", 
    "0.82319643556607203", "0.58148349410262401", "1", "0.95294631885274494", 
    "0.91291708141626005", "0.698686479445912", "0.698686479445912", 
    "0.96955605239368403", "0.79561280886225205", "0.64177968168606403", 
    "0.64177968168606403", "0.88734015495342999", "0.88733946625779203", 
    "0.93493384401468704", "1", "1", "0.61903261519569497", "0.73288251651566405", 
    "1", "0.88757994170849897", "0.91888430409069299", "0.94640973341271395", 
    "0.91492692770042205", "0.72126051188328899", "0.32216956233298499", 
    "1", "0.75795632756268905", "0.961165374625497", "0.56361697494671903", 
    "1", "0.84217317911923095", "0.76607291679043199", "0.94119959458560598", 
    "0.97979179200421396", "0.81295038316780099", "0.67927222063109804", 
    "0.81685860457191595"), `Signal-noise` = c(40.21, 9.04, 7.52, 
    10.02, 10.51, 39.74, 29.91, 16.62, 18.66, 14.05, 3.42, 16.39, 
    5.42, 4.65, 8.06, 16.31, 18.58, 6.21, 5.53, 5.53, 74.27, 
    23.62, 5, 5, 19.45, 110.03, 69.9, 29.31, 32.85, 11.91, 17.31, 
    24.61, 9.46, 25.76, 16.86, 26.44, 12.86, 12.31, 6.77, 14.35, 
    25.33, 5.41, 11.65, 24.88, 9.96, 7.64, 25.44, 23.6, 8.06, 
    10.43), ScanNum = c(9809, 10035, 10254, 10269, 10521, 10567, 
    10597, 10716, 10807, 10816, 11002, 11031, 11056, 11061, 11064, 
    11085, 11194, 11288, 11314, 11314, 11320, 11322, 11326, 11326, 
    11330, 11340, 11361, 11412, 11423, 11432, 11454, 11549, 11553, 
    11601, 11640, 11698, 11720, 11726, 11750, 11757, 11775, 11802, 
    11840, 11886, 11928, 11935, 11996, 12004, 12011, 12016), 
    CState = c(4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 5, 4, 4, 4, 4, 
    4, 4, 4, 4, 3, 4, 4, 4, 3, 4, 3, 4, 2, 5, 4, 3, 4, 3, 4, 
    4, 3, 4, 3, 4, 3, 4, 3, 3, 5, 4, 3, 4, 4, 4), Filename = c("Y20210222-09", 
    "Y20210222-09", "Y20210222-09", "Y20210222-09", "Y20210222-09", 
    "Y20210222-09", "Y20210222-09", "Y20210222-09", "Y20210222-09", 
    "Y20210222-09", "Y20210222-09", "Y20210222-09", "Y20210222-09", 
    "Y20210222-09", "Y20210222-09", "Y20210222-09", "Y20210222-09", 
    "Y20210222-09", "Y20210222-09", "Y20210222-09", "Y20210222-09", 
    "Y20210222-09", "Y20210222-09", "Y20210222-09", "Y20210222-09", 
    "Y20210222-09", "Y20210222-09", "Y20210222-09", "Y20210222-09", 
    "Y20210222-09", "Y20210222-09", "Y20210222-09", "Y20210222-09", 
    "Y20210222-09", "Y20210222-09", "Y20210222-09", "Y20210222-09", 
    "Y20210222-09", "Y20210222-09", "Y20210222-09", "Y20210222-09", 
    "Y20210222-09", "Y20210222-09", "Y20210222-09", "Y20210222-09", 
    "Y20210222-09", "Y20210222-09", "Y20210222-09", "Y20210222-09", 
    "Y20210222-09"), sequence1 = c("HKTTGQVVAMK(982.466)K", "HKTTGQVVAMK(982.466)K", 
    "HKTTGQVVAMK(982.466)K", "HKTTGQVVAMK(982.466)K", "TSVAVK(982.466)TCK", 
    "VNALKK(982.466)", "VNALKK(982.466)", "VNALK(982.466)K", 
    "VNALKK(982.466)", "VNALKK(982.466)", "TSVAVK(982.466)TCK", 
    "ARDPHSGHFVALK(982.466)SVR", "NQGGY(982.466)GGSSSSSSYGSGR", 
    "NQGGY(982.466)GGSSSSSSYGSGR", "TPK(982.466)IQEQYR", "HAQSGTIMAVK(982.466)R", 
    "NRETHEIVALK(982.466)R", "NQGGY(982.466)GGSSSSSSYGSGR", "DSY(982.466)VGDEAQSKR", 
    "DSY(982.466)VGDEAQSKR", "TTGQVVAMKK(982.466)", "YAAVK(982.466)IHQLNK", 
    "DSY(982.466)VGDEAQSKR", "DSY(982.466)VGDEAQSKR", "AMK(982.466)VLK", 
    "VAVK(982.466)SLK", "VAVK(982.466)SLK", "NRETHEIVALK(982.466)R", 
    "VAVK(982.466)SLK", "ARDPHSGHFVALK(982.466)SVR", "LTGQQVAIKK(982.466)", 
    "TTGQVVAMKK(982.466)", "YAAVK(982.466)IHQLNK", "VAVK(982.466)SLK", 
    "NRETHEIVALK(982.466)R", "VAVK(982.466)SLK", "GIK(982.466)LLK", 
    "KVAIK(982.466)IISK", "INVAVK(982.466)TCK", "KVAIK(982.466)IISK", 
    "TTGQVVAMKK(982.466)", "YAAVK(982.466)IHQLNK", "GYVNNTTVAVKK(982.466)", 
    "VAVK(982.466)SLK", "AKDVAIK(982.466)QIESESER", "AKDVAIK(982.466)QIESESER", 
    "TTGQVVAMK(982.466)K", "DKNTNQIVAIK(982.466)K", "NKLDGQYYAIK(982.466)K", 
    "YVAVK(982.466)IHQLNK"), Mod.or.not = c("Y", "Y", "Y", "Y", 
    "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", 
    "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", 
    "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", 
    "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y"), Mod.position.in.pep = c(11L, 
    11L, 11L, 11L, 6L, 6L, 6L, 5L, 6L, 6L, 6L, 13L, 5L, 5L, 3L, 
    11L, 11L, 5L, 3L, 3L, 10L, 5L, 3L, 3L, 3L, 4L, 4L, 11L, 4L, 
    13L, 10L, 10L, 5L, 4L, 11L, 4L, 3L, 5L, 6L, 5L, 10L, 5L, 
    12L, 4L, 7L, 7L, 9L, 11L, 11L, 5L), Mod.sequence = c("HKTTGQVVAMKK", 
    "HKTTGQVVAMKK", "HKTTGQVVAMKK", "HKTTGQVVAMKK", "TSVAVKTCK", 
    "VNALKK", "VNALKK", "VNALKK", "VNALKK", "VNALKK", "TSVAVKTCK", 
    "ARDPHSGHFVALKSVR", "NQGGYGGSSSSSSYGSGR", "NQGGYGGSSSSSSYGSGR", 
    "TPKIQEQYR", "HAQSGTIMAVKR", "NRETHEIVALKR", "NQGGYGGSSSSSSYGSGR", 
    "DSYVGDEAQSKR", "DSYVGDEAQSKR", "TTGQVVAMKK", "YAAVKIHQLNK", 
    "DSYVGDEAQSKR", "DSYVGDEAQSKR", "AMKVLK", "VAVKSLK", "VAVKSLK", 
    "NRETHEIVALKR", "VAVKSLK", "ARDPHSGHFVALKSVR", "LTGQQVAIKK", 
    "TTGQVVAMKK", "YAAVKIHQLNK", "VAVKSLK", "NRETHEIVALKR", "VAVKSLK", 
    "GIKLLK", "KVAIKIISK", "INVAVKTCK", "KVAIKIISK", "TTGQVVAMKK", 
    "YAAVKIHQLNK", "GYVNNTTVAVKK", "VAVKSLK", "AKDVAIKQIESESER", 
    "AKDVAIKQIESESER", "TTGQVVAMKK", "DKNTNQIVAIKK", "NKLDGQYYAIKK", 
    "YVAVKIHQLNK"), start.position = c(23L, 23L, 23L, 23L, 586L, 
    342L, 342L, 342L, 342L, 342L, 586L, 23L, 353L, 353L, 116L, 
    83L, 23L, 353L, 53L, 53L, 25L, 481L, 53L, 53L, 266L, 270L, 
    270L, 23L, 270L, 23L, 76L, 25L, 481L, 270L, 23L, 270L, 505L, 
    245L, 452L, 245L, 25L, 481L, 203L, 270L, 57L, 57L, 25L, 31L, 
    186L, 487L), pos.in.protein = c(33, 33, 33, 33, 591, 347, 
    347, 346, 347, 347, 591, 35, 357, 357, 118, 93, 33, 357, 
    55, 55, 34, 485, 55, 55, 268, 273, 273, 33, 273, 35, 85, 
    34, 485, 273, 33, 273, 507, 249, 457, 249, 34, 485, 214, 
    273, 63, 63, 33, 41, 196, 491), Mod.site = c("P06493_33", 
    "P06493_33", "P06493_33", "P06493_33", "P16591_591", "Q7Z460_347", 
    "Q7Z460_347", "Q7Z460_346", "Q7Z460_347", "Q7Z460_347", "P16591_591", 
    "P11802_35", "P09651_357", "P09651_357", "P22830_118", "P46734_93", 
    "Q00535_33", "P09651_357", "P63261_55", "P68032_55", "P06493_34", 
    "Q9UKI8_485", "P63261_55", "P68032_55", "Q9NVU7_268", "P06239_273", 
    "P06239_273", "Q00535_33", "P06239_273", "P11802_35", "Q13164_85", 
    "P06493_34", "Q9UKI8_485", "P06239_273", "Q00535_33", "P06239_273", 
    "Q09428_507", "O96017_249", "Q14289_457", "O96017_249", "P06493_34", 
    "Q9UKI8_485", "Q9NWZ3_214", "P06239_273", "O43318_63", "O43318_63", 
    "P06493_33", "P50613_41", "Q9BQI3_196", "Q86UE8_491")), row.names = c(NA, 
-50L), class = c("tbl_df", "tbl", "data.frame"))

例如,以下两行的 ScanNum 值相同(都是 11314)。

structure(list(UniprotID = c("P63261", "P68032"), Description = c("Actin, cytoplasmic 2 OS=Homo sapiens OX=9606 GN=ACTG1 PE=1 SV=1", 
"Actin, alpha cardiac muscle 1 OS=Homo sapiens OX=9606 GN=ACTC1 PE=1 SV=1"
), Gene.name = c("ACTG1", "ACTC1"), Sequence = c("K.DSY(982.466)VGDEAQSKR.G", 
"K.DSY(982.466)VGDEAQSKR.G"), `m/z_126.127725_int` = c(0, 0), 
    `m/z_127.12476_int` = c(0, 0), `m/z_128.134433_int` = c(733, 
    733), `m/z_129.131468_int` = c(4353, 4353), `m/z_130.141141_int` = c(4442, 
    4442), `m/z_131.138176_int` = c(11286, 11286), TMT_purity = c("0.698686479445912", 
    "0.698686479445912"), `Signal-noise` = c(5.53, 5.53), ScanNum = c(11314, 
    11314), CState = c(4, 4), Filename = c("Y20210222-09", "Y20210222-09"
    ), sequence1 = c("DSY(982.466)VGDEAQSKR", "DSY(982.466)VGDEAQSKR"
    ), Mod.or.not = c("Y", "Y"), Mod.position.in.pep = c(3L, 
    3L), Mod.sequence = c("DSYVGDEAQSKR", "DSYVGDEAQSKR"), start.position = c(53L, 
    53L), pos.in.protein = c(55, 55), Mod.site = c("P63261_55", 
    "P68032_55")), row.names = c(NA, -2L), class = c("tbl_df", 
"tbl", "data.frame"))

所以,我想将这两行合并为一行,如下所示。基本上,就是在新行中保留每一列的所有唯一值;如果某一列的值互不相同,就用分号把它们分隔开。

structure(list(X = 2L, UniprotID = "P68032;P63261", Description = "Actin, alpha cardiac muscle 1 OS=Homo sapiens OX=9606 GN=ACTC1 PE=1 SV=1", 
    Gene.name = "ACTC1", Sequence = "K.DSY(982.466)VGDEAQSKR.G", 
    m.z_126.127725_int = 0L, m.z_127.12476_int = 0L, m.z_128.134433_int = 733L, 
    m.z_129.131468_int = 4353L, m.z_130.141141_int = 4442L, m.z_131.138176_int = 11286L, 
    TMT_purity = 0.698686479, Signal.noise = 5.53, ScanNum = 11314L, 
    CState = 4L, Filename = "Y20210222-09", sequence1 = "DSY(982.466)VGDEAQSKR", 
    Mod.or.not = "Y", Mod.position.in.pep = 3L, Mod.sequence = "DSYVGDEAQSKR", 
    start.position = 53L, pos.in.protein = 55L, Mod.site = "P68032_55; P63261_55"), class = "data.frame", row.names = c(NA, 
-1L))

标签: r dataframe concatenation

解决方案


If dat2 is your second example (with 2 rows), then

library(dplyr)

# Collapse every group of rows sharing a ScanNum into a single row and print
# the structure of the result.
dat2 %>%
  group_by(ScanNum) %>%
  summarize(
    across(
      everything(),
      function(values) {
        # Non-numeric columns: join the distinct values with ";".
        if (!is.numeric(values)) {
          return(paste(unique(values), collapse = ";"))
        }
        # Numeric columns (integer or double): keep only the first value.
        values[1]
      }
    )
  ) %>%
  str()
# tibble [1 x 22] (S3: tbl_df/tbl/data.frame)
#  $ ScanNum            : num 11314
#  $ UniprotID          : chr "P63261;P68032"
#  $ Description        : chr "Actin, cytoplasmic 2 OS=Homo sapiens OX=9606 GN=ACTG1 PE=1 SV=1;Actin, alpha cardiac muscle 1 OS=Homo sapiens O"| __truncated__
#  $ Gene.name          : chr "ACTG1;ACTC1"
#  $ Sequence           : chr "K.DSY(982.466)VGDEAQSKR.G"
#  $ m/z_126.127725_int : num 0
#  $ m/z_127.12476_int  : num 0
#  $ m/z_128.134433_int : num 733
#  $ m/z_129.131468_int : num 4353
#  $ m/z_130.141141_int : num 4442
#  $ m/z_131.138176_int : num 11286
#  $ TMT_purity         : chr "0.698686479445912"
#  $ Signal-noise       : num 5.53
#  $ CState             : num 4
#  $ Filename           : chr "Y20210222-09"
#  $ sequence1          : chr "DSY(982.466)VGDEAQSKR"
#  $ Mod.or.not         : chr "Y"
#  $ Mod.position.in.pep: int 3
#  $ Mod.sequence       : chr "DSYVGDEAQSKR"
#  $ start.position     : int 53
#  $ pos.in.protein     : num 55
#  $ Mod.site           : chr "P63261_55;P68032_55"

Note that I'm grouping by ScanNum (it's what you said was duplicated), and for any column that is numeric (i.e., for which is.numeric() returns TRUE), I arbitrarily take the first value found. Strings are a little more robust in that we combine the unique values.

If you need to group by more variables, add them to the group_by() call. Be warned, though: grouping by integer is perfectly safe, but grouping by floating-point (numeric) values may pose some issues with high-precision data; for references, see "Why are these numbers not equal?", "Is floating point math broken?", and https://en.wikipedia.org/wiki/IEEE_754.


推荐阅读