首页 > 解决方案 > 将许多列乘以和除以每个组的另一列

问题描述

我有一个具有以下结构的数据框:

# A tibble: 95 x 7
# Groups:   WallReg_2p5 [19]
   CellID_2p5 Y_Coord_2p5Weighting WallReg_2p5 piC_1  piC_2 piC_3 piC_4
        <int>                <dbl> <chr>       <dbl>  <dbl> <dbl> <dbl>
 1       6561                0.915 African      6.55  6.63  5.84  0.766
 2       6278                0.947 African     15.1   5.59  2.15  2.01 
 3       4394                0.971 African     11.4   3.92  0.774 1.47 
 4       4840                0.994 African      4.70  0.962 6.21  3.54 
 5       4105                0.947 African      6.35  2.10  2.25  3.24 
 6       5228                1.000 Amazonian    8.49  5.00  1.92  2.42 
 7       5089                1.000 Amazonian   15.6   6.48  2.53  2.89 
 8       4939                0.998 Amazonian    5.56  2.94  0.389 2.44 
 9       5088                1.000 Amazonian   12.9   5.16  1.99  3.13 
10       4947                0.998 Amazonian    8.05 11.2   2.54  4.61 
# ... with 85 more rows

这是dput()数据帧的一个子集。我的真实数据集由 10,368 行和 255,611 列组成

structure(list(CellID_2p5 = c(6561L, 6278L, 4394L, 4840L, 4105L, 
5228L, 5089L, 4939L, 5088L, 4947L, 1710L, 2569L, 1438L, 1175L, 
1840L, 6888L, 7185L, 6031L, 7045L, 7044L, 3432L, 3288L, 3143L, 
3574L, 3577L, 3260L, 1959L, 2568L, 2986L, 2386L, 5551L, 5407L, 
5556L, 4979L, 5694L, 5303L, 4442L, 5587L, 5157L, 4865L, 3294L, 
3009L, 2865L, 2722L, 3151L, 6427L, 6571L, 5996L, 6570L, 6139L, 
3631L, 3920L, 3342L, 3341L, 4064L, 2617L, 2049L, 3346L, 1599L, 
3205L, 7487L, 6612L, 6613L, 7630L, 7916L, 3854L, 3561L, 4290L, 
4138L, 3704L, 4211L, 4068L, 4069L, 4357L, 4648L, 5601L, 5600L, 
5455L, 5456L, 5458L, 3978L, 3822L, 3532L, 3832L, 3834L, 7105L, 
6817L, 6104L, 7963L, 6098L, 3418L, 3424L, 3281L, 3566L, 3273L
), Y_Coord_2p5Weighting = c(0.915311479119447, 0.946930129495106, 
0.971342069813261, 0.99405633822232, 0.946930129495106, 0.999762027079909, 
0.999762027079909, 0.997858923238603, 0.999762027079909, 0.997858923238603, 
0.480988768919388, 0.691513055782269, 0.402746689858737, 0.362438038283702, 
0.518773258160522, 0.876726755707508, 0.831469612302545, 0.971342069813261, 
0.854911870672947, 0.854911870672947, 0.854911870672947, 0.831469612302545, 
0.806444604267483, 0.876726755707508, 0.876726755707508, 0.831469612302545, 
0.555570233019602, 0.691513055782269, 0.779884483092882, 0.659345815100069, 
0.99405633822232, 0.997858923238603, 0.99405633822232, 0.997858923238603, 
0.988361510467761, 0.999762027079909, 0.971342069813261, 0.99405633822232, 
0.999762027079909, 0.99405633822232, 0.831469612302545, 0.779884483092882, 
0.751839807478977, 0.722363962059756, 0.806444604267483, 0.932007869282799, 
0.915311479119447, 0.971342069813261, 0.915311479119447, 0.960049854385929, 
0.896872741532688, 0.932007869282799, 0.854911870672947, 0.854911870672947, 
0.946930129495106, 0.722363962059756, 0.591309648363582, 0.854911870672947, 
0.480988768919388, 0.831469612302545, 0.779884483092882, 0.915311479119447, 
0.915311479119447, 0.751839807478977, 0.691513055782269, 0.915311479119447, 
0.876726755707508, 0.960049854385929, 0.946930129495106, 0.896872741532688, 
0.960049854385929, 0.946930129495106, 0.946930129495106, 0.971342069813261, 
0.988361510467761, 0.99405633822232, 0.99405633822232, 0.997858923238603, 
0.997858923238603, 0.997858923238603, 0.932007869282799, 0.915311479119447, 
0.876726755707508, 0.915311479119447, 0.915311479119447, 0.831469612302545, 
0.876726755707508, 0.960049854385929, 0.659345815100069, 0.960049854385929, 
0.854911870672947, 0.854911870672947, 0.831469612302545, 0.876726755707508, 
0.831469612302545), WallReg_2p5 = c("African", "African", "African", 
"African", "African", "Amazonian", "Amazonian", "Amazonian", 
"Amazonian", "Amazonian", "Arctico-Siberian", "Arctico-Siberian", 
"Arctico-Siberian", "Arctico-Siberian", "Arctico-Siberian", "Australian", 
"Australian", "Australian", "Australian", "Australian", "Chinese", 
"Chinese", "Chinese", "Chinese", "Chinese", "Eurasian", "Eurasian", 
"Eurasian", "Eurasian", "Eurasian", "Guineo-Congolian", "Guineo-Congolian", 
"Guineo-Congolian", "Guineo-Congolian", "Guineo-Congolian", "Indo-Malayan", 
"Indo-Malayan", "Indo-Malayan", "Indo-Malayan", "Indo-Malayan", 
"Japanese", "Japanese", "Japanese", "Japanese", "Japanese", "Madagascan", 
"Madagascan", "Madagascan", "Madagascan", "Madagascan", "Mexican", 
"Mexican", "Mexican", "Mexican", "Mexican", "North American", 
"North American", "North American", "North American", "North American", 
"Novozelandic", "Novozelandic", "Novozelandic", "Novozelandic", 
"Novozelandic", "Oriental", "Oriental", "Oriental", "Oriental", 
"Oriental", "Panamanian", "Panamanian", "Panamanian", "Panamanian", 
"Panamanian", "Papua-Melanesian", "Papua-Melanesian", "Papua-Melanesian", 
"Papua-Melanesian", "Papua-Melanesian", "Saharo-Arabian", "Saharo-Arabian", 
"Saharo-Arabian", "Saharo-Arabian", "Saharo-Arabian", "South American", 
"South American", "South American", "South American", "South American", 
"Tibetan", "Tibetan", "Tibetan", "Tibetan", "Tibetan"), piC_1 = c(6.54637718200684, 
15.1273813247681, 11.4171981811523, 4.70245027542114, 6.35227298736572, 
8.48885822296143, 15.5538415908813, 5.56155681610107, 12.9046697616577, 
8.04517650604248, 2.95071268081665, 21.6441345214844, 11.2329692840576, 
16.1649322509766, 17.2905006408691, 3.43583130836487, 10.0594062805176, 
12.3438568115234, 7.94222640991211, 6.89916276931763, 7.45456171035767, 
8.77329444885254, 14.3378238677979, 3.86588025093079, 12.4889860153198, 
7.18962049484253, 19.2145137786865, 22.0060653686523, 1.86285281181335, 
2.09195709228516, 9.87592029571533, 12.2629871368408, 7.31402492523193, 
0.601671099662781, 6.9998254776001, 20.6269207000732, 6.21515369415283, 
22.039529800415, 8.35955047607422, 9.50113105773926, 7.06818675994873, 
4.63532447814941, 5.81412315368652, 0.996474027633667, 8.32744407653809, 
5.03945255279541, 0.893457889556885, 2.42736291885376, 10.3842725753784, 
3.32475543022156, 8.1105375289917, 6.61336517333984, 4.06754541397095, 
3.31069254875183, 8.05746650695801, 1.24714422225952, 6.44647121429443, 
2.97141313552856, 13.3264999389648, 4.86157178878784, 6.71903085708618, 
20.3318004608154, 20.8287792205811, 10.0042209625244, 12.7859420776367, 
13.6358938217163, 15.9491415023804, 11.4823551177979, 18.6053276062012, 
16.6047229766846, 16.1496143341064, 2.9492039680481, 13.8130388259888, 
18.6300754547119, 14.464674949646, 4.92032289505005, 0.511945068836212, 
3.16324853897095, 13.3062620162964, 9.84803581237793, 1.74625515937805, 
2.54861640930176, 9.97869968414307, 11.2339553833008, 0.865878522396088, 
14.7632684707642, 21.8330593109131, 6.42118740081787, 9.51691722869873, 
13.2857227325439, 4.01672554016113, 10.9487056732178, 13.6308097839355, 
4.69979858398438, 1.83490359783173), piC_2 = c(6.62732124328613, 
5.59194660186768, 3.92186212539673, 0.962285339832306, 2.1002824306488, 
4.99801731109619, 6.4822793006897, 2.94481801986694, 5.16082000732422, 
11.2070302963257, 0.585842967033386, 4.83236265182495, 1.637331366539, 
7.65087461471558, 2.28347945213318, 7.16115474700928, 3.54162955284119, 
5.23653078079224, 2.28897953033447, 2.29887819290161, 0.752622723579407, 
0.653791189193726, 1.5378258228302, 2.15203213691711, 1.64702248573303, 
6.0682373046875, 0.22119003534317, 4.76900386810303, 0.366481363773346, 
6.11435651779175, 10.8921070098877, 7.97591733932495, 6.05282688140869, 
3.74584698677063, 5.75792741775513, 0.471727430820465, 2.75132250785828, 
1.21862363815308, 0.138835281133652, 2.98711204528809, 0.627980709075928, 
0.108154557645321, 0.995486855506897, 2.4163064956665, 0.0193456951528788, 
5.70003795623779, 5.56746625900269, 2.9861011505127, 0.344279021024704, 
0.640789806842804, 9.4457426071167, 7.05727958679199, 3.89853048324585, 
0.340702921152115, 1.17963445186615, 8.93050575256348, 14.796028137207, 
4.88054323196411, 9.28642845153809, 7.68382120132446, 2.27267980575562, 
0.916118919849396, 0.689630210399628, 0.549197673797607, 1.68408465385437, 
1.76007652282715, 3.2269868850708, 0.980833470821381, 5.00142002105713, 
3.41616177558899, 6.74930334091187, 12.0952653884888, 15.2918863296509, 
0.105648428201675, 4.59846162796021, 1.48986113071442, 5.02905178070068, 
5.07208204269409, 4.98251914978027, 4.70810985565186, 2.37468719482422, 
6.78730487823486, 6.18559217453003, 11.6090707778931, 2.91017484664917, 
3.51590204238892, 3.35987615585327, 8.74919319152832, 2.23059439659119, 
0.292922139167786, 5.41262531280518, 8.86936473846436, 8.20160961151123, 
7.33296489715576, 8.42716407775879), piC_3 = c(5.84101867675781, 
2.14856338500977, 0.774434208869934, 6.21446466445923, 2.25056719779968, 
1.9200998544693, 2.52935075759888, 0.38894659280777, 1.98762917518616, 
2.53701376914978, 6.93642854690552, 0.608367025852203, 4.7472562789917, 
1.25435817241669, 4.09390258789062, 5.41882562637329, 0.221905186772346, 
3.72868466377258, 0.763698220252991, 0.783569753170013, 8.32380294799805, 
4.482017993927, 2.38237118721008, 10.7143220901489, 10.1253957748413, 
4.51582384109497, 5.18871164321899, 1.76670265197754, 7.50785446166992, 
6.2304630279541, 8.79040622711182, 7.47595691680908, 1.57976567745209, 
1.46996772289276, 0.894773840904236, 1.30858862400055, 7.34649181365967, 
1.41060519218445, 2.03947067260742, 4.6038031578064, 4.44245910644531, 
0.236538723111153, 0.194929093122482, 0.684483885765076, 0.530747056007385, 
1.89696133136749, 1.94861626625061, 3.36041831970215, 0.0835498198866844, 
2.04665040969849, 7.02379274368286, 2.93551588058472, 5.33355855941772, 
1.59516668319702, 2.19099020957947, 2.88170146942139, 7.42911052703857, 
4.64155960083008, 2.24829292297363, 3.64715957641602, 0.363596022129059, 
1.41882479190826, 0.474381387233734, 2.24125337600708, 4.11492681503296, 
3.44695138931274, 3.08158445358276, 0.218709617853165, 2.44625425338745, 
1.71628797054291, 1.75634157657623, 4.76044988632202, 0.387977868318558, 
1.70636379718781, 1.70855867862701, 3.67641615867615, 0.744896650314331, 
1.09648311138153, 1.37377882003784, 0.200171306729317, 1.4753475189209, 
6.56762170791626, 7.72892284393311, 2.18395304679871, 0.481256455183029, 
0.37385630607605, 4.25140476226807, 6.76727914810181, 4.81376981735229, 
3.8882269859314, 2.90145373344421, 7.48540449142456, 9.90997123718262, 
4.46362543106079, 5.19004011154175), piC_4 = c(0.765519082546234, 
2.01459360122681, 1.4724348783493, 3.53503012657166, 3.23746180534363, 
2.42439723014832, 2.89345812797546, 2.43676805496216, 3.13469624519348, 
4.61154937744141, 4.51843070983887, 0.767921149730682, 5.01102733612061, 
2.94891023635864, 5.20972728729248, 1.1311411857605, 2.22004199028015, 
3.79573369026184, 0.551535904407501, 0.574182093143463, 5.87988710403442, 
5.06349992752075, 3.72144675254822, 8.49415874481201, 4.27884483337402, 
2.48057842254639, 4.45665884017944, 0.667030334472656, 6.93020153045654, 
2.26927351951599, 1.5674192905426, 3.63813829421997, 2.73822736740112, 
0.674351632595062, 1.89532685279846, 4.79139471054077, 1.34277474880219, 
0.564522683620453, 3.33897042274475, 1.42253696918488, 2.7286331653595, 
0.960368096828461, 2.00121903419495, 4.58775472640991, 2.11190366744995, 
0.29313051700592, 0.0706640183925629, 2.87113666534424, 1.36242246627808, 
3.57689785957336, 2.05132532119751, 0.340487778186798, 1.3506361246109, 
0.400035679340363, 1.65728294849396, 5.17583227157593, 6.23331356048584, 
1.60608506202698, 6.12336874008179, 0.46411395072937, 0.205161795020103, 
1.93029391765594, 2.6833176612854, 0.199026927351952, 0.0609574876725674, 
1.12770354747772, 1.49503016471863, 0.299944281578064, 0.302427768707275, 
0.745285212993622, 2.91650176048279, 4.18865776062012, 2.71514081954956, 
1.93356776237488, 1.67894613742828, 1.67655885219574, 3.09425163269043, 
2.87126135826111, 2.42724895477295, 5.48751878738403, 3.4703311920166, 
3.71456289291382, 4.29666662216187, 3.37810254096985, 3.07785415649414, 
1.90873026847839, 3.57397627830505, 0.902793109416962, 3.96058869361877, 
0.35958793759346, 2.9896719455719, 1.81924939155579, 4.22445392608643, 
2.22684979438782, 4.53710412979126)), row.names = c(NA, -95L), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"), .Names = c("CellID_2p5", "Y_Coord_2p5Weighting", 
"WallReg_2p5", "piC_1", "piC_2", "piC_3", "piC_4"), vars = "WallReg_2p5", drop = TRUE, indices = list(
    0:4, 5:9, 10:14, 15:19, 20:24, 25:29, 30:34, 35:39, 40:44, 
    45:49, 50:54, 55:59, 60:64, 65:69, 70:74, 75:79, 80:84, 85:89, 
    90:94), group_sizes = c(5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), biggest_group_size = 5L, labels = structure(list(
    WallReg_2p5 = c("African", "Amazonian", "Arctico-Siberian", 
    "Australian", "Chinese", "Eurasian", "Guineo-Congolian", 
    "Indo-Malayan", "Japanese", "Madagascan", "Mexican", "North American", 
    "Novozelandic", "Oriental", "Panamanian", "Papua-Melanesian", 
    "Saharo-Arabian", "South American", "Tibetan")), row.names = c(NA, 
-19L), class = "data.frame", vars = "WallReg_2p5", drop = TRUE, .Names = "WallReg_2p5"))

我要做的是piC_为每个区域生成所有列的加权值。每列 ( x) 的过程涉及 3 个步骤:

  1. 将列中的每一行乘以中piC_x的值Y_Coord_2p5Weighting
  2. piC_x对每个WallReg_2p5组内的加权值求和
  3. piC_x将总和值除以Y_Coord_2p5Weighting每个WallReg_2p5组的值的总和

经过一番阅读,在大型数据集上似乎data.table比. 我都尝试过,但是在使用 时得到了不正确的结果,而且我担心将其应用于完整数据帧时的速度。这是我到目前为止尝试过的dplyrrdata.tabledplyr

dplyr

df <- df %>% tbl_df() %>% 
  group_by(WallReg_2p5) %>% 
  summarise(meanS = mean(piC_1), minS = min(piC_1), maxS = max(piC_1))

# A tibble: 19 x 4
   WallReg_2p5      meanS   minS  maxS
   <chr>            <dbl>  <dbl> <dbl>
 1 African           8.83  4.70  15.1 
 2 Amazonian        10.1   5.56  15.6 
 3 Arctico-Siberian 13.9   2.95  21.6 
 4 Australian        8.14  3.44  12.3 
 5 Chinese           9.38  3.87  14.3 
 6 Eurasian         10.5   1.86  22.0 
 7 Guineo-Congolian  7.41  0.602 12.3 
 8 Indo-Malayan     13.3   6.22  22.0 
 9 Japanese          5.37  0.996  8.33
10 Madagascan        4.41  0.893 10.4 
11 Mexican           6.03  3.31   8.11
12 North American    5.77  1.25  13.3 
13 Novozelandic     14.1   6.72  20.8 
14 Oriental         15.3  11.5   18.6 
15 Panamanian       13.2   2.95  18.6 
16 Papua-Melanesian  6.35  0.512 13.3 
17 Saharo-Arabian    5.27  0.866 11.2 
18 South American   13.2   6.42  21.8 
19 Tibetan           7.03  1.83  13.6 

weighted <- df %>%
  mutate_at(.funs = funs(.*Y_Coord_2p5Weighting), .vars = vars(starts_with("piC_"))) %>% ## multiply by lat weight
  mutate_at(.funs = funs(sum), .vars = vars(starts_with("piC_"))) %>% ## sum the weighted values
  mutate_at(.funs = funs(./sum(Y_Coord_2p5Weighting)), .vars = vars(starts_with("piC_"))) ## divide weighted values by sum of weights

weighted %>% tbl_df %>% group_by(WallReg_2p5) %>% summarise(meanS = mean(piC_1), minS = min(piC_1), maxS = max(piC_1))

# A tibble: 19 x 4
   WallReg_2p5      meanS  minS  maxS
   <chr>            <dbl> <dbl> <dbl>
 1 African           8.82  8.82  8.82
 2 Amazonian        10.1  10.1  10.1 
 3 Arctico-Siberian 14.5  14.5  14.5 
 4 Australian        8.21  8.21  8.21
 5 Chinese           9.32  9.32  9.32
 6 Eurasian          9.86  9.86  9.86
 7 Guineo-Congolian  7.41  7.41  7.41
 8 Indo-Malayan     13.4  13.4  13.4 
 9 Japanese          5.47  5.47  5.47
10 Madagascan        4.38  4.38  4.38
11 Mexican           6.10  6.10  6.10
12 North American    5.09  5.09  5.09
13 Novozelandic     14.6  14.6  14.6 
14 Oriental         15.2  15.2  15.2 
15 Panamanian       13.2  13.2  13.2 
16 Papua-Melanesian  6.36  6.36  6.36
17 Saharo-Arabian    5.22  5.22  5.22
18 South American   13.2  13.2  13.2 
19 Tibetan           7.01  7.01  7.01

使用dplyr我得到正确的值。但是,当我使用时,data.table我得到不正确的值。我的代码基于这里的问题,但显然我做错了什么。

数据表

df <- df %>% group_by(WallReg_2p5) %>%
  as.data.table(.) %>% setkey(., WallReg_2p5)
is.data.table(df); haskey(df)
[1] TRUE
[1] TRUE

## same as above
df %>% tbl_df %>% group_by(WallReg_2p5) %>% 
  summarise(meanS = mean(piC_1), minS = min(piC_1), maxS = max(piC_1))

# A tibble: 19 x 4
   WallReg_2p5      meanS   minS  maxS
   <chr>            <dbl>  <dbl> <dbl>
 1 African           8.83  4.70  15.1 
 2 Amazonian        10.1   5.56  15.6 
 3 Arctico-Siberian 13.9   2.95  21.6 
 4 Australian        8.14  3.44  12.3 
 5 Chinese           9.38  3.87  14.3 
 6 Eurasian         10.5   1.86  22.0 
 7 Guineo-Congolian  7.41  0.602 12.3 
 8 Indo-Malayan     13.3   6.22  22.0 
 9 Japanese          5.37  0.996  8.33
10 Madagascan        4.41  0.893 10.4 
11 Mexican           6.03  3.31   8.11
12 North American    5.77  1.25  13.3 
13 Novozelandic     14.1   6.72  20.8 
14 Oriental         15.3  11.5   18.6 
15 Panamanian       13.2   2.95  18.6 
16 Papua-Melanesian  6.35  0.512 13.3 
17 Saharo-Arabian    5.27  0.866 11.2 
18 South American   13.2   6.42  21.8 
19 Tibetan           7.03  1.83  13.6 

# https://stackoverflow.com/q/28123098/1710632
indx <- grep("piC_", colnames(df))
for (j in indx) {
  set(df, i = NULL, j = j, value = df[[j]]*df[["Y_Coord_2p5Weighting"]]) ## multiply by weights
  set(df, i = NULL, j = j, value = sum(df[[j]])) ## sum the weighted values
  set(df, i = NULL, j = j, value = df[[j]]/sum(df[["Y_Coord_2p5Weighting"]])) ## divide by sum of weights
}
## wrong values
df %>% tbl_df %>% group_by(WallReg_2p5) %>%
  summarise(meanS = mean(piC_1), minS = min(piC_1), maxS = max(piC_1))

# A tibble: 19 x 4
   WallReg_2p5      meanS  minS  maxS
   <chr>            <dbl> <dbl> <dbl>
 1 African           9.27  9.27  9.27
 2 Amazonian         9.27  9.27  9.27
 3 Arctico-Siberian  9.27  9.27  9.27
 4 Australian        9.27  9.27  9.27
 5 Chinese           9.27  9.27  9.27
 6 Eurasian          9.27  9.27  9.27
 7 Guineo-Congolian  9.27  9.27  9.27
 8 Indo-Malayan      9.27  9.27  9.27
 9 Japanese          9.27  9.27  9.27
10 Madagascan        9.27  9.27  9.27
11 Mexican           9.27  9.27  9.27
12 North American    9.27  9.27  9.27
13 Novozelandic      9.27  9.27  9.27
14 Oriental          9.27  9.27  9.27
15 Panamanian        9.27  9.27  9.27
16 Papua-Melanesian  9.27  9.27  9.27
17 Saharo-Arabian    9.27  9.27  9.27
18 South American    9.27  9.27  9.27
19 Tibetan           9.27  9.27  9.27

Reading?set()声明它不能执行分组操作,但我认为由于我已经定义了我的组,所以这个过程可以工作。我以前从未使用data.table过,所以任何指导将不胜感激。

标签: rdplyrdata.table

解决方案


推荐阅读