首页 > 解决方案 > R在对另一列的完整组合进行分组后,创建具有比例列的汇总数据框

问题描述

这是我正在使用的数据框的屏幕截图。

我在这里也包括了 dput(head(df, n = 50))。

在此处输入图像描述

structure(list(conversion_hash_id = c("0001cd061567445d62e91e9d5c28c004", 
"0001cd061567445d62e91e9d5c28c004", "0003bc99a5039e664fd71dab54d7d533", 
"0003c6d107ae3dc12174eee321ee4589", "0006ba6c297c921e745ebe624c61b3c2", 
"0007dbcbc304375c09d6cd650a144cd1", "00093c82a4a8fb0827833960fabb5e1a", 
"000a508ea789bbad8e9e832bb1c2b787", "000d383e749beb47f8c93ec3a05a1b4a", 
"00106ff98badafd53709fbb617052bb6", "0010a4d7e4f77d5c0ce684499a529ce2", 
"00131c89636a6d0f37ae93fe6da4144e", "001dcc2ff0655c7dd8243b86ad461cff", 
"002097d1eb159f92b43a6dc0510d9b08", "0020c1e9b159733d2d3ba9912c795382", 
"0020c1e9b159733d2d3ba9912c795382", "00212acae3e183252078bdfd2c3963e3", 
"00212acae3e183252078bdfd2c3963e3", "0023ff241f1c71743146300098021297", 
"002f421d2a3ad29afd9cb15807ce1f0f", "00316ec40c99e9d8b53b4834386b0476", 
"003593dce8e41b1a2a544b16196cc6a1", "00389f0ef15d13e4ec2eec7a1ac03240", 
"003c69e077ba1ccbf051a229f5fc627a", "00415d4cd157e3f256976d0e9f5dab19", 
"0041723c94fa240eea4e6245513e7213", "0043e1fd2c4ef8c1da1959a3f4dd0362", 
"0043e1fd2c4ef8c1da1959a3f4dd0362", "004a2bcbef69fb0d334b675583cdb873", 
"004df76075f83c1c6de314ff785aa0d7", "004df76075f83c1c6de314ff785aa0d7", 
"005090f2bf5796a2cbe693e43e70d653", "0051d01a74e467cb91d779ed064f7dda", 
"0052ef5a56f4a24011abe9a1a7242f49", "00592654753f3ef738a6f93227f3dd61", 
"005af6fca539bb39a4c88713b0c9df2b", "005d62d75f8ecf29e7a3c8571b6f9bf6", 
"005de06981b347a4af1f104fb3c7c6f2", "005dea5fc43528e8a4f00d172308cbea", 
"0060afd14a7edd637d99ded56429ab4a", "00669dae25369a8655199c54aec4c229", 
"00679670a8f30a3b59b9dd803c2aa48c", "006a9b088b768bbabb6d77cd6b44f763", 
"006af62acaac8b6d8b620a170f09e71b", "006c137bd6884d985e2e0724c45ad2ab", 
"006c83b0f6c0a8e3773a63c85f34c3c7", "006fc42950d4a610baaef0f66ded0819", 
"00716fcf53da8f347571e387e1beae6d", "00716fcf53da8f347571e387e1beae6d", 
"0073cee456597683aaecc9434a18eda3"), tier_1 = c("OTT", "Paid Search", 
"OTT", "Email", "Paid Search", "Paid Search", "Email", "Paid Social", 
"Paid Social", "Paid Search", "Direct", "Email", "Paid Search", 
"Paid Social", "Paid Search", "Paid Social", "Organic Search", 
"Paid Search", "Paid Social", "Organic Search", "Direct", "Paid Search", 
"Paid Video", "Direct", "Email", "Paid Search", "Affiliate", 
"Email", "Paid Search", "OTT", "Paid Search", "Paid Search", 
"Email", "Direct", "Direct", "Paid Search", "Organic Search", 
"Paid Search", "Paid Search", "Paid Search", "Paid Social", "Direct", 
"Organic Search", "Paid Search", "Email", "Paid Search", "Paid Social", 
"Paid Search", "Paid Social", "Organic Social"), normalized = c(0.4287711836, 
0.5712288164, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0.7237434946, 
0.2762565054, 0.3000477093, 0.6999522907, 1, 1, 1, 1, 1, 1, 1, 
1, 0.5447557896, 0.4552442104, 1, 0.3493884477, 0.6506115523, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0.700741097, 
0.299258903, 0.5210938212)), row.names = c(NA, -50L), groups = structure(list(
    conversion_hash_id = c("0001cd061567445d62e91e9d5c28c004", 
    "0003bc99a5039e664fd71dab54d7d533", "0003c6d107ae3dc12174eee321ee4589", 
    "0006ba6c297c921e745ebe624c61b3c2", "0007dbcbc304375c09d6cd650a144cd1", 
    "00093c82a4a8fb0827833960fabb5e1a", "000a508ea789bbad8e9e832bb1c2b787", 
    "000d383e749beb47f8c93ec3a05a1b4a", "00106ff98badafd53709fbb617052bb6", 
    "0010a4d7e4f77d5c0ce684499a529ce2", "00131c89636a6d0f37ae93fe6da4144e", 
    "001dcc2ff0655c7dd8243b86ad461cff", "002097d1eb159f92b43a6dc0510d9b08", 
    "0020c1e9b159733d2d3ba9912c795382", "00212acae3e183252078bdfd2c3963e3", 
    "0023ff241f1c71743146300098021297", "002f421d2a3ad29afd9cb15807ce1f0f", 
    "00316ec40c99e9d8b53b4834386b0476", "003593dce8e41b1a2a544b16196cc6a1", 
    "00389f0ef15d13e4ec2eec7a1ac03240", "003c69e077ba1ccbf051a229f5fc627a", 
    "00415d4cd157e3f256976d0e9f5dab19", "0041723c94fa240eea4e6245513e7213", 
    "0043e1fd2c4ef8c1da1959a3f4dd0362", "004a2bcbef69fb0d334b675583cdb873", 
    "004df76075f83c1c6de314ff785aa0d7", "005090f2bf5796a2cbe693e43e70d653", 
    "0051d01a74e467cb91d779ed064f7dda", "0052ef5a56f4a24011abe9a1a7242f49", 
    "00592654753f3ef738a6f93227f3dd61", "005af6fca539bb39a4c88713b0c9df2b", 
    "005d62d75f8ecf29e7a3c8571b6f9bf6", "005de06981b347a4af1f104fb3c7c6f2", 
    "005dea5fc43528e8a4f00d172308cbea", "0060afd14a7edd637d99ded56429ab4a", 
    "00669dae25369a8655199c54aec4c229", "00679670a8f30a3b59b9dd803c2aa48c", 
    "006a9b088b768bbabb6d77cd6b44f763", "006af62acaac8b6d8b620a170f09e71b", 
    "006c137bd6884d985e2e0724c45ad2ab", "006c83b0f6c0a8e3773a63c85f34c3c7", 
    "006fc42950d4a610baaef0f66ded0819", "00716fcf53da8f347571e387e1beae6d", 
    "0073cee456597683aaecc9434a18eda3"), .rows = structure(list(
        1:2, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 
        14L, 15:16, 17:18, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 
        26L, 27:28, 29L, 30:31, 32L, 33L, 34L, 35L, 36L, 37L, 
        38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48:49, 
        50L), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), row.names = c(NA, -44L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))

我需要创建一个包含一些我不知道如何创建的逻辑的摘要数据框。

我需要做的是将属于完全相同的 tier_1 类别集的每个 conversion_hash_id 分组并计算出现的总数。

例如,如果有 30 个不同的 conversion_hash_ids 彼此不同,但每个都列出两次,一次用于 OTT,一次用于付费搜索,那么我需要计算该组合出现的次数。

这是新数据框的样子。这些只是几个案例的数字。也许决赛桌的结构可能还有其他可能性。我没有在这里列出所有案例。

group                   count
OTT, Paid Search         30 
OTT, Paid Social         25

标签: rtidy

解决方案


按 'conversion_hash_id' 分组后,将'tier_1'summarisesorted值通过ing ( - )unique作为单个字符串,然后获取'group' 列的pastetoStringpaste(..., collapse=", ")count

library(dplyr)
df %>%
    group_by(conversion_hash_id) %>%
    summarise(group = toString(sort(unique(tier_1)))) %>%
    count(group)

推荐阅读