r - R在对另一列的完整组合进行分组后,创建具有比例列的汇总数据框
问题描述
这是我正在使用的数据框的屏幕截图。
我在这里也包括了 dput(head(df, n = 50))。
structure(list(conversion_hash_id = c("0001cd061567445d62e91e9d5c28c004",
"0001cd061567445d62e91e9d5c28c004", "0003bc99a5039e664fd71dab54d7d533",
"0003c6d107ae3dc12174eee321ee4589", "0006ba6c297c921e745ebe624c61b3c2",
"0007dbcbc304375c09d6cd650a144cd1", "00093c82a4a8fb0827833960fabb5e1a",
"000a508ea789bbad8e9e832bb1c2b787", "000d383e749beb47f8c93ec3a05a1b4a",
"00106ff98badafd53709fbb617052bb6", "0010a4d7e4f77d5c0ce684499a529ce2",
"00131c89636a6d0f37ae93fe6da4144e", "001dcc2ff0655c7dd8243b86ad461cff",
"002097d1eb159f92b43a6dc0510d9b08", "0020c1e9b159733d2d3ba9912c795382",
"0020c1e9b159733d2d3ba9912c795382", "00212acae3e183252078bdfd2c3963e3",
"00212acae3e183252078bdfd2c3963e3", "0023ff241f1c71743146300098021297",
"002f421d2a3ad29afd9cb15807ce1f0f", "00316ec40c99e9d8b53b4834386b0476",
"003593dce8e41b1a2a544b16196cc6a1", "00389f0ef15d13e4ec2eec7a1ac03240",
"003c69e077ba1ccbf051a229f5fc627a", "00415d4cd157e3f256976d0e9f5dab19",
"0041723c94fa240eea4e6245513e7213", "0043e1fd2c4ef8c1da1959a3f4dd0362",
"0043e1fd2c4ef8c1da1959a3f4dd0362", "004a2bcbef69fb0d334b675583cdb873",
"004df76075f83c1c6de314ff785aa0d7", "004df76075f83c1c6de314ff785aa0d7",
"005090f2bf5796a2cbe693e43e70d653", "0051d01a74e467cb91d779ed064f7dda",
"0052ef5a56f4a24011abe9a1a7242f49", "00592654753f3ef738a6f93227f3dd61",
"005af6fca539bb39a4c88713b0c9df2b", "005d62d75f8ecf29e7a3c8571b6f9bf6",
"005de06981b347a4af1f104fb3c7c6f2", "005dea5fc43528e8a4f00d172308cbea",
"0060afd14a7edd637d99ded56429ab4a", "00669dae25369a8655199c54aec4c229",
"00679670a8f30a3b59b9dd803c2aa48c", "006a9b088b768bbabb6d77cd6b44f763",
"006af62acaac8b6d8b620a170f09e71b", "006c137bd6884d985e2e0724c45ad2ab",
"006c83b0f6c0a8e3773a63c85f34c3c7", "006fc42950d4a610baaef0f66ded0819",
"00716fcf53da8f347571e387e1beae6d", "00716fcf53da8f347571e387e1beae6d",
"0073cee456597683aaecc9434a18eda3"), tier_1 = c("OTT", "Paid Search",
"OTT", "Email", "Paid Search", "Paid Search", "Email", "Paid Social",
"Paid Social", "Paid Search", "Direct", "Email", "Paid Search",
"Paid Social", "Paid Search", "Paid Social", "Organic Search",
"Paid Search", "Paid Social", "Organic Search", "Direct", "Paid Search",
"Paid Video", "Direct", "Email", "Paid Search", "Affiliate",
"Email", "Paid Search", "OTT", "Paid Search", "Paid Search",
"Email", "Direct", "Direct", "Paid Search", "Organic Search",
"Paid Search", "Paid Search", "Paid Search", "Paid Social", "Direct",
"Organic Search", "Paid Search", "Email", "Paid Search", "Paid Social",
"Paid Search", "Paid Social", "Organic Social"), normalized = c(0.4287711836,
0.5712288164, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0.7237434946,
0.2762565054, 0.3000477093, 0.6999522907, 1, 1, 1, 1, 1, 1, 1,
1, 0.5447557896, 0.4552442104, 1, 0.3493884477, 0.6506115523,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0.700741097,
0.299258903, 0.5210938212)), row.names = c(NA, -50L), groups = structure(list(
conversion_hash_id = c("0001cd061567445d62e91e9d5c28c004",
"0003bc99a5039e664fd71dab54d7d533", "0003c6d107ae3dc12174eee321ee4589",
"0006ba6c297c921e745ebe624c61b3c2", "0007dbcbc304375c09d6cd650a144cd1",
"00093c82a4a8fb0827833960fabb5e1a", "000a508ea789bbad8e9e832bb1c2b787",
"000d383e749beb47f8c93ec3a05a1b4a", "00106ff98badafd53709fbb617052bb6",
"0010a4d7e4f77d5c0ce684499a529ce2", "00131c89636a6d0f37ae93fe6da4144e",
"001dcc2ff0655c7dd8243b86ad461cff", "002097d1eb159f92b43a6dc0510d9b08",
"0020c1e9b159733d2d3ba9912c795382", "00212acae3e183252078bdfd2c3963e3",
"0023ff241f1c71743146300098021297", "002f421d2a3ad29afd9cb15807ce1f0f",
"00316ec40c99e9d8b53b4834386b0476", "003593dce8e41b1a2a544b16196cc6a1",
"00389f0ef15d13e4ec2eec7a1ac03240", "003c69e077ba1ccbf051a229f5fc627a",
"00415d4cd157e3f256976d0e9f5dab19", "0041723c94fa240eea4e6245513e7213",
"0043e1fd2c4ef8c1da1959a3f4dd0362", "004a2bcbef69fb0d334b675583cdb873",
"004df76075f83c1c6de314ff785aa0d7", "005090f2bf5796a2cbe693e43e70d653",
"0051d01a74e467cb91d779ed064f7dda", "0052ef5a56f4a24011abe9a1a7242f49",
"00592654753f3ef738a6f93227f3dd61", "005af6fca539bb39a4c88713b0c9df2b",
"005d62d75f8ecf29e7a3c8571b6f9bf6", "005de06981b347a4af1f104fb3c7c6f2",
"005dea5fc43528e8a4f00d172308cbea", "0060afd14a7edd637d99ded56429ab4a",
"00669dae25369a8655199c54aec4c229", "00679670a8f30a3b59b9dd803c2aa48c",
"006a9b088b768bbabb6d77cd6b44f763", "006af62acaac8b6d8b620a170f09e71b",
"006c137bd6884d985e2e0724c45ad2ab", "006c83b0f6c0a8e3773a63c85f34c3c7",
"006fc42950d4a610baaef0f66ded0819", "00716fcf53da8f347571e387e1beae6d",
"0073cee456597683aaecc9434a18eda3"), .rows = structure(list(
1:2, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L,
14L, 15:16, 17:18, 19L, 20L, 21L, 22L, 23L, 24L, 25L,
26L, 27:28, 29L, 30:31, 32L, 33L, 34L, 35L, 36L, 37L,
38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48:49,
50L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -44L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
我需要创建一个包含一些我不知道如何创建的逻辑的摘要数据框。
我需要做的是将属于完全相同的 tier_1 类别集的每个 conversion_hash_id 分组并计算出现的总数。
例如,如果有 30 个不同的 conversion_hash_ids 彼此不同,但每个都列出两次,一次用于 OTT,一次用于付费搜索,那么我需要计算该组合出现的次数。
这是新数据框的样子。这些只是几个案例的数字。也许决赛桌的结构可能还有其他可能性。我没有在这里列出所有案例。
group count
OTT, Paid Search 30
OTT, Paid Social 25
解决方案
按 'conversion_hash_id' 分组后,将'tier_1'summarise
的sort
ed值通过ing ( - )unique
作为单个字符串,然后获取'group' 列的paste
toString
paste(..., collapse=", ")
count
library(dplyr)
df %>%
group_by(conversion_hash_id) %>%
summarise(group = toString(sort(unique(tier_1)))) %>%
count(group)
推荐阅读
- python - 将浮点数舍入到与乘数相同的小数
- bash - ld:OS X 上的未知选项 -melf_i386
- reactjs - React Router 如何持久化状态对象?
- ffmpeg - 如何使用 FFmpeg 将网络摄像头直播到 YouTube?
- css - 无法使用 BS4 找到 CSS 选择器
- python - Python 将函数误认为是未定义的变量
- soapui - 使用财产转移时如何去除括号?
- c# - Visual Studio for Mac:日志在项目上工作并在 Azure 上显示,但在同一解决方案的另一个项目上失败
- python - 前向填充熊猫中的季节性数据
- java - java - 如何拥有资源文件