r - R:如何对数据进行分组并在数据框中的不同组内分配因子级别?
问题描述
# Example data for the question, written as a literal data.frame (this looks
# like dput() output): 95 rows (row.names = c(NA, -95L)) and six columns --
# drug, dose, drug.dose, combo, cluster and dosage.
# NOTE(review): the expression is not assigned to a name here; the solution
# code further down reads a data frame called `df`, presumably this one.
structure(list(drug = c("Chlorambucil", "Fludarabine", "FludarabineMafosfamide",
"NDI031301", "CMPB", "Tofacitinib", "Peficitinib", "FludarabineMafosfamide",
"PDB", "Filgotinib", "Dexamethasone", "CMPA", "Lenalidomide",
"Dexamethasone", "Gandotinib", "NDI031301", "Filgotinib", "PDB",
"CMPB", "Ruxolitinib", "CC122", "Atovaquone", "CC122", "SAR20347",
"Momelotinib", "Momelotinib", "Tofacitinib", "Fludarabine", "Fludarabine",
"Cerdulatinib", "Lenalidomide", "Atovaquone", "Chlorambucil",
"CMPA", "FludarabineMafosfamide", "FludarabineMafosfamide", "Fludarabine",
"Atovaquone", "Momelotinib", "PDB", "Filgotinib", "Chlorambucil",
"Dexamethasone", "Tofacitinib", "SAR20347", "CMPB", "Momelotinib",
"Fludarabine", "Cerdulatinib", "Peficitinib", "Atovaquone", "CC122",
"CMPA", "NDI031301", "PDB", "CMPA", "Lenalidomide", "SAR20347",
"Tofacitinib", "Gandotinib", "Lenalidomide", "Peficitinib", "CMPB",
"CC122", "Dexamethasone", "FludarabineMafosfamide", "Ruxolitinib",
"CMPB", "Peficitinib", "Tofacitinib", "FludarabineMafosfamide",
"Filgotinib", "Dexamethasone", "CMPA", "Dexamethasone", "Gandotinib",
"NDI031301", "Filgotinib", "SAR20347", "CMPB", "Ruxolitinib",
"Peficitinib", "Atovaquone", "CC122", "SAR20347", "Momelotinib",
"Momelotinib", "Tofacitinib", "Fludarabine", "Fludarabine", "Cerdulatinib",
"Atovaquone", "Chlorambucil", "CMPA", "NDI031301"), dose = c(1,
1, 10, 1, 0.1, 1, 1, 1, 100, 1, 10, 1, 10, 100, 1, 10, 10, 10,
1, 1, 0.1, 3, 1, 1, 1, 0.1, 10, 1, 10, 1, 1, 30, 30, 0.1, 0.01,
0.1, 0.01, 0.3, 0.001, 1, 0.01, 0.3, 0.1, 0.01, 0.1, 0.001, 0.01,
0.1, 0.01, 0.1, 0.03, 0.01, 0.01, 0.01, 0.1, 0.001, 0.01, 0.01,
0.1, 0.01, 0.1, 0.01, 0.01, 0.001, 1, 10, 10, 0.1, 1, 1, 1, 1,
10, 1, 100, 1, 10, 10, 10, 1, 1, 10, 3, 1, 1, 1, 0.1, 10, 10,
1, 1, 30, 30, 0.1, 1), drug.dose = c("Chlorambucil_1uM", "Fludarabine_1uM",
"FludarabineMafosfamide_10ug/mlplus1ug/ml", "NDI031301_1uM",
"CMPB_0.1uM", "Tofacitinib_1uM", "Peficitinib_1uM", "FludarabineMafosfamide_1ug/mlplus1ug/ml",
"PDB_100ng/ml", "Filgotinib_1uM", "Dexamethasone_10uM", "CMPA_1uM",
"Lenalidomide_10uM", "Dexamethasone_100uM", "Gandotinib_1uM",
"NDI031301_10uM", "Filgotinib_10uM", "PDB_10ng/ml", "CMPB_1uM",
"Ruxolitinib_1uM", "CC122_0.1uM", "Atovaquone_3uM", "CC122_1uM",
"SAR20347_1uM", "Momelotinib_1uM", "Momelotinib_0.1uM", "Tofacitinib_10uM",
"Fludarabine_1ug/ml", "Fludarabine_10ug/ml", "Cerdulatinib_1uM",
"Lenalidomide_1uM", "Atovaquone_30uM", "Chlorambucil_30uM", "CMPA_0.1uM",
"FludarabineMafosfamide_0.01ug/mlplus1ug/ml", "FludarabineMafosfamide_0.1ug/mlplus1ug/ml",
"Fludarabine_0.01ug/ml", "Atovaquone_0.3uM", "Momelotinib_0.001uM",
"PDB_1ng/ml", "Filgotinib_0.01uM", "Chlorambucil_0.3uM", "Dexamethasone_0.1uM",
"Tofacitinib_0.01uM", "SAR20347_0.1uM", "CMPB_0.001uM", "Momelotinib_0.01uM",
"Fludarabine_0.1ug/ml", "Cerdulatinib_0.01uM", "Peficitinib_0.1uM",
"Atovaquone_0.03uM", "CC122_0.01uM", "CMPA_0.01uM", "NDI031301_0.01uM",
"PDB_0.1ng/ml", "CMPA_0.001uM", "Lenalidomide_0.01uM", "SAR20347_0.01uM",
"Tofacitinib_0.1uM", "Gandotinib_0.01uM", "Lenalidomide_0.1uM",
"Peficitinib_0.01uM", "CMPB_0.01uM", "CC122_0.001uM", "Dexamethasone_1uM",
"FludarabineMafosfamide_10ug/mlplus1ug/ml", "Ruxolitinib_10uM",
"CMPB_0.1uM", "Peficitinib_1uM", "Tofacitinib_1uM", "FludarabineMafosfamide_1ug/mlplus1ug/ml",
"Filgotinib_1uM", "Dexamethasone_10uM", "CMPA_1uM", "Dexamethasone_100uM",
"Gandotinib_1uM", "NDI031301_10uM", "Filgotinib_10uM", "SAR20347_10uM",
"CMPB_1uM", "Ruxolitinib_1uM", "Peficitinib_10uM", "Atovaquone_3uM",
"CC122_1uM", "SAR20347_1uM", "Momelotinib_1uM", "Momelotinib_0.1uM",
"Tofacitinib_10uM", "Fludarabine_10ug/ml", "Fludarabine_1ug/ml",
"Cerdulatinib_1uM", "Atovaquone_30uM", "Chlorambucil_30uM", "CMPA_0.1uM",
"NDI031301_1uM"), combo = c("none", "none", "none", "none", "none",
"none", "none", "none", "none", "none", "none", "none", "none",
"none", "none", "none", "none", "none", "none", "none", "none",
"none", "none", "none", "none", "none", "none", "none", "none",
"none", "none", "none", "none", "none", "none", "none", "none",
"none", "none", "none", "none", "none", "none", "none", "none",
"none", "none", "none", "none", "none", "none", "none", "none",
"none", "none", "none", "none", "none", "none", "none", "none",
"none", "none", "none", "none", "none", "none", "none", "none",
"none", "none", "none", "none", "none", "none", "none", "none",
"none", "none", "none", "none", "none", "none", "none", "none",
"none", "none", "none", "none", "none", "none", "none", "none",
"none", "none"), cluster = c(3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L), dosage = c("1uM", "1uM", "10ug/mlplus1ug/ml",
"1uM", "0.1uM", "1uM", "1uM", "1ug/mlplus1ug/ml", "100ng/ml",
"1uM", "10uM", "1uM", "10uM", "100uM", "1uM", "10uM", "10uM",
"10ng/ml", "1uM", "1uM", "0.1uM", "3uM", "1uM", "1uM", "1uM",
"0.1uM", "10uM", "1ug/ml", "10ug/ml", "1uM", "1uM", "30uM", "30uM",
"0.1uM", "0.01ug/mlplus1ug/ml", "0.1ug/mlplus1ug/ml", "0.01ug/ml",
"0.3uM", "0.001uM", "1ng/ml", "0.01uM", "0.3uM", "0.1uM", "0.01uM",
"0.1uM", "0.001uM", "0.01uM", "0.1ug/ml", "0.01uM", "0.1uM",
"0.03uM", "0.01uM", "0.01uM", "0.01uM", "0.1ng/ml", "0.001uM",
"0.01uM", "0.01uM", "0.1uM", "0.01uM", "0.1uM", "0.01uM", "0.01uM",
"0.001uM", "1uM", "10ug/mlplus1ug/ml", "10uM", "0.1uM", "1uM",
"1uM", "1ug/mlplus1ug/ml", "1uM", "10uM", "1uM", "100uM", "1uM",
"10uM", "10uM", "10uM", "1uM", "1uM", "10uM", "3uM", "1uM", "1uM",
"1uM", "0.1uM", "10uM", "10ug/ml", "1ug/ml", "1uM", "30uM", "30uM",
"0.1uM", "1uM")), row.names = c(NA, -95L), class = "data.frame")
对不起菜鸟问题,我有这个复杂的药物集群数据,如屏幕截图所示。
我想将它们显示成一个堆叠的 geom_col 类型的图,x 轴是“药物”,Y 轴是出现的计数,并按集群分面。
到目前为止,这很容易。但我也想通过使用颜色填充来匹配它们的剂量来查看这些药物和剂量在每个集群中的分布。实际剂量有不同的单位等。
我已将数值剂量提取到单独的一列中。我想分配一个因子向量(“min”、“low”、“high”、“max”)来反映剂量水平,因为我知道每种药物都有 4 种不同的剂量。
问题是不同药物的数值剂量范围各不相同,所以我不能简单地对整列直接使用排名(rank)。
例如,有些药物的剂量范围是 0.03 到 30,有些是 0.3 到 300,还有些是 0.01 到 10。
那么,如何利用这个数值剂量列,为每种药物分配剂量水平呢?
解决方案
这是一种使用 rank() 和连接(join)的方法。
我们可以利用同一种药物的各个剂量都使用相同单位这一事实。
library(dplyr)
# Build a per-drug lookup table of dose levels: one row per distinct
# (drug, dose) pair, ranked within its drug and labelled from that rank.
df %>%
  arrange(drug) %>% # sort by drug so the printed result is easy to scan
  # Keep one row per (drug, dose) and only those two columns. Naming `drug`
  # explicitly avoids relying on dplyr to re-add the grouping column for us.
  distinct(drug, dose) %>%
  group_by(drug) %>% # rank doses separately within each drug
  mutate(
    rank = rank(dose), # 1 = smallest distinct dose of that drug
    # Index the labels by rank; a drug with more than four distinct doses
    # gets NA for every rank above 4.
    category = c("min", "low", "high", "max")[rank]
  )
# The result is still grouped by drug; call ungroup() before any follow-up
# step that should not run per drug.
# A tibble: 67 x 4
# Groups: drug [19]
drug dose rank category
<chr> <dbl> <dbl> <chr>
1 Atovaquone 3 3 high
2 Atovaquone 30 4 max
3 Atovaquone 0.3 2 low
4 Atovaquone 0.03 1 min
5 CC122 0.1 3 high
6 CC122 1 4 max
7 CC122 0.01 2 low
8 CC122 0.001 1 min
9 Cerdulatinib 1 2 low
10 Cerdulatinib 0.01 1 min
# … with 57 more rows
现在我们可以把这个结果连接(join)回原来的 data.frame:
# Attach the per-drug dose level to every row of the original data:
# build the (drug, dose) -> rank/category lookup, then join it back onto df.
df %>%
  arrange(drug) %>% # sort by drug so the printed result is easy to scan
  # Keep one row per (drug, dose) and only those two columns. Naming `drug`
  # explicitly avoids relying on dplyr to re-add the grouping column for us.
  distinct(drug, dose) %>%
  group_by(drug) %>% # rank doses separately within each drug
  mutate(
    rank = rank(dose), # 1 = smallest distinct dose of that drug
    # Index the labels by rank; a drug with more than four distinct doses
    # gets NA for every rank above 4.
    category = c("min", "low", "high", "max")[rank]
  ) %>%
  # Keep every row of df and look up its level. The keys are spelled out so
  # the join does not depend on which column names happen to be shared.
  right_join(df, by = c("drug", "dose"))
# The result is still grouped by drug; call ungroup() before any follow-up
# step that should not run per drug.
# A tibble: 95 x 8
# Groups: drug [19]
drug dose dosage rank category drug.dose combo cluster
<chr> <dbl> <chr> <dbl> <chr> <chr> <chr> <int>
1 Atovaquone 3 3uM 3 high Atovaquone_3uM none 4
2 Atovaquone 3 3uM 3 high Atovaquone_3uM none 6
3 Atovaquone 30 30uM 4 max Atovaquone_30uM none 4
4 Atovaquone 30 30uM 4 max Atovaquone_30uM none 6
5 Atovaquone 0.3 0.3uM 2 low Atovaquone_0.3uM none 5
6 Atovaquone 0.03 0.03uM 1 min Atovaquone_0.03uM none 5
7 CC122 0.1 0.1uM 3 high CC122_0.1uM none 4
8 CC122 1 1uM 4 max CC122_1uM none 4
9 CC122 1 1uM 4 max CC122_1uM none 6
10 CC122 0.01 0.01uM 2 low CC122_0.01uM none 5
# … with 85 more rows
推荐阅读
- python - 如何获取文件夹中文件的大小
- mysql - 使用 SQL 将字段值增加原始值的设定百分比
- github - 如何使用nestjs进行github社交登录?
- scala - 如何在 Scala 中将元素添加到 Map 中,其中键是字符串,值是 List[String]
- xaml - XAMARIN 在点击 ShellContent (DataTemplate) 后阻止页面加载
- javascript - 如何在excelJs中自动调整行大小?
- ios - 对与其他 pod 有依赖关系的 pod 检测 podspec 时出错
- python - 如果“pip”不可用,如何告诉 Python 使用“pip3”?
- cordova - “cordova-plugin-code-push”插件是否与电容器兼容?
- javascript - 为什么要花费如此多的尝试才能生成 1 到 10000 之间的随机值?