r - 将数据集与特定标准进行比较
问题描述
我尽我最大的努力使我在这里的第一个问题尽可能具体和可重复。我有一个庞大的数据集,其中包含不同年份(2010-2019 年)不同站点(例如:Carb_Ben、Carb_uR、OW_Kessin)的不同参数(例如:TP 和 O2)的年平均值或最小值。每个站都属于特定类型(例如:11_tl_ba,14_ka,23)。
data<-structure(list(Station = c("Carb_Ben", "Carb_Ben", "Carb_uR",
"Carb_uR", "Laak_GK", "Laak_GK", "Laak_GK", "OW_Kessin", "OW_Kessin",
"OW_Kessin"), Typ = c("11_tl_ba", "11_tl_ba", "11_tl_ba", "11_tl_ba",
"23", "23", "23", "23", "23", "23"), Jahr = structure(c(4L, 7L,
4L, 7L, 2L, 5L, 8L, 3L, 4L, 5L), .Label = c("2010", "2013", "2014",
"2015", "2016", "2017", "2018", "2019"), class = "factor"), O2_min = c(1.44,
1.53, 8, 6.7, 2.7, 1.79, 1.4, 4.3, 4.7675, 4.015), TP_mean = c(0.513333333333333,
0.1625, 0.148333333333333, 0.124166666666667, 0.155, 0.28, 0.175833333333333,
0.0954166666666667, 0.0929166666666667, 0.0970833333333333)), row.names = c(NA,
-10L), groups = structure(list(Station = c("Carb_Ben", "Carb_uR",
"Laak_GK", "OW_Kessin"), Typ = c("11_tl_ba", "11_tl_ba", "23",
"23"), .rows = list(1:2, 3:4, 5:7, 8:10)), row.names = c(NA,
-4L), class = c("tbl_df", "tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
#so the data look like this:
# A tibble: 10 x 5
# Groups: Station, Typ [4]
Station Typ Jahr O2_min TP_mean
<chr> <chr> <fct> <dbl> <dbl>
1 Carb_Ben 11_tl_ba 2015 1.44 0.513
2 Carb_Ben 11_tl_ba 2018 1.53 0.162
3 Carb_uR 11_tl_ba 2015 8 0.148
4 Carb_uR 11_tl_ba 2018 6.7 0.124
5 Laak_GK 23 2013 2.7 0.155
6 Laak_GK 23 2016 1.79 0.28
7 Laak_GK 23 2019 1.4 0.176
8 OW_Kessin 23 2014 4.3 0.0954
9 OW_Kessin 23 2015 4.77 0.0929
10 OW_Kessin 23 2016 4.01 0.0971
然后我有每个参数的标准和良好状态 (_g) 或非常好的状态 (_sg) 的类型。
criteria<-structure(list(Typ = structure(c(10L, 5L, 24L), .Label = c("10",
"11_alp", "11_mg_ba", "11_mg_br", "11_tl_ba", "11_tl_br", "12_mg_ba",
"12_tl_ba", "12_tl_br", "14_ka", "14_si", "15", "15 g", "16_ka",
"16_si", "17", "18", "19", "19_ba", "2.1", "2.2", "20", "22",
"23", "3.1", "3.2", "4", "5", "5.1", "6", "6 K", "7", "9", "9.1",
"9.1 K", "9.2", "B1", "B2a", "B2b", "B3a", "B3b", "Subtyp 21 N"
), class = "factor"), O2_g = c(7L, 6L, 4L), O2_sg = 9:7, TP_g = c(0.1,
0.15, 0.1), TP_sg = c(0.05, 0.05, 0.05)), row.names = c(24L,
28L, 37L), class = "data.frame")
#This looks like this:
Typ O2_g O2_sg TP_g TP_sg
24 11_tl_ba 8 8 0.10 0.05
28 14_ka 7 9 0.10 0.05
27 23 6 8 0.15 0.05
现在我想根据类型比较每个站点和每年的每个参数是否符合标准。找出每年每个站点的每个参数的状态是“好”(gut)、“非常好”(sehr gut)还是“不好”(nicht gut)
我目前的解决方案是
data_criteria<-data%>%
mutate(GW_O2_g=case_when(Typ=='11_tl_ba' ~criteria$O2_g[which(criteria$Typ=='11_tl_ba')],
Typ=='14_ka' ~criteria$O2_g[which(criteria$Typ=='14_ka')],
Typ=='23' ~criteria$O2_g[which(criteria$Typ=='23')]),
GW_O2_sg=case_when(Typ=='11_tl_ba' ~criteria$O2_sg[which(criteria$Typ=='11_tl_ba')],
Typ=='14_ka' ~criteria$O2_sg[which(criteria$Typ=='14_ka')],
Typ=='23' ~criteria$O2_sg[which(criteria$Typ=='23')]),
GW_TP_g=case_when(Typ=='11_tl_ba' ~criteria$TP_g[which(criteria$Typ=='11_tl_ba')],
Typ=='14_ka' ~criteria$TP_g[which(criteria$Typ=='14_ka')],
Typ=='23' ~criteria$TP_g[which(criteria$Typ=='23')]),
GW_TP_sg=case_when(Typ=='11_tl_ba' ~criteria$TP_sg[which(criteria$Typ=='11_tl_ba')],
Typ=='14_ka' ~criteria$TP_sg[which(criteria$Typ=='14_ka')],
Typ=='23' ~criteria$TP_sg[which(criteria$Typ=='23')]))%>%
mutate(GK_O2=case_when(O2_min<GW_O2_g ~'nicht gut',
O2_min>=GW_O2_g & O2_min<GW_O2_sg~'gut',
O2_min>GW_O2_sg ~ 'sehr gut'),
GK_TP=case_when(TP_mean>GW_TP_g ~ 'nicht gut',
TP_mean<=GW_TP_g & TP_mean>GW_TP_sg ~ 'gut',
TP_mean<=GW_TP_sg ~ 'sehr gut'))
#to get this:
# A tibble: 10 x 11
# Groups: Station, Typ [4]
Station Typ Jahr O2_min TP_mean GW_O2_g GW_O2_sg GW_TP_g GW_TP_sg GK_O2 GK_TP
<chr> <chr> <fct> <dbl> <dbl> <int> <int> <dbl> <dbl> <chr> <chr>
1 Carb_Ben 11_tl_ba 2015 1.44 0.513 8 8 0.1 0.05 nicht gut nicht gut
2 Carb_Ben 11_tl_ba 2018 1.53 0.162 8 8 0.1 0.05 nicht gut nicht gut
3 Carb_uR 11_tl_ba 2015 8 0.148 8 8 0.1 0.05 sehr gut nicht gut
4 Carb_uR 11_tl_ba 2018 6.7 0.124 8 8 0.1 0.05 nicht gut nicht gut
5 Laak_GK 23 2013 2.7 0.155 6 8 0.15 0.05 nicht gut nicht gut
6 Laak_GK 23 2016 1.79 0.28 6 8 0.15 0.05 nicht gut nicht gut
7 Laak_GK 23 2019 1.4 0.176 6 8 0.15 0.05 nicht gut nicht gut
8 OW_Kessin 23 2014 4.3 0.0954 6 8 0.15 0.05 nicht gut gut
9 OW_Kessin 23 2015 4.77 0.0929 6 8 0.15 0.05 nicht gut gut
10 OW_Kessin 23 2016 4.01 0.0971 6 8 0.15 0.05 nicht gut gut
因为我有更多的站点和类型以及更多的参数,所以我的解决方案对我来说意味着更多的输入和更多包含错误的机会。我很确定有一种更优雅的方法可以得到相同的结果,但到目前为止我还没有深入了解 R 来弄清楚该怎么做?
因此,如果有人有一个很好的简单解决方案,我会很高兴;)。
解决方案
criteria
您可以从使用中反复选择match
。within
有助于保持代码简洁。
dat <- within(dat, {
GW_O2_g <- criteria[match(dat$Typ, criteria$Typ), "O2_g"]
GW_O2_sg <- criteria[match(dat$Typ, criteria$Typ), "O2_sg"]
GW_TP_g <- criteria[match(dat$Typ, criteria$Typ), "TP_g"]
GW_TP_sg <- criteria[match(dat$Typ, criteria$Typ), "TP_sg"]
GK_O2 <- NA
GK_O2[O2_min < GW_O2_g] <- "nicht gut"
GK_O2[O2_min >= GW_O2_g & O2_min < GW_O2_sg] <- "gut"
GK_O2[O2_min > GW_O2_sg] <- "sehr gut"
GK_TP <- NA
GK_TP[TP_mean > GW_TP_g] <- "nicht gut"
GK_TP[TP_mean <= GW_TP_g & TP_mean > GW_TP_sg] <- "gut"
GK_TP[TP_mean <= GW_TP_sg] <- "sehr gut"
})
# # A tibble: 10 x 11
# # Groups: Station, Typ [4]
# Station Typ Jahr O2_min TP_mean GK_TP GK_O2 GW_TP_sg GW_TP_g GW_O2_sg GW_O2_g
# <chr> <chr> <fct> <dbl> <dbl> <chr> <chr> <dbl> <dbl> <int> <int>
# 1 Carb_Ben 11_tl_ba 2015 1.44 0.513 nicht gut nicht gut 0.05 0.15 8 6
# 2 Carb_Ben 11_tl_ba 2018 1.53 0.162 nicht gut nicht gut 0.05 0.15 8 6
# 3 Carb_uR 11_tl_ba 2015 8 0.148 gut NA 0.05 0.15 8 6
# 4 Carb_uR 11_tl_ba 2018 6.7 0.124 gut gut 0.05 0.15 8 6
# 5 Laak_GK 23 2013 2.7 0.155 nicht gut nicht gut 0.05 0.1 7 4
# 6 Laak_GK 23 2016 1.79 0.28 nicht gut nicht gut 0.05 0.1 7 4
# 7 Laak_GK 23 2019 1.4 0.176 nicht gut nicht gut 0.05 0.1 7 4
# 8 OW_Kessin 23 2014 4.3 0.0954 gut gut 0.05 0.1 7 4
# 9 OW_Kessin 23 2015 4.77 0.0929 gut gut 0.05 0.1 7 4
# 10 OW_Kessin 23 2016 4.01 0.0971 gut gut 0.05 0.1 7 4
推荐阅读
- javascript - 刷新页面API丢失数据?
- php - 如何在 Google 数据洞察中嵌入报告过滤器?
- vuejs2 - 我可以在所见即所得组件中使用 v-validate 必需选项吗?
- rest - WinHTTP Open 方法导致延迟?
- swift - CloudKit 正在工作,但突然出现权限错误
- matlab - 基于最大值索引到 MAtlab 表
- ruby - 如何解决在 Ruby 中调用函数的问题?
- python - 是否有在 _init_ 方法中定义变量的最佳实践
- macos - `kAFAssistantErrorDomain` 代码记录在哪里?
- html - 在网站上查找特定 ID 标签的递归函数