r - 如何按多次出现的值删除行
问题描述
我有一个数据集,我在同一列中合并了两个数据框。这样做之后,我发现该列的值在一天/一周中LOC_ID
不止一个。ID
这是 BTO 数据,我认为这是因为他们更新了栖息地指标并为同一个花园发送了多个栖息地数据。
我想存储最新的,我猜这是一组重复 ID 值中最新出现的 ID 值。
可重现的代码:
structure(list(year = c(2009, 2009, 2009, 2009, 2009, 2009, 2009,
2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009,
2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009,
2009), week = c(52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52,
52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52,
52, 52), POSTCODE = c("IP21 4TR", "IP22 4BD", "IP22 4BD", "IP22 4BE",
"IP22 4YW", "IP22 5SB", "IP22 5SB", "IP24 2YW", "IP24 3EP", "IP24 3HG",
"IP24 3NF", "IP25 6QU", "IP25 6QU", "IP25 6TD", "IP25 7DB", "IP25 7EY",
"IP25 7HG", "IP25 7SX", "IP26 5HH", "IP26 5JD", "IP27 0RQ", "IP27 0RQ",
"NR10 4EA", "NR10 4EA", "NR10 4PF", "NR11 6JP", "NR11 7QB", "NR11 7QB",
"NR11 7QG", "NR11 8PA"), OBS_DT = structure(c(14605, 14605, 14605,
14605, 14605, 14605, 14605, 14605, 14605, 14605, 14605, 14605,
14605, 14605, 14605, 14605, 14605, 14605, 14605, 14605, 14605,
14605, 14605, 14605, 14605, 14605, 14605, 14605, 14605, 14605
), class = "Date"), LOC_ID = c("LOC633232", "LOC1153084541859",
"LOC1153084541859", "LOC1300605020", "LOC1049970899816", "LOC628053",
"LOC628053", "LOC1300611642", "LOC710372", "LOC676185", "LOC696817",
"LOC1300602161", "LOC1300602161", "LOC1039890913279", "LOC287351",
"LOC13006219110", "LOC1300605191", "LOC1133275915797", "LOC614479",
"LOC1300611662", "LOC575712", "LOC575712", "LOC575593", "LOC575593",
"LOC1300604079", "LOC1300604818", "LOC606431", "LOC606431", "LOC1300612920",
"LOC1300604135"), HOW_MANY = c(14L, 6L, 6L, 9L, 3L, 15L, 15L,
9L, 13L, 9L, 10L, 8L, 8L, 15L, 14L, 6L, 4L, 8L, 8L, 7L, 11L,
11L, 16L, 16L, 7L, 3L, 13L, 13L, 15L, 8L), GARDEN_SIZE = c("M",
"S", "S", "M", "S", "L", "L", "M", "X", "X", "M", "M", "L", "S",
"L", "X", "M", "L", "M", "L", "M", "S", "L", "L", "L", "M", "L",
"L", "S", "M"), GARDEN_AGE = c("B", "D", "D", "C", "C", "D",
"E", "B", "X", "X", "D", "E", "E", "D", "E", "X", "D", "E", "D",
"A", "D", "D", "E", "E", "E", "D", "E", "D", "B", "C"), SMALL_DECID_TREES = c("C",
"C", "C", "B", "A", "D", "D", "D", "X", "X", "C", "D", "E", "C",
"E", "X", "D", "E", "A", "X", "C", "C", "D", "E", "C", "D", "D",
"E", "C", "E"), LARGE_DECID_TREES = c("B", "A", "A", "A", "B",
"C", "D", "B", "X", "X", "B", "C", "D", "C", "D", "X", "B", "E",
"A", "D", "B", "B", "C", "E", "C", "C", "E", "E", "X", "C")), row.names = c(NA,
-30L), groups = structure(list(year = c(2009, 2009, 2009, 2009,
2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009,
2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009), week = c(52,
52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52,
52, 52, 52, 52, 52, 52, 52), POSTCODE = c("IP21 4TR", "IP22 4BD",
"IP22 4BE", "IP22 4YW", "IP22 5SB", "IP24 2YW", "IP24 3EP", "IP24 3HG",
"IP24 3NF", "IP25 6QU", "IP25 6TD", "IP25 7DB", "IP25 7EY", "IP25 7HG",
"IP25 7SX", "IP26 5HH", "IP26 5JD", "IP27 0RQ", "NR10 4EA", "NR10 4PF",
"NR11 6JP", "NR11 7QB", "NR11 7QG", "NR11 8PA"), OBS_DT = structure(c(14605,
14605, 14605, 14605, 14605, 14605, 14605, 14605, 14605, 14605,
14605, 14605, 14605, 14605, 14605, 14605, 14605, 14605, 14605,
14605, 14605, 14605, 14605, 14605), class = "Date"), .rows = structure(list(
1L, 2:3, 4L, 5L, 6:7, 8L, 9L, 10L, 11L, 12:13, 14L, 15L,
16L, 17L, 18L, 19L, 20L, 21:22, 23:24, 25L, 26L, 27:28, 29L,
30L), ptype = integer(0), class = c("vctrs_list_of", "vctrs_vctr",
"list"))), row.names = c(NA, 24L), class = c("tbl_df", "tbl",
"data.frame"), .drop = TRUE), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"))
它的外观和我想要的输出:
original:
year week POSTCODE OBS_DT LOC_ID HOW_MANY GARDEN_SIZE GARDEN_AGE SMALL_DECID_TREES LARGE_DECID_TREES
<dbl> <dbl> <chr> <date> <chr> <int> <chr> <chr> <chr> <chr>
1 2009 52 IP21 4TR 2009-12-27 LOC633232 14 M B C B
2 2009 52 IP22 4BD 2009-12-27 LOC1153084541859 6 S D C A
3 2009 52 IP22 4BD 2009-12-27 LOC1153084541859 6 S D C A
4 2009 52 IP22 4BE 2009-12-27 LOC1300605020 9 M C B A
5 2009 52 IP22 4YW 2009-12-27 LOC1049970899816 3 S C A B
6 2009 52 IP22 5SB 2009-12-27 LOC628053 15 L D D C
7 2009 52 IP22 5SB 2009-12-27 LOC628053 15 L E D D
8 2009 52 IP24 2YW 2009-12-27 LOC1300611642 9 M B D B
9 2009 52 IP24 3EP 2009-12-27 LOC710372 13 X X X X
10 2009 52 IP24 3HG 2009-12-27 LOC676185 9 X X X X
New Output:
year week POSTCODE OBS_DT LOC_ID HOW_MANY GARDEN_SIZE GARDEN_AGE SMALL_DECID_TREES LARGE_DECID_TREES
<dbl> <dbl> <chr> <date> <chr> <int> <chr> <chr> <chr> <chr>
1 2009 52 IP21 4TR 2009-12-27 LOC633232 14 M B C B
2 2009 52 IP22 4BD 2009-12-27 LOC1153084541859 6 S D C A
3 2009 52 IP22 4BE 2009-12-27 LOC1300605020 9 M C B A
4 2009 52 IP22 4YW 2009-12-27 LOC1049970899816 3 S C A B
5 2009 52 IP22 5SB 2009-12-27 LOC628053 15 L E D D
6 2009 52 IP24 2YW 2009-12-27 LOC1300611642 9 M B D B
7 2009 52 IP24 3EP 2009-12-27 LOC710372 13 X X X X
8 2009 52 IP24 3HG 2009-12-27 LOC676185 9 X X X X
解决方案
也许你可以试试
subset(df,as.logical(ave(LOC_ID, year, week, FUN = Negate(duplicated))))
推荐阅读
- hashicorp-vault - 无法通过 CLI 登录 Vault
- reactjs - 我们在哪里获得 React.js 中 render 方法中的 props
- python - Python & Pandas:使用 pd.json_normalize 展平嵌套的 json
- java - 有没有办法在 Java 流上使用条件谓词?
- amazon-web-services - RuntimeError:尝试反序列化 CUDA 设备上的对象,但 torch.cuda.is_available() 为 False。无法将 GPU 与 tacotron2 一起使用
- kubernetes-helm - Helm 模板 .Files.Get 函数在范围内无法正常工作
- go - 我们可以在 Golang 可测试示例输出注释中使用正则表达式吗?
- .net-core - EFCore 2.0:在前一个操作完成之前在此上下文上启动了第二个操作
- c++ - 是 O(NlogN) 还是 O(N^2)?
- reactjs - 单击按钮时打开导航栏