r - Append latest values to older records
问题描述
I need to transform a historical data set of population by hierarchically nested subdivisions (region, city, district, etc.) so that the data can be analyzed according to the latest year (2015) version of the subdivisions. In the data, the latest year version is the rows where column y2015 is filled in with an amount. The previous year versions are where any of y1990-y2010 is filled in. Year 2015 is always on separate rows because it is from a different data file. Between 1990 and 2010, changes in subdivision lead to a new row, e.g. Anjung-myeon became Anjung-eup in 2005, so it has 3 rows: 1 for 1995-200, 1 for 2005-2010, and 1 for 2015. Below is a hand-picked sample with all the data required for transformation.
head(pop, 10)
y1990 y1995 y2000 y2005 y2010 y2015 n1 n2 n3 n4
1 10603250 10217177 9853972 9762546 9631482 <NA> Seoul
26 187355 135082 130370 126679 117253 <NA> Seoul Jung-gu
694 <NA> <NA> <NA> <NA> <NA> 9904312 Seoul
713 <NA> <NA> <NA> <NA> <NA> 128478 Seoul Jung-gu
1144 1816328 2304176 2466338 2517680 2632035 <NA> Incheon
1148 80799 67388 65775 85392 81846 <NA> Incheon Jung-gu
1149 4295 2580 - - - <NA> Incheon Jung-gu Jungang-dong
1158 8743 6392 5439 4532 3985 <NA> Incheon Jung-gu Bukseong-dong
1363 <NA> <NA> <NA> <NA> <NA> 2890451 Incheon
1367 <NA> <NA> <NA> <NA> <NA> 112910 Incheon Jung-gu
pop <- structure(list(y1990 = c("10603250", "187355", NA, NA, "1816328", "80799", "4295", "8743", NA, NA, NA, "644622", "329846", "16805", "79124", "-", "-", "-", "115215", "14721", NA, NA, NA, NA, NA, NA, "-", NA, "71600", "-", NA, "22204"),
y1995 = c("10217177", "135082", NA, NA, "2304176", "67388", "2580", "6392", NA, NA, NA, "754670", "260109", "14605", "311940", "-", "13069", "20673", "-", "-", NA, NA, NA, NA, NA, NA, "64695", NA, "-", "22146", NA, "-"),
y2000 = c("9853972", "130370", NA, NA, "2466338", "65775", "-", "5439", NA, NA, NA, "944239", "315396", "14688", "342806", "-", "17552", "29249", "-", "-", NA, NA, NA, NA, NA, NA, "60289", NA, "-", "21600", NA, "-"),
y2005 = c("9762546", "126679", NA, NA, "2517680", "85392", "-", "4532", NA, NA, NA, "1039233", "290546", "11041", "374262", "34336", "-", "36010", "-", "-", NA, NA, NA, NA, NA, NA, "57705", NA, "-", "19899", NA, "-"),
y2010 = c("9631482", "117253", NA, NA, "2632035", "81846", "-", "3985", NA, NA, NA, "1054053", "298748", "4710", "381731", "37930", "-", "47419", "-", "-", NA, NA, NA, NA, NA, NA, "56423", NA, "-", "19441", NA, "-"),
y2015 = c(NA, NA, "9904312", "128478", NA, NA, NA, NA, "2890451", "112910", "4116", NA, NA, NA, NA, NA, NA, NA, NA, NA, "1194313", "349339", "9944", "457873", "41092", "46014", NA, "62291", NA, NA, "21089", NA),
level = c("1", "2", "1", "2", "1", "2", "3", "3", "1", "2", "3", "2", "3", "4", "2", "3", "3", "3", "2", "3", "2", "3", "4", "2", "3", "3", "2", "2", "2", "3", "3", "3"),
n1 = c("Seoul", "Seoul", "Seoul", "Seoul", "Incheon", "Incheon", "Incheon", "Incheon", "Incheon", "Incheon", "Incheon", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Incheon", "Incheon", "Gyeonggi-do", "Incheon", "Incheon", "Gyeonggi-do"),
n2 = c("", "Jung-gu", "", "Jung-gu", "", "Jung-gu", "Jung-gu", "Jung-gu", "", "Jung-gu", "Jung-gu", "Suwon-si", "Suwon-si", "Suwon-si", "Pyeongtaek-si", "Pyeongtaek-si", "Pyeongtaek-si", "Pyeongtaek-si", "Pyeongtaek-gun", "Pyeongtaek-gun", "Suwon-si", "Suwon-si", "Suwon-si", "Pyeongtaek-si", "Pyeongtaek-si", "Pyeongtaek-si", "Ganghwa-gun", "Ganghwa-gun", "Ganghwa-gun", "Ganghwa-gun", "Ganghwa-gun", "Ganghwa-gun"),
n3 = c("", "", "", "", "", "", "Jungang-dong", "Bukseong-dong", "", "", "Bukseong-dong", "", "Gwonseon-gu", "Gwonseon-gu", "", "Anjung-eup", "Anjung-myeon", "Jungang-dong", "", "Anjung-myeon", "", "Gwonseon-gu", "Gwonseon-gu", "", "Anjung-eup", "Jungang-dong", "", "", "", "Ganghwa-eup", "Ganghwa-eup", "Ganghwa-eup"),
n4 = c("", "", "", "", "", "", "", "", "", "", "", "", "", "Seryu 1(il)-dong", "", "", "", "", "", "", "", "", "Seryu 1(il)-dong", "", "", "", "", "", "", "", "", "")),
row.names = c(1L, 26L, 694L, 713L, 1144L, 1148L, 1149L, 1158L, 1363L, 1367L, 1374L, 1532L, 1553L, 1555L, 1827L, 1829L, 1837L, 1838L, 2467L, 2476L, 2512L, 2524L, 2525L, 2720L, 2722L, 2730L, 1314L, 1505L, 2485L, 1315L, 1506L, 2486L),
class = "data.frame")
Subdivisions (n1-n4) are identified by name and the subdivision hierarchy is flattened into columns for each of the 4 levels the hierarchy. Over time, subdivisions may: 1) change parent or 2) change suffix. Because there can be synonyms with different parents (e.g. Jung-gu can be found in both Seoul and Incheon), the two types of change can probably be tracked only when they happen separately, not in combination (to the best of my analysis). Parent changes can be tracked if the suffix does not change, e.g. Ganghwa-gun switched from parent Gyeonggi-do to Incheon in 1995. Suffix changes can be tracked if the parent stems don't change, e.g. Anjung-myeon, Pyeongtaek-gun, Gyeonggi-do became Anjung-eup, Pyeongtaek-si, Gyeonggi-do in 2005.
Below is the desired ouput, with the 2015 version of each subdivision in columns n1_15 to n4_15. Most subdivisions do not change over time so the 2015 hierarchy is the same as for previous years. The only changes in the sample are for Ganghwa-gun and Anjung-myeon. For example, Ganghwa-gun is under Incheon in 2015 (row 1505), so that hierarchy is copied into rows 2485 and 1314. Same for row 1506 to rows 2486 and 1315.
head(pop, 10)
y1990 y1995 y2000 y2005 y2010 y2015 level n1 n2 n3 n4
1 10603250 10217177 9853972 9762546 9631482 <NA> 1 Seoul
26 187355 135082 130370 126679 117253 <NA> 2 Seoul Jung-gu
694 <NA> <NA> <NA> <NA> <NA> 9904312 1 Seoul
713 <NA> <NA> <NA> <NA> <NA> 128478 2 Seoul Jung-gu
1144 1816328 2304176 2466338 2517680 2632035 <NA> 1 Incheon
1148 80799 67388 65775 85392 81846 <NA> 2 Incheon Jung-gu
1149 4295 2580 - - - <NA> 3 Incheon Jung-gu Jungang-dong
1158 8743 6392 5439 4532 3985 <NA> 3 Incheon Jung-gu Bukseong-dong
1363 <NA> <NA> <NA> <NA> <NA> 2890451 1 Incheon
1367 <NA> <NA> <NA> <NA> <NA> 112910 2 Incheon Jung-gu
pop <- structure(list(y1990 = c("10603250", "187355", NA, NA, "1816328", "80799", "4295", "8743", NA, NA, NA, "644622", "329846", "16805", "79124", "-", "-", "-", "115215", "14721", NA, NA, NA, NA, NA, NA, "-", NA, "71600", "-", NA, "22204"),
y1995 = c("10217177", "135082", NA, NA, "2304176", "67388", "2580", "6392", NA, NA, NA, "754670", "260109", "14605", "311940", "-", "13069", "20673", "-", "-", NA, NA, NA, NA, NA, NA, "64695", NA, "-", "22146", NA, "-"),
y2000 = c("9853972", "130370", NA, NA, "2466338", "65775", "-", "5439", NA, NA, NA, "944239", "315396", "14688", "342806", "-", "17552", "29249", "-", "-", NA, NA, NA, NA, NA, NA, "60289", NA, "-", "21600", NA, "-"),
y2005 = c("9762546", "126679", NA, NA, "2517680", "85392", "-", "4532", NA, NA, NA, "1039233", "290546", "11041", "374262", "34336", "-", "36010", "-", "-", NA, NA, NA, NA, NA, NA, "57705", NA, "-", "19899", NA, "-"),
y2010 = c("9631482", "117253", NA, NA, "2632035", "81846", "-", "3985", NA, NA, NA, "1054053", "298748", "4710", "381731", "37930", "-", "47419", "-", "-", NA, NA, NA, NA, NA, NA, "56423", NA, "-", "19441", NA, "-"),
y2015 = c(NA, NA, "9904312", "128478", NA, NA, NA, NA, "2890451", "112910", "4116", NA, NA, NA, NA, NA, NA, NA, NA, NA, "1194313", "349339", "9944", "457873", "41092", "46014", NA, "62291", NA, NA, "21089", NA),
level = c("1", "2", "1", "2", "1", "2", "3", "3", "1", "2", "3", "2", "3", "4", "2", "3", "3", "3", "2", "3", "2", "3", "4", "2", "3", "3", "2", "2", "2", "3", "3", "3"),
n1 = c("Seoul", "Seoul", "Seoul", "Seoul", "Incheon", "Incheon", "Incheon", "Incheon", "Incheon", "Incheon", "Incheon", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Incheon", "Incheon", "Gyeonggi-do", "Incheon", "Incheon", "Gyeonggi-do"),
n2 = c("", "Jung-gu", "", "Jung-gu", "", "Jung-gu", "Jung-gu", "Jung-gu", "", "Jung-gu", "Jung-gu", "Suwon-si", "Suwon-si", "Suwon-si", "Pyeongtaek-si", "Pyeongtaek-si", "Pyeongtaek-si", "Pyeongtaek-si", "Pyeongtaek-gun", "Pyeongtaek-gun", "Suwon-si", "Suwon-si", "Suwon-si", "Pyeongtaek-si", "Pyeongtaek-si", "Pyeongtaek-si", "Ganghwa-gun", "Ganghwa-gun", "Ganghwa-gun", "Ganghwa-gun", "Ganghwa-gun", "Ganghwa-gun"),
n3 = c("", "", "", "", "", "", "Jungang-dong", "Bukseong-dong", "", "", "Bukseong-dong", "", "Gwonseon-gu", "Gwonseon-gu", "", "Anjung-eup", "Anjung-myeon", "Jungang-dong", "", "Anjung-myeon", "", "Gwonseon-gu", "Gwonseon-gu", "", "Anjung-eup", "Jungang-dong", "", "", "", "Ganghwa-eup", "Ganghwa-eup", "Ganghwa-eup"),
n4 = c("", "", "", "", "", "", "", "", "", "", "", "", "", "Seryu 1(il)-dong", "", "", "", "", "", "", "", "", "Seryu 1(il)-dong", "", "", "", "", "", "", "", "", ""),
n1_15 = c("Seoul", "Seoul", "Seoul", "Seoul", "Incheon", "Incheon", "Incheon", "Incheon", "Incheon", "Incheon", "Incheon", "Incheon", "Incheon", "Incheon", "Incheon", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Incheon", "Incheon", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do"),
n2_15 = c("", "Jung-gu", "", "Jung-gu", "", "Jung-gu", "Jung-gu", "Jung-gu", "Ganghwa-gun", "Ganghwa-gun", "", "Jung-gu", "Jung-gu", "Ganghwa-gun", "Ganghwa-gun", "Suwon-si", "Suwon-si", "Suwon-si", "Pyeongtaek-si", "Pyeongtaek-si", "Pyeongtaek-si", "Pyeongtaek-si", "Pyeongtaek-si", "Pyeongtaek-si", "Ganghwa-gun", "Ganghwa-gun", "Suwon-si", "Suwon-si", "Suwon-si", "Pyeongtaek-si", "Pyeongtaek-si", "Pyeongtaek-si"),
n3_15 = c("", "", "", "", "", "", "Jungang-dong", "Bukseong-dong", "", "Ganghwa-eup", "", "", "Bukseong-dong", "", "Ganghwa-eup", "", "Gwonseon-gu", "Gwonseon-gu", "", "Anjung-eup", "Anjung-eup", "Jungang-dong", "", "Anjung-eup", "", "Ganghwa-eup", "", "Gwonseon-gu", "Gwonseon-gu", "", "Anjung-eup", "Jungang-dong"),
n4_15 = c("", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "Seryu 1(il)-dong", "", "", "", "", "", "", "", "", "", "", "Seryu 1(il)-dong", "", "", "")),
row.names = c(1L, 26L, 694L, 713L, 1144L, 1148L, 1149L, 1158L, 1363L, 1367L, 1374L, 1532L, 1553L, 1555L, 1827L, 1829L, 1837L, 1838L, 2467L, 2476L, 2512L, 2524L, 2525L, 2720L, 2722L, 2730L, 1314L, 1505L, 2485L, 1315L, 1506L, 2486L),
class = "data.frame")
I would prefer to handle this with dplyr
(or base R) because I am relatively familiar with it, although not enough to solve this myself.
EDIT1: I have reduced the question to the single challenge of assigning latest parents to older records.
EDIT2: I have followed Prashant's suggestions but also re-introduced the suffix change challenge because upon thinking about it, the hard part is to deal with both challenges at the same time.
解决方案
我写了一个“程序”(?)解决方案。可以改进和/或使其矢量化吗?
for (i in 1:nrow(pop)) {
N1 <- pop$n1[i]
N2 <- pop$n2[i]
N3 <- pop$n3[i]
N4 <- pop$n4[i]
#If entire path found as is in 2015
if (with(pop, sum(!is.na(y2015) & n1==N1 & n2==N2 & n3==N3 & n4==N4))==1) {
pop$n1_15[i] <- N1
pop$n2_15[i] <- N2
pop$n3_15[i] <- N3
pop$n4_15[i] <- N4
next
}
#If unique place found in 2015, from level 4 to 2
l <- pop$level[i]
if (l==4 & with(pop, sum(!is.na(y2015) & n4==N4))==1) {
pop$n1_15[i] <- with(pop, n1[!is.na(y2015) & n4==N4])
pop$n2_15[i] <- with(pop, n2[!is.na(y2015) & n4==N4])
pop$n3_15[i] <- with(pop, n3[!is.na(y2015) & n4==N4])
pop$n4_15[i] <- N4
next
}
if (l==3 & with(pop, sum(!is.na(y2015) & level==3 & n3==N3))==1) {
pop$n1_15[i] <- with(pop, n1[!is.na(y2015) & level==3 & n3==N3])
pop$n2_15[i] <- with(pop, n2[!is.na(y2015) & level==3 & n3==N3])
pop$n3_15[i] <- N3
pop$n4_15[i] <- N4
next
}
if (l==2 & with(pop, sum(!is.na(y2015) & level==2 & n2==N2))==1) {
pop$n1_15[i] <- with(pop, n1[!is.na(y2015) & level==2 & n2==N2])
pop$n2_15[i] <- N2
pop$n3_15[i] <- N3
pop$n4_15[i] <- N4
next
}
#If entire path without suffixes found in 2015
P <- pop[
with(pop, !is.na(y2015) &
sub("[-\\) ][^-\\) ]*$", "", trimws(n1))==sub("[-\\) ][^-\\) ]*$", "", trimws(N1)) &
sub("[-\\) ][^-\\) ]*$", "", trimws(n2))==sub("[-\\) ][^-\\) ]*$", "", trimws(N2)) &
sub("[-\\) ][^-\\) ]*$", "", trimws(n3))==sub("[-\\) ][^-\\) ]*$", "", trimws(N3)) &
sub("[-\\) ][^-\\) ]*$", "", trimws(n4))==sub("[-\\) ][^-\\) ]*$", "", trimws(N4))),,drop=F]
if (nrow(P)==1) {
pop$n1_15[i] <- P$n1
pop$n2_15[i] <- P$n2
pop$n3_15[i] <- P$n3
pop$n4_15[i] <- P$n4
next
}
#Garbage collection
pop$n1_15[i] <- NA
pop$n2_15[i] <- NA
pop$n3_15[i] <- NA
pop$n4_15[i] <- NA
}
推荐阅读
- python - 如何修复相对导入错误:“没有已知的父包”?
- html - Hot 使子元素比父元素更宽
- node.js - 从 Jhipster 构建的应用程序中完全删除实体
- flutter - 由 SQLite 支持的 Flutter 应用的状态管理
- ios - iOS 15 Beta 5(19A5318f) runloop 运行崩溃
- android - 为什么列表视图不显示用户输入的文本
- algorithm - 我如何理解这段代码以递归方式反转链表?
- java - 使方法泛型和属性
- lambda - 我应该如何使用 Kotlin 将 lambda 函数作为参数传递给 Android 中的另一个函数
- javascript - 如何使嵌套的反应路由器组件加载到新页面而不是页面的一半