首页 > 解决方案 > Append latest values to older records

问题描述

I need to transform a historical data set of population by hierarchically nested subdivisions (region, city, district, etc.) so that the data can be analyzed according to the latest year (2015) version of the subdivisions. In the data, the latest year version is the rows where column y2015 is filled in with an amount. The previous year versions are where any of y1990-y2010 is filled in. Year 2015 is always on separate rows because it is from a different data file. Between 1990 and 2010, changes in subdivision lead to a new row, e.g. Anjung-myeon became Anjung-eup in 2005, so it has 3 rows: 1 for 1995-200, 1 for 2005-2010, and 1 for 2015. Below is a hand-picked sample with all the data required for transformation.

head(pop, 10)
        y1990    y1995   y2000   y2005   y2010   y2015      n1      n2            n3 n4
1    10603250 10217177 9853972 9762546 9631482    <NA>   Seoul                         
26     187355   135082  130370  126679  117253    <NA>   Seoul Jung-gu                 
694      <NA>     <NA>    <NA>    <NA>    <NA> 9904312   Seoul                         
713      <NA>     <NA>    <NA>    <NA>    <NA>  128478   Seoul Jung-gu                 
1144  1816328  2304176 2466338 2517680 2632035    <NA> Incheon                         
1148    80799    67388   65775   85392   81846    <NA> Incheon Jung-gu                 
1149     4295     2580       -       -       -    <NA> Incheon Jung-gu  Jungang-dong   
1158     8743     6392    5439    4532    3985    <NA> Incheon Jung-gu Bukseong-dong   
1363     <NA>     <NA>    <NA>    <NA>    <NA> 2890451 Incheon                         
1367     <NA>     <NA>    <NA>    <NA>    <NA>  112910 Incheon Jung-gu                            

pop <- structure(list(y1990 = c("10603250", "187355", NA, NA, "1816328", "80799", "4295", "8743", NA, NA, NA, "644622", "329846", "16805", "79124", "-", "-", "-", "115215", "14721", NA, NA, NA, NA, NA, NA, "-", NA, "71600", "-", NA, "22204"), 
                      y1995 = c("10217177", "135082", NA, NA, "2304176", "67388", "2580", "6392", NA, NA, NA, "754670", "260109", "14605", "311940", "-", "13069", "20673", "-", "-", NA, NA, NA, NA, NA, NA, "64695", NA, "-", "22146", NA, "-"), 
                      y2000 = c("9853972", "130370", NA, NA, "2466338", "65775", "-", "5439", NA, NA, NA, "944239", "315396", "14688", "342806", "-", "17552", "29249", "-", "-", NA, NA, NA, NA, NA, NA, "60289", NA, "-", "21600", NA, "-"), 
                      y2005 = c("9762546", "126679", NA, NA, "2517680", "85392", "-", "4532", NA, NA, NA, "1039233", "290546", "11041", "374262", "34336", "-", "36010", "-", "-", NA, NA, NA, NA, NA, NA, "57705", NA, "-", "19899", NA, "-"), 
                      y2010 = c("9631482", "117253", NA, NA, "2632035", "81846", "-", "3985", NA, NA, NA, "1054053", "298748", "4710", "381731", "37930", "-", "47419", "-", "-", NA, NA, NA, NA, NA, NA, "56423", NA, "-", "19441", NA, "-"), 
                      y2015 = c(NA, NA, "9904312", "128478", NA, NA, NA, NA, "2890451", "112910", "4116", NA, NA, NA, NA, NA, NA, NA, NA, NA, "1194313", "349339", "9944", "457873", "41092", "46014", NA, "62291", NA, NA, "21089", NA), 
                      level = c("1", "2", "1", "2", "1", "2", "3", "3", "1", "2", "3", "2", "3", "4", "2", "3", "3", "3", "2", "3", "2", "3", "4", "2", "3", "3", "2", "2", "2", "3", "3", "3"), 
                      n1 = c("Seoul", "Seoul", "Seoul", "Seoul", "Incheon", "Incheon", "Incheon", "Incheon", "Incheon", "Incheon", "Incheon", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Incheon", "Incheon", "Gyeonggi-do", "Incheon", "Incheon", "Gyeonggi-do"), 
                      n2 = c("", "Jung-gu", "", "Jung-gu", "", "Jung-gu", "Jung-gu", "Jung-gu", "", "Jung-gu", "Jung-gu", "Suwon-si", "Suwon-si", "Suwon-si", "Pyeongtaek-si", "Pyeongtaek-si", "Pyeongtaek-si", "Pyeongtaek-si", "Pyeongtaek-gun", "Pyeongtaek-gun", "Suwon-si", "Suwon-si", "Suwon-si", "Pyeongtaek-si", "Pyeongtaek-si", "Pyeongtaek-si", "Ganghwa-gun", "Ganghwa-gun", "Ganghwa-gun", "Ganghwa-gun", "Ganghwa-gun", "Ganghwa-gun"), 
                      n3 = c("", "", "", "", "", "", "Jungang-dong", "Bukseong-dong", "", "", "Bukseong-dong", "", "Gwonseon-gu", "Gwonseon-gu", "", "Anjung-eup", "Anjung-myeon", "Jungang-dong", "", "Anjung-myeon", "", "Gwonseon-gu", "Gwonseon-gu", "", "Anjung-eup", "Jungang-dong", "", "", "", "Ganghwa-eup", "Ganghwa-eup", "Ganghwa-eup"), 
                      n4 = c("", "", "", "", "", "", "", "", "", "", "", "", "", "Seryu 1(il)-dong", "", "", "", "", "", "", "", "", "Seryu 1(il)-dong", "", "", "", "", "", "", "", "", "")), 
                 row.names = c(1L, 26L, 694L, 713L, 1144L, 1148L, 1149L, 1158L, 1363L, 1367L, 1374L, 1532L, 1553L, 1555L, 1827L, 1829L, 1837L, 1838L, 2467L, 2476L, 2512L, 2524L, 2525L, 2720L, 2722L, 2730L, 1314L, 1505L, 2485L, 1315L, 1506L, 2486L), 
                 class = "data.frame")

Subdivisions (n1-n4) are identified by name and the subdivision hierarchy is flattened into columns for each of the 4 levels the hierarchy. Over time, subdivisions may: 1) change parent or 2) change suffix. Because there can be synonyms with different parents (e.g. Jung-gu can be found in both Seoul and Incheon), the two types of change can probably be tracked only when they happen separately, not in combination (to the best of my analysis). Parent changes can be tracked if the suffix does not change, e.g. Ganghwa-gun switched from parent Gyeonggi-do to Incheon in 1995. Suffix changes can be tracked if the parent stems don't change, e.g. Anjung-myeon, Pyeongtaek-gun, Gyeonggi-do became Anjung-eup, Pyeongtaek-si, Gyeonggi-do in 2005.

Below is the desired ouput, with the 2015 version of each subdivision in columns n1_15 to n4_15. Most subdivisions do not change over time so the 2015 hierarchy is the same as for previous years. The only changes in the sample are for Ganghwa-gun and Anjung-myeon. For example, Ganghwa-gun is under Incheon in 2015 (row 1505), so that hierarchy is copied into rows 2485 and 1314. Same for row 1506 to rows 2486 and 1315.

head(pop, 10)

        y1990    y1995   y2000   y2005   y2010   y2015 level      n1      n2            n3 n4
1    10603250 10217177 9853972 9762546 9631482    <NA>     1   Seoul                         
26     187355   135082  130370  126679  117253    <NA>     2   Seoul Jung-gu                 
694      <NA>     <NA>    <NA>    <NA>    <NA> 9904312     1   Seoul                         
713      <NA>     <NA>    <NA>    <NA>    <NA>  128478     2   Seoul Jung-gu                 
1144  1816328  2304176 2466338 2517680 2632035    <NA>     1 Incheon                         
1148    80799    67388   65775   85392   81846    <NA>     2 Incheon Jung-gu                 
1149     4295     2580       -       -       -    <NA>     3 Incheon Jung-gu  Jungang-dong   
1158     8743     6392    5439    4532    3985    <NA>     3 Incheon Jung-gu Bukseong-dong   
1363     <NA>     <NA>    <NA>    <NA>    <NA> 2890451     1 Incheon                         
1367     <NA>     <NA>    <NA>    <NA>    <NA>  112910     2 Incheon Jung-gu                 

pop <- structure(list(y1990 = c("10603250", "187355", NA, NA, "1816328", "80799", "4295", "8743", NA, NA, NA, "644622", "329846", "16805", "79124", "-", "-", "-", "115215", "14721", NA, NA, NA, NA, NA, NA, "-", NA, "71600", "-", NA, "22204"), 
                      y1995 = c("10217177", "135082", NA, NA, "2304176", "67388", "2580", "6392", NA, NA, NA, "754670", "260109", "14605", "311940", "-", "13069", "20673", "-", "-", NA, NA, NA, NA, NA, NA, "64695", NA, "-", "22146", NA, "-"), 
                      y2000 = c("9853972", "130370", NA, NA, "2466338", "65775", "-", "5439", NA, NA, NA, "944239", "315396", "14688", "342806", "-", "17552", "29249", "-", "-", NA, NA, NA, NA, NA, NA, "60289", NA, "-", "21600", NA, "-"), 
                      y2005 = c("9762546", "126679", NA, NA, "2517680", "85392", "-", "4532", NA, NA, NA, "1039233", "290546", "11041", "374262", "34336", "-", "36010", "-", "-", NA, NA, NA, NA, NA, NA, "57705", NA, "-", "19899", NA, "-"), 
                      y2010 = c("9631482", "117253", NA, NA, "2632035", "81846", "-", "3985", NA, NA, NA, "1054053", "298748", "4710", "381731", "37930", "-", "47419", "-", "-", NA, NA, NA, NA, NA, NA, "56423", NA, "-", "19441", NA, "-"), 
                      y2015 = c(NA, NA, "9904312", "128478", NA, NA, NA, NA, "2890451", "112910", "4116", NA, NA, NA, NA, NA, NA, NA, NA, NA, "1194313", "349339", "9944", "457873", "41092", "46014", NA, "62291", NA, NA, "21089", NA), 
                      level = c("1", "2", "1", "2", "1", "2", "3", "3", "1", "2", "3", "2", "3", "4", "2", "3", "3", "3", "2", "3", "2", "3", "4", "2", "3", "3", "2", "2", "2", "3", "3", "3"), 
                      n1 = c("Seoul", "Seoul", "Seoul", "Seoul", "Incheon", "Incheon", "Incheon", "Incheon", "Incheon", "Incheon", "Incheon", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Incheon", "Incheon", "Gyeonggi-do", "Incheon", "Incheon", "Gyeonggi-do"), 
                      n2 = c("", "Jung-gu", "", "Jung-gu", "", "Jung-gu", "Jung-gu", "Jung-gu", "", "Jung-gu", "Jung-gu", "Suwon-si", "Suwon-si", "Suwon-si", "Pyeongtaek-si", "Pyeongtaek-si", "Pyeongtaek-si", "Pyeongtaek-si", "Pyeongtaek-gun", "Pyeongtaek-gun", "Suwon-si", "Suwon-si", "Suwon-si", "Pyeongtaek-si", "Pyeongtaek-si", "Pyeongtaek-si", "Ganghwa-gun", "Ganghwa-gun", "Ganghwa-gun", "Ganghwa-gun", "Ganghwa-gun", "Ganghwa-gun"), 
                      n3 = c("", "", "", "", "", "", "Jungang-dong", "Bukseong-dong", "", "", "Bukseong-dong", "", "Gwonseon-gu", "Gwonseon-gu", "", "Anjung-eup", "Anjung-myeon", "Jungang-dong", "", "Anjung-myeon", "", "Gwonseon-gu", "Gwonseon-gu", "", "Anjung-eup", "Jungang-dong", "", "", "", "Ganghwa-eup", "Ganghwa-eup", "Ganghwa-eup"), 
                      n4 = c("", "", "", "", "", "", "", "", "", "", "", "", "", "Seryu 1(il)-dong", "", "", "", "", "", "", "", "", "Seryu 1(il)-dong", "", "", "", "", "", "", "", "", ""), 
                      n1_15 = c("Seoul", "Seoul", "Seoul", "Seoul", "Incheon", "Incheon", "Incheon", "Incheon", "Incheon", "Incheon", "Incheon", "Incheon", "Incheon", "Incheon", "Incheon", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Incheon", "Incheon", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do", "Gyeonggi-do"), 
                      n2_15 = c("", "Jung-gu", "", "Jung-gu", "", "Jung-gu", "Jung-gu", "Jung-gu", "Ganghwa-gun", "Ganghwa-gun", "", "Jung-gu", "Jung-gu", "Ganghwa-gun", "Ganghwa-gun", "Suwon-si", "Suwon-si", "Suwon-si", "Pyeongtaek-si", "Pyeongtaek-si", "Pyeongtaek-si", "Pyeongtaek-si", "Pyeongtaek-si", "Pyeongtaek-si", "Ganghwa-gun", "Ganghwa-gun", "Suwon-si", "Suwon-si", "Suwon-si", "Pyeongtaek-si", "Pyeongtaek-si", "Pyeongtaek-si"), 
                      n3_15 = c("", "", "", "", "", "", "Jungang-dong", "Bukseong-dong", "", "Ganghwa-eup", "", "", "Bukseong-dong", "", "Ganghwa-eup", "", "Gwonseon-gu", "Gwonseon-gu", "", "Anjung-eup", "Anjung-eup", "Jungang-dong", "", "Anjung-eup", "", "Ganghwa-eup", "", "Gwonseon-gu", "Gwonseon-gu", "", "Anjung-eup", "Jungang-dong"), 
                      n4_15 = c("", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "Seryu 1(il)-dong", "", "", "", "", "", "", "", "", "", "", "Seryu 1(il)-dong", "", "", "")), 
                 row.names = c(1L, 26L, 694L, 713L, 1144L, 1148L, 1149L, 1158L, 1363L, 1367L, 1374L, 1532L, 1553L, 1555L, 1827L, 1829L, 1837L, 1838L, 2467L, 2476L, 2512L, 2524L, 2525L, 2720L, 2722L, 2730L, 1314L, 1505L, 2485L, 1315L, 1506L, 2486L), 
                 class = "data.frame")

I would prefer to handle this with dplyr (or base R) because I am relatively familiar with it, although not enough to solve this myself.

EDIT1: I have reduced the question to the single challenge of assigning latest parents to older records.

EDIT2: I have followed Prashant's suggestions but also re-introduced the suffix change challenge because upon thinking about it, the hard part is to deal with both challenges at the same time.

标签: rdataframedplyr

解决方案


我写了一个“程序”(?)解决方案。可以改进和/或使其矢量化吗?

for (i in 1:nrow(pop)) {
  N1 <- pop$n1[i]
  N2 <- pop$n2[i]
  N3 <- pop$n3[i]
  N4 <- pop$n4[i]
  #If entire path found as is in 2015 
  if (with(pop, sum(!is.na(y2015) & n1==N1 & n2==N2 & n3==N3 & n4==N4))==1) {
    pop$n1_15[i] <- N1
    pop$n2_15[i] <- N2
    pop$n3_15[i] <- N3
    pop$n4_15[i] <- N4
    next
  }
  #If unique place found in 2015, from level 4 to 2
  l <- pop$level[i]
  if (l==4 & with(pop, sum(!is.na(y2015) & n4==N4))==1) {
    pop$n1_15[i] <- with(pop, n1[!is.na(y2015) & n4==N4])
    pop$n2_15[i] <- with(pop, n2[!is.na(y2015) & n4==N4])
    pop$n3_15[i] <- with(pop, n3[!is.na(y2015) & n4==N4])
    pop$n4_15[i] <- N4
    next
  }
  if (l==3 & with(pop, sum(!is.na(y2015) & level==3 & n3==N3))==1) {
    pop$n1_15[i] <- with(pop, n1[!is.na(y2015) & level==3 & n3==N3])
    pop$n2_15[i] <- with(pop, n2[!is.na(y2015) & level==3 & n3==N3])
    pop$n3_15[i] <- N3
    pop$n4_15[i] <- N4
    next
  }
  if (l==2 & with(pop, sum(!is.na(y2015) & level==2 & n2==N2))==1) {
    pop$n1_15[i] <- with(pop, n1[!is.na(y2015) & level==2 & n2==N2])
    pop$n2_15[i] <- N2
    pop$n3_15[i] <- N3
    pop$n4_15[i] <- N4
    next
  }
  #If entire path without suffixes found in 2015
  P <- pop[
    with(pop, !is.na(y2015) & 
           sub("[-\\) ][^-\\) ]*$", "", trimws(n1))==sub("[-\\) ][^-\\) ]*$", "", trimws(N1)) & 
           sub("[-\\) ][^-\\) ]*$", "", trimws(n2))==sub("[-\\) ][^-\\) ]*$", "", trimws(N2)) & 
           sub("[-\\) ][^-\\) ]*$", "", trimws(n3))==sub("[-\\) ][^-\\) ]*$", "", trimws(N3)) & 
           sub("[-\\) ][^-\\) ]*$", "", trimws(n4))==sub("[-\\) ][^-\\) ]*$", "", trimws(N4))),,drop=F]
  if (nrow(P)==1) {
    pop$n1_15[i] <- P$n1
    pop$n2_15[i] <- P$n2
    pop$n3_15[i] <- P$n3
    pop$n4_15[i] <- P$n4
    next
  }
  #Garbage collection
  pop$n1_15[i] <- NA
  pop$n2_15[i] <- NA
  pop$n3_15[i] <- NA
  pop$n4_15[i] <- NA
}

推荐阅读