首页 > 解决方案 > 优化嵌套列表的子集操作

问题描述

是否可以提高此代码中最后一个子集操作的速度?此代码获取一小部分 OpenStreetMap 数据,搜索所有带有名称的道路,并创建一个仅包含这些道路的新 osmar 对象。我想优化的是代码的最后一行:

# The call to be sped up: subset the osmar object `muc` down to the elements
# whose ids are listed in `highway_subset_ids`.
highway_subset <- subset(muc, ids = highway_subset_ids)

class(muc)

[1] "osmar" "list"

muc 是一个由列表组成的列表,并且这些列表中的每个元素都有一个用于创建子集的 id。

这是完整的示例:

library("osmar")
# Download the OSM data inside a bounding box around the given centre point.
src <- osmsource_api(url = "https://api.openstreetmap.org/api/0.6/")
muc_bbox <- center_bbox(11.575278, 48.137222, 1000, 1000)
muc <- get_osm(muc_bbox, src)

# Step 1: keep only the ways that carry a "highway" tag key.
highway_way_ids <- find(muc, way(tags(k == "highway")))
highway_ways <- subset(muc, way_ids = highway_way_ids)

# Step 2: among those, find the ways that also carry a "name" tag key.
named_way_ids <- find(highway_ways, way(tags(k == "name")))

# Step 3: expand the way ids to everything they depend on, then subset once.
highway_subset_ids <- find_down(muc, way(named_way_ids))
highway_subset <- subset(muc, ids = highway_subset_ids)

提前非常感谢。

更新

如果您在使用 SSL 时遇到问题,请尝试复制粘贴以下代码示例。这是我能给出的最小示例。

我要优化的行是这一行:

final_subset <- subset(highway_subset, ids = highway_subset_ids)

library("osmar")

# Minimal reproducible input: a pasted object literal (looks like dput()
# output) of class c("osmar", "list"), so the example needs no download.
# Layout: list(nodes, ways, relations); each element holds data frames
# (attrs, tags, and for ways/relations also refs).
# Nodes and relations are zero-row here; only `ways` carries data.
highway_subset <-
  structure(list(nodes = structure(list(
          attrs = structure(
            list(
              id = numeric(0),
              visible = character(0),
              timestamp = structure(
                list(
                  sec = numeric(0),
                  min = integer(0),
                  hour = integer(0),
                  mday = integer(0),
                  mon = integer(0),
                  year = integer(0),
                  wday = integer(0),
                  yday = integer(0),
                  isdst = integer(0),
                  zone = character(0),
                  gmtoff = integer(0)
                ),
                class = c("POSIXlt", "POSIXt")
              ),
              version = numeric(0),
              changeset = numeric(0),
              user = structure(integer(0), .Label = character(0), class = "factor"),
              # Zero-length factor that still carries levels (no row uses them).
              uid = structure(
                integer(0),
                .Label = c("2455020", "2590140", "367380"),
                class = "factor"
              ),
              lat = numeric(0),
              lon = numeric(0)
            ),
            row.names = integer(0),
            class = "data.frame"
          ),
          tags = structure(
            list(
              id = numeric(0),
              k = structure(integer(0), .Label = character(0), class = "factor"),
              v = structure(integer(0), .Label = character(0), class = "factor")
            ),
            row.names = integer(0),
            class = "data.frame"
          )
        ),
        class = c("nodes", "osmar_element", "list")
      ),
      # Ways: the only populated element -- two ways, 105071009 and 366457476.
      ways = structure(
        list(
          attrs = structure(
            list(
              id = c(105071009, 366457476),
              visible = c("true", "true"),
              timestamp = structure(
                list(
                  sec = c(10, 48),
                  min = c(54L, 15L),
                  hour = c(13L, 20L),
                  mday = c(4L, 15L),
                  mon = c(2L, 4L),
                  year = 117:116,
                  wday = c(6L, 0L),
                  yday = c(62L, 135L),
                  isdst = 0:1,
                  zone = c("CET", "CEST"),
                  gmtoff = c(NA_integer_, NA_integer_)
                ),
                class = c("POSIXlt", "POSIXt")
              ),
              version = c(15, 5),
              changeset = c(46573027, 39338422),
              user = structure(
                2:1,
                .Label = c("bjoern262", "saerdnaer"),
                class = "factor"
              ),
              uid = structure(
                4:3,
                .Label = c("367380",
                           "64536", "651621", "6998"),
                class = "factor"
              )
            ),
            row.names = c(2L,
                          4L),
            class = "data.frame"
          ),
          # Tag table in long form: one row per (way id, key k, value v).
          tags = structure(
            list(
              id = c(
                105071009,
                105071009,
                105071009,
                105071009,
                105071009,
                105071009,
                105071009,
                105071009,
                105071009,
                105071009,
                105071009,
                366457476,
                366457476,
                366457476,
                366457476,
                366457476
              ),
              k = structure(
                c(1L, 2L, 3L,
                  4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 3L, 5L, 6L, 7L, 11L),
                .Label = c(
                  "conveying",
                  "description",
                  "highway",
                  "incline",
                  "indoor",
                  "layer",
                  "level",
                  "oneway",
                  "operator",
                  "ref",
                  "tunnel"
                ),
                class = "factor"
              ),
              v = structure(
                c(6L,
                  9L, 10L, 4L, 11L, 3L, 2L, 11L, 8L, 7L, 11L, 5L, 11L, 1L, 1L,
                  11L),
                .Label = c(
                  "-3",
                  "-3;-4",
                  "-4",
                  "down",
                  "footway",
                  "forward",
                  "MP19",
                  "MVG",
                  "Rolltreppe MP19",
                  "steps",
                  "yes"
                ),
                class = "factor"
              )
            ),
            row.names = 4:19,
            class = "data.frame"
          ),
          # One row per (way id, referenced id); presumably the referenced ids
          # are node ids -- confirm against the osmar documentation.
          refs = structure(
            list(
              id = c(105071009, 105071009, 366457476,
                     366457476, 366457476),
              ref = c(3270556979, 1211172719, 3270556979,
                      3704371485, 3704371444)
            ),
            row.names = c(20L, 21L, 68L, 69L,
                          70L),
            class = "data.frame"
          )
        ),
        class = c("ways", "osmar_element",
                  "list")
      ),
      # Relations: empty (all three data frames have zero rows).
      relations = structure(
        list(
          attrs = structure(
            list(
              id = numeric(0),
              visible = character(0),
              timestamp = structure(
                list(
                  sec = numeric(0),
                  min = integer(0),
                  hour = integer(0),
                  mday = integer(0),
                  mon = integer(0),
                  year = integer(0),
                  wday = integer(0),
                  yday = integer(0),
                  isdst = integer(0),
                  zone = character(0),
                  gmtoff = integer(0)
                ),
                class = c("POSIXlt", "POSIXt")
              ),
              version = numeric(0),
              changeset = numeric(0),
              user = structure(integer(0), .Label = character(0), class = "factor"),
              uid = structure(
                integer(0),
                .Label = c(
                  "137242",
                  "161619",
                  "2455020",
                  "2590140",
                  "531886",
                  "72235",
                  "8748",
                  "9451067"
                ),
                class = "factor"
              )
            ),
            row.names = integer(0),
            class = "data.frame"
          ),
          tags = structure(
            list(
              id = numeric(0),
              k = structure(integer(0), .Label = character(0), class = "factor"),
              v = structure(integer(0), .Label = character(0), class = "factor")
            ),
            row.names = integer(0),
            class = "data.frame"
          ),
          refs = structure(
            list(
              id = numeric(0),
              type = structure(integer(0), .Label = character(0), class = "factor"),
              ref = numeric(0),
              role = structure(integer(0), .Label = character(0), class = "factor")
            ),
            row.names = integer(0),
            class = "data.frame"
          )
        ),
        class = c("relations",
                  "osmar_element", "list")
      )
    ),
    class = c("osmar", "list")
  )
# Take the id of every way in the example object, expand those ids with
# find_down(), and then run the subset call whose speed is in question.
way_ids <- highway_subset$ways$attrs$id
highway_subset_ids <- find_down(highway_subset, way(way_ids))
final_subset <- subset(highway_subset, ids = highway_subset_ids)

谢谢!

标签: r, dplyr, data.table, tidyr

解决方案


我对你的代码做了性能分析(逐步计时):

library("osmar")
# Download the data for the bounding box around the given centre point.
src <- osmsource_api(url = "https://api.openstreetmap.org/api/0.6/")
muc_bbox <- center_bbox(11.575278, 48.137222, 1000, 1000)
muc <- get_osm(muc_bbox, src)

# Each step is timed on its own; the comment after a step is the timing
# reported for it.

# Step 1: keep only the ways that carry a "highway" tag key.
system.time({
  highway_ways <- subset(muc, way_ids = find(muc, way(tags(k == "highway"))))
})
# 0.157

# Step 2: among those, find the ways that also carry a "name" tag key.
system.time({
  named_way_ids <- find(highway_ways, way(tags(k == "name")))
})
# 0.001

# Step 3: expand the way ids with find_down().
system.time({
  highway_subset_ids <- find_down(muc, way(named_way_ids))
})
# 0.008

# Step 4: the final subset the question asks about.
system.time({
  highway_subset <- subset(muc, ids = highway_subset_ids)
})
# 0.025

如您所见,在我这里,最后一个 subset 并不是瓶颈,第一个才是(耗时约为最后一个的 6 倍)。

内部数据不是很大

  • nodes:15157 行
  • ways:2938 行
  • tags:11966 行
  • relations:350 行
  • 另一个 tags:3270 行

您提到需要多次执行子集操作。可行的办法大概是尝试将代码“向量化”。我指的不是显而易见的 lapply,而是提取内部的 data.frame,把它们绑定在一起,只做一次子集操作,必要时再把结果拆分回去。这里可以使用 data.table 来进一步提速。这比仅仅在循环中对 15000 行使用 data.table 取子集更有益,后者带来的收益要小得多。

要了解如何“向量化”该代码,您需要了解 osmar 的 subset 的工作原理。只要查看源代码 https://github.com/cran/osmar/blob/master/R/osmar-subsetting.R ,这并不难:

  • 从所有需要取子集的对象中取出 data.frame
  • 用 rbindlist 将它们合并
  • 用 [.data.table 对合并后的结果取子集
  • 如有需要,再进行拆分
  • 如有需要,再转换回原始对象

另请注意,osmar 软件包相当旧(发布于 2013 年),并且间接依赖于 sp 这类开发非常活跃的软件包。因此在使用 osmar 时,您可能会遇到一些问题,它们源于过去 7 年中依赖项可能引入的破坏性更改。


推荐阅读