首页 > 解决方案 > 如何用 R 计算子组内的出现次数?

问题描述

我有以下data.framedput()在问题的末尾)。

(c_arr_cords[1:20,])
   linkId       x       y vehicleRefId new_arr_time duration dep_time
1      90 2681090 1245442      1267069          0.0      6.5      6.5
2      90 2681090 1245442       532654         11.0      1.0     12.0
3      90 2681090 1245442      1398907         12.5      0.5     12.5
4      90 2681090 1245442      1267069         12.0     24.0     36.0
5      90 2681090 1245442         4205         16.5      0.0     16.5
6      90 2681090 1245442      1111105         11.0      0.0     11.5
7      90 2681090 1245442       434774         16.0      0.0     16.5
8      90 2681090 1245442      1179923          0.0     15.5     15.5
9      90 2681090 1245442        46491         14.5      0.5     15.0
10     90 2681090 1245442      1179923         16.0     19.5     36.0
11     90 2681090 1245442      1326473         11.0      3.0     13.5
12     90 2681090 1245442      1239391         13.0      0.5     13.5
13     90 2681090 1245442       810534          8.0      0.0      8.0
14     90 2681090 1245442        51825          9.5      0.5     10.0
15     90 2681090 1245442      1199672         11.0      1.0     12.0
16     90 2681090 1245442      1269433         17.5      1.5     19.0
17    389 2681367 1247844       492533         14.5      1.5     16.0
18    389 2681367 1247844      1454119         17.5     18.0     36.0
19    389 2681367 1247844      1278645          0.0      8.0      8.0
20    389 2681367 1247844      1430553         10.5      1.5     12.0

我的目标是创建一个我可以看到在任何 1 小时垃圾箱data.frame上有多少车辆的地方。linkId如果车辆在时间 x 位于链路上,则可以从new_arr_time(到达)和dep_time(离开)推导出。对于time = 12链接 90 上的(第 12 小时),必须计算有多少车辆拥有new_arr_time <= 12, 和dep_time >=12。总共将有最多 48 个时间箱(如果为 0,则不需要有一个箱)。

所需的表应具有以下结构:

linkId  time    count
90      0.0     3
90      0.5     x
90      1.0     y
...
389     0.0     z
...

我的斗争是创建一个有效的循环来执行此操作。

先感谢您!

数据:

structure(list(linkId = c(90L, 90L, 90L, 90L, 90L, 90L, 90L, 
90L, 90L, 90L, 90L, 90L, 90L, 90L, 90L, 90L, 389L, 389L, 389L, 
389L, 389L, 389L, 389L, 389L, 389L, 389L, 389L, 389L, 389L, 389L, 
451L, 451L, 451L, 451L, 480L, 480L, 480L, 480L, 480L, 578L, 578L, 
578L, 578L, 578L, 578L, 578L, 662L, 662L, 662L, 662L, 662L, 662L, 
723L, 723L, 723L, 723L, 723L, 723L, 723L, 723L, 723L, 723L, 723L, 
723L, 723L, 723L, 723L, 723L, 723L, 723L, 723L, 723L, 723L, 723L, 
723L, 723L, 723L, 723L, 723L, 723L, 723L, 723L, 723L, 723L, 723L, 
723L, 723L, 723L, 723L, 723L, 723L, 723L, 723L, 723L, 723L, 723L, 
723L, 723L, 723L, 723L), x = c(2681090, 2681090, 2681090, 2681090, 
2681090, 2681090, 2681090, 2681090, 2681090, 2681090, 2681090, 
2681090, 2681090, 2681090, 2681090, 2681090, 2681366.83333333, 
2681366.83333333, 2681366.83333333, 2681366.83333333, 2681366.83333333, 
2681366.83333333, 2681366.83333333, 2681366.83333333, 2681366.83333333, 
2681366.83333333, 2681366.83333333, 2681366.83333333, 2681366.83333333, 
2681366.83333333, 2683684, 2683684, 2683684, 2683684, 2683675.34782609, 
2683675.34782609, 2683675.34782609, 2683675.34782609, 2683675.34782609, 
2676435, 2676435, 2676435, 2676435, 2676435, 2676435, 2676435, 
2682590, 2682590, 2682590, 2682590, 2682590, 2682590, 2672126, 
2672126, 2672126, 2672126, 2672126, 2672126, 2672126, 2672126, 
2672126, 2672126, 2672126, 2672126, 2672126, 2672126, 2672126, 
2672126, 2672126, 2672126, 2672126, 2672126, 2672126, 2672126, 
2672126, 2672126, 2672126, 2672126, 2672126, 2672126, 2672126, 
2672126, 2672126, 2672126, 2672126, 2672126, 2672126, 2672126, 
2672126, 2672126, 2672126, 2672126, 2672126, 2672126, 2672126, 
2672126, 2672126, 2672126, 2672126, 2672126), y = c(1245442, 
1245442, 1245442, 1245442, 1245442, 1245442, 1245442, 1245442, 
1245442, 1245442, 1245442, 1245442, 1245442, 1245442, 1245442, 
1245442, 1247843.75, 1247843.75, 1247843.75, 1247843.75, 1247843.75, 
1247843.75, 1247843.75, 1247843.75, 1247843.75, 1247843.75, 1247843.75, 
1247843.75, 1247843.75, 1247843.75, 1246790, 1246790, 1246790, 
1246790, 1246835.5, 1246835.5, 1246835.5, 1246835.5, 1246835.5, 
1241381, 1241381, 1241381, 1241381, 1241381, 1241381, 1241381, 
1237645.6, 1237645.6, 1237645.6, 1237645.6, 1237645.6, 1237645.6, 
1248783, 1248783, 1248783, 1248783, 1248783, 1248783, 1248783, 
1248783, 1248783, 1248783, 1248783, 1248783, 1248783, 1248783, 
1248783, 1248783, 1248783, 1248783, 1248783, 1248783, 1248783, 
1248783, 1248783, 1248783, 1248783, 1248783, 1248783, 1248783, 
1248783, 1248783, 1248783, 1248783, 1248783, 1248783, 1248783, 
1248783, 1248783, 1248783, 1248783, 1248783, 1248783, 1248783, 
1248783, 1248783, 1248783, 1248783, 1248783, 1248783), vehicleRefId = c(1267069L, 
532654L, 1398907L, 1267069L, 4205L, 1111105L, 434774L, 1179923L, 
46491L, 1179923L, 1326473L, 1239391L, 810534L, 51825L, 1199672L, 
1269433L, 492533L, 1454119L, 1278645L, 1430553L, 1412246L, 1533113L, 
1278645L, 1454119L, 1412246L, 1430553L, 1533113L, 1278645L, 1310775L, 
1310775L, 1161080L, 1290940L, 558745L, 628509L, 1530598L, 403850L, 
1397256L, 774916L, 3874L, 1043798L, 1881121L, 193835L, 526654L, 
2066556L, 1221124L, 12799L, 486288L, 485689L, 488147L, 485689L, 
486288L, 488147L, 2095866L, 42794L, 2149105L, 1887358L, 1902958L, 
1901830L, 1215125L, 2148165L, 1457624L, 1898426L, 1394390L, 1859644L, 
1908352L, 1885007L, 1885718L, 1887788L, 1222534L, 1888344L, 1926462L, 
1785664L, 2147547L, 1898186L, 1921295L, 1905635L, 1888247L, 1747951L, 
2149105L, 2821L, 1094609L, 1531804L, 1670344L, 1912658L, 1799420L, 
1908352L, 1925302L, 2064554L, 1887316L, 1869032L, 1925659L, 1794294L, 
1378838L, 1528492L, 4806833L, 5259385L, 1860654L, 1187619L, 1814856L, 
1863281L), new_arr_time = c(0, 11, 12.5, 12, 16.5, 11, 16, 0, 
14.5, 16, 11, 13, 8, 9.5, 11, 17.5, 14.5, 17.5, 0, 10.5, 18, 
13.5, 25, 0, 0, 12.5, 8.5, 17, 19, 0, 7.5, 7.5, 7.5, 7.5, 8.5, 
6, 13.5, 7.5, 14, 8, 10, 7.5, 18, 18, 9.5, 16, 18.5, 21, 0, 0, 
0, 18.5, 12, 19, 8, 9, 18, 14, 19, 10, 17, 12, 7, 13, 13.5, 11, 
14.5, 17, 9.5, 8.5, 8.5, 7, 6.5, 18.5, 22.5, 12.5, 18.5, 8, 14, 
6.5, 9.5, 8, 17.5, 17, 12.5, 8, 5.5, 18, 19.5, 7.5, 8.5, 13, 
18.5, 12, 15.5, 19, 20, 13, 8, 9.5), duration = c(6.5, 1, 0.5, 
24, 0, 0, 0, 15.5, 0.5, 19.5, 3, 0.5, 0, 0.5, 1, 1.5, 1.5, 18, 
8, 1.5, 17.5, 5, 11, 7, 7, 0.5, 4, 2, 16.5, 7.5, 10, 10, 10, 
9.5, 10.5, 8, 8.5, 9.5, 8, 0.5, 0.5, 3, 1, 1, 2.5, 0, 17.5, 15, 
13, 7, 8, 17.5, 1, 3.5, 4.5, 2.5, 2, 1.5, 4.5, 1, 1, 1, 10, 2, 
4, 1, 2.5, 2, 2, 1, 0.5, 10, 10.5, 5, 0, 3.5, 0, 10.5, 3, 9.5, 
1.5, 0, 3, 2.5, 0, 3, 5.5, 1.5, 1, 10, 1, 3, 0, 1, 1, 1.5, 2.5, 
1, 2.5, 0.5), dep_time = c(6.5, 12, 12.5, 36, 16.5, 11.5, 16.5, 
15.5, 15, 36, 13.5, 13.5, 8, 10, 12, 19, 16, 36, 8, 12, 36, 18.5, 
36, 7, 7, 13, 12, 19.5, 36, 7.5, 17.5, 17.5, 17.5, 17, 19, 14, 
22, 17, 22, 8.5, 11, 10, 19, 19, 12.5, 16.5, 36, 36, 13, 7, 8, 
36, 12.5, 22.5, 12.5, 11, 20, 15.5, 24, 10.5, 18, 12.5, 17, 14.5, 
17.5, 11.5, 17, 19, 12, 9.5, 9, 17, 17.5, 23.5, 22.5, 16.5, 18.5, 
19, 17, 16, 11, 8, 20.5, 19.5, 12.5, 11.5, 11, 19.5, 20.5, 17.5, 
9.5, 16, 18.5, 13, 16, 20.5, 22.5, 14, 10.5, 10)), row.names = c(NA, 
100L), class = "data.frame")

标签: rdataframedplyr

解决方案


我希望我现在能更好地理解它,这里有一种基于outer产品和数据与包tidyr 处理的方法。它比循环更消耗内存,但也更紧凑:


library("tidyr")
library("dplyr")

## half hour time slots
tm <- seq(0, 24, 0.5)

## Test if a value is in the interval. Please check manually with some examples.
## second version is more robust against IEEE floating point deviations
# fun <- function(i, x) (d[i, "new_arr_time"] <= x) & (x <= d[i, "dep_time"])
fun <- function(i, x) (d[i, "new_arr_time"] - x < 1e-6) & (x - d[i, "dep_time"] < 1e-6)

## outer creates all combinations between LinkIDs and time slots
expanded <- data.frame(outer(1:nrow(d), tm, fun))
names(expanded) <- tm

cbind(linkId=d$linkId, expanded) %>%
  pivot_longer(-linkId, names_to = "time", values_to = "count") %>%
  group_by(linkId, time) %>%
  summarize(count = sum(count))

要了解管道中各个步骤的%>%作用,请从头开始重建管道并逐行添加一行。


推荐阅读