数据平滑在R
这个问题与我以前问过的这个问题有关。 但提到这个问题没有必要回答这个问题。
数据
我有一个数据集,其中包含以0.1秒为间隔记录的2169辆车辆的速度。 所以,单个车辆有很多排。 在这里,我仅复制车辆#2的数据:
> dput(uma)
structure(list(Frame.ID = 13:445, Vehicle.velocity = c(40, 40,
40, 40, 40, 40, 40, 40.02, 40.03, 39.93, 39.61, 39.14, 38.61,
38.28, 38.42, 38.78, 38.92, 38.54, 37.51, 36.34, 35.5, 35.08,
34.96, 34.98, 35, 34.99, 34.98, 35.1, 35.49, 36.2, 37.15, 38.12,
38.76, 38.95, 38.95, 38.99, 39.18, 39.34, 39.2, 38.89, 38.73,
38.88, 39.28, 39.68, 39.94, 40.02, 40, 39.99, 39.99, 39.65, 38.92,
38.52, 38.8, 39.72, 40.76, 41.07, 40.8, 40.59, 40.75, 41.38,
42.37, 43.37, 44.06, 44.29, 44.13, 43.9, 43.92, 44.21, 44.59,
44.87, 44.99, 45.01, 45.01, 45, 45, 45, 44.79, 44.32, 43.98,
43.97, 44.29, 44.76, 45.06, 45.36, 45.92, 46.6, 47.05, 47.05,
46.6, 45.92, 45.36, 45.06, 44.96, 44.97, 44.99, 44.99, 44.99,
44.99, 45.01, 45.02, 44.9, 44.46, 43.62, 42.47, 41.41, 40.72,
40.49, 40.6, 40.76, 40.72, 40.5, 40.38, 40.43, 40.38, 39.83,
38.59, 37.02, 35.73, 35.04, 34.85, 34.91, 34.99, 34.99, 34.97,
34.96, 34.98, 35.07, 35.29, 35.54, 35.67, 35.63, 35.53, 35.53,
35.63, 35.68, 35.55, 35.28, 35.06, 35.09, 35.49, 36.22, 37.08,
37.8, 38.3, 38.73, 39.18, 39.62, 39.83, 39.73, 39.58, 39.57,
39.71, 39.91, 40, 39.98, 39.97, 40.08, 40.38, 40.81, 41.27, 41.69,
42.2, 42.92, 43.77, 44.49, 44.9, 45.03, 45.01, 45, 45, 45, 45,
45, 45, 45, 45, 45, 45, 45, 44.99, 45.03, 45.26, 45.83, 46.83,
48.2, 49.68, 50.95, 51.83, 52.19, 52, 51.35, 50.38, 49.38, 48.63,
48.15, 47.87, 47.78, 48.01, 48.63, 49.52, 50.39, 50.9, 50.96,
50.68, 50.3, 50.05, 49.94, 49.87, 49.82, 49.82, 49.88, 49.96,
50, 50, 49.98, 49.98, 50.16, 50.64, 51.43, 52.33, 53.01, 53.27,
53.22, 53.25, 53.75, 54.86, 56.36, 57.64, 58.28, 58.29, 57.94,
57.51, 57.07, 56.64, 56.43, 56.73, 57.5, 58.27, 58.55, 58.32,
57.99, 57.89, 57.92, 57.74, 57.12, 56.24, 55.51, 55.1, 54.97,
54.98, 55.02, 55.03, 54.86, 54.3, 53.25, 51.8, 50.36, 49.41,
49.06, 49.17, 49.4, 49.51, 49.52, 49.51, 49.45, 49.24, 48.84,
48.29, 47.74, 47.33, 47.12, 47.06, 47.07, 47.08, 47.05, 47.04,
47.25, 47.68, 47.93, 47.56, 46.31, 44.43, 42.7, 41.56, 41.03,
40.92, 40.92, 40.98, 41.19, 41.45, 41.54, 41.32, 40.85, 40.37,
40.09, 39.99, 39.99, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
39.98, 39.97, 40.1, 40.53, 41.36, 42.52, 43.71, 44.57, 45.01,
45.1, 45.04, 45, 45, 45, 45, 45, 45, 44.98, 44.97, 45.08, 45.39,
45.85, 46.2, 46.28, 46.21, 46.29, 46.74, 47.49, 48.35, 49.11,
49.63, 49.89, 49.94, 49.97, 50.14, 50.44, 50.78, 51.03, 51.12,
51.05, 50.85, 50.56, 50.26, 50.06, 50.1, 50.52, 51.36, 52.5,
53.63, 54.46, 54.9, 55.03, 55.09, 55.23, 55.35, 55.35, 55.23,
55.07, 54.99, 54.98, 54.97, 55.06, 55.37, 55.91, 56.66, 57.42,
58.07, 58.7, 59.24, 59.67, 59.95, 60.02, 60, 60, 60, 60, 60,
60.01, 60.06, 60.23, 60.65, 61.34, 62.17, 62.93, 63.53, 64, 64.41,
64.75, 65.04, 65.3, 65.57, 65.75, 65.74, 65.66, 65.62, 65.71,
65.91, 66.1, 66.26, 66.44, 66.61, 66.78, 66.91, 66.99, 66.91,
66.7, 66.56, 66.6, 66.83, 67.17, 67.45, 67.75, 68.15, 68.64,
69.15, 69.57, 69.79, 69.79, 69.72, 69.72, 69.81, 69.94, 70, 70.01,
70.02, 70.03)), .Names = c("Frame.ID", "Vehicle.velocity"), class = "data.frame", row.names = c(NA,
433L))
Frame.ID是观察Vehicle.velocity的时间帧。 速度变量中有一些噪音,我想平滑它。
方法
为了平滑速度,我使用下面的公式:
哪里,
Delta = 10
Nalpha =数据点数(行)
i = 1,...,Nalpha(即行号)
D = {i-1,Nalpha-i,3 * delta = 30}的最小值
xalpha =速度
题
我已经通过了R中的filter
和convolution
的文档。似乎我必须知道卷积来做到这一点。 然而,我已经尽力而为,无法理解卷积是如何工作的! 链接的问题有一个答案,这有助于我理解函数的一些内部运作,但我仍然不确定。
请问所有人都可以在这里解释这个东西是如何工作的? 或者引导我采用另一种方法来实现相同的目的,即应用等式?
我目前的代码有效,但很长
这就是uma
样子:
> head(uma)
Frame.ID Vehicle.velocity
1 13 40
2 14 40
3 15 40
4 16 40
5 17 40
6 18 40
uma$i <- 1:nrow(uma) # this is i
uma$im1 <- uma$i - 1
uma$Nai <- nrow(uma) - uma$i # this is Nalpha
uma$delta3 <- 30 # this is 3 times delta
uma$D <- pmin(uma$im1, uma$Nai, uma$delta3) # selecting the minimum of {i-1, Nalpha - i, 3*delta=15}
uma$imD <- uma$i - uma$D # i-D
uma$ipD <- uma$i + uma$D # i+D
uma <- ddply(uma, .(Frame.ID), transform, k = imD:ipD) # to include all k in the data frame
umai <- uma
umai$imk <- umai$i - umai$k # i-k
umai$aimk <- (-1) * abs(umai$imk) # -|i-k|
umai$delta <- 10
umai$kernel <- exp(umai$aimk/umai$delta) # The kernel in the equation i.e. EXP^-|i-k|/delta
umai$p <- umai$Vehicle.velocity[match(umai$k,umai$i)] #observed velocity in kth row as described in equation as t(k)
umai$kernelp <- umai$p * umai$kernel # the product of kernel and observed velocity in kth row as described in equation as t(k)
umair <- ddply(umai, .(Frame.ID), summarize, Z = sum(kernel), prod = sum(kernelp)) # summing the kernel to get Z and summing the product to get the numerator of the equation
umair$new.Y <- umair$prod/umair$Z # the final step to get the smoothed velocity
情节
仅供参考,如果将观察到的平滑速度与时间框架进行对比,我们可以看到平滑的结果:
ggplot() +
geom_point(data=uma,aes(y=Vehicle.velocity, x= Frame.ID)) +
geom_point(data=umair,aes(y=new.Y, x= Frame.ID), color="red")
请通过指导我使用卷积来帮助我缩短代码并适用于所有车辆(由数据集中的Vehicle.ID表示)。
dplyr
好的,所以我使用了下面的代码,它可以工作,但需要32 GB RAM的3个小时。 任何人都可以提出改进来加速它(每个小时由umal
, umav
和umaa
拍摄,每个小时)?
uma <- tbl_df(uma)
uma <- uma %>% # take data frame
group_by(Vehicle.ID) %>% # group by Vehicle ID
mutate(i = 1:length(Frame.ID), im1 = i-1, Nai = length(Frame.ID) - i,
Dv = pmin(im1, Nai, 30),
Da = pmin(im1, Nai, 120),
Dl = pmin(im1, Nai, 15),
imDv = i - Dv,
ipDv = i + Dv,
imDa = i - Da,
ipDa = i + Da,
imDl = i - Dl,
ipDl = i + Dl) %>% # finding i, i-1 and Nalpha-i, D, i-D and i+D for location, velocity and acceleration
ungroup()
umav <- uma %>%
group_by(Vehicle.ID, Frame.ID) %>%
do(data.frame(kv = .$imDv:.$ipDv)) %>%
left_join(x=., y=uma) %>%
mutate(imk = i - kv, aimk = (-1) * abs(imk), delta = 10, kernel = exp(aimk/delta)) %>%
ungroup() %>%
group_by(Vehicle.ID) %>%
mutate(p = Vehicle.velocity2[match(kv,i)], kernelp = p * kernel) %>%
ungroup() %>%
group_by(Vehicle.ID, Frame.ID) %>%
summarise(Z = sum(kernel), prod = sum(kernelp)) %>%
mutate(svel = prod/Z) %>%
ungroup()
umaa <- uma %>%
group_by(Vehicle.ID, Frame.ID) %>%
do(data.frame(ka = .$imDa:.$ipDa)) %>%
left_join(x=., y=uma) %>%
mutate(imk = i - ka, aimk = (-1) * abs(imk), delta = 10, kernel = exp(aimk/delta)) %>%
ungroup() %>%
group_by(Vehicle.ID) %>%
mutate(p = Vehicle.acceleration2[match(ka,i)], kernelp = p * kernel) %>%
ungroup() %>%
group_by(Vehicle.ID, Frame.ID) %>%
summarise(Z = sum(kernel), prod = sum(kernelp)) %>%
mutate(sacc = prod/Z) %>%
ungroup()
umal <- uma %>%
group_by(Vehicle.ID, Frame.ID) %>%
do(data.frame(kl = .$imDl:.$ipDl)) %>%
left_join(x=., y=uma) %>%
mutate(imk = i - kl, aimk = (-1) * abs(imk), delta = 10, kernel = exp(aimk/delta)) %>%
ungroup() %>%
group_by(Vehicle.ID) %>%
mutate(p = Local.Y[match(kl,i)], kernelp = p * kernel) %>%
ungroup() %>%
group_by(Vehicle.ID, Frame.ID) %>%
summarise(Z = sum(kernel), prod = sum(kernelp)) %>%
mutate(ycoord = prod/Z) %>%
ungroup()
umal <- select(umal,c("Vehicle.ID", "Frame.ID", "ycoord"))
umav <- select(umav, c("Vehicle.ID", "Frame.ID", "svel"))
umaa <- select(umaa, c("Vehicle.ID", "Frame.ID", "sacc"))
umair <- left_join(uma, umal) %>% left_join(x=., y=umav) %>% left_join(x=., y=umaa)
一个好的第一步是取一个for循环(我将用sapply
隐藏)并对每个索引执行指数平滑:
josilber1 <- function(uma) {
delta <- 10
sapply(1:nrow(uma), function(i) {
D <- min(i-1, nrow(uma)-i, 30)
rng <- (i-D):(i+D)
rng <- rng[rng >= 1 & rng <= nrow(uma)]
expabs <- exp(-abs(i-rng)/delta)
return(sum(uma$Vehicle.velocity[rng] * expabs) / sum(expabs))
})
}
更多参与的方法是仅计算每个指数的指数平滑函数的增量变化(而不是每个指数的重新求和)。 指数平滑函数有一个较低的部分(当前索引之前的数据;我在下面的代码中包含当前索引low
)和上部(当前索引之后的数据;下面的代码中为high
)。 当我们遍历矢量时,下半部分的所有数据的权重都会减小(我们除以mult
),而上半部分的所有数据的权重都会加大(我们乘以mult
)。 最左边的元素从low
,最左边的元素high
移动到low
,并且一个元素添加到high
右边。
实际的代码对于处理矢量的开始和结束以及处理数值稳定性问题( high
误差乘以每次迭代的mult
)有点麻烦:
josilber2 <- function(uma) {
delta <- 10
x <- uma$Vehicle.velocity
ret <- c(x[1], rep(NA, nrow(uma)-1))
low <- x[1]
high <- 0
norm <- 1
old.D <- 0
mult <- exp(1/delta)
for (i in 2:nrow(uma)) {
D <- min(i-1, nrow(uma)-i, 30)
if (D == old.D + 1) {
low <- low / mult + x[i]
high <- high * mult - x[i] + x[i+D-1]/mult^(D-1) + x[i+D]/mult^D
norm <- norm + 2 / mult^D
} else if (D == old.D) {
low <- low / mult - x[i-(D+1)]/mult^(D+1) + x[i]
high <- high * mult - x[i] + x[i+D]/mult^D
} else {
low <- low / mult - x[i-(D+2)]/mult^(D+2) - x[i-(D+1)]/mult^(D+1) + x[i]
high <- high * mult - x[i]
norm <- norm - 2 / mult^(D+1)
}
# For numerical stability, recompute high every so often
if (i %% 50 == 0) {
rng <- (i+1):(i+D)
expabs <- exp(-abs(i-rng)/delta)
high <- sum(x[rng] * expabs)
}
ret[i] <- (low+high)/norm
old.D <- D
}
return(ret)
}
像josilber2
这样的R代码通常可以使用Rcpp
包加速:
library(Rcpp)
josilber3 <- cppFunction(
"
NumericVector josilber3(NumericVector x) {
double delta = 10.0;
NumericVector ret(x.size(), 0.0);
ret[0] = x[0];
double low = x[0];
double high = 0.0;
double norm = 1.0;
int oldD = 0;
double mult = exp(1/delta);
for (int i=1; i < x.size(); ++i) {
int D = i;
if (x.size()-i-1 < D) D = x.size()-i-1;
if (30 < D) D = 30;
if (D == oldD + 1) {
low = low / mult + x[i];
high = high * mult - x[i] + x[i+D-1]/pow(mult, D-1) + x[i+D]/pow(mult, D);
norm = norm + 2 / pow(mult, D);
} else if (D == oldD) {
low = low / mult - x[i-(D+1)]/pow(mult, D+1) + x[i];
high = high * mult - x[i] + x[i+D]/pow(mult, D);
} else {
low = low / mult - x[i-(D+2)]/pow(mult, D+2) - x[i-(D+1)]/pow(mult, D+1) + x[i];
high = high * mult - x[i];
norm = norm - 2 / pow(mult, D+1);
}
if (i % 50 == 0) {
high = 0.0;
for (int j=i+1; j <= i+D; ++j) {
high += x[j] * exp((i-j)/delta);
}
}
ret[i] = (low+high)/norm;
oldD = D;
}
return ret;
}")
现在我们可以对这三种新方法的改进进行基准测试:
all.equal(umair.fxn(uma), josilber1(uma))
# [1] TRUE
all.equal(umair.fxn(uma), josilber2(uma))
# [1] TRUE
all.equal(umair.fxn(uma), josilber3(uma$Vehicle.velocity))
# [1] TRUE
library(microbenchmark)
microbenchmark(umair.fxn(uma), josilber1(uma), josilber2(uma), josilber3(uma$Vehicle.velocity))
# Unit: microseconds
# expr min lq mean median uq max neval
# umair.fxn(uma) 370006.728 382327.4115 398554.71080 393495.052 404186.153 572801.355 100
# josilber1(uma) 12879.268 13640.1310 15981.82099 14265.610 14805.419 28959.230 100
# josilber2(uma) 4324.724 4502.8125 5753.47088 4918.835 5244.309 17328.797 100
# josilber3(uma$Vehicle.velocity) 41.582 54.5235 57.76919 57.435 60.099 90.998 100
我们使用josilber1
更简单的josilber1
和70x的总加速比( josilber2
更大的delta值会有更多的优势),我们得到了很多改进(25x)。 使用josilber3
我们实现了6800倍的加速,将运行时间降至54微秒以处理单个车辆!
上一篇: Data Smoothing in R
下一篇: Fullcalendar v2: How to maintain the same scroll time when navigating weeks?