modelr:用重采样数据拟合多个模型
在实施数据科学(TM)的整齐模型modelr
,重新采样数据使用列表列组织:
library(modelr)
library(tidyverse)
# create the k-folds
df_heights_resampled = heights %>%
crossv_kfold(k = 10, id = "Resample ID")
它可以map
模型,以列表中的每个列的训练数据集的train
,并通过计算性能度量map
平榜上有名列test
。
如果需要使用多个模型来完成,则需要对每个模型重复此操作。
# create a list of formulas
formulas_heights = formulas(
.response = ~ income,
model1 = ~ height + weight + marital + sex,
model2 = ~ height + weight + marital + sex + education
)
# fit each of the models in the list of formulas
df_heights_resampled = df_heights_resampled %>%
mutate(
model1 = map(train, function(train_data) {
lm(formulas_heights[[1]], data = train_data)
}),
model2 = map(train, function(train_data) {
lm(formulas_heights[[2]], data = train_data)
})
)
# score the models on the test sets
df_heights_resampled = df_heights_resampled %>%
mutate(
rmse1 = map2_dbl(.x = model1, .y = test, .f = rmse),
rmse2 = map2_dbl(.x = model2, .y = test, .f = rmse)
)
这使:
> df_heights_resampled
# A tibble: 10 × 7
train test `Resample ID` model1 model2 rmse1 rmse2
<list> <list> <chr> <list> <list> <dbl> <dbl>
1 <S3: resample> <S3: resample> 01 <S3: lm> <S3: lm> 58018.35 53903.99
2 <S3: resample> <S3: resample> 02 <S3: lm> <S3: lm> 55117.37 50279.38
3 <S3: resample> <S3: resample> 03 <S3: lm> <S3: lm> 49005.82 44613.93
4 <S3: resample> <S3: resample> 04 <S3: lm> <S3: lm> 55437.07 51068.90
5 <S3: resample> <S3: resample> 05 <S3: lm> <S3: lm> 48845.35 44673.88
6 <S3: resample> <S3: resample> 06 <S3: lm> <S3: lm> 58226.69 54010.50
7 <S3: resample> <S3: resample> 07 <S3: lm> <S3: lm> 56571.93 53322.41
8 <S3: resample> <S3: resample> 08 <S3: lm> <S3: lm> 46084.82 42294.50
9 <S3: resample> <S3: resample> 09 <S3: lm> <S3: lm> 59762.22 54814.55
10 <S3: resample> <S3: resample> 10 <S3: lm> <S3: lm> 45328.48 41882.79
题:
如果要探索的模型数量很大,这可能非常麻烦。 modelr
提供fit_with
功能,允许遍历多个型号(如,其特征在于多个公式),但是似乎不以允许像列表列train
在上面的模型。 我假设*map*
函数族中的一个将使这成为可能( invoke_map
?),但一直未能弄清楚。
您可以使用map
和lazyeval::interp
编程方式构建调用。 我很好奇,如果有纯粹的purrr
解决方案,但问题是您想创建多个列,并且您需要多次调用。 purrr
解决方案可能会创建另一个包含所有模型的列表列。
library(lazyeval)
model_calls <- map(formulas_heights,
~interp(~map(train, ~lm(form, data = .x)), form = .x))
score_calls <- map(names(model_calls),
~interp(~map2_dbl(.x = m, .y = test, .f = rmse), m = as.name(.x)))
names(score_calls) <- paste0("rmse", seq_along(score_calls))
df_heights_resampled %>% mutate_(.dots = c(model_calls, score_calls))
# A tibble: 10 × 7
train test `Resample ID` model1 model2 rmse1 rmse2
<list> <list> <chr> <list> <list> <dbl> <dbl>
1 <S3: resample> <S3: resample> 01 <S3: lm> <S3: lm> 44720.86 41452.07
2 <S3: resample> <S3: resample> 02 <S3: lm> <S3: lm> 54174.38 48823.03
3 <S3: resample> <S3: resample> 03 <S3: lm> <S3: lm> 56854.21 52725.62
4 <S3: resample> <S3: resample> 04 <S3: lm> <S3: lm> 53312.38 48797.48
5 <S3: resample> <S3: resample> 05 <S3: lm> <S3: lm> 61883.90 57469.17
6 <S3: resample> <S3: resample> 06 <S3: lm> <S3: lm> 55709.83 50867.26
7 <S3: resample> <S3: resample> 07 <S3: lm> <S3: lm> 53036.06 48698.07
8 <S3: resample> <S3: resample> 08 <S3: lm> <S3: lm> 55986.83 52717.94
9 <S3: resample> <S3: resample> 09 <S3: lm> <S3: lm> 51738.60 48006.74
10 <S3: resample> <S3: resample> 10 <S3: lm> <S3: lm> 45061.22 41480.35
受我自己的问题启发,我认为对这个问题有类似的方法。
首先,定义一个函数,该函数可以采用列 - 列结构中的数据和公式参数,并使用输入估计模型:
est_model <- function(data, formula, ...) {
map(list(data), formula, ...)[[1]]
}
然后用每个CV折叠和配方对估计多个模型会很简单:
library(gapminder)
library(tidyverse)
library(modelr)
cv_gm <- gapminder %>%
crossv_kfold(k = 10, id = "Resample ID")
# Assume 4 different formulae
formulae_tbl <- tibble::frame_data(
~model, ~fmla,
1, ~lm(lifeExp ~ year, data = .),
2, ~lm(lifeExp ~ year + pop, data = .),
3, ~lm(lifeExp ~ year + gdpPercap, data = .),
4, ~lm(lifeExp ~ year + pop + gdpPercap, data = .)
)
cv_gm_results <- cv_gm %>%
tidyr::crossing(formulae_tbl)
cv_gm_results <- cv_gm_results %>%
mutate(fit=map2(train, fmla, est_model),
rmse=map2_dbl(fit, test, .f = rmse))
可以说,根据整洁的数据哲学,最好先处理cv_gm_results
,但如果你想在原始问题的形状中使用它(h / t这个问题):
cv_gm_results %>%
select(`Resample ID`, model, fit, rmse) %>%
gather(variable, value, fit, rmse) %>%
unite(temp, variable, model, sep="") %>%
spread(temp, value) %>%
mutate_at(.cols=vars(starts_with("rmse")), .funs=flatten_dbl)
# A tibble: 10 × 9
`Resample ID` fit1 fit2 fit3 fit4 rmse1 rmse2
<chr> <list> <list> <list> <list> <dbl> <dbl>
1 01 <S3: lm> <S3: lm> <S3: lm> <S3: lm> 11.32344 11.32201
2 02 <S3: lm> <S3: lm> <S3: lm> <S3: lm> 11.34626 11.33175
3 03 <S3: lm> <S3: lm> <S3: lm> <S3: lm> 11.62480 11.60221
4 04 <S3: lm> <S3: lm> <S3: lm> <S3: lm> 10.80946 10.81421
5 05 <S3: lm> <S3: lm> <S3: lm> <S3: lm> 11.52413 11.52384
6 06 <S3: lm> <S3: lm> <S3: lm> <S3: lm> 12.10914 12.08134
7 07 <S3: lm> <S3: lm> <S3: lm> <S3: lm> 11.97641 12.00809
8 08 <S3: lm> <S3: lm> <S3: lm> <S3: lm> 12.30191 12.31489
9 09 <S3: lm> <S3: lm> <S3: lm> <S3: lm> 11.96970 11.95617
10 10 <S3: lm> <S3: lm> <S3: lm> <S3: lm> 11.30289 11.30294
# ... with 2 more variables: rmse3 <dbl>, rmse4 <dbl>
更新
事实证明,est_model()不是必需的, purrr
提供了at_depth()
,它适用于我们的目的:
cv_gm_results <- cv_gm_results %>%
mutate(fit=map2(train, fmla, ~at_depth(.x, 0, .y)),
rmse=map2_dbl(fit, test, .f = rmse))
链接地址: http://www.djcxy.com/p/36883.html