author:
date: 30. April 2021
knitr::opts_chunk$set(echo = TRUE,
cache.lazy = FALSE,
dev = c("svglite", "pdf", "png"),
dpi = 300,
fig.path = 'figures/',
fig.keep = "high")
#added from: https://github.com/yihui/knitr-examples/blob/master/077-wrap-output.Rmd
library(knitr)
hook_output = knit_hooks$get('output')
knit_hooks$set(output = function(x, options) {
# this hook is used only when the linewidth option is not NULL
if (!is.null(n <- options$linewidth)) {
x = knitr:::split_lines(x)
# any lines wider than n should be wrapped
if (any(nchar(x) > n)) x = strwrap(x, width = n)
x = paste(x, collapse = '\n')
}
hook_output(x, options)
})
library(tidyverse) # A collection of packages for data science. More about it on
# www.tidyverse.com
library(magrittr) # A package that provides pipe operators like %>%
library(mgcv) # A package written by Simon Wood that implements his
# (spline based) GAM specification.
library(glue) # A package that provides interpolated string functions.
#When you want to reproduce the examples, you have to run this code chunk in advance!!
powerSet <- function(set) {
apply(X = rep(list(c(T,F)), length = length(set)) %>% expand.grid(.),
MARGIN = 1,
FUN = function(...) set[...])
}
train_data <- readRDS("data/dataset-train_nostd.rds")
Train set size: nrow(train_data)
measurement points.
method <- "pearson"
corr_matrix <-
sapply(select(train_data, amv:horovitz),
cor,
method = method,
y = train_data$target)
size_feature_list <- 5
feature_list <-
corr_matrix[order(abs(corr_matrix), decreasing=TRUE)][1:size_feature_list] %>%
names(.) %>%
paste0("s(", .,")")
fitted_models <-
#generate the power set of feature_list, and remove the void set
powerSet(feature_list)[-(2^length(feature_list))] %>%
#build the symbolic model description
sapply(., function(...) glue("target~{glue_collapse(..., sep='+')}")) %>%
#fit the models to data, and extract key statistics
tibble(formula_str = .,
models = lapply(X = .,
FUN = function(m_str,...) gam(as.formula(m_str), ...),
data = train_data),
data_fit = round(sapply(models, function(m) summary(m)$dev.expl) * 100),
complexity = sapply(models, function(...) attr(logLik(...), 'df'))) %>%
#Sort the models so that we can find models that replicate the data well.
#For this models data_fit should be approximately 100.
arrange(desc(data_fit), complexity)
head(fitted_models[,-2], n = 10)
formula_str | data_fit | complexity |
---|---|---|
<chr> | <dbl> | <dbl> |
target~s(bili) | 94 | 10.99993 |
target~s(bili)+s(thrombo) | 94 | 19.94038 |
target~s(bili)+s(quinr) | 94 | 19.98920 |
target~s(bili)+s(alat) | 94 | 19.99194 |
target~s(bili)+s(asat) | 94 | 19.99783 |
target~s(bili)+s(asat)+s(thrombo) | 94 | 28.91834 |
target~s(bili)+s(thrombo)+s(alat) | 94 | 28.93953 |
target~s(bili)+s(quinr)+s(thrombo) | 94 | 28.96878 |
target~s(bili)+s(asat)+s(alat) | 94 | 28.97807 |
target~s(bili)+s(quinr)+s(alat) | 94 | 28.98078 |
selected_model <- gam(target ~ s(bili, k = 100), data=train_data)
summary(selected_model)
Family: gaussian Link function: identity Formula: target ~ s(bili, k = 100) Parametric coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 0.234892 0.000071 3308 <2e-16 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Approximate significance of smooth terms: edf Ref.df F p-value s(bili) 98.93 99 890760 <2e-16 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 R-sq.(adj) = 0.996 Deviance explained = 99.6% GCV = 0.0016308 Scale est. = 0.0016303 n = 323404
jump_bili <- function(x) (x > 12) + (x > 6) + (x > 2) + (x > 1.2) + 0
plot(selected_model,
se = FALSE,
#rug = TRUE,
xlim = c(0,15),
ylim = c(0,4),
main = "",
ylab = "",
xlab = "bilirubin",
shift = coef(selected_model)[1])
rug(unique(train_data$bili))
curve(jump_bili, from = 0, to = 15, n = 1001, add = TRUE, col = "blue", lty="dashed")
abline(v = c(1.2, 2, 6, 12), col = "blue", lty="dotted")
curve(jump_bili, from = 0, to = 15, n = 1001, add = TRUE, col = "blue", lty="dashed")
abline(v = c(1.2, 2, 6, 12), col = "blue", lty="dotted")
legend(legend = c("Theoretical", "Estimated"),
col = c("blue", "black"),
lty = c("dashed", "solid"),
bty = "o",
bg = "white",
x = "topleft")
Warning message in rug(unique(train_data$bili)): “some values will be clipped”
gam_all <-
gam(target ~ s(bili, k = 100) + s(asat) + s(quinr) + s(thrombo) + s(alat),
data = train_data)
gam_nobili <-
gam(target ~ s(asat) + s(quinr) + s(thrombo) + s(alat),
data = train_data)
summary(gam_all )
Family: gaussian Link function: identity Formula: target ~ s(bili, k = 100) + s(asat) + s(quinr) + s(thrombo) + s(alat) Parametric coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 2.349e-01 6.984e-05 3363 <2e-16 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Approximate significance of smooth terms: edf Ref.df F p-value s(bili) 98.970 99.000 684426.8 <2e-16 *** s(asat) 9.000 9.000 447.7 <2e-16 *** s(quinr) 9.000 9.000 146.5 <2e-16 *** s(thrombo) 8.963 8.999 145.2 <2e-16 *** s(alat) 8.997 9.000 215.5 <2e-16 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 R-sq.(adj) = 0.996 Deviance explained = 99.6% GCV = 0.0015781 Scale est. = 0.0015774 n = 323404
summary(gam_nobili)
Family: gaussian Link function: identity Formula: target ~ s(asat) + s(quinr) + s(thrombo) + s(alat) Parametric coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 0.234892 0.001013 231.8 <2e-16 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Approximate significance of smooth terms: edf Ref.df F p-value s(asat) 8.989 9.000 938.8 <2e-16 *** s(quinr) 8.980 9.000 2150.2 <2e-16 *** s(thrombo) 8.946 8.999 2230.6 <2e-16 *** s(alat) 8.933 8.998 745.2 <2e-16 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 R-sq.(adj) = 0.256 Deviance explained = 25.6% GCV = 0.33215 Scale est. = 0.33211 n = 323404
dev_nocirc <- round(summary(gam_nobili)$dev.expl * 100)
dev_withcirc <- round(summary(gam_all)$dev.expl * 100)
par(mfrow = c(5,2), mar=c(2,3,2,3), oma = c(1,1,1,1))
plot(x = 0:15, y = seq(0,4, length.out=16),
xlim = c(0,15),
ylim = c(0,4),
main = glue("GAM without Bilirubin (D\U00B2={dev_nocirc}%)"),
ylab = "",
xlab = "",
type = "n")
text(labels = "not included in GAM",
x = mean(c(0,15)),
y = mean(c(0,4)),
adj = .5)
plot(gam_all,
se = FALSE,
#rug = TRUE,
xlim = c(0,15),
ylim = c(0,4),
main = glue("GAM with all Features (D\U00B2={dev_withcirc}%)"),
ylab = "",
xlab = "",
select = 1,
shift = coef(gam_all)[1])
rug(unique(train_data$bili))
mtext("bilirubin", side = 4, line = 1)
plot(gam_nobili,
se = FALSE,
#rug = TRUE,
ylab = "Feature Shape",
xlab = "",
ylim = c(-1,4),
#xlim = c(,),
select = 1)
abline(h=0, lty = 3)
rug(unique(train_data$asat))
plot(gam_all,
se = FALSE,
#rug = TRUE,
ylab = "",
xlab = "",
ylim = c(-1,4),
#xlim = c(,),
select = 2)
abline(h=0, lty = 3)
rug(unique(train_data$asat))
mtext("asat", side = 4, line = 1)
plot(gam_nobili,
se = FALSE,
#rug = TRUE,
ylab = "",
xlab = "",
ylim = c(-1,4),
xlim = c(0.8, 4),
select = 2)
abline(h=0, lty = 3)
rug(unique(train_data$quinr))
plot(gam_all,
se = FALSE,
#rug = TRUE,
ylab = "",
xlab = "",
ylim = c(-1,4),
xlim = c(0.8, 4),
select = 3)
abline(h=0, lty = 3)
rug(unique(train_data$quinr))
mtext("quinr", side = 4, line = 1)
plot(gam_nobili,
se = FALSE,
#rug = TRUE,
#ylab = "",
xlab = "",
ylim = c(-1,4),
#xlim = c(0, 7000),
select = 3)
rug(unique(train_data$thrombo))
abline(h=0, lty = 3)
plot(gam_all,
se = FALSE,
#rug = TRUE,
ylab = "",
#xlab = "",
ylim = c(-1,4),
#xlim = c(0, 7000),
select = 4)
abline(h=0, lty = 3)
rug(unique(train_data$thrombo))
mtext("thrombo", side = 4, line = 1)
plot(gam_nobili,
se = FALSE,
#rug = TRUE,
ylab = "",
#xlab = "",
ylim = c(-1,4),
#xlim = c(, ),
select = 4)
abline(h=0, lty = 3)
rug(unique(train_data$alat))
plot(gam_all,
se = FALSE,
#rug = TRUE,
ylab = "",
xlab = "",
ylim = c(-1,4),
#xlim = c(, ),
select = 5)
abline(h=0, lty = 3)
rug(unique(train_data$alat))
mtext("alat", side = 4, line = 1)
par(mfrow = c(1,1), mar = c(5,4,4,2))
Warning message in rug(unique(train_data$bili)): “some values will be clipped” Warning message in rug(unique(train_data$quinr)): “some values will be clipped” Warning message in rug(unique(train_data$quinr)): “some values will be clipped”