author:
date: April 30, 2021
knitr::opts_chunk$set(echo = TRUE,
                      cache.lazy = FALSE,
                      dev = c("svglite", "pdf", "png"),
                      dpi = 300,
                      fig.path = 'figures/',
                      fig.keep = "high")
# Adapted from: https://github.com/yihui/knitr-examples/blob/master/077-wrap-output.Rmd
library(knitr)
hook_output = knit_hooks$get('output')
knit_hooks$set(output = function(x, options) {
  # this hook is used only when the linewidth option is not NULL
  if (!is.null(n <- options$linewidth)) {
    x = knitr:::split_lines(x)
    # any lines wider than n should be wrapped
    if (any(nchar(x) > n)) x = strwrap(x, width = n)
    x = paste(x, collapse = '\n')
  }
  hook_output(x, options)
})
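# Usage note (per the knitr example linked above): the hook fires only for
# chunks that set the custom `linewidth` chunk option, e.g. `linewidth = 80`
# in a chunk header wraps that chunk's printed output at 80 characters.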
library(tidyverse) # A collection of packages for data science. More about it at
                   # www.tidyverse.org
library(magrittr)  # Provides pipe operators like %>%
library(mgcv)      # Simon Wood's package implementing his (spline-based)
                   # GAM specification
library(glue)      # Provides interpolated string functions
# If you want to reproduce the examples, you have to run this code chunk first!
powerSet <- function(set) {
  # enumerate all TRUE/FALSE inclusion patterns, then use each row to subset `set`
  apply(X = rep(list(c(T, F)), length.out = length(set)) %>% expand.grid(),
        MARGIN = 1,
        FUN = function(...) set[...])
}
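# A quick illustration of powerSet() on a toy input (hypothetical, not part of
# the analysis): it should return all 2^n subsets of its argument, with the
# empty set as the last element -- which the pipeline below drops.
powerSet(c("a", "b"))
# expected: c("a", "b"), "b", "a", character(0)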
train_data <- readRDS("data/ir_trainset.rds")
feature_list <-
  c("s(score_neural)", "s(score_tfIdf)", "cited_inventor", "cited_examiner", "cited_family")
fitted_models <-
  # generate the power set of feature_list, and remove the empty set
  powerSet(feature_list)[-(2^length(feature_list))] %>%
  # build the symbolic model descriptions
  sapply(., function(...) glue("relevance~{glue_collapse(..., sep='+')}")) %>%
  # fit the models to the data, and extract key statistics
  tibble(formula_str = .,
         models = lapply(X = .,
                         FUN = function(m_str, ...) gam(as.formula(m_str), ...),
                         data = train_data),
         data_fit = round(sapply(models, function(m) summary(m)$dev.expl) * 100),
         complexity = sapply(models, function(...) attr(logLik(...), 'df'))) %>%
  # Sort the models so that we can find models that replicate the data well;
  # for these models the residual deviance should be approximately zero.
  arrange(desc(data_fit), complexity)
head(fitted_models[,-2], n = 10)
| formula_str                                                                         | data_fit | complexity |
|-------------------------------------------------------------------------------------|---------:|-----------:|
| relevance~cited_inventor+cited_examiner+cited_family                                |      100 |   5.000000 |
| relevance~s(score_neural)+cited_inventor+cited_examiner+cited_family                |      100 |   7.773945 |
| relevance~s(score_tfIdf)+cited_inventor+cited_examiner+cited_family                 |      100 |   8.104671 |
| relevance~s(score_neural)+s(score_tfIdf)+cited_inventor+cited_examiner+cited_family |      100 |  11.719429 |
| relevance~s(score_tfIdf)+cited_examiner+cited_family                                |       95 |  12.933423 |
| relevance~s(score_neural)+cited_examiner+cited_family                               |       95 |  12.998698 |
| relevance~s(score_neural)+s(score_tfIdf)+cited_examiner+cited_family                |       95 |  21.863611 |
| relevance~cited_examiner+cited_family                                               |       94 |   4.000000 |
| relevance~s(score_neural)+s(score_tfIdf)+cited_inventor+cited_examiner              |       87 |  21.790113 |
| relevance~s(score_tfIdf)+cited_inventor+cited_examiner                              |       86 |  12.966607 |
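# Sanity check (a sketch; assumes all fits succeeded): the power set of the
# five features minus the empty set should yield 2^5 - 1 = 31 candidate models.
nrow(fitted_models)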
selected_model <- fitted_models$models[[1]]
summary(selected_model)
Family: gaussian 
Link function: identity 

Formula:
relevance ~ cited_inventor + cited_examiner + cited_family

Parametric coefficients:
                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)       -1.249e-16  0.000e+00    -Inf   <2e-16 ***
cited_inventoryes  1.000e+00  0.000e+00     Inf   <2e-16 ***
cited_examineryes  2.000e+00  0.000e+00     Inf   <2e-16 ***
cited_familyyes    3.000e+00  0.000e+00     Inf   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

R-sq.(adj) =      1   Deviance explained = 100%
GCV = 0  Scale est. = 0         n = 318375
# theoretical relevance: maps citation level 0..3 to relevance 0..3
relevance <- function(x, coeff = 0:3) coeff[round(x) + 1]
# pseudo smoother: looks up the fitted coefficient for each citation level,
# mimicking a smooth term over the discrete citation feature
pseudoSmoother <- function(x, model = selected_model) {
  coef(model)[round(x) + 1]
}
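# Given the coefficients in the summary above (~0, 1, 2, 3), evaluating the
# pseudo smoother at the four citation levels should reproduce the theoretical
# relevance values almost exactly:
pseudoSmoother(0:3)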
dev_selected <- floor(summary(selected_model)$dev.expl * 100)
par(mfrow = c(1,2), mar = c(4,3,2,1), oma = c(1,1,1,1))
plot.function(relevance,
              from = -.49,
              to = 3.49,
              n = 1001,
              lty = "dashed",
              col = "blue",
              xlim = c(-.49, 3.49),
              ylim = c(0,3),
              xaxt = "n",
              yaxt = "n",
              main = "Theoretical Function",
              ylab = "Relevance",
              xlab = "")
axis(1, at = 0:3, labels = c("none", "inventor", "examiner", "family"), cex.axis = .9)
axis(2, at = 0:3, labels = 0:3)
plot.function(pseudoSmoother,
              from = -.49,
              to = 3.49,
              n = 1001,
              lty = "solid",
              xlim = c(-.49, 3.49),
              ylim = c(0,3),
              xaxt = "n",
              yaxt = "n",
              main = glue("Selected Model (D\U00B2={dev_selected}%)"),
              ylab = "",
              xlab = "")
axis(1, at = 0:3, labels = c("none", "inventor", "examiner", "family"), cex.axis = .9)
axis(2, at = 0:3, labels = 0:3)
par(mfrow = c(1,1), mar = c(5,4,4,2))
gam_ir_nocirc <-
  gam(relevance ~ s(score_tfIdf) + s(score_neural),
      data = train_data)
gam_ir_all <-
  gam(relevance ~ s(score_tfIdf) + s(score_neural) + cited_inventor + cited_examiner + cited_family,
      data = train_data)
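# Optional sanity check (a sketch, not part of the original analysis): with the
# citation features included, the fit should reproduce the training relevance
# essentially perfectly, while the score-only model should not.
rmse <- function(model) sqrt(mean((predict(model) - train_data$relevance)^2))
c(no_citations = rmse(gam_ir_nocirc), with_citations = rmse(gam_ir_all))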
# redefine the pseudo smoother against the full model
pseudoSmoother <- function(x, model = gam_ir_all) {
  coef(model)[round(x) + 1]
}
dev_nocirc <- floor(summary(gam_ir_nocirc)$dev.expl * 100)
dev_withcirc <- floor(summary(gam_ir_all)$dev.expl * 100)
par(mfrow = c(3,2), mar = c(2,3,2,3), oma = c(1,1,1,1))
plot.function(pseudoSmoother,
              from = -.49,
              to = 3.49,
              n = 1001,
              type = "n",
              xlim = c(-.49, 3.49),
              ylim = c(-.1, 3),
              xaxt = "n",
              yaxt = "n",
              main = glue("GAM without Citation Feature (D\U00B2={dev_nocirc}%)"),
              ylab = "Pseudo Feature Shape",
              xlab = "citation")
axis(1, at = 0:3, labels = c("none", "inventor", "examiner", "family"))
axis(2, at = 0:3, labels = 0:3)
text(labels = "not included in GAM",
     x = mean(c(-.49, 3.49)),
     y = mean(c(0, 3)),
     adj = .5)
plot.function(pseudoSmoother,
              from = -.49,
              to = 3.49,
              n = 1001,
              lty = "solid",
              xlim = c(-.49, 3.49),
              ylim = c(-.1, 3),
              xaxt = "n",
              yaxt = "n",
              main = glue("GAM with Citation Feature (D\U00B2={dev_withcirc}%)"),
              ylab = "",
              xlab = "citation")
axis(1, at = 0:3, labels = c("none", "inventor", "examiner", "family"))
axis(2, at = 0:3, labels = 0:3)
mtext("citation", side = 4, line = 1)
plot(gam_ir_nocirc,
     se = FALSE,
     main = "",
     ylab = "",
     xlab = "",
     ylim = c(-.1, 3),
     yaxt = "n",
     select = 1,
     shift = coef(gam_ir_nocirc)[1])
axis(2, at = 0:3, labels = 0:3)
rug(unique(train_data$score_tfIdf))
plot(gam_ir_all,
     se = FALSE,
     main = "",
     ylab = "",
     yaxt = "n",
     xlab = "",
     ylim = c(-.1, 3),
     select = 1)
axis(2, at = 0:3, labels = 0:3)
rug(unique(train_data$score_tfIdf))
mtext("tf-Idf", side = 4, line = 1)
plot(gam_ir_nocirc,
     se = FALSE,
     main = "",
     ylab = "",
     xlab = "",
     ylim = c(-.1, 3),
     xlim = c(-18, 10),
     yaxt = "n",
     select = 2,
     shift = coef(gam_ir_nocirc)[1])
axis(2, at = 0:3, labels = 0:3)
rug(unique(train_data$score_neural))
plot(gam_ir_all,
     se = FALSE,
     main = "",
     ylab = "",
     yaxt = "n",
     xlab = "",
     ylim = c(-.1, 3),
     xlim = c(-18, 10),
     select = 2)
axis(2, at = 0:3, labels = 0:3)
rug(unique(train_data$score_neural))
mtext("neural", side = 4, line = 1)
par(mfrow = c(1,1), mar = c(5,4,4,2))
Warning message in rug(unique(train_data$score_neural)):
“some values will be clipped”
Warning message in rug(unique(train_data$score_neural)):
“some values will be clipped”
summary(gam_ir_all)
Family: gaussian 
Link function: identity 

Formula:
relevance ~ s(score_tfIdf) + s(score_neural) + cited_inventor + 
    cited_examiner + cited_family

Parametric coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)       6.735e-16  0.000e+00     Inf   <2e-16 ***
cited_inventoryes 1.000e+00  0.000e+00     Inf   <2e-16 ***
cited_examineryes 2.000e+00  0.000e+00     Inf   <2e-16 ***
cited_familyyes   3.000e+00  0.000e+00     Inf   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Approximate significance of smooth terms:
                  edf Ref.df  F p-value
s(score_tfIdf)  3.500      0 NA      NA
s(score_neural) 3.737      0 NA      NA

R-sq.(adj) =      1   Deviance explained = 100%
GCV = 0  Scale est. = 0         n = 318375