author:
date: April 30, 2021
knitr::opts_chunk$set(echo = TRUE,
                      cache.lazy = FALSE,
                      dev = c("svglite", "pdf", "png"),
                      dpi = 300,
                      fig.path = 'figures/',
                      fig.keep = "high")
# Adapted from: https://github.com/yihui/knitr-examples/blob/master/077-wrap-output.Rmd
library(knitr)
hook_output = knit_hooks$get('output')
knit_hooks$set(output = function(x, options) {
  # this hook is used only when the linewidth option is not NULL
  if (!is.null(n <- options$linewidth)) {
    x = knitr:::split_lines(x)
    # any lines wider than n should be wrapped
    if (any(nchar(x) > n)) x = strwrap(x, width = n)
    x = paste(x, collapse = '\n')
  }
  hook_output(x, options)
})
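# Usage note (per the knitr example linked above): the hook fires only for
# chunks that set the custom `linewidth` chunk option, e.g. `linewidth = 80`
# in a chunk header wraps that chunk's printed output at 80 characters.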
library(tidyverse) # A collection of packages for data science. More about it at
                   # www.tidyverse.org
library(magrittr)  # Provides pipe operators like %>%
library(mgcv)      # Simon Wood's package implementing his (spline-based)
                   # GAM specification
library(glue)      # Provides interpolated string functions
# If you want to reproduce the examples, you have to run this code chunk first!
powerSet <- function(set) {
  # enumerate all TRUE/FALSE inclusion patterns, then use each row to subset `set`
  apply(X = rep(list(c(T, F)), length.out = length(set)) %>% expand.grid(),
        MARGIN = 1,
        FUN = function(...) set[...])
}
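# A quick illustration of powerSet() on a toy input (hypothetical, not part of
# the analysis): it should return all 2^n subsets of its argument, with the
# empty set as the last element -- which the pipeline below drops.
powerSet(c("a", "b"))
# expected: c("a", "b"), "b", "a", character(0)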
train_data <- readRDS("data/ir_trainset.rds")
feature_list <-
  c("s(score_neural)", "s(score_tfIdf)", "cited_inventor", "cited_examiner", "cited_family")
fitted_models <-
  # generate the power set of feature_list, and remove the empty set
  powerSet(feature_list)[-(2^length(feature_list))] %>%
  # build the symbolic model descriptions
  sapply(., function(...) glue("relevance~{glue_collapse(..., sep='+')}")) %>%
  # fit the models to the data, and extract key statistics
  tibble(formula_str = .,
         models = lapply(X = .,
                         FUN = function(m_str, ...) gam(as.formula(m_str), ...),
                         data = train_data),
         data_fit = round(sapply(models, function(m) summary(m)$dev.expl) * 100),
         complexity = sapply(models, function(...) attr(logLik(...), 'df'))) %>%
  # Sort the models so that we can find models that replicate the data well;
  # for these models the residual deviance should be approximately zero.
  arrange(desc(data_fit), complexity)
head(fitted_models[,-2], n = 10)
| formula_str                                                                         | data_fit | complexity |
|-------------------------------------------------------------------------------------|---------:|-----------:|
| relevance~cited_inventor+cited_examiner+cited_family                                |      100 |   5.000000 |
| relevance~s(score_neural)+cited_inventor+cited_examiner+cited_family                |      100 |   7.773945 |
| relevance~s(score_tfIdf)+cited_inventor+cited_examiner+cited_family                 |      100 |   8.104671 |
| relevance~s(score_neural)+s(score_tfIdf)+cited_inventor+cited_examiner+cited_family |      100 |  11.719429 |
| relevance~s(score_tfIdf)+cited_examiner+cited_family                                |       95 |  12.933423 |
| relevance~s(score_neural)+cited_examiner+cited_family                               |       95 |  12.998698 |
| relevance~s(score_neural)+s(score_tfIdf)+cited_examiner+cited_family                |       95 |  21.863611 |
| relevance~cited_examiner+cited_family                                               |       94 |   4.000000 |
| relevance~s(score_neural)+s(score_tfIdf)+cited_inventor+cited_examiner              |       87 |  21.790113 |
| relevance~s(score_tfIdf)+cited_inventor+cited_examiner                              |       86 |  12.966607 |
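# Sanity check (a sketch; assumes all fits succeeded): the power set of the
# five features minus the empty set should yield 2^5 - 1 = 31 candidate models.
nrow(fitted_models)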
selected_model <- fitted_models$models[[1]]
summary(selected_model)
Family: gaussian 
Link function: identity 

Formula:
relevance ~ cited_inventor + cited_examiner + cited_family

Parametric coefficients:
                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)       -1.249e-16  0.000e+00    -Inf   <2e-16 ***
cited_inventoryes  1.000e+00  0.000e+00     Inf   <2e-16 ***
cited_examineryes  2.000e+00  0.000e+00     Inf   <2e-16 ***
cited_familyyes    3.000e+00  0.000e+00     Inf   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

R-sq.(adj) =      1   Deviance explained = 100%
GCV = 0  Scale est. = 0         n = 318375
# theoretical relevance: maps citation level 0..3 to relevance 0..3
relevance <- function(x, coeff = 0:3) coeff[round(x) + 1]
# pseudo smoother: looks up the fitted coefficient for each citation level,
# mimicking a smooth term over the discrete citation feature
pseudoSmoother <- function(x, model = selected_model) {
  coef(model)[round(x) + 1]
}
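# Given the coefficients in the summary above (~0, 1, 2, 3), evaluating the
# pseudo smoother at the four citation levels should reproduce the theoretical
# relevance values almost exactly:
pseudoSmoother(0:3)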
dev_selected <- floor(summary(selected_model)$dev.expl * 100)
par(mfrow = c(1,2), mar = c(4,3,2,1), oma = c(1,1,1,1))
plot.function(relevance,
              from = -.49,
              to = 3.49,
              n = 1001,
              lty = "dashed",
              col = "blue",
              xlim = c(-.49, 3.49),
              ylim = c(0,3),
              xaxt = "n",
              yaxt = "n",
              main = "Theoretical Function",
              ylab = "Relevance",
              xlab = "")
axis(1, at = 0:3, labels = c("none", "inventor", "examiner", "family"), cex.axis = .9)
axis(2, at = 0:3, labels = 0:3)
plot.function(pseudoSmoother,
              from = -.49,
              to = 3.49,
              n = 1001,
              lty = "solid",
              xlim = c(-.49, 3.49),
              ylim = c(0,3),
              xaxt = "n",
              yaxt = "n",
              main = glue("Selected Model (D\U00B2={dev_selected}%)"),
              ylab = "",
              xlab = "")
axis(1, at = 0:3, labels = c("none", "inventor", "examiner", "family"), cex.axis = .9)
axis(2, at = 0:3, labels = 0:3)
par(mfrow = c(1,1), mar = c(5,4,4,2))
gam_ir_nocirc <-
  gam(relevance ~ s(score_tfIdf) + s(score_neural),
      data = train_data)
gam_ir_all <-
  gam(relevance ~ s(score_tfIdf) + s(score_neural) + cited_inventor + cited_examiner + cited_family,
      data = train_data)
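# Optional sanity check (a sketch, not part of the original analysis): with the
# citation features included, the fit should reproduce the training relevance
# essentially perfectly, while the score-only model should not.
rmse <- function(model) sqrt(mean((predict(model) - train_data$relevance)^2))
c(no_citations = rmse(gam_ir_nocirc), with_citations = rmse(gam_ir_all))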
# redefine the pseudo smoother against the full model
pseudoSmoother <- function(x, model = gam_ir_all) {
  coef(model)[round(x) + 1]
}
dev_nocirc <- floor(summary(gam_ir_nocirc)$dev.expl * 100)
dev_withcirc <- floor(summary(gam_ir_all)$dev.expl * 100)
par(mfrow = c(3,2), mar = c(2,3,2,3), oma = c(1,1,1,1))
plot.function(pseudoSmoother,
              from = -.49,
              to = 3.49,
              n = 1001,
              type = "n",
              xlim = c(-.49, 3.49),
              ylim = c(-.1, 3),
              xaxt = "n",
              yaxt = "n",
              main = glue("GAM without Citation Feature (D\U00B2={dev_nocirc}%)"),
              ylab = "Pseudo Feature Shape",
              xlab = "citation")
axis(1, at = 0:3, labels = c("none", "inventor", "examiner", "family"))
axis(2, at = 0:3, labels = 0:3)
text(labels = "not included in GAM",
     x = mean(c(-.49, 3.49)),
     y = mean(c(0, 3)),
     adj = .5)
plot.function(pseudoSmoother,
              from = -.49,
              to = 3.49,
              n = 1001,
              lty = "solid",
              xlim = c(-.49, 3.49),
              ylim = c(-.1, 3),
              xaxt = "n",
              yaxt = "n",
              main = glue("GAM with Citation Feature (D\U00B2={dev_withcirc}%)"),
              ylab = "",
              xlab = "citation")
axis(1, at = 0:3, labels = c("none", "inventor", "examiner", "family"))
axis(2, at = 0:3, labels = 0:3)
mtext("citation", side = 4, line = 1)
plot(gam_ir_nocirc,
     se = FALSE,
     main = "",
     ylab = "",
     xlab = "",
     ylim = c(-.1, 3),
     yaxt = "n",
     select = 1,
     shift = coef(gam_ir_nocirc)[1])
axis(2, at = 0:3, labels = 0:3)
rug(unique(train_data$score_tfIdf))
plot(gam_ir_all,
     se = FALSE,
     main = "",
     ylab = "",
     yaxt = "n",
     xlab = "",
     ylim = c(-.1, 3),
     select = 1)
axis(2, at = 0:3, labels = 0:3)
rug(unique(train_data$score_tfIdf))
mtext("tf-Idf", side = 4, line = 1)
plot(gam_ir_nocirc,
     se = FALSE,
     main = "",
     ylab = "",
     xlab = "",
     ylim = c(-.1, 3),
     xlim = c(-18, 10),
     yaxt = "n",
     select = 2,
     shift = coef(gam_ir_nocirc)[1])
axis(2, at = 0:3, labels = 0:3)
rug(unique(train_data$score_neural))
plot(gam_ir_all,
     se = FALSE,
     main = "",
     ylab = "",
     yaxt = "n",
     xlab = "",
     ylim = c(-.1, 3),
     xlim = c(-18, 10),
     select = 2)
axis(2, at = 0:3, labels = 0:3)
rug(unique(train_data$score_neural))
mtext("neural", side = 4, line = 1)
par(mfrow = c(1,1), mar = c(5,4,4,2))
Warning message in rug(unique(train_data$score_neural)):
“some values will be clipped”
Warning message in rug(unique(train_data$score_neural)):
“some values will be clipped”
summary(gam_ir_all)
Family: gaussian 
Link function: identity 

Formula:
relevance ~ s(score_tfIdf) + s(score_neural) + cited_inventor + 
    cited_examiner + cited_family

Parametric coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)       6.735e-16  0.000e+00     Inf   <2e-16 ***
cited_inventoryes 1.000e+00  0.000e+00     Inf   <2e-16 ***
cited_examineryes 2.000e+00  0.000e+00     Inf   <2e-16 ***
cited_familyyes   3.000e+00  0.000e+00     Inf   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Approximate significance of smooth terms:
                  edf Ref.df  F p-value
s(score_tfIdf)  3.500      0 NA      NA
s(score_neural) 3.737      0 NA      NA

R-sq.(adj) =      1   Deviance explained = 100%
GCV = 0  Scale est. = 0         n = 318375