knitr::opts_chunk$set(echo = TRUE, 
                      cache.lazy = FALSE,
                      dev = c("svglite", "pdf", "png"),
                      dpi = 300,
                      fig.path = 'figures/',
                      fig.keep = "high")


#added from: https://github.com/yihui/knitr-examples/blob/master/077-wrap-output.Rmd
library(knitr)
hook_output = knit_hooks$get('output')
knit_hooks$set(output = function(x, options) {
  # this hook is used only when the linewidth option is not NULL
  if (!is.null(n <- options$linewidth)) {
    x = knitr:::split_lines(x)
    # any lines wider than n should be wrapped
    if (any(nchar(x) > n)) x = strwrap(x, width = n)
    x = paste(x, collapse = '\n')
  }
  hook_output(x, options)
})


library(tidyverse) # A collection of packages for data science. More about it on 
                   # www.tidyverse.com
library(magrittr)  # A package that provides pipe operators like %>%
library(lme4)      # A package 
library(glue)      # A package that provides interpolated string functions.
library(latex2exp) # A package that converts latex to plotmath expressions.library(lmerTest)
library(lmerTest)  # A package that improves inference for LMEM.
library(emmeans)   # A package that allows to conduct post-hoc tests.
library(parameters)

── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

✔ ggplot2 3.3.3     ✔ purrr   0.3.4
✔ tibble  3.1.1     ✔ dplyr   1.0.5
✔ tidyr   1.1.3     ✔ stringr 1.4.0
✔ readr   1.4.0     ✔ forcats 0.5.1

── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


Attaching package: ‘magrittr’


The following object is masked from ‘package:purrr’:

    set_names


The following object is masked from ‘package:tidyr’:

    extract


Loading required package: Matrix


Attaching package: ‘Matrix’


The following objects are masked from ‘package:tidyr’:

    expand, pack, unpack


Attaching package: ‘glue’


The following object is masked from ‘package:dplyr’:

    collapse


Attaching package: ‘lmerTest’


The following object is masked from ‘package:lme4’:

    lmer


The following object is masked from ‘package:stats’:

    step


#this data set contains replicates for different seeds 
data_ter <- 
  readRDS("data/data_ter.rds")

summary(data_ter)

  sentence_id        system     replication      ter           src_length   
 0_0    :   7   Baseline:1041   1:3123      Min.   :0.0000   Min.   : 1.00  
 0_1    :   7   Marking :3123   2:2082      1st Qu.:0.4091   1st Qu.:10.00  
 0_10   :   7   PostEdit:3123   3:2082      Median :0.5652   Median :17.00  
 0_11   :   7                               Mean   :0.5813   Mean   :19.25  
 0_12   :   7                               3rd Qu.:0.7343   3rd Qu.:26.00  
 0_13   :   7                               Max.   :1.6000   Max.   :76.00  
 (Other):7245


#for a traditional analysis one has to average over replicates
data_ter_mean <- 
  data_ter %>%
  select(-replication) %>%
  group_by(across(!ter)) %>%
  summarize(ter = mean(ter))

`summarise()` has grouped output by 'sentence_id', 'system'. You can override using the `.groups` argument.


ggplot(data = data_ter_mean) +
  theme_bw() +
  theme(legend.position = "none") +
  xlab("") +
  ylab("TER Score") +
  geom_boxplot(aes(x = system, y = ter, fill = system), alpha = .3)


data_ter_mean %>%
  group_by(system) %>%
  summarise(mean_ter = mean(ter)) %>%
  mutate(baseline_difference = mean_ter - mean_ter[1])


model_lm <- 
  lm(ter ~ system, data = data_ter_mean)

summary(model_lm)

Call:
lm(formula = ter ~ system, data = data_ter_mean)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.59071 -0.16933 -0.01544  0.14968  1.01891 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)     0.590713   0.007983  73.994   <2e-16 ***
systemMarking  -0.012342   0.011290  -1.093    0.274    
systemPostEdit -0.009620   0.011290  -0.852    0.394    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.2576 on 3120 degrees of freedom
Multiple R-squared:  0.0004228,	Adjusted R-squared:  -0.000218 
F-statistic: 0.6598 on 2 and 3120 DF,  p-value: 0.517


model.matrix(model_lm) %>% unique()


model_parameters(anova(model_lm, test = "Chisq"))


model_lmm <- 
  lmer(ter ~ system + (1 | sentence_id), data = data_ter)

summary(model_lmm)

Linear mixed model fit by REML. t-tests use Satterthwaite's method [
lmerModLmerTest]
Formula: ter ~ system + (1 | sentence_id)
   Data: data_ter

REML criterion at convergence: -15463.9

Scaled residuals: 
     Min       1Q   Median       3Q      Max 
-10.4761  -0.2997   0.0002   0.2601   8.4976 

Random effects:
 Groups      Name        Variance Std.Dev.
 sentence_id (Intercept) 0.064074 0.25313 
 Residual                0.003487 0.05905 
Number of obs: 7287, groups:  sentence_id, 1041

Fixed effects:
                 Estimate Std. Error         df t value Pr(>|t|)    
(Intercept)     5.907e-01  8.056e-03  1.138e+03  73.325  < 2e-16 ***
systemMarking  -1.234e-02  2.113e-03  6.244e+03  -5.840 5.49e-09 ***
systemPostEdit -9.620e-03  2.113e-03  6.244e+03  -4.552 5.42e-06 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
            (Intr) systmM
systemMrkng -0.197       
systmPstEdt -0.197  0.750


anova(model_lmm, lmer.df = "asymptotic", test= "Chisq")


group_means <- 
  emmeans(object  = model_lmm, 
          specs   = pairwise ~ system, 
          lmer.df = "asymptotic")

group_means

structure(function (k, dfargs) 
Inf, mesg = "asymptotic")

structure(function (k, dfargs) 
Inf, mesg = "asymptotic")


ggplot(data = data_ter, 
       aes(x = src_length, y = ter, colour = system)) +
  theme_bw()+
  theme(legend.position = 'none') +
  facet_wrap(~system) +
  xlab("Source Sentence Length") +
  ylab("TER Score") +
  geom_point(aes(colour = system), alpha = .1) + #add points
  geom_vline(xintercept = c(15, 55), linetype = "dashed") +
  geom_density_2d(alpha = .3) +                  #add contour lines
  geom_smooth(method = "loess", se = FALSE)      #add loess smoother
  #geom_smooth(method = "gam", se = FALSE)       #alternative: gam smoother

`geom_smooth()` using formula 'y ~ x'


data_ter_mean %<>%
  mutate(src_length_class = factor((src_length > 15) + (src_length > 55), 
                                   levels = 0:2, 
                                   labels = c("short", "typical", "very long"))) 

data_ter %<>%
  mutate(src_length_class = factor((src_length > 15) + (src_length > 55), 
                                   levels = 0:2, 
                                   labels = c("short", "typical", "very long")))


data_plot <-  data_ter

levels(data_plot$src_length_class) <- 
glue("{levels(data_plot$src_length_class)} (#sentences={table(data_ter_mean$src_length_class) / 3})")


ggplot(data = data_plot) +
  theme_bw() +
  theme(legend.position = "none") +
  xlab("") +
  ylab("TER Score") +
  geom_boxplot(aes(x = src_length_class, y = ter, fill = system), alpha = .3)


model_ssl <- 
  #formula expands to:
  #ter ~ system + src_length_class + system:src_length_class + (1 | sentence_id)
  lmer(ter ~ system*src_length_class + (1 | sentence_id), data = data_ter)


anova(model_ssl, lmer.df = "asymptotic")


group_means_ssl <- 
  emmeans(model_ssl, ~ system:src_length_class, lmer.df = "asymptotic")


ggplot(as_tibble(group_means_ssl)) +
  theme_bw() +
  theme(legend.position = c(0.1,.8)) +
  xlab("Source Sentence Length (grouped)") +
  ylab("Estimated Group Mean") + 
  geom_pointrange(aes(x = src_length_class, 
                      y = emmean,
                      ymin = emmean - SE,
                      ymax = emmean + SE,
                      colour = system),
                  alpha = .7) +
  geom_line(aes(x = src_length_class,
                y = emmean,
                group = system,
                colour = system), 
            alpha = .3)


emmeans(object = model_ssl, 
        specs  = pairwise ~ system | src_length_class, 
        lmer.df = "asymptotic")$contrasts #..$contrasts to just print the contrasts

src_length_class = short:
 contrast            estimate      SE  df z.ratio p.value
 Baseline - Marking   0.01612 0.00316 Inf  5.111  <.0001 
 Baseline - PostEdit  0.01267 0.00316 Inf  4.016  0.0002 
 Marking - PostEdit  -0.00345 0.00223 Inf -1.548  0.2683 

src_length_class = typical:
 contrast            estimate      SE  df z.ratio p.value
 Baseline - Marking   0.00782 0.00287 Inf  2.728  0.0175 
 Baseline - PostEdit  0.00484 0.00287 Inf  1.688  0.2098 
 Marking - PostEdit  -0.00298 0.00203 Inf -1.470  0.3052 

src_length_class = very long:
 contrast            estimate      SE  df z.ratio p.value
 Baseline - Marking   0.07286 0.01887 Inf  3.861  0.0003 
 Baseline - PostEdit  0.10756 0.01887 Inf  5.700  <.0001 
 Marking - PostEdit   0.03471 0.01334 Inf  2.601  0.0252 

Degrees-of-freedom method: asymptotic 
P value adjustment: tukey method for comparing a family of 3 estimates

system	mean_ter	baseline_difference
<fct>	<dbl>	<dbl>
Baseline	0.5907133	0.000000000
Marking	0.5783709	-0.012342436
PostEdit	0.5810930	-0.009620325

Parameter	Sum_Squares	df	Mean_Square	F	p
<chr>	<dbl>	<int>	<dbl>	<dbl>	<dbl>
system	0.0875468	2	0.04377340	0.6597807	0.5170368
Residuals	206.9975983	3120	0.06634538	NA	NA

	Sum Sq	Mean Sq	NumDF	DenDF	F value	Pr(>F)
	<dbl>	<dbl>	<int>	<dbl>	<dbl>	<dbl>
system	0.1191719	0.05958596	2	6244	17.08588	3.980403e-08

	(Intercept)	systemMarking	systemPostEdit
(Intercept)	6.490016e-05	-3.350085e-06	-3.350085e-06
systemMarking	-3.350085e-06	4.466780e-06	3.350085e-06
systemPostEdit	-3.350085e-06	3.350085e-06	4.466780e-06

	(Intercept)	systemMarking	systemPostEdit
(Intercept)	6.490016e-05	-3.350085e-06	-3.350085e-06
systemMarking	-3.350085e-06	4.466780e-06	3.350085e-06
systemPostEdit	-3.350085e-06	3.350085e-06	4.466780e-06

Using LMEMs to Analyse Model Performance Beyond Mean Measures¶

Marking or Post Edit? What Feedback Method Works Better?¶

A traditional way of analyzing system performance¶

A smarter way to model and analyze the evaluation data¶

Is the Performance Gain from Feedback Uniform for all Sentences?¶

	Baseline	Marking	PostEdit
Baseline - Marking	1	-1	0
Baseline - PostEdit	1	0	-1
Marking - PostEdit	0	1	-1

A data.frame: 3 × 1
.wgt.
<dbl>
1041
3123
3123

A anova: 3 × 6
	Sum Sq	Mean Sq	NumDF	DenDF	F value	Pr(>F)
	<dbl>	<dbl>	<int>	<dbl>	<dbl>	<dbl>
system	0.1453046	0.07265231	2	6240.000	20.92762	8.741701e-10
src_length_class	0.1041833	0.05209164	2	1042.853	15.00508	3.761653e-07
system:src_length_class	0.1127850	0.02819626	4	6240.000	8.12198	1.578164e-06