author:
date: 30. April 2021
knitr::opts_chunk$set(echo = TRUE,
cache.lazy = FALSE,
dev = c("svglite", "pdf", "png"),
dpi = 300,
fig.path = 'figures/',
fig.keep = "high")
#added from: https://github.com/yihui/knitr-examples/blob/master/077-wrap-output.Rmd
library(knitr)
hook_output = knit_hooks$get('output')
knit_hooks$set(output = function(x, options) {
# this hook is used only when the linewidth option is not NULL
if (!is.null(n <- options$linewidth)) {
x = knitr:::split_lines(x)
# any lines wider than n should be wrapped
if (any(nchar(x) > n)) x = strwrap(x, width = n)
x = paste(x, collapse = '\n')
}
hook_output(x, options)
})
library(tidyverse) # A collection of packages for data science. More about it on
# www.tidyverse.com
library(magrittr) # A package that provides pipe operators like %>%
library(lme4) # A package
library(glue) # A package that provides interpolated string functions.
library(latex2exp) # A package that converts latex to plotmath expressions.library(lmerTest)
library(lmerTest) # A package that improves inference for LMEM.
library(emmeans) # A package that allows to conduct post-hoc tests.
library(parameters)
── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ── ✔ ggplot2 3.3.3 ✔ purrr 0.3.4 ✔ tibble 3.1.1 ✔ dplyr 1.0.5 ✔ tidyr 1.1.3 ✔ stringr 1.4.0 ✔ readr 1.4.0 ✔ forcats 0.5.1 ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ── ✖ dplyr::filter() masks stats::filter() ✖ dplyr::lag() masks stats::lag() Attaching package: ‘magrittr’ The following object is masked from ‘package:purrr’: set_names The following object is masked from ‘package:tidyr’: extract Loading required package: Matrix Attaching package: ‘Matrix’ The following objects are masked from ‘package:tidyr’: expand, pack, unpack Attaching package: ‘glue’ The following object is masked from ‘package:dplyr’: collapse Attaching package: ‘lmerTest’ The following object is masked from ‘package:lme4’: lmer The following object is masked from ‘package:stats’: step
read data
#this data set contains replicates for different seeds
data_ter <-
readRDS("data/data_ter.rds")
summary(data_ter)
sentence_id system replication ter src_length 0_0 : 7 Baseline:1041 1:3123 Min. :0.0000 Min. : 1.00 0_1 : 7 Marking :3123 2:2082 1st Qu.:0.4091 1st Qu.:10.00 0_10 : 7 PostEdit:3123 3:2082 Median :0.5652 Median :17.00 0_11 : 7 Mean :0.5813 Mean :19.25 0_12 : 7 3rd Qu.:0.7343 3rd Qu.:26.00 0_13 : 7 Max. :1.6000 Max. :76.00 (Other):7245
average over replicates (seed)
#for a traditional analysis one has to average over replicates
data_ter_mean <-
data_ter %>%
select(-replication) %>%
group_by(across(!ter)) %>%
summarize(ter = mean(ter))
`summarise()` has grouped output by 'sentence_id', 'system'. You can override using the `.groups` argument.
ggplot(data = data_ter_mean) +
theme_bw() +
theme(legend.position = "none") +
xlab("") +
ylab("TER Score") +
geom_boxplot(aes(x = system, y = ter, fill = system), alpha = .3)
data_ter_mean %>%
group_by(system) %>%
summarise(mean_ter = mean(ter)) %>%
mutate(baseline_difference = mean_ter - mean_ter[1])
system | mean_ter | baseline_difference |
---|---|---|
<fct> | <dbl> | <dbl> |
Baseline | 0.5907133 | 0.000000000 |
Marking | 0.5783709 | -0.012342436 |
PostEdit | 0.5810930 | -0.009620325 |
conduct model based ANOVA
model_lm <-
lm(ter ~ system, data = data_ter_mean)
summary(model_lm)
Call: lm(formula = ter ~ system, data = data_ter_mean) Residuals: Min 1Q Median 3Q Max -0.59071 -0.16933 -0.01544 0.14968 1.01891 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 0.590713 0.007983 73.994 <2e-16 *** systemMarking -0.012342 0.011290 -1.093 0.274 systemPostEdit -0.009620 0.011290 -0.852 0.394 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.2576 on 3120 degrees of freedom Multiple R-squared: 0.0004228, Adjusted R-squared: -0.000218 F-statistic: 0.6598 on 2 and 3120 DF, p-value: 0.517
present model matrix to show dummy coding
model.matrix(model_lm) %>% unique()
(Intercept) | systemMarking | systemPostEdit | |
---|---|---|---|
1 | 1 | 0 | 0 |
2 | 1 | 1 | 0 |
3 | 1 | 0 | 1 |
conduct significance test
model_parameters(anova(model_lm, test = "Chisq"))
Parameter | Sum_Squares | df | Mean_Square | F | p |
---|---|---|---|---|---|
<chr> | <dbl> | <int> | <dbl> | <dbl> | <dbl> |
system | 0.0875468 | 2 | 0.04377340 | 0.6597807 | 0.5170368 |
Residuals | 206.9975983 | 3120 | 0.06634538 | NA | NA |
model data via LMEM to account for repeated measurements on the same sentence
model_lmm <-
lmer(ter ~ system + (1 | sentence_id), data = data_ter)
summary(model_lmm)
Linear mixed model fit by REML. t-tests use Satterthwaite's method [ lmerModLmerTest] Formula: ter ~ system + (1 | sentence_id) Data: data_ter REML criterion at convergence: -15463.9 Scaled residuals: Min 1Q Median 3Q Max -10.4761 -0.2997 0.0002 0.2601 8.4976 Random effects: Groups Name Variance Std.Dev. sentence_id (Intercept) 0.064074 0.25313 Residual 0.003487 0.05905 Number of obs: 7287, groups: sentence_id, 1041 Fixed effects: Estimate Std. Error df t value Pr(>|t|) (Intercept) 5.907e-01 8.056e-03 1.138e+03 73.325 < 2e-16 *** systemMarking -1.234e-02 2.113e-03 6.244e+03 -5.840 5.49e-09 *** systemPostEdit -9.620e-03 2.113e-03 6.244e+03 -4.552 5.42e-06 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Correlation of Fixed Effects: (Intr) systmM systemMrkng -0.197 systmPstEdt -0.197 0.750
LMEM omnibus test
anova(model_lmm, lmer.df = "asymptotic", test= "Chisq")
Sum Sq | Mean Sq | NumDF | DenDF | F value | Pr(>F) | |
---|---|---|---|---|---|---|
<dbl> | <dbl> | <int> | <dbl> | <dbl> | <dbl> | |
system | 0.1191719 | 0.05958596 | 2 | 6244 | 17.08588 | 3.980403e-08 |
reconstruct group means and differences plus test
group_means <-
emmeans(object = model_lmm,
specs = pairwise ~ system,
lmer.df = "asymptotic")
group_means
(Intercept) | systemMarking | systemPostEdit | |
---|---|---|---|
(Intercept) | 6.490016e-05 | -3.350085e-06 | -3.350085e-06 |
systemMarking | -3.350085e-06 | 4.466780e-06 | 3.350085e-06 |
systemPostEdit | -3.350085e-06 | 3.350085e-06 | 4.466780e-06 |
(Intercept) | systemMarking | systemPostEdit |
---|---|---|
1 | 0 | 0 |
1 | 1 | 0 |
1 | 0 | 1 |
structure(function (k, dfargs)
Inf, mesg = "asymptotic")
NA |
.wgt. |
---|
<dbl> |
1041 |
3123 |
3123 |
(Intercept) | systemMarking | systemPostEdit | |
---|---|---|---|
(Intercept) | 6.490016e-05 | -3.350085e-06 | -3.350085e-06 |
systemMarking | -3.350085e-06 | 4.466780e-06 | 3.350085e-06 |
systemPostEdit | -3.350085e-06 | 3.350085e-06 | 4.466780e-06 |
(Intercept) | systemMarking | systemPostEdit |
---|---|---|
0 | -1 | 0 |
0 | 0 | -1 |
0 | 1 | -1 |
structure(function (k, dfargs)
Inf, mesg = "asymptotic")
NA |
system |
---|
<fct> |
Baseline |
Marking |
PostEdit |
Baseline | Marking | PostEdit | |
---|---|---|---|
Baseline - Marking | 1 | -1 | 0 |
Baseline - PostEdit | 1 | 0 | -1 |
Marking - PostEdit | 0 | 1 | -1 |
ggplot(data = data_ter,
aes(x = src_length, y = ter, colour = system)) +
theme_bw()+
theme(legend.position = 'none') +
facet_wrap(~system) +
xlab("Source Sentence Length") +
ylab("TER Score") +
geom_point(aes(colour = system), alpha = .1) + #add points
geom_vline(xintercept = c(15, 55), linetype = "dashed") +
geom_density_2d(alpha = .3) + #add contour lines
geom_smooth(method = "loess", se = FALSE) #add loess smoother
#geom_smooth(method = "gam", se = FALSE) #alternative: gam smoother
`geom_smooth()` using formula 'y ~ x'
classify sentences by length
data_ter_mean %<>%
mutate(src_length_class = factor((src_length > 15) + (src_length > 55),
levels = 0:2,
labels = c("short", "typical", "very long")))
data_ter %<>%
mutate(src_length_class = factor((src_length > 15) + (src_length > 55),
levels = 0:2,
labels = c("short", "typical", "very long")))
data_plot <- data_ter
levels(data_plot$src_length_class) <-
glue("{levels(data_plot$src_length_class)} (#sentences={table(data_ter_mean$src_length_class) / 3})")
ggplot(data = data_plot) +
theme_bw() +
theme(legend.position = "none") +
xlab("") +
ylab("TER Score") +
geom_boxplot(aes(x = src_length_class, y = ter, fill = system), alpha = .3)
LMEM with system/input length interaction as fixed effect
model_ssl <-
#formula expands to:
#ter ~ system + src_length_class + system:src_length_class + (1 | sentence_id)
lmer(ter ~ system*src_length_class + (1 | sentence_id), data = data_ter)
omnibus test for interaction model
anova(model_ssl, lmer.df = "asymptotic")
Sum Sq | Mean Sq | NumDF | DenDF | F value | Pr(>F) | |
---|---|---|---|---|---|---|
<dbl> | <dbl> | <int> | <dbl> | <dbl> | <dbl> | |
system | 0.1453046 | 0.07265231 | 2 | 6240.000 | 20.92762 | 8.741701e-10 |
src_length_class | 0.1041833 | 0.05209164 | 2 | 1042.853 | 15.00508 | 3.761653e-07 |
system:src_length_class | 0.1127850 | 0.02819626 | 4 | 6240.000 | 8.12198 | 1.578164e-06 |
calc group means investigate interaction
group_means_ssl <-
emmeans(model_ssl, ~ system:src_length_class, lmer.df = "asymptotic")
ggplot(as_tibble(group_means_ssl)) +
theme_bw() +
theme(legend.position = c(0.1,.8)) +
xlab("Source Sentence Length (grouped)") +
ylab("Estimated Group Mean") +
geom_pointrange(aes(x = src_length_class,
y = emmean,
ymin = emmean - SE,
ymax = emmean + SE,
colour = system),
alpha = .7) +
geom_line(aes(x = src_length_class,
y = emmean,
group = system,
colour = system),
alpha = .3)
conduct post hoc test
emmeans(object = model_ssl,
specs = pairwise ~ system | src_length_class,
lmer.df = "asymptotic")$contrasts #..$contrasts to just print the contrasts
src_length_class = short: contrast estimate SE df z.ratio p.value Baseline - Marking 0.01612 0.00316 Inf 5.111 <.0001 Baseline - PostEdit 0.01267 0.00316 Inf 4.016 0.0002 Marking - PostEdit -0.00345 0.00223 Inf -1.548 0.2683 src_length_class = typical: contrast estimate SE df z.ratio p.value Baseline - Marking 0.00782 0.00287 Inf 2.728 0.0175 Baseline - PostEdit 0.00484 0.00287 Inf 1.688 0.2098 Marking - PostEdit -0.00298 0.00203 Inf -1.470 0.3052 src_length_class = very long: contrast estimate SE df z.ratio p.value Baseline - Marking 0.07286 0.01887 Inf 3.861 0.0003 Baseline - PostEdit 0.10756 0.01887 Inf 5.700 <.0001 Marking - PostEdit 0.03471 0.01334 Inf 2.601 0.0252 Degrees-of-freedom method: asymptotic P value adjustment: tukey method for comparing a family of 3 estimates