# Data manipulation
library(plyr)
library(dplyr)
library(tidyr)
library(reshape2)
library(tibble)
# Pretty printing
library(scales)
# Fetching data
library(curl)
# Computing hashes, used for efficient model caching
library(hashr)
# Easier plotting
library(ggplot2); theme_set(theme_minimal())
library(likert)
library(bayesplot)
library(ggpubr)
library(HDInterval)
library(ggridges)
# Bayesian modeling
library(brms)
We begin by importing the CSV data from the data repository.
# Download the raw experiment data (CSV) straight from the data repository
# and parse it into a data frame.
d.orig <- read.csv(
  file = curl(
    url = "https://raw.githubusercontent.com/BrokenWindowsInvestigation/Data/master/data.csv"
  )
)
# Show the raw data
d.orig
session <chr> | group <chr> | education_level <chr> | |
---|---|---|---|
6033c6fc5af2c702367b3a93 | students | Bachelor degree | |
6033c6fc5af2c702367b3a93 | students | Bachelor degree | |
6033c7315af2c702367b3a94 | students | Some master studies | |
6033c7315af2c702367b3a94 | students | Some master studies | |
6033d69a5af2c702367b3a95 | students | Some master studies | |
6033d69a5af2c702367b3a95 | students | Some master studies | |
6033d90a5af2c702367b3a96 | students | Some bachelor studies | |
6033d90a5af2c702367b3a96 | students | Some bachelor studies | |
6033ea8a5af2c702367b3a97 | students | Some bachelor studies | |
6034fc165af2c702367b3a98 | students | Some master studies |
### Utility functions for encoding ###
# Encode a column as an unordered factor with an explicit set of levels.
#
# Arguments:
#   column      vector of raw values to encode.
#   categories  vector of allowed values, in the desired level order.
#
# Returns a factor whose levels are `categories`; values of `column` that do
# not occur in `categories` become NA.
encode.categorical <- function(column, categories) {
  # Spell out `levels` in full rather than relying on partial argument
  # matching (`level`), which is fragile and warns under
  # options(warnPartialMatchArgs = TRUE).
  factor(column, levels = categories)
}
# Encode a column holding the strings "true"/"false" as a two-level factor,
# with the levels in the order "true", "false".
encode.bool <- function(column) {
  bool_levels <- c("true", "false")
  encode.categorical(column, categories = bool_levels)
}
# Encode a logical column as a two-level factor, with the levels given in the
# order TRUE, FALSE.
encode.logic <- function(column) {
  logical_levels <- c(TRUE, FALSE)
  encode.categorical(column, categories = logical_levels)
}
# Encode a column as an ordered factor; `categories` lists the allowed values
# from lowest to highest.
#
# NOTE(review): the conversion goes through as.ordered() on an already built
# factor. The printed structure of `education_level` shows 6 levels although 7
# categories are declared, so levels that never occur in the data appear to be
# dropped by this step -- confirm this is intended.
encode.orderedcategorical <- function(column, categories) {
  unordered <- encode.categorical(column, categories)
  as.ordered(unordered)
}
# Encode a 7-point rating (integers -3 .. 3) as an ordered factor.
encode.likert <- function(column) {
  likert_scale <- c(-3, -2, -1, 0, 1, 2, 3)
  encode.orderedcategorical(column, categories = likert_scale)
}
### Encode the original data ###
# Build the analysis data frame `d` from the raw columns of `d.orig`:
# free-form categories become factors, "true"/"false" strings become two-level
# factors, ranked answers become ordered factors, and numeric columns are
# copied unchanged. A few derived columns (ratios, totals) are added.
d <- data.frame(
  session = factor(d.orig$session),
  time = d.orig$time,

  # Logic reuse
  reused_logic_constructor = encode.bool(d.orig$reused_logic_constructor),
  reused_logic_validation = encode.bool(d.orig$reused_logic_validation),

  # State of the equals/hashCode implementations
  equals.state = encode.orderedcategorical(
    d.orig$equals_state,
    c("Not implemented", "Duplicated", "Good")
  ),
  hashcode.state = encode.orderedcategorical(
    d.orig$hashcode_state,
    c("Not implemented", "Duplicated", "Good")
  ),

  documentation = factor(d.orig$documentation),

  # Variable naming: raw counts plus the good/all ratio.
  # The ratio is NaN whenever the corresponding "all" count is 0.
  var_names_copied_all = d.orig$var_names_copied_all,
  var_names_copied_good = d.orig$var_names_copied_good,
  var_names_copied_good.ratio =
    d.orig$var_names_copied_good / d.orig$var_names_copied_all,
  var_names_new_all = d.orig$var_names_new_all,
  var_names_new_good = d.orig$var_names_new_good,
  var_names_new_good.ratio =
    d.orig$var_names_new_good / d.orig$var_names_new_all,
  var_names_edited_all = d.orig$var_names_edited_all,
  var_names_edited_good = d.orig$var_names_edited_good,
  var_names_edited_good.ratio =
    d.orig$var_names_edited_good / d.orig$var_names_edited_all,

  # SonarQube issues: total over all severities, then each severity
  sonarqube_issues =
    d.orig$sonarqube_issues_major +
      d.orig$sonarqube_issues_minor +
      d.orig$sonarqube_issues_info +
      d.orig$sonarqube_issues_critical,
  sonarqube_issues.major = d.orig$sonarqube_issues_major,
  sonarqube_issues.minor = d.orig$sonarqube_issues_minor,
  sonarqube_issues.info = d.orig$sonarqube_issues_info,
  sonarqube_issues.critical = d.orig$sonarqube_issues_critical,

  # Background
  group = factor(d.orig$group),
  education_level = encode.orderedcategorical(
    d.orig$education_level,
    c(
      "None",
      "Some bachelor studies",
      "Bachelor degree",
      "Some master studies",
      "Master degree",
      "Some Ph.D. studies",
      "Ph. D."
    )
  ),
  education_field = factor(d.orig$education_field),
  work_domain = factor(d.orig$work_domain),
  work_experience_programming = d.orig$work_experience_programming,
  work_experience_java = d.orig$work_experience_java,
  workplace_pair_programming = encode.bool(d.orig$workplace_pair_programming),
  workplace_peer_review = encode.bool(d.orig$workplace_peer_review),
  workplace_td_tracking = encode.bool(d.orig$workplace_td_tracking),
  workplace_coding_standards = encode.bool(d.orig$workplace_coding_standards),

  # Task outcome and ratings
  task_completion = encode.orderedcategorical(
    d.orig$task_completion,
    c(
      "Not submitted",
      "Does not compile",
      "Invalid solution",
      "Completed"
    )
  ),
  quality_pre_task = encode.likert(d.orig$quality_pre_task),
  quality_post_task = encode.likert(d.orig$quality_post_task),

  # Experiment setup
  high_debt_version = encode.bool(d.orig$high_debt_version),
  scenario = encode.categorical(d.orig$scenario, c("booking", "tickets")),
  order = encode.orderedcategorical(d.orig$order, c(0, 1)),

  modified_lines = d.orig$modified_lines,
  large_structure_change = encode.bool(d.orig$large_structure_change)
)

# Derived flags: was equals/hashCode implemented at all?
d$equals.exists <- encode.logic(d$equals.state != "Not implemented")
d$hashcode.exists <- encode.logic(d$hashcode.state != "Not implemented")

# Inspect the structure of the encoded data
str(d)
## 'data.frame': 73 obs. of 41 variables:
## $ session : Factor w/ 43 levels "6033c6fc5af2c702367b3a93",..: 1 1 2 2 3 3 4 4 5 6 ...
## $ time : int 1055 2263 3249 12 1382 1197 3855 1984 0 5309 ...
## $ reused_logic_constructor : Factor w/ 2 levels "true","false": 1 2 2 2 1 1 1 1 2 2 ...
## $ reused_logic_validation : Factor w/ 2 levels "true","false": 1 2 2 2 1 1 1 1 2 1 ...
## $ equals.state : Ord.factor w/ 3 levels "Not implemented"<..: 2 2 1 1 1 3 1 3 1 3 ...
## $ hashcode.state : Ord.factor w/ 3 levels "Not implemented"<..: 2 2 1 1 1 3 1 3 1 3 ...
## $ documentation : Factor w/ 4 levels "Correct","Incorrect",..: 3 2 2 3 1 3 3 2 4 3 ...
## $ var_names_copied_all : int 6 12 15 0 8 7 5 7 0 9 ...
## $ var_names_copied_good : int 6 1 3 0 8 5 3 7 0 1 ...
## $ var_names_copied_good.ratio: num 1 0.0833 0.2 NaN 1 ...
## $ var_names_new_all : int 4 3 4 0 4 2 4 2 0 22 ...
## $ var_names_new_good : int 4 1 0 0 4 2 4 2 0 11 ...
## $ var_names_new_good.ratio : num 1 0.333 0 NaN 1 ...
## $ var_names_edited_all : int 0 0 0 0 0 0 0 0 0 0 ...
## $ var_names_edited_good : int 0 0 0 0 0 0 0 0 0 0 ...
## $ var_names_edited_good.ratio: num NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ...
## $ sonarqube_issues : int 0 4 5 0 0 3 1 0 0 12 ...
## $ sonarqube_issues.major : int 0 2 5 0 0 3 1 0 0 8 ...
## $ sonarqube_issues.minor : int 0 2 0 0 0 0 0 0 0 4 ...
## $ sonarqube_issues.info : int 0 0 0 0 0 0 0 0 0 0 ...
## $ sonarqube_issues.critical : int 0 0 0 0 0 0 0 0 0 0 ...
## $ group : Factor w/ 7 levels "code-interested",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ education_level : Ord.factor w/ 6 levels "None"<"Some bachelor studies"<..: 3 3 4 4 4 4 2 2 2 4 ...
## $ education_field : Factor w/ 7 levels "Civil Engineering",..: 2 2 2 2 2 2 7 7 7 7 ...
## $ work_domain : Factor w/ 16 levels "Adtech","App",..: 12 12 3 3 7 7 3 3 12 16 ...
## $ work_experience_programming: num 0 0 0.2 0.2 1 1 0 0 0 2 ...
## $ work_experience_java : num 0 0 0 0 0 0 0 0 0 0 ...
## $ workplace_pair_programming : Factor w/ 2 levels "true","false": 1 1 2 2 2 2 2 2 2 2 ...
## $ workplace_peer_review : Factor w/ 2 levels "true","false": 2 2 1 1 2 2 2 2 2 2 ...
## $ workplace_td_tracking : Factor w/ 2 levels "true","false": 2 2 2 2 2 2 2 2 2 2 ...
## $ workplace_coding_standards : Factor w/ 2 levels "true","false": 2 2 1 1 2 2 1 1 2 2 ...
## $ task_completion : Ord.factor w/ 4 levels "Not submitted"<..: 2 4 3 1 4 4 4 4 1 4 ...
## $ quality_pre_task : Ord.factor w/ 7 levels "-3"<"-2"<"-1"<..: 4 3 6 6 4 2 2 6 4 3 ...
## $ quality_post_task : Ord.factor w/ 7 levels "-3"<"-2"<"-1"<..: 1 1 3 4 4 4 4 4 4 6 ...
## $ high_debt_version : Factor w/ 2 levels "true","false": 2 1 1 2 2 1 1 2 2 1 ...
## $ scenario : Factor w/ 2 levels "booking","tickets": 1 2 1 2 1 2 1 2 1 1 ...
## $ order : Ord.factor w/ 2 levels "0"<"1": 2 1 1 2 1 2 2 1 1 2 ...
## $ modified_lines : int 75 71 78 0 41 22 18 32 0 246 ...
## $ large_structure_change : Factor w/ 2 levels "true","false": 2 2 2 2 2 2 2 2 2 1 ...
## $ equals.exists : Factor w/ 2 levels "TRUE","FALSE": 1 1 2 2 2 1 2 1 2 1 ...
## $ hashcode.exists : Factor w/ 2 levels "TRUE","FALSE": 1 1 2 2 2 1 2 1 2 1 ...
For some models, partial data sets and aggregates are needed.
# One row per session: the minimum (lowest ranked) task completion over the
# session's rows, plus the first value of each of the listed columns.
d.sessions <- d %>%
  group_by(session) %>%
  dplyr::summarise(
    across(.cols = task_completion, .fns = min),
    across(
      .cols = c(
        education_level,
        education_field,
        work_domain,
        group,
        work_experience_java,
        work_experience_programming,
        workplace_coding_standards,
        workplace_pair_programming,
        workplace_peer_review,
        workplace_td_tracking
      ),
      .fns = first
    )
  )

# Add centered and scaled versions of the experience columns to the full data
d$work_experience_programming.s <- scale(d$work_experience_programming)
d$work_experience_java.s <- scale(d$work_experience_java)

d.sessions
session <fct> | task_completion <ord> | education_level <ord> | |
---|---|---|---|
6033c6fc5af2c702367b3a93 | Does not compile | Bachelor degree | |
6033c7315af2c702367b3a94 | Not submitted | Some master studies | |
6033d69a5af2c702367b3a95 | Completed | Some master studies | |
6033d90a5af2c702367b3a96 | Completed | Some bachelor studies | |
6033ea8a5af2c702367b3a97 | Not submitted | Some bachelor studies | |
6034fc165af2c702367b3a98 | Completed | Some master studies | |
603500725af2c702367b3a99 | Completed | Some bachelor studies | |
603f84f15af2c702367b3a9b | Invalid solution | Some master studies | |
603f97625af2c702367b3a9d | Completed | Some master studies | |
603fd5d95af2c702367b3a9e | Completed | Some bachelor studies |
# Sessions whose aggregated task completion is "Completed"
d.sessions.completed <- dplyr::filter(
  d.sessions,
  task_completion == "Completed"
)
d.sessions.completed
session <fct> | task_completion <ord> | education_level <ord> | education_field <fct> | work_domain <fct> | |
---|---|---|---|---|---|
6033d69a5af2c702367b3a95 | Completed | Some master studies | Computer Science | Finance | |
6033d90a5af2c702367b3a96 | Completed | Some bachelor studies | Software Engineering | Automotive | |
6034fc165af2c702367b3a98 | Completed | Some master studies | Software Engineering | Web | |
603500725af2c702367b3a99 | Completed | Some bachelor studies | Software Engineering | None | |
603f97625af2c702367b3a9d | Completed | Some master studies | Computer Science | Web | |
603fd5d95af2c702367b3a9e | Completed | Some bachelor studies | Software Engineering | Automotive | |
60409b7b5af2c702367b3a9f | Completed | Some master studies | Software Engineering | None | |
604b82b5a7718fbed181b336 | Completed | Master degree | Software Engineering | Web | |
6050c1bf856f36729d2e5218 | Completed | Some master studies | Software Engineering | Devops | |
6050e1e7856f36729d2e5219 | Completed | Some bachelor studies | Software Engineering | Mixed |
# Individual rows whose task completion is "Completed", with centered and
# scaled copies of the numeric columns (computed on this subset only).
d.completed <- dplyr::filter(d, task_completion == "Completed")

d.completed$work_experience_programming.s <-
  scale(d.completed$work_experience_programming)
d.completed$work_experience_java.s <-
  scale(d.completed$work_experience_java)
d.completed$time.s <- scale(d.completed$time)
d.completed$sonarqube_issues.s <- scale(d.completed$sonarqube_issues)

d.completed
session <fct> | time <int> | reused_logic_constructor <fct> | reused_logic_validation <fct> | equals.state <ord> | |
---|---|---|---|---|---|
6033c6fc5af2c702367b3a93 | 2263 | false | false | Duplicated | |
6033d69a5af2c702367b3a95 | 1382 | true | true | Not implemented | |
6033d69a5af2c702367b3a95 | 1197 | true | true | Good | |
6033d90a5af2c702367b3a96 | 3855 | true | true | Not implemented | |
6033d90a5af2c702367b3a96 | 1984 | true | true | Good | |
6034fc165af2c702367b3a98 | 5309 | false | true | Good | |
6034fc165af2c702367b3a98 | 868 | true | true | Not implemented | |
603500725af2c702367b3a99 | 622 | false | false | Duplicated | |
603500725af2c702367b3a99 | 474 | false | false | Duplicated | |
603f84f15af2c702367b3a9b | 448 | true | true | Not implemented |
# Rows of `d` belonging to a session listed in `d.sessions.completed`, with
# centered and scaled copies of the numeric columns (computed on this subset
# only).
d.both_completed <- dplyr::semi_join(d, d.sessions.completed, by = "session")

d.both_completed$work_experience_programming.s <-
  scale(d.both_completed$work_experience_programming)
d.both_completed$work_experience_java.s <-
  scale(d.both_completed$work_experience_java)
d.both_completed$time.s <- scale(d.both_completed$time)
d.both_completed$sonarqube_issues.s <-
  scale(d.both_completed$sonarqube_issues)

d.both_completed
session <fct> | time <int> | reused_logic_constructor <fct> | reused_logic_validation <fct> | equals.state <ord> | |
---|---|---|---|---|---|
6033d69a5af2c702367b3a95 | 1382 | true | true | Not implemented | |
6033d69a5af2c702367b3a95 | 1197 | true | true | Good | |
6033d90a5af2c702367b3a96 | 3855 | true | true | Not implemented | |
6033d90a5af2c702367b3a96 | 1984 | true | true | Good | |
6034fc165af2c702367b3a98 | 5309 | false | true | Good | |
6034fc165af2c702367b3a98 | 868 | true | true | Not implemented | |
603500725af2c702367b3a99 | 622 | false | false | Duplicated | |
603500725af2c702367b3a99 | 474 | false | false | Duplicated | |
603f97625af2c702367b3a9d | 1084 | false | false | Duplicated | |
603f97625af2c702367b3a9d | 1170 | false | false | Duplicated |
The function `extendable_model` takes some basic arguments for creating brms models and returns a function that can be called with additional parameters to combine with those passed to `extendable_model`.

`extendable_model` takes the following arguments:

- `base_name` is a name that is used to identify this extendable model while caching.
- `base_formula` is the formula that will be extended and passed to `brms::brm`, represented as a string.
- `family` is the response family to be passed to `brms::brm`.
- `data` is the data frame to be passed to `brms::brm`.
- `base_priors` (`NULL`) is a vector of priors to be passed to `brms::brm`.
- `base_control` (`NULL`) is a vector of control options to be passed to `brms::brm`.

The returned function takes the following arguments:

- `additional_variables` (`NULL`) is a vector of additional variables (predictors) to pass to `brms::brm` in addition to `base_formula`.
- `additional_priors` (`NULL`) is a vector of additional priors to pass to `brms::brm` in addition to `base_priors`.
- `only_priors` (`FALSE`) indicates if the model should be empty and not compiled, useful to extract the default priors of a model.
- `sample_prior` (`"no"`) is passed to the `sample_prior` argument of `brms::brm`.
- `control_override` (`NULL`) takes a vector of `control` arguments for `brms::brm` that will override `base_control`.
# Create a factory for a family of related brms models that share a base
# formula, family, data set, priors and control options.
#
# Arguments:
#   base_name     name used as the prefix of the cache file.
#   base_formula  formula to extend, given as a string.
#   family        passed to brm() as `family`.
#   data          data set; converted with as.data.frame() before fitting.
#   base_priors   (NULL) priors used by every model of the family.
#   base_control  (NULL) `control` options used unless overridden.
#
# Returns a function with the arguments:
#   additional_variables  (NULL) character vector of predictors appended to
#                         `base_formula`, joined with " + ".
#   additional_priors     (NULL) priors used in addition to `base_priors`.
#   only_priors           (FALSE) if TRUE, no priors are passed and brm() is
#                         called with `empty = TRUE`.
#   sample_prior          ("no") passed to brm() as `sample_prior`.
#   control_override      (NULL) if given, used instead of `base_control`.
# The returned function returns the value of the brm() call.
extendable_model <- function(
  base_name,
  base_formula,
  family,
  data,
  base_priors = NULL,
  base_control = NULL
) {
  function(
    additional_variables = NULL,
    additional_priors = NULL,
    only_priors = FALSE,
    sample_prior = "no",
    control_override = NULL
  ) {
    # Sort variable names for consistent caching and naming
    additional_variables.sorted <- sort(additional_variables)

    # Build priors. c() dispatches on its first argument, so combining a NULL
    # base with additional priors must not go through c(), or the result
    # would lose the class of the prior object.
    if (only_priors) {
      priors <- NULL
    } else if (is.null(base_priors)) {
      priors <- additional_priors
    } else if (is.null(additional_priors)) {
      priors <- base_priors
    } else {
      priors <- c(base_priors, additional_priors)
    }

    # Build formula
    formula <- base_formula
    if (!is.null(additional_variables)) {
      additional_variables.formula <- paste(
        additional_variables.sorted,
        collapse = " + "
      )
      formula <- paste(base_formula, additional_variables.formula, sep = " + ")
    }

    # Build cache file name. The format is kept unchanged so that previously
    # cached fits are still found.
    # NOTE(review): `only_priors` is not part of the name, so an empty model
    # and a fitted model without priors map to the same file -- confirm that
    # this cannot return a stale empty model.
    name <- base_name
    if (!is.null(additional_variables)) {
      additional_variables.name <- paste(
        additional_variables.sorted,
        collapse = "."
      )
      name <- paste(base_name, hash(additional_variables.name), sep = ".")
    }
    name <- paste(
      name,
      paste0("sample_priors-", sample_prior),
      paste0("priors_hash-", hash(priors)),
      paste0("formula_hash-", hash(formula)),
      sep = "."
    )

    # Get control options: an override replaces the base options entirely
    control <- base_control
    if (!is.null(control_override)) {
      control <- control_override
    }

    # Create and return the brms model
    brm(
      formula = as.formula(formula),
      family = family,
      data = as.data.frame(data),
      prior = priors,
      empty = only_priors,
      sample_prior = sample_prior,
      file = file.path("fits", name),
      file_refit = "on_change",
      seed = 20210421,
      control = control
    )
  }
}
Example usage:
## Not run
# Create a model factory for an intercept-only negative binomial model of
# `time`, with a normal(0, 1) prior on the intercept.
m.with <- extendable_model(
base_name = "m",
base_formula = "time ~ 1",
family = negbinomial(),
data = d.both_completed,
base_priors = c(
prior(normal(0, 1), class = "Intercept")
)
)
# Prior summary of the empty model (no priors passed)
prior_summary(m.with(only_priors = TRUE))
# Predictive check using samples drawn from the priors only
pp_check(m.with(sample_prior = "only"), nsamples = 200)
# Summary and predictive check of the base model
summary(m.with())
pp_check(m.with(), nsamples = 200)
# Compare the base model with models extended by one and two predictors
loo(
m.with(),
m.with("high_debt_version"),
m.with(c("high_debt_version", "scenario"))
)
## End(Not run)