diff --git a/.gitignore b/.gitignore index 47281dc..c56404e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,8 @@ .Rproj.user .Rhistory .RData .Ruserdata inst/doc doc Meta +.vscode diff --git a/DESCRIPTION b/DESCRIPTION index d71dd95..dad6f0b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,34 +1,34 @@ Package: challengeR Type: Package Title: Analyzing assessment data of biomedical image analysis competitions and visualization of results -Version: 1.0.3 -Date: 2021-10-15 +Version: 1.0.4 +Date: 2022-10-15 Author: Manuel Wiesenfarth, Matthias Eisenmann, Laura Aguilera Saiz, Annette Kopp-Schneider Maintainer: Manuel Wiesenfarth Description: Analyzing assessment data of biomedical image analysis competitions and visualization of results. License: GPL (>= 2) Depends: R (>= 3.5.2), ggplot2 (>= 3.3.0), purrr (>= 0.3.3) Imports: dplyr (>= 0.8.5), graph (>= 1.64.0), knitr (>= 1.28), methods (>= 3.6.0), plyr (>= 1.8.6), relations (>= 0.6-9), reshape2 (>= 1.4.3), rlang (>= 0.4.5), rmarkdown (>= 2.1), tidyr (>= 1.0.2), viridisLite (>= 0.3.0) Suggests: doParallel (>= 1.0.15), foreach (>= 1.4.8), ggpubr (>= 0.2.5), Rgraphviz (>= 2.30.0), testthat (>= 2.1.0) VignetteBuilder: knitr Roxygen: list(markdown = TRUE) RoxygenNote: 7.1.0 diff --git a/R/challengeR.R b/R/challengeR.R index 6d9e9c2..e437d2e 100644 --- a/R/challengeR.R +++ b/R/challengeR.R @@ -1,207 +1,211 @@ # Copyright (c) German Cancer Research Center (DKFZ) # All rights reserved. # # This file is part of challengeR. # # challengeR is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 2 of the License, or # (at your option) any later version. # # challengeR is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with challengeR. If not, see . #' Constructs a challenge object #' #' Constructs an S3 object to represent the configuration of an assessment data set originating from a benchmarking competition (so-called "challenge"). #' #' @section Assessment data set: #' The toolkit provides visualization approaches for both challenges designed around a single task (single-task challenges) and for challenges comprising multiple tasks (multi-task challenges). #' For a single-task challenge, the assessment data set (argument \code{object}) requires the following columns: #' \itemize{ #' \item test case identifier (string or numeric) #' \item algorithm identifier (string or numeric) #' \item performance value (numeric) #' } #' #' For a multi-task challenge, the assessment data set (argument \code{object}) requires the following columns: #' \itemize{ #' \item task identifier (string or numeric) #' \item test case identifier (string or numeric) #' \item algorithm identifier (string or numeric) #' \item performance value (numeric) #' } #' #' @section Sanity check: #' It is highly recommended that the sanity check is not disabled when the data set is provided initially. 
#' It checks that: #' \itemize{ #' \item performance values are numeric (if not, raises error) #' \item algorithm performances are observed for all cases (if not, adds them as NA and emits a message) #' \item cases appear only once for the same algorithm (if not, raises error) #' } #' If the argument \code{na.treat} for treatment of NA is specified, NAs will be handled accordingly. #' #' It might be reasonable to disable the sanity check for further computations (e.g., for performance reasons #' during bootstrapping (\code{\link{bootstrap.ranked.list}}) where cases are actually allowed to appear more than once for the same algorithm). #' #' @param object A data frame containing the assessment data. #' @param case A string specifying the name of the column that contains the case identifiers. #' @param algorithm A string specifying the name of the column that contains the algorithm identifiers. #' @param value A string specifying the name of the column that contains the performance values. #' @param by A string specifying the name of the column that contains the task identifiers. Required for a multi-task data set. #' @param taskName A string specifying the task name for a single-task data set that does not contain a task column. #' This argument is optional for a single-task data set and is ignored for a multi-task data set. #' @param annotator If multiple annotators annotated the test cases, a string specifying the name of the column that contains the annotator identifiers. Only applies to rank-then-aggregate. Use with caution: Currently not tested. #' @param smallBetter A boolean specifying whether small performance values indicate better algorithm performance. #' @param na.treat Indicates how missing performance values are treated if the sanity check is enabled. It can be 'na.rm', a numeric value or a function. #' For a numeric value or function, NAs will be replaced by the specified values. For 'na.rm', rows that contain missing values will be removed. #' @param check A boolean; if set to \code{TRUE}, a sanity check of the specified data set and arguments is performed. #' #' @return An S3 object to represent the configuration of an assessment data set. #' #' @examples #' # single-task data set #' #' # multi-task data set #' #' @export as.challenge=function(object, case, algorithm, value, by=NULL, taskName=NULL, annotator=NULL, smallBetter=FALSE, na.treat=NULL, # optional check=TRUE) { object=as.data.frame(object[,c(value, algorithm, case, by, annotator)]) - + object[[algorithm]] <- as.factor(object[[algorithm]]) # sanity checks if (check) { if (!is.null(by) && !is.null(taskName)) { warning("Argument 'taskName' is ignored for multi-task data set.") } # Add task column for data set without task column by using the specified task name. if (is.null(by) && !is.null(taskName)) { taskName <- trimws(taskName) if (taskName == "") { stop("Argument 'taskName' is empty.") } object <- cbind(task=taskName, object) by = "task" } # Add task column for data set without task column by using a dummy task name. 
if (is.null(by) && is.null(taskName)) { object <- cbind(task="dummyTask", object) by = "task" } object=splitby(object,by=by) object=lapply(object,droplevels) missingData = n.missing = list() for (task in names(object)) { if (!all(is.numeric(object[[task]][[value]]))) stop("Performance values must be numeric.") n.missing[[task]] <- sum(is.na(object[[task]][[value]])) # already missing before na.treat; for report if (n.missing[[task]]>0) message("Note: ", n.missing, " missing cases have been found in the data set.") # check for missing cases missingData[[task]]=object[[task]] %>% expand(!!as.symbol(algorithm), !!as.symbol(case))%>% anti_join(object[[task]], by=c( algorithm,case)) if (nrow(missingData[[task]])>0) { if (length(object) == 1 ) { # single task - message("Performance of not all algorithms has been observed for all cases. Therefore, missings have been inserted in the following cases:") + message("Performance of not all algorithms has been observed for all cases.\nTherefore, missings have been inserted in the following cases:") } else { # multi task message("Performance of not all algorithms has been observed for all cases in task '", task, - "'. Therefore, missings have been inserted in the following cases:") + "'.\nTherefore, missings have been inserted in the following cases:") } print(as.data.frame(missingData[[task]])) object[[task]]=as.data.frame(object[[task]] %>% complete(!!as.symbol(by), !!as.symbol(algorithm), !!as.symbol(case))) } # check duplicate cases all1=apply(table(object[[task]][[algorithm]], object[[task]][[case]]), 2, function(x) all(x==1)) if (!all(all1)) { n.duplicated <- sum(all1!=1) if (length(object) == 1 ) { # single task if (n.duplicated/length(all1) >= 1/5) { # at least a quarter of the cases is duplicated stop ("The following case(s) appear(s) more than once for the same algorithm. Please revise. ", "Or are you considering a multi-task challenge and forgot to specify argument 'by'?\n", "Case(s): ", paste(names(which(all1!=1)), collapse=", ") ) } else { stop ("The following case(s) appear(s) more than once for the same algorithm. Please revise.\n", "Case(s): ", paste(names(which(all1!=1)), collapse=", ") ) } } else { # multi task stop ("The following case(s) appear(s) more than once for the same algorithm in task '", task, "'. Please revise.\n", "Case(s): ", paste(names(which(all1!=1)), collapse=", ") ) } } if (!is.null(na.treat)) { if (is.numeric(na.treat)) object[[task]][,value][is.na(object[[task]][,value])]=na.treat else if (is.function(na.treat)) object[[task]][,value][is.na(object[[task]][,value])]=na.treat(object[[task]][,value][is.na(object[[task]][,value])]) else if (is.character(na.treat) && na.treat=="na.rm") object[[task]]=object[[task]][!is.na(object[[task]][,value]),] } } } - if (check==TRUE && (any(sapply(missingData, function(x) nrow(x))>0) |any(n.missing>0))) { - if (is.null(na.treat)) message("For aggregate-then-rank, na.treat will have to be specified. ", - "For rank-then-aggregate, missings will implicitly lead to the algorithm ranked last for the missing test case.", - "na.treat obligatory if report is intended to be compiled." 
- ) - else if (is.numeric(na.treat)) message("All missings have been replaced by the value ", na.treat,".\n") + + if (check==TRUE && (any(sapply(missingData, function(x) nrow(x))>0) | any(n.missing>0))) { + ## + ## The message below was disabled because it can cause misinformation even when na.treat is supplied to the as.challenge object. + ## + # if (is.null(na.treat)) message("For aggregate-then-rank, na.treat will have to be specified. ", + # "For rank-then-aggregate, missings will implicitly lead to the algorithm ranked last for the missing test case.", + # "na.treat obligatory if report is intended to be compiled." + # ) + if (is.numeric(na.treat)) message("All missings have been replaced by the value ", na.treat,".\n") else if (is.character(na.treat) && na.treat=="na.rm") message("All missings have been removed.") else if (is.function(na.treat)) { message("Missings have been replaced using function ") print(na.treat) } } if (check==TRUE){ attr(object,"n.missing")=n.missing attr(object,"missingData")=missingData } attr(object,"na.treat")=na.treat attr(object,"algorithm")=algorithm attr(object,"value")=value attr(object,"case")=case attr(object,"annotator")=annotator attr(object,"by")=by attr(object,"smallBetter")=smallBetter attr(object,"check")=check class(object)=c("challenge", class(object)) object } diff --git a/R/stability.R b/R/stability.R index 983ccde..eb9ae9f 100644 --- a/R/stability.R +++ b/R/stability.R @@ -1,416 +1,446 @@ # Copyright (c) German Cancer Research Center (DKFZ) # All rights reserved. # # This file is part of challengeR. # # challengeR is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 2 of the License, or # (at your option) any later version. # # challengeR is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with challengeR. If not, see . #' @export stability <- function(x,...) UseMethod("stability") #' @export stability.default <- function(x, ...) stop("not implemented for this class") #' @export stabilityByAlgorithm <- function(x,...) UseMethod("stabilityByAlgorithm") #' @export stabilityByAlgorithm.default <- function(x, ...) stop("not implemented for this class") #' @export stabilityByTask <- function(x,...) UseMethod("stabilityByTask") #' @export stabilityByTask.default <- function(x, ...) stop("not implemented for this class") #' Creates a blob plot across tasks #' #' Creates a blob plot visualizing the ranking variability across tasks. #' #' @param x The ranked assessment data set. #' @param ordering #' @param probs #' @param max_size #' @param freq #' @param shape #' @param ... Further arguments passed to or from other functions. #' #' @return #' #' @examples #' #' @seealso `browseVignettes("challengeR")` #' #' @family functions to visualize cross-task insights #' @export stability.ranked.list=function(x, ordering, probs=c(.025,.975), max_size=6, freq=FALSE, shape=4,...) 
{ if (length(x$data) < 2) { stop("The stability of rankings across tasks cannot be computed for less than two tasks.") } - + dd=melt(x, measure.vars="rank", value.name="rank") %>% dplyr::rename(task="L1") - + if (!missing(ordering)) { if (is.numeric(ordering) & !is.null(names(ordering)) ){ ordering <- names(ordering)[order(ordering)] } else if (!is.character(ordering)){ stop("Argument ordering has to be a named vector of ranks or a vector of algorithm names in the ranking order.") } dd=dd%>%mutate(algorithm=factor(.data$algorithm, levels=ordering)) } else dd=dd%>%mutate(algorithm=factor(.data$algorithm)) - + if (!freq) { p = ggplot(dd)+ - geom_count(aes(algorithm, - rank, - color=algorithm, - size = stat(prop*100))) + geom_count(aes(algorithm, + rank, + color=algorithm, + size = stat(prop*100))) } else { p=ggplot(dd)+ - geom_count(aes(algorithm, - rank, - color=algorithm )) + geom_count(aes(algorithm, + rank, + color=algorithm )) } - + + # Define breaks before creating Blob plot + if (max(dd$rank)>5) { + breaks = c(1, seq(5, max(dd$rank), by=5)) + } else { + breaks = seq(1, max(dd$rank)) + } + p+scale_size_area(max_size = max_size)+ stat_summary(aes(algorithm, rank), geom="point", shape=shape, fun.data=function(x) data.frame(y=median(x)),...)+ stat_summary(aes(algorithm, rank), geom="linerange", fun.data=function(x) data.frame(ymin=quantile(x,probs[1]), ymax=quantile(x,probs[2])))+ geom_abline(slope=1, color="gray", linetype="dotted")+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+ guides(size = guide_legend(title="%"))+ scale_y_continuous(minor_breaks=NULL, - limits=c(1,max(5,max(dd$rank))), - breaks=c(1,seq(5,max(5,max(dd$rank)),by=5)))+ + limits=c(.4, max(dd$rank)), + breaks=breaks)+ xlab("Algorithm")+ ylab("Rank") - + } rankdist.bootstrap.list=function(x,...){ rankDist=melt(lapply(x$bootsrappedRanks,t), value.name="rank") %>% dplyr::rename(algorithm="Var2",task="L1") rankDist } #' Creates blob plots or stacked frequency plots stratified by algorithm #' #' Creates blob plots (\code{stacked = FALSE}) or stacked frequency plots (\code{stacked = TRUE}) for each algorithm #' from a bootstrapped, ranked assessment data set. #' #' @param x The bootstrapped, ranked assessment data set. #' @param ordering #' @param stacked A boolean specifying whether a stacked frequency plot (\code{stacked = TRUE}) or blob plot (\code{stacked = FALSE}) should be created. #' @param probs #' @param max_size #' @param shape #' @param freq #' @param single #' @param ... Further arguments passed to or from other functions. #' #' @return #' #' @examples #' #' @seealso `browseVignettes("challengeR")` #' #' @family functions to visualize cross-task insights #' @export stabilityByAlgorithm.bootstrap.list=function(x, ordering, stacked = FALSE, probs=c(.025,.975),#only for !stacked max_size=3,#only for !stacked shape=4,#only for !stacked freq=FALSE, #only for stacked single=FALSE,...) 
{ - + if (length(x$data) < 2) { stop("The stability of rankings by algorithm cannot be computed for less than two tasks.") } - + rankDist=rankdist.bootstrap.list(x) - + if (!missing(ordering)) { if (is.numeric(ordering) & !is.null(names(ordering)) ){ ordering <- names(ordering)[order(ordering)] } else if (!is.character(ordering)){ stop("Argument ordering has to be a named vector of ranks or a vector of algorithm names in the ranking order.") } rankDist=rankDist%>%mutate(algorithm=factor(.data$algorithm, levels=ordering)) } if (!stacked){ if (single==FALSE){ + + # Define breaks before creating Blob plot + if (max(rankDist$rank)>5) { + breaks = c(1, seq(5, max(rankDist$rank), by=5)) + } else { + breaks = seq(1, max(rankDist$rank)) + } + pl <- ggplot(rankDist)+ geom_count(aes(task , rank, color=algorithm, size = stat(prop*100), group = task ))+ scale_size_area(max_size = max_size)+ stat_summary(aes(task ,rank ), geom="point", shape=shape, fun.data=function(x) data.frame(y=median(x)),...)+ stat_summary(aes(task ,rank ), geom="linerange", fun.data=function(x) data.frame(ymin=quantile(x,probs[1]), ymax=quantile(x,probs[2])))+ facet_wrap(vars(algorithm))+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+ guides(size = guide_legend(title="%"))+ scale_y_continuous(minor_breaks=NULL, - limits=c(1,max(5,max(rankDist$rank))), - breaks=c(1,seq(5,max(5,max(rankDist$rank)),by=5)))+ + limits=c(.4, max(rankDist$rank)), + breaks=breaks)+ xlab("Task")+ ylab("Rank") - + } else { pl=list() for (alg in ordering){ rankDist.alg=subset(rankDist, rankDist$algorithm==alg) + + # Define breaks before creating Blob plot + if (max(rankDist$rank)>5) { + breaks = c(1, seq(5, max(rankDist$rank), by=5)) + } else { + breaks = seq(1, max(rankDist$rank)) + } + pl[[alg]]=ggplot(rankDist.alg)+ geom_count(aes(task , rank, color=algorithm, size = stat(prop*100), group = task ))+ scale_size_area(max_size = max_size)+ stat_summary(aes(task , rank ), geom="point", shape=shape, fun.data=function(x) data.frame(y=median(x)),...)+ stat_summary(aes(task ,rank ), geom="linerange", fun.data=function(x) data.frame(ymin=quantile(x,probs[1]), ymax=quantile(x,probs[2])))+ facet_wrap(vars(algorithm))+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+ guides(size = guide_legend(title="%"))+ scale_y_continuous(minor_breaks=NULL, - limits=c(1,max(5,max(rankDist$rank))), - breaks=c(1,seq(5,max(5,max(rankDist$rank)),by=5)))+ + limits=c(.4, max(rankDist$rank)), + breaks=breaks)+ xlab("Task")+ ylab("Rank") } names(pl) = ordering class(pl) <- "ggList" } - + } else { #stacked rankDist=rankDist%>% group_by(task)%>% dplyr::count(.data$algorithm, .data$rank)%>% group_by(.data$algorithm)%>% mutate(prop=.data$n/sum(.data$n)*100)%>% ungroup%>% data.frame%>% mutate(rank=as.factor(.data$rank)) - + results= melt.ranked.list(x, measure.vars="rank", value.name="rank") %>% dplyr::select(-.data$variable) colnames(results)[3]="task" if (!missing(ordering)) { if (is.numeric(ordering) & !is.null(names(ordering)) ){ ordering <- names(ordering)[order(ordering)] } else if (!is.character(ordering)){ stop("Argument ordering has to be a named vector of ranks or a vector of algorithm names in the ranking order.") } results=results%>%mutate(algorithm=factor(.data$algorithm, levels=ordering)) } if (single==FALSE){ pl<- ggplot(rankDist) + facet_wrap(vars(algorithm))+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) - + if (freq){ pl <- pl + geom_bar(aes(rank, n, fill=task ), position = "stack", stat = "identity") + 
ylab("Frequency") } else { pl <- pl + geom_bar(aes(rank, prop, fill=task ), position = "stack", stat = "identity")+ ylab("Proportion (%)") } - - pl <- pl + + + pl <- pl + geom_vline(aes(xintercept=rank, color=task), size=.4, linetype="dotted", data=results) + xlab("Rank") } else { pl=list() for (alg in ordering){ rankDist.alg=subset(rankDist, rankDist$algorithm==alg) results.alg=subset(results, results$algorithm==alg) pl[[alg]]=ggplot(rankDist.alg)+ facet_wrap(vars(algorithm))+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) - + if (freq){ pl[[alg]] <- pl[[alg]] + geom_bar(aes(rank, n, fill=task ), position = "stack", stat = "identity") + ylab("Frequency") } else { pl[[alg]] <- pl[[alg]] + geom_bar(aes(rank, prop, fill=task ), position = "stack", stat = "identity")+ ylab("Proportion (%)") } - + pl[[alg]] <- pl[[alg]] + geom_vline(aes(xintercept=rank, color=task), size=.4, linetype="dotted", data=results.alg) + xlab("Rank") } names(pl) = ordering class(pl) <- "ggList" } } pl } #' Creates blob plots stratified by task #' #' Creates blob plots for each task from a bootstrapped, ranked assessment data set. #' #' @param x The bootstrapped, ranked assessment data set. #' @param ordering #' @param probs #' @param max_size #' @param size.ranks #' @param shape #' @param showLabelForSingleTask A boolean specifying whether the task name should be used as title for a single-task data set. #' @param ... Further arguments passed to or from other functions. #' #' @return #' #' @examples #' #' @seealso `browseVignettes("challengeR")` #' #' @family functions to visualize ranking stability #' @family functions to visualize cross-task insights #' @export stabilityByTask.bootstrap.list=function(x, ordering, probs=c(.025,.975), max_size=3, size.ranks=.3*theme_get()$text$size, shape=4, showLabelForSingleTask=FALSE,...){ rankDist=rankdist.bootstrap.list(x) ranks=melt.ranked.list(x, measure.vars="rank", value.name = "full.rank") colnames(ranks)[4]="task" if (!missing(ordering)) { if (is.numeric(ordering) & !is.null(names(ordering)) ){ ordering <- names(ordering)[order(ordering)] } else if (!is.character(ordering)){ stop("Argument ordering has to be a named vector of ranks or a vector of algorithm names in the ranking order.") } ranks$algorithm=factor(ranks$algorithm, levels=ordering) rankDist=rankDist%>%mutate(algorithm=factor(.data$algorithm, levels=ordering)) } - + + # Define breaks before creating Blob plot + if (max(rankDist$rank)>5) { + breaks = c(1, seq(5, max(rankDist$rank), by=5)) + } else { + breaks = seq(1, max(rankDist$rank)) + } + blobPlot <- ggplot(rankDist)+ geom_count(aes(algorithm , rank, color=algorithm, size = stat(prop*100), group = algorithm ))+ scale_size_area(max_size = max_size)+ geom_abline(slope=1, color="gray", linetype="dotted")+ stat_summary(aes(algorithm ,rank ), geom="point", shape=shape, fun.data=function(x) data.frame(y=median(x)),...)+ stat_summary(aes(algorithm ,rank ), geom="linerange", fun.data=function(x) data.frame(ymin=quantile(x,probs[1]), ymax=quantile(x,probs[2])))+ geom_text(aes(x=algorithm,y=1,label=full.rank), nudge_y=-.6, vjust = 0, size=size.ranks, fontface="plain", family="sans", data=ranks) + coord_cartesian(clip = 'off')+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+ guides(size = guide_legend(title="%"))+ scale_y_continuous(minor_breaks=NULL, - limits=c(.4,max(5,max(rankDist$rank))), - breaks=c(1,seq(5,max(5,max(rankDist$rank)),by=5)))+ + limits=c(.4, max(rankDist$rank)), + breaks=breaks)+ xlab("Algorithm")+ 
ylab("Rank") - - # Create multi-panel plot with task names as labels for multi-task data set or single-task data set when explicitly specified - if (length(x$data) > 1 || showLabelForSingleTask == TRUE) { - blobPlot <- blobPlot + facet_wrap(vars(task)) - } - + + # Create multi-panel plot with task names as labels for multi-task data set or single-task data set when explicitly specified + if (length(x$data) > 1 || showLabelForSingleTask == TRUE) { + blobPlot <- blobPlot + facet_wrap(vars(task)) + } + return(blobPlot) } diff --git a/README.md b/README.md index 5af8781..2e5a224 100644 --- a/README.md +++ b/README.md @@ -1,642 +1,556 @@ Methods and open-source toolkit for analyzing and visualizing challenge results ================ - [Introduction](#introduction) - [Installation](#installation) - [Terms of use](#terms-of-use) - [Usage](#usage) - [Troubleshooting](#troubleshooting) - [Changes](#changes) - [Team](#team) - [Reference](#reference) # Introduction -The current framework is a tool for analyzing and visualizing challenge -results in the field of biomedical image analysis and beyond. - -Biomedical challenges have become the de facto standard for benchmarking -biomedical image analysis algorithms. While the number of challenges is -steadily increasing, surprisingly little effort has been invested in -ensuring high quality design, execution and reporting for these -international competitions. Specifically, results analysis and -visualization in the event of uncertainties have been given almost no -attention in the literature. - -Given these shortcomings, the current framework aims to enable fast and -wide adoption of comprehensively analyzing and visualizing the results -of single-task and multi-task challenges. This approach offers an -intuitive way to gain important insights into the relative and absolute -performance of algorithms, which cannot be revealed by commonly applied -visualization techniques. +The current framework is a tool for analyzing and visualizing challenge results in the field of biomedical image analysis and beyond. + +Biomedical challenges have become the de facto standard for benchmarking biomedical image analysis algorithms. While the number of challenges is steadily increasing, surprisingly little effort has been invested in ensuring high quality design, execution and reporting for these international competitions. Specifically, results analysis and visualization in the event of uncertainties have been given almost no attention in the literature. + +Given these shortcomings, the current framework aims to enable fast and wide adoption of comprehensively analyzing and visualizing the results of single-task and multi-task challenges. This approach offers an intuitive way to gain important insights into the relative and absolute performance of algorithms, which cannot be revealed by commonly applied visualization techniques. # Installation Requires R version \>= 3.5.2 (). -Further, a recent version of Pandoc (>= 1.12.3) is required. RStudio -() automatically includes this so you do not need -to download Pandoc if you plan to use rmarkdown from the RStudio IDE, -otherwise you’ll need to install Pandoc for your platform -(). Finally, if you want to generate -a PDF report you will need to have LaTeX installed (e.g. MiKTeX, MacTeX -or TinyTeX). +Further, a recent version of Pandoc (\>= 1.12.3) is required. 
RStudio () automatically includes this so you do not need to download Pandoc if you plan to use rmarkdown from the RStudio IDE, otherwise you’ll need to install Pandoc for your platform (). Finally, if you want to generate a PDF report you will need to have LaTeX installed (e.g. MiKTeX, MacTeX or TinyTeX). + +To get the latest released version (master branch) of the R package from GitHub: -To get the latest released version (master branch) of the R package from -GitHub: -``` r +```r if (!requireNamespace("devtools", quietly = TRUE)) install.packages("devtools") if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install("Rgraphviz", dependencies = TRUE) devtools::install_github("wiesenfa/challengeR", dependencies = TRUE) ``` -If you are asked whether you want to update installed packages and you -type “a” for all, you might need administrator permissions to update R -core packages. You can also try to type “n” for updating no packages. If -you are asked “Do you want to install from sources the packages which -need compilation? (Yes/no/cancel)”, you can safely type “no”. +If you are asked whether you want to update installed packages and you type "a" for all, you might need administrator permissions to update R core packages. You can also try to type "n" for updating no packages. If you are asked "Do you want to install from sources the packages which need compilation? (Yes/no/cancel)", you can safely type "no". -If you get *warning* messages (in contrast to *error* messages), these -might not be problematic and you can try to proceed. If you encounter -errors during the setup, looking into the “Troubleshooting” section -might be worth it. +If you get *warning* messages (in contrast to *error* messages), these might not be problematic and you can try to proceed. If you encounter errors during the setup, looking into the "Troubleshooting" section might be worth it. -For Linux users: Some system libraries might be missing. Check the -output in the R console for further hints carefully during the -installation of packages. +For Linux users: Some system libraries might be missing. Check the output in the R console for further hints carefully during the installation of packages. # Terms of use Copyright (c) German Cancer Research Center (DKFZ). All rights reserved. challengeR is available under license GPLv2 or any later version. If you use this software for a publication, please cite: -Wiesenfarth, M., Reinke, A., Landman, B.A., Eisenmann, M., Aguilera -Saiz, L., Cardoso, M.J., Maier-Hein, L. and Kopp-Schneider, A. Methods -and open-source toolkit for analyzing and visualizing challenge results. -*Sci Rep* **11**, 2369 (2021). - +Wiesenfarth, M., Reinke, A., Landman, B.A., Eisenmann, M., Aguilera Saiz, L., Cardoso, M.J., Maier-Hein, L. and Kopp-Schneider, A. Methods and open-source toolkit for analyzing and visualizing challenge results. *Sci Rep* **11**, 2369 (2021). # Usage -Each of the following steps has to be run to generate the report: (1) -Load package, (2) load data, (3) perform ranking, (4) perform -bootstrapping and (5) generation of the report +Each of the following steps has to be run to generate the report: (1) Load package, (2) load data, (3) perform ranking, (4) perform bootstrapping and (5) generation of the report -You can find R scripts for quickstart in the directory “vignettes”. An -overview of all available plots is provided in the “Visualizations” -vignette demonstrating the use of their corresponding plot functions as -well. 
+You can find R scripts for quickstart in the directory "vignettes". An overview of all available plots is provided in the "Visualizations" vignette demonstrating the use of their corresponding plot functions as well. -Here, we provide a step-by-step guide that leads you to your final -report. +Here, we provide a step-by-step guide that leads you to your final report. ## 1. Load package Load package -``` r + +```r library(challengeR) ``` ## 2. Load data ### Data requirements Data requires the following *columns*: -- *task identifier* in case of multi-task challenges (string or - numeric) +- *task identifier* in case of multi-task challenges (string or numeric) - *test case identifier* (string or numeric) - *algorithm identifier* (string or numeric) - *metric value* (numeric) -In case of missing metric values, a missing observation has to be -provided (either as blank field or “NA”). - -For example, in a challenge with 2 tasks, 2 test cases and 2 algorithms, -where in task “T2”, test case “case2”, algorithm “A2” didn’t give a -prediction (and thus NA or a blank field for missing value is inserted), -the data set might look like this: - -| Task | TestCase | Algorithm | MetricValue | -|:-----|:---------|:----------|------------:| -| T1 | case1 | A1 | 0.266 | -| T1 | case1 | A2 | 0.202 | -| T1 | case2 | A1 | 0.573 | -| T1 | case2 | A2 | 0.945 | -| T2 | case1 | A1 | 0.372 | -| T2 | case1 | A2 | 0.898 | -| T2 | case2 | A1 | 0.908 | -| T2 | case2 | A2 | NA | +In case of missing metric values, a missing observation has to be provided (either as blank field or "NA"). + +For example, in a challenge with 2 tasks, 2 test cases and 2 algorithms, where in task "T2", test case "case2", algorithm "A2" didn't give a prediction (and thus NA or a blank field for missing value is inserted), the data set might look like this: + + +|Task |TestCase |Algorithm | MetricValue| +|:----|:--------|:---------|-----------:| +|T1 |case1 |A1 | 0.266| +|T1 |case1 |A2 | 0.202| +|T1 |case2 |A1 | 0.573| +|T1 |case2 |A2 | 0.945| +|T2 |case1 |A1 | 0.372| +|T2 |case1 |A2 | 0.898| +|T2 |case2 |A1 | 0.908| +|T2 |case2 |A2 | NA| ### 2.1 Load data from file -If you have assessment data at hand stored in a csv file (if you want to -use simulated data, skip the following code line) use +If you have assessment data at hand stored in a csv file (if you want to use simulated data, skip the following code line) use + -``` r +```r data_matrix <- read.csv(file.choose()) # type ?read.csv for help + ``` -This allows to choose a file interactively, otherwise replace -*file.choose()* by the file path (in style “/path/to/dataset.csv”) in -quotation marks. +This allows to choose a file interactively, otherwise replace *file.choose()* by the file path (in style "/path/to/dataset.csv") in quotation marks. ### 2.2 Simulate data -In the following, simulated data is generated *instead* for illustration -purposes (skip the following code chunk if you have already loaded -data). The data is also stored as “inst/extdata/data_matrix.csv” in the -repository. +In the following, simulated data is generated *instead* for illustration purposes (skip the following code chunk if you have already loaded data). The data is also stored as "inst/extdata/data_matrix.csv" in the repository. 
+ -``` r +```r if (!requireNamespace("permute", quietly = TRUE)) install.packages("permute") n <- 50 set.seed(4) strip <- runif(n,.9,1) c_ideal <- cbind(task="c_ideal", rbind( data.frame(alg_name="A1",value=runif(n,.9,1),case=1:n), data.frame(alg_name="A2",value=runif(n,.8,.89),case=1:n), data.frame(alg_name="A3",value=runif(n,.7,.79),case=1:n), data.frame(alg_name="A4",value=runif(n,.6,.69),case=1:n), data.frame(alg_name="A5",value=runif(n,.5,.59),case=1:n) )) set.seed(1) c_random <- data.frame(task="c_random", alg_name=factor(paste0("A",rep(1:5,each=n))), value=plogis(rnorm(5*n,1.5,1)),case=rep(1:n,times=5) ) strip2 <- seq(.8,1,length.out=5) a <- permute::allPerms(1:5) c_worstcase <- data.frame(task="c_worstcase", alg_name=c(t(a)), value=rep(strip2,nrow(a)), case=rep(1:nrow(a),each=5) ) c_worstcase <- rbind(c_worstcase, data.frame(task="c_worstcase",alg_name=1:5,value=strip2,case=max(c_worstcase$case)+1) ) c_worstcase$alg_name <- factor(c_worstcase$alg_name,labels=paste0("A",1:5)) data_matrix <- rbind(c_ideal, c_random, c_worstcase) + ``` ## 3. Perform ranking ### 3.1 Define challenge object Code differs slightly for single- and multi-task challenges. In case of a single-task challenge use -``` r + +```r # Use only task "c_random" in object data_matrix dataSubset <- subset(data_matrix, task=="c_random") challenge <- as.challenge(dataSubset, # Specify which column contains the algorithms, # which column contains a test case identifier # and which contains the metric value: algorithm = "alg_name", case = "case", value = "value", # Specify if small metric values are better smallBetter = FALSE) ``` *Instead*, for a multi-task challenge use -``` r + +```r # Same as above but with 'by="task"' where variable "task" contains the task identifier challenge <- as.challenge(data_matrix, by = "task", algorithm = "alg_name", case = "case", value = "value", smallBetter = FALSE) ``` ### 3.2 Configure ranking Different ranking methods are available, choose one of them: -- for “aggregate-then-rank” use (here: take mean for aggregation) +- for "aggregate-then-rank" use (here: take mean for aggregation) -``` r + +```r ranking <- challenge%>%aggregateThenRank(FUN = mean, # aggregation function, # e.g. mean, median, min, max, # or e.g. function(x) quantile(x, probs=0.05) na.treat = 0, # either "na.rm" to remove missing data, # set missings to numeric value (e.g. 0) # or specify a function, # e.g. function(x) min(x) ties.method = "min" # a character string specifying # how ties are treated, see ?base::rank ) ``` -- *alternatively*, for “rank-then-aggregate” with arguments as above - (here: take mean for aggregation) +- *alternatively*, for "rank-then-aggregate" with arguments as above (here: take mean for aggregation) + -``` r +```r ranking <- challenge%>%rankThenAggregate(FUN = mean, ties.method = "min" ) ``` -- *alternatively*, for test-then-rank based on Wilcoxon signed rank - test +- *alternatively*, for test-then-rank based on Wilcoxon signed rank test -``` r + +```r ranking <- challenge%>%testThenRank(alpha = 0.05, # significance level p.adjust.method = "none", # method for adjustment for # multiple testing, see ?p.adjust na.treat = 0, # either "na.rm" to remove missing data, # set missings to numeric value (e.g. 0) # or specify a function, e.g. function(x) min(x) ties.method = "min" # a character string specifying # how ties are treated, see ?base::rank ) + ``` ## 4. 
Perform bootstrapping Perform bootstrapping with 1000 bootstrap samples using one CPU -``` r + +```r set.seed(1) ranking_bootstrapped <- ranking%>%bootstrap(nboot = 1000) ``` If you want to use multiple CPUs (here: 8 CPUs), use -``` r + +```r library(doParallel) registerDoParallel(cores = 8) set.seed(1) ranking_bootstrapped <- ranking%>%bootstrap(nboot = 1000, parallel = TRUE, progress = "none") stopImplicitCluster() ``` ## 5. Generate the report -Generate report in PDF, HTML or DOCX format. Code differs slightly for -single- and multi-task challenges. +Generate report in PDF, HTML or DOCX format. Code differs slightly for single- and multi-task challenges. ### 5.1 For single-task challenges -``` r + +```r ranking_bootstrapped %>% report(title = "singleTaskChallengeExample", # used for the title of the report file = "filename", format = "PDF", # format can be "PDF", "HTML" or "Word" latex_engine = "pdflatex", #LaTeX engine for producing PDF output. Options are "pdflatex", "lualatex", and "xelatex" clean = TRUE #optional. Using TRUE will clean intermediate files that are created during rendering. ) + ``` -Argument *file* allows for specifying the output file path as well, -otherwise the working directory is used. If file is specified but does -not have a file extension, an extension will be automatically added -according to the output format given in *format*. Using argument -*clean=FALSE* allows to retain intermediate files, such as separate -files for each figure. +Argument *file* allows for specifying the output file path as well, otherwise the working directory is used. If file is specified but does not have a file extension, an extension will be automatically added according to the output format given in *format*. Using argument *clean=FALSE* allows to retain intermediate files, such as separate files for each figure. -If argument “file” is omitted, the report is created in a temporary -folder with file name “report”. +If argument "file" is omitted, the report is created in a temporary folder with file name "report". ### 5.2 For multi-task challenges -Same as for single-task challenges, but additionally consensus ranking -(rank aggregation across tasks) has to be given. +Same as for single-task challenges, but additionally consensus ranking (rank aggregation across tasks) has to be given. + +Compute ranking consensus across tasks (here: consensus ranking according to mean ranks across tasks) -Compute ranking consensus across tasks (here: consensus ranking -according to mean ranks across tasks) -``` r +```r # See ?relation_consensus for different methods to derive consensus ranking meanRanks <- ranking%>%consensus(method = "euclidean") meanRanks # note that there may be ties (i.e. some algorithms have identical mean rank) ``` -Generate report as above, but with additional specification of consensus -ranking +Generate report as above, but with additional specification of consensus ranking -``` r + +```r ranking_bootstrapped %>% report(consensus = meanRanks, title = "multiTaskChallengeExample", file = "filename", format = "PDF", # format can be "PDF", "HTML" or "Word" latex_engine = "pdflatex"#LaTeX engine for producing PDF output. Options are "pdflatex", "lualatex", and "xelatex" ) ``` -The consensus ranking is given according to mean ranks across tasks if -method=“euclidean” where in case of ties (equal ranks for multiple -algorithms) the average rank is used, i.e. ties.method=“average”. 
+The consensus ranking is given according to mean ranks across tasks if method="euclidean" where in case of ties (equal ranks for multiple algorithms) the average rank is used, i.e. ties.method="average". # Troubleshooting -In this section we provide an overview of issues that the users reported -and how they were solved. +In this section we provide an overview of issues that users reported and how they were solved. ## Issues related to RStudio ### Issue: Rtools is missing While trying to install the current version of the repository: -``` r + +```r devtools::install_github("wiesenfa/challengeR", dependencies = TRUE) ``` The following warning showed up in the output: -``` r + +```r WARNING: Rtools is required to build R packages, but is not currently installed. ``` -Therefore, Rtools was installed via a separate executable: - and the warning -disappeared. +Therefore, Rtools was installed via a separate executable: and the warning disappeared. #### Solution: -Actually there is no need of installing Rtools, it is not really used in -the toolkit. Insted, choose not to install it when it is asked. See -comment in the installation section: +Actually there is no need to install Rtools, as it is not really used in the toolkit. Instead, choose not to install it when asked. See the comment in the installation section: -“If you are asked whether you want to update installed packages and you -type “a” for all, you might need administrator rights to update R core -packages. You can also try to type “n” for updating no packages. If you -are asked “Do you want to install from sources the packages which need -compilation? (Yes/no/cancel)”, you can safely type “no”.” +“If you are asked whether you want to update installed packages and you type “a” for all, you might need administrator rights to update R core packages. You can also try to type “n” for updating no packages. If you are asked “Do you want to install from sources the packages which need compilation? (Yes/no/cancel)”, you can safely type “no”.” ### Issue: Package versions are mismatching Installing the current version of the tool from GitHub failed. The error message was: -``` r + +```r byte-compile and prepare package for lazy loading Error: (converted from warning) package 'ggplot2' was built under R version 3.6.3 Execution halted ERROR: lazy loading failed for package 'challengeR' * removing 'C:/Users/.../Documents/R/win-library/3.6/challengeR' * restoring previous 'C:/Users/.../Documents/R/win-library/3.6/challengeR' Error: Failed to install 'challengeR' from GitHub: (converted from warning) installation of package 'C:/Users/.../AppData/Local/Temp/Rtmp615qmV/file4fd419555eb4/challengeR_0.3.1.tar.gz' had non-zero exit status ``` -The problem was that some of the packages that were built under R3.6.1 -had been updated, but the current installed version was still R3.6.1. +The problem was that some of the packages originally built under R3.6.1 had been updated to versions built under a newer R release, while the installed R version was still R3.6.1. #### Solution: -The solution was to update R3.6.1 to R3.6.3. Another way would have been -to reset the single packages to the versions built under R3.6.1. +The solution was to update R3.6.1 to R3.6.3. Another way would have been to reset the single packages to the versions built under R3.6.1. ### Issue: Package is missing Installing the current version of the tool from GitHub failed. 
-``` r + +```r devtools::install_github("wiesenfa/challengeR", dependencies = TRUE) ``` The error message was: -``` r + +```r Error: .onLoad failed in loadNamespace() for 'pkgload', details: call: loadNamespace(i, c(lib.loc, .libPaths()), versionCheck = vI[[i]]) error: there is no package called ‘backports’ ``` -The problem was that the packages ‘backports’ had not been installed. +The problem was that the package 'backports' had not been installed. #### Solution: -The solution was to install ‘backports’ manually. +The solution was to install 'backports' manually. -``` r + +```r install.packages("backports") ``` ### Issue: Packages are not detected correctly -While trying to install the package after running the following -commands: +While trying to install the package after running the following commands: + -``` r +```r if (!requireNamespace("devtools", quietly = TRUE)) install.packages("devtools") if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install("Rgraphviz", dependencies = TRUE) devtools::install_github("wiesenfa/challengeR", dependencies = TRUE) ``` The error message was: -``` r + +```r ERROR: 1: In file(con, "r") : URL 'https://bioconductor.org/config.yaml': status was 'SSL connect error' 2: packages ‘BiocVersion’, ‘Rgraphviz’ are not available (for R version 3.6.1) ``` #### Solution: The solution was to restart RStudio. ## Issues related to MiKTeX ### Issue: Missing packages -While generating the PDF with MiKTeX (2.9), the following error showed -up: +While generating the PDF with MiKTeX (2.9), the following error showed up: -``` r + +```r fatal pdflatex - gui framework cannot be initialized ``` There is an issue with installing missing packages in LaTeX. #### Solution: -Open your MiKTeX Console –> Settings, select "Always install missing -packages on-the-fly". Then generate the report. Once the report is -generated, you can reset the settings to your preferred ones. +Open your MiKTeX Console --\> Settings, select "Always install missing packages on-the-fly". Then generate the report. Once the report is generated, you can reset the settings to your preferred ones. ### Issue: Unable to generate report While generating the PDF with MiKTeX (2.9): -``` r + +```r ranking_bootstrapped %>% report(title = "singleTaskChallengeExample", # used for the title of the report file = "filename", format = "PDF", # format can be "PDF", "HTML" or "Word" latex_engine = "pdflatex", #LaTeX engine for producing PDF output. Options are "pdflatex", "lualatex", and "xelatex" clean = TRUE #optional. Using TRUE will clean intermediate files that are created during rendering. ) + ``` The following error showed up: -``` r + +```r output file: filename.knit.md "C:/Program Files/RStudio/bin/pandoc/pandoc" +RTS -K512m -RTS filename.utf8.md --to latex --from markdown+autolink_bare_uris+tex_math_single_backslash --output filename.tex --self-contained --number-sections --highlight-style tango --pdf-engine pdflatex --variable graphics --lua-filter "C:/Users/adm/Documents/R/win-library/3.6/rmarkdown/rmd/lua/pagebreak.lua" --lua-filter "C:/Users/adm/Documents/R/win-library/3.6/rmarkdown/rmd/lua/latex-div.lua" --variable "geometry:margin=1in" Error: LaTeX failed to compile filename.tex. See https://yihui.org/tinytex/r/#debugging for debugging tips. Warning message: In system2(..., stdout = if (use_file_stdout()) f1 else FALSE, stderr = f2) : '"pdflatex"' not found ``` #### Solution: The solution was to restart RStudio. 
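If restarting RStudio does not make `pdflatex` available, a minimal alternative sketch (an illustration added here, assuming you are willing to use TinyTeX, which the Installation section already lists as a LaTeX option alongside MiKTeX and MacTeX) is to install TinyTeX directly from R:

```r
# Sketch only: install TinyTeX so that a pdflatex binary is available for PDF reports.
# Assumes TinyTeX is acceptable instead of MiKTeX/MacTeX; see https://yihui.org/tinytex/ for details.
if (!requireNamespace("tinytex", quietly = TRUE)) install.packages("tinytex")

tinytex::install_tinytex()
```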
# Changes +#### Version 1.0.4 + +- Fix error caused by NaN values ([T28746](https://phabricator.mitk.org/T28746)) +- Fix mismatch between bars and dots in the podium plot ([T29167](https://phabricator.mitk.org/T29167)) +- Fix y-axis of blob plots always being scaled to 5 ([T28966](https://phabricator.mitk.org/T28966)) + #### Version 1.0.3 -- Fix ggplot warning in various places of the report - ([T28710](https://phabricator.mitk.org/T28710)) +- Fix ggplot warning in various places of the report ([T28710](https://phabricator.mitk.org/T28710)) #### Version 1.0.2 -- Fix error when all metric values are the same - ([T28453](https://phabricator.mitk.org/T28453)) -- Fix wrong numer of algorithms shown in report summary - ([T28465](https://phabricator.mitk.org/T28465)) +- Fix error when all metric values are the same ([T28453](https://phabricator.mitk.org/T28453)) +- Fix wrong number of algorithms shown in report summary ([T28465](https://phabricator.mitk.org/T28465)) #### Version 1.0.1 -- Fix error raised in case there are more tasks than algorithms - contained in the dataset - ([T28193](https://phabricator.mitk.org/T28193)) -- Drop restriction that at least three algorithms are required for - bootstrapping ([T28194](https://phabricator.mitk.org/T28194)) -- Avoid blank pages in PDF report when bootstrapping is disabled - ([T28201](https://phabricator.mitk.org/T28201)) -- Handle tasks having only one case for bootstrapping - ([T28202](https://phabricator.mitk.org/T28202)) +- Fix error raised in case there are more tasks than algorithms contained in the dataset ([T28193](https://phabricator.mitk.org/T28193)) +- Drop restriction that at least three algorithms are required for bootstrapping ([T28194](https://phabricator.mitk.org/T28194)) +- Avoid blank pages in PDF report when bootstrapping is disabled ([T28201](https://phabricator.mitk.org/T28201)) +- Handle tasks having only one case for bootstrapping ([T28202](https://phabricator.mitk.org/T28202)) - Update citation ([T28210](https://phabricator.mitk.org/T28210)) #### Version 1.0.0 - Revision of the underlying data structure - Roxygen documentation for main functionality -- Vignettes for quickstart and overview of available plots - demonstrating the use of their corresponding plot functions -- Introduction of unit tests (package coverage >70%) +- Vignettes for quickstart and overview of available plots demonstrating the use of their corresponding plot functions +- Introduction of unit tests (package coverage \>70%) - Troubleshooting section covering potential issues during setup -- Finally: Extensive bug fixes and improvements (for a complete - overview please check the [Phabricator - tasks](https://phabricator.mitk.org/search/query/vtj0qOqH5qL6/)) +- Finally: Extensive bug fixes and improvements (for a complete overview please check the [Phabricator tasks](https://phabricator.mitk.org/search/query/vtj0qOqH5qL6/)) #### Version 0.3.3 -- Force line break to avoid that authors exceed the page in generated - PDF reports +- Force line break to avoid authors exceeding the page in generated PDF reports #### Version 0.3.2 - Correct names of authors #### Version 0.3.1 - Refactoring #### Version 0.3.0 - Major bug fix release #### Version 0.2.5 - Bug fixes #### Version 0.2.4 - Automatic insertion of missings #### Version 0.2.3 - Bug fixes -- Reports for subsets (top list) of algorithms: Use - e.g. 
`subset(ranking_bootstrapped, top=3) %>% report(...)` (or - `subset(ranking, top=3) %>% report(...)` for report without - bootstrap results) to only show the top 3 algorithms according to - the chosen ranking methods, where `ranking_bootstrapped` and - `ranking` objects as defined in the example. Line plot for ranking - robustness can be used to check whether algorithms performing well - in other ranking methods are excluded. Bootstrapping still takes - entire uncertainty into account. Podium plot and ranking heatmap - neglect excluded algorithms. Only available for single-task - challenges (for multi-task challenges not sensible because each task - would contain a different set of algorithms). -- Reports for subsets of tasks: Use - e.g. `subset(ranking_bootstrapped, tasks=c("task1", "task2","task3")) %>% report(...)` - to restrict report to tasks "task1", "task2","task3. You may want to - recompute the consensus ranking before using - `meanRanks=subset(ranking, tasks=c("task1", "task2", "task3"))%>%consensus(method = "euclidean")` +- Reports for subsets (top list) of algorithms: Use e.g. `subset(ranking_bootstrapped, top=3) %>% report(...)` (or `subset(ranking, top=3) %>% report(...)` for a report without bootstrap results) to only show the top 3 algorithms according to the chosen ranking methods, where the `ranking_bootstrapped` and `ranking` objects are defined as in the example. The line plot for ranking robustness can be used to check whether algorithms performing well in other ranking methods are excluded. Bootstrapping still takes the entire uncertainty into account. Podium plot and ranking heatmap neglect excluded algorithms. Only available for single-task challenges (not sensible for multi-task challenges because each task would contain a different set of algorithms). +- Reports for subsets of tasks: Use e.g. `subset(ranking_bootstrapped, tasks=c("task1", "task2","task3")) %>% report(...)` to restrict the report to tasks "task1", "task2", "task3". You may want to recompute the consensus ranking before using `meanRanks=subset(ranking, tasks=c("task1", "task2", "task3"))%>%consensus(method = "euclidean")` #### Version 0.2.1 -- Introduction in reports now mentions e.g. ranking method, number of - test cases,… -- Function `subset()` allows selection of tasks after bootstrapping, - e.g. `subset(ranking_bootstrapped,1:3)` -- `report()` functions gain argument `colors` (default: - `default_colors`). Change e.g. to `colors=viridisLite::inferno` - which “is designed in such a way that it will analytically be - perfectly perceptually-uniform, both in regular form and also when - converted to black-and-white. It is also designed to be perceived by - readers with the most common form of color blindness.” See package - `viridis` for further similar functions. +- Introduction in reports now mentions e.g. ranking method, number of test cases,... +- Function `subset()` allows selection of tasks after bootstrapping, e.g. `subset(ranking_bootstrapped,1:3)` +- `report()` functions gain argument `colors` (default: `default_colors`). Change e.g. to `colors=viridisLite::inferno` which "is designed in such a way that it will analytically be perfectly perceptually-uniform, both in regular form and also when converted to black-and-white. It is also designed to be perceived by readers with the most common form of color blindness." See package `viridis` for further similar functions; a usage sketch is shown below. 
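As a minimal usage sketch of the `colors` argument mentioned above (assuming the `ranking_bootstrapped` object from the "Usage" section exists and the `viridisLite` package is installed):

```r
# Sketch only: render the report with a perceptually uniform, color-blind-friendly palette.
# Assumes `ranking_bootstrapped` was created as in the "Usage" section above.
if (!requireNamespace("viridisLite", quietly = TRUE)) install.packages("viridisLite")

ranking_bootstrapped %>%
  report(title = "singleTaskChallengeExample",
         file = "filename",
         format = "PDF",
         colors = viridisLite::inferno) # the default is default_colors
```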
#### Version 0.2.0 -- Improved layout in case of many algorithms and tasks (while probably - still not perfect) +- Improved layout in case of many algorithms and tasks (while probably still not perfect) - Consistent coloring of algorithms across figures -- `report()` function can be applied to ranked object before - bootstrapping (and thus excluding figures based on bootstrapping), - i.e. in the example `ranking %>% report(...)` +- `report()` function can be applied to ranked object before bootstrapping (and thus excluding figures based on bootstrapping), i.e. in the example `ranking %>% report(...)` - bug fixes # Team -The developer team includes members from both division of Computer -Assisted Medical Interventions (CAMI) and Biostatistics at the German -Cancer Research Center (DKFZ): +The developer team includes members from both division of Computer Assisted Medical Interventions (CAMI) and Biostatistics at the German Cancer Research Center (DKFZ): - Manuel Wiesenfarth - Annette Kopp-Schneider - Annika Reinke - Matthias Eisenmann - Laura Aguilera Saiz - Elise Récéjac - Lena Maier-Hein # Reference -Wiesenfarth, M., Reinke, A., Landman, B.A., Eisenmann, M., Aguilera -Saiz, L., Cardoso, M.J., Maier-Hein, L. and Kopp-Schneider, A. Methods -and open-source toolkit for analyzing and visualizing challenge results. -*Sci Rep* **11**, 2369 (2021). - +Wiesenfarth, M., Reinke, A., Landman, B.A., Eisenmann, M., Aguilera Saiz, L., Cardoso, M.J., Maier-Hein, L. and Kopp-Schneider, A. Methods and open-source toolkit for analyzing and visualizing challenge results. *Sci Rep* **11**, 2369 (2021). - - + diff --git a/Readme.Rmd b/Readme.Rmd index 87935b5..73d3051 100644 --- a/Readme.Rmd +++ b/Readme.Rmd @@ -1,528 +1,533 @@ --- title: "Methods and open-source toolkit for analyzing and visualizing challenge results" output: github_document: toc: yes toc_depth: '1' html_document: toc: yes toc_depth: '1' pdf_document: toc: yes toc_depth: '1' editor_options: chunk_output_type: console --- ```{r, echo = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", # fig.path = "README-", fig.width = 9, fig.height = 5, width=160 ) ``` # Introduction The current framework is a tool for analyzing and visualizing challenge results in the field of biomedical image analysis and beyond. Biomedical challenges have become the de facto standard for benchmarking biomedical image analysis algorithms. While the number of challenges is steadily increasing, surprisingly little effort has been invested in ensuring high quality design, execution and reporting for these international competitions. Specifically, results analysis and visualization in the event of uncertainties have been given almost no attention in the literature. Given these shortcomings, the current framework aims to enable fast and wide adoption of comprehensively analyzing and visualizing the results of single-task and multi-task challenges. This approach offers an intuitive way to gain important insights into the relative and absolute performance of algorithms, which cannot be revealed by commonly applied visualization techniques. # Installation Requires R version >= 3.5.2 (https://www.r-project.org). Further, a recent version of Pandoc (>= 1.12.3) is required. RStudio (https://rstudio.com) automatically includes this so you do not need to download Pandoc if you plan to use rmarkdown from the RStudio IDE, otherwise you’ll need to install Pandoc for your platform (https://pandoc.org/installing.html). 
Finally, if you want to generate a PDF report you will need to have LaTeX installed (e.g. MiKTeX, MacTeX or TinyTeX). To get the latest released version (master branch) of the R package from GitHub: ```{r, eval=F,R.options,} if (!requireNamespace("devtools", quietly = TRUE)) install.packages("devtools") if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install("Rgraphviz", dependencies = TRUE) devtools::install_github("wiesenfa/challengeR", dependencies = TRUE) ``` If you are asked whether you want to update installed packages and you type "a" for all, you might need administrator permissions to update R core packages. You can also try to type "n" for updating no packages. If you are asked "Do you want to install from sources the packages which need compilation? (Yes/no/cancel)", you can safely type "no". If you get *warning* messages (in contrast to *error* messages), these might not be problematic and you can try to proceed. If you encounter errors during the setup, looking into the "Troubleshooting" section might be worth it. For Linux users: Some system libraries might be missing. Check the output in the R console for further hints carefully during the installation of packages. # Terms of use Copyright (c) German Cancer Research Center (DKFZ). All rights reserved. challengeR is available under license GPLv2 or any later version. If you use this software for a publication, please cite: Wiesenfarth, M., Reinke, A., Landman, B.A., Eisenmann, M., Aguilera Saiz, L., Cardoso, M.J., Maier-Hein, L. and Kopp-Schneider, A. Methods and open-source toolkit for analyzing and visualizing challenge results. *Sci Rep* **11**, 2369 (2021). https://doi.org/10.1038/s41598-021-82017-6 # Usage Each of the following steps has to be run to generate the report: (1) Load package, (2) load data, (3) perform ranking, (4) perform bootstrapping and (5) generation of the report You can find R scripts for quickstart in the directory "vignettes". An overview of all available plots is provided in the "Visualizations" vignette demonstrating the use of their corresponding plot functions as well. Here, we provide a step-by-step guide that leads you to your final report. ## 1. Load package Load package ```{r, eval=F} library(challengeR) ``` ## 2. Load data ### Data requirements Data requires the following *columns*: * *task identifier* in case of multi-task challenges (string or numeric) * *test case identifier* (string or numeric) * *algorithm identifier* (string or numeric) * *metric value* (numeric) In case of missing metric values, a missing observation has to be provided (either as blank field or "NA"). 
For example, in a challenge with 2 tasks, 2 test cases and 2 algorithms, where in task "T2", test case "case2", algorithm "A2" didn't give a prediction (and thus NA or a blank field for missing value is inserted), the data set might look like this: ```{r, eval=T, echo=F,results='asis'} set.seed(1) a <- cbind(expand.grid(Task=paste0("T",1:2),TestCase=paste0("case",1:2),Algorithm=paste0("A",1:2)),MetricValue=round(c(runif(7,0,1),NA),3)) print(knitr::kable(a[order(a$Task,a$TestCase,a$Algorithm),],row.names=F)) ``` ### 2.1 Load data from file If you have assessment data at hand stored in a csv file (if you want to use simulated data, skip the following code line) use ```{r, eval=F, echo=T} data_matrix <- read.csv(file.choose()) # type ?read.csv for help ``` This allows to choose a file interactively, otherwise replace *file.choose()* by the file path (in style "/path/to/dataset.csv") in quotation marks. ### 2.2 Simulate data In the following, simulated data is generated *instead* for illustration purposes (skip the following code chunk if you have already loaded data). The data is also stored as "inst/extdata/data_matrix.csv" in the repository. ```{r, eval=F, echo=T} if (!requireNamespace("permute", quietly = TRUE)) install.packages("permute") n <- 50 set.seed(4) strip <- runif(n,.9,1) c_ideal <- cbind(task="c_ideal", rbind( data.frame(alg_name="A1",value=runif(n,.9,1),case=1:n), data.frame(alg_name="A2",value=runif(n,.8,.89),case=1:n), data.frame(alg_name="A3",value=runif(n,.7,.79),case=1:n), data.frame(alg_name="A4",value=runif(n,.6,.69),case=1:n), data.frame(alg_name="A5",value=runif(n,.5,.59),case=1:n) )) set.seed(1) c_random <- data.frame(task="c_random", alg_name=factor(paste0("A",rep(1:5,each=n))), value=plogis(rnorm(5*n,1.5,1)),case=rep(1:n,times=5) ) strip2 <- seq(.8,1,length.out=5) a <- permute::allPerms(1:5) c_worstcase <- data.frame(task="c_worstcase", alg_name=c(t(a)), value=rep(strip2,nrow(a)), case=rep(1:nrow(a),each=5) ) c_worstcase <- rbind(c_worstcase, data.frame(task="c_worstcase",alg_name=1:5,value=strip2,case=max(c_worstcase$case)+1) ) c_worstcase$alg_name <- factor(c_worstcase$alg_name,labels=paste0("A",1:5)) data_matrix <- rbind(c_ideal, c_random, c_worstcase) ``` ## 3. Perform ranking ### 3.1 Define challenge object Code differs slightly for single- and multi-task challenges. In case of a single-task challenge use ```{r, eval=F, echo=T} # Use only task "c_random" in object data_matrix dataSubset <- subset(data_matrix, task=="c_random") challenge <- as.challenge(dataSubset, # Specify which column contains the algorithms, # which column contains a test case identifier # and which contains the metric value: algorithm = "alg_name", case = "case", value = "value", # Specify if small metric values are better smallBetter = FALSE) ``` *Instead*, for a multi-task challenge use ```{r, eval=F, echo=T} # Same as above but with 'by="task"' where variable "task" contains the task identifier challenge <- as.challenge(data_matrix, by = "task", algorithm = "alg_name", case = "case", value = "value", smallBetter = FALSE) ``` ### 3.2 Configure ranking Different ranking methods are available, choose one of them: - for "aggregate-then-rank" use (here: take mean for aggregation) ```{r, eval=F, echo=T} ranking <- challenge%>%aggregateThenRank(FUN = mean, # aggregation function, # e.g. mean, median, min, max, # or e.g. function(x) quantile(x, probs=0.05) na.treat = 0, # either "na.rm" to remove missing data, # set missings to numeric value (e.g. 0) # or specify a function, # e.g. 
function(x) min(x) ties.method = "min" # a character string specifying # how ties are treated, see ?base::rank ) ``` - *alternatively*, for "rank-then-aggregate" with arguments as above (here: take mean for aggregation) ```{r, eval=F, echo=T} ranking <- challenge%>%rankThenAggregate(FUN = mean, ties.method = "min" ) ``` - *alternatively*, for test-then-rank based on Wilcoxon signed rank test ```{r, eval=F, echo=T} ranking <- challenge%>%testThenRank(alpha = 0.05, # significance level p.adjust.method = "none", # method for adjustment for # multiple testing, see ?p.adjust na.treat = 0, # either "na.rm" to remove missing data, # set missings to numeric value (e.g. 0) # or specify a function, e.g. function(x) min(x) ties.method = "min" # a character string specifying # how ties are treated, see ?base::rank ) ``` ## 4. Perform bootstrapping Perform bootstrapping with 1000 bootstrap samples using one CPU ```{r, eval=F, echo=T} set.seed(1) ranking_bootstrapped <- ranking%>%bootstrap(nboot = 1000) ``` If you want to use multiple CPUs (here: 8 CPUs), use ```{r, eval=F, echo=T} library(doParallel) registerDoParallel(cores = 8) set.seed(1) ranking_bootstrapped <- ranking%>%bootstrap(nboot = 1000, parallel = TRUE, progress = "none") stopImplicitCluster() ``` ## 5. Generate the report Generate report in PDF, HTML or DOCX format. Code differs slightly for single- and multi-task challenges. ### 5.1 For single-task challenges ```{r, eval=F, echo=T} ranking_bootstrapped %>% report(title = "singleTaskChallengeExample", # used for the title of the report file = "filename", format = "PDF", # format can be "PDF", "HTML" or "Word" latex_engine = "pdflatex", #LaTeX engine for producing PDF output. Options are "pdflatex", "lualatex", and "xelatex" clean = TRUE #optional. Using TRUE will clean intermediate files that are created during rendering. ) ``` Argument *file* allows for specifying the output file path as well, otherwise the working directory is used. If file is specified but does not have a file extension, an extension will be automatically added according to the output format given in *format*. Using argument *clean=FALSE* allows to retain intermediate files, such as separate files for each figure. If argument "file" is omitted, the report is created in a temporary folder with file name "report". ### 5.2 For multi-task challenges Same as for single-task challenges, but additionally consensus ranking (rank aggregation across tasks) has to be given. Compute ranking consensus across tasks (here: consensus ranking according to mean ranks across tasks) ```{r, eval=F, echo=T} # See ?relation_consensus for different methods to derive consensus ranking meanRanks <- ranking%>%consensus(method = "euclidean") meanRanks # note that there may be ties (i.e. some algorithms have identical mean rank) ``` Generate report as above, but with additional specification of consensus ranking ```{r, eval=F, echo=T} ranking_bootstrapped %>% report(consensus = meanRanks, title = "multiTaskChallengeExample", file = "filename", format = "PDF", # format can be "PDF", "HTML" or "Word" latex_engine = "pdflatex"#LaTeX engine for producing PDF output. Options are "pdflatex", "lualatex", and "xelatex" ) ``` The consensus ranking is given according to mean ranks across tasks if method="euclidean" where in case of ties (equal ranks for multiple algorithms) the average rank is used, i.e. ties.method="average". # Troubleshooting In this section we provide an overview of issues that the users reported and how they were solved. 
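When reporting or debugging such issues, it usually helps to record the exact R, package and toolkit versions first (a generic diagnostic, not specific to challengeR):

```{r, eval=F, echo=T}
# R version and versions of attached packages
sessionInfo()

# Version of the installed challengeR package
packageVersion("challengeR")
```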
## Issues related to RStudio ### Issue: Rtools is missing While trying to install the current version of the repository: ```{r, eval=F, echo=T} devtools::install_github("wiesenfa/challengeR", dependencies = TRUE) ``` The following warning showed up in the output: ```{r, eval=F, echo=T} WARNING: Rtools is required to build R packages, but is not currently installed. ``` Therefore, Rtools was installed via a separate executable: https://cran.r-project.org/bin/windows/Rtools/ and the warning disappeared. #### Solution: Actually, there is no need to install Rtools; it is not used by the toolkit. Instead, choose not to install it when asked. See the comment in the installation section: “If you are asked whether you want to update installed packages and you type “a” for all, you might need administrator permissions to update R core packages. You can also try to type “n” for updating no packages. If you are asked “Do you want to install from sources the packages which need compilation? (Yes/no/cancel)”, you can safely type “no”.” ### Issue: Package versions are mismatching Installing the current version of the tool from GitHub failed. The error message was: ```{r, eval=F, echo=T} byte-compile and prepare package for lazy loading Error: (converted from warning) package 'ggplot2' was built under R version 3.6.3 Execution halted ERROR: lazy loading failed for package 'challengeR' * removing 'C:/Users/.../Documents/R/win-library/3.6/challengeR' * restoring previous 'C:/Users/.../Documents/R/win-library/3.6/challengeR' Error: Failed to install 'challengeR' from GitHub: (converted from warning) installation of package 'C:/Users/.../AppData/Local/Temp/Rtmp615qmV/file4fd419555eb4/challengeR_0.3.1.tar.gz' had non-zero exit status ``` The problem was that some packages (here 'ggplot2') had been updated and built under R 3.6.3, while the installed R version was still 3.6.1. #### Solution: The solution was to update R from 3.6.1 to 3.6.3. Alternatively, the affected packages could have been reverted to the versions built under R 3.6.1. ### Issue: Package is missing Installing the current version of the tool from GitHub failed. ```{r, eval=F, echo=T} devtools::install_github("wiesenfa/challengeR", dependencies = TRUE) ``` The error message was: ```{r, eval=F, echo=T} Error: .onLoad failed in loadNamespace() for 'pkgload', details: call: loadNamespace(i, c(lib.loc, .libPaths()), versionCheck = vI[[i]]) error: there is no package called ‘backports’ ``` The problem was that the package 'backports' had not been installed. #### Solution: The solution was to install 'backports' manually. ```{r, eval=F, echo=T} install.packages("backports") ``` ### Issue: Packages are not detected correctly While trying to install the package after running the following commands: ```{r, eval=F, echo=T} if (!requireNamespace("devtools", quietly = TRUE)) install.packages("devtools") if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install("Rgraphviz", dependencies = TRUE) devtools::install_github("wiesenfa/challengeR", dependencies = TRUE) ``` The error message was: ```{r, eval=F, echo=T} ERROR: 1: In file(con, "r") : URL 'https://bioconductor.org/config.yaml': status was 'SSL connect error' 2: packages ‘BiocVersion’, ‘Rgraphviz’ are not available (for R version 3.6.1) ``` #### Solution: The solution was to restart RStudio.
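For the version-related problems above, it can also help to compare the running R version with the R version under which a package was built (shown here for 'ggplot2' as an example; this is a general diagnostic, not part of the toolkit):

```{r, eval=F, echo=T}
# Running R version
R.version.string

# R version under which an installed package was built
packageDescription("ggplot2")$Built
```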
## Issues related to MiKTeX ### Issue: Missing packages While generating the PDF with MiKTeX (2.9), the following error showed up: ```{r, eval=F, echo=T} fatal pdflatex - gui framework cannot be initialized ``` There is an issue with installing missing LaTeX packages. #### Solution: Open your MiKTeX Console --> Settings, select "Always install missing packages on-the-fly". Then generate the report. Once the report is generated, you can reset the settings to your preferred ones. ### Issue: Unable to generate report While generating the PDF with MiKTeX (2.9): ```{r, eval=F, echo=T} ranking_bootstrapped %>% report(title = "singleTaskChallengeExample", # used for the title of the report file = "filename", format = "PDF", # format can be "PDF", "HTML" or "Word" latex_engine = "pdflatex", #LaTeX engine for producing PDF output. Options are "pdflatex", "lualatex", and "xelatex" clean = TRUE #optional. Using TRUE will clean intermediate files that are created during rendering. ) ``` The following error showed up: ```{r, eval=F, echo=T} output file: filename.knit.md "C:/Program Files/RStudio/bin/pandoc/pandoc" +RTS -K512m -RTS filename.utf8.md --to latex --from markdown+autolink_bare_uris+tex_math_single_backslash --output filename.tex --self-contained --number-sections --highlight-style tango --pdf-engine pdflatex --variable graphics --lua-filter "C:/Users/adm/Documents/R/win-library/3.6/rmarkdown/rmd/lua/pagebreak.lua" --lua-filter "C:/Users/adm/Documents/R/win-library/3.6/rmarkdown/rmd/lua/latex-div.lua" --variable "geometry:margin=1in" Error: LaTeX failed to compile filename.tex. See https://yihui.org/tinytex/r/#debugging for debugging tips. Warning message: In system2(..., stdout = if (use_file_stdout()) f1 else FALSE, stderr = f2) : '"pdflatex"' not found ``` #### Solution: The solution was to restart RStudio.
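If MiKTeX keeps causing problems, switching to the lightweight TinyTeX distribution mentioned in the installation section may be an alternative (this requires the `tinytex` package and is a general workaround rather than an official recommendation of the toolkit):

```{r, eval=F, echo=T}
install.packages("tinytex")
tinytex::install_tinytex()
```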
# Changes +#### Version 1.0.4 +- Fix NaN values cause error ([T28746](https://phabricator.mitk.org/T28746)) +- Fix Bars and dots don't match in podium plot ([T29167](https://phabricator.mitk.org/T29167)) +- Fix y-axis of blob plots always scaled to 5 ([T28966](https://phabricator.mitk.org/T28966)) + #### Version 1.0.3 - Fix ggplot warning in various places of the report ([T28710](https://phabricator.mitk.org/T28710)) #### Version 1.0.2 - Fix error when all metric values are the same ([T28453](https://phabricator.mitk.org/T28453)) -- Fix wrong numer of algorithms shown in report summary ([T28465](https://phabricator.mitk.org/T28465)) +- Fix wrong number of algorithms shown in report summary ([T28465](https://phabricator.mitk.org/T28465)) #### Version 1.0.1 - Fix error raised in case there are more tasks than algorithms contained in the dataset ([T28193](https://phabricator.mitk.org/T28193)) - Drop restriction that at least three algorithms are required for bootstrapping ([T28194](https://phabricator.mitk.org/T28194)) - Avoid blank pages in PDF report when bootstrapping is disabled ([T28201](https://phabricator.mitk.org/T28201)) - Handle tasks having only one case for bootstrapping ([T28202](https://phabricator.mitk.org/T28202)) - Update citation ([T28210](https://phabricator.mitk.org/T28210)) #### Version 1.0.0 - Revision of the underlying data structure - Roxygen documentation for main functionality - Vignettes for quickstart and overview of available plots demonstrating the use of their corresponding plot functions - Introduction of unit tests (package coverage >70%) - Troubleshooting section covering potential issues during setup - Finally: Extensive bug fixes and improvements (for a complete overview please check the [Phabricator tasks](https://phabricator.mitk.org/search/query/vtj0qOqH5qL6/)) #### Version 0.3.3 - Force line break to avoid that authors exceed the page in generated PDF reports #### Version 0.3.2 - Correct names of authors #### Version 0.3.1 - Refactoring #### Version 0.3.0 - Major bug fix release #### Version 0.2.5 - Bug fixes #### Version 0.2.4 - Automatic insertion of missings #### Version 0.2.3 - Bug fixes - Reports for subsets (top list) of algorithms: Use e.g. `subset(ranking_bootstrapped, top=3) %>% report(...)` (or `subset(ranking, top=3) %>% report(...)` for report without bootstrap results) to only show the top 3 algorithms according to the chosen ranking methods, where `ranking_bootstrapped` and `ranking` objects as defined in the example. Line plot for ranking robustness can be used to check whether algorithms performing well in other ranking methods are excluded. Bootstrapping still takes entire uncertainty into account. Podium plot and ranking heatmap neglect excluded algorithms. Only available for single-task challenges (for multi-task challenges not sensible because each task would contain a different set of algorithms). - Reports for subsets of tasks: Use e.g. `subset(ranking_bootstrapped, tasks=c("task1", "task2","task3")) %>% report(...)` to restrict report to tasks "task1", "task2","task3. You may want to recompute the consensus ranking before using `meanRanks=subset(ranking, tasks=c("task1", "task2", "task3"))%>%consensus(method = "euclidean")` #### Version 0.2.1 - Introduction in reports now mentions e.g. ranking method, number of test cases,... - Function `subset()` allows selection of tasks after bootstrapping, e.g. `subset(ranking_bootstrapped,1:3)` - `report()` functions gain argument `colors` (default: `default_colors`). Change e.g. 
to `colors=viridisLite::inferno` which "is designed in such a way that it will analytically be perfectly perceptually-uniform, both in regular form and also when converted to black-and-white. It is also designed to be perceived by readers with the most common form of color blindness." See package `viridis` for further similar functions. #### Version 0.2.0 - Improved layout in case of many algorithms and tasks (while probably still not perfect) - Consistent coloring of algorithms across figures - `report()` function can be applied to ranked object before bootstrapping (and thus excluding figures based on bootstrapping), i.e. in the example `ranking %>% report(...)` - bug fixes # Team The developer team includes members from both division of Computer Assisted Medical Interventions (CAMI) and Biostatistics at the German Cancer Research Center (DKFZ): - Manuel Wiesenfarth - Annette Kopp-Schneider - Annika Reinke - Matthias Eisenmann - Laura Aguilera Saiz - Elise Récéjac - Lena Maier-Hein # Reference Wiesenfarth, M., Reinke, A., Landman, B.A., Eisenmann, M., Aguilera Saiz, L., Cardoso, M.J., Maier-Hein, L. and Kopp-Schneider, A. Methods and open-source toolkit for analyzing and visualizing challenge results. *Sci Rep* **11**, 2369 (2021). https://doi.org/10.1038/s41598-021-82017-6 diff --git a/tests/testthat/test-challenge.R b/tests/testthat/test-challenge.R index b3c7ffe..16ff24d 100644 --- a/tests/testthat/test-challenge.R +++ b/tests/testthat/test-challenge.R @@ -1,596 +1,633 @@ # Copyright (c) German Cancer Research Center (DKFZ) # All rights reserved. # # This file is part of challengeR. # # challengeR is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 2 of the License, or # (at your option) any later version. # # challengeR is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with challengeR. If not, see . 
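# The tests below exercise as.challenge(): task name validation, attribute setup for single- and multi-task data sets, sanity checks for duplicate and missing cases, NA treatment, and factor conversion of the algorithm column.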
test_that("empty attribute 'taskName' raises error for single-task challenge", { data <- rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C1")) expect_error(as.challenge(data, taskName="", algorithm="algo", case="case", value="value", smallBetter=FALSE), "Argument 'taskName' is empty.", fixed=TRUE) }) test_that("only whitespaces in attribute 'taskName' raises error for single-task challenge", { data <- rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C1")) expect_error(as.challenge(data, taskName=" ", algorithm="algo", case="case", value="value", smallBetter=FALSE), "Argument 'taskName' is empty.", fixed=TRUE) }) test_that("attributes are set for single-task challenge with specified task name", { data <- rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C1")) actualChallenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE) expect_equal(attr(actualChallenge, "annotator"), NULL) expect_equal(attr(actualChallenge, "by"), "task") expect_equal(attr(actualChallenge, "smallBetter"), FALSE) expect_equal(attr(actualChallenge, "check"), TRUE) expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A2")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, 0.6)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C1")) expect_equal(as.vector(actualChallenge$T1$task), c("T1", "T1")) # expect that there's no attribute "task" expect_equal(attr(actualChallenge, "task"), NULL) expect_equal(attr(actualChallenge$T1, "task"), NULL) expect_equal(attr(actualChallenge$T2, "task"), NULL) }) test_that("attributes are set for single-task challenge with dummy task name", { data <- rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C1")) actualChallenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter=FALSE) expect_equal(attr(actualChallenge, "annotator"), NULL) expect_equal(attr(actualChallenge, "by"), "task") expect_equal(attr(actualChallenge, "smallBetter"), FALSE) expect_equal(attr(actualChallenge, "check"), TRUE) expect_equal(as.vector(actualChallenge$dummyTask$algo), c("A1", "A2")) expect_equal(as.vector(actualChallenge$dummyTask$value), c(0.8, 0.6)) expect_equal(as.vector(actualChallenge$dummyTask$case), c("C1", "C1")) expect_equal(as.vector(actualChallenge$dummyTask$task), c("dummyTask", "dummyTask")) # expect that there's no attribute "task" expect_equal(attr(actualChallenge, "task"), NULL) expect_equal(attr(actualChallenge$dummyTask, "task"), NULL) expect_equal(attr(actualChallenge$dummyTask, "task"), NULL) }) test_that("leading and trailing whitespaces are trimmed for attribute 'taskName'", { data <- rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C1")) actualChallenge <- as.challenge(data, taskName=" T1 ", algorithm="algo", case="case", value="value", smallBetter=FALSE) expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A2")) }) test_that("attributes are set for multi-task challenge", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C1") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=0.2, case="C1"), data.frame(algo="A2", value=0.3, case="C1") )) data <- rbind(dataTask1, dataTask2) actualChallenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=TRUE) 
expect_equal(attr(actualChallenge, "annotator"), NULL) expect_equal(attr(actualChallenge, "by"), "task") expect_equal(attr(actualChallenge, "smallBetter"), TRUE) expect_equal(attr(actualChallenge, "check"), TRUE) expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A2")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, 0.6)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C1")) expect_equal(as.vector(actualChallenge$T1$task), c("T1", "T1")) expect_equal(as.vector(actualChallenge$T2$algo), c("A1", "A2")) expect_equal(as.vector(actualChallenge$T2$value), c(0.2, 0.3)) expect_equal(as.vector(actualChallenge$T2$case), c("C1", "C1")) expect_equal(as.vector(actualChallenge$T2$task), c("T2", "T2")) # expect that there's no attribute "task" expect_equal(attr(actualChallenge, "task"), NULL) expect_equal(attr(actualChallenge$T1, "task"), NULL) expect_equal(attr(actualChallenge$T2, "task"), NULL) }) test_that("attributes are set for multi-task challenge with sanity check disabled", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C1") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=0.2, case="C1"), data.frame(algo="A2", value=0.3, case="C1") )) data <- rbind(dataTask1, dataTask2) actualChallenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=TRUE, check=FALSE) expect_equal(attr(actualChallenge, "annotator"), NULL) expect_equal(attr(actualChallenge, "by"), "task") expect_equal(attr(actualChallenge, "smallBetter"), TRUE) expect_equal(attr(actualChallenge, "check"), FALSE) expect_equal(as.vector(actualChallenge$algo), c("A1", "A2", "A1", "A2")) expect_equal(as.vector(actualChallenge$value), c(0.8, 0.6, 0.2, 0.3)) expect_equal(as.vector(actualChallenge$case), c("C1", "C1", "C1", "C1")) expect_equal(as.vector(actualChallenge$task), c("T1", "T1", "T2", "T2")) }) test_that("attribute 'taskName' is ignored for multi-task challenge", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C1") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=0.2, case="C1"), data.frame(algo="A2", value=0.3, case="C1") )) data <- rbind(dataTask1, dataTask2) expect_warning(as.challenge(data, taskName="T1", by="task", algorithm="algo", case="case", value="value", smallBetter=TRUE), "Argument 'taskName' is ignored for multi-task data set.", fixed=TRUE) }) test_that("missing algorithm performances are added as NAs for single-task challenge", { data <- rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C2")) expect_message(actualChallenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE), - "Performance of not all algorithms has been observed for all cases. 
Therefore, missings have been inserted in the following cases:", fixed=TRUE) + "Performance of not all algorithms has been observed for all cases.\nTherefore, missings have been inserted in the following cases:", fixed=TRUE) expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A1", "A2", "A2")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, NA, NA, 0.6)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C2", "C1", "C2")) expect_equal(as.vector(actualChallenge$T1$task), c("T1", "T1", "T1", "T1")) }) test_that("multi-task data set containing one task is interpreted as single-task data set, missing algorithm performances are added", { data <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C2") )) # do not specify parameter "by" to interpret multi-task data set as single-task data set expect_message(actualChallenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE), - "Performance of not all algorithms has been observed for all cases. Therefore, missings have been inserted in the following cases:", fixed=TRUE) + "Performance of not all algorithms has been observed for all cases.\nTherefore, missings have been inserted in the following cases:", fixed=TRUE) expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A1", "A2", "A2")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, NA, NA, 0.6)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C2", "C1", "C2")) }) test_that("missing algorithm performances are added as NAs for multi-task challenge (2 tasks in data set)", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C2") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=0.2, case="C1"), data.frame(algo="A1", value=0.3, case="C2"), data.frame(algo="A2", value=0.4, case="C1") )) data <- rbind(dataTask1, dataTask2) expect_message(actualChallenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE), - "Performance of not all algorithms has been observed for all cases in task 'T1'. 
Therefore, missings have been inserted in the following cases:", fixed=TRUE) + "Performance of not all algorithms has been observed for all cases in task 'T1'.\nTherefore, missings have been inserted in the following cases:", fixed=TRUE) expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A1", "A2", "A2")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, NA, NA, 0.6)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C2", "C1", "C2")) expect_equal(as.vector(actualChallenge$T1$task), c("T1", "T1", "T1", "T1")) expect_equal(as.vector(actualChallenge$T2$algo), c("A1", "A1", "A2", "A2")) expect_equal(as.vector(actualChallenge$T2$value), c(0.2, 0.3, 0.4, NA)) expect_equal(as.vector(actualChallenge$T2$case), c("C1", "C2", "C1", "C2")) expect_equal(as.vector(actualChallenge$T2$task), c("T2", "T2", "T2", "T2")) }) test_that("missing algorithm performances are not added as NA with sanity check disabled for single-task challenge", { data <- rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C2")) actualChallenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE, check=FALSE) expect_equal(as.vector(actualChallenge$algo), c("A1", "A2")) expect_equal(as.vector(actualChallenge$value), c(0.8, 0.6)) expect_equal(as.vector(actualChallenge$case), c("C1", "C2")) }) test_that("missing algorithm performances are not added as NA with sanity check disabled for multi-task challenge (2 tasks in data set)", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C2") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=0.2, case="C1"), data.frame(algo="A1", value=0.3, case="C2"), data.frame(algo="A2", value=0.4, case="C1") )) data <- rbind(dataTask1, dataTask2) actualChallenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE, check=FALSE) expect_equal(as.vector(actualChallenge$algo), c("A1", "A2", "A1", "A1", "A2")) expect_equal(as.vector(actualChallenge$value), c(0.8, 0.6, 0.2, 0.3, 0.4)) expect_equal(as.vector(actualChallenge$case), c("C1", "C2", "C1", "C2", "C1")) }) test_that("case cannot appear more than once per algorithm for single-task challenge", { data <- rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A1", value=0.8, case="C1")) expect_error(as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE), "The following case(s) appear(s) more than once for the same algorithm. Please revise. Or are you considering a multi-task challenge and forgot to specify argument 'by'?\nCase(s): C1", fixed=TRUE) }) test_that("multi-task data set containing one task is interpreted as single-task data set, case cannot appear more than once per algorithm", { data <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A1", value=0.8, case="C1") )) # do not specify parameter "by" to interpret multi-task data set as single-task data set expect_error(as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE), "The following case(s) appear(s) more than once for the same algorithm. Please revise. 
Or are you considering a multi-task challenge and forgot to specify argument 'by'?\nCase(s): C1", fixed=TRUE) }) test_that("case cannot appear more than once per algorithm for multi-task challenge (1 task in data set)", { data <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A1", value=0.8, case="C1") )) expect_error(as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE), "The following case(s) appear(s) more than once for the same algorithm. Please revise. Or are you considering a multi-task challenge and forgot to specify argument 'by'?\nCase(s): C1", fixed=TRUE) }) test_that("cases cannot appear more than once per algorithm for single-task challenge", { data <- rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.7, case="C1"), data.frame(algo="A1", value=0.5, case="C2"), data.frame(algo="A2", value=0.6, case="C2"), data.frame(algo="A2", value=0.6, case="C2")) expect_error(as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE), "The following case(s) appear(s) more than once for the same algorithm. Please revise. Or are you considering a multi-task challenge and forgot to specify argument 'by'?\nCase(s): C1, C2", fixed=TRUE) }) test_that("cases cannot appear more than once per algorithm for multi-task challenge (1 task in data set)", { data <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.7, case="C1"), data.frame(algo="A1", value=0.5, case="C2"), data.frame(algo="A2", value=0.6, case="C2"), data.frame(algo="A2", value=0.6, case="C2") )) expect_error(as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE), "The following case(s) appear(s) more than once for the same algorithm. Please revise. Or are you considering a multi-task challenge and forgot to specify argument 'by'?\nCase(s): C1, C2", fixed=TRUE) }) test_that("cases cannot appear more than once per algorithm for multi-task challenge (2 tasks in data set)", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1") # let T1 pass )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.7, case="C1"), data.frame(algo="A1", value=0.5, case="C2"), data.frame(algo="A2", value=0.6, case="C2"), data.frame(algo="A2", value=0.6, case="C2") )) data <- rbind(dataTask1, dataTask2) expect_error(as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE), "The following case(s) appear(s) more than once for the same algorithm in task 'T2'. Please revise.\nCase(s): C1, C2", fixed=TRUE) }) test_that("cases cannot appear more than once per algorithm when missing data was added for single-task challenge", { data <- rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C2"), data.frame(algo="A2", value=0.6, case="C2")) expect_error(as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE), "The following case(s) appear(s) more than once for the same algorithm. Please revise. 
Or are you considering a multi-task challenge and forgot to specify argument 'by'?\nCase(s): C1, C2", fixed=TRUE) }) test_that("user is notified of duplicate cases when multi-task data set is interpreted as single-task data set (2 tasks in data set)", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=0.8, case="C1") )) data <- rbind(dataTask1, dataTask2) # do not specify parameter "by" to interpret multi-task data set as single-task data set expect_error(as.challenge(data, taskName="New task", algorithm="algo", case="case", value="value", smallBetter=FALSE), "The following case(s) appear(s) more than once for the same algorithm. Please revise. Or are you considering a multi-task challenge and forgot to specify argument 'by'?\nCase(s): C1", fixed=TRUE) }) test_that("user is notified of missing algorithm performance when multi-task data set is interpreted as single-task data set (2 tasks in data set)", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A2", value=0.6, case="C2") )) data <- rbind(dataTask1, dataTask2) # do not specify parameter "by" to interpret multi-task data set as single-task data set expect_message(as.challenge(data, taskName="New task", algorithm="algo", case="case", value="value", smallBetter=FALSE), - "Performance of not all algorithms has been observed for all cases. Therefore, missings have been inserted in the following cases:", fixed=TRUE) + "Performance of not all algorithms has been observed for all cases.\nTherefore, missings have been inserted in the following cases:", fixed=TRUE) }) test_that("NAs are replaced by numeric value for single-task challenge", { data <- rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A1", value=NA, case="C2"), data.frame(algo="A2", value=0.6, case="C1"), data.frame(algo="A2", value=NA, case="C2")) actualChallenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat=0) expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A1", "A2", "A2")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, 0.0, 0.6, 0.0)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C2", "C1", "C2")) }) test_that("NAs are replaced by numeric value for multi-task challenge", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A1", value=NA, case="C2") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A2", value=NA, case="C1"), data.frame(algo="A2", value=0.5, case="C2") )) data <- rbind(dataTask1, dataTask2) actualChallenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat=0) expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A1")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, 0.0)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C2")) expect_equal(as.vector(actualChallenge$T2$algo), c("A2", "A2")) expect_equal(as.vector(actualChallenge$T2$value), c(0.0, 0.5)) expect_equal(as.vector(actualChallenge$T2$case), c("C1", "C2")) }) test_that("NAs are replaced by function value for single-task challenge", { data <- rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A1", value=NA, case="C2"), data.frame(algo="A2", value=0.6, case="C1"), data.frame(algo="A2", value=NA, case="C2")) replacementFunction <- function(x) { 2 } 
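# Constant replacement function passed via na.treat: every missing value is expected to be replaced by 2.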
actualChallenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat=replacementFunction) expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A1", "A2", "A2")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, 2.0, 0.6, 2.0)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C2", "C1", "C2")) }) test_that("NAs are replaced by function value for multi-task challenge", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A1", value=NA, case="C2") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A2", value=NA, case="C1"), data.frame(algo="A2", value=0.5, case="C2") )) data <- rbind(dataTask1, dataTask2) replacementFunction <- function(x) { 2 } actualChallenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat=replacementFunction) expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A1")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, 2.0)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C2")) expect_equal(as.vector(actualChallenge$T2$algo), c("A2", "A2")) expect_equal(as.vector(actualChallenge$T2$value), c(2.0, 0.5)) expect_equal(as.vector(actualChallenge$T2$case), c("C1", "C2")) }) test_that("NAs are removed for single-task challenge", { data <- rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A1", value=NA, case="C2"), data.frame(algo="A2", value=0.6, case="C1"), data.frame(algo="A2", value=NA, case="C2")) actualChallenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat="na.rm") expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A2")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, 0.6)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C1")) }) test_that("NAs are removed for multi-task challenge", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A1", value=NA, case="C2") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A2", value=NA, case="C1"), data.frame(algo="A2", value=0.5, case="C2") )) data <- rbind(dataTask1, dataTask2) actualChallenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat="na.rm") expect_equal(as.vector(actualChallenge$T1$algo), c("A1")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8)) expect_equal(as.vector(actualChallenge$T1$case), c("C1")) expect_equal(as.vector(actualChallenge$T2$algo), c("A2")) expect_equal(as.vector(actualChallenge$T2$value), c(0.5)) expect_equal(as.vector(actualChallenge$T2$case), c("C2")) }) test_that("automatically added NAs are replaced by numeric value for single-task challenge", { data <- rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C2")) expect_message(actualChallenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat=0), - "Performance of not all algorithms has been observed for all cases. 
Therefore, missings have been inserted in the following cases:", fixed=TRUE) + "Performance of not all algorithms has been observed for all cases.\nTherefore, missings have been inserted in the following cases:", fixed=TRUE) expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A1", "A2", "A2")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, 0.0, 0.0, 0.6)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C2", "C1", "C2")) }) test_that("automatically added NAs are replaced by numeric value for multi-task challenge", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C2") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=0.2, case="C1"), data.frame(algo="A1", value=0.3, case="C2"), data.frame(algo="A2", value=0.4, case="C1") )) data <- rbind(dataTask1, dataTask2) expect_message(actualChallenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat=0), - "Performance of not all algorithms has been observed for all cases in task 'T1'. Therefore, missings have been inserted in the following cases:", fixed=TRUE) + "Performance of not all algorithms has been observed for all cases in task 'T1'.\nTherefore, missings have been inserted in the following cases:", fixed=TRUE) expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A1", "A2", "A2")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, 0.0, 0.0, 0.6)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C2", "C1", "C2")) expect_equal(as.vector(actualChallenge$T2$algo), c("A1", "A1", "A2", "A2")) expect_equal(as.vector(actualChallenge$T2$value), c(0.2, 0.3, 0.4, 0.0)) expect_equal(as.vector(actualChallenge$T2$case), c("C1", "C2", "C1", "C2")) }) test_that("automatically added NAs are removed for single-task challenge", { data <- rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C2")) expect_message(actualChallenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat="na.rm"), - "Performance of not all algorithms has been observed for all cases. Therefore, missings have been inserted in the following cases:", fixed=TRUE) + "Performance of not all algorithms has been observed for all cases.\nTherefore, missings have been inserted in the following cases:", fixed=TRUE) expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A2")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, 0.6)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C2")) }) test_that("automatically added NAs are removed for multi-task challenge", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C2") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=0.2, case="C1"), data.frame(algo="A1", value=0.3, case="C2"), data.frame(algo="A2", value=0.4, case="C1") )) data <- rbind(dataTask1, dataTask2) expect_message(actualChallenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat="na.rm"), - "Performance of not all algorithms has been observed for all cases in task 'T1'. 
Therefore, missings have been inserted in the following cases:", fixed=TRUE) + "Performance of not all algorithms has been observed for all cases in task 'T1'.\nTherefore, missings have been inserted in the following cases:", fixed=TRUE) expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A2")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, 0.6)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C2")) expect_equal(as.vector(actualChallenge$T2$algo), c("A1", "A1", "A2")) expect_equal(as.vector(actualChallenge$T2$value), c(0.2, 0.3, 0.4)) expect_equal(as.vector(actualChallenge$T2$case), c("C1", "C2", "C1")) }) +test_that("class of 'algorithm' column must be 'factor' for single-task challenge", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A1", value=0.7, case="C2"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A2", value=0.5, case="C2")) + + actualChallenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat=0) + + expect_equal(class(actualChallenge$T1$algo), "factor") +}) + +test_that("class of 'algorithm' column must be 'factor' for multi-task challenge", { + dataTask1 <- cbind(task="T1", + rbind( + data.frame(algo="A1", value=0.81, case="C1"), + data.frame(algo="A2", value=0.72, case="C1"), + data.frame(algo="A1", value=0.65, case="C2"), + data.frame(algo="A2", value=0.95, case="C2") + )) + dataTask2 <- cbind(task="T2", + rbind( + data.frame(algo="A1", value=0.75, case="C1"), + data.frame(algo="A2", value=0.82, case="C1"), + data.frame(algo="A1", value=0.66, case="C2"), + data.frame(algo="A2", value=0.84, case="C2") + )) + + data <- rbind(dataTask1, dataTask2) + + actualChallenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=TRUE) + + expect_equal(class(actualChallenge$T1$algo), "factor") + expect_equal(class(actualChallenge$T2$algo), "factor") +}) + +
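A possible follow-up test (a sketch only, not part of this change set) could check that the conversion of the algorithm column to a factor also applies when the sanity check is disabled (`check=FALSE`):

```r
test_that("class of 'algorithm' column is 'factor' when sanity check is disabled", {
  data <- rbind(
    data.frame(algo="A1", value=0.8, case="C1"),
    data.frame(algo="A2", value=0.6, case="C1"))

  actualChallenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value",
                                  smallBetter=FALSE, check=FALSE)

  expect_equal(class(actualChallenge$algo), "factor")
})
```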