diff --git a/R/Bootstrap.R b/R/Bootstrap.R index eed0e84..f87f0be 100644 --- a/R/Bootstrap.R +++ b/R/Bootstrap.R @@ -1,223 +1,225 @@ # Copyright (c) German Cancer Research Center (DKFZ) # All rights reserved. # # This file is part of challengeR. # # challengeR is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 2 of the License, or # (at your option) any later version. # # challengeR is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with challengeR. If not, see . bootstrap <- function(object,...) UseMethod("bootstrap") bootstrap.default <- function(object, ...) stop("not implemented for this class") #' Performs bootstrapping #' #' Performs bootstrapping on a ranked assessment data set and applies the ranking method to each bootstrap sample. One bootstrap sample of #' a task with \code{n} cases consists of \code{n} cases randomly drawn with replacement from this task. #' A total of \code{nboot} of these bootstrap samples are drawn. #' +#' To ensure reproducibility, please use RNG kind = "L'Ecuyer-CMRG" in set.seed(), e.g. set.seed(1, kind = "L'Ecuyer-CMRG"). +#' #' @param object The ranked assessment data set. #' @param nboot The number of bootstrap samples. #' @param parallel A boolean specifying whether parallel processing should be enabled. #' @param progress A string specifying the type of progress indication. #' @param ... Further arguments passed to or from other functions. #' #' @return An S3 object of class "bootstrap.list" to represent a bootstrapped, ranked assessment data set. #' #' @examples #' #' \dontrun{ #' # perform bootstrapping with 1000 bootstrap samples using one CPU -#' set.seed(1) +#' set.seed(1, kind="L'Ecuyer-CMRG") #' ranking_bootstrapped <- bootstrap(ranking, nboot = 1000) #' } #' #' \dontrun{ #' # perform bootstrapping using multiple CPUs (here: 8 CPUs) #' library(doParallel) #' registerDoParallel(cores=8) -#' set.seed(1) +#' set.seed(1, kind="L'Ecuyer-CMRG") #' ranking_bootstrapped <- bootstrap(ranking, nboot = 1000, parallel = TRUE, progress = "none") #' stopImplicitCluster() #' } #' #' @export bootstrap.ranked.list=function(object, nboot, parallel=FALSE, progress="text", ...){ if (parallel & RNGkind()[1] != "L'Ecuyer-CMRG") { - warning("To ensure reproducibility please use kind = \"L'Ecuyer-CMRG\" in set.seed(), e.g. set.seed(1, kind = \"L'Ecuyer-CMRG\").") + warning("To ensure reproducibility, please use RNG kind = \"L'Ecuyer-CMRG\" in set.seed(), e.g. set.seed(1, kind = \"L'Ecuyer-CMRG\").") } algorithm=attr(object$data,"algorithm") by=attr(object$data,"case") # exclude if only 1 test case or only 1 algorithm tidy.data.id=sapply(object$data, function(data.subset) { ifelse((length(unique(data.subset[[by]]))==1 | length(unique(data.subset[[algorithm]]))<=1 ), yes=FALSE, no=TRUE) }) if (sum(tidy.data.id)==0) { if (length(object$matlist)>1) stop("All tasks only contained 1 test case. Bootstrapping with 1 test case not sensible.") else stop("Only 1 test case included. Bootstrapping with 1 test case not sensible.") } if (sum(tidy.data.id)%aggregateThenRank(FUN=median, ties.method="min") set.seed(1) expect_error(rankingBootstrapped <- ranking%>%bootstrap(nboot=10), "Only 1 test case included. 
Bootstrapping with 1 test case not sensible.", fixed = TRUE) }) test_that("multi-task bootstrapping, all tasks with 1 test case stopped with message", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C1") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=0.2, case="C1"), data.frame(algo="A2", value=0.3, case="C1") )) dataTask3 <- cbind(task="T3", rbind( data.frame(algo="A1", value=0.1, case="C1"), data.frame(algo="A2", value=0.8, case="C1") )) data <- rbind(dataTask1, dataTask2, dataTask3) challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE) ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") set.seed(1) expect_error(rankingBootstrapped <- ranking%>%bootstrap(nboot=10), "All tasks only contained 1 test case. Bootstrapping with 1 test case not sensible.", fixed = TRUE) }) test_that("multi-task bootstrapping, only one task with >1 test case continued with message", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C1") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=0.2, case="C1"), data.frame(algo="A2", value=0.3, case="C1"), data.frame(algo="A1", value=0.2, case="C2"), data.frame(algo="A2", value=0.3, case="C2") )) dataTask3 <- cbind(task="T3", rbind( data.frame(algo="A1", value=0.1, case="C1"), data.frame(algo="A2", value=0.8, case="C1") )) data <- rbind(dataTask1, dataTask2, dataTask3) challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE) ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") set.seed(1) expect_message(rankingBootstrapped <- ranking%>%bootstrap(nboot=3), "Task(s) T1, T3 with only 1 test case excluded from bootstrapping.", fixed = TRUE) }) test_that("two sequential bootstrappings yield same results", { data <- read.csv(system.file("extdata", "data_matrix.csv", package="challengeR", mustWork=TRUE)) challenge <- as.challenge(data, by="task", algorithm="alg_name", case="case", value="value", smallBetter=FALSE) ranking <- challenge%>%rankThenAggregate(FUN=mean, ties.method="min") set.seed(1) rankingBootstrapped1 <- ranking%>%bootstrap(nboot=10) set.seed(1) rankingBootstrapped2 <- ranking%>%bootstrap(nboot=10) expect_equal(rankingBootstrapped1, rankingBootstrapped2) }) test_that("two parallel bootstrappings yield same results", { data <- read.csv(system.file("extdata", "data_matrix.csv", package="challengeR", mustWork=TRUE)) challenge <- as.challenge(data, by="task", algorithm="alg_name", case="case", value="value", smallBetter=FALSE) ranking <- challenge%>%rankThenAggregate(FUN=mean, ties.method="min") library(doParallel) numCores <- detectCores(logical=FALSE) registerDoParallel(cores=numCores) set.seed(1, kind="L'Ecuyer-CMRG") rankingBootstrapped1 <- ranking%>%bootstrap(nboot=10, parallel=TRUE, progress="none") set.seed(1, kind="L'Ecuyer-CMRG") rankingBootstrapped2 <- ranking%>%bootstrap(nboot=10, parallel=TRUE, progress="none") stopImplicitCluster() expect_equal(rankingBootstrapped1, rankingBootstrapped2) }) test_that("parallel bootstrapping raises warning if RNG \"L'Ecuyer-CMRG\" is not used", { data <- read.csv(system.file("extdata", "data_matrix.csv", package="challengeR", mustWork=TRUE)) challenge <- as.challenge(data, by="task", algorithm="alg_name", case="case", value="value", smallBetter=FALSE) ranking <- 
challenge%>%rankThenAggregate(FUN=mean, ties.method="min") library(doParallel) numCores <- detectCores(logical=FALSE) registerDoParallel(cores=numCores) set.seed(1, kind="Super-Duper") expect_warning(rankingBootstrapped <- ranking%>%bootstrap(nboot=10, parallel=TRUE, progress="none"), - "To ensure reproducibility please use kind = \"L'Ecuyer-CMRG\" in set.seed(), e.g. set.seed(1, kind = \"L'Ecuyer-CMRG\").", fixed = TRUE) + "To ensure reproducibility, please use RNG kind = \"L'Ecuyer-CMRG\" in set.seed(), e.g. set.seed(1, kind = \"L'Ecuyer-CMRG\").", fixed = TRUE) stopImplicitCluster() }) diff --git a/vignettes/MultiTask_rank-then-aggregate.R b/vignettes/MultiTask_rank-then-aggregate.R index 7d97424..61f5fab 100644 --- a/vignettes/MultiTask_rank-then-aggregate.R +++ b/vignettes/MultiTask_rank-then-aggregate.R @@ -1,79 +1,79 @@ ## Multitask, rank-then-aggregate ranking ## 1\. Load package library(challengeR) ## 2\. Load data if (!requireNamespace("permute", quietly = TRUE)) install.packages("permute") n=50 set.seed(4) strip=runif(n,.9,1) c_ideal=cbind(task="c_ideal", rbind( data.frame(alg_name="A1",value=runif(n,.9,1),case=1:n), data.frame(alg_name="A2",value=runif(n,.8,.89),case=1:n), data.frame(alg_name="A3",value=runif(n,.7,.79),case=1:n), data.frame(alg_name="A4",value=runif(n,.6,.69),case=1:n), data.frame(alg_name="A5",value=runif(n,.5,.59),case=1:n) )) set.seed(1) c_random=data.frame(task="c_random", alg_name=factor(paste0("A",rep(1:5,each=n))), value=plogis(rnorm(5*n,1.5,1)),case=rep(1:n,times=5) ) strip2=seq(.8,1,length.out=5) a=permute::allPerms(1:5) c_worstcase=data.frame(task="c_worstcase", alg_name=c(t(a)), value=rep(strip2,nrow(a)), case=rep(1:nrow(a),each=5) ) c_worstcase=rbind(c_worstcase, data.frame(task="c_worstcase",alg_name=1:5,value=strip2,case=max(c_worstcase$case)+1) ) c_worstcase$alg_name=factor(c_worstcase$alg_name,labels=paste0("A",1:5)) data_matrix=rbind(c_ideal, c_random, c_worstcase) ## 3 Perform ranking ### 3.1 Define challenge object challenge=as.challenge(data_matrix, by="task", algorithm="alg_name", case="case", value="value", smallBetter = FALSE) ### 3.2 Perform ranking ranking=challenge%>%rankThenAggregate(FUN = mean, ties.method = "min" ) ## 4\. Perform bootstrapping library(doParallel) registerDoParallel(cores=8) -set.seed(1) +set.seed(1, kind="L'Ecuyer-CMRG") ranking_bootstrapped=ranking%>%bootstrap(nboot=1000, parallel=TRUE, progress = "none") stopImplicitCluster() ## 5\. Generate the report meanRanks=ranking%>%consensus(method = "euclidean") meanRanks # note that there may be ties (i.e. some algorithms have identical mean rank) ranking_bootstrapped %>% report(consensus=meanRanks, title="multiTaskChallengeExample", file = "MultiTask_rank-then-aggregate", format = "PDF", # format can be "PDF", "HTML" or "Word" latex_engine="pdflatex",#LaTeX engine for producing PDF output. Options are "pdflatex", "lualatex", and "xelatex" clean=TRUE #optional. Using TRUE will clean intermediate files that are created during rendering. ) diff --git a/vignettes/MultiTask_test-then-rank.R b/vignettes/MultiTask_test-then-rank.R index 554823c..55b4183 100644 --- a/vignettes/MultiTask_test-then-rank.R +++ b/vignettes/MultiTask_test-then-rank.R @@ -1,82 +1,82 @@ ## Multi-task, test-then-rank based on Wilcoxon signed rank ranking ## 1\. Load package library(challengeR) ## 2\. 
Load data if (!requireNamespace("permute", quietly = TRUE)) install.packages("permute") n=50 set.seed(4) strip=runif(n,.9,1) c_ideal=cbind(task="c_ideal", rbind( data.frame(alg_name="A1",value=runif(n,.9,1),case=1:n), data.frame(alg_name="A2",value=runif(n,.8,.89),case=1:n), data.frame(alg_name="A3",value=runif(n,.7,.79),case=1:n), data.frame(alg_name="A4",value=runif(n,.6,.69),case=1:n), data.frame(alg_name="A5",value=runif(n,.5,.59),case=1:n) )) set.seed(1) c_random=data.frame(task="c_random", alg_name=factor(paste0("A",rep(1:5,each=n))), value=plogis(rnorm(5*n,1.5,1)),case=rep(1:n,times=5) ) strip2=seq(.8,1,length.out=5) a=permute::allPerms(1:5) c_worstcase=data.frame(task="c_worstcase", alg_name=c(t(a)), value=rep(strip2,nrow(a)), case=rep(1:nrow(a),each=5) ) c_worstcase=rbind(c_worstcase, data.frame(task="c_worstcase",alg_name=1:5,value=strip2,case=max(c_worstcase$case)+1) ) c_worstcase$alg_name=factor(c_worstcase$alg_name,labels=paste0("A",1:5)) data_matrix=rbind(c_ideal, c_random, c_worstcase) ## 3 Perform ranking ### 3.1 Define challenge object challenge=as.challenge(data_matrix, by="task", algorithm="alg_name", case="case", value="value", smallBetter = FALSE) ### 3.2 Perform ranking #{r, eval=F, echo=T} ranking=challenge%>%testThenRank(alpha=0.05, p.adjust.method="none", na.treat=0, ties.method = "min" ) ## 4\. Perform bootstrapping library(doParallel) registerDoParallel(cores=8) -set.seed(1) +set.seed(1, kind="L'Ecuyer-CMRG") ranking_bootstrapped=ranking%>%bootstrap(nboot=1000, parallel=TRUE, progress = "none") stopImplicitCluster() ## 5\. Generate the report meanRanks=ranking%>%consensus(method = "euclidean") meanRanks # note that there may be ties (i.e. some algorithms have identical mean rank) ranking_bootstrapped %>% report(consensus=meanRanks, title="multiTaskChallengeExample", file = "MultiTask_test-then-rank", format = "PDF", # format can be "PDF", "HTML" or "Word" latex_engine="pdflatex",#LaTeX engine for producing PDF output. Options are "pdflatex", "lualatex", and "xelatex" clean=TRUE #optional. Using TRUE will clean intermediate files that are created during rendering. ) diff --git a/vignettes/SingleTask_aggregate-then-rank.R b/vignettes/SingleTask_aggregate-then-rank.R index c1004cd..c9dbeba 100644 --- a/vignettes/SingleTask_aggregate-then-rank.R +++ b/vignettes/SingleTask_aggregate-then-rank.R @@ -1,71 +1,71 @@ ## Single task, aggregate-then-rank ranking ## 1\. Load package library(challengeR) ## 2\. 
Load data if (!requireNamespace("permute", quietly = TRUE)) install.packages("permute") n=50 set.seed(4) strip=runif(n,.9,1) c_ideal=cbind(task="c_ideal", rbind( data.frame(alg_name="A1",value=runif(n,.9,1),case=1:n), data.frame(alg_name="A2",value=runif(n,.8,.89),case=1:n), data.frame(alg_name="A3",value=runif(n,.7,.79),case=1:n), data.frame(alg_name="A4",value=runif(n,.6,.69),case=1:n), data.frame(alg_name="A5",value=runif(n,.5,.59),case=1:n) )) set.seed(1) c_random=data.frame(task="c_random", alg_name=factor(paste0("A",rep(1:5,each=n))), value=plogis(rnorm(5*n,1.5,1)),case=rep(1:n,times=5) ) strip2=seq(.8,1,length.out=5) a=permute::allPerms(1:5) c_worstcase=data.frame(task="c_worstcase", alg_name=c(t(a)), value=rep(strip2,nrow(a)), case=rep(1:nrow(a),each=5) ) c_worstcase=rbind(c_worstcase, data.frame(task="c_worstcase",alg_name=1:5,value=strip2,case=max(c_worstcase$case)+1) ) c_worstcase$alg_name=factor(c_worstcase$alg_name,labels=paste0("A",1:5)) data_matrix=rbind(c_ideal, c_random, c_worstcase) ## 3 Perform ranking ### 3.1 Define challenge object dataSubset=subset(data_matrix, task=="c_random") challenge=as.challenge(dataSubset, algorithm="alg_name", case="case", value="value", smallBetter = FALSE) ### 3.2 Perform ranking ranking=challenge%>%aggregateThenRank(FUN = mean, na.treat=0, ties.method = "min") ## 4\. Perform bootstrapping library(doParallel) registerDoParallel(cores=8) -set.seed(1) +set.seed(1, kind="L'Ecuyer-CMRG") ranking_bootstrapped=ranking%>%bootstrap(nboot=1000, parallel=TRUE, progress = "none") stopImplicitCluster() ## 5\. Generate the report ranking_bootstrapped %>% report(title="singleTaskChallengeExample", # used for the title of the report file = "SingleTask_aggregate-then-rank", format = "PDF", # format can be "PDF", "HTML" or "Word" latex_engine="pdflatex", #LaTeX engine for producing PDF output. Options are "pdflatex", "lualatex", and "xelatex" clean=TRUE #optional. Using TRUE will clean intermediate files that are created during rendering. ) diff --git a/vignettes/visualizations.Rmd b/vignettes/visualizations.Rmd index 93c3796..5205c44 100644 --- a/vignettes/visualizations.Rmd +++ b/vignettes/visualizations.Rmd @@ -1,167 +1,167 @@ --- title: "Visualizations" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Visualizations} %\VignetteEncoding{UTF-8} %\VignetteEngine{knitr::rmarkdown} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` ```{r setup} library(challengeR) ``` The package offers an intuitive way to gain important insights into the relative and absolute performance of algorithms. It enables you to generate a benchmarking report that contains visualizations and respective explanations. An overview of all available visualizations is provided on this page demonstrating the use of their corresponding plot functions. This might be of interest if you want to generate the plots separately (e.g. to apply other styles). The provided plots are described in the following sections: * Visualizing assessment data * Visualizing ranking stability * Visualizing cross-task insights Details can be found in [Wiesenfarth et al. (2021)](https://rdcu.be/ceiaN). 
# Visualizing assessment data

```{r}
data <- read.csv(system.file("extdata", "data_matrix.csv", package = "challengeR", mustWork = TRUE))

challenge <- as.challenge(data, by = "task", algorithm = "alg_name", case = "case", value = "value", smallBetter = FALSE)

ranking <- challenge%>%aggregateThenRank(FUN = mean, ties.method = "min")
```

## Dot- and boxplots

Dot- and boxplots visualize the assessment data separately for each algorithm. Boxplots representing descriptive statistics for all test cases (median, quartiles and outliers) are combined with horizontally jittered dots representing individual test cases.

```{r boxplot}
boxplot(ranking)
```

## Podium plots

Upper part of the podium plot: Algorithms are color-coded, and each colored dot in the plot represents a performance value achieved with the respective algorithm. The actual value is encoded by the y-axis. Each podium (here: $p = 5$) represents one possible rank, ordered from best (1) to worst (here: 5). The assignment of values (i.e. colored dots) to one of the podiums is based on the rank that the respective algorithm achieved on the corresponding test case. Note that the plot part above each podium place is further subdivided into $p$ “columns”, where each column represents one algorithm. Dots corresponding to identical test cases are connected by a line, producing the spaghetti structure shown here.

Lower part: Bar charts represent the relative frequency at which each algorithm actually achieves the rank encoded by the podium place.

```{r podium,fig.width=5}
podium(ranking, layout.heights = c(.6, 0.4))
```

## Ranking heatmaps

In a ranking heatmap, each cell $\left( i, A_j \right)$ shows the absolute frequency of cases in which algorithm $A_j$ achieved rank $i$.

```{r rankingHeatmap}
rankingHeatmap(ranking)
```

# Visualizing ranking stability

The robustness of the ranking can be analyzed with respect to the ranking method used (see [Wiesenfarth et al. (2021)](https://rdcu.be/ceiaN) for different ranking methods).

## Line plots

Line plots visualize the robustness of the ranking across different ranking methods. Each algorithm is represented by one colored line. For each ranking method encoded on the x-axis, the height of the line represents the corresponding rank. Horizontal lines indicate identical ranks for all methods.

```{r lineplot, fig.width = 7}
methodsplot(challenge)
```

For a specific ranking method, the ranking stability can be investigated via bootstrapping and the testing approach. A ranking object containing the bootstrap samples has to be created first; it serves as the basis for the plots.

```{r bootstrapping, results = "hide"}
-set.seed(1)
+set.seed(1, kind="L'Ecuyer-CMRG")
rankingBootstrapped <- ranking%>%bootstrap(nboot = 1000)
```

## Blob plots

Blob plots for visualizing ranking stability are based on bootstrap sampling. Algorithms are color-coded, and the area of each blob at position $\left( A_i, \text{rank } j \right)$ is proportional to the relative frequency with which $A_i$ achieved rank $j$ (here across $b = 1000$ bootstrap samples). The median rank for each algorithm is indicated by a black cross. 95% bootstrap intervals across bootstrap samples (ranging from the 2.5th to the 97.5th percentile of the bootstrap distribution) are indicated by black lines.

```{r stabilityByTask1, fig.width = 7}
stabilityByTask(rankingBootstrapped)
```
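As a purely illustrative, self-contained sketch (independent of challengeR's internal data structures), the quantities encoded in a blob plot, namely the relative rank frequencies, the median rank and the 95% bootstrap interval, can be computed for one algorithm from a hypothetical vector of its ranks across 1000 bootstrap samples:

```r
# Illustrative only (not challengeR internals): blob-plot summaries for one
# algorithm, given a hypothetical vector of its ranks across 1000 bootstrap samples.
set.seed(1)
boot_ranks <- sample(1:5, size = 1000, replace = TRUE,
                     prob = c(0.5, 0.3, 0.1, 0.05, 0.05))

prop.table(table(boot_ranks))           # relative rank frequencies (blob areas)
median(boot_ranks)                      # median rank (black cross)
quantile(boot_ranks, c(0.025, 0.975))   # 95% bootstrap interval (black lines)
```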
## Violin plots

Violin plots provide a more condensed way to analyze bootstrap results. In these plots, the focus is on the comparison of the ranking list computed on the full assessment data and the ranking lists computed on the individual bootstrap samples. Kendall’s $\tau$ is chosen for this comparison as it has an upper and a lower bound (+1/-1). Kendall’s $\tau$ is computed for each pair of rankings, and a violin plot that simultaneously depicts a boxplot and a density plot is generated from the results.

```{r violin, results = "hide"}
violin(rankingBootstrapped)
```
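To make the comparison concrete, Kendall's $\tau$ between the ranking on the full assessment data and the ranking on a single bootstrap sample can be computed with base R via `cor(..., method = "kendall")`; challengeR performs this comparison for every bootstrap sample. The rank vectors below are purely hypothetical:

```r
# Illustrative only: Kendall's tau between the ranking on the full data and the
# ranking obtained on one (hypothetical) bootstrap sample.
full_ranks <- c(A1 = 1, A2 = 2, A3 = 3, A4 = 4, A5 = 5)
boot_ranks <- c(A1 = 1, A2 = 3, A3 = 2, A4 = 4, A5 = 5)  # A2 and A3 swapped

cor(full_ranks, boot_ranks, method = "kendall")  # 0.8: the rankings agree closely
```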
## Significance maps

Significance maps visualize ranking stability based on statistical significance. They depict incidence matrices of pairwise significant test results for the one-sided Wilcoxon signed rank test at a 5% significance level, with adjustment for multiple testing according to Holm. Yellow shading indicates that the performance values of the algorithm on the x-axis are significantly superior to those of the algorithm on the y-axis; blue shading indicates no significant difference.

```{r significanceMap}
significanceMap(ranking)
```

# Visualizing cross-task insights

For cross-task insights, a consensus ranking (rank aggregation across tasks) has to be given additionally. Here, the consensus ranking according to mean ranks across tasks is computed.

```{r}
meanRanks <- ranking%>%consensus(method = "euclidean")
```

With method="euclidean", the consensus ranking is given by the mean ranks across tasks; in case of ties (equal ranks for multiple algorithms), the average rank is used, i.e. ties.method="average".

## Characterization of algorithms

The primary goal of most multi-task challenges is to identify methods that consistently outperform competing algorithms across all tasks. We propose the following methods for analyzing this:

### Blob plots visualizing the ranking variability across tasks

Blob plots visualize the distribution of ranks across tasks. All ranks that an algorithm achieved in any task are displayed along the y-axis, with the area of the blob being proportional to the frequency. If all tasks provided the same stable ranking, narrow intervals around the diagonal would be expected. Consensus rankings above the algorithm names highlight the presence of ties.

```{r stability, fig.width = 5, fig.height = 4}
stability(ranking, ordering = names(meanRanks))
```

### Blob plots visualizing the ranking variability based on bootstrapping

This variant of the blob plot approach replaces the algorithms on the x-axis with the tasks and generates a separate plot for each algorithm. This allows assessing the variability of rankings for each algorithm across multiple tasks and bootstrap samples. Here, color coding is used for the tasks, and the separation by algorithm enables a relatively straightforward strengths-and-weaknesses analysis for individual methods.

```{r stabilityByAlgorithm1, fig.width = 7, fig.height = 5}
stabilityByAlgorithm(rankingBootstrapped, ordering = names(meanRanks))
```

### Stacked frequency plots visualizing the ranking variability based on bootstrapping

An alternative representation is provided by a stacked frequency plot of the observed ranks, separated by algorithm. Observed ranks across bootstrap samples are displayed with coloring according to the task. For algorithms that achieve the same rank in different tasks for the full assessment data set, the vertical lines are on top of each other. The vertical lines allow comparing the rank that each algorithm achieved across the different tasks.

```{r stabilityByAlgorithm2, fig.width = 7, fig.height = 5}
stabilityByAlgorithm(rankingBootstrapped, ordering = names(meanRanks), stacked = TRUE)
```

## Characterization of tasks

It may also be useful to structure the analysis around the different tasks. This section proposes visualizations for analyzing and comparing the tasks of a competition.

### Blob plots visualizing bootstrap results

Bootstrap results can be shown in a blob plot with one panel per task. Algorithms should be ordered according to the consensus ranking. In this view, the spread of the blobs for each algorithm can be compared across tasks. Deviations from the diagonal indicate deviations from the consensus ranking (over tasks). Specifically, if the rank distribution of an algorithm lies consistently below the diagonal, the algorithm performed better in this task than on average across tasks, while if it lies consistently above the diagonal, the algorithm performed worse in this task than on average across tasks. At the bottom of each panel, the ranks of each algorithm in the task are provided.

```{r stabilityByTask2, fig.width = 7, fig.height = 3.5}
stabilityByTask(rankingBootstrapped, ordering = names(meanRanks))
```

### Violin plots visualizing bootstrap results

To obtain a more condensed visualization, violin plots (see above) can be applied separately to all tasks. The overall stability of the rankings can then be compared by assessing the locations and lengths of the violins.

### Cluster analysis

There is increasing interest in assessing the similarity of tasks, e.g., for pre-training a machine learning algorithm. A potential approach is to compare the rankings obtained for the tasks of a challenge. Given that the same teams participate in all tasks, it may be of interest to cluster tasks into groups in which the rankings of algorithms are similar and to identify tasks that lead to very dissimilar rankings. To enable such an analysis, we propose generating a dendrogram from hierarchical cluster analysis. Here, it depicts clusters according to a chosen distance measure (Spearman’s footrule) as well as a chosen agglomeration method (complete agglomeration).

```{r dendrogram, fig.width = 7, fig.height = 3.5}
dendrogram(ranking)
```
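The dendrogram itself is produced by `dendrogram(ranking)`. The following self-contained sketch only illustrates the underlying idea (Spearman's footrule distance between task rankings followed by complete-linkage hierarchical clustering) on a small hypothetical rank matrix; it is not challengeR's implementation.

```r
# Illustrative only (not challengeR's implementation): cluster tasks by the
# similarity of their algorithm rankings.
# Hypothetical rank matrix: rows = algorithms, columns = tasks.
ranks <- cbind(T1 = c(1, 2, 3, 4, 5),
               T2 = c(1, 3, 2, 4, 5),
               T3 = c(5, 4, 3, 2, 1))

# Spearman's footrule: sum of absolute rank differences between two tasks.
footrule <- function(r1, r2) sum(abs(r1 - r2))
n <- ncol(ranks)
m <- matrix(0, n, n, dimnames = list(colnames(ranks), colnames(ranks)))
for (i in seq_len(n))
  for (j in seq_len(n))
    m[i, j] <- footrule(ranks[, i], ranks[, j])

# Complete-linkage hierarchical clustering, as used for the dendrogram above.
plot(hclust(as.dist(m), method = "complete"))
```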