diff --git a/DESCRIPTION b/DESCRIPTION index f66a29b..8baec69 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,33 +1,34 @@ Package: challengeR Type: Package Title: Analyzing assessment data of biomedical image analysis competitions and visualization of results Version: 0.3.3 Date: 2020-04-18 Author: Manuel Wiesenfarth, Annette Kopp-Schneider Maintainer: Manuel Wiesenfarth Description: Analyzing assessment data of biomedical image analysis competitions and visualization of results. License: GPL-3 Depends: R (>= 3.5.2), ggplot2 (>= 3.3.0), purrr (>= 0.3.3) Imports: dplyr (>= 0.8.5), graph (>= 1.64.0), knitr (>= 1.28), methods (>= 3.6.0), plyr (>= 1.8.6), relations (>= 0.6-9), reshape2 (>= 1.4.3), rlang (>= 0.4.5), rmarkdown (>= 2.1), tidyr (>= 1.0.2), viridisLite (>= 0.3.0) Suggests: doParallel (>= 1.0.15), foreach (>= 1.4.8), ggpubr (>= 0.2.5), Rgraphviz (>= 2.30.0), testthat (>= 2.1.0) VignetteBuilder: knitr +Roxygen: list(markdown = TRUE) RoxygenNote: 7.1.0 diff --git a/R/S3.R b/R/S3.R index 17f4a6e..5f22425 100644 --- a/R/S3.R +++ b/R/S3.R @@ -1,35 +1,26 @@ utils::globalVariables(c(".")) "+.ggList" <- function (e1, e2){ pp <- e1 if(is.ggplot(pp)) plotList <- list(pp) else if(is.list(pp)) plotList <- pp else stop("Can't handle an object of class ", class(pp)) - + for(i in 1:length(plotList)){ p <- plotList[[i]] if(is.ggplot(p)) plotList[[i]] <- p + e2 } - + if(is.ggplot(pp)) plotList[[1]] else plotList } "%++%" <- `+.ggList` print.ranked.list <-function(x,...) print(x$matlist, ...) print.aggregated.list <-function(x,...) print(x$matlist, ...) print.aggregated <-function(x,...) print(x$mat,...) print.ranked <-function(x,...) print(x$mat[order(x$mat$rank),],...) print.ggList <- function(x, ...) { for(i in 1:length(x)) print(x[[i]]) } - - - - - - - - - diff --git a/R/boxplot.R b/R/boxplot.R index e39f20c..dbad99d 100644 --- a/R/boxplot.R +++ b/R/boxplot.R @@ -1,68 +1,80 @@ - +#' Creates dot- and boxplots +#' +#' Creates dot- and boxplots visualizing the assessment data separately for each algorithm. +#' Boxplots representing descriptive statistics for all test cases (median, quartiles and outliers) +#' are combined with horizontally jittered dots representing individual test cases. +#' +#' @param x The ranked assessment data set. +#' @param color A string specifying the color of the dots. +#' @param jitter.width A numeric value specifying the jitter width of the dots. +#' @param ... Further arguments passed to or from other functions. 
+#' +#' @return +#' +#' @examples +#' +#' @seealso `browseVignettes("challengeR")` +#' +#' @family functions to visualize assessment data +#' @export boxplot.ranked.list=function(x, color="blue", jitter.width=0.25,...){ algo=attr(x$data,"algorithm") value=attr(x$data,"value") ranking=x x=x$data for (i in names(x)) { x[[i]][[algo]]=factor(x[[i]][[algo]], levels=rownames(ranking$matlist[[i]][order(ranking$matlist[[i]]$rank),])) } a=lapply(1:length(x),function(id){ ggplot(aes_string(algo,value),data=x[[id]])+ geom_jitter(position=position_jitter(width=jitter.width, height=0), color=color,...)+ geom_boxplot(outlier.shape = NA,fill=NA)+ ggtitle(names(x)[id]) + theme(axis.text.x=element_text(angle = -90, hjust = 0)) + xlab("Algorithm") + ylab("Metric value") }) # Remove title for single-task data set if (length(a) == 1) { a[[1]]$labels$title <- NULL } else { names(a) = names(x$matlist) } class(a) <- "ggList" a } - - - - - boxplot.comparedRanks.list=function(x,...){ tau=sapply(x,function(z) z$tau) boxplot(tau,ylim=c(0,1.0),las=2, outline=FALSE, ylab="Kendall's tau",...) stripchart(tau, vertical = TRUE, method = "jitter", pch = 21, col = "blue", add=TRUE,...) } - boxplot.bootstrap.list=function(x,...){ winner.noboot=winner.ranked.list(x) x2=winnerFrequencies(x) n.bootstraps= ncol(x$bootsrappedRanks[[1]]) perc_boot_Winner=lapply(1:length(x2),function(i){ x2.i=x2[[i]] winner.id=which(rownames(x2.i)%in%rownames(winner.noboot[[i]])) #could be multiple winners!!!! 100*x2.i[winner.id,3,drop=F]/n.bootstraps }) boxplot(unlist(perc_boot_Winner),ylim=c(0,100),las=2, outline=FALSE, ylab="% Bootstraps",xlab="Winner ranks 1", sub=paste(n.bootstraps,"Bootstraps"),...) stripchart(unlist(perc_boot_Winner), vertical = TRUE, method = "jitter", pch = 21, col = "blue", add=TRUE,...) } diff --git a/R/consensus.R b/R/consensus.R index 325b778..abe8b4d 100644 --- a/R/consensus.R +++ b/R/consensus.R @@ -1,15 +1,30 @@ +#' @export consensus <- function(object,...) UseMethod("consensus") + +#' @export consensus.default <- function(object, ...) stop("not implemented for this class") +#' Computes a consensus ranking +#' +#' Computes a consensus ranking (rank aggregation) across tasks. +#' +#' @param object The ranked asssessment data set. +#' @param method A string specifying the method to derive the consensus ranking, see [relations::consensus()] for the methods. +#' @param ... Further arguments passed to or from other functions. +#' +#' @return +#' +#' @examples +#' @export consensus.ranked.list=function(object, method, ...){ - relensemble= relation_ensemble(list = as.relation(object)) - cons=relation_consensus(relensemble, - method = method,...) # consensus ranking according to mean ranks across tasks if method="euclidean. + relensemble= relation_ensemble(list = as.relation(object)) + cons=relation_consensus(relensemble, + method = method,...) # consensus ranking according to mean ranks across tasks if method="euclidean". # See ?relation_consensus for different methods to derive consensus ranking res=sort(relation_scores(cons, decreasing=FALSE)) # note that there may be ties (i.e. some algorithms have identical mean rank) attr(res,"method")=method res } diff --git a/R/dendrogram.R b/R/dendrogram.R index 2afbf57..89ef5c3 100644 --- a/R/dendrogram.R +++ b/R/dendrogram.R @@ -1,25 +1,45 @@ +#' @export dendrogram <- function(object,...) UseMethod("dendrogram") + +#' @export dendrogram.default <- function(object, ...) 
stop("not implemented for this class") +#' Creates a cluster dendrogram +#' +#' Creates a cluster dendrogram from a ranked assessment data set. +#' +#' @param object The ranked assessment data set. +#' @param dist A string specifying the distance measure to be used, see [relations::dissimilarity()]. +#' @param method A string specifying agglomeration method to be used, see [stats::hclust()]. +#' @param ... Further arguments passed to or from other functions. +#' +#' @return +#' +#' @examples +#' +#' @seealso `browseVignettes("challengeR")` +#' +#' @family functions to visualize cross-task insights +#' @export dendrogram.ranked.list <- function(object, dist = "symdiff", #the distance measure to be used. see ?relation_dissimilarity method = "complete", #the agglomeration method to be used. see ?hclust ... # arguments passed to stats:::plot.hclust ){ relensemble=as.relation.ranked.list(object) - d <- relation_dissimilarity(relensemble, + d <- relation_dissimilarity(relensemble, method = dist) clust <- hclust(d, method=method) dots <- match.call(expand.dots = FALSE)$... if (is.null(dots$xlab)) dots$xlab <- "" if (is.null(dots$sub)) dots$sub <- "" if (is.null(dots$main)) dots$main <- paste0("Cluster Dendrogram (", method, " agglomeration)") do.call(plot, c(list(x = clust), dots) ) invisible(list(dist = d, hclust = clust )) } diff --git a/R/methodsplot.R b/R/methodsplot.R index c52cb6f..85b4113 100644 --- a/R/methodsplot.R +++ b/R/methodsplot.R @@ -1,108 +1,130 @@ +#' @export methodsplot <- function(x,...) UseMethod("methodsplot") + +#' @export methodsplot.default <- function(x, ...) stop("not implemented for this class") +#' Creates line plots +#' +#' Create line plots that visualize the robustness of ranking across different ranking methods from a challenge object. +#' +#' @param x The challenge object. +#' @param na.treat Indicates how missing perfomance values are treated if sanity check is enabled. It can be 'na.rm', numeric value or function. +#' For a numeric value or function, NAs will be replaced by the specified values. For 'na.rm', rows that contain missing values will be removed. +#' @param methods A list of ranking methods that should be incorporated. +#' @param ordering +#' @param ... Further arguments passed to or from other functions. +#' +#' @return +#' +#' @examples +#' +#' @seealso `browseVignettes("challengeR")` +#' +#' @family functions to visualize ranking stability +#' @export methodsplot.challenge=function(x, na.treat=NULL, methods=list(testBased=.%>%test() %>% rank(ties.method = "min"), meanThenRank= .%>% aggregate( FUN="mean") %>% rank(ties.method = "min"), medianThenRank=.%>% aggregate( FUN="median") %>% rank(ties.method = "min"), rankThenMean= .%>%rank(ties.method = "min") %>% aggregate( FUN="mean") %>%rank(ties.method = "min"), rankThenMedian=.%>%rank(ties.method = "min") %>% aggregate( FUN="median") %>%rank(ties.method = "min") ), ordering, ...) 
{ if (any(sapply(x, function(task) any(is.na(task[,attr(x, "value")]))))) { # only if missings present, else do nothing if (is.null(na.treat)) { warning("Please specify na.treat in as.challenge()") return(NULL) } else { xx = melt(x, id.vars=c(attr(x,"value"), attr(x,"algorithm") , attr(x,"case"), attr(x,"annotator"), attr(x,"by") )) x=as.challenge(xx, value=attr(x,"value"), algorithm=attr(x,"algorithm") , case=attr(x,"case"), by=attr(x,"by"), annotator = attr(x,"annotator"), smallBetter = attr(x,"smallBetter"), na.treat=na.treat) } } a=lapply(methods,function(fun) fun(x)) dat=melt(a,measure.vars="rank") colnames(dat)[4:5]=c("task","rankingMethod") if (missing(ordering)){ lev=sort(unique(dat$algorithm)) lab=lev } else { lev=ordering lab=lev } dat=dat%>% dplyr::rename(rank=.data$value)%>% mutate(rank=factor(.data$rank))%>% mutate(task=factor(.data$task))%>% mutate(algorithm=factor(.data$algorithm, levels=lev,labels = lab)) linePlot <- ggplot(data = dat) + aes(x = rankingMethod, y = rank, color=algorithm, group=algorithm ) + geom_line(size=1)+ xlab("Ranking method") + ylab("Rank")+ theme( strip.placement = "outside", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1) ) # Create multi-panel plot with task names as titles for multi-task data set if (length(x) > 1) { linePlot <- linePlot + facet_wrap(~ task) } return(linePlot) } # methodsplot.ranked.list does not exist, use methodpsplot.challenge instead since consonsus ranking needed for ordering (or alphabetical ordering instead) #similar plot to methods plot, instead of across ranking methods across tasks lineplot <- function(x,...) UseMethod("lineplot") lineplot.default <- function(x, ...) stop("not implemented for this class") lineplot.challenge=function(x, ordering,...){ if (inherits(x,"list")) { dat=melt(x,measure.vars="rank") colnames(dat)[4]=c("task") if (missing(ordering)){ lev=sort(unique(dat$algorithm)) lab=lev } else { lev=ordering lab=paste(1:length(ordering),ordering) } dat=dat%>% dplyr::rename(rank=.data$value)%>% mutate(rank=factor(.data$rank))%>% mutate(task=factor(.data$task))%>% mutate(algorithm=factor(.data$algorithm, levels=lev,labels = lab)) ggplot(data = dat) + aes(x = task, y = rank, color=algorithm, group=algorithm ) + geom_line(size=1)+ theme( axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1) ) } else stop("Only applicable to multiple tasks") } diff --git a/R/podium.R b/R/podium.R index e5d769d..4a9708d 100644 --- a/R/podium.R +++ b/R/podium.R @@ -1,163 +1,221 @@ +#' @export podium <- function(object,...) UseMethod("podium") + +#' @export podium.default <- function(object, ...) stop("not implemented for this class") +#' Creates podium plots +#' +#' Creates podium plots from one or more ranked assessment data sets. +#' +#' @param object The ranked asssessment data set. +#' @param xlab A string specifying the x-axis label. +#' @param ylab A string specifying the y-axis label. +#' @param lines.show +#' @param lines.alpha +#' @param lines.lwd +#' @param col +#' @param lines.col +#' @param dots.pch +#' @param dots.cex +#' @param places.lty +#' @param places.col +#' @param legendfn +#' @param layout.heights +#' @param ... Further arguments passed to or from other functions. 
+#' +#' @return +#' +#' @examples +#' +#' @seealso `browseVignettes("challengeR")` +#' +#' @family functions to visualize assessment data +#' @export podium.ranked.list=function(object, - xlab = NULL, - ylab = NULL, + xlab = "Podium", + ylab = "Performance", lines.show = TRUE, lines.alpha = 0.2, lines.lwd = 1, col, lines.col = col, dots.pch = 19, dots.cex = 1, places.lty = 2, places.col = 1, legendfn = function(algs, cols) { legend("topleft", algs, lwd = 1, col = cols, bg = "white") }, layout.heights=c(1,0.4), ...){ - if (is.null(xlab)) xlab <- "Podium" - if (is.null(ylab)) ylab <- "Performance" x=object$data podiumPlots <- length(names(x)) for (subt in names(x)) { ordering=t(object$matlist[[subt]][,"rank",drop=F])["rank",] if (missing(col)) col=default_colors(length(ordering), algorithms = names(ordering)) dd=as.challenge(x[[subt]], value=attr(x,"value"), algorithm=attr(x,"algorithm"), case=attr(x,"case"), by=attr(x, "by"), annotator = attr(x,"annotator"), smallBetter = attr(x,"smallBetter"), na.treat=object$call[[1]][[1]]$na.treat) podiumPlot <- podium(dd, ordering=ordering, xlab = xlab, ylab = ylab, lines.show = lines.show, lines.alpha = lines.alpha, lines.lwd = lines.lwd, col=col, lines.col = lines.col, dots.pch = dots.pch, dots.cex = dots.cex, places.lty = places.lty, places.col = places.col, legendfn = legendfn, layout.heights=layout.heights, ...) if (length(names(x)) > 1) { title(subt,outer=T,line=-3) } append(podiumPlots, podiumPlot) } } +#' Creates a podium plot +#' +#' Creates a podium plot from a challenge object. +#' +#' @param object The challenge object. +#' @param ordering +#' @param xlab A string specifying the x-axis label. +#' @param ylab A string specifying the y-axis label. +#' @param lines.show +#' @param lines.alpha +#' @param lines.lwd +#' @param col +#' @param lines.col +#' @param dots.pch +#' @param dots.cex +#' @param places.lty +#' @param places.col +#' @param legendfn +#' @param layout.heights +#' @param ... Further arguments passed to or from other functions. +#' +#' @return +#' +#' @examples +#' +#' @seealso `browseVignettes("challengeR")` +#' +#' @family functions to visualize assessment data +#' @export podium.challenge=function(object, ordering, xlab = NULL, ylab = NULL, lines.show = FALSE, lines.alpha = 0.2, lines.lwd = 1, col,lines.col = col, dots.pch = 19, dots.cex = 1, places.lty = 2, places.col = 1, legendfn = function(algs, cols) { legend("topleft", algs, lwd = 1, col = cols, bg = "white") }, layout.heights=c(1,0.4), ...) 
{ ranking=object%>%rank( ties.method = "random" ) task <- ranking$matlist[[1]] dat=as.data.frame(table(task[[attr(object, "algorithm")]], task$rank, dnn=c("algorithm","rank")), responseName = "Count") form=as.formula(paste(attr(object,"case"), attr(object,"algorithm"), sep="~")) ranks=acast(task, form, value.var="rank") values=acast(task, form, value.var=attr(object, "value")) nranks=acast(dat, algorithm~rank, value.var="Count") nalgs <- ncol(ranks) algs <- colnames(ranks) barorder <- order(ordering) orderedAlgorithms= names(ordering)[barorder] ylim=range(task[[attr(object,"value")]], na.rm = TRUE) dotplotborders <- (0:nalgs) * nalgs dotplaces <- (1:nalgs) - 0.5 names(dotplaces) <- orderedAlgorithms linecols <- sapply(lines.col, function(c) { r <- col2rgb(c) rgb(r[1], r[2], r[3], alpha = round(255 * lines.alpha), maxColorValue = 255) }) opar <- par(no.readonly = TRUE) layout(matrix(c(1, 2), nrow = 2, byrow = TRUE), heights =layout.heights) mar <- par("mar") par(mar = c(0, mar[2], mar[3], mar[4])) plot(dotplotborders, rep(ylim[2], nalgs + 1), type = "n", ylim = ylim, ylab = ylab, xlab = "", axes = F) axis(1, at = dotplotborders, labels = NA, lwd = par("lwd")) axis(2, lwd = par("lwd")) box() abline(v = dotplotborders, lty = places.lty, col = places.col) linesegments <- function(x, y, ...) { n <- length(x) segments(x[-n], y[-n], x[-1], y[-1], ...) } drawthe <- function(fn, col, ...) { for (i in 1:nrow(values)) { r <- ranks[i, ] o <- order(r) performances <- (values[i, ])[o] places <- (dotplaces[names(r)] + ((r - 1) * nalgs))[o] fn(places, performances, col = col[names(r)[o]], ...) } } if (lines.show) drawthe(linesegments, linecols, lwd = lines.lwd) drawthe(points, col, pch = dots.pch, cex = dots.cex) legendfn(orderedAlgorithms, col[orderedAlgorithms]) par(mar = c(mar[1], mar[2], 0, mar[4])) barplot(nranks[barorder,], beside = TRUE, width = 1, axes = F, space = c(0, 0), border = NA, ylim = c(0, nrow(ranks)), names.arg = paste(1:nalgs, ".", sep = ""), col = col[orderedAlgorithms], xlab = xlab) axis(1, at = c(0, dotplotborders), labels = NA, lwd = par("lwd")) box() par(opar) } diff --git a/R/rankingHeatmap.R b/R/rankingHeatmap.R index 82bfbcf..6c71d59 100644 --- a/R/rankingHeatmap.R +++ b/R/rankingHeatmap.R @@ -1,63 +1,98 @@ +#' @export rankingHeatmap <- function(x,...) UseMethod("rankingHeatmap") + +#' @export rankingHeatmap.default <- function(x, ...) stop("not implemented for this class") +#' Creates ranking heatmaps +#' +#' Creates ranking heatmaps from one or more ranked assessment data sets. +#' +#' @param x The ranked asssessment data set. +#' @param ties.method A string specifying how ties are treated, see [base::rank()]. +#' @param ... Further arguments passed to or from other functions. +#' +#' @return +#' +#' @examples +#' +#' @seealso `browseVignettes("challengeR")` +#' +#' @family functions to visualize assessment data +#' @export rankingHeatmap.ranked.list=function (x,ties.method="min",...) { xx=x$data a=lapply(names(x$matlist),function(subt){ ordering=rownames(x$matlist[[subt]])[order(x$matlist[[subt]]$rank)] dd=as.challenge(xx[[subt]], value=attr(xx,"value"), algorithm=attr(xx,"algorithm") , case=attr(xx,"case"), by=attr(xx, "by"), annotator = attr(xx,"annotator"), smallBetter = attr(xx,"smallBetter"), na.treat=x$call[[1]][[1]]$na.treat) rankingHeatmap(dd, ordering=ordering, ties.method=ties.method,...) 
+ ggtitle(subt) }) # Remove title for single-task data set if (length(a) == 1) { a[[1]]$labels$title <- NULL } else { names(a) = names(x$matlist) } class(a) <- "ggList" a } - +#' Creates a ranking heatmap +#' +#' Creates a ranking heatmap from a challenge object. +#' +#' @param x The challenge object. +#' @param ordering +#' @param ties.method A string specifying how ties are treated, see [base::rank()]. +#' @param ... Further arguments passed to or from other functions. +#' +#' @return +#' +#' @examples +#' +#' @seealso `browseVignettes("challengeR")` +#' +#' @family functions to visualize assessment data +#' @export rankingHeatmap.challenge=function(x, ordering, ties.method="min",...) { ranking=x%>%rank( ties.method = ties.method ) task <- ranking$matlist[[1]] dat=as.data.frame(table(task[[attr(x,"algorithm")]], task$rank, dnn=c("algorithm","rank")), responseName = "Count") dat$algorithm=factor(dat$algorithm, levels=ordering) ncases=length(unique(task[[attr(x,"case")]])) ggplot(dat)+ geom_raster(aes(algorithm, rank, fill= Count))+ geom_hline(yintercept = seq(1.5,max(task$rank)-.5,by=1), color=grey(.8),size=.3)+ geom_vline(xintercept = seq(1.5,length(unique(dat$algorithm))-.5,by=1), color=grey(.8),size=.3)+ scale_fill_viridis_c(direction = -1, limits=c(0,ncases) )+ theme(axis.text.x = element_text(angle = 90), aspect.ratio=1)+ xlab("Algorithm")+ ylab("Rank") } diff --git a/R/wrapper.R b/R/rankingMethods.R similarity index 96% rename from R/wrapper.R rename to R/rankingMethods.R index bef8971..328a09e 100644 --- a/R/wrapper.R +++ b/R/rankingMethods.R @@ -1,84 +1,84 @@ #' Performs ranking via aggregate-then-rank #' #' Performs ranking by first aggregating performance values across all cases (e.g., with the mean, median or another quantile) for each algorithm. #' This aggregate is then used to compute a rank for each algorithm. #' #' @param object The challenge object. #' @param FUN The aggregation function, e.g. mean, median, min, max, function(x), quantile(x, probs=0.05). -#' @param ties.method A string specifying how ties are treated, see \code{\link{base::rank}}. +#' @param ties.method A string specifying how ties are treated, see [base::rank()]. #' @param ... Further arguments passed to or from other functions. #' #' @return An S3 object of class "ranked.list" to represent a ranked assessment data set. #' #' @examples #' #' \dontrun{ #' aggregateThenRank(challenge, FUN = mean, ties.method = "average", na.treat = 0) #' } #' #' @family ranking functions #' @export aggregateThenRank=function(object,FUN,ties.method = "min",...){ object %>% aggregate(FUN=FUN,...) %>% rank(ties.method = ties.method) } #' Performs ranking via test-then-rank #' #' Computes statistical hypothesis tests based on Wilcoxon signed rank test for each possible #' pair of algorithms to assess differences in metric values between the algorithms. #' Then ranking is performed according to the number of significant one-sided test results. #' If algorithms have the same number of significant test results, then they obtain the same rank. #' #' @param object The challenge object. -#' @param ties.method A string specifying how ties are treated, see \code{\link{base::rank}}. +#' @param ties.method A string specifying how ties are treated, see [base::rank()]. #' @param ... Further arguments passed to or from other functions. #' #' @return An S3 object of class "ranked.list" to represent a ranked assessment data set. 
#' #' @examples #' \dontrun{ #' testThenRank(challenge, #' alpha=0.05, # significance level #' p.adjust.method="none", # method for adjustment for multiple testing, see ?p.adjust #' na.treat = 0) #' } #' #' @family ranking functions #' @export testThenRank=function(object, ties.method = "min",...){ object %>% aggregate(FUN="significance",...) %>% rank(ties.method = ties.method) } #' Performs ranking via rank-then-aggregate #' -#' Performs ranking by first computing a rank for each case for each algorithm (”rank first”). +#' Performs ranking by first computing a rank for each case for each algorithm ("rank first"). #' The final rank is based on the aggregated ranks for the cases. This ranking method handles missing values implicitly #' by assigning the worst rank to missing algorithm performances. #' #' #' @param object The challenge object. #' @param FUN The aggregation function, e.g., mean, median, min, max, function(x), quantile(x, probs=0.05). -#' @param ties.method A string specifying how ties are treated, see \code{\link{base::rank}}. +#' @param ties.method A string specifying how ties are treated, see [base::rank()]. #' #' @return An S3 object of class "ranked.list" to represent a ranked assessment data set. #' #' @examples #' \dontrun{ #' rankThenAggregate(challenge, FUN = mean) #' } #' #' @family ranking functions #' @export rankThenAggregate=function(object, FUN, ties.method = "min" ){ object %>% rank(ties.method = ties.method)%>% aggregate(FUN=FUN) %>% rank(ties.method = ties.method) # small rank is always best, i.e. smallBetter always TRUE } diff --git a/R/report.R b/R/report.R index c9d8f05..8f8467a 100644 --- a/R/report.R +++ b/R/report.R @@ -1,165 +1,167 @@ +#' @export report <- function(object,...) UseMethod("report") -report.default <- function(object, ...) stop("not implemented for this class") +#' @export +report.default <- function(object, ...) stop("not implemented for this class") #' Generates a benchmarking report with bootstrapping results #' #' Generates a benchmarking report in PDF, HTML or Word format with bootstrapping results. #' It contains the rankings, plots of the raw assessment data and plots of the ranking stability. #' For multi-task challenges, it also contains plots of cross-task insights. If you are interested in #' the individual plots as separate files, set argument \code{clean} to \code{FALSE} and specify \code{fig.format}. #' #' @param object The ranked (bootstrapped) assessment data set. #' @param consensus The rank aggregation across tasks (consensus ranking). Only needed for a multi-task data set. #' @param file A string specifying the file name of the report. It allows for specifying the output file path as well, #' otherwise the working directory is used. If \code{file} does not have a file extension, an extension will be automatically #' added according to the output format given in \code{format}. If the argument is omitted, the report is created in a #' temporary folder with file name "report". #' @param title A string specifying the title of the report. #' @param colors The color scheme that is applied to the plots. #' @param format A string specifying the format of the report. The options are "PDF", "HTML" or "Word". #' @param latex_engine A string specifying the LaTeX engine for producing PDF output. The Options are "pdflatex", "lualatex", and "xelatex". #' @param clean A boolean indicating whether intermediate files (e.g. individual plots) should be kept. 
Using \code{TRUE} will clean #' intermediate files that are created during rendering. #' @param fig.format A vector of strings containing the file format of the figures that are not removed if \code{clean} is set to \code{FALSE}. -#' The options are "jpeg", "png" and "pdf", e.g. \code{fig.format = c("jpeg", "png", "pdf")}. +#' The options are "jpeg", "png" and "pdf", e.g. \code{fig.format = c("jpeg", "png", "pdf")}. #' @param dpi A positive integer specifying the resolution of the generated plot (\code{fig.format} "jpeg" or "png") in dots per inch (DPI). #' @param open A boolean specifying whether the report should be opened with the default system viewer after generation. #' @param ... Further arguments passed to or from other functions. #' #' @return #' #' @examples #' @export report.bootstrap.list=function(object, consensus, file, title="", colors=default_colors, format="PDF", latex_engine="pdflatex", clean=TRUE, fig.format = NULL, # file format of figures if clean==FALSE, can be vector, e.g. fig.format=c('jpeg','png', 'pdf') dpi = 150, # DPI, relevant for bitmaps if clean==FALSE and fig.format specified open=TRUE,...){ # Copy the report file to a temporary directory before processing it, in # case we don't have write permissions to the current working dir (which # can happen when deployed). if (missing(file)) tempReport <- file.path(tempdir(), "report.Rmd") else { a=strsplit(file,"/")[[1]] path=paste0(a[-length(a)],collapse="/") if (path=="") tempReport=file.path(paste0(strsplit(a[length(a)], ".", fixed=T)[[1]][1],".Rmd")) else tempReport=file.path(path,paste0(strsplit(a[length(a)], ".", fixed=T)[[1]][1],".Rmd")) } file.copy(file.path(system.file("appdir", package = "challengeR"), - "reportMultiple.Rmd"), + "report.Rmd"), tempReport, overwrite = TRUE) if (length(object$matlist) > 1) { consensus = consensus isMultiTask = TRUE } else { consensus = NULL isMultiTask = FALSE } bootstrappingEnabled = TRUE if (is(object, "ranked.list")) { bootstrappingEnabled = FALSE } # Set up parameters to pass to Rmd document if (!is.null(fig.format) & format=="PDF") fig.format=c("pdf",fig.format) if (!is.null(fig.format) && fig.format[1]=="pdf" && format=="Word") fig.format <- c(fig.format[-1], fig.format[1]) # in word avoid use of pdf to be embedded in document params <- list( object=object, consensus=consensus, name=title, colors=colors, isMultiTask=isMultiTask, bootstrappingEnabled=bootstrappingEnabled, fig.format = fig.format, dpi = dpi ) # Knit the document, passing in the `params` list, and eval it in a # child of the global environment (this isolates the code in the document # from the code in this app). out <- render(tempReport, switch( format, PDF = pdf_document(number_sections=T, latex_engine=latex_engine), HTML = html_document(number_sections=T), Word = word_document(df_print="kable") ), params = params, envir = new.env(parent = globalenv()), clean = clean, ... ) if (!missing(file)){ if (is.na(strsplit(file,".",fixed=T)[[1]][2])) file=paste0(file, ".", strsplit(out,".",fixed=T)[[1]][2]) file.rename(out, file) } else file=out file.remove(tempReport) if (open) system(paste0('open "', file, '"')) } #' Generates a benchmarking report without bootstrapping results #' #' Generates a benchmarking report in PDF, HTML or Word format without bootstrapping results. #' It contains the rankings, plots of the raw assessment data and plots of the ranking stability. #' For multi-task challenges, it also contains plots of cross-task insights. 
If you are interested in #' the individual plots as separate files, set argument \code{clean} to \code{FALSE} and specify \code{fig.format}. #' #' @param object The ranked assessment data set. #' @param consensus The rank aggregation across tasks (consensus ranking). Only needed for a multi-task data set. #' @param file A string specifying the file name of the report. It allows for specifying the output file path as well, #' otherwise the working directory is used. If \code{file} does not have a file extension, an extension will be automatically #' added according to the output format given in \code{format}. If the argument is omitted, the report is created in a #' temporary folder with file name "report". #' @param title A string specifying the title of the report. #' @param colors The color scheme that is applied to the plots. #' @param format A string specifying the format of the report. The options are "PDF", "HTML" or "Word". #' @param latex_engine A string specifying the LaTeX engine for producing PDF output. The options are "pdflatex", "lualatex", and "xelatex". #' @param clean A boolean indicating whether intermediate files (e.g. individual plots) should be kept. Using \code{TRUE} will clean #' intermediate files that are created during rendering. #' @param fig.format A vector of strings containing the file format of the figures that are not removed if \code{clean} is set to \code{FALSE}. #' The options are "jpeg", "png" and "pdf", e.g. \code{fig.format = c("jpeg", "png", "pdf")}. #' @param dpi A positive integer specifying the resolution of the generated plot (\code{fig.format} "jpeg" or "png") in dots per inch (DPI). #' @param open A boolean specifying whether the report should be opened with the default system viewer after generation. #' @param ... Further arguments passed to or from other functions. #' #' @return #' #' @examples #' @export report.ranked.list=function(object, consensus, file, title="", colors=default_colors, format="PDF", latex_engine="pdflatex", clean=TRUE, fig.format = NULL, # file format of figures if clean=FALSE, can be vector, e.g. fig.format=c('jpeg','png', 'pdf') dpi = 150, # DPI, relevant for bitmaps if clean==FALSE and fig.format specified open=TRUE, ...){ report.bootstrap.list(object, consensus, file, title, colors, format, latex_engine, clean, fig.format, dpi, open, ...) } diff --git a/R/significancePlot.R b/R/significanceMap.R similarity index 89% rename from R/significancePlot.R rename to R/significanceMap.R index ce7d599..d3fab6b 100644 --- a/R/significancePlot.R +++ b/R/significanceMap.R @@ -1,151 +1,173 @@ +#' @export significanceMap <- function(object,...) UseMethod("significanceMap") + +#' @export significanceMap.default <- function(object, ...) stop("not implemented for this class") +#' Creates significance maps +#' +#' Creates significance maps from a ranked assessment data set. +#' +#' @param object The ranked assessment data set. +#' @param alpha A numeric value specifying the significance level. +#' @param p.adjust.method A string specifying the adjustment method for multiple testing, see [stats::p.adjust()]. +#' @param order A boolean specifying whether algorithms should be reordered according to the pairwise significance results instead of their rank. +#' @param size.rank A numeric value specifying the text size of the rank labels. +#' @param ... Further arguments passed to or from other functions.
+#' +#' @return +#' +#' @examples +#' +#' @seealso `browseVignettes("challengeR")` +#' +#' @family functions to visualize ranking stability +#' @export significanceMap.ranked.list=function(object, alpha=0.05,p.adjust.method="holm", order=FALSE, size.rank=.3*theme_get()$text$size,...){ a=object$data%>%decision.challenge(na.treat=object$call[[1]][[1]]$na.treat, alpha=alpha, p.adjust.method=p.adjust.method) aa=lapply(a, as.relation.challenge.incidence) names(aa)=names(object$data) relensemble= do.call(relation_ensemble,args = aa) res=list() for (task in names(object$data)){ res[[task]]=significanceMap.data.frame(object=object$matlist[[task]], relation_object=relensemble[[task]], order=order, size.rank=size.rank,... ) + ggtitle(task) } # Remove title for single-task data set if (length(res) == 1) { res[[1]]$labels$title <- NULL } else { names(res) = names(object$matlist) - + } class(res) <- "ggList" res } significanceMap.data.frame=function(object, relation_object, order=FALSE, size.rank=.3*theme_get()$text$size,...){ object$algorithm=rownames(object) inc=relation_incidence(relation_object) if (order){ scores=apply(inc,1, function(x) sum(x==0)-1) scores2=apply(inc,2, function(x) sum(x==1))[names(scores)]#+1-nrow(inc)) scores=data.frame(algorithm=names(scores), score=scores, score2=scores2, stringsAsFactors =F) scores=right_join(scores, object, by="algorithm") ordering= (scores[order(scores$score, scores$score2, scores$rank),"algorithm"]) scores=scores[,1:3] } else ordering= names(sort(t(object[,"rank",drop=F])["rank",])) inc=inc[ordering,] incidence.mat=melt(inc) colnames(incidence.mat)=c("algorithm","notsigPair", "decision") incidence.mat$algorithm=as.character(incidence.mat$algorithm) incidence.mat$notsigPair=as.character(incidence.mat$notsigPair) incidence.mat=right_join(incidence.mat, object, by="algorithm") if (order) incidence.mat=right_join(incidence.mat, scores, by="algorithm") incidence.mat=incidence.mat%>%mutate(algorithm=factor(.data$algorithm, levels=ordering), notsigPair=factor(.data$notsigPair, levels=ordering)) incidence.mat$decision=as.factor(incidence.mat$decision) p=ggplot(incidence.mat) + geom_raster(aes(algorithm, notsigPair, fill=decision),...)+ geom_raster(aes(algorithm,algorithm), fill="white")+ geom_abline(slope=1) + coord_cartesian(clip = 'off')+ theme(aspect.ratio=1, axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1), plot.margin=unit(c(1,1,1,1), "lines"), legend.position="none")+ ylab("Algorithm")+ xlab("Algorithm")+ scale_fill_manual(values=cividis(2,begin=0,end=1,alpha=.7)) fixy=0 th_get=theme_get() # grid on top lt=th_get$panel.grid$linetype if (is.null(lt)) lt=th_get$line$linetype gridSize=c(th_get$panel.grid.major$size,th_get$panel.grid$size,th_get$line$size)[1] - + #p=p+theme(panel.background = element_rect(fill = NA),panel.ontop=TRUE) #-> grid will be on top of diagonal #fix: f=ggplot_build(p) p= p + geom_vline(xintercept=f$layout$panel_params[[1]]$x$breaks, linetype=lt, color=th_get$panel.grid$colour, size=gridSize)+ geom_hline(yintercept=f$layout$panel_params[[1]]$y$breaks, linetype=lt, color=th_get$panel.grid$colour, size=gridSize)+ geom_abline(slope=1)+ geom_text(aes(x=algorithm,y=fixy,label=rank), nudge_y=.5, vjust = 0, size=size.rank, fontface="plain",family="sans" ) if (order) p= p+ geom_text(aes(x=algorithm,y=fixy,label=score), nudge_y=0, vjust = 0, size=size.rank, fontface="plain",family="sans") + annotate("text", x=0,y=fixy+.5, vjust = 0, size=size.rank, fontface="plain", family="sans", label="original")+ annotate("text",x=0,y=fixy, vjust 
= 0, size=size.rank, fontface="plain",family="sans",label="new") return(p) } diff --git a/R/visualization.R b/R/stability.R similarity index 83% rename from R/visualization.R rename to R/stability.R index d37032d..44ec0d8 100644 --- a/R/visualization.R +++ b/R/stability.R @@ -1,301 +1,371 @@ +#' @export stability <- function(x,...) UseMethod("stability") + +#' @export stability.default <- function(x, ...) stop("not implemented for this class") + +#' @export stabilityByAlgorithm <- function(x,...) UseMethod("stabilityByAlgorithm") + +#' @export stabilityByAlgorithm.default <- function(x, ...) stop("not implemented for this class") + +#' @export stabilityByTask <- function(x,...) UseMethod("stabilityByTask") -stabilityByTask.default <- function(x, ...) stop("not implemented for this class") +#' @export +stabilityByTask.default <- function(x, ...) stop("not implemented for this class") +#' Creates a blob plot across tasks +#' +#' Creates a blob plots visualizing the ranking variability across tasks. +#' +#' @param x The ranked asssessment data set. +#' @param ordering +#' @param probs +#' @param max_size +#' @param freq +#' @param shape +#' @param ... Further arguments passed to or from other functions. +#' +#' @return +#' +#' @examples +#' +#' @seealso `browseVignettes("challengeR")` +#' +#' @family functions to visualize cross-task insights +#' @export stability.ranked.list=function(x, ordering, probs=c(.025,.975), max_size=6, freq=FALSE, shape=4,...) { if (length(x$data) < 2) { stop("The stability of rankings across tasks cannot be computed for less than two tasks.") } dd=melt(x, measure.vars="rank", value.name="rank") %>% dplyr::rename(task="L1") if (!missing(ordering)) { dd=dd%>%mutate(algorithm=factor(.data$algorithm, levels=ordering)) } else dd=dd%>%mutate(algorithm=factor(.data$algorithm)) if (!freq) { p = ggplot(dd)+ - geom_count(aes(algorithm , + geom_count(aes(algorithm, rank, - color=algorithm , + color=algorithm, size = stat(prop*100))) } else { p=ggplot(dd)+ geom_count(aes(algorithm, - rank - ,color=algorithm )) + rank, + color=algorithm )) } p+scale_size_area(max_size = max_size)+ - stat_summary(aes(algorithm ,rank ), + stat_summary(aes(algorithm, rank), geom="point", shape=shape, fun.data=function(x) data.frame(y=median(x)),...)+ - stat_summary(aes(algorithm ,rank ), + stat_summary(aes(algorithm, rank), geom="linerange", fun.data=function(x) data.frame(ymin=quantile(x,probs[1]), ymax=quantile(x,probs[2])))+ geom_abline(slope=1, color="gray", linetype="dotted")+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+ guides(size = guide_legend(title="%"))+ scale_y_continuous(minor_breaks=NULL, limits=c(1,max(5,max(dd$rank))), breaks=c(1,seq(5,max(5,max(dd$rank)),by=5)))+ xlab("Algorithm")+ ylab("Rank") } rankdist.bootstrap.list=function(x,...){ rankDist=melt(lapply(x$bootsrappedRanks,t), value.name="rank") %>% dplyr::rename(algorithm="Var2",task="L1") rankDist } - - +#' Creates blob plots or stacked frequency plots stratified by algorithm +#' +#' Creates blob plots (\code{stacked = FALSE}) or stacked frequency plots (\code{stacked = TRUE}) for each algorithm +#' from a bootstrapped, ranked assessment data set. +#' +#' @param x The bootstrapped, ranked assessment data set. +#' @param ordering +#' @param stacked A boolean specifying whether a stacked frequency plot (\code{stacked = TRUE}) or blob plot (\code{stacked = FALSE}) should be created. +#' @param probs +#' @param max_size +#' @param shape +#' @param freq +#' @param single +#' @param ... 
Further arguments passed to or from other functions. +#' +#' @return +#' +#' @examples +#' +#' @seealso `browseVignettes("challengeR")` +#' +#' @family functions to visualize cross-task insights +#' @export stabilityByAlgorithm.bootstrap.list=function(x, ordering, stacked = FALSE, probs=c(.025,.975),#only for !stacked max_size=3,#only for !stacked shape=4,#only for !stacked freq=FALSE, #only for stacked single=FALSE,...) { - + if (length(x$data) < 2) { stop("The stability of rankings by algorithm cannot be computed for less than two tasks.") } - + rankDist=rankdist.bootstrap.list(x) - + if (!missing(ordering)) rankDist=rankDist%>%mutate(algorithm=factor(.data$algorithm, levels=ordering)) - + if (!stacked){ if (single==FALSE){ pl <- ggplot(rankDist)+ geom_count(aes(task , rank, color=algorithm, size = stat(prop*100), group = task ))+ scale_size_area(max_size = max_size)+ stat_summary(aes(task ,rank ), geom="point", shape=shape, fun.data=function(x) data.frame(y=median(x)),...)+ stat_summary(aes(task ,rank ), geom="linerange", fun.data=function(x) data.frame(ymin=quantile(x,probs[1]), ymax=quantile(x,probs[2])))+ facet_wrap(vars(algorithm))+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+ guides(size = guide_legend(title="%"))+ scale_y_continuous(minor_breaks=NULL, limits=c(1,max(5,max(rankDist$rank))), breaks=c(1,seq(5,max(5,max(rankDist$rank)),by=5)))+ xlab("Task")+ ylab("Rank") - + } else { pl=list() for (alg in ordering){ rankDist.alg=subset(rankDist, rankDist$algorithm==alg) pl[[alg]]=ggplot(rankDist.alg)+ geom_count(aes(task , rank, color=algorithm, size = stat(prop*100), group = task ))+ scale_size_area(max_size = max_size)+ stat_summary(aes(task , rank ), geom="point", shape=shape, fun.data=function(x) data.frame(y=median(x)),...)+ stat_summary(aes(task ,rank ), geom="linerange", fun.data=function(x) data.frame(ymin=quantile(x,probs[1]), ymax=quantile(x,probs[2])))+ facet_wrap(vars(algorithm))+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+ guides(size = guide_legend(title="%"))+ scale_y_continuous(minor_breaks=NULL, limits=c(1,max(5,max(rankDist$rank))), breaks=c(1,seq(5,max(5,max(rankDist$rank)),by=5)))+ xlab("Task")+ ylab("Rank") } names(pl) = names(x$matlist) class(pl) <- "ggList" } - + } else { #stacked rankDist=rankDist%>% group_by(task)%>% dplyr::count(.data$algorithm, .data$rank)%>% group_by(.data$algorithm)%>% mutate(prop=.data$n/sum(.data$n)*100)%>% ungroup%>% data.frame%>% mutate(rank=as.factor(.data$rank)) results= melt.ranked.list(x, measure.vars="rank", value.name="rank") %>% dplyr::select(-.data$variable) colnames(results)[3]="task" if (!missing(ordering)) results=results%>%mutate(algorithm=factor(.data$algorithm, levels=ordering)) - + if (single==FALSE){ pl<- ggplot(rankDist) + facet_wrap(vars(algorithm))+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) - + if (freq){ pl <- pl + geom_bar(aes(rank, n, fill=task ), position = "stack", stat = "identity") + ylab("Frequency") } else { pl <- pl + geom_bar(aes(rank, prop, fill=task ), position = "stack", stat = "identity")+ ylab("Proportion (%)") } - + pl <- pl + geom_vline(aes(xintercept=rank, color=task), size=.4, linetype="dotted", data=results) + xlab("Rank") } else { pl=list() for (alg in ordering){ rankDist.alg=subset(rankDist, rankDist$algorithm==alg) results.alg=subset(results, results$algorithm==alg) pl[[alg]]=ggplot(rankDist.alg)+ facet_wrap(vars(algorithm))+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) - + if (freq){ 
pl[[alg]] <- pl[[alg]] + geom_bar(aes(rank, n, fill=task ), position = "stack", stat = "identity") + ylab("Frequency") } else { pl[[alg]] <- pl[[alg]] + geom_bar(aes(rank, prop, fill=task ), position = "stack", stat = "identity")+ ylab("Proportion (%)") } - - pl[[alg]] <- pl[[alg]] + + + pl[[alg]] <- pl[[alg]] + geom_vline(aes(xintercept=rank, color=task), size=.4, linetype="dotted", data=results.alg) + xlab("Rank") } names(pl) = names(x$matlist) class(pl) <- "ggList" } } pl } - - - +#' Creates blob plots stratified by task +#' +#' Creates blob plots for each task from a bootstrapped, ranked assessment data set. +#' +#' @param x The bootstrapped, ranked assessment data set. +#' @param ordering +#' @param probs +#' @param max_size +#' @param size.ranks +#' @param shape +#' @param showLabelForSingleTask A boolean specifying whether the task name should be used as title for a single-task data set. +#' @param ... Further arguments passed to or from other functions. +#' +#' @return +#' +#' @examples +#' +#' @seealso `browseVignettes("challengeR")` +#' +#' @family functions to visualize ranking stability +#' @family functions to visualize cross-task insights +#' @export stabilityByTask.bootstrap.list=function(x, ordering, probs=c(.025,.975), max_size=3, size.ranks=.3*theme_get()$text$size, shape=4, showLabelForSingleTask=FALSE,...){ rankDist=rankdist.bootstrap.list(x) ranks=melt.ranked.list(x, measure.vars="rank", value.name = "full.rank") colnames(ranks)[4]="task" if (!missing(ordering)) { ranks$algorithm=factor(ranks$algorithm, levels=ordering) rankDist=rankDist%>%mutate(algorithm=factor(.data$algorithm, levels=ordering)) } blobPlot <- ggplot(rankDist)+ geom_count(aes(algorithm , rank, color=algorithm, size = stat(prop*100), group = algorithm ))+ scale_size_area(max_size = max_size)+ geom_abline(slope=1, color="gray", linetype="dotted")+ stat_summary(aes(algorithm ,rank ), geom="point", shape=shape, fun.data=function(x) data.frame(y=median(x)),...)+ stat_summary(aes(algorithm ,rank ), geom="linerange", fun.data=function(x) data.frame(ymin=quantile(x,probs[1]), ymax=quantile(x,probs[2])))+ geom_text(aes(x=algorithm,y=1,label=full.rank), nudge_y=-.6, vjust = 0, size=size.ranks, fontface="plain", family="sans", data=ranks) + coord_cartesian(clip = 'off')+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+ guides(size = guide_legend(title="%"))+ scale_y_continuous(minor_breaks=NULL, limits=c(.4,max(5,max(rankDist$rank))), breaks=c(1,seq(5,max(5,max(rankDist$rank)),by=5)))+ xlab("Algorithm")+ ylab("Rank") # Create multi-panel plot with task names as labels for multi-task data set or single-task data set when explicitly specified if (length(x$data) > 1 || showLabelForSingleTask == TRUE) { blobPlot <- blobPlot + facet_wrap(vars(task)) } return(blobPlot) } diff --git a/R/violin.R b/R/violin.R index d66feda..a6fb69a 100644 --- a/R/violin.R +++ b/R/violin.R @@ -1,91 +1,106 @@ +#' @export violin <- function(x,...) UseMethod("violin") + +#' @export violin.default <- function(x, ...) stop("not implemented for this class") +#' Creates a violin plot +#' +#' Creates a violin plot from a bootstrapped, ranked assessment data set. +#' +#' @param x The bootstrapped, ranked assessment data set. +#' @param ... Further arguments passed to or from other functions. 
+#' +#' @return +#' +#' @examples +#' +#' @seealso `browseVignettes("challengeR")` +#' +#' @family functions to visualize ranking stability +#' @export violin.bootstrap.list=function(x,...){ ken=melt(kendall.bootstrap.list(x)) colnames(ken)[2]="Task" cat("\n\nSummary Kendall's tau:\n") ss=ken%>%group_by(Task)%>% summarise(mean=mean(value,na.rm=T), median=median(value,na.rm=T), q25=quantile(value,probs = .25,na.rm=T), q75=quantile(value,probs = .75,na.rm=T))%>% arrange(desc(median)) print(knitr::kable(as.data.frame(ss))) # drop task if no kendall could be computed - noResults <- sapply(split(ss,ss$Task), + noResults <- sapply(split(ss,ss$Task), function(x) all(is.na(x[,-1]))) if (any(noResults)) { - cat("\nNo Kendall's tau could be calculated for any bootstrap sample in task ", + cat("\nNo Kendall's tau could be calculated for any bootstrap sample in task ", names(noResults)[noResults], " because of missing variability. Task dropped from figure.",fill=F) ken <- ken %>% filter(Task %in% names(noResults)[!noResults]) - + } - + xAxisText <- element_blank() # Show task names as tick mark labels only for multi-task data set if (length(x$data) > 1) { xAxisText <- element_text(angle = 90, vjust = 0.5, hjust = 1) } ken%>%mutate(Task=factor(.data$Task, levels=ss$Task))%>% ggplot(aes(Task,value))+ geom_violin(alpha=.3, color=NA, na.rm=TRUE, fill="blue")+ geom_boxplot(width=0.1, na.rm=TRUE, fill="white")+ theme(axis.text.x = xAxisText, legend.position = "none")+ ylab("Kendall's tau")+ scale_y_continuous(limits=c(min(min(ken$value),0), max(max(ken$value),1))) } - - kendall.bootstrap.list=function(x){ ken=lapply(1:length(x$bootsrappedRanks),function(Task){ id=match(rownames( x$bootsrappedRanks[[Task]]), rownames(x$matlist[[Task]]) ) sapply(x$bootsrappedRanks[[Task]], function(bootSample) suppressWarnings(kendall(bootSample, x$matlist[[Task]]$rank[id]))) } ) names(ken)=names((x$bootsrappedRanks)) if (sum(is.na(x))>0){ cat("Bootstrap samples without variability in rankings (all algorithms ranked 1) excluded.\n Frequency of such samples by task:\n",fill = T) sapply(ken,function(x) sum(is.na(x))) } return(ken) } - density.bootstrap.list=function(x,...){ ken=melt(kendall.bootstrap.list(x)) colnames(ken)[2]="Task" cat("\n\nSummary Kendall's tau\n") ss=ken%>%group_by(Task)%>% summarise(mean=mean(value,na.rm=T), median=median(value,na.rm=T), q25=quantile(value,probs = .25,na.rm=T), q75=quantile(value,probs = .75,na.rm=T))%>% arrange(desc(median)) print(as.data.frame(ss)) ggplot(ken)+ geom_density(aes(value,fill=Task),alpha=.3,color=NA) } diff --git a/inst/appdir/characterizationOfAlgorithmsBootstrapping.Rmd b/inst/appdir/characterizationOfAlgorithmsBootstrapping.Rmd index 1b6ac2c..21225d7 100644 --- a/inst/appdir/characterizationOfAlgorithmsBootstrapping.Rmd +++ b/inst/appdir/characterizationOfAlgorithmsBootstrapping.Rmd @@ -1,69 +1,67 @@ ### Ranking stability: Ranking variability via bootstrap approach -Blob plot of bootstrap results over the different tasks separated +A blob plot of bootstrap results over the different tasks separated by algorithm allows another perspective on the assessment data. This gives deeper insights into the characteristics -of tasks and the ranking uncertainty of the algorithms in each -task. +of tasks and the ranking uncertainty of the algorithms in each task. 
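For orientation when reading the following chunks: the `boot_object` used here is passed into this child document via the report parameters. A minimal sketch of how such a bootstrapped ranking (an object of class `bootstrap.list`) is typically created beforehand is given below; the column names of the bundled `data_matrix.csv` and the `bootstrap()` call with its `nboot` argument are assumptions drawn from the package's usual workflow rather than from this changeset.

```r
# Minimal sketch (assumed workflow, not part of this changeset)
library(challengeR)

# Assumed column names of the bundled example data set
data_matrix <- read.csv(system.file("extdata", "data_matrix.csv", package = "challengeR"))
challenge <- as.challenge(data_matrix, by = "task",
                          algorithm = "alg_name", case = "case", value = "value",
                          smallBetter = FALSE)

# Rank per task, then bootstrap the ranking
ranking <- aggregateThenRank(challenge, FUN = mean, ties.method = "min", na.treat = 0)
set.seed(1)
boot_object <- bootstrap(ranking, nboot = 1000)  # assumed signature of bootstrap()

# Consensus ranking across tasks supplies the ordering used throughout the report
ordering_consensus <- names(consensus(ranking, method = "euclidean"))
stabilityByAlgorithm(boot_object, ordering = ordering_consensus)
```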
\bigskip ```{r blobplot_bootstrap_byAlgorithm,fig.width=7,fig.height = 5} #stabilityByAlgorithm.bootstrap.list if (length(boot_object$matlist)<=6 &nrow((boot_object$matlist[[1]]))<=10 ){ stabilityByAlgorithm(boot_object, ordering=ordering_consensus, max_size = 9, size=4, shape=4, single = F) + scale_color_manual(values=cols) } else { pl=stabilityByAlgorithm(boot_object, ordering=ordering_consensus, max_size = 9, size=4, shape=4, single = T) for (i in 1:length(pl)) print(pl[[i]] + scale_color_manual(values=cols) + guides(size = guide_legend(title="%"),color="none") ) } ``` An alternative representation is provided by a stacked frequency plot of the observed ranks, separated by algorithm. Observed ranks across bootstrap samples are -displayed with coloring according to task. For algorithms that +displayed with coloring according to the task. For algorithms that achieve the same rank in different tasks for the full assessment data set, vertical lines are on top of each other. Vertical lines allow to compare the achieved rank of each algorithm over different tasks. \bigskip ```{r stackedFrequencies_bootstrap_byAlgorithm,fig.width=7,fig.height = 5} -#stabilityByAlgorithm.bootstrap.list if (length(boot_object$matlist)<=6 &nrow((boot_object$matlist[[1]]))<=10 ){ stabilityByAlgorithm(boot_object, ordering=ordering_consensus, stacked = TRUE, single = F) } else { pl=stabilityByAlgorithm(boot_object, ordering=ordering_consensus, stacked = TRUE, single = T) print(pl) } ``` diff --git a/inst/appdir/reportMultiple.Rmd b/inst/appdir/report.Rmd similarity index 98% rename from inst/appdir/reportMultiple.Rmd rename to inst/appdir/report.Rmd index 5f32b4b..c8ff6c7 100644 --- a/inst/appdir/reportMultiple.Rmd +++ b/inst/appdir/report.Rmd @@ -1,409 +1,405 @@ --- params: object: NA colors: NA name: NULL consensus: NA isMultiTask: NA bootstrappingEnabled: NA fig.format: NULL dpi: NULL title: "Benchmarking report for `r params$name` " author: "created by challengeR v`r packageVersion('challengeR')`" date: "`r Sys.setlocale('LC_TIME', 'English'); format(Sys.time(), '%d %B, %Y')`" editor_options: chunk_output_type: console --- ```{r setup, include=FALSE} options(width=80) #out.format <- knitr::opts_knit$get("out.format") out.format <- knitr::opts_knit$get("rmarkdown.pandoc.to") img_template <- switch( out.format, docx = list("img-params"=list(dpi=150, fig.width=6, fig.height=6, out.width="504px", out.height="504px")), { # default list("img-params"=list( fig.width=7, fig.height = 3, dpi=300)) } ) knitr::opts_template$set( img_template ) knitr::opts_chunk$set(echo = F) # ,#fig.width=7,fig.height = 3,dpi=300, if (out.format != "docx") knitr::opts_chunk$set(fig.align = "center") if (!is.null(params$fig.format)) knitr::opts_chunk$set(dev = params$fig.format) # can be vector, e.g. 
fig.format=c('jpeg','png', 'pdf') if (!is.null(params$dpi)) knitr::opts_chunk$set(dpi = params$dpi) theme_set(theme_light()) isMultiTask = params$isMultiTask bootstrappingEnabled = params$bootstrappingEnabled ``` ```{r } object = params$object if (isMultiTask) { -ordering_consensus=names(params$consensus) + ordering_consensus=names(params$consensus) } else { ordering_consensus=names(sort(t(object$matlist[[1]][,"rank",drop=F])["rank",])) } color.fun=params$colors ``` ```{r } challenge_multiple=object$data ranking.fun=object$FUN cols_numbered=cols=color.fun(length(ordering_consensus)) names(cols)=ordering_consensus names(cols_numbered)= paste(1:length(cols),names(cols)) if (bootstrappingEnabled) { boot_object = params$object challenge_multiple=boot_object$data ranking.fun=boot_object$FUN object=challenge_multiple%>%ranking.fun object$FUN.list = boot_object$FUN.list object$fulldata=boot_object$fulldata # only not NULL if subset of algorithms used - - cols_numbered=cols=color.fun(length(ordering_consensus)) - names(cols)=ordering_consensus - names(cols_numbered)= paste(1:length(cols),names(cols)) } ``` This document presents a systematic report on the benchmark study "`r params$name`". Input data comprises raw metric values for all algorithms and cases. Generated plots are: ```{r, child=if (!isMultiTask && !bootstrappingEnabled) system.file("appdir", "overviewSingleTaskNoBootstrapping.Rmd", package="challengeR")} ``` ```{r, child=if (!isMultiTask && bootstrappingEnabled) system.file("appdir", "overviewSingleTaskBootstrapping.Rmd", package="challengeR")} ``` ```{r, child=if (isMultiTask && !bootstrappingEnabled) system.file("appdir", "overviewMultiTaskNoBootstrapping.Rmd", package="challengeR")} ``` ```{r, child=if (isMultiTask && bootstrappingEnabled) system.file("appdir", "overviewMultiTaskBootstrapping.Rmd", package="challengeR")} ``` Details can be found in Wiesenfarth et al. (2019). ```{r,results='asis'} if (isMultiTask) { cat("# Rankings\n") } else { cat("# Ranking") } ``` Algorithms within a task are ranked according to the following ranking scheme: ```{r,results='asis'} a=( lapply(object$FUN.list[1:2],function(x) { if (!is.character(x)) return(paste0("aggregate using function ", paste(gsub("UseMethod","", deparse(functionBody(x))), collapse=" ") )) else if (x=="rank") return(x) else return(paste0("aggregate using function ",x)) })) cat("    *",paste0(a,collapse=" then "),"*",sep="") if (is.character(object$FUN.list[[1]]) && object$FUN.list[[1]]=="significance") cat("\n\n Column 'prop_significance' is equal to the number of pairwise significant test results for a given algorithm divided by the number of algorithms.") ``` ```{r,results='asis'} if (isMultiTask) { cat("Ranking for each task:\n") for (t in 1:length(object$matlist)){ cat("\n",names(object$matlist)[t],": ") n.cases=nrow(challenge_multiple[[t]])/length(unique(challenge_multiple[[t]][[attr(challenge_multiple,"algorithm")]])) numberOfAlgorithms <- length(levels(challenge_multiple[[t]][[attr(challenge_multiple, "algorithm")]])) cat("\nThe analysis is based on", numberOfAlgorithms, "algorithms and", n.cases, "cases.", attr(object$data,"n.missing")[[t]], "missing cases have been found in the data set. ") if (nrow(attr(object$data,"missingData")[[t]])>0) { if(attr(object$data,"n.missing")[[t]]==0 ) cat("However, ") else if(attr(object$data,"n.missing")[[t]]>0 ) cat("Additionally, ") cat("performance of not all algorithms has been observed for all cases in task '", names(object$matlist)[t], "'. 
Therefore, missings have been inserted in the following cases:") print(knitr::kable(as.data.frame(attr(object$data,"missingData")[[t]]))) } if (nrow(attr(object$data,"missingData")[[t]])>0 | attr(object$data,"n.missing")[[t]]>0) { if (is.numeric(attr(object$data,"na.treat"))) cat("All missings have been replaced by values of", attr(object$data,"na.treat"),".\n") else if (is.character(attr(object$data,"na.treat")) && attr(object$data,"na.treat")=="na.rm") cat("All missings have been removed.") else if (is.function(attr(object$data,"na.treat"))) { cat("Missings have been replaced using function ") print(attr(object$data,"na.treat")) } else if (is.character(object$FUN.list[[1]]) && object$FUN.list[[1]]=="rank") cat("Missings lead to the algorithm ranked last for the missing case.") } x=object$matlist[[t]] print(knitr::kable(x[order(x$rank),])) } } else { n.cases=nrow(challenge_multiple[[1]])/length(unique(challenge_multiple[[1]][[attr(challenge_multiple,"algorithm")]])) # Is subset of algorithms used? if (!is.null(object$fulldata[[1]])) { cat("The top ", length(levels(challenge_multiple[[1]][[attr(challenge_multiple, "algorithm")]])), " out of ", length(levels(object$fulldata[[1]][[attr(challenge_multiple, "algorithm")]])), " algorithms are considered.\n") cat("\nThe analysis is based on", n.cases, "cases. ") } else { cat("\nThe analysis is based on", length(levels(challenge_multiple[[1]][[attr(challenge_multiple, "algorithm")]])), "algorithms and", n.cases, "cases. ") } cat(attr(object$data,"n.missing")[[1]], "missing cases have been found in the data set. ") if (nrow(attr(object$data,"missingData")[[1]])>0) { if(attr(object$data,"n.missing")[[1]]==0 ) cat("However, ") else if(attr(object$data,"n.missing")[[1]]>0 ) cat("Additionally, ") cat("performance of not all algorithms has been observed for all cases. Therefore, missings have been inserted in the following cases:") print(knitr::kable(as.data.frame(attr(object$data,"missingData")[[1]]))) } if (nrow(attr(object$data,"missingData")[[1]])>0 | attr(object$data,"n.missing")[[1]]>0) { if (is.numeric(attr(object$data,"na.treat"))) cat("All missings have been replaced by values of", attr(object$data,"na.treat"),".\n") else if (is.character(attr(object$data,"na.treat")) && attr(object$data,"na.treat")=="na.rm") cat("All missings have been removed.") else if (is.function(attr(object$data,"na.treat"))) { cat("Missings have been replaced using function ") print(attr(object$data,"na.treat")) } else if (is.character(object$FUN.list[[1]]) && object$FUN.list[[1]]=="rank") cat("Missings lead to the algorithm ranked last for the missing case.") } cat("\n\nRanking:") x=object$matlist[[1]] print(knitr::kable(x[order(x$rank),])) } ``` \bigskip ```{r, child=if (isMultiTask) system.file("appdir", "consensusRanking.Rmd", package="challengeR")} ``` # Visualization of raw assessment data ```{r,results='asis'} if (isMultiTask) { cat("The algorithms are ordered according to the computed ranks for each task.") } ``` ## Dot- and boxplot *Dot- and boxplots* for visualizing raw assessment data separately for each algorithm. Boxplots representing descriptive statistics over all cases (median, quartiles and outliers) are combined with horizontally jittered dots representing individual cases. \bigskip ```{r boxplots} boxplot(object, size=.8) ``` ## Podium plot *Podium plots* (see also Eugster et al., 2008) for visualizing raw assessment data. 
Upper part (spaghetti plot): Participating algorithms are color-coded, and each colored dot in the plot represents a metric value achieved with the respective algorithm. The actual metric value is encoded by the y-axis. Each podium (here: $p$=`r length(ordering_consensus)`) represents one possible rank, ordered from best (1) to last (here: `r length(ordering_consensus)`). The assignment of metric values (i.e. colored dots) to one of the podiums is based on the rank that the respective algorithm achieved on the corresponding case. Note that the plot part above each podium place is further subdivided into $p$ "columns", where each column represents one participating algorithm (here: $p=$ `r length(ordering_consensus)`). Dots corresponding to identical cases are connected by a line, leading to the shown spaghetti structure. Lower part: Bar charts represent the relative frequency for each algorithm to achieve the rank encoded by the podium place. ```{r, include=FALSE, fig.keep="none",dev=NULL} plot.new() algs=ordering_consensus l=legend("topright", paste0(1:length(algs),": ",algs), lwd = 1, cex=1.4,seg.len=1.1, title="Rank: Alg.", plot=F) w <- grconvertX(l$rect$w, to='ndc') - grconvertX(0, to='ndc') h<- grconvertY(l$rect$h, to='ndc') - grconvertY(0, to='ndc') addy=max(grconvertY(l$rect$h,"user","inches"),6) ``` ```{r podium,eval=T,fig.width=12, fig.height=addy} #c(bottom, left, top, right op<-par(pin=c(par()$pin[1],6), omd=c(0, 1-w, 0, 1), mar=c(par('mar')[1:3], 0)+c(-.5,0.5,-.5,0), cex.axis=1.5, cex.lab=1.5, cex.main=1.7) oh=grconvertY(l$rect$h,"user","lines")-grconvertY(6,"inches","lines") if (oh>0) par(oma=c(oh,0,0,0)) set.seed(38) podium(object, col=cols, lines.show = T, lines.alpha = .4, dots.cex=.9, ylab="Metric value", layout.heights=c(1,.35), legendfn = function(algs, cols) { legend(par('usr')[2], par('usr')[4], xpd=NA, paste0(1:length(algs),": ",algs), lwd = 1, col = cols, bg = NA, cex=1.4, seg.len=1.1, title="Rank: Alg.") } ) par(op) ``` ## Ranking heatmap *Ranking heatmaps* for visualizing raw assessment data. Each cell $\left( i, A_j \right)$ shows the absolute frequency of cases in which algorithm $A_j$ achieved rank $i$. \bigskip ```{r rankingHeatmap,fig.width=9, fig.height=9,out.width='70%'} rankingHeatmap(object) ``` # Visualization of ranking stability ```{r, child=if (bootstrappingEnabled) system.file("appdir", "visualizationBlobPlots.Rmd", package="challengeR")} ``` ```{r, child=if (bootstrappingEnabled) system.file("appdir", "visualizationViolinPlots.Rmd", package="challengeR")} ``` ## *Significance maps* for visualizing ranking stability based on statistical significance *Significance maps* depict incidence matrices of pairwise significant test results for the one-sided Wilcoxon signed rank test at a 5\% significance level with adjustment for multiple testing according to Holm. Yellow shading indicates that metric values from the algorithm on the x-axis were significantly superior to those from the algorithm on the y-axis, blue color indicates no significant difference. \bigskip ```{r significancemap,fig.width=6, fig.height=6,out.width='200%'} significanceMap(object,alpha=0.05,p.adjust.method="holm") ``` ## Ranking robustness to ranking methods *Line plots* for visualizing ranking robustness across different ranking methods. Each algorithm is represented by one colored line. For each ranking method encoded on the x-axis, the height of the line represents the corresponding rank. Horizontal lines indicate identical ranks for all methods. 
\bigskip ```{r lineplot,fig.width=8, fig.height=6,out.width='95%'} if (length(object$matlist)<=6 &nrow((object$matlist[[1]]))<=10 ){ methodsplot(challenge_multiple, ordering = ordering_consensus, na.treat=object$call[[1]][[1]]$na.treat) + scale_color_manual(values=cols) }else { x=challenge_multiple for (subt in names(challenge_multiple)){ dd=as.challenge(x[[subt]], value=attr(x,"value"), algorithm=attr(x,"algorithm") , case=attr(x,"case"), annotator = attr(x,"annotator"), by=attr(x,"by"), smallBetter = attr(x,"smallBetter"), na.treat=object$call[[1]][[1]]$na.treat ) print(methodsplot(dd, ordering = ordering_consensus) + ggtitle(subt) + scale_color_manual(values=cols) ) } } ``` ```{r, child=if (isMultiTask) system.file("appdir", "visualizationAcrossTasks.Rmd", package="challengeR")} ``` # References Wiesenfarth, M., Reinke, A., Landman, B.A., Cardoso, M.J., Maier-Hein, L. and Kopp-Schneider, A. (2019). Methods and open-source toolkit for analyzing and visualizing challenge results. *arXiv preprint arXiv:1910.05121* M. J. A. Eugster, T. Hothorn, and F. Leisch, “Exploratory and inferential analysis of benchmark experiments,” Institut fuer Statistik, Ludwig-Maximilians-Universitaet Muenchen, Germany, Technical Report 30, 2008. [Online]. Available: http://epub.ub.uni-muenchen.de/4134/. diff --git a/data_matrix.csv b/inst/extdata/data_matrix.csv similarity index 100% rename from data_matrix.csv rename to inst/extdata/data_matrix.csv diff --git a/man/aggregate.challenge.Rd b/man/aggregate.challenge.Rd new file mode 100644 index 0000000..b452a41 --- /dev/null +++ b/man/aggregate.challenge.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/aaggregate.R +\name{aggregate.challenge} +\alias{aggregate.challenge} +\title{Title} +\usage{ +\method{aggregate}{challenge}( + x, + FUN = mean, + na.treat, + alpha = 0.05, + p.adjust.method = "none", + parallel = FALSE, + progress = "none", + ... +) +} +\arguments{ +\item{...}{} +} +\value{ + +} +\description{ +Title +} diff --git a/man/aggregateThenRank.Rd b/man/aggregateThenRank.Rd new file mode 100644 index 0000000..8e57e32 --- /dev/null +++ b/man/aggregateThenRank.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rankingMethods.R +\name{aggregateThenRank} +\alias{aggregateThenRank} +\title{Performs ranking via aggregate-then-rank} +\usage{ +aggregateThenRank(object, FUN, ties.method = "min", ...) +} +\arguments{ +\item{object}{The challenge object.} + +\item{FUN}{The aggregation function, e.g. mean, median, min, max, function(x), quantile(x, probs=0.05).} + +\item{ties.method}{A string specifying how ties are treated, see \code{\link[base:rank]{base::rank()}}.} + +\item{...}{Further arguments passed to or from other functions.} +} +\value{ +An S3 object of class "ranked.list" to represent a ranked assessment data set. +} +\description{ +Performs ranking by first aggregating performance values across all cases (e.g., with the mean, median or another quantile) for each algorithm. +This aggregate is then used to compute a rank for each algorithm. 
+} +\examples{ + +\dontrun{ + aggregateThenRank(challenge, FUN = mean, ties.method = "average", na.treat = 0) +} + +} +\seealso{ +Other ranking functions: +\code{\link{rankThenAggregate}()}, +\code{\link{testThenRank}()} +} +\concept{ranking functions} diff --git a/man/as.challenge.Rd b/man/as.challenge.Rd new file mode 100644 index 0000000..c08e63f --- /dev/null +++ b/man/as.challenge.Rd @@ -0,0 +1,88 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/challengeR.R +\name{as.challenge} +\alias{as.challenge} +\title{Constructs a challenge object} +\usage{ +as.challenge( + object, + case, + algorithm, + value, + by = NULL, + taskName = NULL, + annotator = NULL, + smallBetter = FALSE, + na.treat = NULL, + check = TRUE +) +} +\arguments{ +\item{object}{A data frame containing the assessment data.} + +\item{case}{A string specifying the name of the column that contains the case identifiers.} + +\item{algorithm}{A string specifying the name of the column that contains the algorithm identifiers.} + +\item{value}{A string specifying the name of the column that contains the performance values.} + +\item{by}{A string specifying the name of the column that contains the task identifiers. Required for multi-task data set.} + +\item{taskName}{A string specifying the task name for single-task data set that does not contain a task column. +This argument is optional for a single-task data set and is ignored for a multi-task data set.} + +\item{annotator}{Not supported} + +\item{smallBetter}{A boolean specifying whether small performance values indicate better algorithm performance.} + +\item{na.treat}{Indicates how missing perfomance values are treated if sanity check is enabled. It can be 'na.rm', numeric value or function. +For a numeric value or function, NAs will be replaced by the specified values. For 'na.rm', rows that contain missing values will be removed.} + +\item{check}{A boolean to indicate to perform a sanity check of the specified data set and arguments if set to \code{TRUE}.} +} +\value{ +An S3 object to represent the configuration of an assessment data set. +} +\description{ +Constructs an S3 object to represent the configuration of an assessment data set originating from a benchmarking competition (so-called "challenge"). +} +\section{Assessment data set}{ + +The toolkit provides visualization approaches for both challenges designed around a single task (single-task challenges) and for challenges comprising multiple tasks (multi-task challenges). +For a single-task challenge, the assessment data set (argument \code{object}) requires the following columns: +\itemize{ +\item test case identifier (string or numeric) +\item algorithm identifier (string or numeric) +\item performance value (numeric) +} + +For a multi-task challenge, the assessment data set (argument \code{object}) requires the following columns: +\itemize{ +\item task identifier (string or numeric) +\item test case identifier (string or numeric) +\item algorithm identifier (string or numeric) +\item performance value (numeric) +} +} + +\section{Sanity check}{ + +It is highly recommended that the sanity check is not disabled when the data set is provided initially. 
+It checks that: +\itemize{ +\item performance values are numeric (if not, raises error) +\item algorithm performances are observed for all cases (if not, adds them as NA and emits a message) +\item cases appear only once for the same algorithm (if not, raises error) +} +If the argument \code{na.treat} for treatment of NA is specified, NAs will be handled respectively. + +It might be reasonable to disable the sanity check for further computations (e.g., for performance reasons +during bootstrapping (\code{\link{bootstrap.ranked.list}}) where cases are actually allowed to appear more than once for the same algorithm). +} + +\examples{ +# single-task data set + +# multi-task data set + +} diff --git a/man/bootstrap.ranked.list.Rd b/man/bootstrap.ranked.list.Rd new file mode 100644 index 0000000..45106ba --- /dev/null +++ b/man/bootstrap.ranked.list.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Bootstrap.R +\name{bootstrap.ranked.list} +\alias{bootstrap.ranked.list} +\title{Performs bootstrapping} +\usage{ +\method{bootstrap}{ranked.list}(object, nboot, parallel = FALSE, progress = "text", ...) +} +\arguments{ +\item{object}{The ranked assessment data set.} + +\item{nboot}{The number of bootstrap samples.} + +\item{parallel}{A boolean specifying whether parallel processing should be enabled.} + +\item{progress}{A string specifying the type of progress indication.} + +\item{...}{Further arguments passed to or from other functions.} +} +\value{ +An S3 object of class "bootstrap.list" to represent a bootstrapped, ranked assessment data set. +} +\description{ +Performs bootstrapping on a ranked assessment data set and applies the ranking method to each bootstrap sample. One bootstrap sample of +a task with \code{n} cases consists of \code{n} cases randomly drawn with replacement from this task. +A total of \code{nboot} of these bootstrap samples are drawn. +} +\examples{ + +\dontrun{ + # perform bootstrapping with 1000 bootstrap samples using one CPU + set.seed(1) + ranking_bootstrapped <- bootstrap(ranking, nboot = 1000) +} + +\dontrun{ + # perform bootstrapping using multiple CPUs (here: 8 CPUs) + library(doParallel) + registerDoParallel(cores=8) + set.seed(1) + ranking_bootstrapped <- bootstrap(ranking, nboot = 1000, parallel = TRUE, progress = "none") + stopImplicitCluster() +} + +} diff --git a/man/boxplot.ranked.list.Rd b/man/boxplot.ranked.list.Rd new file mode 100644 index 0000000..8fccd50 --- /dev/null +++ b/man/boxplot.ranked.list.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/boxplot.R +\name{boxplot.ranked.list} +\alias{boxplot.ranked.list} +\title{Creates dot- and boxplots} +\usage{ +\method{boxplot}{ranked.list}(x, color = "blue", jitter.width = 0.25, ...) +} +\arguments{ +\item{x}{The ranked assessment data set.} + +\item{color}{A string specifying the color of the dots.} + +\item{jitter.width}{A numeric value specifying the jitter width of the dots.} + +\item{...}{Further arguments passed to or from other functions.} +} +\value{ + +} +\description{ +Creates dot- and boxplots visualizing the assessment data separately for each algorithm. +Boxplots representing descriptive statistics for all test cases (median, quartiles and outliers) +are combined with horizontally jittered dots representing individual test cases. 
+} +\examples{ + +} +\seealso{ +\code{browseVignettes("challengeR")} + +Other functions to visualize assessment data: +\code{\link{podium.challenge}()}, +\code{\link{podium.ranked.list}()}, +\code{\link{rankingHeatmap.challenge}()}, +\code{\link{rankingHeatmap.ranked.list}()} +} +\concept{functions to visualize assessment data} diff --git a/man/consensus.ranked.list.Rd b/man/consensus.ranked.list.Rd new file mode 100644 index 0000000..3e8a9e6 --- /dev/null +++ b/man/consensus.ranked.list.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/consensus.R +\name{consensus.ranked.list} +\alias{consensus.ranked.list} +\title{Computes a consensus ranking} +\usage{ +\method{consensus}{ranked.list}(object, method, ...) +} +\arguments{ +\item{object}{The ranked asssessment data set.} + +\item{method}{A string specifying the method to derive the consensus ranking, see \code{\link[relations:consensus]{relations::consensus()}} for the methods.} + +\item{...}{Further arguments passed to or from other functions.} +} +\value{ + +} +\description{ +Computes a consensus ranking (rank aggregation) across tasks. +} diff --git a/man/dendrogram.ranked.list.Rd b/man/dendrogram.ranked.list.Rd new file mode 100644 index 0000000..0a3abc3 --- /dev/null +++ b/man/dendrogram.ranked.list.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dendrogram.R +\name{dendrogram.ranked.list} +\alias{dendrogram.ranked.list} +\title{Creates a cluster dendrogram} +\usage{ +\method{dendrogram}{ranked.list}(object, dist = "symdiff", method = "complete", ...) +} +\arguments{ +\item{object}{The ranked assessment data set.} + +\item{dist}{A string specifying the distance measure to be used, see \code{\link[relations:dissimilarity]{relations::dissimilarity()}}.} + +\item{method}{A string specifying agglomeration method to be used, see \code{\link[stats:hclust]{stats::hclust()}}.} + +\item{...}{Further arguments passed to or from other functions.} +} +\value{ + +} +\description{ +Creates a cluster dendrogram from a ranked assessment data set. +} +\examples{ + +} +\seealso{ +\code{browseVignettes("challengeR")} + +Other functions to visualize cross-task insights: +\code{\link{stability.ranked.list}()}, +\code{\link{stabilityByAlgorithm.bootstrap.list}()}, +\code{\link{stabilityByTask.bootstrap.list}()} +} +\concept{functions to visualize cross-task insights} diff --git a/man/methodsplot.challenge.Rd b/man/methodsplot.challenge.Rd new file mode 100644 index 0000000..7f33fd2 --- /dev/null +++ b/man/methodsplot.challenge.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methodsplot.R +\name{methodsplot.challenge} +\alias{methodsplot.challenge} +\title{Creates line plots} +\usage{ +\method{methodsplot}{challenge}( + x, + na.treat = NULL, + methods = list(testBased = . \%>\% test() \%>\% rank(ties.method = "min"), + meanThenRank = . \%>\% aggregate(FUN = "mean") \%>\% rank(ties.method = "min"), + medianThenRank = . \%>\% aggregate(FUN = "median") \%>\% rank(ties.method = "min"), + rankThenMean = . \%>\% rank(ties.method = "min") \%>\% aggregate(FUN = "mean") \%>\% + rank(ties.method = "min"), rankThenMedian = . \%>\% rank(ties.method = "min") \%>\% + aggregate(FUN = "median") \%>\% rank(ties.method = "min")), + ordering, + ... +) +} +\arguments{ +\item{x}{The challenge object.} + +\item{na.treat}{Indicates how missing perfomance values are treated if sanity check is enabled. 
It can be 'na.rm', numeric value or function. +For a numeric value or function, NAs will be replaced by the specified values. For 'na.rm', rows that contain missing values will be removed.} + +\item{methods}{A list of ranking methods that should be incorporated.} + +\item{...}{Further arguments passed to or from other functions.} +} +\value{ + +} +\description{ +Creates line plots that visualize the robustness of ranking across different ranking methods from a challenge object. +} +\examples{ + +} +\seealso{ +\code{browseVignettes("challengeR")} + +Other functions to visualize ranking stability: +\code{\link{significanceMap.ranked.list}()}, +\code{\link{stabilityByTask.bootstrap.list}()}, +\code{\link{violin.bootstrap.list}()} +} +\concept{functions to visualize ranking stability} diff --git a/man/podium.challenge.Rd b/man/podium.challenge.Rd new file mode 100644 index 0000000..989f29d --- /dev/null +++ b/man/podium.challenge.Rd @@ -0,0 +1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/podium.R +\name{podium.challenge} +\alias{podium.challenge} +\title{Creates a podium plot} +\usage{ +\method{podium}{challenge}( + object, + ordering, + xlab = NULL, + ylab = NULL, + lines.show = FALSE, + lines.alpha = 0.2, + lines.lwd = 1, + col, + lines.col = col, + dots.pch = 19, + dots.cex = 1, + places.lty = 2, + places.col = 1, + legendfn = function(algs, cols) { legend("topleft", algs, lwd = 1, col = cols, bg + = "white") }, + layout.heights = c(1, 0.4), + ... +) +} +\arguments{ +\item{object}{The challenge object.} + +\item{xlab}{A string specifying the x-axis label.} + +\item{ylab}{A string specifying the y-axis label.} + +\item{...}{Further arguments passed to or from other functions.} +} +\value{ + +} +\description{ +Creates a podium plot from a challenge object. +} +\examples{ + +} +\seealso{ +\code{browseVignettes("challengeR")} + +Other functions to visualize assessment data: +\code{\link{boxplot.ranked.list}()}, +\code{\link{podium.ranked.list}()}, +\code{\link{rankingHeatmap.challenge}()}, +\code{\link{rankingHeatmap.ranked.list}()} +} +\concept{functions to visualize assessment data} diff --git a/man/podium.ranked.list.Rd b/man/podium.ranked.list.Rd new file mode 100644 index 0000000..6ce2c4f --- /dev/null +++ b/man/podium.ranked.list.Rd @@ -0,0 +1,53 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/podium.R +\name{podium.ranked.list} +\alias{podium.ranked.list} +\title{Creates podium plots} +\usage{ +\method{podium}{ranked.list}( + object, + xlab = "Podium", + ylab = "Performance", + lines.show = TRUE, + lines.alpha = 0.2, + lines.lwd = 1, + col, + lines.col = col, + dots.pch = 19, + dots.cex = 1, + places.lty = 2, + places.col = 1, + legendfn = function(algs, cols) { legend("topleft", algs, lwd = 1, col = cols, bg + = "white") }, + layout.heights = c(1, 0.4), + ... +) +} +\arguments{ +\item{object}{The ranked assessment data set.} + +\item{xlab}{A string specifying the x-axis label.} + +\item{ylab}{A string specifying the y-axis label.} + +\item{...}{Further arguments passed to or from other functions.} +} +\value{ + +} +\description{ +Creates podium plots from one or more ranked assessment data sets. 
+} +\examples{ + +} +\seealso{ +\code{browseVignettes("challengeR")} + +Other functions to visualize assessment data: +\code{\link{boxplot.ranked.list}()}, +\code{\link{podium.challenge}()}, +\code{\link{rankingHeatmap.challenge}()}, +\code{\link{rankingHeatmap.ranked.list}()} +} +\concept{functions to visualize assessment data} diff --git a/man/rankThenAggregate.Rd b/man/rankThenAggregate.Rd new file mode 100644 index 0000000..116e74b --- /dev/null +++ b/man/rankThenAggregate.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rankingMethods.R +\name{rankThenAggregate} +\alias{rankThenAggregate} +\title{Performs ranking via rank-then-aggregate} +\usage{ +rankThenAggregate(object, FUN, ties.method = "min") +} +\arguments{ +\item{object}{The challenge object.} + +\item{FUN}{The aggregation function, e.g., mean, median, min, max, function(x), quantile(x, probs=0.05).} + +\item{ties.method}{A string specifying how ties are treated, see \code{\link[base:rank]{base::rank()}}.} +} +\value{ +An S3 object of class "ranked.list" to represent a ranked assessment data set. +} +\description{ +Performs ranking by first computing a rank for each case for each algorithm ("rank first"). +The final rank is based on the aggregated ranks for the cases. This ranking method handles missing values implicitly +by assigning the worst rank to missing algorithm performances. +} +\examples{ +\dontrun{ + rankThenAggregate(challenge, FUN = mean) +} + +} +\seealso{ +Other ranking functions: +\code{\link{aggregateThenRank}()}, +\code{\link{testThenRank}()} +} +\concept{ranking functions} diff --git a/man/rankingHeatmap.challenge.Rd b/man/rankingHeatmap.challenge.Rd new file mode 100644 index 0000000..0f6efda --- /dev/null +++ b/man/rankingHeatmap.challenge.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rankingHeatmap.R +\name{rankingHeatmap.challenge} +\alias{rankingHeatmap.challenge} +\title{Creates a ranking heatmap} +\usage{ +\method{rankingHeatmap}{challenge}(x, ordering, ties.method = "min", ...) +} +\arguments{ +\item{x}{The challenge object.} + +\item{ties.method}{A string specifying how ties are treated, see \code{\link[base:rank]{base::rank()}}.} + +\item{...}{Further arguments passed to or from other functions.} +} +\value{ + +} +\description{ +Creates a ranking heatmap from a challenge object. +} +\examples{ + +} +\seealso{ +\code{browseVignettes("challengeR")} + +Other functions to visualize assessment data: +\code{\link{boxplot.ranked.list}()}, +\code{\link{podium.challenge}()}, +\code{\link{podium.ranked.list}()}, +\code{\link{rankingHeatmap.ranked.list}()} +} +\concept{functions to visualize assessment data} diff --git a/man/rankingHeatmap.ranked.list.Rd b/man/rankingHeatmap.ranked.list.Rd new file mode 100644 index 0000000..a896b69 --- /dev/null +++ b/man/rankingHeatmap.ranked.list.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rankingHeatmap.R +\name{rankingHeatmap.ranked.list} +\alias{rankingHeatmap.ranked.list} +\title{Creates ranking heatmaps} +\usage{ +\method{rankingHeatmap}{ranked.list}(x, ties.method = "min", ...) 
+} +\arguments{ +\item{x}{The ranked asssessment data set.} + +\item{ties.method}{A string specifying how ties are treated, see \code{\link[base:rank]{base::rank()}}.} + +\item{...}{Further arguments passed to or from other functions.} +} +\value{ + +} +\description{ +Creates ranking heatmaps from one or more ranked assessment data sets. +} +\examples{ + +} +\seealso{ +\code{browseVignettes("challengeR")} + +Other functions to visualize assessment data: +\code{\link{boxplot.ranked.list}()}, +\code{\link{podium.challenge}()}, +\code{\link{podium.ranked.list}()}, +\code{\link{rankingHeatmap.challenge}()} +} +\concept{functions to visualize assessment data} diff --git a/man/report.bootstrap.list.Rd b/man/report.bootstrap.list.Rd new file mode 100644 index 0000000..df0b71a --- /dev/null +++ b/man/report.bootstrap.list.Rd @@ -0,0 +1,60 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/report.R +\name{report.bootstrap.list} +\alias{report.bootstrap.list} +\title{Generates a benchmarking report with bootstrapping results} +\usage{ +\method{report}{bootstrap.list}( + object, + consensus, + file, + title = "", + colors = default_colors, + format = "PDF", + latex_engine = "pdflatex", + clean = TRUE, + fig.format = NULL, + dpi = 150, + open = TRUE, + ... +) +} +\arguments{ +\item{object}{The ranked (bootstrapped) assessment data set.} + +\item{consensus}{The rank aggregation across tasks (consensus ranking). Only needed for a multi-task data set.} + +\item{file}{A string specifying the file name of the report. It allows for specifying the output file path as well, +otherwise the working directory is used. If \code{file} does not have a file extension, an extension will be automatically +added according to the output format given in \code{format}. If the argument is omitted, the report is created in a +temporary folder with file name "report".} + +\item{title}{A string specifying the title of the report.} + +\item{colors}{The color scheme that is applied to the plots.} + +\item{format}{A string specifying the format of the report. The options are "PDF", "HTML" or "Word".} + +\item{latex_engine}{A string specifying the LaTeX engine for producing PDF output. The Options are "pdflatex", "lualatex", and "xelatex".} + +\item{clean}{A boolean indicating whether intermediate files (e.g. individual plots) should be kept. Using \code{TRUE} will clean +intermediate files that are created during rendering.} + +\item{fig.format}{A vector of strings containing the file format of the figures that are not removed if \code{clean} is set to \code{FALSE}. +The options are "jpeg", "png" and "pdf", e.g. \code{fig.format = c("jpeg", "png", "pdf")}.} + +\item{dpi}{A positive integer specifying the resolution of the generated plot (\code{fig.format} "jpeg" or "png") in dots per inch (DPI).} + +\item{open}{A boolean specifying whether the report should be opened with the default system viewer after generation.} + +\item{...}{Further arguments passed to or from other functions.} +} +\value{ + +} +\description{ +Generates a benchmarking report in PDF, HTML or Word format with bootstrapping results. +It contains the rankings, plots of the raw assessment data and plots of the ranking stability. +For multi-task challenges, it also contains plots of cross-task insights. If you are interested in +the individual plots as separate files, set argument \code{clean} to \code{FALSE} and specify \code{fig.format}. 
+} diff --git a/man/report.ranked.list.Rd b/man/report.ranked.list.Rd new file mode 100644 index 0000000..49131e2 --- /dev/null +++ b/man/report.ranked.list.Rd @@ -0,0 +1,60 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/report.R +\name{report.ranked.list} +\alias{report.ranked.list} +\title{Generates a benchmarking report without bootstrapping results} +\usage{ +\method{report}{ranked.list}( + object, + consensus, + file, + title = "", + colors = default_colors, + format = "PDF", + latex_engine = "pdflatex", + clean = TRUE, + fig.format = NULL, + dpi = 150, + open = TRUE, + ... +) +} +\arguments{ +\item{object}{The ranked assessment data set.} + +\item{consensus}{The rank aggregation across tasks (consensus ranking). Only needed for a multi-task data set.} + +\item{file}{A string specifying the file name of the report. It allows for specifying the output file path as well, +otherwise the working directory is used. If \code{file} does not have a file extension, an extension will be automatically +added according to the output format given in \code{format}. If the argument is omitted, the report is created in a +temporary folder with file name "report".} + +\item{title}{A string specifying the title of the report.} + +\item{colors}{The color scheme that is applied to the plots.} + +\item{format}{A string specifying the format of the report. The options are "PDF", "HTML" or "Word".} + +\item{latex_engine}{A string specifying the LaTeX engine for producing PDF output. The Options are "pdflatex", "lualatex", and "xelatex".} + +\item{clean}{A boolean indicating whether intermediate files (e.g. individual plots) should be kept. Using \code{TRUE} will clean +intermediate files that are created during rendering.} + +\item{fig.format}{A vector of strings containing the file format of the figures that are not removed if \code{clean} is set to \code{FALSE}. +The options are "jpeg", "png" and "pdf", e.g. \code{fig.format = c("jpeg", "png", "pdf")}.} + +\item{dpi}{A positive integer specifying the resolution of the generated plot (\code{fig.format} "jpeg" or "png") in dots per inch (DPI).} + +\item{open}{A boolean specifying whether the report should be opened with the default system viewer after generation.} + +\item{...}{Further arguments passed to or from other functions.} +} +\value{ + +} +\description{ +Generates a benchmarking report in PDF, HTML or Word format without bootstrapping results. +It contains the rankings, plots of the raw assessment data and plots of the ranking stability. +For multi-task challenges, it also contains plots of cross-task insights. If you are interested in +the individual plots as separate files, set argument \code{clean} to \code{FALSE} and specify \code{fig.format}. +} diff --git a/man/significanceMap.ranked.list.Rd b/man/significanceMap.ranked.list.Rd new file mode 100644 index 0000000..b48291b --- /dev/null +++ b/man/significanceMap.ranked.list.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/significanceMap.R +\name{significanceMap.ranked.list} +\alias{significanceMap.ranked.list} +\title{Creates significance maps} +\usage{ +\method{significanceMap}{ranked.list}( + object, + alpha = 0.05, + p.adjust.method = "holm", + order = FALSE, + size.rank = 0.3 * theme_get()$text$size, + ... 
+) +} +\arguments{ +\item{object}{The ranked assessment data set.} + +\item{alpha}{A numeric value specifying the significance level.} + +\item{p.adjust.method}{A string specifying the adjustment method for multiple testing, see \code{\link[stats:p.adjust]{stats::p.adjust()}}.} + +\item{...}{Further arguments passed to or from other functions.} +} +\value{ + +} +\description{ +Creates significance maps from a ranked assessment data set. +} +\examples{ + +} +\seealso{ +\code{browseVignettes("challengeR")} + +Other functions to visualize ranking stability: +\code{\link{methodsplot.challenge}()}, +\code{\link{stabilityByTask.bootstrap.list}()}, +\code{\link{violin.bootstrap.list}()} +} +\concept{functions to visualize ranking stability} diff --git a/man/stability.ranked.list.Rd b/man/stability.ranked.list.Rd new file mode 100644 index 0000000..6471bfe --- /dev/null +++ b/man/stability.ranked.list.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/stability.R +\name{stability.ranked.list} +\alias{stability.ranked.list} +\title{Creates a blob plot across tasks} +\usage{ +\method{stability}{ranked.list}( + x, + ordering, + probs = c(0.025, 0.975), + max_size = 6, + freq = FALSE, + shape = 4, + ... +) +} +\arguments{ +\item{x}{The ranked assessment data set.} + +\item{...}{Further arguments passed to or from other functions.} +} +\value{ + +} +\description{ +Creates a blob plot visualizing the ranking variability across tasks. +} +\examples{ + +} +\seealso{ +\code{browseVignettes("challengeR")} + +Other functions to visualize cross-task insights: +\code{\link{dendrogram.ranked.list}()}, +\code{\link{stabilityByAlgorithm.bootstrap.list}()}, +\code{\link{stabilityByTask.bootstrap.list}()} +} +\concept{functions to visualize cross-task insights} diff --git a/man/stabilityByAlgorithm.bootstrap.list.Rd b/man/stabilityByAlgorithm.bootstrap.list.Rd new file mode 100644 index 0000000..e0fb356 --- /dev/null +++ b/man/stabilityByAlgorithm.bootstrap.list.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/stability.R +\name{stabilityByAlgorithm.bootstrap.list} +\alias{stabilityByAlgorithm.bootstrap.list} +\title{Creates blob plots or stacked frequency plots stratified by algorithm} +\usage{ +\method{stabilityByAlgorithm}{bootstrap.list}( + x, + ordering, + stacked = FALSE, + probs = c(0.025, 0.975), + max_size = 3, + shape = 4, + freq = FALSE, + single = FALSE, + ... +) +} +\arguments{ +\item{x}{The bootstrapped, ranked assessment data set.} + +\item{stacked}{A boolean specifying whether a stacked frequency plot (\code{stacked = TRUE}) or blob plot (\code{stacked = FALSE}) should be created.} + +\item{...}{Further arguments passed to or from other functions.} +} +\value{ + +} +\description{ +Creates blob plots (\code{stacked = FALSE}) or stacked frequency plots (\code{stacked = TRUE}) for each algorithm +from a bootstrapped, ranked assessment data set. 
+} +\examples{ + +} +\seealso{ +\code{browseVignettes("challengeR")} + +Other functions to visualize cross-task insights: +\code{\link{dendrogram.ranked.list}()}, +\code{\link{stability.ranked.list}()}, +\code{\link{stabilityByTask.bootstrap.list}()} +} +\concept{functions to visualize cross-task insights} diff --git a/man/stabilityByTask.bootstrap.list.Rd b/man/stabilityByTask.bootstrap.list.Rd new file mode 100644 index 0000000..d2d4c24 --- /dev/null +++ b/man/stabilityByTask.bootstrap.list.Rd @@ -0,0 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/stability.R +\name{stabilityByTask.bootstrap.list} +\alias{stabilityByTask.bootstrap.list} +\title{Creates blob plots stratified by task} +\usage{ +\method{stabilityByTask}{bootstrap.list}( + x, + ordering, + probs = c(0.025, 0.975), + max_size = 3, + size.ranks = 0.3 * theme_get()$text$size, + shape = 4, + showLabelForSingleTask = FALSE, + ... +) +} +\arguments{ +\item{x}{The bootstrapped, ranked assessment data set.} + +\item{showLabelForSingleTask}{A boolean specifying whether the task name should be used as title for a single-task data set.} + +\item{...}{Further arguments passed to or from other functions.} +} +\value{ + +} +\description{ +Creates blob plots for each task from a bootstrapped, ranked assessment data set. +} +\examples{ + +} +\seealso{ +\code{browseVignettes("challengeR")} + +Other functions to visualize ranking stability: +\code{\link{methodsplot.challenge}()}, +\code{\link{significanceMap.ranked.list}()}, +\code{\link{violin.bootstrap.list}()} + +Other functions to visualize cross-task insights: +\code{\link{dendrogram.ranked.list}()}, +\code{\link{stability.ranked.list}()}, +\code{\link{stabilityByAlgorithm.bootstrap.list}()} +} +\concept{functions to visualize cross-task insights} +\concept{functions to visualize ranking stability} diff --git a/man/subset.bootstrap.list.Rd b/man/subset.bootstrap.list.Rd new file mode 100644 index 0000000..f12d99f --- /dev/null +++ b/man/subset.bootstrap.list.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/subset.R +\name{subset.bootstrap.list} +\alias{subset.bootstrap.list} +\title{Extracts a subset of algorithms or tasks} +\usage{ +\method{subset}{bootstrap.list}(x, top, tasks, ...) +} +\arguments{ +\item{x}{The bootstrapped, ranked asssessment data set.} + +\item{top}{A positive integer specifying the amount of top performing algorithms to be retrieved.} + +\item{tasks}{A vector of strings containing the task identifiers that should remain in the subset.} + +\item{...}{Further arguments passed to or from other functions.} +} +\value{ +An S3 object of class "bootstrap.list" to represent a bootstrapped, ranked assessment data set. +} +\description{ +Extracts the top performing algorithms or a subset of tasks. +} +\section{Reports for subsets (top list) of algorithms}{ + +If ties are present in the ranking, the subset will consist of more than \code{top} algorithms. +Line plots for ranking robustness can be used to check whether algorithms performing well in other +ranking methods are excluded. Bootstrapping still takes entire uncertainty into account. +Podium plots and ranking heatmaps neglect excluded algorithms. Only available for single-task challenges +(for multi-task challenges not sensible because each task would contain a different set of algorithms). +} + +\section{Reports for subsets of tasks}{ + +You may want to recompute the consensus ranking after creating the subset. 
An error will be raised +if a task identifier is not contained in the assessment data set to avoid subsequent errors. +} + +\examples{ + +\dontrun{ + # only show the top 3 algorithms according to the chosen ranking method + subset(ranking_bootstrapped, top = 3) \%>\% report(...) +} + +\dontrun{ + # restrict report to tasks "task1" and "task2" and recompute consensus ranking + meanRanks <- subset(ranking, tasks = c("task1", "task2")) \%>\% consensus(method = "euclidean") +} + +} diff --git a/man/subset.ranked.list.Rd b/man/subset.ranked.list.Rd new file mode 100644 index 0000000..6203ab1 --- /dev/null +++ b/man/subset.ranked.list.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/subset.R +\name{subset.ranked.list} +\alias{subset.ranked.list} +\title{Extracts a subset of algorithms or tasks} +\usage{ +\method{subset}{ranked.list}(x, top, tasks, ...) +} +\arguments{ +\item{x}{The ranked asssessment data set.} + +\item{top}{A positive integer specifying the amount of top performing algorithms to be retrieved.} + +\item{tasks}{A vector of strings containing the task identifiers that should remain in the subset.} + +\item{...}{Further arguments passed to or from other functions.} +} +\value{ +An S3 object of class "ranked.list" to represent a ranked assessment data set. +} +\description{ +Extracts the top performing algorithms or a subset of tasks. +} +\section{Reports for subsets (top list) of algorithms}{ + +If ties are present in the ranking, the subset will consist of more than \code{top} algorithms. +Line plots for ranking robustness can be used to check whether algorithms performing well in other +ranking methods are excluded. Bootstrapping still takes entire uncertainty into account. +Podium plots and ranking heatmaps neglect excluded algorithms. Only available for single-task challenges +(for multi-task challenges not sensible because each task would contain a different set of algorithms). +} + +\section{Reports for subsets of tasks}{ + +You may want to recompute the consensus ranking after creating the subset. An error will be raised +if a task identifier is not contained in the assessment data set to avoid subsequent errors. +} + +\examples{ + +\dontrun{ + # only show the top 3 algorithms according to the chosen ranking method + subset(ranking, top = 3) \%>\% report(...) +} + +\dontrun{ + # restrict report to tasks "task1" and "task2" + subset(ranking, tasks=c("task1", "task2")) \%>\% report(...) +} + +} diff --git a/man/testThenRank.Rd b/man/testThenRank.Rd new file mode 100644 index 0000000..fa1f17c --- /dev/null +++ b/man/testThenRank.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rankingMethods.R +\name{testThenRank} +\alias{testThenRank} +\title{Performs ranking via test-then-rank} +\usage{ +testThenRank(object, ties.method = "min", ...) +} +\arguments{ +\item{object}{The challenge object.} + +\item{ties.method}{A string specifying how ties are treated, see \code{\link[base:rank]{base::rank()}}.} + +\item{...}{Further arguments passed to or from other functions.} +} +\value{ +An S3 object of class "ranked.list" to represent a ranked assessment data set. +} +\description{ +Computes statistical hypothesis tests based on Wilcoxon signed rank test for each possible +pair of algorithms to assess differences in metric values between the algorithms. +Then ranking is performed according to the number of significant one-sided test results. 
+If algorithms have the same number of significant test results, then they obtain the same rank. +} +\examples{ +\dontrun{ + testThenRank(challenge, + alpha=0.05, # significance level + p.adjust.method="none", # method for adjustment for multiple testing, see ?p.adjust + na.treat = 0) +} + +} +\seealso{ +Other ranking functions: +\code{\link{aggregateThenRank}()}, +\code{\link{rankThenAggregate}()} +} +\concept{ranking functions} diff --git a/man/violin.bootstrap.list.Rd b/man/violin.bootstrap.list.Rd new file mode 100644 index 0000000..87d90e2 --- /dev/null +++ b/man/violin.bootstrap.list.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/violin.R +\name{violin.bootstrap.list} +\alias{violin.bootstrap.list} +\title{Creates a violin plot} +\usage{ +\method{violin}{bootstrap.list}(x, ...) +} +\arguments{ +\item{x}{The bootstrapped, ranked assessment data set.} + +\item{...}{Further arguments passed to or from other functions.} +} +\value{ + +} +\description{ +Creates a violin plot from a bootstrapped, ranked assessment data set. +} +\examples{ + +} +\seealso{ +\code{browseVignettes("challengeR")} + +Other functions to visualize ranking stability: +\code{\link{methodsplot.challenge}()}, +\code{\link{significanceMap.ranked.list}()}, +\code{\link{stabilityByTask.bootstrap.list}()} +} +\concept{functions to visualize ranking stability} diff --git a/tests/testthat/test-blobPlotStabilityAcrossTasks.R b/tests/testthat/test-blobPlotStabilityAcrossTasks.R index ffadb9f..6b90733 100644 --- a/tests/testthat/test-blobPlotStabilityAcrossTasks.R +++ b/tests/testthat/test-blobPlotStabilityAcrossTasks.R @@ -1,46 +1,78 @@ test_that("blob plot for visualizing ranking stability across tasks raises error for single-task data set", { data <- rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C1"), data.frame(algo="A3", value=0.4, case="C1"), data.frame(algo="A1", value=0.2, case="C2"), data.frame(algo="A2", value=0.1, case="C2"), data.frame(algo="A3", value=0.0, case="C2")) challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE) ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") expect_error(stability(ranking), "The stability of rankings across tasks cannot be computed for less than two tasks.", fixed=TRUE) }) test_that("blob plot for visualizing ranking stability across tasks returns one plot for multi-task data set", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C1"), data.frame(algo="A3", value=0.4, case="C1"), data.frame(algo="A1", value=0.2, case="C2"), data.frame(algo="A2", value=0.1, case="C2"), data.frame(algo="A3", value=0.0, case="C2") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=0.2, case="C1"), data.frame(algo="A2", value=0.3, case="C1"), data.frame(algo="A3", value=0.4, case="C1"), data.frame(algo="A1", value=0.7, case="C2"), data.frame(algo="A2", value=0.8, case="C2"), data.frame(algo="A3", value=0.9, case="C2") )) data <- rbind(dataTask1, dataTask2) challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE) ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") actualPlot <- stability(ranking) expect_is(actualPlot, "ggplot") }) + +test_that("blob plot for visualizing ranking stability across tasks returns one plot for multi-task data set when consensus ranking is given", 
{ + dataTask1 <- cbind(task="T1", + rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2") + )) + dataTask2 <- cbind(task="T2", + rbind( + data.frame(algo="A1", value=0.2, case="C1"), + data.frame(algo="A2", value=0.3, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.7, case="C2"), + data.frame(algo="A2", value=0.8, case="C2"), + data.frame(algo="A3", value=0.9, case="C2") + )) + + data <- rbind(dataTask1, dataTask2) + + challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + meanRanks <- ranking%>%consensus(method = "euclidean") + + actualPlot <- stability(ranking, ordering = names(meanRanks)) + expect_is(actualPlot, "ggplot") +}) diff --git a/tests/testthat/test-blobPlotStabilityByAlgorithm.R b/tests/testthat/test-blobPlotStabilityByAlgorithm.R index d0b0e62..c3f9195 100644 --- a/tests/testthat/test-blobPlotStabilityByAlgorithm.R +++ b/tests/testthat/test-blobPlotStabilityByAlgorithm.R @@ -1,52 +1,90 @@ test_that("blob plot for visualizing ranking stability by algorithm raises error for single-task data set", { data <- rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C1"), data.frame(algo="A3", value=0.4, case="C1"), data.frame(algo="A1", value=0.2, case="C2"), data.frame(algo="A2", value=0.1, case="C2"), data.frame(algo="A3", value=0.0, case="C2")) challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE) ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") set.seed(1) rankingBootstrapped <- ranking%>%bootstrap(nboot=10) expect_error(stabilityByAlgorithm(rankingBootstrapped), "The stability of rankings by algorithm cannot be computed for less than two tasks.", fixed=TRUE) }) test_that("blob plot for visualizing ranking stability by algorithm returns one plot for multi-task data set", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C1"), data.frame(algo="A3", value=0.4, case="C1"), data.frame(algo="A1", value=0.2, case="C2"), data.frame(algo="A2", value=0.1, case="C2"), data.frame(algo="A3", value=0.0, case="C2") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=0.2, case="C1"), data.frame(algo="A2", value=0.3, case="C1"), data.frame(algo="A3", value=0.4, case="C1"), data.frame(algo="A1", value=0.7, case="C2"), data.frame(algo="A2", value=0.8, case="C2"), data.frame(algo="A3", value=0.9, case="C2") )) data <- rbind(dataTask1, dataTask2) challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE) ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") set.seed(1) rankingBootstrapped <- ranking%>%bootstrap(nboot=10) actualPlot <- stabilityByAlgorithm(rankingBootstrapped) expect_is(actualPlot, "ggplot") }) + +test_that("blob plot for visualizing ranking stability by algorithm returns a plot for each algorithm", { + dataTask1 <- cbind(task="T1", + rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", 
value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2") + )) + dataTask2 <- cbind(task="T2", + rbind( + data.frame(algo="A1", value=0.2, case="C1"), + data.frame(algo="A2", value=0.3, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.7, case="C2"), + data.frame(algo="A2", value=0.8, case="C2"), + data.frame(algo="A3", value=0.9, case="C2") + )) + + data <- rbind(dataTask1, dataTask2) + + challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + set.seed(1) + rankingBootstrapped <- ranking%>%bootstrap(nboot=10) + + meanRanks <- ranking%>%consensus(method = "euclidean") + + actualPlot <- stabilityByAlgorithm(rankingBootstrapped, ordering = names(meanRanks), single = TRUE) + expect_equal(length(actualPlot), 3) + expect_is(actualPlot[[1]], "ggplot") + expect_is(actualPlot[[2]], "ggplot") + expect_is(actualPlot[[3]], "ggplot") +}) diff --git a/tests/testthat/test-stackedFrequencyPlotStabilityByAlgorithm.R b/tests/testthat/test-stackedFrequencyPlotStabilityByAlgorithm.R new file mode 100644 index 0000000..922c577 --- /dev/null +++ b/tests/testthat/test-stackedFrequencyPlotStabilityByAlgorithm.R @@ -0,0 +1,90 @@ +test_that("stacked frequency plot for visualizing ranking stability by algorithm raises error for single-task data set", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2")) + + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + set.seed(1) + rankingBootstrapped <- ranking%>%bootstrap(nboot=10) + + expect_error(stabilityByAlgorithm(rankingBootstrapped, stacked = TRUE), + "The stability of rankings by algorithm cannot be computed for less than two tasks.", fixed=TRUE) +}) + +test_that("stacked frequency plot for visualizing ranking stability by algorithm returns one plot for multi-task data set", { + dataTask1 <- cbind(task="T1", + rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2") + )) + dataTask2 <- cbind(task="T2", + rbind( + data.frame(algo="A1", value=0.2, case="C1"), + data.frame(algo="A2", value=0.3, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.7, case="C2"), + data.frame(algo="A2", value=0.8, case="C2"), + data.frame(algo="A3", value=0.9, case="C2") + )) + + data <- rbind(dataTask1, dataTask2) + + challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + set.seed(1) + rankingBootstrapped <- ranking%>%bootstrap(nboot=10) + + actualPlot <- stabilityByAlgorithm(rankingBootstrapped, stacked = TRUE) + expect_is(actualPlot, "ggplot") +}) + +test_that("stacked frequency plot for visualizing ranking stability by algorithm returns a plot for each algorithm", { + dataTask1 <- 
cbind(task="T1", + rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2") + )) + dataTask2 <- cbind(task="T2", + rbind( + data.frame(algo="A1", value=0.2, case="C1"), + data.frame(algo="A2", value=0.3, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.7, case="C2"), + data.frame(algo="A2", value=0.8, case="C2"), + data.frame(algo="A3", value=0.9, case="C2") + )) + + data <- rbind(dataTask1, dataTask2) + + challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + set.seed(1) + rankingBootstrapped <- ranking%>%bootstrap(nboot=10) + + meanRanks <- ranking%>%consensus(method = "euclidean") + + actualPlot <- stabilityByAlgorithm(rankingBootstrapped, ordering = names(meanRanks), stacked = TRUE, single = TRUE) + expect_equal(length(actualPlot), 3) + expect_is(actualPlot[[1]], "ggplot") + expect_is(actualPlot[[2]], "ggplot") + expect_is(actualPlot[[3]], "ggplot") +}) diff --git a/vignettes/visualizations.Rmd b/vignettes/visualizations.Rmd new file mode 100644 index 0000000..c5da0fb --- /dev/null +++ b/vignettes/visualizations.Rmd @@ -0,0 +1,166 @@ +--- +title: "Visualizations" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Visualizations} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +```{r setup} +library(challengeR) +``` + +The package offers an intuitive way to gain important insights into the relative and absolute performance of algorithms. It enables you to generate a benchmarking report that contains visualizations and respective explanations. An overview of all available visualizations is provided on this page demonstrating the use of their corresponding plot functions. This might be of interest if you want to generate the plots separately (e.g. to apply other styles). + +The provided plots are described in the following sections: + +* Visualizing assessment data +* Visualizing ranking stability +* Visualizing cross-task insights + +Details can be found in [Wiesenfarth et al. (2019)](https://arxiv.org/abs/1910.05121). + +# Visualizing assessment data + +```{r} +data <- read.csv(system.file("extdata", "data_matrix.csv", package = "challengeR", mustWork = TRUE)) + +challenge <- as.challenge(data, by = "task", algorithm = "alg_name", case = "case", value = "value", smallBetter = FALSE) + +ranking <- challenge%>%aggregateThenRank(FUN = mean, ties.method = "min") +``` + +## Dot- and boxplots +Dot- and boxplots visualize the assessment data separately for each algorithm. Boxplots representing descriptive statistics for all test cases +(median, quartiles and outliers) are combined with horizontally jittered dots representing individual test cases. +```{r} +boxplot(ranking) +``` + +## Podium plots +Upper part of the podium plot: Algorithms are color-coded, and each colored dot in the plot represents a performance value achieved with the respective algorithm. The actual value is encoded by the y-axis. Each podium (here: $p = 5$) represents one possible rank, ordered from best (1) to worst (here: 5). The assignment of values (i.e. 
colored dots) to one of the podiums is based on the rank that the respective algorithm achieved on the corresponding test case. Note that the plot part above each podium place is further subdivided into $p$ “columns”, where each column represents one algorithm. Dots corresponding to identical test cases are connected by a line, producing the spaghetti structure shown here. Lower part: Bar charts represent the relative frequency with which each algorithm actually achieves the rank encoded by the podium place. +```{r} +# The podium plot is not available as an encapsulated function yet. +# podium(challenge, xlab = "Podium", ylab = "Metric value") +``` + +## Ranking heatmaps +In a ranking heatmap, each cell $\left( i, A_j \right)$ shows the absolute frequency of cases in which algorithm $A_j$ achieved rank $i$. +```{r} +rankingHeatmap(ranking) +``` + +# Visualizing ranking stability + +The ranking robustness can be analyzed with respect to the ranking method used (see [Wiesenfarth et al. (2019)](https://arxiv.org/abs/1910.05121) for different ranking methods). + +## Line plots +Line plots visualize the robustness of ranking across different ranking methods. Each algorithm is represented by one colored line. +For each ranking method encoded on the x-axis, the height of the line represents the corresponding rank. Horizontal lines indicate identical ranks for all methods. +```{r, fig.width = 7} +methodsplot(challenge) +``` + +For a specific ranking method, the ranking stability can be investigated via bootstrapping and the testing approach. + +A ranking object containing the bootstrap samples has to be created, which serves as the basis for the plots. + +```{r, results = "hide"} +set.seed(1) +rankingBootstrapped <- ranking%>%bootstrap(nboot = 1000) +``` + +## Blob plots + +Blob plots for visualizing ranking stability are based on bootstrap sampling. Algorithms are color-coded, and the area of each blob at position $\left( A_i, \text{rank } j \right)$ is proportional to the relative frequency with which $A_i$ achieved rank $j$ (here across $b = 1000$ bootstrap samples). The median rank for each algorithm is indicated by a black cross. 95% bootstrap intervals across bootstrap samples (ranging from the 2.5th to +the 97.5th percentile of the bootstrap distribution) are indicated by black lines. + +```{r, fig.width = 7} +stabilityByTask(rankingBootstrapped) +``` + +## Violin plots + +Violin plots provide a more condensed way to analyze bootstrap results. In these plots, the focus is on comparing the ranking list computed on the full assessment data with the ranking lists computed on the individual bootstrap samples. Kendall’s $\tau$ is chosen for comparison as it has an upper and lower bound (+1/-1). Kendall’s $\tau$ is computed for each pair of rankings, and a violin plot that simultaneously depicts a boxplot and a density plot is generated from the results. + +```{r, results = "hide"} +violin(rankingBootstrapped) +``` + +## Significance maps + +Significance maps visualize ranking stability based on statistical significance. They depict incidence matrices of pairwise significant test results for the one-sided Wilcoxon signed rank test at a 5% significance level with adjustment for multiple testing according to Holm. Yellow shading indicates that performance values of the algorithm on the x-axis are significantly superior to those from the algorithm on the y-axis, blue color indicates no significant difference. 
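+Both the significance level and the adjustment for multiple testing can be adapted via the `alpha` and `p.adjust.method` arguments (see the `significanceMap.ranked.list` documentation). A minimal, non-evaluated sketch, assuming the `ranking` object created above:
+
+```{r, eval = FALSE}
+# Sketch: a stricter 1% level with Bonferroni adjustment instead of the Holm default
+significanceMap(ranking, alpha = 0.01, p.adjust.method = "bonferroni")
+```
+
+The default call (5% level, Holm adjustment) is: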
+
+```{r}
+significanceMap(ranking)
+```
+
+# Visualizing cross-task insights
+
+For cross-task insights, a consensus ranking (rank aggregation across tasks) has to be given additionally. The consensus ranking according to mean ranks across tasks is computed here.
+
+```{r}
+meanRanks <- ranking%>%consensus(method = "euclidean")
+```
+
+## Characterization of algorithms
+
+The primary goal of most multi-task challenges is to identify methods that consistently outperform competing algorithms across all tasks. We propose the following methods for analyzing this:
+
+### Blob plots visualizing the ranking variability across tasks
+
+Blob plots visualize the distribution of ranks across tasks. All ranks that an algorithm achieved in any task are displayed along the y-axis, with the area of the blob being proportional to the frequency. If all tasks provided the same stable ranking, narrow intervals around the diagonal would be expected. Consensus rankings above the algorithm names highlight the presence of ties.
+
+```{r, fig.width = 5, fig.height = 4}
+stability(ranking, ordering = names(meanRanks))
+```
+
+### Blob plots visualizing the ranking variability based on bootstrapping
+
+This variant of the blob plot approach involves replacing the algorithms on the x-axis with the tasks and then generating a separate plot for each algorithm. This allows assessing the variability of rankings for each algorithm across multiple tasks and bootstrap samples. Here, color coding is used for the tasks, and the separation by algorithm enables a relatively straightforward strengths-weaknesses analysis for individual methods.
+
+```{r, fig.width = 7, fig.height = 5}
+stabilityByAlgorithm(rankingBootstrapped, ordering = names(meanRanks))
+```
+
+### Stacked frequency plots visualizing the ranking variability based on bootstrapping
+
+An alternative representation is provided by a stacked frequency plot of the observed ranks, separated by algorithm. Observed ranks across bootstrap samples are displayed with coloring according to the task. For algorithms that achieve the same rank in different tasks on the full assessment data set, the vertical lines are on top of each other. The vertical lines allow comparing the rank that each algorithm achieved across the different tasks.
+
+```{r, fig.width = 7, fig.height = 5}
+stabilityByAlgorithm(rankingBootstrapped, ordering = names(meanRanks), stacked = TRUE)
+```
+
+## Characterization of tasks
+
+It may also be useful to structure the analysis around the different tasks. This section proposes visualizations to analyze and compare the tasks of a competition.
+
+### Blob plots visualizing bootstrap results
+
+Bootstrap results can be shown in blob plots, with one plot for each task. Algorithms should be ordered according to
+the consensus ranking. In this view, the spread of the blobs for each algorithm can be compared across tasks. Deviations from the diagonal indicate deviations from the consensus ranking (over tasks). Specifically, if the rank distribution of an algorithm is consistently below the diagonal, the algorithm performed better in this task than on average across tasks, while if the rank distribution of an algorithm is consistently above the diagonal, the algorithm performed worse in this task than on average across tasks. At the bottom of each panel, the ranks of each algorithm in the respective task are provided.
+
+```{r, fig.width = 7, fig.height = 3.5}
+stabilityByTask(rankingBootstrapped, ordering = names(meanRanks))
+```
+
+### Violin plots visualizing bootstrap results
+
+To obtain a more condensed visualization, violin plots (see above) can be applied separately to all tasks. The overall stability of the rankings can then be compared by assessing the locations and lengths of the violins.
+
+### Cluster analysis
+
+There is increasing interest in assessing the similarity of the tasks, e.g., for pre-training a machine learning algorithm. A potential approach to this could involve comparing the rankings obtained for the different tasks of a challenge. Given that the same teams participate in all tasks, it may be of interest to cluster tasks into groups for which the rankings of algorithms are similar and to identify tasks which lead to very dissimilar rankings of algorithms. To enable such an analysis, we propose the generation of a dendrogram from hierarchical cluster analysis. The dendrogram depicts clusters according to a chosen distance measure (Spearman’s footrule) as well as a chosen agglomeration method (complete agglomeration).
+
+
+```{r, fig.width = 7, fig.height = 3.5}
+dendrogram(ranking)
+```
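+
+# Adjusting plot styles
+
+As mentioned in the introduction, the plot functions can also be used to generate the figures separately and to apply other styles. The following chunk is only a minimal sketch: it assumes that `boxplot()` for a ranked assessment data set accepts styling arguments such as the dot color (`color`) and the jitter width (`jitter.width`); the chosen values are purely illustrative.
+
+```{r, fig.width = 7}
+# Minimal styling sketch (assumption: the boxplot method for ranked assessment
+# data forwards `color` and `jitter.width` to the jittered dot layer).
+boxplot(ranking, color = "black", jitter.width = 0.1)
+```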