diff --git a/NAMESPACE b/NAMESPACE index de641e4..fda212d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,235 +1,209 @@ #exportPattern("^[[:alpha:]]+") export( - "aggregate.bootstrap", "aggregate.bootstrap.list", "aggregate.challenge", "aggregate.ranked", "aggregate.ranked.list", - "aggregateList", #? - "aggregateThenRank", + "aggregate.bootstrap", "aggregate.bootstrap.list", "aggregate.challenge", "aggregate.ranked.list", + "aggregateThenRank", "as.challenge", "as.relation.challenge.incidence", "as.relation.ranked.list", - "bootstrap", "bootstrap.ranked", "bootstrap.ranked.list", #"bootstrap.default", - "boxplot.ranked", "boxplot.ranked.list", "boxplot.bootstrap.list", "boxplot.comparedRanks.list", - #"check_strict_preference", + "bootstrap", "bootstrap.ranked.list", #"bootstrap.default", + "boxplot.ranked.list", "boxplot.bootstrap.list", "boxplot.comparedRanks.list", + #"check_strict_preference", "compareRanks", "compareRanks.ranked", "compareRanks.ranked.list", # "compareRanks.default", - "consensus", "consensus.ranked.list", #"consensus.default", + "consensus", "consensus.ranked.list", #"consensus.default", "Decision", "decision.challenge", - "blobplot","blobplot.ranked", "default_colors", - "density.bootstrap.list", - "extract.workflow", - "kendall", "kendall.bootstrap.list", - # "melt.aggregated.list", "melt.ranked", "melt.ranked.list", - # "merge.list", - "lineplot.challenge", - "methodsplot","methodsplot.challenge","methodsplot.ranked", + "density.bootstrap.list", + "extract.workflow", + "kendall", "kendall.bootstrap.list", + # "merge.list", + "lineplot.challenge", + "methodsplot","methodsplot.challenge", "network", "plot.network", - "podium", "podium.ranked","podium.ranked.list", "podium.challenge",#"podium.default", - "print.aggregated", "print.aggregated.list", "print.comparedRanks", "print.ranked", "print.ranked.list", - #"quickmerge.list", - "rank", "rank.aggregated", "rank.aggregated.list", "rank.aggregatedRanks", "rank.aggregatedRanks.list", "rank.challenge", #"rank.default", - "rankFrequencies", "rankFrequencies.bootstrap", "rankFrequencies.bootstrap.list", #"rankFrequencies.default", + "podium","podium.ranked.list", "podium.challenge",#"podium.default", + "print.aggregated", "print.aggregated.list", "print.comparedRanks", "print.ranked", "print.ranked.list", + #"quickmerge.list", + "rank", "rank.aggregated.list", "rank.aggregatedRanks.list", "rank.challenge", #"rank.default", + "rankFrequencies", "rankFrequencies.bootstrap", "rankFrequencies.bootstrap.list", #"rankFrequencies.default", #"rankNA2", "rankThenAggregate", - "rankingHeatmap","rankingHeatmap.ranked","rankingHeatmap.ranked.list", "relation_dissimilarity.ranked.list", - "report", "report.bootstrap", "report.bootstrap.list", - "second", + "rankingHeatmap", "rankingHeatmap.ranked.list", "relation_dissimilarity.ranked.list", + "report", "report.bootstrap.list", + "second", "select.if", "select.if.aggregated.list", "select.if.comparedRanks.list", "select.if.list", "select.if.ranked.list", - "significanceMap", - "spearmansFootrule", "spearmansWeightedFootrule", - "splitby", + "significanceMap", + "spearmansFootrule", "spearmansWeightedFootrule", + "splitby", "stability", "stabilityByAlgorithm", "stabilityByAlgorithmStacked","stabilityByTask", - "stability.ranked.list", "stability.bootstrap","relation_dissimilarity", - "stabilityByAlgorithm.bootstrap.list", + "stability.ranked.list", "relation_dissimilarity", + "stabilityByAlgorithm.bootstrap.list", "stabilityByAlgorithmStacked.bootstrap.list", 
"stabilityByTask.bootstrap.list", - "subset.aggregated.list", "subset.comparedRanks.list", "subset.list", "subset.ranked.list", "subset.bootstrap.list", "subset.ranked","subset.bootstrap",#"which.top", - "test", "test.challenge", "test.default", + "subset.aggregated.list", "subset.comparedRanks.list", "subset.list", "subset.ranked.list", "subset.bootstrap.list", #"which.top", + "taskSubset.ranked.list", "taskSubset.bootstrap.list", + "test", "test.challenge", "test.default", "testThenRank", - "violin", "violin.bootstrap.list","violin.bootstrap", - "winner", "winner.bootstrap", "winner.bootstrap.list", "winner.default", "winner.ranked", "winner.ranked.list", + "violin", "violin.bootstrap.list", + "winner", "winner.bootstrap", "winner.bootstrap.list", "winner.default", "winner.ranked", "winner.ranked.list", "winnerFrequencies", "winnerFrequencies.bootstrap", "winnerFrequencies.bootstrap.list", "winnerFrequencies.default" ) - - + + importFrom("dplyr", "bind_rows","group_by","summarise","select_if","filter","mutate","right_join","anti_join","ungroup","arrange","desc") importFrom("rlang",":=",".data","!!") importFrom("reshape2","melt", "acast") importFrom("utils", "capture.output", "methods") importFrom("plyr", "llply") importFrom("knitr", "kable") importFrom("tidyr", "complete","expand") importFrom("purrr", "%>%") importFrom("rmarkdown", "render","word_document","pdf_document","html_document") importFrom("viridisLite", "viridis","cividis") importFrom("ggplot2", "aes","aes_string","geom_abline", "geom_bar", "geom_boxplot", "geom_count", "geom_density", "geom_jitter", "geom_line", "geom_point", "geom_raster", "geom_step", "geom_text", "geom_violin","annotate","guide_legend", "geom_vline", "ggplot", "ggtitle","vars","xlab","ylab","scale_size_area","theme_get","rel","geom_hline","ggplot_build","scale_fill_manual", "scale_y_continuous","coord_cartesian", "element_text", "facet_wrap", "position_jitter", "stat", "stat_summary", "theme", "unit","guides","scale_fill_viridis_c") importFrom("grDevices", "col2rgb", "gray", "rgb", "grey") importFrom("graphics", "abline", "axis", "barplot", "box", "layout", "legend", "par", "plot", "points", "segments","boxplot", "stripchart", "title", "grconvertX", "plot.new") importFrom("stats", "as.dist", "as.formula", "median", "p.adjust", "density", "quantile", "aggregate", "cor", "wilcox.test", "terms.formula", "complete.cases") importFrom("methods", "new") importFrom("relations","relation","as.relation", "relation_domain", "relation_incidence", "relation_is_asymmetric","relation_consensus","relation_ensemble", "relation_is_irreflexive", "relation_is_negatively_transitive", "relation_is_transitive", "relation_is_trichotomous", "relation_scores", "relation_violations","relation_dissimilarity") importFrom("graph", "addEdge") S3method(print, comparedRanks) S3method(print, aggregated) S3method(print, ranked) S3method(print, aggregated.list) S3method(print, ranked.list) S3method(aggregate, challenge) -S3method(aggregate, ranked) S3method(aggregate, ranked.list) S3method(aggregate, bootstrap.list) S3method(aggregate, bootstrap) S3method(test, default) S3method(test, challenge) S3method(Aggregate, default) S3method(Aggregate, data.frame) S3method(Aggregate, list) S3method(Rank, default) -S3method(Rank, data.frame) S3method(Rank, list) S3method(rank, default) S3method(rank, challenge) -S3method(rank, aggregated) S3method(rank, aggregated.list) -S3method(rank, aggregatedRanks) S3method(rank, aggregatedRanks.list) -S3method(blobplot, default) -S3method(blobplot, ranked) - 
S3method(bootstrap, default) -S3method(bootstrap, ranked) S3method(bootstrap, ranked.list) S3method(winner, default) S3method(winner, ranked) S3method(winner, ranked.list) S3method(winner, bootstrap) S3method(winner, bootstrap.list) S3method(rankFrequencies, default) S3method(rankFrequencies, bootstrap) S3method(rankFrequencies, bootstrap.list) S3method(winnerFrequencies, default) S3method(winnerFrequencies, bootstrap) S3method(winnerFrequencies, bootstrap.list) S3method(compareRanks,default) S3method(compareRanks,ranked) S3method(compareRanks,ranked.list) S3method(merge,list) S3method(melt,ranked.list) -S3method(melt,ranked) S3method(melt,aggregated.list) -S3method(boxplot,ranked) S3method(boxplot,ranked.list) S3method(boxplot,comparedRanks.list) S3method(boxplot,bootstrap.list) S3method(select.if,default) S3method(select.if,list) S3method(select.if,aggregated.list) S3method(select.if,ranked.list) S3method(select.if,comparedRanks.list) S3method(subset,list) S3method(subset,bootstrap.list) S3method(subset,aggregated.list) S3method(subset,ranked.list) S3method(subset,comparedRanks.list) -S3method(subset,ranked) -S3method(subset,bootstrap) +S3method(taskSubset,ranked.list) +S3method(taskSubset,bootstrap.list) S3method(podium,default) S3method(podium,challenge) -S3method(podium,ranked) S3method(podium,ranked.list) S3method(network,default) S3method(network,ranked.list) S3method(network,dist) S3method(plot,network) S3method(density,bootstrap.list) S3method(as.relation,challenge.incidence) S3method(as.relation,ranked.list) S3method(subset,bootstrap.list) S3method(subset,ranked.list) S3method(subset,list) S3method(subset,comparedRanks.list) S3method(subset,aggregated.list) - + S3method(decision,challenge) S3method(decision,default) S3method(lineplot,challenge) S3method(lineplot,default) S3method(methodsplot,challenge) -S3method(methodsplot,ranked) S3method(methodsplot,default) S3method(significanceMap,data.frame) -S3method(significanceMap,ranked) S3method(significanceMap,ranked.list) S3method(significanceMap,default) - - S3method(violin,bootstrap.list) -S3method(violin,bootstrap) S3method(violin,default) - -#S3method(rankingHeatmap,challenge) -S3method(rankingHeatmap,ranked) S3method(rankingHeatmap,ranked.list) S3method(rankingHeatmap,default) S3method(relation_dissimilarity,ranked.list) S3method(relation_dissimilarity,default) - + S3method(stabilityByTask,bootstrap.list) S3method(stabilityByTask,default) -S3method(stability,bootstrap) S3method(stability,default) S3method(stability,ranked.list) S3method(stabilityByAlgorithm,bootstrap.list) S3method(stabilityByAlgorithm,default) S3method(stabilityByAlgorithmStacked,bootstrap.list) S3method(stabilityByAlgorithmStacked,default) S3method(consensus,ranked.list) S3method(consensus,default) - + S3method(report,bootstrap.list) -S3method(report,ranked) S3method(report,ranked.list) -S3method(report,bootstrap) S3method(report,default) - - diff --git a/R/Aggregate.R b/R/Aggregate.R index ed50eca..a93f190 100644 --- a/R/Aggregate.R +++ b/R/Aggregate.R @@ -1,167 +1,163 @@ Aggregate <- function(object,...) UseMethod("Aggregate") Aggregate.default <- function(object, ...) aggregate(object,...) 
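The NAMESPACE hunk above removes the single-object methods (aggregate.ranked, rank.aggregated, boxplot.ranked, subset.ranked, ...) so that every workflow now dispatches to the *.list classes; a single-task challenge is simply treated as a one-element task list. A minimal sketch of the intended call chain, assuming a data frame data_matrix whose column names ("value", "alg_name", "case_id") are illustrative and not taken from the diff:

library(challengeR)

# Assumed example data: one row per algorithm/case pair with a numeric metric value.
challenge <- as.challenge(data_matrix,
                          value = "value",
                          algorithm = "alg_name",
                          case = "case_id",
                          smallBetter = FALSE)

# Aggregate-then-rank via the exported wrapper; dispatches to the retained *.list methods.
ranking <- aggregateThenRank(challenge, FUN = mean, ties.method = "min")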
#stats::aggregate Aggregate.data.frame <-function(object, x, - algorithm, + algorithm, FUN=mean, na.treat="na.rm", #can be na.rm, numeric value or function - case, - alpha=0.05, p.adjust.method="none", + case, + alpha=0.05, p.adjust.method="none", alternative="one.sided", test.fun=function(x,y) wilcox.test(x, y, alternative = alternative, - exact=FALSE, + exact=FALSE, paired = TRUE)$p.value, - largeBetter=TRUE, # only needed for significance + largeBetter=TRUE, # only needed for significance ... ){ - call=match.call(expand.dots = T) + call=match.call(expand.dots = T) if (is.numeric(na.treat)) object[,x][is.na(object[,x])]=na.treat else if (is.function(na.treat)) object[,x][is.na(object[,x])]=na.treat(object[,x][is.na(object[,x])]) else if (na.treat=="na.rm") object=object[!is.na(object[,x]),] - else stop("Argument \"na.treat\" is invalid. It can be \"na.rm\", numeric value or function.") + else stop("Argument 'na.treat' is invalid. It can be 'na.rm', numeric value or function.") if (is.character(FUN) && FUN=="significance"){ if(missing(case)| missing(largeBetter)| missing(alpha)) stop("If FUN='significance' arguments case, largeBetter and alpha need to be given") if (length(unique(object[[algorithm]]))<=1){ warning("only one ", algorithm, " available") agg=data.frame() } else { agg = significance(object, x, algorithm, case, alpha, largeBetter, p.adjust.method = p.adjust.method, alternative = alternative, ... ) } isSignificance=TRUE } else { FUNname=as.character(call$FUN) if (is.character(FUN)) FUN=try(eval(parse(text=FUN)),silent = T) - + if (!is.function(FUN)) stop("FUN has to be a function (possibly as character) or 'significance'") agg <- aggregate(object[, x], by = list(object[, algorithm]), FUN = function(z) do.call(FUN, args = list(x = z)) ) names(agg)=c(algorithm, paste0(x,"_", FUNname)) rownames(agg)=agg[,1] agg=agg[,-1,drop=F] isSignificance=FALSE } res=list(FUN = . %>% (call), FUN.list=list(FUN), call=list(call), data=object, mat=agg, isSignificance= isSignificance) - + class(res)=c("aggregated",class(res)) res - + } Aggregate.list <-function(object, x, algorithm, FUN = mean, na.treat = "na.rm", parallel = FALSE, progress = "none", case, alpha = 0.05, p.adjust.method = "none", alternative = "one.sided", test.fun = function(x, y) wilcox.test(x, y, alternative = alternative, exact = FALSE, paired = TRUE)$p.value, largeBetter = TRUE, # only needed for significance ... ) { - call=match.call(expand.dots = T) + call=match.call(expand.dots = T) if (is.character(FUN) && FUN=="significance"){ if(missing(case)| missing(largeBetter)| missing(alpha)) stop("If FUN='significance' arguments case, largeBetter and alpha need to be given") - matlist=llply(1:length(object), - function(id){ + matlist=llply(1:length(object), + function(id){ piece=object[[id]] if (length(unique(piece[[algorithm]]))<=1){ warning("only one ", algorithm, " available in element ", names(object)[id]) return(data.frame("prop_significance"=rep(NA,length(unique(piece[[algorithm]]))), - row.names = unique(piece[[algorithm]]))) - } + row.names = unique(piece[[algorithm]]))) + } if (is.numeric(na.treat)) piece[,x][is.na(piece[,x])]=na.treat else if (is.function(na.treat)) piece[,x][is.na(piece[,x])]=na.treat(piece[,x][is.na(piece[,x])]) else if (na.treat=="na.rm") piece=piece[!is.na(piece[,x]),] - xmean <- significance(piece, - x, - algorithm, - case, - alpha, + else stop("Argument 'na.treat' is invalid. 
It can be 'na.rm', numeric value or function.") + + xmean <- significance(piece, + x, + algorithm, + case, + alpha, p.adjust.method=p.adjust.method, largeBetter, alternative=alternative, ...) class(xmean)=c("aggregated", class(xmean)) xmean - }, + }, .parallel=parallel, .progress=progress ) isSignificance=TRUE - + } else { if (is.character(FUN)) FUN=try(eval(parse(text=FUN)), silent = T) FUNname=as.character(call$FUN) if (!is.function(FUN)) stop("FUN has to be a function (possibly as character) or 'significance'") - - matlist=llply(object, - function(piece){ + + matlist=llply(object, + function(piece){ if (is.numeric(na.treat)) piece[,x][is.na(piece[,x])]=na.treat else if (is.function(na.treat)) piece[,x][is.na(piece[,x])]=na.treat(piece[,x][is.na(piece[,x])]) else if (na.treat=="na.rm") piece=piece[!is.na(piece[,x]),] - - xmean <- aggregate(piece[,x], - by=list(piece[,algorithm]), + else stop("Argument 'na.treat' is invalid. It can be 'na.rm', numeric value or function.") + + xmean <- aggregate(piece[,x], + by=list(piece[,algorithm]), FUN=function(z) do.call(FUN,args=list(x=z))) names(xmean)=c(algorithm, paste0(x,"_",FUNname)) rownames(xmean)=xmean[,1] xmean=xmean[,-1,drop=F] xmean - }, + }, .parallel=parallel, .progress=progress ) isSignificance=FALSE } names(matlist)=names(object) res=list(FUN = . %>% (call), FUN.list=list(FUN), call=list(call), data=object, - matlist=matlist, + matlist=matlist, isSignificance=isSignificance ) - + class(res)=c("aggregated.list",class(res)) res } - - - - - - - diff --git a/R/AggregateList.R b/R/AggregateList.R deleted file mode 100644 index 244be3d..0000000 --- a/R/AggregateList.R +++ /dev/null @@ -1,22 +0,0 @@ - -# FUNname=capture.output(get_expr(FUN))[2] -# FUNname=sub("UseMethod(","",FUNname,fixed = T) - -# depracate? -aggregateList=function(object, - x, - FUN=mean){ - if (is.character(FUN)) { - FUNname=paste0(x,"_",FUN) - FUN=try(eval(parse(text=FUN)),silent = T) - } else FUNname=paste0(x,"_aggregated") - - melt(object)%>% - filter(variable==x)%>% - group_by(algorithm)%>% - summarise(!!FUNname := FUN(value))%>% - as.data.frame -} - - - diff --git a/R/Bootstrap.R b/R/Bootstrap.R index 21b7736..549c7c4 100644 --- a/R/Bootstrap.R +++ b/R/Bootstrap.R @@ -1,251 +1,179 @@ bootstrap <- function(object,...) UseMethod("bootstrap") bootstrap.default <- function(object, ...) 
stop("not implemented for this class") -bootstrap.ranked=function(object, - nboot, - parallel=FALSE, - progress="text",...){ - data=object$data - algorithm=attr(data,"algorithm") - by=attr(data,"case") - index=unique(data[[by]]) - # stop if only 1 data set or less than 3 algorithms - if (length(index)==1 | length(unique(data[[algorithm]]))<=2 ) stop("There need to be at least 2 unique values in ", - by, - "and at least 3 unique in ", - algorithm) - - drawsample=function(piece,...){ - bootIndex=data.frame(sample(index, - size=length(index), - replace=TRUE)) - colnames(bootIndex)=by - bootData=merge(bootIndex,data,by=by) - # bootIndex=sample(index,size=length(index),replace=TRUE) - # bootData=bind_rows(lapply(bootIndex,function(zz) data[data[[by]]==zz,])) - #3rd alternative (untested) - # bootIndex=sample(index,size=length(index),replace=TRUE) - #bootData=bind_rows(split(data,data[[by]])[bootIndex])) - attr(bootData,"inverseOrder")=attr(object$data,"inverseOrder") - attr(bootData,"algorithm")=attr(object$data,"algorithm") - attr(bootData,"case")=attr(object$data,"case") - attr(bootData,"check")=FALSE - object$FUN(bootData)$mat - } - res=llply(.data=1:nboot, - .fun =drawsample , - .parallel=parallel, - .progress=progress) - - rankmat=res[[1]][,-1,drop=F] - for (j in 2:length(res)){ - rankmat=merge(rankmat, - res[[j]][,-1,drop=F], - by="row.names", - suffixes = c(paste0(".",j-1), - paste0(".",j))) #maybe replayce by plyr::join() which is supposed to be faster - rownames(rankmat)=rankmat[,"Row.names"] - rankmat=rankmat[,-1] - } - aggmat=res[[1]][,-2,drop=F] - for (j in 2:length(res)){ - aggmat=merge(aggmat, - res[[j]][,-2,drop=F], - by="row.names", - suffixes = c(paste0(".",j-1), - paste0(".",j))) - rownames(aggmat)=aggmat[,"Row.names"] - aggmat=aggmat[,-1] - } - - res=list(bootsrappedRanks=rankmat, - bootsrappedAggregate=aggmat, - data=data, - mat=object$mat, - FUN=object$FUN, - FUN.list=object$FUN.list) - class(res)="bootstrap" - res -} - - - bootstrap.ranked.list=function(object, nboot, parallel=FALSE, progress="text", ...){ algorithm=attr(object$data,"algorithm") by=attr(object$data,"case") - + # exclude if only 1 data set or less than 3 algorithms - tidy.data.id=sapply(object$data, + tidy.data.id=sapply(object$data, function(data.subset) { ifelse((length(unique(data.subset[[by]]))==1 | length(unique(data.subset[[algorithm]]))<=2 ), yes=FALSE, no=TRUE) }) tidy.data=object$data[tidy.data.id] tidy.matlist=object$matlist[tidy.data.id] - - res= llply(1:nboot, + + res= llply(1:nboot, function(it){ # draw 1 sample for each task bootDatalist = lapply(tidy.data, function(data.subset) { index = unique(data.subset[[by]]) - + # bootIndex=sample(index,size=length(index),replace=TRUE) # bootData=bind_rows(lapply(bootIndex,function(zz) data.subset[data.subset[[by]]==zz,])) # faster: - bootIndex = data.frame(sample(index, - size = length(index), + bootIndex = data.frame(sample(index, + size = length(index), replace = TRUE)) colnames(bootIndex) = by - bootData = merge(bootIndex, - data.subset, + bootData = merge(bootIndex, + data.subset, by = by) bootData }) attr(bootDatalist, "inverseOrder") = attr(object$data, "inverseOrder") attr(bootDatalist, "algorithm") = attr(object$data, "algorithm") attr(bootDatalist, "case") = attr(object$data, "case") attr(bootDatalist, "check") = FALSE object$FUN(bootDatalist)$mat - }, - .parallel = parallel, + }, + .parallel = parallel, .progress = progress) - - + + # rankmatlist=lapply(res[[1]],function(z) z[,"rank",drop=F]) # for (j in 2:length(res)){ # 
rankmatlist=merge.list(rankmatlist,lapply(res[[j]],function(z) z[,"rank",drop=F]),by="row.names", suffixes = c(paste0(".",j-1),paste0(".",j))) # rankmatlist=lapply(rankmatlist, function(z) { # rownames(z)=z[,"Row.names"] # z=z[,-1] # }) # } - # + # # aggmatlist=lapply(res[[1]],function(z) z[,-2,drop=F]) # for (j in 2:length(res)){ # aggmatlist=merge.list(aggmatlist,lapply(res[[j]],function(z) z[,-2,drop=F]),by="row.names", suffixes = c(paste0(".",j-1),paste0(".",j))) # aggmatlist=lapply(aggmatlist, function(z) { # rownames(z)=z[,"Row.names"] # z=z[,-1] # }) # } rankmatlist = lapply(res[[1]], function(z) z[, "rank", drop = F] ) for (j in 2:length(res)) { - rankmatlist = quickmerge.list(rankmatlist, - lapply(res[[j]], + rankmatlist = quickmerge.list(rankmatlist, + lapply(res[[j]], function(z) z[, "rank", drop = F])) } - - aggmatlist = lapply(res[[1]], + + aggmatlist = lapply(res[[1]], function(z) z[, -2, drop = F]) for (j in 2:length(res)) { - aggmatlist = quickmerge.list(aggmatlist, - lapply(res[[j]], + aggmatlist = quickmerge.list(aggmatlist, + lapply(res[[j]], function(z) z[, -2, drop = F])) } - + final=list(bootsrappedRanks=rankmatlist, - bootsrappedAggregate=aggmatlist, + bootsrappedAggregate=aggmatlist, data=object$data, matlist=tidy.matlist, FUN=object$FUN, FUN.list=object$FUN.list) class(final)=c("bootstrap.list") final } #################################################################################################### # deprecate following functions? rankFrequencies <- function(object,...) UseMethod("rankFrequencies") rankFrequencies.default <- function(object, ...) stop("not implemented for this class") rankFrequencies.bootstrap=function(object, who,...){ if (is.data.frame(who)) who=rownames(who) if (length(who)==1){ res=table(t(object$bootsrappedRanks[rownames(object$bootsrappedRanks)==who,])) cat("\n",who,"\n") print(res) } else { res=lapply(who, function(w){ rr=table(t(object$bootsrappedRanks[rownames(object$bootsrappedRanks)==w,])) cat(w,"\n") print(rr) cat("\n") rr }) } res=c(list(rankFrequencies=res),object) invisible(res) } rankFrequencies.bootstrap.list=function(object, who,...){ if (is.data.frame(who)) who=rownames(who) res=lapply(object$bootsrappedRanks,function(bootMat){ if (length(who)==1){ res=table(t(bootMat[rownames(bootMat)==who,])) cat("\n",who,"\n") print(res) } else { res=lapply(who, function(w){ rr=table(t(bootMat[rownames(bootMat)==w,])) cat(w,"\n") print(rr) cat("\n") rr }) } res }) res=c(list(rankFrequencies=res),object) invisible(res) } winnerFrequencies <- function(object,...) UseMethod("winnerFrequencies") winnerFrequencies.default <- function(object, ...) stop("not implemented for this class") # Achtung: bester rank muss ==1 sein und nicht z.B. 1.5 winnerFrequencies.bootstrap=function(object,...){ rankings_dicho=ifelse(object$bootsrappedRanks==1,1,0) winnerFrequencies=data.frame(winnerFrequency=rowSums(rankings_dicho),row.names = rownames(object$bootsrappedRanks)) res=merge(object$mat,winnerFrequencies,by="row.names",...) rownames(res)=res[,1] res=res[,-1] # res=c(res=res,object) # class(res)="bootstrapResults" res } winnerFrequencies.bootstrap.list=function(object,...){ res=lapply(1:length(object$bootsrappedRanks),function(id){ rankings_dicho=ifelse(object$bootsrappedRanks[[id]]==1,1,0) winnerFrequencies=data.frame(winnerFrequency=rowSums(rankings_dicho),row.names = rownames(object$bootsrappedRanks[[id]])) res=merge(object$matlist[[id]],winnerFrequencies,by="row.names",...) 
rownames(res)=res[,1] res=res[,-1] res }) names(res)=names(object$bootsrappedRanks) res } - - - diff --git a/R/Rank.R b/R/Rank.R index 3bb24fb..d7119eb 100644 --- a/R/Rank.R +++ b/R/Rank.R @@ -1,134 +1,74 @@ Rank <- function(object,...) UseMethod("Rank") Rank.default <- function(object, ...) rank(object,...) #base::rank -Rank.data.frame <-function(object, - x, - annotator, - ties.method="min", - largeBetter=FALSE, - ...){ - call=match.call(expand.dots = T) - if (attr(object,"check") && - largeBetter && - any(is.na(object[[x]])) && - min(object[[x]],na.rm=TRUE)==0){ - message("There are missing metric values and metric values exactly equal to zero. - Have some actually missing values been entered as zero in some instances? - If yes, specify optional argument na.treat=0 in as.challenge().") - } - if (missing(annotator)){ - res=bind_rows(lapply(split(object, - object[[attr(object,"case")]]), - function(object.case) - cbind(object.case, - rank=rankNA2(object.case[[x]], - ties.method = ties.method, - largeBetter = largeBetter) - ) - ) - ) - } else { - if (length(annotator)==1) annotator=object[,annotator] - else annotator=as.list(object[,annotator]) - byAnnotator=split(object,annotator) - res=bind_rows(lapply(byAnnotator, - function(annotator.i){ - bind_rows(lapply(split(annotator.i, - annotator.i[[attr(object,"case")]]), - function(annotator.case) - cbind(annotator.case, - rank=rankNA2(annotator.case[[x]], - ties.method = ties.method, - largeBetter = largeBetter) - ) - ) - ) - } - )) - - } - - res=list(FUN = . %>% (call), - call=list(call), - data=object, - mat=res) - class(res)=c("ranked",class(res)) - - res -} - - - Rank.list <- function(object, x, annotator, ties.method="min", largeBetter=FALSE, ...){ - - call=match.call(expand.dots = T) + + call=match.call(expand.dots = T) annotator.missing=missing(annotator) - if (any(sapply(object, + if (any(sapply(object, function(task) { - (attr(object,"check") && - largeBetter && - any(is.na(task[[x]])) && + (attr(object,"check") && + largeBetter && + any(is.na(task[[x]])) && min(task[[x]], na.rm=TRUE)==0) }) )) { - message("There are missing metric values and metric values exactly equal to zero. + message("There are missing metric values and metric values exactly equal to zero. Have some actually missing values been entered as zero in some instances? If yes, specify optional argument na.treat=0 in as.challenge().") } - - matlist=lapply(object, + + matlist=lapply(object, function(task){ if (annotator.missing){ res=bind_rows( lapply(split(task, task[[attr(object,"case")]]), - function(task.case) + function(task.case) cbind(task.case, rank=rankNA2(task.case[[x]], ties.method = ties.method, largeBetter = largeBetter) ) ) ) class(res)[2]="ranked" res } else { byAnnotator=split(task, - as.list(task[,annotator])) + as.list(task[,annotator])) temp=bind_rows( lapply(byAnnotator, function(annotator){ bind_rows( lapply(split(annotator, annotator[[attr(object,"case")]]), - function(annotator.case) + function(annotator.case) cbind(annotator.case, rank=rankNA2(annotator.case[[x]], ties.method = ties.method, largeBetter = largeBetter) ) ) ) } ) ) class(temp)[2]="ranked" temp } - } + } ) res=list(FUN = . 
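The retained bootstrap.ranked.list() entry point above resamples cases within each task and keeps only tasks with at least two cases and three algorithms; rankFrequencies() and winnerFrequencies() then summarise the bootstrapped ranks. A hedged continuation of the earlier sketch, where ranking is the assumed ranked.list object and nboot = 1000 and the algorithm name "A1" are illustrative choices:

set.seed(1)                               # assumed: fix the case resampling for reproducibility
boot <- bootstrap(ranking, nboot = 1000)  # dispatches to bootstrap.ranked.list()
winnerFrequencies(boot)                   # per task: how often each algorithm is ranked 1 across bootstraps
rankFrequencies(boot, who = "A1")         # rank distribution of the assumed algorithm "A1"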
%>% (call), call=list(call), data=object, matlist=matlist) - + class(res)=c("ranked.list",class(res)) res - } - - +} diff --git a/R/Rank.aggregated.R b/R/Rank.aggregated.R deleted file mode 100644 index 5a1d473..0000000 --- a/R/Rank.aggregated.R +++ /dev/null @@ -1,61 +0,0 @@ - -rank.aggregated <-function(object, - ties.method="min", - largeBetter, - ...){ - call=match.call(expand.dots = F) - if (missing(largeBetter)){ - if (!is.null(attr(object$data,"largeBetter"))) largeBetter=attr(object$data,"largeBetter") - else stop("largeBetter has to be provided either in as.challenge() or rank()") - - if (object$isSignificance) largeBetter=TRUE # smallBetter (largeBetter) already taken care of by one-sided test nature of signficance - } - call=call("rank.aggregated", - object=call$object, - ties.method=ties.method, - largeBetter=largeBetter) - mat=object$mat - - if (nrow(mat)>0) r=rankNA2(mat[,ncol(mat)], - ties.method=ties.method, - largeBetter=largeBetter) - else r=NULL - - res=list(mat=cbind(mat,rank=r), - data=object$data, - call=list(object$call,call), - FUN = . %>% (object$FUN) %>% (call), - FUN.list=c(object$FUN.list,"rank") - ) - class(res)=c("ranked",class(res)) - res -} - - - -rank.aggregatedRanks <-function(object, - ties.method="min", - ...){ - call=match.call(expand.dots = F) - call=call("rank.aggregatedRanks", - object=call$object, - ties.method=ties.method) - mat=object$mat - - if (nrow(mat)>0) r=rankNA2(mat[,ncol(mat)], - ties.method=ties.method, - largeBetter=FALSE) - else r=NULL - - res=list(mat=cbind(mat,rank=r), - data=object$data, - call=list(object$call,call), - FUN = . %>% (object$FUN) %>% (call), - FUN.list=c(object$FUN.list,"rank") - ) - class(res)=c("ranked",class(res)) - res -} - - - diff --git a/R/aaggregate.R b/R/aaggregate.R index 72da78b..223af22 100644 --- a/R/aaggregate.R +++ b/R/aaggregate.R @@ -1,202 +1,166 @@ test <- function(x,...) UseMethod("test") test.default <- function(x, ...) stop("not implemented for this class") test.challenge=function(x,...) aggregate.challenge(x=x, FUN="significance",...) #' Title #' -#' @param x -#' @param FUN -#' @param na.treat -#' @param alpha -#' @param p.adjust.method -#' @param parallel -#' @param progress -#' @param ... +#' @param x +#' @param FUN +#' @param na.treat +#' @param alpha +#' @param p.adjust.method +#' @param parallel +#' @param progress +#' @param ... 
#' #' @return #' @export #' #' @examples aggregate.challenge=function(x, FUN=mean, na.treat, #either "na.rm", numeric value or function - alpha=0.05, p.adjust.method="none",# only needed for significance + alpha=0.05, p.adjust.method="none",# only needed for significance parallel=FALSE, progress="none",...){ call=as.list(match.call()) - + if (missing(na.treat)){ #na.treat only optional if no missing values in data set if (!inherits(x,"list")){ if (!any(is.na(x[,attr(x, "value")]))) na.treat="na.rm" # there are no missings so set na.treat by dummy "na.rm" has no effect } else { - if (!any(sapply(x, + if (!any(sapply(x, function(task) any(is.na(task[,attr(x, "value")]))))) na.treat="na.rm" # there are no missings so set na.treat by dummy "na.rm" has no effect } } - + res1=do.call("Aggregate",list(object=x, x=attr(x,"value"), algorithm=attr(x,"algorithm"), FUN=call$FUN, na.treat=na.treat, parallel=parallel, progress=progress, case=attr(x,"case"), alpha=alpha, p.adjust.method=p.adjust.method, largeBetter=attr(x,"largeBetter") # only needed for significance )) - + call2=call("Aggregate", - object=call$x, + object=call$x, x=attr(x,"value"), algorithm=attr(x,"algorithm"), FUN=call$FUN, na.treat=na.treat, parallel=parallel,progress=progress, - case=attr(x,"case"), - alpha=alpha, p.adjust.method=p.adjust.method, - largeBetter=attr(x,"largeBetter") # only needed for significance + case=attr(x,"case"), + alpha=alpha, p.adjust.method=p.adjust.method, + largeBetter=attr(x,"largeBetter") # only needed for significance ) - if (inherits(x,"list")){ + if (inherits(x,"list")){ res=list(FUN = . %>% (call2), call=list(call2), FUN.list=list(FUN), data=x, matlist=res1$matlist, isSignificance=res1$isSignificance) - + class(res)=c("aggregated.list",class(res)) } else { res=list(FUN = . %>% (call2), call=list(call2), FUN.list=list(FUN), data=x, mat=res1$mat, isSignificance=res1$isSignificance) - - class(res)=c("aggregated",class(res)) - - } - res - -} + class(res)=c("aggregated",class(res)) -aggregate.ranked <-function(x, - FUN=mean, ... ){ - call=match.call(expand.dots = F) - call=call("aggregate.ranked", - x=call$x, - FUN=FUN) - algorithm=attr(x$data,"algorithm") - mat=x$mat - what="rank" - xmean <- aggregate(mat[,what], - by=list(mat[,algorithm]), - FUN=function(z) do.call(FUN,args=list(x=z))) - names(xmean)=c(algorithm,paste0(what, - "_", - strsplit(capture.output(suppressWarnings(print(methods(FUN), - byclass=T)))[1], - " ")[[1]][2] - ) - ) - rownames(xmean)=xmean[,1] - xmean=xmean[,-1,drop=F] - res=list(FUN = . %>% (x$FUN) %>% (call), - FUN.list=c(x$FUN.list,FUN), - call=c(x$call,call), - data=x$data, - mat=xmean) - class(res)=c("aggregatedRanks",class(res)) + } res -} +} aggregate.ranked.list <-function(x, - FUN=mean, + FUN=mean, ...){ - call=match.call(expand.dots = F) + call=match.call(expand.dots = F) call=call("aggregate.ranked.list", x=call$x, FUN=FUN) - + algorithm=attr(x$data,"algorithm") resmatlist=Aggregate.list(x$matlist, x="rank", algorithm=algorithm, FUN=FUN,...)$matlist resmatlist=lapply(resmatlist, function(z) as.data.frame(z)) res=list(matlist=resmatlist, call=c(x$call,call), data=x$data, FUN = . 
%>% (x$FUN) %>% (call), FUN.list=c(x$FUN.list,FUN) ) class(res)=c("aggregatedRanks.list",class(res)) res - -} - - +} aggregate.bootstrap.list <-function(x, what="metric", FUN=mean, ...){ - call=match.call(expand.dots = T) + call=match.call(expand.dots = T) if (is.character(FUN)) FUN=try(eval(parse(text=FUN)), silent = T) FUNname=as.character(call$FUN) - + if (!is.function(FUN)) stop("FUN has to be a function (possibly as character)") - matlist=llply(1:length(x$bootsrappedRank), - function(i.piece){ + matlist=llply(1:length(x$bootsrappedRank), + function(i.piece){ if (what=="ranks") xmean <- as.data.frame(apply(x$bootsrappedRank[[i.piece]],1,FUN=FUN)) - else xmean <- as.data.frame(apply(x$bootsrappedAggregate[[i.piece]],1,FUN=FUN)) + else xmean <- as.data.frame(apply(x$bootsrappedAggregate[[i.piece]],1,FUN=FUN)) names(xmean)=paste0(what,"_",FUNname) xmean }) - - + + names(matlist)=names(x$bootsrappedRank) res=list(FUN = . %>% (call), call=list(call), data=x, matlist=matlist) - + class(res)=c("aggregated.list",class(res)) res } aggregate.bootstrap<-function(x,what="metric",FUN=mean, ... ){ - call=match.call(expand.dots = T) + call=match.call(expand.dots = T) if (is.character(FUN)) FUN=try(eval(parse(text=FUN)),silent = T) FUNname=as.character(call$FUN) - + if (!is.function(FUN)) stop("FUN has to be a function (possibly as character)") - + if (what=="ranks") xmean <- as.data.frame(apply(x$bootsrappedRank, 1, FUN=FUN)) else xmean <- as.data.frame(apply(x$bootsrappedAggregate, 1, - FUN=FUN)) + FUN=FUN)) names(xmean)=paste0(what,"_",FUNname) res=list(FUN = . %>% (call), call=list(call), data=x, mat=xmean) - + class(res)=c("aggregated",class(res)) res } - diff --git a/R/blobplot.ranked.R b/R/blobplot.ranked.R deleted file mode 100644 index 1cc5275..0000000 --- a/R/blobplot.ranked.R +++ /dev/null @@ -1,43 +0,0 @@ -blobplot <- function(x,...) UseMethod("blobplot") -blobplot.default <- function(x, ...) stop("not implemented for this class") - -blobplot.ranked=function(x, - ties.method="min", - probs=c(.025,.975), - max_size=6, - shape=4, - ...){ - ordering=rownames(x$mat)[order(x$mat$rank)] - xx=x$data - ranking=xx%>%rank( ties.method = ties.method ) - - dat=ranking$mat - algorithm=attr(xx,"algorithm") - dat[[algorithm]]=factor(dat[[algorithm]], levels=ordering) - #dat[["case"]]=as.character(dat[["case"]]) - p = ggplot(dat)+ - # geom_line(aes(x = algorithm, y = rank,color=case, group=case ),size=.2,linetype=1,show.legend = F)+ - geom_count(aes(!!as.symbol(algorithm), - rank, - color=!!as.symbol(algorithm), - size = stat(prop*100))) - - p+scale_size_area(max_size = max_size)+ - stat_summary(aes(!!as.symbol(algorithm),rank), - geom="point",shape=shape, - fun.data=function(x) data.frame(y=median(x)),...) 
+ - stat_summary(aes(!!as.symbol(algorithm),rank ), - geom="linerange", - fun.data=function(x) data.frame(ymin=quantile(x,probs[1]), - ymax=quantile(x,probs[2])))+ - geom_abline(slope=1,color="gray",linetype="dotted")+ - theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+ - guides(size = guide_legend(title="%"),color="none")+ - scale_y_continuous(minor_breaks=NULL, - limits=c(1,max(5,max(dat$rank))), - breaks=c(1,seq(5,max(5,max(dat$rank)),by=5)))+ - xlab("Algorithm")+ylab("Rank") -} - - - diff --git a/R/boxplot.R b/R/boxplot.R index 1d059ba..7139c6e 100644 --- a/R/boxplot.R +++ b/R/boxplot.R @@ -1,90 +1,76 @@ - -boxplot.ranked=function(x, - color="blue", - jitter.width=0.25,...){ - algo=attr(x$data,"algorithm") - value=attr(x$data,"value") - ranking=x - x=x$data - - x[[algo]]=factor(x[[algo]], - levels=rownames(ranking$mat[order(ranking$mat$rank),])) - ggplot(aes_string(algo,value),data=x)+ - geom_jitter(position=position_jitter(width=jitter.width, height=0),color=color,...)+ - geom_boxplot(outlier.shape = NA,fill=NA)+ - theme(axis.text.x=element_text(angle = -90, hjust = 0)) - -} - - boxplot.ranked.list=function(x, color="blue", jitter.width=0.25,...){ algo=attr(x$data,"algorithm") value=attr(x$data,"value") ranking=x x=x$data - + for (i in names(x)) { - x[[i]][[algo]]=factor(x[[i]][[algo]], + x[[i]][[algo]]=factor(x[[i]][[algo]], levels=rownames(ranking$matlist[[i]][order(ranking$matlist[[i]]$rank),])) } - + a=lapply(1:length(x),function(id){ ggplot(aes_string(algo,value),data=x[[id]])+ geom_jitter(position=position_jitter(width=jitter.width, height=0), color=color,...)+ geom_boxplot(outlier.shape = NA,fill=NA)+ - ggtitle(names(x)[id]) + + ggtitle(names(x)[id]) + theme(axis.text.x=element_text(angle = -90, hjust = 0)) - + }) + + # Remove title for single-task data set + if (length(a) == 1) { + a[[1]]$labels$title <- NULL + } a } - + boxplot.comparedRanks.list=function(x,...){ tau=sapply(x,function(z) z$tau) - boxplot(tau,ylim=c(0,1.0),las=2, outline=FALSE, + boxplot(tau,ylim=c(0,1.0),las=2, outline=FALSE, ylab="Kendall's tau",...) - stripchart(tau, - vertical = TRUE, method = "jitter", - pch = 21, col = "blue", add=TRUE,...) - + stripchart(tau, + vertical = TRUE, method = "jitter", + pch = 21, col = "blue", add=TRUE,...) + } boxplot.bootstrap.list=function(x,...){ - winner.noboot=winner.ranked.list(x) + winner.noboot=winner.ranked.list(x) x2=winnerFrequencies(x) n.bootstraps= ncol(x$bootsrappedRanks[[1]]) perc_boot_Winner=lapply(1:length(x2),function(i){ x2.i=x2[[i]] winner.id=which(rownames(x2.i)%in%rownames(winner.noboot[[i]])) #could be multiple winners!!!! 100*x2.i[winner.id,3,drop=F]/n.bootstraps }) - + boxplot(unlist(perc_boot_Winner),ylim=c(0,100),las=2, outline=FALSE, ylab="% Bootstraps",xlab="Winner ranks 1", sub=paste(n.bootstraps,"Bootstraps"),...) - stripchart(unlist(perc_boot_Winner), - vertical = TRUE, method = "jitter", - pch = 21, col = "blue", add=TRUE,...) + stripchart(unlist(perc_boot_Winner), + vertical = TRUE, method = "jitter", + pch = 21, col = "blue", add=TRUE,...) 
} # winnerFrequencies(bb) # winner.datax=winner(datax, largeBetter=TRUE) #no bootstrap # x2=numberRank1(datax,originalranking.datax,datax_boot) # boot_W_1=x2[which(x2$algorithm_id==winner.datax),3] # boot_NW_1=sum(x2[-which(x2$algorithm_id==winner.datax),3]>9) # perc_boot_Winner=boot_W_1/Data$N_Bootstraps # perc_boot_NotWinner=Data$Bootstrap_Rank1_NotWinner/(Data$N_Algo-1) diff --git a/R/challengeR.R b/R/challengeR.R index 26f5643..86cd1d1 100644 --- a/R/challengeR.R +++ b/R/challengeR.R @@ -1,120 +1,102 @@ #' Title #' -#' @param object -#' @param value -#' @param algorithm -#' @param case -#' @param by -#' @param annotator -#' @param smallBetter -#' @param na.treat -#' @param check +#' @param object +#' @param value +#' @param algorithm +#' @param case +#' @param taskName Optional for single-task data set that does not contain a task column. +#' @param by The name of the column that contains the task identifiers. Required for multi-task data set. +#' @param annotator +#' @param smallBetter +#' @param na.treat +#' @param check #' #' @return #' @export #' #' @examples -as.challenge=function(object, - value, +as.challenge=function(object, + value, algorithm , case=NULL, - by=NULL, - annotator=NULL, + taskName=NULL, + by=NULL, + annotator=NULL, smallBetter=FALSE, na.treat=NULL, # optional - check=TRUE){ + check=TRUE) { + + object=object[,c(value, algorithm, case, by, annotator)] - object=object[,c(value,algorithm,case,by,annotator)] - - # if (missing(na.treat)){ - # if (!smallBetter){ - # message("Setting na.treat=0, i.e. setting any missing metric value to zero.") - # na.treat=0 - # } - # sanity checks - if (check){ - if (is.null(by)){ - missingData=object %>% - expand(!!as.symbol(algorithm), - !!as.symbol(case)) %>% - anti_join(object, - by=c(algorithm,case)) - if (nrow(missingData)>0) { - message("Performance of not all algorithms is observed for all cases. Inserted as missings in following cases:") - print(as.data.frame(missingData)) - object=as.data.frame(object %>% - complete(!!as.symbol(algorithm), - !!as.symbol(case))) - } else { - object=droplevels(object) - all1=apply(table(object[[algorithm]], - object[[case]]), + if (check) { + + if (!is.null(by) && !is.null(taskName)) { + warning("Argument 'taskName' is ignored for multi-task data set.") + } + + # Add task column for data set without task column by using the specified task name. + if (is.null(by) && !is.null(taskName)) { + taskName <- trimws(taskName) + + if (taskName == "") { + stop("Argument 'taskName' is empty.") + } + + object <- cbind(task=taskName, object) + by = "task" + } + + # Add task column for data set without task column by using a dummy task name. + if (is.null(by) && is.null(taskName)) { + object <- cbind(task="dummyTask", object) + by = "task" + } + + object=splitby(object,by=by) + object=lapply(object,droplevels) + for (task in names(object)) { + missingData=object[[task]] %>% + expand(!!as.symbol(algorithm), + !!as.symbol(case))%>% + anti_join(object[[task]], + by=c( algorithm,case)) + if (nrow(missingData)>0) { + message("Performance of not all algorithms is observed for all cases in task '", + task, + "'. 
Inserted as missings in following cases:") + print(as.data.frame(missingData)) + object[[task]]=as.data.frame(object[[task]] %>% + complete(!!as.symbol(algorithm), + !!as.symbol(case))) + } + else { + all1=apply(table(object[[task]][[algorithm]], + object[[task]][[case]]), 2, function(x) all(x==1)) - if (!all(all1)) stop ("Case(s) (", + if (!all(all1)) stop ("Case(s) (", paste(names(which(all1!=1)), - collapse=", "), - ") appear(s) more than once for the same algorithm") - + collapse=", "), + ") appear(s) more than once for the same algorithm in task '", + task, "'.") } - - if (!is.null(na.treat)){ - if (is.numeric(na.treat)) object[,value][is.na(object[,value])]=na.treat - else if (is.function(na.treat)) object[,value][is.na(object[,value])]=na.treat(object[,value][is.na(object[,value])]) - else if (na.treat=="na.rm") object=object[!is.na(object[,value]),] + + if (!is.null(na.treat)) { + if (is.numeric(na.treat)) object[[task]][,value][is.na(object[[task]][,value])]=na.treat + else if (is.function(na.treat)) object[[task]][,value][is.na(object[[task]][,value])]=na.treat(object[[task]][,value][is.na(object[[task]][,value])]) + else if (na.treat=="na.rm") object[[task]]=object[[task]][!is.na(object[[task]][,value]),] } - - } else { - object=splitby(object,by=by) - object=lapply(object,droplevels) - for (task in names(object)){ - missingData=object[[task]] %>% - expand(!!as.symbol(algorithm), - !!as.symbol(case))%>% - anti_join(object[[task]], - by=c( algorithm,case)) - if (nrow(missingData)>0) { - message("Performance of not all algorithms is observed for all cases in task ", - task, - ". Inserted as missings in following cases:") - print(as.data.frame(missingData)) - object[[task]]=as.data.frame(object[[task]] %>% - complete(!!as.symbol(algorithm), - !!as.symbol(case))) - } else { - all1=apply(table(object[[task]][[algorithm]], - object[[task]][[case]]), - 2, - function(x) all(x==1)) - if (!all(all1)) stop ("Case(s) (", - paste(names(which(all1!=1)), - collapse=", "), - ") appear(s) more than once for the same algorithm in task ", - task) - } - - if (!is.null(na.treat)){ - if (is.numeric(na.treat)) object[[task]][,value][is.na(object[[task]][,value])]=na.treat - else if (is.function(na.treat)) object[[task]][,value][is.na(object[[task]][,value])]=na.treat(object[[task]][,value][is.na(object[[task]][,value])]) - else if (na.treat=="na.rm") object[[task]]=object[[task]][!is.na(object[[task]][,value]),] - } - - } - } - } - + attr(object,"algorithm")=algorithm attr(object,"value")=value attr(object,"case")=case attr(object,"annotator")=annotator - attr(object,"by")=by + attr(object,"by")=by attr(object,"largeBetter")=!smallBetter attr(object,"check")=check - class(object)=c("challenge",class(object)) + class(object)=c("challenge", class(object)) object } - - diff --git a/R/graph.R b/R/graph.R index dab4637..aa753b1 100644 --- a/R/graph.R +++ b/R/graph.R @@ -1,165 +1,167 @@ network <- function(x,...) UseMethod("network") network.default <- function(x, ...) stop("not implemented for this class") network.ranked.list=function(x, - method = "symdiff", + method = "symdiff", edge.col, edge.lwd, rate=1.05, cols, - ... - -){ + ...) 
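The reworked as.challenge() above always normalises its input into a per-task list: for a single-task data set without a task column the new taskName argument (or a dummy task name) supplies the task identifier, while multi-task data keep using by to name the task column. A hedged usage sketch; data.single, data.multi and their column names are assumed example objects, not part of the diff:

# Single-task data set without a task column: taskName is optional (a dummy name is used otherwise).
challenge.single <- as.challenge(data.single,                 # assumed example data.frame
                                 value = "MetricValue", algorithm = "Alg", case = "Case",
                                 taskName = "T1", smallBetter = FALSE)

# Multi-task data set: 'by' names the column containing the task identifiers.
challenge.multi <- as.challenge(data.multi,                   # assumed example data.frame
                                value = "MetricValue", algorithm = "Alg", case = "Case",
                                by = "Task", smallBetter = FALSE)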
{ + if (length(x$data) < 3) { + stop("The cluster analysis is only sensible for more than two tasks.") + } + # use ranking list relensemble=as.relation.ranked.list(x) - + # # use relations # a=challenge_multi%>%decision.challenge(p.adjust.method="none") # aa=lapply(a,as.relation.challenge.incidence) # names(aa)=names(challenge_multi) # relensemble= do.call(relation_ensemble,args = aa) d <- relation_dissimilarity(relensemble, method = method) #coloring # # use relations # rm <-my.bsranking(relensemble) #for coloring # uw <- apply(rm, 2, # function(x) { # w <- which(x == 1) # ifelse(length(w) == 1, # names(w), "none") # }) # use ranking list uw=lapply(x$matlist,function(task.i) rownames(task.i)[which(task.i$rank==1)]) uw=sapply(uw, function(task.i) ifelse(length(task.i)==1,yes = task.i,no="none")) - - network.dist(d, + + network.dist(d, edge.col = edge.col,# grDevices::grey.colors(nn), #terrain_hcl(nn, c=c(65,0), l=c(45,90), power=c(1/2,1.5)), edge.lwd =edge.lwd,#4*rev(1.2^seq_len(length(unique(d)))/(1.2^length((unique(d))))),# seq(1, .001, length.out=nn), rate=rate, node.fill = cols[uw],... ) } network.dist= - function (x, rate=1.05, #ndists.show = length(sort(unique(x))), - edge.col = gray(0.7), - edge.lwd = 1, - node.fill = NULL, + function (x, rate=1.05, #ndists.show = length(sort(unique(x))), + edge.col = gray(0.7), + edge.lwd = 1, + node.fill = NULL, ...) { nn=length(unique(c(x))) # ==max(rm) number of different distance levels if (is.function(edge.col)) edge.col=edge.col(nn) data <- as.matrix(x) nodes <- colnames(data) nnodes <- length(nodes) dists <- sort(unique(x)) ndists <- length(dists) - dshow <- dists#[seq_len(ndists.show)] + dshow <- dists#[seq_len(ndists.show)] ndshow <- length(dshow) edge.col <- rep(edge.col, ndshow) edge.lwd <- rep(edge.lwd, ndshow) edge.len <- ceiling((rate)^dists)# exponential distance # edge.len <- ceiling((1.2)^(seq_len(ndists) - 1)) #verwende ordnung # edge.len <- ceiling((1.05)^(dists-min(dists)+1))# verwende distance mit min==1 edge.weight <- rev(dists) #rev(seq_len(ndists)) - edge.lty <- c(rep("solid", ndshow), + edge.lty <- c(rep("solid", ndshow), rep("blank", length(dists) - ndshow)) - graph <- new("graphNEL", - nodes = nodes, + graph <- new("graphNEL", + nodes = nodes, edgemode = "undirected") edgeAttrs <- list() nodeAttrs <- list() for (i in 1:(nnodes - 1)) { for (j in (i + 1):nnodes) { s <- data[i, j] # if (s %in% dshow) { t <- which(s == dists) graph <- graph::addEdge(nodes[i], nodes[j], graph, 1) #edge.weight[t]) n <- paste(nodes[i], nodes[j], sep = "~") edgeAttrs$len[n] <- edge.len[t] # laenge exponentiell # edgeAttrs$len[n] <- s # laenge prop zu distance edgeAttrs$color[n] <- "black"#edge.col[t] edgeAttrs$lwd[n] <- edge.lwd[t] edgeAttrs$lty[n] <- 1#edge.lty[t] # } } } - if (!is.null(node.fill)) + if (!is.null(node.fill)) nodeAttrs$fillcolor[nodes] <- node.fill - + out= list(graph=graph, - nodeAttrs = nodeAttrs, + nodeAttrs = nodeAttrs, edgeAttrs = edgeAttrs, tasknames=nodes, leg.col=node.fill[unique(names(node.fill))] ) class(out)="network" out } plot.network=function(x, layoutType = "neato", fixedsize=TRUE, fontsize, width, height, shape="ellipse", cex=.8, ... 
){ graph=x$graph nodeAttrs=x$nodeAttrs edgeAttrs=x$edgeAttrs leg.col=x$leg.col - + layoutType = layoutType attrs <- Rgraphviz::getDefaultAttrs(layoutType = layoutType) attrs$node$fixedsize <- fixedsize attrs$node$shape=shape if (missing(fontsize)) { attrs$node$fontsize <- max(sapply(x$tasknames,nchar))-1 } else attrs$node$fontsize=fontsize if (missing(width)){ - attrs$node$width <- max(sapply(x$tasknames,nchar)) + attrs$node$width <- max(sapply(x$tasknames,nchar)) } else attrs$node$width=width if (missing(height)) { attrs$node$height <- max(sapply(x$tasknames,nchar))/2 } else attrs$node$height=height - - ag <- Rgraphviz::agopen(graph, - "", - layoutType = layoutType, - attrs = attrs, - nodeAttrs = nodeAttrs, + + ag <- Rgraphviz::agopen(graph, + "", + layoutType = layoutType, + attrs = attrs, + nodeAttrs = nodeAttrs, edgeAttrs = edgeAttrs) - + plot.new() - l=legend("topright", - names(leg.col), - lwd = 1, - cex=cex, + l=legend("topright", + names(leg.col), + lwd = 1, + cex=cex, bg =NA, plot=F)# bg="white") w <- grconvertX(l$rect$w, to='inches') - + Rgraphviz::plot(ag,mai=c(0,0,0,w),...) - legend(par('usr')[2], par('usr')[4], - xpd=NA, - names(leg.col), - lwd = 1, - col = leg.col, + legend(par('usr')[2], par('usr')[4], + xpd=NA, + names(leg.col), + lwd = 1, + col = leg.col, bg =NA, cex=cex)# bg="white") - + } #library(R.utils) #reassignInPackage("beplot0.matrix","benchmark",my.beplot0.matrix) #reassignInPackage("beplot0.AlgorithmPerformance","benchmark",my.beplot0.AlgorithmPerformance) diff --git a/R/melt.R b/R/melt.R index 43b0d5b..3f0e0aa 100644 --- a/R/melt.R +++ b/R/melt.R @@ -1,13 +1,7 @@ melt.ranked.list=melt.aggregated.list=function(object,...){ matlist=lapply(object$matlist, function(z){ z$algorithm=rownames(z) z }) melt(matlist,id.vars="algorithm",...) } - -melt.ranked= - function(object,...){ - object$mat$algorithm=rownames(object$mat) - object$mat - } diff --git a/R/methodsplot.R b/R/methodsplot.R index b97abdd..1414b80 100644 --- a/R/methodsplot.R +++ b/R/methodsplot.R @@ -1,246 +1,108 @@ methodsplot <- function(x,...) UseMethod("methodsplot") methodsplot.default <- function(x, ...) 
stop("not implemented for this class") -methodsplot.challenge=function(x, +methodsplot.challenge=function(x, na.treat=NULL, methods=list(testBased=.%>%test() %>% rank(ties.method = "min"), meanThenRank= .%>% aggregate( FUN="mean") %>% rank(ties.method = "min"), medianThenRank=.%>% aggregate( FUN="median") %>% rank(ties.method = "min"), rankThenMean= .%>%rank(ties.method = "min") %>% aggregate( FUN="mean") %>%rank(ties.method = "min"), rankThenMedian=.%>%rank(ties.method = "min") %>% aggregate( FUN="median") %>%rank(ties.method = "min") ), - ordering, ...){ - - if (!inherits(x,"list")){ - if (any(is.na(x[,attr(x, "value")]))) { # only if missings present, else do nothing - if (is.null(na.treat)){ - warning("Please specify na.treat in as.challenge()") - return(NULL) - } else { - x=as.challenge(x, - value=attr(x,"value"), - algorithm=attr(x,"algorithm") , - case=attr(x,"case"), - annotator = attr(x,"annotator"), - smallBetter = !attr(x,"largeBetter"), - na.treat=na.treat) - } - } - } else { - if (any(sapply(x, - function(task) any(is.na(task[,attr(x, "value")]))))) { # only if missings present, else do nothing - if (is.null(na.treat)){ - warning("Please specify na.treat in as.challenge()") - return(NULL) - } else { - xx = melt(x, - id.vars=c(attr(x,"value"), - attr(x,"algorithm") , - attr(x,"case"), - attr(x,"annotator"), - attr(x,"by") - )) - - x=as.challenge(xx, - value=attr(x,"value"), - algorithm=attr(x,"algorithm") , - case=attr(x,"case"), - by=attr(x,"by"), - annotator = attr(x,"annotator"), - smallBetter = !attr(x,"largeBetter"), - na.treat=na.treat) - } - } - } + ordering, ...) { - if (inherits(x,"list")) { - a=lapply(methods,function(fun) fun(x)) - dat=melt(a,measure.vars="rank") - colnames(dat)[4:5]=c("task","rankingMethod") - - if (missing(ordering)){ - lev=sort(unique(dat$algorithm)) - lab=lev - } else { - lev=ordering - # lab=paste(1:length(ordering),ordering) - lab=lev - } - - dat=dat%>% - dplyr::rename(rank=.data$value)%>% - mutate(rank=factor(.data$rank))%>% - mutate(task=factor(.data$task))%>% - mutate(algorithm=factor(.data$algorithm, levels=lev,labels = lab)) - - ggplot(data = dat) + - aes(x = rankingMethod, y = rank, color=algorithm, group=algorithm ) + - geom_line(size=1)+ - facet_wrap( ~ task)+ - xlab("Ranking method") + - ylab("Rank")+ - theme( - # legend.position = "none", - #panel.spacing = unit(0, "lines"), - #strip.background = element_blank(), - # strip.text.x = element_text(angle = 90), - # strip.text.y = element_text(angle = 0), - strip.placement = "outside", - # axis.text.y = element_text(color=levels(dat$algorithm)) , - axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1) - )#+legend_none() - - } else { - - a=lapply(methods,function(fun) fun(x)) - if (is.null(x$fulldata)) a=lapply(methods,function(fun) fun(x$data)) else { - a=lapply(methods,function(fun) fun(x$fulldata)) - a=lapply(a, function(aa) { - res=aa - res$mat=aa$mat[rownames(a$testBased$mat)%in%rownames(x$mat),] - res - }) - - } - dat=melt(a) - colnames(dat)[ncol(dat)]=c("rankingMethod") - - if (missing(ordering)){ - lev=sort(unique(dat$algorithm)) - lab=lev + if (any(sapply(x, + function(task) any(is.na(task[,attr(x, "value")]))))) { # only if missings present, else do nothing + if (is.null(na.treat)) { + warning("Please specify na.treat in as.challenge()") + return(NULL) } else { - lev=ordering - # lab=paste(1:length(ordering),ordering) - lab=lev + xx = melt(x, + id.vars=c(attr(x,"value"), + attr(x,"algorithm") , + attr(x,"case"), + attr(x,"annotator"), + attr(x,"by") + )) + + 
x=as.challenge(xx, + value=attr(x,"value"), + algorithm=attr(x,"algorithm") , + case=attr(x,"case"), + by=attr(x,"by"), + annotator = attr(x,"annotator"), + smallBetter = !attr(x,"largeBetter"), + na.treat=na.treat) } - - dat=dat%>% - mutate(rank=factor(.data$rank))%>% - mutate(algorithm=factor(.data$algorithm, levels=lev,labels = lab)) - - ggplot(data = dat) + - aes(x = rankingMethod, y = rank, color=algorithm, group=algorithm ) + - geom_line(size=1)+ - xlab("Ranking method")+ - ylab("Rank")+ - theme( - strip.placement = "outside", - axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1) ) - } - - -} - - + a=lapply(methods,function(fun) fun(x)) + dat=melt(a,measure.vars="rank") + colnames(dat)[4:5]=c("task","rankingMethod") -methodsplot.ranked=function(x, - methods=list(testBased=.%>%test() %>% rank(ties.method = "min"), - meanThenRank= .%>% aggregate( FUN="mean") %>% rank(ties.method = "min"), - medianThenRank=.%>% aggregate( FUN="median") %>% rank(ties.method = "min"), - rankThenMean= .%>%rank(ties.method = "min") %>% aggregate( FUN="mean") %>%rank(ties.method = "min"), - rankThenMedian=.%>%rank(ties.method = "min") %>% aggregate( FUN="median") %>%rank(ties.method = "min") - ), - ...){ - - na.treat=x$call[[1]][[1]]$na.treat - if (any(is.na(x$data[,attr(x, "value")]))) { #missings present - if (is.null(na.treat)){ - warning("Please specify na.treat in as.challenge()") - return(NULL) - } else { - if (is.null(x$fulldata)) { - xx=x$data - x$data=as.challenge(xx, - value=attr(xx,"value"), - algorithm=attr(xx,"algorithm") , - case=attr(xx,"case"), - annotator = attr(xx,"annotator"), - smallBetter = !attr(xx,"largeBetter"), - na.treat=na.treat) - } else { - xx=x$fulldata - x$fulldata=as.challenge(xx, - value=attr(xx,"value"), - algorithm=attr(xx,"algorithm") , - case=attr(xx,"case"), - annotator = attr(xx,"annotator"), - smallBetter = !attr(xx,"largeBetter"), - na.treat=na.treat) - } - } + if (missing(ordering)){ + lev=sort(unique(dat$algorithm)) + lab=lev + } else { + lev=ordering + lab=lev } - - - if (is.null(x$fulldata)) { - a=lapply(methods,function(fun) fun(x$data)) - } else { - a=lapply(methods,function(fun) fun(x$fulldata)) - a=lapply(a, function(aa) { - res=aa - res$mat=aa$mat[rownames(a$testBased$mat)%in%rownames(x$mat),] - res - }) - - } - dat=melt(a) - colnames(dat)[ncol(dat)]=c("rankingMethod") - - ordering= names(sort(t(x$mat[,"rank",drop=F])["rank",])) - lab=lev=ordering - - dat=dat%>% - mutate(rank=factor(.data$rank))%>% - mutate(algorithm=factor(.data$algorithm, levels=lev,labels = lab)) - - ggplot(data = dat) + - aes(x = rankingMethod, y = rank, color=algorithm, group=algorithm ) + - geom_line(size=1)+ - xlab("Ranking method")+ - ylab("Rank")+ - theme( - strip.placement = "outside", - axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1) ) - -} -# methodsplot.ranked.list does not exist, use methodpsplot.challenge instead since consonsus ranking needed for ordering (or alphabetical ordering instead) + dat=dat%>% + dplyr::rename(rank=.data$value)%>% + mutate(rank=factor(.data$rank))%>% + mutate(task=factor(.data$task))%>% + mutate(algorithm=factor(.data$algorithm, levels=lev,labels = lab)) + linePlot <- ggplot(data = dat) + + aes(x = rankingMethod, y = rank, color=algorithm, group=algorithm ) + + geom_line(size=1)+ + xlab("Ranking method") + + ylab("Rank")+ + theme( + strip.placement = "outside", + axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1) + ) + # Create multi-panel plot with task names as titles for multi-task data set + if (length(x) > 1) 
{ + linePlot <- linePlot + facet_wrap(~ task) + } + return(linePlot) +} +# methodsplot.ranked.list does not exist, use methodpsplot.challenge instead since consonsus ranking needed for ordering (or alphabetical ordering instead) #similar plot to methods plot, instead of across ranking methods across tasks lineplot <- function(x,...) UseMethod("lineplot") lineplot.default <- function(x, ...) stop("not implemented for this class") -lineplot.challenge=function(x, +lineplot.challenge=function(x, ordering,...){ if (inherits(x,"list")) { dat=melt(x,measure.vars="rank") - colnames(dat)[4]=c("task") - + colnames(dat)[4]=c("task") + if (missing(ordering)){ lev=sort(unique(dat$algorithm)) lab=lev } else { lev=ordering lab=paste(1:length(ordering),ordering) } - - dat=dat%>% + + dat=dat%>% dplyr::rename(rank=.data$value)%>% mutate(rank=factor(.data$rank))%>% mutate(task=factor(.data$task))%>% mutate(algorithm=factor(.data$algorithm, levels=lev,labels = lab)) - + ggplot(data = dat) + aes(x = task, y = rank, color=algorithm, group=algorithm ) + geom_line(size=1)+ theme( axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1) ) - + } else stop("Only applicable to multiple tasks") } - diff --git a/R/podium.R b/R/podium.R index 1c04be9..66109ba 100644 --- a/R/podium.R +++ b/R/podium.R @@ -1,222 +1,163 @@ podium <- function(object,...) UseMethod("podium") podium.default <- function(object, ...) stop("not implemented for this class") - -podium.ranked=function(object, - xlab = NULL, - ylab = NULL, - lines.show = TRUE, - lines.alpha = 0.2, - lines.lwd = 1, - col, - lines.col = col, - dots.pch = 19, - dots.cex = 1, - places.lty = 2, - places.col = 1, - legendfn = function(algs, cols) { - legend("topleft", - algs, - lwd = 1, - col = cols, - bg = "white") - }, - layout.heights = c(1, 0.4), - ...) { - ordering=t(object$mat[,"rank",drop=F])["rank",] - if (is.null(xlab)) xlab <- "Podium" - if (is.null(ylab)) ylab <- "Performance" - if (missing(col)) col=default_colors(length(ordering), - algorithms = names(ordering)) - #dd=object$data - # dd will be same as object$data, except that na.treat is handled if aggregateThenRank - x=object$data - dd=as.challenge(x, - value=attr(x,"value"), - algorithm=attr(x,"algorithm") , - case=attr(x,"case"), - annotator = attr(x,"annotator"), - smallBetter = !attr(x,"largeBetter"), - na.treat=object$call[[1]][[1]]$na.treat) - - podium( dd, - ordering=ordering, - xlab = xlab, - ylab = ylab, - lines.show = lines.show, - lines.alpha = lines.alpha, - lines.lwd = lines.lwd, - col=col, - lines.col = lines.col, - dots.pch = dots.pch, - dots.cex = dots.cex, - places.lty = places.lty, - places.col = places.col, - legendfn = legendfn, - layout.heights=layout.heights, - ...) 
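With methodsplot.ranked removed earlier in this file's hunk, the across-ranking-method line plot is produced directly from the challenge object and is facetted per task only when more than one task is present. A hedged one-line sketch reusing the assumed challenge.multi object from the as.challenge() example above:

# Compare the default ranking methods; facet_wrap() panels appear only for a multi-task challenge.
methodsplot(challenge.multi)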
-} - - - -podium.ranked.list=function(object, - xlab = NULL, - ylab = NULL, - lines.show = TRUE, - lines.alpha = 0.2, - lines.lwd = 1, - col, - lines.col = col, - dots.pch = 19, - dots.cex = 1, - places.lty = 2, - places.col = 1, +podium.ranked.list=function(object, + xlab = NULL, + ylab = NULL, + lines.show = TRUE, + lines.alpha = 0.2, + lines.lwd = 1, + col, + lines.col = col, + dots.pch = 19, + dots.cex = 1, + places.lty = 2, + places.col = 1, legendfn = function(algs, cols) { legend("topleft", algs, lwd = 1, col = cols, bg = "white") - }, - layout.heights=c(1,0.4), + }, + layout.heights=c(1,0.4), ...){ if (is.null(xlab)) xlab <- "Podium" if (is.null(ylab)) ylab <- "Performance" x=object$data - for (subt in names(x)){ + + podiumPlots <- length(names(x)) + + for (subt in names(x)) { ordering=t(object$matlist[[subt]][,"rank",drop=F])["rank",] if (missing(col)) col=default_colors(length(ordering), algorithms = names(ordering)) - - dd=as.challenge(x[[subt]], - value=attr(x,"value"), - algorithm=attr(x,"algorithm") , + + dd=as.challenge(x[[subt]], + value=attr(x,"value"), + algorithm=attr(x,"algorithm"), case=attr(x,"case"), + by=attr(x, "by"), annotator = attr(x,"annotator"), smallBetter = !attr(x,"largeBetter"), na.treat=object$call[[1]][[1]]$na.treat) - - - podium( dd, - ordering=ordering, - xlab = xlab, ylab = ylab, - lines.show = lines.show, - lines.alpha = lines.alpha, - lines.lwd = lines.lwd, - col=col, - lines.col = lines.col, - dots.pch = dots.pch, - dots.cex = dots.cex, - places.lty = places.lty, - places.col = places.col, - legendfn = legendfn, - layout.heights=layout.heights, - ...) - title(subt,outer=T,line=-3) + + podiumPlot <- podium(dd, + ordering=ordering, + xlab = xlab, ylab = ylab, + lines.show = lines.show, + lines.alpha = lines.alpha, + lines.lwd = lines.lwd, + col=col, + lines.col = lines.col, + dots.pch = dots.pch, + dots.cex = dots.cex, + places.lty = places.lty, + places.col = places.col, + legendfn = legendfn, + layout.heights=layout.heights, + ...) + + if (length(names(x)) > 1) { + title(subt,outer=T,line=-3) + } + + append(podiumPlots, podiumPlot) } - } +podium.challenge=function(object, + ordering, + xlab = NULL, ylab = NULL, + lines.show = FALSE, lines.alpha = 0.2, + lines.lwd = 1, + col,lines.col = col, + dots.pch = 19, dots.cex = 1, + places.lty = 2, places.col = 1, + legendfn = function(algs, cols) { + legend("topleft", algs, lwd = 1, col = cols, bg = "white") + }, + layout.heights=c(1,0.4), + ...) 
{ + ranking=object%>%rank( ties.method = "random" ) + task <- ranking$matlist[[1]] -podium.challenge=function(object, - ordering, - xlab = NULL, ylab = NULL, - lines.show = FALSE, lines.alpha = 0.2, - lines.lwd = 1, - col,lines.col = col, - dots.pch = 19, dots.cex = 1, - places.lty = 2, places.col = 1, - legendfn = function(algs, cols) { - legend("topleft", algs, lwd = 1, col = cols, bg = "white") - }, - layout.heights=c(1,0.4), - ...){ - - ranking=object%>%rank( ties.method = "random" ) - - dat=as.data.frame(table(ranking$mat[[attr(object,"algorithm")]], - ranking$mat$rank, + dat=as.data.frame(table(task[[attr(object, "algorithm")]], + task$rank, dnn=c("algorithm","rank")), responseName = "Count") - form=as.formula(paste(attr(object,"case"),attr(object,"algorithm"),sep="~")) - ranks=acast(ranking$mat,form,value.var= "rank") - values=acast(object,form,value.var= attr(object,"value")) - nranks=acast(dat,algorithm~rank,value.var= "Count") - + form=as.formula(paste(attr(object,"case"), attr(object,"algorithm"), sep="~")) + ranks=acast(task, form, value.var="rank") + values=acast(task, form, value.var=attr(object, "value")) + nranks=acast(dat, algorithm~rank, value.var="Count") + nalgs <- ncol(ranks) algs <- colnames(ranks) - + barorder <- order(ordering) orderedAlgorithms= names(ordering)[barorder] - - ylim=range(object[[attr(object,"value")]], na.rm = TRUE) - + + ylim=range(task[[attr(object,"value")]], na.rm = TRUE) + dotplotborders <- (0:nalgs) * nalgs dotplaces <- (1:nalgs) - 0.5 names(dotplaces) <- orderedAlgorithms - + linecols <- sapply(lines.col, function(c) { r <- col2rgb(c) - rgb(r[1], r[2], r[3], - alpha = round(255 * lines.alpha), + rgb(r[1], r[2], r[3], + alpha = round(255 * lines.alpha), maxColorValue = 255) }) - + opar <- par(no.readonly = TRUE) - layout(matrix(c(1, 2), nrow = 2, byrow = TRUE), + layout(matrix(c(1, 2), nrow = 2, byrow = TRUE), heights =layout.heights) - + mar <- par("mar") par(mar = c(0, mar[2], mar[3], mar[4])) - - plot(dotplotborders, rep(ylim[2], nalgs + 1), - type = "n", - ylim = ylim, ylab = ylab, xlab = "", + + plot(dotplotborders, rep(ylim[2], nalgs + 1), + type = "n", + ylim = ylim, ylab = ylab, xlab = "", axes = F) - axis(1, at = dotplotborders, + axis(1, at = dotplotborders, labels = NA, lwd = par("lwd")) axis(2, lwd = par("lwd")) box() abline(v = dotplotborders, lty = places.lty, col = places.col) linesegments <- function(x, y, ...) { n <- length(x) segments(x[-n], y[-n], x[-1], y[-1], ...) } drawthe <- function(fn, col, ...) { for (i in 1:nrow(values)) { r <- ranks[i, ] o <- order(r) performances <- (values[i, ])[o] places <- (dotplaces[names(r)] + ((r - 1) * nalgs))[o] fn(places, performances, col = col[names(r)[o]], ...) 
} } if (lines.show) drawthe(linesegments, linecols, lwd = lines.lwd) - + drawthe(points, col, pch = dots.pch, cex = dots.cex) - - legendfn(orderedAlgorithms, + + legendfn(orderedAlgorithms, col[orderedAlgorithms]) - + par(mar = c(mar[1], mar[2], 0, mar[4])) - barplot(nranks[barorder,], - beside = TRUE, - width = 1, - axes = F, - space = c(0, 0), - border = NA, - ylim = c(0, nrow(ranks)), - names.arg = paste(1:nalgs, ".", sep = ""), - col = col[orderedAlgorithms], + barplot(nranks[barorder,], + beside = TRUE, + width = 1, + axes = F, + space = c(0, 0), + border = NA, + ylim = c(0, nrow(ranks)), + names.arg = paste(1:nalgs, ".", sep = ""), + col = col[orderedAlgorithms], xlab = xlab) - axis(1, at = c(0, dotplotborders), + axis(1, at = c(0, dotplotborders), labels = NA, lwd = par("lwd")) box() par(opar) } - - - - - - - diff --git a/R/rankingHeatmap.R b/R/rankingHeatmap.R index 5c3465a..0be6bd0 100644 --- a/R/rankingHeatmap.R +++ b/R/rankingHeatmap.R @@ -1,75 +1,60 @@ rankingHeatmap <- function(x,...) UseMethod("rankingHeatmap") rankingHeatmap.default <- function(x, ...) stop("not implemented for this class") -rankingHeatmap.ranked=function (x,ties.method="min",...) { - ordering=rownames(x$mat)[order(x$mat$rank)] - #dd=x$data - # dd will be same as x$data, except that na.treat is handled if aggregateThenRank - dd=as.challenge(x$data, - value=attr(x$data,"value"), - algorithm=attr(x$data,"algorithm") , - case=attr(x$data,"case"), - annotator = attr(x$data,"annotator"), - smallBetter = !attr(x$data,"largeBetter"), - na.treat=x$call[[1]][[1]]$na.treat) - - rankingHeatmap(dd, - ordering=ordering, - ties.method=ties.method,...) -} - - rankingHeatmap.ranked.list=function (x,ties.method="min",...) { - + xx=x$data - + a=lapply(names(x$matlist),function(subt){ ordering=rownames(x$matlist[[subt]])[order(x$matlist[[subt]]$rank)] - - dd=as.challenge(xx[[subt]],value=attr(xx,"value"), + + dd=as.challenge(xx[[subt]], + value=attr(xx,"value"), algorithm=attr(xx,"algorithm") , case=attr(xx,"case"), + by=attr(xx, "by"), annotator = attr(xx,"annotator"), smallBetter = !attr(xx,"largeBetter"), na.treat=x$call[[1]][[1]]$na.treat) - + rankingHeatmap(dd, ordering=ordering, - ties.method=ties.method,...) + ggtitle(subt) + ties.method=ties.method,...) + ggtitle(subt) }) + + # Remove title for single-task data set + if (length(a) == 1) { + a[[1]]$labels$title <- NULL + } + a } rankingHeatmap.challenge=function(x, ordering, - ties.method="min",...){ + ties.method="min",...) 
{ ranking=x%>%rank( ties.method = ties.method ) - - dat=as.data.frame(table(ranking$mat[[attr(x,"algorithm")]], - ranking$mat$rank, + + task <- ranking$matlist[[1]] + + dat=as.data.frame(table(task[[attr(x,"algorithm")]], + task$rank, dnn=c("algorithm","rank")), responseName = "Count") dat$algorithm=factor(dat$algorithm, levels=ordering) - # dat$Count=as.factor(dat$Count) - # dat$Count[dat$Count==0]=NA - ncases=length(unique(x[[attr(x,"case")]])) + ncases=length(unique(task[[attr(x,"case")]])) ggplot(dat)+ - geom_raster(aes(algorithm,rank, fill= Count))+ - geom_hline(yintercept = seq(1.5,max(ranking$mat$rank)-.5,by=1), + geom_raster(aes(algorithm, rank, fill= Count))+ + geom_hline(yintercept = seq(1.5,max(task$rank)-.5,by=1), color=grey(.8),size=.3)+ geom_vline(xintercept = seq(1.5,length(unique(dat$algorithm))-.5,by=1), color=grey(.8),size=.3)+ scale_fill_viridis_c(direction = -1, - limits=c(0,ncases), - # limits=c(1,ncases), - # breaks=function(a) round(seq(a[1],a[2],length.out=5)), - # na.value = "white" + limits=c(0,ncases) )+ theme(axis.text.x = element_text(angle = 90), aspect.ratio=1)+ xlab("Algorithm")+ ylab("Rank") - #scale_y_discrete(name="Rank",breaks=rev(ranking$matlist[[subt]]$rank)) - } diff --git a/R/report.R b/R/report.R index 7882db7..dec475f 100644 --- a/R/report.R +++ b/R/report.R @@ -1,278 +1,162 @@ report <- function(object,...) UseMethod("report") report.default <- function(object, ...) stop("not implemented for this class") -report.bootstrap=function(object, - file, - title="", - colors=default_colors, - format="PDF", - latex_engine="pdflatex", - open=TRUE,...){ - - # Copy the report file to a temporary directory before processing it, in - # case we don't have write permissions to the current working dir (which - # can happen when deployed). - if (missing(file)) tempReport <- file.path(tempdir(), "report.Rmd") - else { - a=strsplit(file,"/")[[1]] - path=paste0(a[-length(a)],collapse="/") - if (path=="") tempReport=file.path(paste0(strsplit(a[length(a)],".",fixed=T)[[1]][1],".Rmd")) - else tempReport=file.path(path, - paste0(strsplit(a[length(a)],".",fixed=T)[[1]][1],".Rmd")) - } - file.copy(file.path(system.file("appdir", package = "challengeR"), - "reportSingle.Rmd"), - tempReport, - overwrite = TRUE) - - # Set up parameters to pass to Rmd document - params <- list( - object=object, - name=title, - colors=colors - ) - - - # Knit the document, passing in the `params` list, and eval it in a - # child of the global environment (this isolates the code in the document - # from the code in this app). - out <- render(tempReport, - switch( - format, - PDF = pdf_document(number_sections=T, - latex_engine=latex_engine), - HTML = html_document(number_sections=T), - Word = word_document() - ), - params = params, - envir = new.env(parent = globalenv()), - ... - ) - - if (!missing(file)){ - if (is.na(strsplit(file,".",fixed=T)[[1]][2])) file=paste0(file, - ".", - strsplit(out,".",fixed=T)[[1]][2]) - file.rename(out, file) - } else file=out - - file.remove(tempReport) - - if (open) system(paste0('open "', file, '"')) - -} - - - report.bootstrap.list=function(object, consensus, file, title="", colors=default_colors, format="PDF", latex_engine="pdflatex", open=TRUE,...){ # Copy the report file to a temporary directory before processing it, in # case we don't have write permissions to the current working dir (which # can happen when deployed). 
if (missing(file)) tempReport <- file.path(tempdir(), "report.Rmd") else { a=strsplit(file,"/")[[1]] path=paste0(a[-length(a)],collapse="/") if (path=="") tempReport=file.path(paste0(strsplit(a[length(a)], ".", fixed=T)[[1]][1],".Rmd")) else tempReport=file.path(path,paste0(strsplit(a[length(a)], ".", fixed=T)[[1]][1],".Rmd")) - } - file.copy(file.path(system.file("appdir", package = "challengeR"), - "reportMultiple.Rmd"), - tempReport, + } + file.copy(file.path(system.file("appdir", package = "challengeR"), + "reportMultiple.Rmd"), + tempReport, overwrite = TRUE) - + + if (length(object$matlist) > 1) { + consensus = consensus + isMultiTask = TRUE + } + else { + consensus = NULL + isMultiTask = FALSE + } + # Set up parameters to pass to Rmd document params <- list( object=object, consensus=consensus, name=title, - colors=colors + colors=colors, + isMultiTask=isMultiTask, + bootstrappingEnabled=TRUE ) - + # Knit the document, passing in the `params` list, and eval it in a # child of the global environment (this isolates the code in the document # from the code in this app). # render(tempReport, output_file = file, # params = params, # envir = new.env(parent = globalenv()) # ) - out <- render(tempReport, - switch( + out <- render(tempReport, + switch( format, PDF = pdf_document(number_sections=T, - latex_engine=latex_engine), - HTML = html_document(number_sections=T), + latex_engine=latex_engine), + HTML = html_document(number_sections=T), Word = word_document(df_print="kable") ), params = params, envir = new.env(parent = globalenv()), ... ) - - if (!missing(file)){ - if (is.na(strsplit(file,".",fixed=T)[[1]][2])) file=paste0(file, - ".", - strsplit(out,".",fixed=T)[[1]][2]) - file.rename(out, file) - } else file=out - - file.remove(tempReport) - - if (open) system(paste0('open "', file, '"')) - -} - - - -######################## - -report.ranked=function(object, - file, - title="", - colors=default_colors, - format="PDF", - latex_engine="pdflatex", - open=TRUE,...){ - - # Copy the report file to a temporary directory before processing it, in - # case we don't have write permissions to the current working dir (which - # can happen when deployed). - if (missing(file)) tempReport <- file.path(tempdir(), "report.Rmd") - else { - a=strsplit(file,"/")[[1]] - path=paste0(a[-length(a)],collapse="/") - if (path=="") tempReport=file.path(paste0(strsplit(a[length(a)], - ".", - fixed=T)[[1]][1],".Rmd")) - else tempReport=file.path(path, - paste0(strsplit(a[length(a)], - ".", - fixed=T)[[1]][1],".Rmd")) - } - file.copy(file.path(system.file("appdir", package = "challengeR"), - "reportSingleShort.Rmd"), - tempReport, - overwrite = TRUE) - - # Set up parameters to pass to Rmd document - params <- list( - object=object, - name=title, - colors=colors - ) - - - # Knit the document, passing in the `params` list, and eval it in a - # child of the global environment (this isolates the code in the document - # from the code in this app). - out <- render(tempReport, - switch( - format, - PDF = pdf_document(number_sections=T, - latex_engine=latex_engine), - HTML = html_document(number_sections=T), - Word = word_document() - ), - params = params, - envir = new.env(parent = globalenv()),... 
- ) - if (!missing(file)){ if (is.na(strsplit(file,".",fixed=T)[[1]][2])) file=paste0(file, ".", strsplit(out,".",fixed=T)[[1]][2]) - file.rename(out, file) + file.rename(out, file) } else file=out - + file.remove(tempReport) - + if (open) system(paste0('open "', file, '"')) - } - - report.ranked.list=function(object, consensus, file, title="", colors=default_colors, format="PDF", latex_engine="pdflatex", open=TRUE, ...){ # Copy the report file to a temporary directory before processing it, in # case we don't have write permissions to the current working dir (which # can happen when deployed). - if (missing(file)) tempReport <- file.path(tempdir(), + if (missing(file)) tempReport <- file.path(tempdir(), "report.Rmd") else { a=strsplit(file,"/")[[1]] path=paste0(a[-length(a)], collapse="/") if (path=="") tempReport=file.path(paste0(strsplit(a[length(a)], ".", fixed=T)[[1]][1], ".Rmd")) else tempReport=file.path(path, paste0(strsplit(a[length(a)], ".", fixed=T)[[1]][1], ".Rmd")) - } - file.copy(file.path(system.file("appdir", package = "challengeR"), - "reportMultipleShort.Rmd"), - tempReport, + } + file.copy(file.path(system.file("appdir", package = "challengeR"), + "reportMultiple.Rmd"), + tempReport, overwrite = TRUE) - + + if (length(object$matlist) > 1) { + consensus = consensus + isMultiTask = TRUE + } + else { + consensus = NULL + isMultiTask = FALSE + } + # Set up parameters to pass to Rmd document params <- list( object=object, consensus=consensus, name=title, - colors=colors + colors=colors, + isMultiTask=isMultiTask, + bootstrappingEnabled=FALSE ) - + # Knit the document, passing in the `params` list, and eval it in a # child of the global environment (this isolates the code in the document # from the code in this app). - out <- render(tempReport, + out <- render(tempReport, switch( format, PDF = pdf_document(number_sections=T, - latex_engine=latex_engine), - HTML = html_document(number_sections=T), + latex_engine=latex_engine), + HTML = html_document(number_sections=T), Word = word_document(df_print="kable") ), params = params, envir = new.env(parent = globalenv()), ... ) - + if (!missing(file)){ if (is.na(strsplit(file,".",fixed=T)[[1]][2])) file=paste0(file, ".", strsplit(out,".",fixed=T)[[1]][2]) - file.rename(out, file) + file.rename(out, file) } else file=out - + file.remove(tempReport) - + if (open) system(paste0('open "', file, '"')) - } - - diff --git a/R/rrank.R b/R/rrank.R index 34f9f59..020d373 100644 --- a/R/rrank.R +++ b/R/rrank.R @@ -1,59 +1,48 @@ rank <- function(object,...) UseMethod("rank") rank.default <- function(object, ...) base::rank(object,...) 
#stats::aggregate rank.challenge=function(object, x, ties.method="min",...){ call=as.list(match.call()) if (!is.null(attr(object,"annotator"))) { call2=call("Rank", - object=call$object, - x=attr(object,"value"), + object=call$object, + x=attr(object,"value"), annotator=c(attr(object,"annotator")), ties.method=ties.method, - largeBetter=attr(object,"largeBetter") + largeBetter=attr(object,"largeBetter") ) res1=do.call("Rank",list(object=object, x=attr(object,"value"), annotator=c(attr(object,"annotator")), ties.method=ties.method, - largeBetter=attr(object,"largeBetter") + largeBetter=attr(object,"largeBetter") )) - + } else { call2=call("Rank", - object=call$object, - x=attr(object,"value"), + object=call$object, + x=attr(object,"value"), ties.method=ties.method, - largeBetter=attr(object,"largeBetter") + largeBetter=attr(object,"largeBetter") ) res1=do.call("Rank",list(object=object, x=attr(object,"value"), ties.method=ties.method, - largeBetter=attr(object,"largeBetter") + largeBetter=attr(object,"largeBetter") )) - + } - - if (inherits(object,"list")){ - res=list(FUN = . %>% (call2), - call=list(call2), - FUN.list=list("rank"), - data=object, - matlist=res1$matlist) - - class(res)=c("ranked.list",class(res)) - } else { - res=list(FUN = . %>% (call2), - call=list(call2), - FUN.list=list("rank"), - data=object, - mat=res1$mat) - - class(res)=c("ranked",class(res)) - - } + + res=list(FUN = . %>% (call2), + call=list(call2), + FUN.list=list("rank"), + data=object, + matlist=res1$matlist) + + class(res)=c("ranked.list",class(res)) res } diff --git a/R/significancePlot.R b/R/significancePlot.R index 1d1c867..adc892b 100644 --- a/R/significancePlot.R +++ b/R/significancePlot.R @@ -1,165 +1,145 @@ significanceMap <- function(object,...) UseMethod("significanceMap") significanceMap.default <- function(object, ...) stop("not implemented for this class") - -significanceMap.ranked=function(object, - alpha=0.05,p.adjust.method="holm", - order=FALSE, - size.rank=.3*theme_get()$text$size,...){ - - relensemble= object$data%>%decision.challenge(na.treat=object$call[[1]][[1]]$na.treat, - alpha=alpha, - p.adjust.method=p.adjust.method - ) %>% as.relation - - significanceMap(object=object$mat, - relation_object=relensemble, - order=order, - size.rank=size.rank,... - ) - -} - - - significanceMap.ranked.list=function(object, alpha=0.05,p.adjust.method="holm", order=FALSE, size.rank=.3*theme_get()$text$size,...){ - + a=object$data%>%decision.challenge(na.treat=object$call[[1]][[1]]$na.treat, alpha=alpha, p.adjust.method=p.adjust.method) - + aa=lapply(a, as.relation.challenge.incidence) names(aa)=names(object$data) - + relensemble= do.call(relation_ensemble,args = aa) - + res=list() - for (Task in names(object$data)){ - res[[Task]]=significanceMap.data.frame(object=object$matlist[[Task]], - relation_object=relensemble[[Task]], + for (task in names(object$data)){ + res[[task]]=significanceMap.data.frame(object=object$matlist[[task]], + relation_object=relensemble[[task]], order=order, size.rank=size.rank,... 
- ) + ggtitle(Task) - + ) + ggtitle(task) + } - - res -} + # Remove title for single-task data set + if (length(res) == 1) { + res[[1]]$labels$title <- NULL + } + + res +} significanceMap.data.frame=function(object, relation_object, order=FALSE, size.rank=.3*theme_get()$text$size,...){ object$algorithm=rownames(object) inc=relation_incidence(relation_object) - + if (order){ scores=apply(inc,1, function(x) sum(x==0)-1) scores2=apply(inc,2, function(x) sum(x==1))[names(scores)]#+1-nrow(inc)) scores=data.frame(algorithm=names(scores), score=scores, score2=scores2, stringsAsFactors =F) scores=right_join(scores, object, by="algorithm") - + ordering= (scores[order(scores$score, scores$score2, scores$rank),"algorithm"]) scores=scores[,1:3] } else ordering= names(sort(t(object[,"rank",drop=F])["rank",])) - + inc=inc[ordering,] - + incidence.mat=melt(inc) colnames(incidence.mat)=c("algorithm","notsigPair", "decision") incidence.mat$algorithm=as.character(incidence.mat$algorithm) incidence.mat$notsigPair=as.character(incidence.mat$notsigPair) incidence.mat=right_join(incidence.mat, object, by="algorithm") if (order) incidence.mat=right_join(incidence.mat, scores, by="algorithm") - - incidence.mat=incidence.mat%>%mutate(algorithm=factor(.data$algorithm, + + incidence.mat=incidence.mat%>%mutate(algorithm=factor(.data$algorithm, levels=ordering), - notsigPair=factor(.data$notsigPair, + notsigPair=factor(.data$notsigPair, levels=ordering)) - + incidence.mat$decision=as.factor(incidence.mat$decision) p=ggplot(incidence.mat) + geom_raster(aes(algorithm, notsigPair, fill=decision),...)+ geom_raster(aes(algorithm,algorithm), fill="white")+ - geom_abline(slope=1) + + geom_abline(slope=1) + coord_cartesian(clip = 'off')+ theme(aspect.ratio=1, axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1), plot.margin=unit(c(1,1,1,1), "lines"), legend.position="none")+ ylab("Algorithm")+ xlab("Algorithm")+ scale_fill_manual(values=cividis(2,begin=0,end=1,alpha=.7)) - - fixy=0 th_get=theme_get() - # grid on top + # grid on top lt=th_get$panel.grid$linetype if (is.null(lt)) lt=th_get$line$linetype #p=p+theme(panel.background = element_rect(fill = NA),panel.ontop=TRUE) #-> grid will be on top of diagonal #fix: f=ggplot_build(p) - p= p + geom_vline(xintercept=f$layout$panel_params[[1]]$x.major_source, + p= p + geom_vline(xintercept=f$layout$panel_params[[1]]$x.major_source, linetype=lt, - color=th_get$panel.grid$colour, + color=th_get$panel.grid$colour, size=rel(th_get$panel.grid.major$size))+ - geom_hline(yintercept=f$layout$panel_params[[1]]$y.major_source, + geom_hline(yintercept=f$layout$panel_params[[1]]$y.major_source, linetype=lt, - color=th_get$panel.grid$colour, + color=th_get$panel.grid$colour, size=rel(th_get$panel.grid.major$size))+ geom_abline(slope=1)+ geom_text(aes(x=algorithm,y=fixy,label=rank), - nudge_y=.5, + nudge_y=.5, vjust = 0, size=size.rank, fontface="plain",family="sans" ) - + if (order) p= p+ geom_text(aes(x=algorithm,y=fixy,label=score), - nudge_y=0, + nudge_y=0, vjust = 0, size=size.rank, - fontface="plain",family="sans") + + fontface="plain",family="sans") + annotate("text", - x=0,y=fixy+.5, - vjust = 0, - size=size.rank, + x=0,y=fixy+.5, + vjust = 0, + size=size.rank, fontface="plain", family="sans", label="original")+ - annotate("text",x=0,y=fixy, - vjust = 0, + annotate("text",x=0,y=fixy, + vjust = 0, size=size.rank, fontface="plain",family="sans",label="new") - + return(p) } - diff --git a/R/subset.R b/R/subset.R index c5a1633..51f43e2 100644 --- a/R/subset.R +++ b/R/subset.R @@ 
-1,95 +1,113 @@ +subset <- function(x,...) UseMethod("subset") +subset.default <- function(x, ...) stop("not implemented for this class") +taskSubset <- function(x,...) UseMethod("taskSubset") +taskSubset.default <- function(x, ...) stop("not implemented for this class") + + subset.comparedRanks.list=function(x, tasks,...){ res=x[tasks] class(res)="comparedRanks.list" res } subset.list=function(x, tasks,...){ x[tasks] } -subset.bootstrap.list=function(x, - tasks,...){ - if (!is.null(as.list(match.call(expand.dots = T))$top)) stop("Subset of algorithms only sensible for single task challenges.") - res=list(bootsrappedRanks=x$bootsrappedRanks[tasks], - bootsrappedAggregate=x$bootsrappedAggregate[tasks], - matlist=x$matlist[tasks], - data=x$data[tasks], - FUN=x$FUN - ) - - attrib=attributes(x$data) - attrib$names=attr(res$data,"names") - attributes(res$data)=attrib - class(res)="bootstrap.list" - res - -} - -subset.ranked.list=function(x, - tasks,...){ - if (!is.null(as.list(match.call(expand.dots = T))$top)) stop("Subset of algorithms only sensible for single task challenges.") - res=list(matlist=x$matlist[tasks], - data=x$data[tasks], - call=x$call, - FUN=x$FUN, - FUN.list=x$FUN.list - ) - - attrib=attributes(x$data) - attrib$names=attr(res$data,"names") - attributes(res$data)=attrib - class(res)=c("ranked.list","list") - res - -} - subset.aggregated.list=function(x, tasks,...){ - call=match.call(expand.dots = T) + call=match.call(expand.dots = T) if (!is.null(as.list(call$top))) stop("Subset of algorithms only sensible for single task challenges.") matlist=x$matlist[tasks] res=list(matlist=matlist, call=list(x$call,call), data=x$data, FUN = . %>% (x$FUN) %>% (call) ) - + class(res)=class(x) res - + } which.top=function(object, top){ mat=object$mat[object$mat$rank<=top,] rownames(mat)#[order(mat$rank)] } -subset.ranked=function(x, - top,...){ +subset.ranked.list <- function(x, + top,...) { + + if (length(x$matlist) != 1) { + stop("Subset of algorithms only sensible for single-task challenges.") + } + + taskMat <- x$matlist[[1]] + taskData <- x$data[[1]] objectTop=x - objectTop$mat=objectTop$mat[objectTop$mat$rank<=top,] - objectTop$data=objectTop$data[objectTop$data[[attr(objectTop$data,"algorithm")]]%in% rownames(objectTop$mat),] - objectTop$data[[attr(objectTop$data,"algorithm")]]=droplevels(objectTop$data[[attr(objectTop$data,"algorithm")]]) - + objectTop$matlist[[1]]=taskMat[taskMat$rank<=top,] + + taskMatRowNames <- rownames(objectTop$matlist[[1]]) + attribute <- attr(objectTop$data,"algorithm") + + selectedRowNames <- taskData[[attribute]] %in% taskMatRowNames + objectTop$data[[1]] <- taskData[selectedRowNames,] + objectTop$data[[1]][[attribute]] <- droplevels(objectTop$data[[1]][[attribute]]) + objectTop$fulldata=x$data objectTop } -subset.bootstrap=function(x, - top,...){ - objectTop=x - objectTop$mat=objectTop$mat[objectTop$mat$rank<=top,] - objectTop$data=objectTop$data[objectTop$data[[attr(objectTop$data,"algorithm")]]%in% rownames(objectTop$mat),] - objectTop$data[[attr(objectTop$data,"algorithm")]]=droplevels(objectTop$data[[attr(objectTop$data,"algorithm")]]) - objectTop$fulldata=x$data - objectTop$bootsrappedRanks=objectTop$bootsrappedRanks[rownames(objectTop$mat),] - objectTop$bootsrappedAggregate=objectTop$bootsrappedAggregate[rownames(objectTop$mat),] +subset.bootstrap.list=function(x, + top,...) 
{ + + if (length(x$matlist) != 1) { + stop("Subset of algorithms only sensible for single-task challenges.") + } + + objectTop <- subset.ranked.list(x, top = top) + + objectTop$bootsrappedRanks[[1]] <- objectTop$bootsrappedRanks[[1]][rownames(objectTop$matlist[[1]]),] + objectTop$bootsrappedAggregate[[1]] <- objectTop$bootsrappedAggregate[[1]][rownames(objectTop$matlist[[1]]),] objectTop } + +taskSubset.ranked.list <- function(x, + tasks,...) { + + res=list(matlist=x$matlist[tasks], + data=x$data[tasks], + call=x$call, + FUN=x$FUN, + FUN.list=x$FUN.list + ) + + attrib=attributes(x$data) + attrib$names=attr(res$data,"names") + attributes(res$data)=attrib + class(res)=c("ranked.list","list") + res +} + +taskSubset.bootstrap.list <- function(x, + tasks,...) { + + res=list(bootsrappedRanks=x$bootsrappedRanks[tasks], + bootsrappedAggregate=x$bootsrappedAggregate[tasks], + matlist=x$matlist[tasks], + data=x$data[tasks], + FUN=x$FUN + ) + + attrib=attributes(x$data) + attrib$names=attr(res$data,"names") + attributes(res$data)=attrib + class(res)="bootstrap.list" + res +} diff --git a/R/violin.R b/R/violin.R index a62c6e0..2d17676 100644 --- a/R/violin.R +++ b/R/violin.R @@ -1,81 +1,78 @@ violin <- function(x,...) UseMethod("violin") violin.default <- function(x, ...) stop("not implemented for this class") -violin.bootstrap=function(x,...){ - a=list(bootsrappedRanks=list(x$bootsrappedRanks), - matlist=list(x$mat)) - names(a$bootsrappedRanks)=names(a$matlist)="" - violin.bootstrap.list(a,...) - -} - - violin.bootstrap.list=function(x,...){ ken=melt(kendall.bootstrap.list(x)) colnames(ken)[2]="Task" cat("\n\nSummary Kendall's tau\n") ss=ken%>%group_by(Task)%>% summarise(mean=mean(value,na.rm=T), median=median(value,na.rm=T), q25=quantile(value,probs = .25,na.rm=T), - q75=quantile(value,probs = .75,na.rm=T))%>% + q75=quantile(value,probs = .75,na.rm=T))%>% arrange(desc(median)) - + print(as.data.frame(ss)) - - ken%>%mutate(Task=factor(.data$Task, + + xAxisText <- element_blank() + + # Show task names as tick mark labels only for multi-task data set + if (length(x$data) > 1) { + xAxisText <- element_text(angle = 90, vjust = 0.5, hjust = 1) + } + + ken%>%mutate(Task=factor(.data$Task, levels=ss$Task))%>% ggplot(aes(Task,value))+ geom_violin(alpha=.3, color=NA, fill="blue")+ - geom_boxplot(width=0.1, + geom_boxplot(width=0.1, fill="white")+ - theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1), + theme(axis.text.x = xAxisText, legend.position = "none")+ ylab("Kendall's tau")+ scale_y_continuous(limits=c(min(min(ken$value),0), max(max(ken$value),1))) } kendall.bootstrap.list=function(x){ ken=lapply(1:length(x$bootsrappedRanks),function(Task){ id=match(rownames( x$bootsrappedRanks[[Task]]), rownames(x$matlist[[Task]]) ) - sapply(x$bootsrappedRanks[[Task]], + sapply(x$bootsrappedRanks[[Task]], function(bootSample) suppressWarnings(kendall(bootSample, x$matlist[[Task]]$rank[id]))) } ) names(ken)=names((x$bootsrappedRanks)) - + if (sum(is.na(x))>0){ cat("Bootstrap samples without variability in rankings (all algorithms ranked 1) excluded.\n Frequency of such samples by task:\n",fill = T) sapply(ken,function(x) sum(is.na(x))) } - - + + return(ken) - + } density.bootstrap.list=function(x,...){ ken=melt(kendall.bootstrap.list(x)) colnames(ken)[2]="Task" - + cat("\n\nSummary Kendall's tau\n") ss=ken%>%group_by(Task)%>% summarise(mean=mean(value,na.rm=T), median=median(value,na.rm=T), q25=quantile(value,probs = .25,na.rm=T), - q75=quantile(value,probs = .75,na.rm=T))%>% + q75=quantile(value,probs 
= .75,na.rm=T))%>% arrange(desc(median)) - + print(as.data.frame(ss)) - + ggplot(ken)+ geom_density(aes(value,fill=Task),alpha=.3,color=NA) } - diff --git a/R/visualization.R b/R/visualization.R index 19158bc..93cf5a4 100644 --- a/R/visualization.R +++ b/R/visualization.R @@ -1,275 +1,270 @@ stability <- function(x,...) UseMethod("stability") stability.default <- function(x, ...) stop("not implemented for this class") stabilityByAlgorithm <- function(x,...) UseMethod("stabilityByAlgorithm") stabilityByAlgorithm.default <- function(x, ...) stop("not implemented for this class") stabilityByAlgorithmStacked <- function(x,...) UseMethod("stabilityByAlgorithmStacked") stabilityByAlgorithmStacked.default <- function(x, ...) stop("not implemented for this class") stabilityByTask <- function(x,...) UseMethod("stabilityByTask") stabilityByTask.default <- function(x, ...) stop("not implemented for this class") stability.ranked.list=function(x, ordering, probs=c(.025,.975), max_size=6, freq=FALSE, - shape=4,...){ + shape=4,...) { + if (length(x$data) < 2) { + stop("The stability of rankings across tasks cannot be computed for less than two tasks.") + } + dd=melt(x, measure.vars="rank", value.name="rank") %>% dplyr::rename(task="L1") - + if (!missing(ordering)) { - dd=dd%>%mutate(algorithm=factor(.data$algorithm, + dd=dd%>%mutate(algorithm=factor(.data$algorithm, levels=ordering)) } else dd=dd%>%mutate(algorithm=factor(.data$algorithm)) - + if (!freq) { p = ggplot(dd)+ geom_count(aes(algorithm , rank, color=algorithm , size = stat(prop*100))) } else { p=ggplot(dd)+ geom_count(aes(algorithm, rank ,color=algorithm )) - } - + } + p+scale_size_area(max_size = max_size)+ stat_summary(aes(algorithm ,rank ), geom="point", shape=shape, fun.data=function(x) data.frame(y=median(x)),...)+ stat_summary(aes(algorithm ,rank ), geom="linerange", fun.data=function(x) data.frame(ymin=quantile(x,probs[1]), ymax=quantile(x,probs[2])))+ geom_abline(slope=1, color="gray", linetype="dotted")+ - theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+ + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+ guides(size = guide_legend(title="%"))+ scale_y_continuous(minor_breaks=NULL, - limits=c(1,max(5,max(dd$rank))), + limits=c(1,max(5,max(dd$rank))), breaks=c(1,seq(5,max(5,max(dd$rank)),by=5)))+ xlab("Algorithm")+ ylab("Rank") - + } rankdist.bootstrap.list=function(x,...){ rankDist=melt(lapply(x$bootsrappedRanks,t), value.name="rank") %>% dplyr::rename(algorithm="Var2",task="L1") rankDist } stabilityByAlgorithm.bootstrap.list=function(x, ordering, probs=c(.025,.975), max_size=3, shape=4, - single=FALSE,...){ - rankDist=rankdist.bootstrap.list(x) - -if (!missing(ordering)) rankDist=rankDist%>%mutate(algorithm=factor(.data$algorithm, + single=FALSE,...) 
{ + + if (length(x$data) < 2) { + stop("The stability of rankings by algorithm cannot be computed for less than two tasks.") + } + + rankDist=rankdist.bootstrap.list(x) + +if (!missing(ordering)) rankDist=rankDist%>%mutate(algorithm=factor(.data$algorithm, levels=ordering)) if (single==FALSE){ ggplot(rankDist)+ geom_count(aes(task , rank, color=algorithm, - size = stat(prop*100), + size = stat(prop*100), group = task ))+ scale_size_area(max_size = max_size)+ stat_summary(aes(task ,rank ), geom="point", shape=shape, fun.data=function(x) data.frame(y=median(x)),...)+ stat_summary(aes(task ,rank ), geom="linerange", fun.data=function(x) data.frame(ymin=quantile(x,probs[1]), ymax=quantile(x,probs[2])))+ - facet_wrap(vars(algorithm))+ - theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+ + facet_wrap(vars(algorithm))+ + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+ guides(size = guide_legend(title="%"))+ scale_y_continuous(minor_breaks=NULL, - limits=c(1,max(5,max(rankDist$rank))), + limits=c(1,max(5,max(rankDist$rank))), breaks=c(1,seq(5,max(5,max(rankDist$rank)),by=5)))+ xlab("Task")+ ylab("Rank") - + } else { pl=list() for (alg in ordering){ rankDist.alg=subset(rankDist, rankDist$algorithm==alg) pl[[alg]]=ggplot(rankDist.alg)+ geom_count(aes(task , rank, color=algorithm, - size = stat(prop*100), + size = stat(prop*100), group = task ))+ scale_size_area(max_size = max_size)+ stat_summary(aes(task , rank ), geom="point", shape=shape, fun.data=function(x) data.frame(y=median(x)),...)+ stat_summary(aes(task ,rank ), geom="linerange", fun.data=function(x) data.frame(ymin=quantile(x,probs[1]), ymax=quantile(x,probs[2])))+ - facet_wrap(vars(algorithm))+ - theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+ + facet_wrap(vars(algorithm))+ + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+ guides(size = guide_legend(title="%"))+ scale_y_continuous(minor_breaks=NULL, - limits=c(1,max(5,max(rankDist$rank))), + limits=c(1,max(5,max(rankDist$rank))), breaks=c(1,seq(5,max(5,max(rankDist$rank)),by=5)))+ xlab("Task")+ ylab("Rank") - - + + } pl } - + } stabilityByAlgorithmStacked.bootstrap.list=function(x, ordering, - freq=FALSE,...){ + freq=FALSE,...) 
{ + if (length(x$data) < 2) { + stop("The stability of rankings by algorithm cannot be computed for less than two tasks.") + } + rankDist=rankdist.bootstrap.list(x) - if (!missing(ordering)) rankDist=rankDist%>%mutate(algorithm=factor(.data$algorithm, + if (!missing(ordering)) rankDist=rankDist%>%mutate(algorithm=factor(.data$algorithm, levels=ordering)) rankDist=rankDist%>%group_by(task)%>%dplyr::count(.data$algorithm, .data$rank) rankDist=rankDist%>%group_by(.data$algorithm)%>%mutate(prop=.data$n/sum(.data$n))%>%ungroup - rankDist=rankDist%>%data.frame%>%mutate(rank=as.factor(.data$rank)) - - + rankDist=rankDist%>%data.frame%>%mutate(rank=as.factor(.data$rank)) + + results= melt.ranked.list(x, measure.vars="rank", value.name="rank") %>%dplyr::select(-.data$variable) colnames(results)[3]="task" - if (!missing(ordering)) results=results%>%mutate(algorithm=factor(.data$algorithm, + if (!missing(ordering)) results=results%>%mutate(algorithm=factor(.data$algorithm, levels=ordering)) - - if (freq) + + if (freq) ggplot(rankDist) + geom_bar(aes(rank, n, - fill=task ), - position = "stack", - stat = "identity") + - facet_wrap(vars(algorithm))+ + fill=task ), + position = "stack", + stat = "identity") + + facet_wrap(vars(algorithm))+ geom_vline(aes(xintercept=rank, color=task), size=.6, linetype="dotted", data=results )+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+ xlab("Rank") - else + else ggplot(rankDist)+ geom_bar(aes(rank, prop, - fill=task ), - position = "stack", + fill=task ), + position = "stack", stat = "identity")+ - facet_wrap(vars(algorithm))+ + facet_wrap(vars(algorithm))+ geom_vline(aes(xintercept=rank, color=task), size=.4, linetype="dotted", data=results)+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+ xlab("Rank") } -stability.bootstrap=function(x, - ordering, - probs=c(.025,.975), - max_size=3, - size.ranks=.3*theme_get()$text$size, - shape=4,...){ - if (missing(ordering)) ordering= names(sort(t(x$mat[,"rank",drop=F])["rank",])) - a=list(bootsrappedRanks=list(x$bootsrappedRanks), - matlist=list(x$mat)) - names(a$bootsrappedRanks)=names(a$matlist)="" - stabilityByTask.bootstrap.list(a, - ordering=ordering, - probs=probs, - max_size = max_size, - size.ranks=size.ranks, - shape=shape,...) 
- - -} - - - stabilityByTask.bootstrap.list=function(x, ordering, probs=c(.025,.975), max_size=3, size.ranks=.3*theme_get()$text$size, - shape=4,...){ + shape=4, + showLabelForSingleTask=FALSE,...){ rankDist=rankdist.bootstrap.list(x) ranks=melt.ranked.list(x, - measure.vars="rank", + measure.vars="rank", value.name = "full.rank") colnames(ranks)[4]="task" if (!missing(ordering)) { - ranks$algorithm=factor(ranks$algorithm, + ranks$algorithm=factor(ranks$algorithm, levels=ordering) - rankDist=rankDist%>%mutate(algorithm=factor(.data$algorithm, + rankDist=rankDist%>%mutate(algorithm=factor(.data$algorithm, levels=ordering)) } - - ggplot(rankDist)+ + + blobPlot <- ggplot(rankDist)+ geom_count(aes(algorithm , rank, color=algorithm, - size = stat(prop*100), + size = stat(prop*100), group = algorithm ))+ scale_size_area(max_size = max_size)+ geom_abline(slope=1, color="gray", linetype="dotted")+ stat_summary(aes(algorithm ,rank ), geom="point", shape=shape, fun.data=function(x) data.frame(y=median(x)),...)+ stat_summary(aes(algorithm ,rank ), geom="linerange", fun.data=function(x) data.frame(ymin=quantile(x,probs[1]), ymax=quantile(x,probs[2])))+ geom_text(aes(x=algorithm,y=1,label=full.rank), - nudge_y=-.6, + nudge_y=-.6, vjust = 0, size=size.ranks, fontface="plain", family="sans", - data=ranks) + + data=ranks) + coord_cartesian(clip = 'off')+ - facet_wrap(vars(task))+ - theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+ + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+ guides(size = guide_legend(title="%"))+ scale_y_continuous(minor_breaks=NULL, - limits=c(.4,max(5,max(rankDist$rank))), + limits=c(.4,max(5,max(rankDist$rank))), breaks=c(1,seq(5,max(5,max(rankDist$rank)),by=5)))+ xlab("Algorithm")+ ylab("Rank") -} - + # Create multi-panel plot with task names as labels for multi-task data set or single-task data set when explicitly specified + if (length(x$data) > 1 || showLabelForSingleTask == TRUE) { + blobPlot <- blobPlot + facet_wrap(vars(task)) + } + return(blobPlot) +} diff --git a/README.md b/README.md index 52a75b6..b05bee6 100644 --- a/README.md +++ b/README.md @@ -1,386 +1,388 @@ Methods and open-source toolkit for analyzing and visualizing challenge results ================ - [Installation](#installation) - [Terms of use](#terms-of-use) - [Usage](#usage) - [Changes](#changes) - [Reference](#reference) Note that this is ongoing work (version 0.3.3), there may be updates with possibly major changes. *Please make sure that you use the most current version\!* Change log at the end of this document. # Installation Requires R version \>= 3.5.2 (). Further, a recent version of Pandoc (\>= 1.12.3) is required. RStudio () automatically includes this so you do not need to download Pandoc if you plan to use rmarkdown from the RStudio IDE, otherwise you’ll need to install Pandoc for your platform (). Finally, if you want to generate a pdf report you will need to have LaTeX installed (e.g. MiKTeX, MacTeX or TinyTeX). To get the current development version of the R package from Github: ``` r if (!requireNamespace("devtools", quietly = TRUE)) install.packages("devtools") if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install("Rgraphviz", dependencies = TRUE) devtools::install_github("wiesenfa/challengeR", dependencies = TRUE) ``` If you are asked whether you want to update installed packages and you type “a” for all, you might need administrator rights to update R core packages. 
You can also try to type “n” for updating no packages. If you are asked “Do you want to install from sources the packages which need compilation? (Yes/no/cancel)”, you can safely type “no”. If you get *Warning messages* (in contrast to *Error* messages), these might not be problematic and you can try to proceed. # Terms of use Licenced under GPL-3. If you use this software for a publication, cite Wiesenfarth, M., Reinke, A., Landman, B.A., Cardoso, M.J., Maier-Hein, L. and Kopp-Schneider, A. (2019). Methods and open-source toolkit for analyzing and visualizing challenge results. *arXiv preprint arXiv:1910.05121* # Usage Each of the following steps have to be run to generate the report: (1) Load package, (2) load data, (3) perform ranking, (4) perform bootstrapping and (5) generation of the report ## 1\. Load package Load package ``` r library(challengeR) ``` ## 2\. Load data ### Data requirements Data requires the following *columns* - a *task identifier* in case of multi-task challenges. - a *test case identifier* - the *algorithm name* - the *metric value* In case of missing metric values, a missing observation has to be provided (either as blank field or “NA”). For example, in a challenge with 2 tasks, 2 test cases and 2 algorithms, where in task “T2”, test case “case2”, algorithm “A2” didn’t give a prediction (and thus NA or a blank field for missing value is inserted), the data set might look like this: | Task | TestCase | Algorithm | MetricValue | | :--- | :------- | :-------- | ----------: | | T1 | case1 | A1 | 0.266 | | T1 | case1 | A2 | 0.202 | | T1 | case2 | A1 | 0.573 | | T1 | case2 | A2 | 0.945 | | T2 | case1 | A1 | 0.372 | | T2 | case1 | A2 | 0.898 | | T2 | case2 | A1 | 0.908 | | T2 | case2 | A2 | NA | ### Load data If you have assessment data at hand stored in a csv file (if you want to use simulated data skip the following code line) use ``` r data_matrix=read.csv(file.choose()) # type ?read.csv for help ``` This allows to choose a file interactively, otherwise replace *file.choose()* by the file path (in style “/path/to/dataset.csv”) in quotation marks. For illustration purposes, in the following simulated data is generated *instead* (skip the following code chunk if you have already loaded data). The data is also stored as “data\_matrix.csv” in the repository. ``` r if (!requireNamespace("permute", quietly = TRUE)) install.packages("permute") n=50 set.seed(4) strip=runif(n,.9,1) c_ideal=cbind(task="c_ideal", rbind( data.frame(alg_name="A1",value=runif(n,.9,1),case=1:n), data.frame(alg_name="A2",value=runif(n,.8,.89),case=1:n), data.frame(alg_name="A3",value=runif(n,.7,.79),case=1:n), data.frame(alg_name="A4",value=runif(n,.6,.69),case=1:n), data.frame(alg_name="A5",value=runif(n,.5,.59),case=1:n) )) set.seed(1) c_random=data.frame(task="c_random", alg_name=factor(paste0("A",rep(1:5,each=n))), value=plogis(rnorm(5*n,1.5,1)),case=rep(1:n,times=5) ) strip2=seq(.8,1,length.out=5) a=permute::allPerms(1:5) c_worstcase=data.frame(task="c_worstcase", alg_name=c(t(a)), value=rep(strip2,nrow(a)), case=rep(1:nrow(a),each=5) ) c_worstcase=rbind(c_worstcase, data.frame(task="c_worstcase",alg_name=1:5,value=strip2,case=max(c_worstcase$case)+1) ) c_worstcase$alg_name=factor(c_worstcase$alg_name,labels=paste0("A",1:5)) data_matrix=rbind(c_ideal, c_random, c_worstcase) ``` ## 3 Perform ranking ### 3.1 Define challenge object Code differs slightly for single and multi task challenges. 
In case of a single task challenge use ``` r # Use only task "c_random" in object data_matrix dataSubset=subset(data_matrix, task=="c_random") - challenge=as.challenge(dataSubset, + challenge=as.challenge(dataSubset, + # Specify how to refer to the task in plots and reports + taskName="Task 1", # Specify which column contains the algorithm, # which column contains a test case identifier # and which contains the metric value: algorithm="alg_name", case="case", value="value", # Specify if small metric values are better smallBetter = FALSE) ``` *Instead*, for a multi-task challenge use ``` r # Same as above but with 'by="task"' where variable "task" contains the task identifier challenge=as.challenge(data_matrix, by="task", algorithm="alg_name", case="case", value="value", smallBetter = FALSE) ``` ### 3.2 Perform ranking Different ranking methods are available, choose one of them: - for “aggregate-then-rank” use (here: take mean for aggregation) ``` r ranking=challenge%>%aggregateThenRank(FUN = mean, # aggregation function, # e.g. mean, median, min, max, # or e.g. function(x) quantile(x, probs=0.05) na.treat=0, # either "na.rm" to remove missing data, # set missings to numeric value (e.g. 0) # or specify a function, # e.g. function(x) min(x) ties.method = "min" # a character string specifying # how ties are treated, see ?base::rank ) ``` - *alternatively*, for “rank-then-aggregate” with arguments as above (here: take mean for aggregation): ``` r ranking=challenge%>%rankThenAggregate(FUN = mean, ties.method = "min" ) ``` - *alternatively*, for test-then-rank based on Wilcoxon signed rank test: ``` r ranking=challenge%>%testThenRank(alpha=0.05, # significance level p.adjust.method="none", # method for adjustment for # multiple testing, see ?p.adjust na.treat=0, # either "na.rm" to remove missing data, # set missings to numeric value (e.g. 0) # or specify a function, e.g. function(x) min(x) ties.method = "min" # a character string specifying # how ties are treated, see ?base::rank ) ``` ## 4\. Perform bootstrapping Perform bootstrapping with 1000 bootstrap samples using one CPU ``` r set.seed(1) ranking_bootstrapped=ranking%>%bootstrap(nboot=1000) ``` If you want to use multiple CPUs (here: 8 CPUs), use ``` r library(doParallel) registerDoParallel(cores=8) set.seed(1) ranking_bootstrapped=ranking%>%bootstrap(nboot=1000, parallel=TRUE, progress = "none") stopImplicitCluster() ``` ## 5\. Generate the report Generate report in PDF, HTML or DOCX format. Code differs slightly for single and multi task challenges. ### 5.1 For single task challenges ``` r ranking_bootstrapped %>% report(title="singleTaskChallengeExample", # used for the title of the report file = "filename", format = "PDF", # format can be "PDF", "HTML" or "Word" latex_engine="pdflatex", #LaTeX engine for producing PDF output. Options are "pdflatex", "lualatex", and "xelatex" clean=TRUE #optional. Using TRUE will clean intermediate files that are created during rendering. ) ``` Argument *file* allows for specifying the output file path as well, otherwise the working directory is used. If file is specified but does not have a file extension, an extension will be automatically added according to the output format given in *format*. Using argument *clean=FALSE* allows to retain intermediate files, such as separate files for each figure. If argument “file” is omitted, the report is created in a temporary folder with file name “report”. 
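As an illustration (a sketch, not part of the original README; the file path is hypothetical), specifying *file* without an extension lets the extension be added automatically based on *format*:

``` r
# ".pdf" is appended automatically because format = "PDF"
ranking_bootstrapped %>%
  report(title = "singleTaskChallengeExample",
         file = "results/singleTaskReport",
         format = "PDF")
```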
### 5.2 For multi task challenges

Same as for single task challenges, but additionally a consensus ranking (rank aggregation across tasks) has to be given.

Compute ranking consensus across tasks (here: consensus ranking according to mean ranks across tasks):

``` r
# See ?relation_consensus for different methods to derive consensus ranking
meanRanks=ranking%>%consensus(method = "euclidean")
meanRanks # note that there may be ties (i.e. some algorithms have identical mean rank)
```

Generate the report as above, but with additional specification of the consensus ranking

``` r
ranking_bootstrapped %>% report(consensus=meanRanks,
                                title="multiTaskChallengeExample",
                                file = "filename",
                                format = "PDF", # format can be "PDF", "HTML" or "Word"
                                latex_engine="pdflatex" #LaTeX engine for producing PDF output. Options are "pdflatex", "lualatex", and "xelatex"
                                )
```

# Changes

#### Version 0.3.3

  - Force line break to prevent authors from exceeding the page in generated PDF reports

#### Version 0.3.2

  - Correct names of authors

#### Version 0.3.1

  - Refactoring

#### Version 0.3.0

  - Major bug fix release

#### Version 0.2.5

  - Bug fixes

#### Version 0.2.4

  - Automatic insertion of missings

#### Version 0.2.3

  - Bug fixes
  - Reports for subsets (top list) of algorithms: Use e.g. `subset(ranking_bootstrapped, top=3) %>% report(...)` (or `subset(ranking, top=3) %>% report(...)` for a report without bootstrap results) to only show the top 3 algorithms according to the chosen ranking methods, where `ranking_bootstrapped` and `ranking` are the objects defined in the example. The line plot for ranking robustness can be used to check whether algorithms performing well under other ranking methods are excluded. Bootstrapping still takes the entire uncertainty into account. Podium plots and ranking heatmaps neglect excluded algorithms. Only available for single task challenges (not sensible for multi task challenges because each task would contain a different set of algorithms).
  - Reports for subsets of tasks: Use e.g. `subset(ranking_bootstrapped, tasks=c("task1", "task2", "task3")) %>% report(...)` to restrict the report to tasks “task1”, “task2” and “task3”. You may want to recompute the consensus ranking beforehand using `meanRanks=subset(ranking, tasks=c("task1", "task2", "task3"))%>%consensus(method = "euclidean")`

#### Version 0.2.1

  - Introduction in reports now mentions e.g. ranking method, number of test cases,…
  - Function `subset()` allows selection of tasks after bootstrapping, e.g. `subset(ranking_bootstrapped,1:3)`
  - `report()` functions gain argument `colors` (default: `default_colors`). Change e.g. to `colors=viridisLite::inferno` which “is designed in such a way that it will analytically be perfectly perceptually-uniform, both in regular form and also when converted to black-and-white. It is also designed to be perceived by readers with the most common form of color blindness.” See package `viridis` for further similar functions.

#### Version 0.2.0

  - Improved layout in case of many algorithms and tasks (while probably still not perfect)
  - Consistent coloring of algorithms across figures
  - `report()` function can be applied to a ranked object before bootstrapping (and thus excluding figures based on bootstrapping), i.e. in the example `ranking %>% report(...)`
  - Bug fixes

# Reference

Wiesenfarth, M., Reinke, A., Landman, B.A., Cardoso, M.J., Maier-Hein, L. and Kopp-Schneider, A. (2019). Methods and open-source toolkit for analyzing and visualizing challenge results.
*arXiv preprint arXiv:1910.05121* ![alt text](HIP_Logo.png) diff --git a/Readme.Rmd b/Readme.Rmd index ef1ef63..d72823a 100644 --- a/Readme.Rmd +++ b/Readme.Rmd @@ -1,326 +1,328 @@ --- title: Methods and open-source toolkit for analyzing and visualizing challenge results output: github_document: toc: yes toc_depth: 1 pdf_document: toc: yes toc_depth: '3' editor_options: chunk_output_type: console --- ```{r, echo = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", # fig.path = "README-", fig.width = 9, fig.height = 5, width=160 ) ``` Note that this is ongoing work (version `r packageVersion("challengeR")`), there may be updates with possibly major changes. *Please make sure that you use the most current version!* Change log at the end of this document. # Installation Requires R version >= 3.5.2 (https://www.r-project.org). Further, a recent version of Pandoc (>= 1.12.3) is required. RStudio (https://rstudio.com) automatically includes this so you do not need to download Pandoc if you plan to use rmarkdown from the RStudio IDE, otherwise you’ll need to install Pandoc for your platform (https://pandoc.org/installing.html). Finally, if you want to generate a pdf report you will need to have LaTeX installed (e.g. MiKTeX, MacTeX or TinyTeX). To get the current development version of the R package from Github: ```{r, eval=F,R.options,} if (!requireNamespace("devtools", quietly = TRUE)) install.packages("devtools") if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install("Rgraphviz", dependencies = TRUE) devtools::install_github("wiesenfa/challengeR", dependencies = TRUE) ``` If you are asked whether you want to update installed packages and you type "a" for all, you might need administrator rights to update R core packages. You can also try to type "n" for updating no packages. If you are asked "Do you want to install from sources the packages which need compilation? (Yes/no/cancel)", you can safely type "no". If you get *Warning messages* (in contrast to *Error* messages), these might not be problematic and you can try to proceed. # Terms of use Licenced under GPL-3. If you use this software for a publication, cite Wiesenfarth, M., Reinke, A., Landman, B.A., Cardoso, M.J., Maier-Hein, L. and Kopp-Schneider, A. (2019). Methods and open-source toolkit for analyzing and visualizing challenge results. *arXiv preprint arXiv:1910.05121* # Usage Each of the following steps have to be run to generate the report: (1) Load package, (2) load data, (3) perform ranking, (4) perform bootstrapping and (5) generation of the report ## 1. Load package Load package ```{r, eval=F} library(challengeR) ``` ## 2. Load data ### Data requirements Data requires the following *columns* * a *task identifier* in case of multi-task challenges. * a *test case identifier* * the *algorithm name* * the *metric value* In case of missing metric values, a missing observation has to be provided (either as blank field or "NA"). 
For example, in a challenge with 2 tasks, 2 test cases and 2 algorithms, where in task "T2", test case "case2", algorithm "A2" didn't give a prediction (and thus NA or a blank field for missing value is inserted), the data set might look like this: ```{r, eval=T, echo=F,results='asis'} set.seed(1) a=cbind(expand.grid(Task=paste0("T",1:2),TestCase=paste0("case",1:2),Algorithm=paste0("A",1:2)),MetricValue=round(c(runif(7,0,1),NA),3)) print(knitr::kable(a[order(a$Task,a$TestCase,a$Algorithm),],row.names=F)) ``` ### Load data If you have assessment data at hand stored in a csv file (if you want to use simulated data skip the following code line) use ```{r, eval=F, echo=T} data_matrix=read.csv(file.choose()) # type ?read.csv for help ``` This allows to choose a file interactively, otherwise replace *file.choose()* by the file path (in style "/path/to/dataset.csv") in quotation marks. For illustration purposes, in the following simulated data is generated *instead* (skip the following code chunk if you have already loaded data). The data is also stored as "data_matrix.csv" in the repository. ```{r, eval=F, echo=T} if (!requireNamespace("permute", quietly = TRUE)) install.packages("permute") n=50 set.seed(4) strip=runif(n,.9,1) c_ideal=cbind(task="c_ideal", rbind( data.frame(alg_name="A1",value=runif(n,.9,1),case=1:n), data.frame(alg_name="A2",value=runif(n,.8,.89),case=1:n), data.frame(alg_name="A3",value=runif(n,.7,.79),case=1:n), data.frame(alg_name="A4",value=runif(n,.6,.69),case=1:n), data.frame(alg_name="A5",value=runif(n,.5,.59),case=1:n) )) set.seed(1) c_random=data.frame(task="c_random", alg_name=factor(paste0("A",rep(1:5,each=n))), value=plogis(rnorm(5*n,1.5,1)),case=rep(1:n,times=5) ) strip2=seq(.8,1,length.out=5) a=permute::allPerms(1:5) c_worstcase=data.frame(task="c_worstcase", alg_name=c(t(a)), value=rep(strip2,nrow(a)), case=rep(1:nrow(a),each=5) ) c_worstcase=rbind(c_worstcase, data.frame(task="c_worstcase",alg_name=1:5,value=strip2,case=max(c_worstcase$case)+1) ) c_worstcase$alg_name=factor(c_worstcase$alg_name,labels=paste0("A",1:5)) data_matrix=rbind(c_ideal, c_random, c_worstcase) ``` ## 3 Perform ranking ### 3.1 Define challenge object Code differs slightly for single and multi task challenges. In case of a single task challenge use ```{r, eval=F, echo=T} # Use only task "c_random" in object data_matrix dataSubset=subset(data_matrix, task=="c_random") - challenge=as.challenge(dataSubset, + challenge=as.challenge(dataSubset, + # Specify how to refer to the task in plots and reports + taskName="Task 1", # Specify which column contains the algorithm, # which column contains a test case identifier # and which contains the metric value: algorithm="alg_name", case="case", value="value", # Specify if small metric values are better smallBetter = FALSE) ``` *Instead*, for a multi-task challenge use ```{r, eval=F, echo=T} # Same as above but with 'by="task"' where variable "task" contains the task identifier challenge=as.challenge(data_matrix, by="task", algorithm="alg_name", case="case", value="value", smallBetter = FALSE) ``` ### 3.2 Perform ranking Different ranking methods are available, choose one of them: - for "aggregate-then-rank" use (here: take mean for aggregation) ```{r, eval=F, echo=T} ranking=challenge%>%aggregateThenRank(FUN = mean, # aggregation function, # e.g. mean, median, min, max, # or e.g. function(x) quantile(x, probs=0.05) na.treat=0, # either "na.rm" to remove missing data, # set missings to numeric value (e.g. 0) # or specify a function, # e.g. 
function(x) min(x) ties.method = "min" # a character string specifying # how ties are treated, see ?base::rank ) ``` - *alternatively*, for "rank-then-aggregate" with arguments as above (here: take mean for aggregation): ```{r, eval=F, echo=T} ranking=challenge%>%rankThenAggregate(FUN = mean, ties.method = "min" ) ``` - *alternatively*, for test-then-rank based on Wilcoxon signed rank test: ```{r, eval=F, echo=T} ranking=challenge%>%testThenRank(alpha=0.05, # significance level p.adjust.method="none", # method for adjustment for # multiple testing, see ?p.adjust na.treat=0, # either "na.rm" to remove missing data, # set missings to numeric value (e.g. 0) # or specify a function, e.g. function(x) min(x) ties.method = "min" # a character string specifying # how ties are treated, see ?base::rank ) ``` ## 4. Perform bootstrapping Perform bootstrapping with 1000 bootstrap samples using one CPU ```{r, eval=F, echo=T} set.seed(1) ranking_bootstrapped=ranking%>%bootstrap(nboot=1000) ``` If you want to use multiple CPUs (here: 8 CPUs), use ```{r, eval=F, echo=T} library(doParallel) registerDoParallel(cores=8) set.seed(1) ranking_bootstrapped=ranking%>%bootstrap(nboot=1000, parallel=TRUE, progress = "none") stopImplicitCluster() ``` ## 5. Generate the report Generate the report in PDF, HTML or DOCX format. Code differs slightly for single- and multi-task challenges. ### 5.1 For single task challenges ```{r, eval=F, echo=T} ranking_bootstrapped %>% report(title="singleTaskChallengeExample", # used for the title of the report file = "filename", format = "PDF", # format can be "PDF", "HTML" or "Word" latex_engine="pdflatex", #LaTeX engine for producing PDF output. Options are "pdflatex", "lualatex", and "xelatex" clean=TRUE #optional. Using TRUE will clean intermediate files that are created during rendering. ) ``` Argument *file* allows specifying the output file path as well; otherwise the working directory is used. If *file* is specified but does not have a file extension, an extension will be automatically added according to the output format given in *format*. Using argument *clean=FALSE* allows retaining intermediate files, such as separate files for each figure. If argument *file* is omitted, the report is created in a temporary folder with file name "report". ### 5.2 For multi task challenges Same as for single task challenges, but additionally a consensus ranking (rank aggregation across tasks) has to be provided. Compute the ranking consensus across tasks (here: consensus ranking according to mean ranks across tasks): ```{r, eval=F, echo=T} # See ?relation_consensus for different methods to derive consensus ranking meanRanks=ranking%>%consensus(method = "euclidean") meanRanks # note that there may be ties (i.e. some algorithms have identical mean rank) ``` Generate the report as above, but with the additional specification of the consensus ranking ```{r, eval=F, echo=T} ranking_bootstrapped %>% report(consensus=meanRanks, title="multiTaskChallengeExample", file = "filename", format = "PDF", # format can be "PDF", "HTML" or "Word" latex_engine="pdflatex"#LaTeX engine for producing PDF output.
Options are "pdflatex", "lualatex", and "xelatex" ) ``` # Changes #### Version 0.3.3 - Force line break to prevent long author lists from exceeding the page in generated PDF reports #### Version 0.3.2 - Correct names of authors #### Version 0.3.1 - Refactoring #### Version 0.3.0 - Major bug fix release #### Version 0.2.5 - Bug fixes #### Version 0.2.4 - Automatic insertion of missings #### Version 0.2.3 - Bug fixes - Reports for subsets (top list) of algorithms: Use e.g. `subset(ranking_bootstrapped, top=3) %>% report(...)` (or `subset(ranking, top=3) %>% report(...)` for a report without bootstrap results) to only show the top 3 algorithms according to the chosen ranking method, where `ranking_bootstrapped` and `ranking` are objects as defined in the example. The line plot for ranking robustness can be used to check whether algorithms performing well with other ranking methods are excluded. Bootstrapping still takes the entire uncertainty into account. Podium plots and ranking heatmaps neglect excluded algorithms. Only available for single-task challenges (not sensible for multi-task challenges because each task would contain a different set of algorithms). - Reports for subsets of tasks: Use e.g. `subset(ranking_bootstrapped, tasks=c("task1", "task2", "task3")) %>% report(...)` to restrict the report to tasks "task1", "task2" and "task3". You may want to recompute the consensus ranking beforehand using `meanRanks=subset(ranking, tasks=c("task1", "task2", "task3"))%>%consensus(method = "euclidean")` #### Version 0.2.1 - Introduction in reports now mentions e.g. ranking method, number of test cases,... - Function `subset()` allows selection of tasks after bootstrapping, e.g. `subset(ranking_bootstrapped,1:3)` - `report()` functions gain argument `colors` (default: `default_colors`). Change e.g. to `colors=viridisLite::inferno` which "is designed in such a way that it will analytically be perfectly perceptually-uniform, both in regular form and also when converted to black-and-white. It is also designed to be perceived by readers with the most common form of color blindness." See package `viridis` for further similar functions. #### Version 0.2.0 - Improved layout in case of many algorithms and tasks (while probably still not perfect) - Consistent coloring of algorithms across figures - `report()` function can be applied to the ranked object before bootstrapping (and thus excluding figures based on bootstrapping), i.e. in the example `ranking %>% report(...)` - Bug fixes # Reference Wiesenfarth, M., Reinke, A., Landman, B.A., Cardoso, M.J., Maier-Hein, L. and Kopp-Schneider, A. (2019). Methods and open-source toolkit for analyzing and visualizing challenge results. *arXiv preprint arXiv:1910.05121* ![alt text](HIP_Logo.png){width=100px} diff --git a/inst/appdir/characterizationOfAlgorithmsBootstrapping.Rmd b/inst/appdir/characterizationOfAlgorithmsBootstrapping.Rmd new file mode 100644 index 0000000..db8f9dd --- /dev/null +++ b/inst/appdir/characterizationOfAlgorithmsBootstrapping.Rmd @@ -0,0 +1,57 @@ +### Ranking stability: Ranking variability via bootstrap approach + +A blob plot of bootstrap results over the different tasks, separated +by algorithm, allows another perspective on the assessment data. This gives deeper insights into the characteristics +of tasks and the ranking uncertainty of the algorithms in each +task.
+ + + + + + +\bigskip + +```{r blobplot_bootstrap_byAlgorithm,fig.width=7,fig.height = 5} +#stabilityByAlgorithm.bootstrap.list +if (length(boot_object$matlist)<=6 &nrow((boot_object$matlist[[1]]))<=10 ){ + stabilityByAlgorithm(boot_object, + ordering=ordering_consensus, + max_size = 9, + size=4, + shape=4, + single = F) + scale_color_manual(values=cols) +} else { + pl=stabilityByAlgorithm(boot_object, + ordering=ordering_consensus, + max_size = 9, + size=4, + shape=4, + single = T) + for (i in 1:length(pl)) print(pl[[i]] + + scale_color_manual(values=cols) + + guides(size = guide_legend(title="%"),color="none") + ) +} + +``` + + + + +An alternative representation is provided by a stacked +frequency plot of the observed ranks, separated by algorithm. Observed ranks across bootstrap samples are +displayed with colouring according to task. For algorithms that +achieve the same rank in different tasks for the full assessment +data set, vertical lines are on top of each other. Vertical lines +allow comparing the achieved rank of each algorithm across +different tasks. + +\bigskip + + +```{r stackedFrequencies_bootstrap_byAlgorithm,fig.width=7,fig.height = 5} +#stabilityByAlgorithmStacked.bootstrap.list +stabilityByAlgorithmStacked(boot_object,ordering=ordering_consensus) +``` + diff --git a/inst/appdir/characterizationOfTasksBootstrapping.Rmd b/inst/appdir/characterizationOfTasksBootstrapping.Rmd new file mode 100644 index 0000000..62c9504 --- /dev/null +++ b/inst/appdir/characterizationOfTasksBootstrapping.Rmd @@ -0,0 +1,51 @@ +### Visualizing bootstrap results +To investigate which +tasks separate algorithms well (i.e., lead to a stable ranking), +two visualization methods are recommended. + +Bootstrap results can be shown in blob plots, with one plot for each +task. In this view, the spread of the blobs for each algorithm +can be compared across tasks. Deviations from the diagonal indicate deviations +from the consensus ranking (over tasks). Specifically, if the rank +distribution of an algorithm is consistently below the diagonal, +the algorithm performed better in this task than on average +across tasks, while if the rank distribution of an algorithm +is consistently above the diagonal, the algorithm performed +worse in this task than on average across tasks. At the bottom +of each panel, the ranks of each algorithm in the respective task are provided. + + + + + +Same as in Section \ref{blobByTask} but now ordered according to consensus.
+ +\bigskip + +```{r blobplot_bootstrap_byTask,fig.width=9, fig.height=9} +#stabilityByTask.bootstrap.list +if (length(boot_object$matlist)<=6 &nrow((boot_object$matlist[[1]]))<=10 ){ + stabilityByTask(boot_object, + ordering=ordering_consensus, + max_size = 9, + size=4, + shape=4) + scale_color_manual(values=cols) +} else { + pl=list() + for (subt in names(boot_object$bootsrappedRanks)){ + a=list(bootsrappedRanks=list(boot_object$bootsrappedRanks[[subt]]), + matlist=list(boot_object$matlist[[subt]])) + names(a$bootsrappedRanks)=names(a$matlist)=subt + class(a)="bootstrap.list" + r=boot_object$matlist[[subt]] + + pl[[subt]]=stabilityByTask(a, + max_size = 9, + ordering=ordering_consensus, + size.ranks=.25*theme_get()$text$size, + size=4, + shape=4) + scale_color_manual(values=cols) + ggtitle(subt) + } + for (i in 1:length(pl)) print(pl[[i]]) +} +``` \ No newline at end of file diff --git a/inst/appdir/consensusRanking.Rmd b/inst/appdir/consensusRanking.Rmd new file mode 100644 index 0000000..ce183f7 --- /dev/null +++ b/inst/appdir/consensusRanking.Rmd @@ -0,0 +1,6 @@ +Consensus ranking according to chosen method `r attr(params$consensus,"method")`: +```{r} +knitr::kable(data.frame(value=round(params$consensus,3), + rank=rank(params$consensus, + ties.method="min"))) +``` diff --git a/inst/appdir/overviewMultiTaskBootstrapping.Rmd b/inst/appdir/overviewMultiTaskBootstrapping.Rmd new file mode 100644 index 0000000..46341f6 --- /dev/null +++ b/inst/appdir/overviewMultiTaskBootstrapping.Rmd @@ -0,0 +1,4 @@ +* Visualization of assessment data: Dot- and boxplots, podium plots and ranking heatmaps +* Visualization of ranking stability: Blob plots, violin plots and significance maps +* Visualization of ranking robustness: Line plots +* Visualization of cross-task insights \ No newline at end of file diff --git a/inst/appdir/overviewMultiTaskNoBootstrapping.Rmd b/inst/appdir/overviewMultiTaskNoBootstrapping.Rmd new file mode 100644 index 0000000..06f0d85 --- /dev/null +++ b/inst/appdir/overviewMultiTaskNoBootstrapping.Rmd @@ -0,0 +1,4 @@ +* Visualization of assessment data: Dot- and boxplots, podium plots and ranking heatmaps +* Visualization of ranking stability: Significance maps +* Visualization of ranking robustness: Line plots +* Visualization of cross-task insights \ No newline at end of file diff --git a/inst/appdir/overviewSingleTaskBootstrapping.Rmd b/inst/appdir/overviewSingleTaskBootstrapping.Rmd new file mode 100644 index 0000000..14c8bcd --- /dev/null +++ b/inst/appdir/overviewSingleTaskBootstrapping.Rmd @@ -0,0 +1,3 @@ +* Visualization of assessment data: Dot- and boxplot, podium plot and ranking heatmap +* Visualization of ranking stability: Blob plot, violin plot and significance map +* Visualization of ranking robustness: Line plot \ No newline at end of file diff --git a/inst/appdir/overviewSingleTaskNoBootstrapping.Rmd b/inst/appdir/overviewSingleTaskNoBootstrapping.Rmd new file mode 100644 index 0000000..a8ecd70 --- /dev/null +++ b/inst/appdir/overviewSingleTaskNoBootstrapping.Rmd @@ -0,0 +1,3 @@ +* Visualization of assessment data: Dot- and boxplot, podium plot and ranking heatmap +* Visualization of ranking stability: Significance map +* Visualization of ranking robustness: Line plot diff --git a/inst/appdir/reportMultiple.Rmd b/inst/appdir/reportMultiple.Rmd index 596948c..88ca717 100644 --- a/inst/appdir/reportMultiple.Rmd +++ b/inst/appdir/reportMultiple.Rmd @@ -1,563 +1,347 @@ --- params: object: NA colors: NA name: NULL consensus: NA + isMultiTask: NA + 
bootstrappingEnabled: NA title: "Benchmarking report for `r params$name` " author: "created by challengeR v`r packageVersion('challengeR')` \nWiesenfarth, Reinke, Landman, Cardoso, Maier-Hein & Kopp-Schneider (2019)" date: "`r Sys.setlocale('LC_TIME', 'English'); format(Sys.time(), '%d %B, %Y')`" editor_options: chunk_output_type: console --- ```{r setup, include=FALSE} options(width=80) out.format <- knitr::opts_knit$get("out.format") img_template <- switch( out.format, word = list("img-params"=list(dpi=150, fig.width=6, fig.height=6, out.width="504px", out.height="504px")), { # default list("img-params"=list( fig.width=7,fig.height = 3,dpi=300)) } ) knitr::opts_template$set( img_template ) knitr::opts_chunk$set(echo = F,#fig.width=7,fig.height = 3,dpi=300, fig.align="center") theme_set(theme_light()) +isMultiTask = params$isMultiTask +bootstrappingEnabled = params$bootstrappingEnabled + ``` ```{r } -boot_object = params$object +object = params$object +if (isMultiTask) { ordering_consensus=names(params$consensus) +} else +{ + ordering_consensus=names(sort(t(object$matlist[[1]][,"rank",drop=F])["rank",])) +} color.fun=params$colors + ``` ```{r } -challenge_multiple=boot_object$data -ranking.fun=boot_object$FUN -object=challenge_multiple%>%ranking.fun +challenge_multiple=object$data +ranking.fun=object$FUN cols_numbered=cols=color.fun(length(ordering_consensus)) names(cols)=ordering_consensus names(cols_numbered)= paste(1:length(cols),names(cols)) +if (bootstrappingEnabled) { + boot_object = params$object + challenge_multiple=boot_object$data + + ranking.fun=boot_object$FUN + object=challenge_multiple%>%ranking.fun + + object$fulldata=boot_object$fulldata # only not NULL if subset of algorithms used + + cols_numbered=cols=color.fun(length(ordering_consensus)) + names(cols)=ordering_consensus + names(cols_numbered)= paste(1:length(cols),names(cols)) +} + ``` This document presents a systematic report on a benchmark study. Input data comprises raw metric values for all algorithms and test cases. 
Generated plots are: -* Visualization of assessment data: Dot- and boxplots, podium plots and ranking heatmaps -* Visualization of ranking robustness: Line plots -* Visualization of ranking stability: Blob plots, violin plots and significance maps -* Visualization of cross-task insights +```{r, child=if (!isMultiTask && !bootstrappingEnabled) system.file("appdir", "overviewSingleTaskNoBootstrapping.Rmd", package="challengeR")} + +``` +```{r, child=if (!isMultiTask && bootstrappingEnabled) system.file("appdir", "overviewSingleTaskBootstrapping.Rmd", package="challengeR")} + +``` + +```{r, child=if (isMultiTask && !bootstrappingEnabled) system.file("appdir", "overviewMultiTaskNoBootstrapping.Rmd", package="challengeR")} + +``` + +```{r, child=if (isMultiTask && bootstrappingEnabled) system.file("appdir", "overviewMultiTaskBootstrapping.Rmd", package="challengeR")} + +``` + +```{r,results='asis'} + +if (!isMultiTask && !is.null(object$fulldata[[1]])) { + cat("Only top ", + length(levels(challenge_multiple[[1]][[attr(challenge_multiple,"algorithm")]])), + " out of ", + length(levels(object$fulldata[[1]][[attr(challenge_multiple,"algorithm")]])), + " algorithms visualized.\n") +} + +``` Ranking of algorithms within tasks according to the following chosen ranking scheme: ```{r,results='asis'} a=( lapply(object$FUN.list,function(x) { if (!is.character(x)) return(paste0("aggregate using function ", paste(gsub("UseMethod","", deparse(functionBody(x))), collapse=" ") )) else if (x=="rank") return(x) else return(paste0("aggregate using function ",x)) })) cat("    *",paste0(a,collapse=" then "),"*",sep="") if (is.character(object$FUN.list[[1]]) && object$FUN.list[[1]]=="significance") cat("\n\n Column 'prop.sign' is equal to the number of pairwise significant test results for a given algorithm divided by the number of algorithms.") ``` - -Ranking list for each task: ```{r,results='asis'} -for (t in 1:length(object$matlist)){ - cat("\n",names(object$matlist)[t],": ") - n.cases=nrow(challenge_multiple[[t]])/length(unique(challenge_multiple[[t]][[attr(challenge_multiple,"algorithm")]])) - cat("\nAnalysis based on ", + +if (isMultiTask) { + cat("Ranking list for each task:\n") + + for (t in 1:length(object$matlist)){ + cat("\n",names(object$matlist)[t],": ") + n.cases=nrow(challenge_multiple[[t]])/length(unique(challenge_multiple[[t]][[attr(challenge_multiple,"algorithm")]])) + + cat("\nAnalysis based on ", + n.cases, + " test cases which included", + sum(is.na(challenge_multiple[[t]][[attr(challenge_multiple,"value")]])), + " missing values.") + + x=object$matlist[[t]] + print(knitr::kable(x[order(x$rank),])) + } +} else { + n.cases=nrow(challenge_multiple[[1]])/length(unique(challenge_multiple[[1]][[attr(challenge_multiple,"algorithm")]])) + + cat("\nAnalysis based on ", n.cases, - " test cases which included", - sum(is.na(challenge_multiple[[t]][[attr(challenge_multiple,"value")]])), + " test cases which included", + sum(is.na(challenge_multiple[[1]][[attr(challenge_multiple,"value")]])), " missing values.") - - if (n.cases0) par(oma=c(oh,0,0,0)) set.seed(38) podium(object, col=cols, lines.show = T, lines.alpha = .4, dots.cex=.9, ylab="Metric value", layout.heights=c(1,.35), legendfn = function(algs, cols) { legend(par('usr')[2], par('usr')[4], xpd=NA, paste0(1:length(algs),": ",algs), lwd = 1, col = cols, bg = NA, cex=1.4, seg.len=1.1, title="Rank: Alg.") } ) par(op) ``` ## Ranking heatmaps *Ranking heatmaps* for visualizing raw assessment data. 
Each cell $\left( i, A_j \right)$ shows the absolute frequency of test cases in which algorithm $A_j$ achieved rank $i$. \bigskip ```{r rankingHeatmap,fig.width=9, fig.height=9,out.width='70%'} temp=utils::capture.output(rankingHeatmap(object)) ``` # Visualization of ranking stability - -## *Blob plot* for visualizing ranking stability based on bootstrap sampling \label{blobByTask} - -Algorithms are color-coded, and the area of each blob at position $\left( A_i, \text{rank } j \right)$ is proportional to the relative frequency $A_i$ achieved rank $j$ across $b=$ `r ncol(boot_object$bootsrappedRanks[[1]])` bootstrap samples. The median rank for each algorithm is indicated by a black cross. 95\% bootstrap intervals across bootstrap samples are indicated by black lines. - - -\bigskip - -```{r blobplot_bootstrap,fig.width=9, fig.height=9} -pl=list() -for (subt in names(boot_object$bootsrappedRanks)){ - a=list(bootsrappedRanks=list(boot_object$bootsrappedRanks[[subt]]), - matlist=list(boot_object$matlist[[subt]])) - names(a$bootsrappedRanks)=names(a$matlist)=subt - class(a)="bootstrap.list" - r=boot_object$matlist[[subt]] - - pl[[subt]]=stabilityByTask(a, - max_size =8, - ordering=rownames(r[order(r$rank),]), - size.ranks=.25*theme_get()$text$size, - size=8, - shape=4) + scale_color_manual(values=cols) - -} - -# if (length(boot_object$matlist)<=6 &nrow((boot_object$matlist[[1]]))<=10 ){ -# ggpubr::ggarrange(plotlist = pl) -# } else { - for (i in 1:length(pl)) print(pl[[i]]) -#} +```{r, child=if (bootstrappingEnabled) system.file("appdir", "visualizationBlobPlots.Rmd", package="challengeR")} ``` +```{r, child=if (bootstrappingEnabled) system.file("appdir", "visualizationViolinPlots.Rmd", package="challengeR")} -## *Violin plot* for visualizing ranking stability based on bootstrapping \label{violin} - -The ranking list based on the full assessment data is pairwisely compared with the ranking lists based on the individual bootstrap samples (here $b=$ `r ncol(boot_object$bootsrappedRanks[[1]])` samples). For each pair of rankings, Kendall's $\tau$ correlation is computed. Kendall’s $\tau$ is a scaled index determining the correlation between the lists. It is computed by evaluating the number of pairwise concordances and discordances between ranking lists and produces values between $-1$ (for inverted order) and $1$ (for identical order). A violin plot, which simultaneously depicts a boxplot and a density plot, is generated from the results. - -\bigskip - -```{r violin} -violin(boot_object) ``` - - - ## *Significance maps* for visualizing ranking stability based on statistical significance *Significance maps* depict incidence matrices of pairwise significant test results for the one-sided Wilcoxon signed rank test at a 5\% significance level with adjustment for multiple testing according to Holm. Yellow shading indicates that metric values of the algorithm on the x-axis were significantly superior to those from the algorithm on the y-axis, blue color indicates no significant difference. \bigskip ```{r significancemap,fig.width=6, fig.height=6,out.width='200%'} temp=utils::capture.output(significanceMap(object,alpha=0.05,p.adjust.method="holm") ) + ``` ## Ranking robustness to ranking methods *Line plots* for visualizing rankings robustness across different ranking methods. Each algorithm is represented by one colored line. For each ranking method encoded on the x-axis, the height of the line represents the corresponding rank. Horizontal lines indicate identical ranks for all methods. 
\bigskip ```{r lineplot,fig.width=7,fig.height = 5} -if (length(boot_object$matlist)<=6 & - nrow((boot_object$matlist[[1]]))<=10 ){ +if (length(object$matlist)<=6 &nrow((object$matlist[[1]]))<=10 ){ methodsplot(challenge_multiple, ordering = ordering_consensus, na.treat=object$call[[1]][[1]]$na.treat) + scale_color_manual(values=cols) -} else { +}else { x=challenge_multiple for (subt in names(challenge_multiple)){ dd=as.challenge(x[[subt]], value=attr(x,"value"), algorithm=attr(x,"algorithm") , case=attr(x,"case"), annotator = attr(x,"annotator"), by=attr(x,"by"), smallBetter = !attr(x,"largeBetter"), na.treat=object$call[[1]][[1]]$na.treat ) print(methodsplot(dd, ordering = ordering_consensus) + scale_color_manual(values=cols) ) } } ``` - - -# Visualization of cross-task insights - -Algorithms are ordered according to consensus ranking. - - - - -## Characterization of algorithms - -### Ranking stability: Variability of achieved rankings across tasks - - - - - - - - -Blob plot similar to the one shown in section \ref{blobByTask} substituting rankings based on bootstrap samples with the rankings corresponding to multiple tasks. This way, the distribution of ranks across tasks can be intuitively visualized. - - -\bigskip - -```{r blobplot_raw} -#stability.ranked.list -stability(object,ordering=ordering_consensus,max_size=9,size=8,shape=4)+scale_color_manual(values=cols) -``` - - -### Ranking stability: Ranking variability via bootstrap approach - -Blob plot of bootstrap results over the different tasks separated -by algorithm allows another perspective on the assessment data. This gives deeper insights into the characteristics -of tasks and the ranking uncertainty of the algorithms in each -task. - - - - - - -\bigskip - -```{r blobplot_bootstrap_byAlgorithm,fig.width=7,fig.height = 5} -#stabilityByAlgorithm.bootstrap.list -if (length(boot_object$matlist)<=6 &nrow((boot_object$matlist[[1]]))<=10 ){ - stabilityByAlgorithm(boot_object, - ordering=ordering_consensus, - max_size = 9, - size=4, - shape=4, - single = F) + scale_color_manual(values=cols) -} else { - pl=stabilityByAlgorithm(boot_object, - ordering=ordering_consensus, - max_size = 9, - size=4, - shape=4, - single = T) - for (i in 1:length(pl)) print(pl[[i]] + - scale_color_manual(values=cols) + - guides(size = guide_legend(title="%"),color="none") - ) -} - -``` - - - -An alternative representation is provided by a stacked -frequency plot of the observed ranks, separated by algorithm. Observed ranks across bootstrap samples are -displayed with colouring according to task. For algorithms that -achieve the same rank in different tasks for the full assessment -data set, vertical lines are on top of each other. Vertical lines -allow to compare the achieved rank of each algorithm over -different tasks. - -\bigskip - - -```{r stackedFrequencies_bootstrap_byAlgorithm,fig.width=7,fig.height = 5} -#stabilityByAlgorithmStacked.bootstrap.list -stabilityByAlgorithmStacked(boot_object,ordering=ordering_consensus) -``` - - - - -## Characterization of tasks - - -### Visualizing bootstrap results -To investigate which -tasks separate algorithms well (i.e., lead to a stable ranking), -two visualization methods are recommended. - -Bootstrap results can be shown in a blob plot showing one plot for each -task. In this view, the spread of the blobs for each algorithm -can be compared across tasks. Deviations from the diagonal indicate deviations -from the consensus ranking (over tasks). 
Specifically, if rank -distribution of an algorithm is consistently below the diagonal, -the algorithm performed better in this task than on average -across tasks, while if the rank distribution of an algorithm -is consistently above the diagonal, the algorithm performed -worse in this task than on average across tasks. At the bottom -of each panel, ranks for each algorithm in the tasks is provided. - - - - - -Same as in Section \ref{blobByTask} but now ordered according to consensus. - -\bigskip - -```{r blobplot_bootstrap_byTask,fig.width=9, fig.height=9} -#stabilityByTask.bootstrap.list -if (length(boot_object$matlist)<=6 &nrow((boot_object$matlist[[1]]))<=10 ){ - stabilityByTask(boot_object, - ordering=ordering_consensus, - max_size = 9, - size=4, - shape=4) + scale_color_manual(values=cols) -} else { - pl=list() - for (subt in names(boot_object$bootsrappedRanks)){ - a=list(bootsrappedRanks=list(boot_object$bootsrappedRanks[[subt]]), - matlist=list(boot_object$matlist[[subt]])) - names(a$bootsrappedRanks)=names(a$matlist)=subt - class(a)="bootstrap.list" - r=boot_object$matlist[[subt]] - - pl[[subt]]=stabilityByTask(a, - max_size = 9, - ordering=ordering_consensus, - size.ranks=.25*theme_get()$text$size, - size=4, - shape=4) + scale_color_manual(values=cols) - } - for (i in 1:length(pl)) print(pl[[i]]) -} -``` - - -### Cluster Analysis - - - - - - - - - - -Dendrogram from hierarchical cluster analysis and \textit{network-type graphs} for assessing the similarity of tasks based on challenge rankings. - -A dendrogram is a visualization approach based on hierarchical clustering. It depicts clusters according to a chosen distance measure (here: Spearman's footrule) as well as a chosen agglomeration method (here: complete and average agglomeration). -\bigskip - -```{r , fig.width=6, fig.height=5,out.width='60%'} -#d=relation_dissimilarity.ranked.list(object,method=kendall) - -# use ranking list - relensemble=as.relation.ranked.list(object) - -# # use relations -# a=challenge_multi%>%decision.challenge(p.adjust.method="none") -# aa=lapply(a,as.relation.challenge.incidence) -# names(aa)=names(challenge_multi) -# relensemble= do.call(relation_ensemble,args = aa) -d <- relation_dissimilarity(relensemble, method = "symdiff") -``` - - -```{r dendrogram_complete, fig.width=6, fig.height=5,out.width='60%'} -if (length(relensemble)>2) { - plot(hclust(d,method="complete")) #,main="Symmetric difference distance - complete" -} else cat("\nCluster analysis only sensible if there are >2 tasks.\n\n") -``` - -\bigskip - - -```{r dendrogram_average, fig.width=6, fig.height=5,out.width='60%'} -if (length(relensemble)>2) plot(hclust(d,method="average")) #,main="Symmetric difference distance - average" -``` - - - - - - - - - - - - - - -In network-type graphs (see Eugster et al, 2008), every task is represented by a node and nodes are connected by edges whose length is determined by a chosen distance measure. Here, distances between nodes are chosen to increase exponentially in Spearman's footrule distance with growth rate 0.05 to accentuate large distances. -Hence, tasks that are similar with respect to their algorithm ranking appear closer together than those that are dissimilar. Nodes representing tasks with a unique winner are colored-coded by the winning algorithm. In case there are more than one first-ranked algorithms in a task, the corresponding node remains uncolored. 
-\bigskip - -```{r ,eval=T,fig.width=12, fig.height=6,include=FALSE} -if (length(relensemble)>2) { - netw=network(object, - method = "symdiff", - edge.col=grDevices::grey.colors, - edge.lwd=1, - rate=1.05, - cols=cols - ) - - plot.new() - leg=legend("topright", names(netw$leg.col), lwd = 1, col = netw$leg.col, bg =NA,plot=F,cex=.8) - w <- grconvertX(leg$rect$w, to='inches') - addy=6+w -} else addy=1 +```{r, child=if (isMultiTask) system.file("appdir", "visualizationAcrossTasks.Rmd", package="challengeR")} ``` -```{r network, fig.width=addy, fig.height=6,out.width='100%'} -if (length(relensemble)>2) { - plot(netw, - layoutType = "neato", - fixedsize=TRUE, - # fontsize, - # width, - # height, - shape="ellipse", - cex=.8 - ) -} - -``` -# Reference +# References Wiesenfarth, M., Reinke, A., Landman, B.A., Cardoso, M.J., Maier-Hein, L. and Kopp-Schneider, A. (2019). Methods and open-source toolkit for analyzing and visualizing challenge results. *arXiv preprint arXiv:1910.05121* M. J. A. Eugster, T. Hothorn, and F. Leisch, “Exploratory and inferential analysis of benchmark experiments,” -Institut fuer Statistik, Ludwig-Maximilians- -Universitaet Muenchen, Germany, Technical Report 30, +Institut fuer Statistik, Ludwig-Maximilians-Universitaet Muenchen, Germany, Technical Report 30, 2008. [Online]. Available: http://epub.ub.uni-muenchen. de/4134/. - - - - - - - diff --git a/inst/appdir/reportMultipleShort.Rmd b/inst/appdir/reportMultipleShort.Rmd deleted file mode 100644 index 45964cf..0000000 --- a/inst/appdir/reportMultipleShort.Rmd +++ /dev/null @@ -1,410 +0,0 @@ ---- -params: - object: NA - colors: NA - name: NULL - consensus: NA -title: "Benchmarking report for `r params$name` " -author: "created by challengeR v`r packageVersion('challengeR')` \nWiesenfarth, Reinke, Landman, Cardoso, Maier-Hein & Kopp-Schneider (2019)" -date: "`r Sys.setlocale('LC_TIME', 'English'); format(Sys.time(), '%d %B, %Y')`" -editor_options: - chunk_output_type: console ---- - - - - - - - - -```{r setup, include=FALSE} -options(width=80) -out.format <- knitr::opts_knit$get("out.format") -img_template <- switch( out.format, - word = list("img-params"=list(dpi=150, - fig.width=6, - fig.height=6, - out.width="504px", - out.height="504px")), - { - # default - list("img-params"=list( fig.width=7,fig.height = 3,dpi=300)) - } ) - -knitr::opts_template$set( img_template ) - -knitr::opts_chunk$set(echo = F,#fig.width=7,fig.height = 3,dpi=300, - fig.align="center") -theme_set(theme_light()) - -``` - - -```{r } -object = params$object -ordering_consensus=names(params$consensus) -color.fun=params$colors - -``` - -```{r } -challenge_multiple=object$data - -ranking.fun=object$FUN - -cols_numbered=cols=color.fun(length(ordering_consensus)) -names(cols)=ordering_consensus -names(cols_numbered)= paste(1:length(cols),names(cols)) - -``` - - -This document presents a systematic report on a benchmark study. Input data comprises raw metric values for all algorithms and test cases. 
Generated plots are: - -* Visualization of assessment data: Dot- and boxplots, podium plots and ranking heatmaps -* Visualization of ranking robustness: Line plots -* Visualization of ranking stability: Significance maps -* Visualization of cross-task insights - - -Ranking of algorithms within tasks according to the following chosen ranking scheme: - -```{r,results='asis'} -a=( lapply(object$FUN.list,function(x) { - if (!is.character(x)) return(paste0("aggregate using function ", - paste(gsub("UseMethod","", - deparse(functionBody(x))), - collapse=" ") - )) - else if (x=="rank") return(x) - else return(paste0("aggregate using function ",x)) - })) -cat("    *",paste0(a,collapse=" then "),"*",sep="") - -if (is.character(object$FUN.list[[1]]) && object$FUN.list[[1]]=="significance") cat("\n\n Column 'prop.sign' is equal to the number of pairwise significant test results for a given algorithm divided by the number of algorithms.") -``` - - -Ranking list for each task: -```{r,results='asis'} -for (t in 1:length(object$matlist)){ - cat("\n",names(object$matlist)[t],": ") - n.cases=nrow(challenge_multiple[[t]])/length(unique(challenge_multiple[[t]][[attr(challenge_multiple,"algorithm")]])) - - cat("\nAnalysis based on ", - n.cases, - " test cases which included", - sum(is.na(challenge_multiple[[t]][[attr(challenge_multiple,"value")]])), - " missing values.") - - x=object$matlist[[t]] - print(knitr::kable(x[order(x$rank),])) -} - -``` - -\bigskip - -Consensus ranking according to chosen method `r attr(params$consensus,"method")`: -```{r} -knitr::kable(data.frame(value=round(params$consensus,3), - rank=rank(params$consensus, - ties.method="min"))) -``` - - -# Visualization of raw assessment data -Algorithms are ordered according to chosen ranking scheme for each task. - -## Dot- and boxplots - -*Dot- and boxplots* for visualizing raw assessment data separately for each algorithm. Boxplots representing descriptive statistics over all test cases (median, quartiles and outliers) are combined with horizontally jittered dots representing individual test cases. - -\bigskip - -```{r boxplots} -temp=boxplot(object, size=.8) -temp=lapply(temp, function(x) utils::capture.output(x+xlab("Algorithm")+ylab("Metric value"))) - -``` - - - -## Podium plots -*Podium plots* (see also Eugster et al, 2008) for visualizing raw assessment data. Upper part (spaghetti plot): Participating algorithms are color-coded, and each colored dot in the plot represents a metric value achieved with the respective algorithm. The actual metric value is encoded by the y-axis. Each podium (here: $p$=`r length(ordering_consensus)`) represents one possible rank, ordered from best (1) to last (here: `r length(ordering_consensus)`). The assignment of metric values (i.e. colored dots) to one of the podiums is based on the rank that the respective algorithm achieved on the corresponding test case. Note that the plot part above each podium place is further subdivided into $p$ "columns", where each column represents one participating algorithm (here: $p=$ `r length(ordering_consensus)`). Dots corresponding to identical test cases are connected by a line, leading to the shown spaghetti structure. Lower part: Bar charts represent the relative frequency for each algorithm to achieve the rank encoded by the podium place. 
- -```{r ,eval=T,fig.width=12, fig.height=6,include=FALSE} -plot.new() -algs=ordering_consensus -l=legend("topright", - paste0(1:length(algs),": ",algs), - lwd = 1, cex=1.4,seg.len=1.1, - title="Rank: Alg.", - plot=F) - -w <- grconvertX(l$rect$w, to='ndc') - grconvertX(0, to='ndc') -h<- grconvertY(l$rect$h, to='ndc') - grconvertY(0, to='ndc') -addy=max(grconvertY(l$rect$h,"user","inches"),6) -``` - - -```{r podium,eval=T,fig.width=12, fig.height=addy} -#c(bottom, left, top, right - -op<-par(pin=c(par()$pin[1],6), - omd=c(0, 1-w, 0, 1), - mar=c(par('mar')[1:3], 0)+c(-.5,0.5,-.5,0), - cex.axis=1.5, - cex.lab=1.5, - cex.main=1.7) - -oh=grconvertY(l$rect$h,"user","lines")-grconvertY(6,"inches","lines") -if (oh>0) par(oma=c(oh,0,0,0)) - - -set.seed(38) -podium(object, - col=cols, - lines.show = T, lines.alpha = .4, - dots.cex=.9, - ylab="Metric value", - layout.heights=c(1,.35), - legendfn = function(algs, cols) { - legend(par('usr')[2], par('usr')[4], - xpd=NA, - paste0(1:length(algs),": ",algs), - lwd = 1, col = cols, - bg = NA, - cex=1.4, seg.len=1.1, - title="Rank: Alg.") - } - ) -par(op) - -``` - - -## Ranking heatmaps -*Ranking heatmaps* for visualizing raw assessment data. Each cell $\left( i, A_j \right)$ shows the absolute frequency of test cases in which algorithm $A_j$ achieved rank $i$. - -\bigskip - -```{r rankingHeatmap,fig.width=9, fig.height=9,out.width='70%'} -temp=utils::capture.output(rankingHeatmap(object)) -``` - - - -# Visualization of ranking stability - - - - - -## *Significance maps* for visualizing ranking stability based on statistical significance - -*Significance maps* depict incidence matrices of -pairwise significant test results for the one-sided Wilcoxon signed rank test at a 5\% significance level with adjustment for multiple testing according to Holm. Yellow shading indicates that metric values of the algorithm on the x-axis were significantly superior to those from the algorithm on the y-axis, blue color indicates no significant difference. - - -\bigskip - -```{r significancemap,fig.width=6, fig.height=6,out.width='200%'} -temp=utils::capture.output(significanceMap(object,alpha=0.05,p.adjust.method="holm") - ) - -``` - - - - - - - - - - -## Ranking robustness to ranking methods -*Line plots* for visualizing rankings robustness across different ranking methods. Each algorithm is represented by one colored line. For each ranking method encoded on the x-axis, the height of the line represents the corresponding rank. Horizontal lines indicate identical ranks for all methods. - -\bigskip - -```{r lineplot,fig.width=7,fig.height = 5} -if (length(object$matlist)<=6 &nrow((object$matlist[[1]]))<=10 ){ - methodsplot(challenge_multiple, - ordering = ordering_consensus, - na.treat=object$call[[1]][[1]]$na.treat) + scale_color_manual(values=cols) -} else { - x=challenge_multiple - for (subt in names(challenge_multiple)){ - dd=as.challenge(x[[subt]], - value=attr(x,"value"), - algorithm=attr(x,"algorithm") , - case=attr(x,"case"), - annotator = attr(x,"annotator"), - by=attr(x,"by"), - smallBetter = !attr(x,"largeBetter"), - na.treat=object$call[[1]][[1]]$na.treat - ) - - print(methodsplot(dd, - ordering = ordering_consensus) + scale_color_manual(values=cols) - ) - } -} -``` - - - - - -# Visualization of cross-task insights - -Algorithms are ordered according to consensus ranking. - - - - -## Characterization of algorithms - -### Ranking stability: Variability of achieved rankings across tasks - - - - - - - - - - -Blob plot for visualizing ranking stability across tasks. 
Algorithms are color-coded, and the area of each blob at position $\left( A_i, \text{rank } j \right)$ is proportional to the relative frequency $A_i$ achieved rank $j$ across $b=$ `r ncol(boot_object$bootsrappedRanks[[1]])` bootstrap samples across tasks. The median rank for each algorithm is indicated by a black cross. 95\% bootstrap intervals across bootstrap samples are indicated by black lines. This way, the distribution of ranks across tasks can be intuitively visualized. - - -\bigskip - -```{r blobplot_raw} -#stability.ranked.list -stability(object,ordering=ordering_consensus,max_size=9,size=8,shape=4)+scale_color_manual(values=cols) -``` - - - - -## Characterization of tasks - - - -### Cluster Analysis - - - - - - - - - - -Dendrogram from hierarchical cluster analysis and \textit{network-type graphs} for assessing the similarity of tasks based on challenge rankings. - -A dendrogram is a visualization approach based on hierarchical clustering. It depicts clusters according to a chosen distance measure (here: Spearman's footrule) as well as a chosen agglomeration method (here: complete and average agglomeration). -\bigskip - -```{r , fig.width=6, fig.height=5,out.width='60%'} -#d=relation_dissimilarity.ranked.list(object,method=kendall) - -# use ranking list - relensemble=as.relation.ranked.list(object) - -# # use relations -# a=challenge_multi%>%decision.challenge(p.adjust.method="none") -# aa=lapply(a,as.relation.challenge.incidence) -# names(aa)=names(challenge_multi) -# relensemble= do.call(relation_ensemble,args = aa) -d <- relation_dissimilarity(relensemble, method = "symdiff") -``` - - -```{r dendrogram_complete, fig.width=6, fig.height=5,out.width='60%'} -if (length(relensemble)>2) { - plot(hclust(d,method="complete")) #,main="Symmetric difference distance - complete" -} else cat("\nCluster analysis only sensible if there are >2 tasks.\n\n") -``` - -\bigskip - - -```{r dendrogram_average, fig.width=6, fig.height=5,out.width='60%'} -if (length(relensemble)>2) plot(hclust(d,method="average")) #,main="Symmetric difference distance - average" -``` - - - - - - - - - - - - - - -In network-type graphs (see Eugster et al, 2008), every task is represented by a node and nodes are connected by edges whose length is determined by a chosen distance measure. Here, distances between nodes are chosen to increase exponentially in Spearman's footrule distance with growth rate 0.05 to accentuate large distances. -Hence, tasks that are similar with respect to their algorithm ranking appear closer together than those that are dissimilar. Nodes representing tasks with a unique winner are colored-coded by the winning algorithm. In case there are more than one first-ranked algorithms in a task, the corresponding node remains uncolored. -\bigskip - -```{r ,eval=T,fig.width=12, fig.height=6,include=FALSE} -if (length(relensemble)>2) { - netw=network(object, - method = "symdiff", - edge.col=grDevices::grey.colors, - edge.lwd=1, - rate=1.05, - cols=cols - ) - - plot.new() - leg=legend("topright", names(netw$leg.col), lwd = 1, col = netw$leg.col, bg =NA,plot=F,cex=.8) - w <- grconvertX(leg$rect$w, to='inches') - addy=6+w -} else addy=1 - -``` - -```{r network, fig.width=addy, fig.height=6,out.width='100%'} -if (length(relensemble)>2) { - plot(netw, - layoutType = "neato", - fixedsize=TRUE, - # fontsize, - # width, - # height, - shape="ellipse", - cex=.8 - ) -} - -``` - - -# Reference - -Wiesenfarth, M., Reinke, A., Landman, B.A., Cardoso, M.J., Maier-Hein, L. and Kopp-Schneider, A. (2019). 
Methods and open-source toolkit for analyzing and visualizing challenge results. *arXiv preprint arXiv:1910.05121* - -M. J. A. Eugster, T. Hothorn, and F. Leisch, “Exploratory -and inferential analysis of benchmark experiments,” -Institut fuer Statistik, Ludwig-Maximilians- -Universitaet Muenchen, Germany, Technical Report 30, -2008. [Online]. Available: http://epub.ub.uni-muenchen. -de/4134/. - - - - - - - diff --git a/inst/appdir/reportSingle.Rmd b/inst/appdir/reportSingle.Rmd deleted file mode 100644 index 90efe46..0000000 --- a/inst/appdir/reportSingle.Rmd +++ /dev/null @@ -1,297 +0,0 @@ ---- -params: - object: NA - colors: NA - name: NULL -title: "Benchmarking report for `r params$name` " -author: "created by challengeR v`r packageVersion('challengeR')` \nWiesenfarth, Reinke, Landman, Cardoso, Maier-Hein & Kopp-Schneider (2019)" -date: "`r Sys.setlocale('LC_TIME', 'English'); format(Sys.time(), '%d %B, %Y')`" -editor_options: - chunk_output_type: console ---- - - - - -```{r setup, include=FALSE} -options(width=80) -# out.format <- knitr::opts_knit$get("out.format") -# img_template <- switch( out.format, -# word = list("img-params"=list(fig.width=6, -# fig.height=6, -# dpi=150)), -# { -# # default -# list("img-params"=list( dpi=150, -# fig.width=6, -# fig.height=6, -# out.width="504px", -# out.height="504px")) -# } ) -# -# knitr::opts_template$set( img_template ) - -knitr::opts_chunk$set(echo = F,fig.width=7,fig.height = 3,dpi=300,fig.align="center") -#theme_set(theme_light()) -theme_set(theme_light(base_size=11)) - -``` - -```{r } -boot_object = params$object -color.fun=params$colors -``` - - -```{r } -challenge_single=boot_object$data -ordering= names(sort(t(boot_object$mat[,"rank",drop=F])["rank",])) -ranking.fun=boot_object$FUN -object=challenge_single%>%ranking.fun - -object$fulldata=boot_object$fulldata # only not NULL if subset of algorithms used - -cols_numbered=cols=color.fun(length(ordering)) -names(cols)=ordering -names(cols_numbered)= paste(1:length(cols),names(cols)) - -``` - - - - - - - - - - - - -This document presents a systematic report on a benchmark study. Input data comprises raw metric values for all algorithms and test cases. Generated plots are: - -* Visualization of assessment data: Dot- and boxplot, podium plot and ranking heatmap -* Visualization of ranking robustness: Line plot -* Visualization of ranking stability: Blob plot, violin plot and significance map - -```{r} -n.cases=nrow(challenge_single)/length(unique(challenge_single[[attr(challenge_single,"algorithm")]])) -``` - -Analysis based on `r n.cases` test cases which included `r sum(is.na(challenge_single[[attr(challenge_single,"value")]]))` missing values. - -```{r,results='asis'} -if (!is.null(boot_object$fulldata)) { - cat("Only top ", - length(levels(boot_object$data[[attr(boot_object$data,"algorithm")]])), - " out of ", - length(levels(boot_object$fulldata[[attr(boot_object$data,"algorithm")]])), - " algorithms visualized.\n") -} -``` - - -```{r} -if (n.cases0) par(oma=c(oh,0,0,0)) - -set.seed(38) -podium(object, - col=cols, - lines.show = T,lines.alpha = .4, - dots.cex=.9,ylab="Metric value",layout.heights=c(1,.35), - legendfn = function(algs, cols) { - legend(par('usr')[2], - par('usr')[4], - xpd=NA, - paste0(1:length(algs),": ",algs), - lwd = 1, - col = cols, - bg = NA, - cex=1.4, - seg.len=1.1, - title="Rank: Alg.") - } - ) -par(op) -``` - - -## Ranking heatmap -*Ranking heatmaps* for visualizing raw assessment data. 
Each cell $\left( i, A_j \right)$ shows the absolute frequency of test cases in which algorithm $A_j$ achieved rank $i$. - -\bigskip - -```{r rankingHeatmap,fig.width=9, fig.height=9,out.width='70%'} -rankingHeatmap(object) -``` - - - -# Visualization of ranking stability - - - - - -## *Blob plot* for visualizing ranking stability based on bootstrap sampling - -Algorithms are color-coded, and the area of each blob at position $\left( A_i, \text{rank } j \right)$ is proportional to the relative frequency $A_i$ achieved rank $j$ across $b=$ `r ncol(boot_object$bootsrappedRanks)` bootstrap samples. The median rank for each algorithm is indicated by a black cross. 95\% bootstrap intervals across bootstrap samples are indicated by black lines. - -\bigskip - -```{r blobplot,fig.width=7,fig.height = 7} -stability(boot_object,max_size = 8,size.ranks=.25*theme_get()$text$size,size=8,shape=4 )+scale_color_manual(values=cols) -``` - - -## Violin plot for visualizing ranking stability based on bootstrapping - -The ranking list based on the full assessment data is pairwisely compared with the ranking lists based on the individual bootstrap samples (here $b=$ `r ncol(boot_object$bootsrappedRanks)` samples). For each pair of rankings, Kendall's $\tau$ correlation is computed. Kendall’s $\tau$ is a scaled index determining the correlation between the lists. It is computed by evaluating the number of pairwise concordances and discordances between ranking lists and produces values between $-1$ (for inverted order) and $1$ (for identical order). A violin plot, which simultaneously depicts a boxplot and a density plot, is generated from the results. -\bigskip - -```{r violin} -violin(boot_object)+xlab("") -``` - - - - - -## *Significance maps* for visualizing ranking stability based on statistical significance - -*Significance maps* depict incidence matrices of -pairwise significant test results for the one-sided Wilcoxon signed rank test at a 5\% significance level with adjustment for multiple testing according to Holm. Yellow shading indicates that metric values of the algorithm on the x-axis were significantly superior to those from the algorithm on the y-axis, blue color indicates no significant difference. -\bigskip - -```{r significancemap,fig.width=7, fig.height=6} -print(significanceMap(object,alpha=0.05,p.adjust.method="holm") - ) -``` - - - - - - - - - - - - - -## Ranking robustness with respect to ranking methods -*Line plots* for visualizing rankings robustness across different ranking methods. Each algorithm is represented by one colored line. For each ranking method encoded on the x-axis, the height of the line represents the corresponding rank. Horizontal lines indicate identical ranks for all methods. - -\bigskip - -```{r lineplot,fig.width=7,fig.height = 5} -methodsplot(object )+scale_color_manual(values=cols) -``` - - - - - -# Reference - - -Wiesenfarth, M., Reinke, A., Landman, B.A., Cardoso, M.J., Maier-Hein, L. and Kopp-Schneider, A. (2019). Methods and open-source toolkit for analyzing and visualizing challenge results. *arXiv preprint arXiv:1910.05121* - - -M. J. A. Eugster, T. Hothorn, and F. Leisch, “Exploratory -and inferential analysis of benchmark experiments,” -Institut fuer Statistik, Ludwig-Maximilians- -Universitaet Muenchen, Germany, Technical Report 30, -2008. [Online]. Available: http://epub.ub.uni-muenchen. -de/4134/. 
- - - - - - - - - diff --git a/inst/appdir/reportSingleShort.Rmd b/inst/appdir/reportSingleShort.Rmd deleted file mode 100644 index 87a7980..0000000 --- a/inst/appdir/reportSingleShort.Rmd +++ /dev/null @@ -1,261 +0,0 @@ ---- -params: - object: NA - colors: NA - name: NULL -title: "Benchmarking report for `r params$name` " -author: "created by challengeR v`r packageVersion('challengeR')` \nWiesenfarth, Reinke, Landman, Cardoso, Maier-Hein & Kopp-Schneider (2019)" -date: "`r Sys.setlocale('LC_TIME', 'English'); format(Sys.time(), '%d %B, %Y')`" -editor_options: - chunk_output_type: console ---- - - - - -```{r setup, include=FALSE} -options(width=80) -# out.format <- knitr::opts_knit$get("out.format") -# img_template <- switch( out.format, -# word = list("img-params"=list(fig.width=6, -# fig.height=6, -# dpi=150)), -# { -# # default -# list("img-params"=list( dpi=150, -# fig.width=6, -# fig.height=6, -# out.width="504px", -# out.height="504px")) -# } ) -# -# knitr::opts_template$set( img_template ) - -knitr::opts_chunk$set(echo = F,fig.width=7,fig.height = 3,dpi=300,fig.align="center") -#theme_set(theme_light()) -theme_set(theme_light(base_size=11)) - -``` - -```{r } -object = params$object -color.fun=params$colors -``` - - -```{r } -challenge_single=object$data -ordering= names(sort(t(object$mat[,"rank",drop=F])["rank",])) -ranking.fun=object$FUN - -cols_numbered=cols=color.fun(length(ordering)) -names(cols)=ordering -names(cols_numbered)= paste(1:length(cols),names(cols)) - -``` - - - - - - - - - - - -This document presents a systematic report on a benchmark study. Input data comprises raw metric values for all algorithms and test cases. Generated plots are: - -* Visualization of assessment data: Dot- and boxplot, podium plot and ranking heatmap -* Visualization of ranking robustness: Line plot -* Visualization of ranking stability: Significance map - - -Analysis based on `r nrow(challenge_single)/length(unique(challenge_single[[attr(challenge_single,"algorithm")]]))` test cases which included `r sum(is.na(challenge_single[[attr(challenge_single,"value")]]))` missing values. - -```{r,results='asis'} -if (!is.null(object$fulldata)) { - cat("Only top ", - length(levels(object$data[[attr(object$data,"algorithm")]])), - " out of ", - length(levels(object$fulldata[[attr(object$data,"algorithm")]])), - " algorithms visualized.\n") -} -``` - - - -Algorithms are ordered according to the following chosen ranking scheme: - -```{r,results='asis'} -a=( lapply(object$FUN.list,function(x) { - if (!is.character(x)) return(paste0("aggregate using function ", - paste(gsub("UseMethod", - "", - deparse(functionBody(x))), - collapse=" ")) - ) - else if (x=="rank") return(x) - else return(paste0("aggregate using function ",x)) - }) - ) -cat("    *",paste0(a,collapse=" then "),"*",sep="") - - -if (is.character(object$FUN.list[[1]]) && object$FUN.list[[1]]=="significance") cat("\n\n Column 'prop.sign' is equal to the number of pairwise significant test results for a given algorithm divided by the number of algorithms.") -``` - -Ranking list: - -```{r} -print(object) -``` - - - - - - - -# Visualization of raw assessment data - -## Dot- and boxplot - -*Dot- and boxplots* for visualizing raw assessment data separately for each algorithm. Boxplots representing descriptive statistics over all test cases (median, quartiles and outliers) are combined with horizontally jittered dots representing individual test cases. 
-\bigskip - -```{r boxplots} -boxplot(object,size=.8)+xlab("Algorithm")+ylab("Metric value") - -``` - - - -## Podium plot -*Podium plots* (see also Eugster et al, 2008) for visualizing raw assessment data. Upper part (spaghetti plot): Participating algorithms are color-coded, and each colored dot in the plot represents a metric value achieved with the respective algorithm. The actual metric value is encoded by the y-axis. Each podium (here: $p$=`r length(ordering)`) represents one possible rank, ordered from best (1) to last (here: `r length(ordering)`). The assignment of metric values (i.e. colored dots) to one of the podiums is based on the rank that the respective algorithm achieved on the corresponding test case. Note that the plot part above each podium place is further subdivided into $p$ "columns", where each column represents one participating algorithm (here: $p=$ `r length(ordering)`). Dots corresponding to identical test cases are connected by a line, leading to the shown spaghetti structure. Lower part: Bar charts represent the relative frequency for each algorithm to achieve the rank encoded by the podium place. -\bigskip - - -```{r ,eval=T,fig.width=12, fig.height=6,include=FALSE} -plot.new() -algs=levels(challenge_single[[attr(challenge_single,"algorithm")]]) - -l=legend("topright", - paste0(1:length(algs),": ",algs), - lwd = 1, cex=1.4,seg.len=1.1, - title="Rank: Alg.", - plot=F) -w <- grconvertX(l$rect$w, to='ndc') - grconvertX(0, to='ndc') -h<- grconvertY(l$rect$h, to='ndc') - grconvertY(0, to='ndc') -addy=max(grconvertY(l$rect$h,"user","inches"),6) -``` - - -```{r podium,eval=T,fig.width=12, fig.height=addy} -op<-par(pin=c(par()$pin[1],6), - omd=c(0, 1-w, 0, 1), - mar=c(par('mar')[1:3], 0)+c(-.5,0.5,-3.3,0), - cex.axis=1.5, - cex.lab=1.5, - cex.main=1.7) -oh=grconvertY(l$rect$h,"user","lines")-grconvertY(6,"inches","lines") -if (oh>0) par(oma=c(oh,0,0,0)) - -set.seed(38) -podium(object, - col=cols, - lines.show = T,lines.alpha = .4, - dots.cex=.9,ylab="Metric value",layout.heights=c(1,.35), - legendfn = function(algs, cols) { - legend(par('usr')[2], - par('usr')[4], - xpd=NA, - paste0(1:length(algs),": ",algs), - lwd = 1, - col = cols, - bg = NA, - cex=1.4, - seg.len=1.1, - title="Rank: Alg.") - } - ) -par(op) -``` - - -## Ranking heatmap -*Ranking heatmaps* for visualizing raw assessment data. Each cell $\left( i, A_j \right)$ shows the absolute frequency of test cases in which algorithm $A_j$ achieved rank $i$. - -\bigskip - -```{r rankingHeatmap,fig.width=9, fig.height=9,out.width='70%'} -rankingHeatmap(object) -``` - - - -# Visualization of ranking stability - - - - - - - -## *Significance maps* for visualizing ranking stability based on statistical significance - -*Significance maps* depict incidence matrices of -pairwise significant test results for the one-sided Wilcoxon signed rank test at a 5\% significance level with adjustment for multiple testing according to Holm. Yellow shading indicates that metric values of the algorithm on the x-axis were significantly superior to those from the algorithm on the y-axis, blue color indicates no significant difference. -\bigskip - -```{r significancemap,fig.width=7, fig.height=6} -print(significanceMap(object,alpha=0.05,p.adjust.method="holm") - ) -``` - - - - - - - - - - - - - -## Ranking robustness with respect to ranking methods -*Line plots* for visualizing rankings robustness across different ranking methods. Each algorithm is represented by one colored line. 
For each ranking method encoded on the x-axis, the height of the line represents the corresponding rank. Horizontal lines indicate identical ranks for all methods. - -\bigskip - -```{r lineplot,fig.width=7,fig.height = 5} -methodsplot(object )+scale_color_manual(values=cols) -``` - - - - - -# Reference - - -Wiesenfarth, M., Reinke, A., Landman, B.A., Cardoso, M.J., Maier-Hein, L. and Kopp-Schneider, A. (2019). Methods and open-source toolkit for analyzing and visualizing challenge results. *arXiv preprint arXiv:1910.05121* - - -M. J. A. Eugster, T. Hothorn, and F. Leisch, “Exploratory -and inferential analysis of benchmark experiments,” -Institut fuer Statistik, Ludwig-Maximilians- -Universitaet Muenchen, Germany, Technical Report 30, -2008. [Online]. Available: http://epub.ub.uni-muenchen. -de/4134/. - - - - - - - - - diff --git a/inst/appdir/visualizationAcrossTasks.Rmd b/inst/appdir/visualizationAcrossTasks.Rmd new file mode 100644 index 0000000..2aacc04 --- /dev/null +++ b/inst/appdir/visualizationAcrossTasks.Rmd @@ -0,0 +1,138 @@ +# Visualization of cross-task insights + +Algorithms are ordered according to consensus ranking. + + + + +## Characterization of algorithms + +### Ranking stability: Variability of achieved rankings across tasks + + + + + + + + +Blob plot similar to the one shown in Section \ref{blobByTask} substituting rankings based on bootstrap samples with the rankings corresponding to multiple tasks. This way, the distribution of ranks across tasks can be intuitively visualized. + + +\bigskip + +```{r blobplot_raw} +#stability.ranked.list +stability(object,ordering=ordering_consensus,max_size=9,size=8,shape=4)+scale_color_manual(values=cols) +``` + + + + + +```{r, child=if (isMultiTask && bootstrappingEnabled) system.file("appdir", "characterizationOfAlgorithmsBootstrapping.Rmd", package="challengeR")} + +``` + + + +## Characterization of tasks + + +```{r, child=if (isMultiTask && bootstrappingEnabled) system.file("appdir", "characterizationOfTasksBootstrapping.Rmd", package="challengeR")} + +``` + +### Cluster Analysis + + + + + + + + + + +Dendrogram from hierarchical cluster analysis} and \textit{network-type graphs} for assessing the similarity of tasks based on challenge rankings. + +A dendrogram is a visualization approach based on hierarchical clustering. It depicts clusters according to a chosen distance measure (here: Spearman's footrule) as well as a chosen agglomeration method (here: complete and average agglomeration). 
+\bigskip
+
+```{r , fig.width=6, fig.height=5,out.width='60%'}
+#d=relation_dissimilarity.ranked.list(object,method=kendall)
+
+# use ranking list
+ relensemble=as.relation.ranked.list(object)
+
+# # use relations
+# a=challenge_multi%>%decision.challenge(p.adjust.method="none")
+# aa=lapply(a,as.relation.challenge.incidence)
+# names(aa)=names(challenge_multi)
+# relensemble= do.call(relation_ensemble,args = aa)
+d <- relation_dissimilarity(relensemble, method = "symdiff")
+```
+
+
+```{r dendrogram_complete, fig.width=6, fig.height=5,out.width='60%'}
+if (length(relensemble)>2) {
+  plot(hclust(d,method="complete")) #,main="Symmetric difference distance - complete"
+} else cat("\nCluster analysis only sensible if there are >2 tasks.\n\n")
+```
+
+\bigskip
+
+
+```{r dendrogram_average, fig.width=6, fig.height=5,out.width='60%'}
+if (length(relensemble)>2) plot(hclust(d,method="average")) #,main="Symmetric difference distance - average"
+```
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+In network-type graphs (see Eugster et al., 2008), every task is represented by a node and nodes are connected by edges whose length is determined by a chosen distance measure. Here, distances between nodes are chosen to increase exponentially in Spearman's footrule distance with growth rate 0.05 to accentuate large distances.
+Hence, tasks that are similar with respect to their algorithm ranking appear closer together than those that are dissimilar. Nodes representing tasks with a unique winner are color-coded by the winning algorithm. If more than one algorithm is ranked first in a task, the corresponding node remains uncolored.
+\bigskip
+
+```{r ,eval=T,fig.width=12, fig.height=6,include=FALSE}
+if (length(relensemble)>2) {
+  netw=network(object,
+               method = "symdiff",
+               edge.col=grDevices::grey.colors,
+               edge.lwd=1,
+               rate=1.05,
+               cols=cols
+  )
+
+  plot.new()
+  leg=legend("topright", names(netw$leg.col), lwd = 1, col = netw$leg.col, bg =NA,plot=F,cex=.8)
+  w <- grconvertX(leg$rect$w, to='inches')
+  addy=6+w
+} else addy=1
+
+```
+
+```{r network, fig.width=addy, fig.height=6,out.width='100%'}
+if (length(relensemble)>2) {
+  plot(netw,
+       layoutType = "neato",
+       fixedsize=TRUE,
+       # fontsize,
+       # width,
+       # height,
+       shape="ellipse",
+       cex=.8
+  )
+}
+
+```
\ No newline at end of file
diff --git a/inst/appdir/visualizationBlobPlots.Rmd b/inst/appdir/visualizationBlobPlots.Rmd
new file mode 100644
index 0000000..b730e57
--- /dev/null
+++ b/inst/appdir/visualizationBlobPlots.Rmd
@@ -0,0 +1,39 @@
+## *Blob plot* for visualizing ranking stability based on bootstrap sampling \label{blobByTask}
+
+Algorithms are color-coded, and the area of each blob at position $\left( A_i, \text{rank } j \right)$ is proportional to the relative frequency with which $A_i$ achieved rank $j$ across $b=$ `r ncol(boot_object$bootsrappedRanks[[1]])` bootstrap samples. The median rank for each algorithm is indicated by a black cross. 95\% bootstrap intervals across bootstrap samples are indicated by black lines.
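+
+To make the quantities encoded in the plot concrete, the following sketch uses a hypothetical algorithms-by-bootstrap-samples matrix of ranks, `bootRanksSketch`; it is not the package's own computation (the analogous data in this report live in `boot_object$bootsrappedRanks`), and the chunk is not evaluated when the report is rendered.
+
+```{r blob_quantities_sketch, eval=FALSE}
+# Hypothetical example data: ranks of three algorithms across 1000 bootstrap samples.
+set.seed(1)
+bootRanksSketch <- replicate(1000, sample(1:3))
+rownames(bootRanksSketch) <- c("A1", "A2", "A3")
+
+# Relative frequency with which each algorithm achieves each rank (the blob areas).
+rankFreqSketch <- t(apply(bootRanksSketch, 1,
+                          function(r) table(factor(r, levels = 1:3)) / length(r)))
+
+# Median rank (black cross) and 95% bootstrap interval (black lines) per algorithm.
+medianRankSketch <- apply(bootRanksSketch, 1, median)
+bootIntervalSketch <- t(apply(bootRanksSketch, 1, quantile, probs = c(0.025, 0.975)))
+```
+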
+
+
+\bigskip
+
+```{r blobplot_bootstrap,fig.width=9, fig.height=9}
+showLabelForSingleTask <- FALSE
+
+if (length(names(boot_object$bootsrappedRanks)) > 1) {
+  showLabelForSingleTask <- TRUE
+}
+
+pl=list()
+for (subt in names(boot_object$bootsrappedRanks)){
+  a=list(bootsrappedRanks=list(boot_object$bootsrappedRanks[[subt]]),
+         matlist=list(boot_object$matlist[[subt]]))
+  names(a$bootsrappedRanks)=names(a$matlist)=subt
+  class(a)="bootstrap.list"
+  r=boot_object$matlist[[subt]]
+
+  pl[[subt]]=stabilityByTask(a,
+                             max_size =8,
+                             ordering=rownames(r[order(r$rank),]),
+                             size.ranks=.25*theme_get()$text$size,
+                             size=8,
+                             shape=4,
+                             showLabelForSingleTask=showLabelForSingleTask) + scale_color_manual(values=cols)
+
+}
+
+# if (length(boot_object$matlist)<=6 &nrow((boot_object$matlist[[1]]))<=10 ){
+#   ggpubr::ggarrange(plotlist = pl)
+# } else {
+  for (i in 1:length(pl)) print(pl[[i]])
+#}
+
+```
diff --git a/inst/appdir/visualizationViolinPlots.Rmd b/inst/appdir/visualizationViolinPlots.Rmd
new file mode 100644
index 0000000..36e8ba3
--- /dev/null
+++ b/inst/appdir/visualizationViolinPlots.Rmd
@@ -0,0 +1,9 @@
+## *Violin plot* for visualizing ranking stability based on bootstrapping \label{violin}
+
+The ranking list based on the full assessment data is compared pairwise with the ranking lists based on the individual bootstrap samples (here $b=$ `r ncol(boot_object$bootsrappedRanks[[1]])` samples). For each pair of rankings, Kendall's $\tau$ correlation is computed. Kendall's $\tau$ is a scaled index determining the correlation between the lists. It is computed by evaluating the number of pairwise concordances and discordances between ranking lists and produces values between $-1$ (for inverted order) and $1$ (for identical order). A violin plot, which simultaneously depicts a boxplot and a density plot, is generated from the results.
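+
+As a small illustration of the underlying comparison (hypothetical data, not the package's own pipeline), Kendall's $\tau$ between a full-data ranking and each bootstrap ranking can be obtained with base R's `cor()`; the violin plot below is based on the analogous comparison for `boot_object`. The objects `fullRanksSketch` and `bootRanksSketch` are assumptions for this sketch only, and the chunk is not evaluated when the report is rendered.
+
+```{r kendall_tau_sketch, eval=FALSE}
+# Hypothetical example data: full-data ranks and ranks from 100 bootstrap samples.
+fullRanksSketch <- c(A1 = 1, A2 = 2, A3 = 3)
+set.seed(2)
+bootRanksSketch <- replicate(100, sample(1:3))
+rownames(bootRanksSketch) <- names(fullRanksSketch)
+
+# One Kendall's tau per bootstrap sample; values near 1 indicate a stable ranking.
+tauPerSample <- apply(bootRanksSketch, 2,
+                      function(r) cor(fullRanksSketch, r, method = "kendall"))
+summary(tauPerSample)
+```
+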
+ +\bigskip + +```{r violin} +violin(boot_object) +``` diff --git a/tests/testthat/test-aggregateThenRank.R b/tests/testthat/test-aggregateThenRank.R index 796b508..df03703 100644 --- a/tests/testthat/test-aggregateThenRank.R +++ b/tests/testthat/test-aggregateThenRank.R @@ -1,346 +1,346 @@ test_that("aggregate-than-rank by mean works with two algorithms for one case, small values are better", { data <- rbind( data.frame(algo="A1", value=0.6, case="C1"), data.frame(algo="A2", value=0.8, case="C1")) - challenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter = TRUE) + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE) ranking <- challenge%>%aggregateThenRank(FUN = mean) expectedRanking <- rbind( "A1" = data.frame(value_FUN = 0.6, rank = 1), "A2" = data.frame(value_FUN = 0.8, rank = 2)) - expect_equal(ranking$mat, expectedRanking) + expect_equal(ranking$matlist$T1, expectedRanking) }) test_that("aggregate-than-rank by mean works with two algorithms (reverse order) for one case, small values are better", { data <- rbind( data.frame(algo = "A2", value = 0.8, case = "C1"), data.frame(algo = "A1", value = 0.6, case = "C1")) - challenge <- as.challenge(data, algorithm = "algo", case = "case", value = "value", smallBetter = TRUE) + challenge <- as.challenge(data, taskName="T1", algorithm = "algo", case = "case", value = "value", smallBetter = TRUE) ranking <- challenge%>%aggregateThenRank(FUN = mean) expectedRanking <- rbind("A2" = data.frame(value_FUN = 0.8, rank = 2), "A1" = data.frame(value_FUN = 0.6, rank = 1)) - expect_equal(ranking$mat, expectedRanking) + expect_equal(ranking$matlist$T1, expectedRanking) }) test_that("aggregate-than-rank by mean works with two algorithms for one case, large values are better", { data <- rbind( data.frame(algo="A1", value=0.6, case="C1"), data.frame(algo="A2", value=0.8, case="C1")) - challenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter = FALSE) + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = FALSE) ranking <- challenge%>%aggregateThenRank(FUN = mean) expectedRanking <- rbind( "A1" = data.frame(value_FUN = 0.6, rank = 2), "A2" = data.frame(value_FUN = 0.8, rank = 1)) - expect_equal(ranking$mat, expectedRanking) + expect_equal(ranking$matlist$T1, expectedRanking) }) test_that("aggregate-than-rank by mean works with two algorithms (reverse order) for one case, large values are better", { data <- rbind( data.frame(algo = "A2", value = 0.8, case = "C1"), data.frame(algo = "A1", value = 0.6, case = "C1")) - challenge <- as.challenge(data, algorithm = "algo", case = "case", value = "value", smallBetter = FALSE) + challenge <- as.challenge(data, taskName="T1", algorithm = "algo", case = "case", value = "value", smallBetter = FALSE) ranking <- challenge%>%aggregateThenRank(FUN = mean) expectedRanking <- rbind("A2" = data.frame(value_FUN = 0.8, rank = 1), "A1" = data.frame(value_FUN = 0.6, rank = 2)) - expect_equal(ranking$mat, expectedRanking) + expect_equal(ranking$matlist$T1, expectedRanking) }) test_that("aggregate-than-rank raises error for invalid aggregation function", { data <- rbind( data.frame(algo="A1", value=0.6, case="C1"), data.frame(algo="A2", value=0.6, case="C1")) - challenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter = TRUE) + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", 
smallBetter = TRUE) expect_error(challenge%>%aggregateThenRank(FUN = meanx), "object 'meanx' not found", fixed = TRUE) }) test_that("aggregate-than-rank by mean works with two algorithms for one case and 'min' as ties method", { data <- rbind( data.frame(algo="A1", value=0.6, case="C1"), data.frame(algo="A2", value=0.6, case="C1")) - challenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter = TRUE) + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE) ranking <- challenge%>%aggregateThenRank(FUN = mean, ties.method = "min") expectedRanking <- rbind( "A1" = data.frame(value_FUN = 0.6, rank = 1), "A2" = data.frame(value_FUN = 0.6, rank = 1)) - expect_equal(ranking$mat, expectedRanking) + expect_equal(ranking$matlist$T1, expectedRanking) }) test_that("aggregate-than-rank by mean works with two algorithms for one case and 'max' as ties method", { data <- rbind( data.frame(algo="A1", value=0.6, case="C1"), data.frame(algo="A2", value=0.6, case="C1")) - challenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter = TRUE) + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE) ranking <- challenge%>%aggregateThenRank(FUN = mean, ties.method = "max") expectedRanking <- rbind( "A1" = data.frame(value_FUN = 0.6, rank = 2), "A2" = data.frame(value_FUN = 0.6, rank = 2)) - expect_equal(ranking$mat, expectedRanking) + expect_equal(ranking$matlist$T1, expectedRanking) }) test_that("aggregate-than-rank raises error for invalid ties method", { data <- rbind( data.frame(algo="A1", value=0.6, case="C1"), data.frame(algo="A2", value=0.6, case="C1")) - challenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter = TRUE) + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE) expect_error(challenge%>%aggregateThenRank(FUN = mean, ties.method = "maxx"), "'arg' should be one of \"average\", \"first\", \"last\", \"random\", \"max\", \"min\"", fixed = TRUE) }) test_that("aggregate-than-rank raises error for invalid ties method even when no ties present", { data <- rbind( data.frame(algo="A1", value=0.6, case="C1"), data.frame(algo="A2", value=0.8, case="C1")) - challenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter = TRUE) + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE) expect_error(challenge%>%aggregateThenRank(FUN = mean, ties.method = "maxx"), "'arg' should be one of \"average\", \"first\", \"last\", \"random\", \"max\", \"min\"", fixed = TRUE) }) test_that("aggregate-than-rank by mean works with two algorithms for two cases", { data <- rbind( data.frame(algo="A1", value=0.6, case="C1"), data.frame(algo="A1", value=0.4, case="C2"), data.frame(algo="A2", value=0.8, case="C1"), data.frame(algo="A2", value=1.0, case="C2")) - challenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter = TRUE) + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE) ranking <- challenge%>%aggregateThenRank(FUN = mean) expectedRanking <- rbind( "A1" = data.frame(value_FUN = 0.5, rank = 1), "A2" = data.frame(value_FUN = 0.9, rank = 2)) - expect_equal(ranking$mat, expectedRanking) + expect_equal(ranking$matlist$T1, expectedRanking) }) test_that("aggregate-than-rank by median works with 
two algorithms for two cases", { data <- rbind( data.frame(algo="A1", value=0.6, case="C1"), data.frame(algo="A1", value=0.4, case="C2"), data.frame(algo="A2", value=0.8, case="C1"), data.frame(algo="A2", value=1.0, case="C2")) - challenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter = TRUE) + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE) ranking <- challenge%>%aggregateThenRank(FUN = median) expectedRanking <- rbind( "A1" = data.frame(value_FUN = 0.5, rank = 1), "A2" = data.frame(value_FUN = 0.9, rank = 2)) - expect_equal(ranking$mat, expectedRanking) + expect_equal(ranking$matlist$T1, expectedRanking) }) test_that("aggregate-than-rank by mean works with one algorithm for one case", { data <- rbind( data.frame(algo="A1", value=0.6, case="C1")) - challenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter = TRUE) + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE) ranking <- challenge%>%aggregateThenRank(FUN = mean) expectedRanking <- rbind( "A1" = data.frame(value_FUN = 0.6, rank = 1)) - expect_equal(ranking$mat, expectedRanking) + expect_equal(ranking$matlist$T1, expectedRanking) }) test_that("aggregate-than-rank raises error when no NA treatment specified but NAs are contained", { data <- rbind( data.frame(algo="A1", value=NA, case="C1"), data.frame(algo="A2", value=0.8, case="C1")) - challenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter = FALSE) + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = FALSE) expect_error(challenge%>%aggregateThenRank(FUN = mean), "argument \"na.treat\" is missing, with no default", fixed = TRUE) }) test_that("aggregate-than-rank raises error when invalid NA treatment specified and NAs are contained", { data <- rbind( data.frame(algo="A1", value=NA, case="C1"), data.frame(algo="A2", value=0.8, case="C1")) - challenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter = FALSE) + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = FALSE) expect_error(challenge%>%aggregateThenRank(FUN = mean, na.treat = "na.rmx"), - "Argument \"na.treat\" is invalid. It can be \"na.rm\", numeric value or function.", fixed = TRUE) + "Argument 'na.treat' is invalid. 
It can be 'na.rm', numeric value or function.", fixed = TRUE) }) test_that("specified NA treatment does not influence ranking when no NAs are contained", { data <- rbind( data.frame(algo="A1", value=0.6, case="C1"), data.frame(algo="A2", value=0.8, case="C1")) - challenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter = FALSE) + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = FALSE) ranking <- challenge%>%aggregateThenRank(FUN = mean, na.treat = 0) expectedRanking <- rbind( "A1" = data.frame(value_FUN = 0.6, rank = 2), "A2" = data.frame(value_FUN = 0.8, rank = 1)) - expect_equal(ranking$mat, expectedRanking) + expect_equal(ranking$matlist$T1, expectedRanking) }) test_that("NAs are replaced by numeric value", { data <- rbind( data.frame(algo="A1", value=NA, case="C1"), data.frame(algo="A2", value=0.8, case="C1")) - challenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter = FALSE) + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = FALSE) ranking <- challenge%>%aggregateThenRank(FUN = mean, na.treat = 0) expectedRanking <- rbind( "A1" = data.frame(value_FUN = 0.0, rank = 2), "A2" = data.frame(value_FUN = 0.8, rank = 1)) - expect_equal(ranking$mat, expectedRanking) + expect_equal(ranking$matlist$T1, expectedRanking) }) test_that("NAs are replaced by function value", { data <- rbind( data.frame(algo="A1", value=NA, case="C1"), data.frame(algo="A2", value=0.8, case="C1")) replacementFunction <- function(x) { -1 } - challenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter = FALSE) + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = FALSE) ranking <- challenge%>%aggregateThenRank(FUN = mean, na.treat = replacementFunction) expectedRanking <- rbind( "A1" = data.frame(value_FUN = -1.0, rank = 2), "A2" = data.frame(value_FUN = 0.8, rank = 1)) - expect_equal(ranking$mat, expectedRanking) + expect_equal(ranking$matlist$T1, expectedRanking) }) test_that("NAs are removed", { data <- rbind( data.frame(algo="A1", value=NA, case="C1"), data.frame(algo="A2", value=0.8, case="C1")) - challenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter = FALSE) + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = FALSE) ranking <- challenge%>%aggregateThenRank(FUN = mean, na.treat = "na.rm") expectedRanking <- rbind( "A2" = data.frame(value_FUN = 0.8, rank = 1)) - expect_equal(ranking$mat, expectedRanking) + expect_equal(ranking$matlist$T1, expectedRanking) }) test_that("aggregate-than-rank by mean works for multi-task challenge (2 tasks in data set), no missing data", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.6, case="C1"), data.frame(algo="A2", value=0.8, case="C1") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=0.5, case="C1"), data.frame(algo="A2", value=0.4, case="C1") )) data <- rbind(dataTask1, dataTask2) challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter = TRUE) ranking <- challenge%>%aggregateThenRank(FUN = mean) expectedRankingTask1 <- rbind( "A1" = data.frame(value_FUN = 0.6, rank = 1), "A2" = data.frame(value_FUN = 0.8, rank = 2)) expectedRankingTask2 <- rbind( "A1" = data.frame(value_FUN = 0.5, rank = 2), "A2" = data.frame(value_FUN = 0.4, rank = 1)) 
expect_equal(ranking$matlist$T1, expectedRankingTask1) expect_equal(ranking$matlist$T2, expectedRankingTask2) }) test_that("NAs are replaced by numeric value in multi-task challenge (2 tasks in data set)", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.6, case="C1"), data.frame(algo="A2", value=0.8, case="C1") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=NA, case="C1"), data.frame(algo="A2", value=0.4, case="C1") )) data <- rbind(dataTask1, dataTask2) challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter = TRUE) ranking <- challenge%>%aggregateThenRank(FUN = mean, na.treat = 100) expectedRankingTask1 <- rbind( "A1" = data.frame(value_FUN = 0.6, rank = 1), "A2" = data.frame(value_FUN = 0.8, rank = 2)) expectedRankingTask2 <- rbind( "A1" = data.frame(value_FUN = 100.0, rank = 2), "A2" = data.frame(value_FUN = 0.4, rank = 1)) expect_equal(ranking$matlist$T1, expectedRankingTask1) expect_equal(ranking$matlist$T2, expectedRankingTask2) }) test_that("aggregate-than-rank raises error when no NA treatment specified but NAs are contained in multi-task challenge (2 tasks in data set)", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.6, case="C1"), data.frame(algo="A2", value=0.8, case="C1") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=NA, case="C1"), data.frame(algo="A2", value=0.4, case="C1") )) data <- rbind(dataTask1, dataTask2) challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter = TRUE) expect_error(challenge%>%aggregateThenRank(FUN = mean), "argument \"na.treat\" is missing, with no default", fixed = TRUE) }) diff --git a/tests/testthat/test-blobPlotStabilityAcrossTasks.R b/tests/testthat/test-blobPlotStabilityAcrossTasks.R new file mode 100644 index 0000000..ffadb9f --- /dev/null +++ b/tests/testthat/test-blobPlotStabilityAcrossTasks.R @@ -0,0 +1,46 @@ +test_that("blob plot for visualizing ranking stability across tasks raises error for single-task data set", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2")) + + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + expect_error(stability(ranking), + "The stability of rankings across tasks cannot be computed for less than two tasks.", fixed=TRUE) +}) + +test_that("blob plot for visualizing ranking stability across tasks returns one plot for multi-task data set", { + dataTask1 <- cbind(task="T1", + rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2") + )) + dataTask2 <- cbind(task="T2", + rbind( + data.frame(algo="A1", value=0.2, case="C1"), + data.frame(algo="A2", value=0.3, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.7, case="C2"), + data.frame(algo="A2", value=0.8, case="C2"), + data.frame(algo="A3", value=0.9, case="C2") + )) + + data <- rbind(dataTask1, dataTask2) + + challenge <- as.challenge(data, 
by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + actualPlot <- stability(ranking) + expect_is(actualPlot, "ggplot") +}) diff --git a/tests/testthat/test-blobPlotStabilityByAlgorithm.R b/tests/testthat/test-blobPlotStabilityByAlgorithm.R new file mode 100644 index 0000000..d0b0e62 --- /dev/null +++ b/tests/testthat/test-blobPlotStabilityByAlgorithm.R @@ -0,0 +1,52 @@ +test_that("blob plot for visualizing ranking stability by algorithm raises error for single-task data set", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2")) + + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + set.seed(1) + rankingBootstrapped <- ranking%>%bootstrap(nboot=10) + + expect_error(stabilityByAlgorithm(rankingBootstrapped), + "The stability of rankings by algorithm cannot be computed for less than two tasks.", fixed=TRUE) +}) + +test_that("blob plot for visualizing ranking stability by algorithm returns one plot for multi-task data set", { + dataTask1 <- cbind(task="T1", + rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2") + )) + dataTask2 <- cbind(task="T2", + rbind( + data.frame(algo="A1", value=0.2, case="C1"), + data.frame(algo="A2", value=0.3, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.7, case="C2"), + data.frame(algo="A2", value=0.8, case="C2"), + data.frame(algo="A3", value=0.9, case="C2") + )) + + data <- rbind(dataTask1, dataTask2) + + challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + set.seed(1) + rankingBootstrapped <- ranking%>%bootstrap(nboot=10) + + actualPlot <- stabilityByAlgorithm(rankingBootstrapped) + expect_is(actualPlot, "ggplot") +}) diff --git a/tests/testthat/test-blobPlotStabilityByTask.R b/tests/testthat/test-blobPlotStabilityByTask.R new file mode 100644 index 0000000..1f5d26d --- /dev/null +++ b/tests/testthat/test-blobPlotStabilityByTask.R @@ -0,0 +1,52 @@ +test_that("blob plot for visualizing ranking stability returns one plot for single-task data set", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2")) + + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + set.seed(1) + rankingBootstrapped <- ranking%>%bootstrap(nboot=10) + + actualPlot <- stabilityByTask(rankingBootstrapped) + expect_is(actualPlot, "ggplot") +}) + +test_that("blob plot for visualizing ranking stability returns one plot for 
multi-task data set", { + dataTask1 <- cbind(task="T1", + rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2") + )) + dataTask2 <- cbind(task="T2", + rbind( + data.frame(algo="A1", value=0.2, case="C1"), + data.frame(algo="A2", value=0.3, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.7, case="C2"), + data.frame(algo="A2", value=0.8, case="C2"), + data.frame(algo="A3", value=0.9, case="C2") + )) + + data <- rbind(dataTask1, dataTask2) + + challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + set.seed(1) + rankingBootstrapped <- ranking%>%bootstrap(nboot=10) + + actualPlot <- stabilityByTask(rankingBootstrapped) + expect_is(actualPlot, "ggplot") +}) diff --git a/tests/testthat/test-boxplot.R b/tests/testthat/test-boxplot.R new file mode 100644 index 0000000..66d8538 --- /dev/null +++ b/tests/testthat/test-boxplot.R @@ -0,0 +1,54 @@ +test_that("boxplot for ranked single-task data set has no title", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2")) + + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + actualPlot <- boxplot(ranking)[[1]] + expect_is(actualPlot, "ggplot") + expect_equal(actualPlot$labels$title, NULL) +}) + +test_that("boxplots for ranked multi-task data set have titles", { + dataTask1 <- cbind(task="T1", + rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2") + )) + dataTask2 <- cbind(task="T2", + rbind( + data.frame(algo="A1", value=0.2, case="C1"), + data.frame(algo="A2", value=0.3, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.7, case="C2"), + data.frame(algo="A2", value=0.8, case="C2"), + data.frame(algo="A3", value=0.9, case="C2") + )) + + data <- rbind(dataTask1, dataTask2) + + challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + actualPlots <- boxplot(ranking) + actualPlotTask1 <- actualPlots[[1]] + actualPlotTask2 <- actualPlots[[2]] + + expect_is(actualPlotTask1, "ggplot") + expect_equal(actualPlotTask1$labels$title, "T1") + + expect_is(actualPlotTask2, "ggplot") + expect_equal(actualPlotTask2$labels$title, "T2") +}) diff --git a/tests/testthat/test-challenge.R b/tests/testthat/test-challenge.R index b6dd5e0..e0ba197 100644 --- a/tests/testthat/test-challenge.R +++ b/tests/testthat/test-challenge.R @@ -1,507 +1,574 @@ -test_that("attributes are set for single-task challenge", { +test_that("empty attribute 'taskName' raises error for single-task challenge", { data <- rbind( 
- data.frame(algo="A1", value=0.8, case="C1"), - data.frame(algo="A1", value=0.4, case="C2"), - data.frame(algo="A2", value=0.6, case="C1"), - data.frame(algo="A2", value=0.7, case="C2")) + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1")) + + expect_error(as.challenge(data, taskName="", algorithm="algo", case="case", value="value", smallBetter=FALSE), + "Argument 'taskName' is empty.", fixed=TRUE) +}) + +test_that("only whitespaces in attribute 'taskName' raises error for single-task challenge", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1")) + + expect_error(as.challenge(data, taskName=" ", algorithm="algo", case="case", value="value", smallBetter=FALSE), + "Argument 'taskName' is empty.", fixed=TRUE) +}) + +test_that("attributes are set for single-task challenge with specified task name", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1")) + + actualChallenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + expect_equal(attr(actualChallenge, "annotator"), NULL) + expect_equal(attr(actualChallenge, "by"), "task") + expect_equal(attr(actualChallenge, "largeBetter"), TRUE) + expect_equal(attr(actualChallenge, "check"), TRUE) + + expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A2")) + expect_equal(as.vector(actualChallenge$T1$value), c(0.8, 0.6)) + expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C1")) + expect_equal(as.vector(actualChallenge$T1$task), c("T1", "T1")) + + # expect that there's no attribute "task" + expect_equal(attr(actualChallenge, "task"), NULL) + expect_equal(attr(actualChallenge$T1, "task"), NULL) + expect_equal(attr(actualChallenge$T2, "task"), NULL) +}) + +test_that("attributes are set for single-task challenge with dummy task name", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1")) actualChallenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter=FALSE) expect_equal(attr(actualChallenge, "annotator"), NULL) - expect_equal(attr(actualChallenge, "by"), NULL) + expect_equal(attr(actualChallenge, "by"), "task") expect_equal(attr(actualChallenge, "largeBetter"), TRUE) expect_equal(attr(actualChallenge, "check"), TRUE) - expect_equal(as.vector(actualChallenge$algo), c("A1", "A1", "A2", "A2")) - expect_equal(as.vector(actualChallenge$value), c(0.8, 0.4, 0.6, 0.7)) - expect_equal(as.vector(actualChallenge$case), c("C1", "C2", "C1", "C2")) + + expect_equal(as.vector(actualChallenge$dummyTask$algo), c("A1", "A2")) + expect_equal(as.vector(actualChallenge$dummyTask$value), c(0.8, 0.6)) + expect_equal(as.vector(actualChallenge$dummyTask$case), c("C1", "C1")) + expect_equal(as.vector(actualChallenge$dummyTask$task), c("dummyTask", "dummyTask")) + + # expect that there's no attribute "task" + expect_equal(attr(actualChallenge, "task"), NULL) + expect_equal(attr(actualChallenge$dummyTask, "task"), NULL) + expect_equal(attr(actualChallenge$dummyTask, "task"), NULL) +}) + +test_that("leading and trailing whitespaces are trimmed for attribute 'taskName'", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1")) + + actualChallenge <- as.challenge(data, taskName=" T1 ", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A2")) 
}) -test_that("attributes are set for multi-task challenge with sanity check enabled", { +test_that("attributes are set for multi-task challenge", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C1") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=0.2, case="C1"), data.frame(algo="A2", value=0.3, case="C1") )) data <- rbind(dataTask1, dataTask2) - actualChallenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=TRUE, check=TRUE) + actualChallenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=TRUE) expect_equal(attr(actualChallenge, "annotator"), NULL) expect_equal(attr(actualChallenge, "by"), "task") expect_equal(attr(actualChallenge, "largeBetter"), FALSE) expect_equal(attr(actualChallenge, "check"), TRUE) expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A2")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, 0.6)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C1")) expect_equal(as.vector(actualChallenge$T1$task), c("T1", "T1")) expect_equal(as.vector(actualChallenge$T2$algo), c("A1", "A2")) expect_equal(as.vector(actualChallenge$T2$value), c(0.2, 0.3)) expect_equal(as.vector(actualChallenge$T2$case), c("C1", "C1")) expect_equal(as.vector(actualChallenge$T2$task), c("T2", "T2")) # expect that there's no attribute "task" expect_equal(attr(actualChallenge, "task"), NULL) expect_equal(attr(actualChallenge$T1, "task"), NULL) expect_equal(attr(actualChallenge$T2, "task"), NULL) }) test_that("attributes are set for multi-task challenge with sanity check disabled", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C1") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=0.2, case="C1"), data.frame(algo="A2", value=0.3, case="C1") )) data <- rbind(dataTask1, dataTask2) actualChallenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=TRUE, check=FALSE) expect_equal(attr(actualChallenge, "annotator"), NULL) expect_equal(attr(actualChallenge, "by"), "task") expect_equal(attr(actualChallenge, "largeBetter"), FALSE) expect_equal(attr(actualChallenge, "check"), FALSE) expect_equal(as.vector(actualChallenge$algo), c("A1", "A2", "A1", "A2")) expect_equal(as.vector(actualChallenge$value), c(0.8, 0.6, 0.2, 0.3)) expect_equal(as.vector(actualChallenge$case), c("C1", "C1", "C1", "C1")) expect_equal(as.vector(actualChallenge$task), c("T1", "T1", "T2", "T2")) }) -test_that("missing algorithm performances are added as NA with sanity check enabled for single-task challenge", { - data <- rbind( - data.frame(algo="A1", value=0.8, case="C1"), - data.frame(algo="A2", value=0.6, case="C2")) +test_that("attribute 'taskName' is ignored for multi-task challenge", { + dataTask1 <- cbind(task="T1", + rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1") + )) + dataTask2 <- cbind(task="T2", + rbind( + data.frame(algo="A1", value=0.2, case="C1"), + data.frame(algo="A2", value=0.3, case="C1") + )) - expect_message(actualChallenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter=FALSE), - "Performance of not all algorithms is observed for all cases. 
Inserted as missings in following cases:") + data <- rbind(dataTask1, dataTask2) - expect_equal(as.vector(actualChallenge$algo), c("A1", "A1", "A2", "A2")) - expect_equal(as.vector(actualChallenge$value), c(0.8, NA, NA, 0.6)) - expect_equal(as.vector(actualChallenge$case), c("C1", "C2", "C1", "C2")) + expect_warning(as.challenge(data, taskName="T1", by="task", algorithm="algo", case="case", value="value", smallBetter=TRUE), + "Argument 'taskName' is ignored for multi-task data set.", fixed=TRUE) }) -test_that("missing algorithm performances are not added as NA with sanity check disabled for single-task challenge", { +test_that("missing algorithm performances are added as NAs for single-task challenge", { data <- rbind( - data.frame(algo="A1", value=0.8, case="C1"), - data.frame(algo="A2", value=0.6, case="C2")) + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C2")) - actualChallenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter=FALSE, check=FALSE) - - expect_equal(as.vector(actualChallenge$algo), c("A1", "A2")) - expect_equal(as.vector(actualChallenge$value), c(0.8, 0.6)) - expect_equal(as.vector(actualChallenge$case), c("C1", "C2")) -}) - -test_that("missing algorithm performances are added as NA with sanity check enabled for multi-task challenge (1 task in data set)", { - data <- cbind(task="T1", - rbind( - data.frame(algo="A1", value=0.8, case="C1"), - data.frame(algo="A2", value=0.6, case="C2") - )) - - expect_message(actualChallenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE), - "Performance of not all algorithms is observed for all cases in task T1. Inserted as missings in following cases:") + expect_message(actualChallenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE), + "Performance of not all algorithms is observed for all cases in task 'T1'. Inserted as missings in following cases:", fixed=TRUE) expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A1", "A2", "A2")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, NA, NA, 0.6)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C2", "C1", "C2")) }) -test_that("missing algorithm performances are not added as NA with sanity check disabled for multi-task challenge (1 task in data set)", { +test_that("multi-task data set containing one task is interpreted as single-task data set, missing algorithm performances are added", { data <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C2") )) - actualChallenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE, check=FALSE) + # do not specify parameter "by" to interpret multi-task data set as single-task data set + expect_message(actualChallenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE), + "Performance of not all algorithms is observed for all cases in task 'T1'. 
Inserted as missings in following cases:", fixed=TRUE) - expect_equal(as.vector(actualChallenge$algo), c("A1", "A2")) - expect_equal(as.vector(actualChallenge$value), c(0.8, 0.6)) - expect_equal(as.vector(actualChallenge$case), c("C1", "C2")) + expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A1", "A2", "A2")) + expect_equal(as.vector(actualChallenge$T1$value), c(0.8, NA, NA, 0.6)) + expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C2", "C1", "C2")) }) -test_that("missing algorithm performances are added as NA with sanity check enabled for multi-task challenge (2 tasks in data set)", { +test_that("missing algorithm performances are added as NAs for multi-task challenge (2 tasks in data set)", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C2") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=0.2, case="C1"), data.frame(algo="A1", value=0.3, case="C2"), data.frame(algo="A2", value=0.4, case="C1") )) data <- rbind(dataTask1, dataTask2) expect_message(actualChallenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE), - "Performance of not all algorithms is observed for all cases in task T1. Inserted as missings in following cases:") + "Performance of not all algorithms is observed for all cases in task 'T1'. Inserted as missings in following cases:", fixed=TRUE) expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A1", "A2", "A2")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, NA, NA, 0.6)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C2", "C1", "C2")) expect_equal(as.vector(actualChallenge$T2$algo), c("A1", "A1", "A2", "A2")) expect_equal(as.vector(actualChallenge$T2$value), c(0.2, 0.3, 0.4, NA)) expect_equal(as.vector(actualChallenge$T2$case), c("C1", "C2", "C1", "C2")) }) -test_that("case cannot appear more than once per algorithm with sanity check enabled for single-task challenge", { +test_that("missing algorithm performances are not added as NA with sanity check disabled for single-task challenge", { data <- rbind( - data.frame(algo="A1", value=0.8, case="C1"), - data.frame(algo="A1", value=0.8, case="C1")) + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C2")) + + actualChallenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE, check=FALSE) - expect_error(as.challenge(data, algorithm="algo", case="case", value="value", smallBetter=FALSE), - "Case(s) (C1) appear(s) more than once for the same algorithm", fixed=TRUE) + expect_equal(as.vector(actualChallenge$algo), c("A1", "A2")) + expect_equal(as.vector(actualChallenge$value), c(0.8, 0.6)) + expect_equal(as.vector(actualChallenge$case), c("C1", "C2")) }) -test_that("cases cannot appear more than once per algorithm with sanity check enabled for single-task challenge", { - data <- rbind( - data.frame(algo="A1", value=0.8, case="C1"), - data.frame(algo="A1", value=0.8, case="C1"), - data.frame(algo="A2", value=0.7, case="C1"), - data.frame(algo="A1", value=0.5, case="C2"), - data.frame(algo="A2", value=0.6, case="C2"), - data.frame(algo="A2", value=0.6, case="C2")) +test_that("missing algorithm performances are not added as NA with sanity check disabled for multi-task challenge (2 tasks in data set)", { + dataTask1 <- cbind(task="T1", + rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C2") + )) + dataTask2 <- 
cbind(task="T2", + rbind( + data.frame(algo="A1", value=0.2, case="C1"), + data.frame(algo="A1", value=0.3, case="C2"), + data.frame(algo="A2", value=0.4, case="C1") + )) + + data <- rbind(dataTask1, dataTask2) + + actualChallenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE, check=FALSE) - expect_error(as.challenge(data, algorithm="algo", case="case", value="value", smallBetter=FALSE), - "Case(s) (C1, C2) appear(s) more than once for the same algorithm", fixed=TRUE) + expect_equal(as.vector(actualChallenge$algo), c("A1", "A2", "A1", "A1", "A2")) + expect_equal(as.vector(actualChallenge$value), c(0.8, 0.6, 0.2, 0.3, 0.4)) + expect_equal(as.vector(actualChallenge$case), c("C1", "C2", "C1", "C2", "C1")) }) -test_that("cases cannot appear more than once per algorithm when missing data was added with sanity check enabled for single-task challenge", { +test_that("case cannot appear more than once per algorithm for single-task challenge", { data <- rbind( - data.frame(algo="A1", value=0.8, case="C1"), - data.frame(algo="A1", value=0.8, case="C1"), - data.frame(algo="A2", value=0.6, case="C2"), - data.frame(algo="A2", value=0.6, case="C2")) + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A1", value=0.8, case="C1")) - expect_error(as.challenge(data, algorithm="algo", case="case", value="value", smallBetter=FALSE), - "Case(s) (C1, C2) appear(s) more than once for the same algorithm", fixed=TRUE) + expect_error(as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE), + "Case(s) (C1) appear(s) more than once for the same algorithm in task 'T1'.", fixed=TRUE) }) -test_that("case cannot appear more than once per algorithm with sanity check enabled for multi-task challenge (1 task in data set)", { +test_that("multi-task data set containing one task is interpreted as single-task data set, case cannot appear more than once per algorithm", { + data <- cbind(task="T1", + rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A1", value=0.8, case="C1") + )) + + # do not specify parameter "by" to interpret multi-task data set as single-task data set + expect_error(as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE), + "Case(s) (C1) appear(s) more than once for the same algorithm in task 'T1'.", fixed=TRUE) +}) + +test_that("case cannot appear more than once per algorithm for multi-task challenge (1 task in data set)", { data <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A1", value=0.8, case="C1") )) expect_error(as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE), - "Case(s) (C1) appear(s) more than once for the same algorithm in task T1", fixed=TRUE) + "Case(s) (C1) appear(s) more than once for the same algorithm in task 'T1'.", fixed=TRUE) }) -test_that("cases cannot appear more than once per algorithm with sanity check enabled for multi-task challenge (1 task in data set)", { +test_that("cases cannot appear more than once per algorithm for single-task challenge", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.7, case="C1"), + data.frame(algo="A1", value=0.5, case="C2"), + data.frame(algo="A2", value=0.6, case="C2"), + data.frame(algo="A2", value=0.6, case="C2")) + + expect_error(as.challenge(data, taskName="T1", algorithm="algo", case="case", 
value="value", smallBetter=FALSE), + "Case(s) (C1, C2) appear(s) more than once for the same algorithm in task 'T1'.", fixed=TRUE) +}) + +test_that("cases cannot appear more than once per algorithm for multi-task challenge (1 task in data set)", { data <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.7, case="C1"), data.frame(algo="A1", value=0.5, case="C2"), data.frame(algo="A2", value=0.6, case="C2"), data.frame(algo="A2", value=0.6, case="C2") )) expect_error(as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE), - "Case(s) (C1, C2) appear(s) more than once for the same algorithm in task T1", fixed=TRUE) + "Case(s) (C1, C2) appear(s) more than once for the same algorithm in task 'T1'.", fixed=TRUE) }) -test_that("cases cannot appear more than once per algorithm with sanity check enabled for multi-task challenge (2 tasks in data set)", { +test_that("cases cannot appear more than once per algorithm for multi-task challenge (2 tasks in data set)", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1") # let T1 pass )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.7, case="C1"), data.frame(algo="A1", value=0.5, case="C2"), data.frame(algo="A2", value=0.6, case="C2"), data.frame(algo="A2", value=0.6, case="C2") )) data <- rbind(dataTask1, dataTask2) expect_error(as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE), - "Case(s) (C1, C2) appear(s) more than once for the same algorithm in task T2", fixed=TRUE) + "Case(s) (C1, C2) appear(s) more than once for the same algorithm in task 'T2'.", fixed=TRUE) }) -test_that("multi-task data set containing one task is interpreted as single-task data set, missing algorithm performances are added", { - data <- cbind(task="T1", - rbind( - data.frame(algo="A1", value=0.8, case="C1"), - data.frame(algo="A2", value=0.6, case="C2") - )) - - # do not specify parameter "by" to interpret multi-task data set as single-task data set - expect_message(actualChallenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter=FALSE), - "Performance of not all algorithms is observed for all cases. 
Inserted as missings in following cases:") - - expect_equal(as.vector(actualChallenge$algo), c("A1", "A1", "A2", "A2")) - expect_equal(as.vector(actualChallenge$value), c(0.8, NA, NA, 0.6)) - expect_equal(as.vector(actualChallenge$case), c("C1", "C2", "C1", "C2")) -}) - -test_that("multi-task data set containing one task is interpreted as single-task data set, case cannot appear more than once per algorithm", { - data <- cbind(task="T1", - rbind( - data.frame(algo="A1", value=0.8, case="C1"), - data.frame(algo="A1", value=0.8, case="C1") - )) +test_that("cases cannot appear more than once per algorithm when missing data was added for single-task challenge", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C2"), + data.frame(algo="A2", value=0.6, case="C2")) - # do not specify parameter "by" to interpret multi-task data set as single-task data set - expect_error(as.challenge(data, algorithm="algo", case="case", value="value", smallBetter=FALSE), - "Case(s) (C1) appear(s) more than once for the same algorithm", fixed=TRUE) + #expect_error(as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE), + # "Case(s) (C1, C2) appear(s) more than once for the same algorithm in task 'T1'.", fixed=TRUE) }) test_that("user is notified of duplicate cases when multi-task data set is interpreted as single-task data set (2 tasks in data set)", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=0.8, case="C1") )) data <- rbind(dataTask1, dataTask2) # do not specify parameter "by" to interpret multi-task data set as single-task data set - expect_error(as.challenge(data, algorithm="algo", case="case", value="value", smallBetter=FALSE), - "Case(s) (C1) appear(s) more than once for the same algorithm", fixed=TRUE) + expect_error(as.challenge(data, taskName="New task", algorithm="algo", case="case", value="value", smallBetter=FALSE), + "Case(s) (C1) appear(s) more than once for the same algorithm in task 'New task'.", fixed=TRUE) }) test_that("user is notified of missing algorithm performance when multi-task data set is interpreted as single-task data set (2 tasks in data set)", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A2", value=0.6, case="C2") )) data <- rbind(dataTask1, dataTask2) # do not specify parameter "by" to interpret multi-task data set as single-task data set - expect_message(as.challenge(data, algorithm="algo", case="case", value="value", smallBetter=FALSE), - "Performance of not all algorithms is observed for all cases. Inserted as missings in following cases:", fixed=TRUE) + expect_message(as.challenge(data, taskName="New task", algorithm="algo", case="case", value="value", smallBetter=FALSE), + "Performance of not all algorithms is observed for all cases in task 'New task'. 
Inserted as missings in following cases:", fixed=TRUE) }) test_that("NAs are replaced by numeric value for single-task challenge", { data <- rbind( - data.frame(algo="A1", value=0.8, case="C1"), - data.frame(algo="A1", value=NA, case="C2"), - data.frame(algo="A2", value=0.6, case="C1"), - data.frame(algo="A2", value=NA, case="C2")) - - actualChallenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat=0) - - expect_equal(as.vector(actualChallenge$algo), c("A1", "A1", "A2", "A2")) - expect_equal(as.vector(actualChallenge$value), c(0.8, 0.0, 0.6, 0.0)) - expect_equal(as.vector(actualChallenge$case), c("C1", "C2", "C1", "C2")) -}) - -test_that("NAs are replaced by function value for single-task challenge", { - data <- rbind( - data.frame(algo="A1", value=0.8, case="C1"), - data.frame(algo="A1", value=NA, case="C2"), - data.frame(algo="A2", value=0.6, case="C1"), - data.frame(algo="A2", value=NA, case="C2")) - - replacementFunction <- function(x) { 2 } - - actualChallenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat=replacementFunction) - - expect_equal(as.vector(actualChallenge$algo), c("A1", "A1", "A2", "A2")) - expect_equal(as.vector(actualChallenge$value), c(0.8, 2.0, 0.6, 2.0)) - expect_equal(as.vector(actualChallenge$case), c("C1", "C2", "C1", "C2")) -}) - -test_that("NAs are removed for single-task challenge", { - data <- rbind( - data.frame(algo="A1", value=0.8, case="C1"), - data.frame(algo="A1", value=NA, case="C2"), - data.frame(algo="A2", value=0.6, case="C1"), - data.frame(algo="A2", value=NA, case="C2")) - - actualChallenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat="na.rm") - - expect_equal(as.vector(actualChallenge$algo), c("A1", "A2")) - expect_equal(as.vector(actualChallenge$value), c(0.8, 0.6)) - expect_equal(as.vector(actualChallenge$case), c("C1", "C1")) -}) - -test_that("automatically added NAs are replaced by numeric value for single-task challenge", { - data <- rbind( - data.frame(algo="A1", value=0.8, case="C1"), - data.frame(algo="A2", value=0.6, case="C2")) + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A1", value=NA, case="C2"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A2", value=NA, case="C2")) - expect_message(actualChallenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat=0), - "Performance of not all algorithms is observed for all cases. Inserted as missings in following cases:") + actualChallenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat=0) - expect_equal(as.vector(actualChallenge$algo), c("A1", "A1", "A2", "A2")) - expect_equal(as.vector(actualChallenge$value), c(0.8, 0.0, 0.0, 0.6)) - expect_equal(as.vector(actualChallenge$case), c("C1", "C2", "C1", "C2")) -}) - -test_that("automatically added NAs are removed for single-task challenge", { - data <- rbind( - data.frame(algo="A1", value=0.8, case="C1"), - data.frame(algo="A2", value=0.6, case="C2")) - - expect_message(actualChallenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat="na.rm"), - "Performance of not all algorithms is observed for all cases. 
Inserted as missings in following cases:") - - expect_equal(as.vector(actualChallenge$algo), c("A1", "A2")) - expect_equal(as.vector(actualChallenge$value), c(0.8, 0.6)) - expect_equal(as.vector(actualChallenge$case), c("C1", "C2")) + expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A1", "A2", "A2")) + expect_equal(as.vector(actualChallenge$T1$value), c(0.8, 0.0, 0.6, 0.0)) + expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C2", "C1", "C2")) }) test_that("NAs are replaced by numeric value for multi-task challenge", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A1", value=NA, case="C2") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A2", value=NA, case="C1"), data.frame(algo="A2", value=0.5, case="C2") )) data <- rbind(dataTask1, dataTask2) actualChallenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat=0) expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A1")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, 0.0)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C2")) expect_equal(as.vector(actualChallenge$T2$algo), c("A2", "A2")) expect_equal(as.vector(actualChallenge$T2$value), c(0.0, 0.5)) expect_equal(as.vector(actualChallenge$T2$case), c("C1", "C2")) }) +test_that("NAs are replaced by function value for single-task challenge", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A1", value=NA, case="C2"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A2", value=NA, case="C2")) + + replacementFunction <- function(x) { 2 } + + actualChallenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat=replacementFunction) + + expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A1", "A2", "A2")) + expect_equal(as.vector(actualChallenge$T1$value), c(0.8, 2.0, 0.6, 2.0)) + expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C2", "C1", "C2")) +}) + test_that("NAs are replaced by function value for multi-task challenge", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A1", value=NA, case="C2") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A2", value=NA, case="C1"), data.frame(algo="A2", value=0.5, case="C2") )) data <- rbind(dataTask1, dataTask2) replacementFunction <- function(x) { 2 } actualChallenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat=replacementFunction) expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A1")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, 2.0)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C2")) expect_equal(as.vector(actualChallenge$T2$algo), c("A2", "A2")) expect_equal(as.vector(actualChallenge$T2$value), c(2.0, 0.5)) expect_equal(as.vector(actualChallenge$T2$case), c("C1", "C2")) }) +test_that("NAs are removed for single-task challenge", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A1", value=NA, case="C2"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A2", value=NA, case="C2")) + + actualChallenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat="na.rm") + + expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A2")) + expect_equal(as.vector(actualChallenge$T1$value), c(0.8, 0.6)) + 
expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C1")) +}) + test_that("NAs are removed for multi-task challenge", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A1", value=NA, case="C2") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A2", value=NA, case="C1"), data.frame(algo="A2", value=0.5, case="C2") )) data <- rbind(dataTask1, dataTask2) actualChallenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat="na.rm") expect_equal(as.vector(actualChallenge$T1$algo), c("A1")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8)) expect_equal(as.vector(actualChallenge$T1$case), c("C1")) expect_equal(as.vector(actualChallenge$T2$algo), c("A2")) expect_equal(as.vector(actualChallenge$T2$value), c(0.5)) expect_equal(as.vector(actualChallenge$T2$case), c("C2")) }) +test_that("automatically added NAs are replaced by numeric value for single-task challenge", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C2")) + + expect_message(actualChallenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat=0), + "Performance of not all algorithms is observed for all cases in task 'T1'. Inserted as missings in following cases:", fixed=TRUE) + + expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A1", "A2", "A2")) + expect_equal(as.vector(actualChallenge$T1$value), c(0.8, 0.0, 0.0, 0.6)) + expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C2", "C1", "C2")) +}) + test_that("automatically added NAs are replaced by numeric value for multi-task challenge", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C2") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=0.2, case="C1"), data.frame(algo="A1", value=0.3, case="C2"), data.frame(algo="A2", value=0.4, case="C1") )) data <- rbind(dataTask1, dataTask2) expect_message(actualChallenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat=0), - "Performance of not all algorithms is observed for all cases in task T1. Inserted as missings in following cases:") + "Performance of not all algorithms is observed for all cases in task 'T1'. Inserted as missings in following cases:", fixed=TRUE) expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A1", "A2", "A2")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, 0.0, 0.0, 0.6)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C2", "C1", "C2")) expect_equal(as.vector(actualChallenge$T2$algo), c("A1", "A1", "A2", "A2")) expect_equal(as.vector(actualChallenge$T2$value), c(0.2, 0.3, 0.4, 0.0)) expect_equal(as.vector(actualChallenge$T2$case), c("C1", "C2", "C1", "C2")) }) +test_that("automatically added NAs are removed for single-task challenge", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C2")) + + expect_message(actualChallenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat="na.rm"), + "Performance of not all algorithms is observed for all cases in task 'T1'. 
Inserted as missings in following cases:", fixed=TRUE) + + expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A2")) + expect_equal(as.vector(actualChallenge$T1$value), c(0.8, 0.6)) + expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C2")) +}) + test_that("automatically added NAs are removed for multi-task challenge", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C2") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=0.2, case="C1"), data.frame(algo="A1", value=0.3, case="C2"), data.frame(algo="A2", value=0.4, case="C1") )) data <- rbind(dataTask1, dataTask2) expect_message(actualChallenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat="na.rm"), - "Performance of not all algorithms is observed for all cases in task T1. Inserted as missings in following cases:") + "Performance of not all algorithms is observed for all cases in task 'T1'. Inserted as missings in following cases:", fixed=TRUE) expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A2")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, 0.6)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C2")) expect_equal(as.vector(actualChallenge$T2$algo), c("A1", "A1", "A2")) expect_equal(as.vector(actualChallenge$T2$value), c(0.2, 0.3, 0.4)) expect_equal(as.vector(actualChallenge$T2$case), c("C1", "C2", "C1")) -}) \ No newline at end of file +}) diff --git a/tests/testthat/test-linePlot.R b/tests/testthat/test-linePlot.R new file mode 100644 index 0000000..3c0fd7d --- /dev/null +++ b/tests/testthat/test-linePlot.R @@ -0,0 +1,42 @@ +test_that("line plot across ranking methods returns one plot for single-task data set", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2")) + + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + actualPlot <- methodsplot(challenge) + expect_is(actualPlot, "ggplot") +}) + +test_that("line plot across ranking methods returns one plot for multi-task data set", { + dataTask1 <- cbind(task="T1", + rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2") + )) + dataTask2 <- cbind(task="T2", + rbind( + data.frame(algo="A1", value=0.2, case="C1"), + data.frame(algo="A2", value=0.3, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.7, case="C2"), + data.frame(algo="A2", value=0.8, case="C2"), + data.frame(algo="A3", value=0.9, case="C2") + )) + + data <- rbind(dataTask1, dataTask2) + + challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + actualPlot <- methodsplot(challenge) + expect_is(actualPlot, "ggplot") +}) diff --git a/tests/testthat/test-networkPlot.R b/tests/testthat/test-networkPlot.R new file mode 100644 index 0000000..8aeb2af --- /dev/null +++ b/tests/testthat/test-networkPlot.R @@ -0,0 +1,85 @@ +test_that("cluster analysis network plot raises error for single-task data set", { + data <-
rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2")) + + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + expect_error(network(ranking, edge.col=grDevices::grey.colors, edge.lwd=1, cols=NULL), + "The cluster analysis is only sensible for more than two tasks.", fixed=TRUE) +}) + +test_that("cluster analysis network plot raises error for multi-task data set containing two tasks", { + dataTask1 <- cbind(task="T1", + rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2") + )) + dataTask2 <- cbind(task="T2", + rbind( + data.frame(algo="A1", value=0.2, case="C1"), + data.frame(algo="A2", value=0.3, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.7, case="C2"), + data.frame(algo="A2", value=0.8, case="C2"), + data.frame(algo="A3", value=0.9, case="C2") + )) + + data <- rbind(dataTask1, dataTask2) + + challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + expect_error(network(ranking, edge.col=grDevices::grey.colors, edge.lwd=1, cols=NULL), + "The cluster analysis is only sensible for more than two tasks.", fixed=TRUE) +}) + +test_that("cluster analysis network plot returns a network object for multi-task data set containing three tasks", { + dataTask1 <- cbind(task="T1", + rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2") + )) + dataTask2 <- cbind(task="T2", + rbind( + data.frame(algo="A1", value=0.2, case="C1"), + data.frame(algo="A2", value=0.3, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.7, case="C2"), + data.frame(algo="A2", value=0.8, case="C2"), + data.frame(algo="A3", value=0.9, case="C2") + )) + dataTask3 <- cbind(task="T3", + rbind( + data.frame(algo="A1", value=0.1, case="C1"), + data.frame(algo="A2", value=0.2, case="C1"), + data.frame(algo="A3", value=0.3, case="C1"), + data.frame(algo="A1", value=0.6, case="C2"), + data.frame(algo="A2", value=0.7, case="C2"), + data.frame(algo="A3", value=0.8, case="C2") + )) + + data <- rbind(dataTask1, dataTask2, dataTask3) + + challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + actualPlot <- network(ranking, edge.col=grDevices::grey.colors, edge.lwd=1, cols=NULL) + expect_is(actualPlot, "network") +}) diff --git a/tests/testthat/test-rankThenAggregate.R b/tests/testthat/test-rankThenAggregate.R index ae63900..8ce5459 100644 --- a/tests/testthat/test-rankThenAggregate.R +++ b/tests/testthat/test-rankThenAggregate.R @@ -1,286 +1,286 @@ 
test_that("rank-then-aggregate by mean works with two algorithms for one case, small values are better", { data <- rbind( data.frame(algo="A1", value=0.6, case="C1"), data.frame(algo="A2", value=0.8, case="C1")) - challenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter = TRUE) + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE) ranking <- challenge%>%rankThenAggregate(FUN = mean) expectedRanking <- rbind( - "A1" = data.frame(rank_mean = 1, rank = 1), - "A2" = data.frame(rank_mean = 2, rank = 2)) + "A1" = data.frame(rank_FUN = 1, rank = 1), + "A2" = data.frame(rank_FUN = 2, rank = 2)) - expect_equal(ranking$mat, expectedRanking) + expect_equal(ranking$matlist$T1, expectedRanking) }) test_that("rank-then-aggregate by mean works with two algorithms (reverse order) for one case, small values are better", { data <- rbind( data.frame(algo = "A2", value = 0.8, case = "C1"), data.frame(algo = "A1", value = 0.6, case = "C1")) - challenge <- as.challenge(data, algorithm = "algo", case = "case", value = "value", smallBetter = TRUE) + challenge <- as.challenge(data, taskName="T1", algorithm = "algo", case = "case", value = "value", smallBetter = TRUE) ranking <- challenge%>%rankThenAggregate(FUN = mean) - expectedRanking <- rbind("A2" = data.frame(rank_mean = 2, rank = 2), - "A1" = data.frame(rank_mean = 1, rank = 1)) + expectedRanking <- rbind("A2" = data.frame(rank_FUN = 2, rank = 2), + "A1" = data.frame(rank_FUN = 1, rank = 1)) - expect_equal(ranking$mat, expectedRanking) + expect_equal(ranking$matlist$T1, expectedRanking) }) test_that("rank-then-aggregate by mean works with two algorithms for one case, large values are better", { data <- rbind( data.frame(algo="A1", value=0.6, case="C1"), data.frame(algo="A2", value=0.8, case="C1")) - challenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter = FALSE) + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = FALSE) ranking <- challenge%>%rankThenAggregate(FUN = mean) expectedRanking <- rbind( - "A1" = data.frame(rank_mean = 2, rank = 2), - "A2" = data.frame(rank_mean = 1, rank = 1)) + "A1" = data.frame(rank_FUN = 2, rank = 2), + "A2" = data.frame(rank_FUN = 1, rank = 1)) - expect_equal(ranking$mat, expectedRanking) + expect_equal(ranking$matlist$T1, expectedRanking) }) test_that("rank-then-aggregate by mean works with two algorithms (reverse order) for one case, large values are better", { data <- rbind( data.frame(algo = "A2", value = 0.8, case = "C1"), data.frame(algo = "A1", value = 0.6, case = "C1")) - challenge <- as.challenge(data, algorithm = "algo", case = "case", value = "value", smallBetter = FALSE) + challenge <- as.challenge(data, taskName="T1", algorithm = "algo", case = "case", value = "value", smallBetter = FALSE) ranking <- challenge%>%rankThenAggregate(FUN = mean) - expectedRanking <- rbind("A2" = data.frame(rank_mean = 1, rank = 1), - "A1" = data.frame(rank_mean = 2, rank = 2)) + expectedRanking <- rbind("A2" = data.frame(rank_FUN = 1, rank = 1), + "A1" = data.frame(rank_FUN = 2, rank = 2)) - expect_equal(ranking$mat, expectedRanking) + expect_equal(ranking$matlist$T1, expectedRanking) }) test_that("rank-then-aggregate raises error for invalid aggregation function", { data <- rbind( data.frame(algo="A1", value=0.6, case="C1"), data.frame(algo="A2", value=0.6, case="C1")) - challenge <- as.challenge(data, algorithm="algo", case="case", value="value", 
smallBetter = TRUE) + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE) expect_error(challenge%>%rankThenAggregate(FUN = meanx), "object 'meanx' not found", fixed = TRUE) }) test_that("rank-then-aggregate by mean works with two algorithms for one case and 'min' as ties method", { data <- rbind( data.frame(algo="A1", value=0.6, case="C1"), data.frame(algo="A2", value=0.6, case="C1")) - challenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter = TRUE) + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE) ranking <- challenge%>%rankThenAggregate(FUN = mean, ties.method = "min") expectedRanking <- rbind( - "A1" = data.frame(rank_mean = 1, rank = 1), - "A2" = data.frame(rank_mean = 1, rank = 1)) + "A1" = data.frame(rank_FUN = 1, rank = 1), + "A2" = data.frame(rank_FUN = 1, rank = 1)) - expect_equal(ranking$mat, expectedRanking) + expect_equal(ranking$matlist$T1, expectedRanking) }) test_that("rank-then-aggregate by mean works with two algorithms for one case and 'max' as ties method", { data <- rbind( data.frame(algo="A1", value=0.6, case="C1"), data.frame(algo="A2", value=0.6, case="C1")) - challenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter = TRUE) + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE) ranking <- challenge%>%rankThenAggregate(FUN = mean, ties.method = "max") expectedRanking <- rbind( - "A1" = data.frame(rank_mean = 2, rank = 2), - "A2" = data.frame(rank_mean = 2, rank = 2)) + "A1" = data.frame(rank_FUN = 2, rank = 2), + "A2" = data.frame(rank_FUN = 2, rank = 2)) - expect_equal(ranking$mat, expectedRanking) + expect_equal(ranking$matlist$T1, expectedRanking) }) test_that("rank-then-aggregate raises error for invalid ties method", { data <- rbind( data.frame(algo="A1", value=0.6, case="C1"), data.frame(algo="A2", value=0.6, case="C1")) - challenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter = TRUE) + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE) expect_error(challenge%>%rankThenAggregate(FUN = mean, ties.method = "maxx"), "'arg' should be one of \"average\", \"first\", \"last\", \"random\", \"max\", \"min\"", fixed = TRUE) }) test_that("rank-then-aggregate raises error for invalid ties method even when no ties present", { data <- rbind( data.frame(algo="A1", value=0.6, case="C1"), data.frame(algo="A2", value=0.8, case="C1")) - challenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter = TRUE) + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE) expect_error(challenge%>%rankThenAggregate(FUN = mean, ties.method = "maxx"), "'arg' should be one of \"average\", \"first\", \"last\", \"random\", \"max\", \"min\"", fixed = TRUE) }) test_that("rank-then-aggregate by mean works with two algorithms for two cases", { data <- rbind( data.frame(algo="A1", value=0.6, case="C1"), data.frame(algo="A1", value=0.4, case="C2"), data.frame(algo="A2", value=0.8, case="C1"), data.frame(algo="A2", value=1.0, case="C2")) - challenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter = TRUE) + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE) ranking <- 
challenge%>%rankThenAggregate(FUN = mean) expectedRanking <- rbind( - "A1" = data.frame(rank_mean = 1, rank = 1), - "A2" = data.frame(rank_mean = 2, rank = 2)) + "A1" = data.frame(rank_FUN = 1, rank = 1), + "A2" = data.frame(rank_FUN = 2, rank = 2)) - expect_equal(ranking$mat, expectedRanking) + expect_equal(ranking$matlist$T1, expectedRanking) }) test_that("rank-then-aggregate by median works with two algorithms for two cases", { data <- rbind( data.frame(algo="A1", value=0.6, case="C1"), data.frame(algo="A1", value=0.4, case="C2"), data.frame(algo="A2", value=0.8, case="C1"), data.frame(algo="A2", value=1.0, case="C2")) - challenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter = TRUE) + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE) ranking <- challenge%>%rankThenAggregate(FUN = median) expectedRanking <- rbind( - "A1" = data.frame(rank_median = 1, rank = 1), - "A2" = data.frame(rank_median = 2, rank = 2)) + "A1" = data.frame(rank_FUN = 1, rank = 1), + "A2" = data.frame(rank_FUN = 2, rank = 2)) - expect_equal(ranking$mat, expectedRanking) + expect_equal(ranking$matlist$T1, expectedRanking) }) test_that("rank-then-aggregate by mean works with one algorithm for one case", { data <- rbind( data.frame(algo="A1", value=0.6, case="C1")) - challenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter = TRUE) + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE) ranking <- challenge%>%rankThenAggregate(FUN = mean) expectedRanking <- rbind( - "A1" = data.frame(rank_mean = 1, rank = 1)) + "A1" = data.frame(rank_FUN = 1, rank = 1)) - expect_equal(ranking$mat, expectedRanking) + expect_equal(ranking$matlist$T1, expectedRanking) }) test_that("rank-then-aggregate assigns worst rank for NA", { data <- rbind( data.frame(algo="A1", value=NA, case="C1"), data.frame(algo="A2", value=0.8, case="C1")) - challenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter = FALSE) + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = FALSE) ranking <- challenge%>%rankThenAggregate(FUN = mean) expectedRanking <- rbind( - "A1" = data.frame(rank_mean = 2, rank = 2), - "A2" = data.frame(rank_mean = 1, rank = 1)) + "A1" = data.frame(rank_FUN = 2, rank = 2), + "A2" = data.frame(rank_FUN = 1, rank = 1)) - expect_equal(ranking$mat, expectedRanking) + expect_equal(ranking$matlist$T1, expectedRanking) }) test_that("rank-then-aggregate raises error for unused NA treatment argument", { data <- rbind( data.frame(algo="A1", value=NA, case="C1"), data.frame(algo="A2", value=0.8, case="C1")) - challenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter = FALSE) + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = FALSE) expect_error(challenge%>%rankThenAggregate(FUN = mean, na.treat = 0), "unused argument (na.treat = 0)", fixed = TRUE) }) test_that("rank-then-aggregate by mean works for multi-task challenge (2 tasks in data set), no missing data", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.6, case="C1"), data.frame(algo="A2", value=0.8, case="C1") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=0.5, case="C1"), data.frame(algo="A2", value=0.4, case="C1") )) data <- rbind(dataTask1, dataTask2) challenge <- as.challenge(data, by="task", 
algorithm="algo", case="case", value="value", smallBetter = TRUE) ranking <- challenge%>%rankThenAggregate(FUN = mean) expectedRankingTask1 <- rbind( "A1" = data.frame(rank_FUN = 1, rank = 1), "A2" = data.frame(rank_FUN = 2, rank = 2)) expectedRankingTask2 <- rbind( "A1" = data.frame(rank_FUN = 2, rank = 2), "A2" = data.frame(rank_FUN = 1, rank = 1)) expect_equal(ranking$matlist$T1, expectedRankingTask1) expect_equal(ranking$matlist$T2, expectedRankingTask2) }) test_that("rank-then-aggregate assigns worst rank for NA in multi-task challenge (2 tasks in data set)", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.6, case="C1"), data.frame(algo="A2", value=0.8, case="C1") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=NA, case="C1"), data.frame(algo="A2", value=0.4, case="C1") )) data <- rbind(dataTask1, dataTask2) challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter = TRUE) ranking <- challenge%>%rankThenAggregate(FUN = mean) expectedRankingTask1 <- rbind( "A1" = data.frame(rank_FUN = 1, rank = 1), "A2" = data.frame(rank_FUN = 2, rank = 2)) expectedRankingTask2 <- rbind( "A1" = data.frame(rank_FUN = 2, rank = 2), "A2" = data.frame(rank_FUN = 1, rank = 1)) expect_equal(ranking$matlist$T1, expectedRankingTask1) expect_equal(ranking$matlist$T2, expectedRankingTask2) }) test_that("rank-then-aggregate raises error for unused NA treatment argument in multi-task challenge (2 tasks in data set)", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.6, case="C1"), data.frame(algo="A2", value=0.8, case="C1") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=NA, case="C1"), data.frame(algo="A2", value=0.4, case="C1") )) data <- rbind(dataTask1, dataTask2) challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter = TRUE) expect_error(challenge%>%rankThenAggregate(FUN = mean, na.treat = "na.rm"), "unused argument (na.treat = \"na.rm\")", fixed = TRUE) }) diff --git a/tests/testthat/test-rankingHeatmap.R b/tests/testthat/test-rankingHeatmap.R new file mode 100644 index 0000000..5c77967 --- /dev/null +++ b/tests/testthat/test-rankingHeatmap.R @@ -0,0 +1,54 @@ +test_that("ranking heatmap for single-task data set has no title", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2")) + + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + actualPlot <- rankingHeatmap(ranking)[[1]] + expect_is(actualPlot, "ggplot") + expect_equal(actualPlot$labels$title, NULL) +}) + +test_that("ranking heatmap for multi-task data set have titles", { + dataTask1 <- cbind(task="T1", + rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2") + )) + dataTask2 <- cbind(task="T2", + rbind( + data.frame(algo="A1", value=0.2, case="C1"), + data.frame(algo="A2", value=0.3, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.7, 
case="C2"), + data.frame(algo="A2", value=0.8, case="C2"), + data.frame(algo="A3", value=0.9, case="C2") + )) + + data <- rbind(dataTask1, dataTask2) + + challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + actualPlots <- rankingHeatmap(ranking) + actualPlotTask1 <- actualPlots[[1]] + actualPlotTask2 <- actualPlots[[2]] + + expect_is(actualPlotTask1, "ggplot") + expect_equal(actualPlotTask1$labels$title, "T1") + + expect_is(actualPlotTask2, "ggplot") + expect_equal(actualPlotTask2$labels$title, "T2") +}) diff --git a/tests/testthat/test-report.R b/tests/testthat/test-report.R new file mode 100644 index 0000000..406e679 --- /dev/null +++ b/tests/testthat/test-report.R @@ -0,0 +1,467 @@ +test_that("PDF report for single-task data set without bootstrapping is created", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1")) + + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + ranking %>% + report(title="Test Challenge", + file="testthat_single_task_no_bootstrapping", + format="PDF", + clean=TRUE, + open=FALSE) + + expect_true(file.exists("testthat_single_task_no_bootstrapping.pdf")) + + # Clean up + if (file.exists("testthat_single_task_no_bootstrapping.pdf")) { + file.remove("testthat_single_task_no_bootstrapping.pdf") + } + +}) + +test_that("HTML report for single-task data set without bootstrapping is created", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1")) + + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + ranking %>% + report(title="Test Challenge", + file="testthat_single_task_no_bootstrapping", + format="HTML", + clean=TRUE, + open=FALSE) + + expect_true(file.exists("testthat_single_task_no_bootstrapping.html")) + + # Clean up + if (file.exists("testthat_single_task_no_bootstrapping.html")) { + file.remove("testthat_single_task_no_bootstrapping.html") + } + +}) + +test_that("Word report for single-task data set without bootstrapping is created", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1")) + + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + ranking %>% + report(title="Test Challenge", + file="testthat_single_task_no_bootstrapping", + format="Word", + clean=TRUE, + open=FALSE) + + expect_true(file.exists("testthat_single_task_no_bootstrapping.docx")) + + # Clean up + if (file.exists("testthat_single_task_no_bootstrapping.docx")) { + file.remove("testthat_single_task_no_bootstrapping.docx") + } + +}) + +test_that("PDF report for single-task data set with bootstrapping is created", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2")) + + challenge <- as.challenge(data, taskName="T1", algorithm="algo", 
case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + set.seed(1) + rankingBootstrapped <- ranking%>%bootstrap(nboot=10) + + rankingBootstrapped %>% + report(title="Test Challenge", + file="testthat_single_task_bootstrapping", + format="PDF", + clean=TRUE, + open=FALSE) + + expect_true(file.exists("testthat_single_task_bootstrapping.pdf")) + + # Clean up + if (file.exists("testthat_single_task_bootstrapping.pdf")) { + file.remove("testthat_single_task_bootstrapping.pdf") + } + +}) + +test_that("HTML report for single-task data set with bootstrapping is created", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2")) + + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + set.seed(1) + rankingBootstrapped <- ranking%>%bootstrap(nboot=10) + + rankingBootstrapped %>% + report(title="Test Challenge", + file="testthat_single_task_bootstrapping", + format="HTML", + clean=TRUE, + open=FALSE) + + expect_true(file.exists("testthat_single_task_bootstrapping.html")) + + # Clean up + if (file.exists("testthat_single_task_bootstrapping.html")) { + file.remove("testthat_single_task_bootstrapping.html") + } + +}) + +test_that("Word report for single-task data set with bootstrapping is created", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2")) + + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + set.seed(1) + rankingBootstrapped <- ranking%>%bootstrap(nboot=10) + + rankingBootstrapped %>% + report(title="Test Challenge", + file="testthat_single_task_bootstrapping", + format="Word", + clean=TRUE, + open=FALSE) + + expect_true(file.exists("testthat_single_task_bootstrapping.docx")) + + # Clean up + if (file.exists("testthat_single_task_bootstrapping.docx")) { + file.remove("testthat_single_task_bootstrapping.docx") + } + +}) + +test_that("PDF report for multi-task data set without bootstrapping is created", { + dataTask1 <- cbind(task="T1", + rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1") + )) + dataTask2 <- cbind(task="T2", + rbind( + data.frame(algo="A1", value=0.2, case="C1"), + data.frame(algo="A2", value=0.3, case="C1") + )) + dataTask3 <- cbind(task="T3", + rbind( + data.frame(algo="A1", value=0.1, case="C1"), + data.frame(algo="A2", value=0.8, case="C1") + )) + + data <- rbind(dataTask1, dataTask2, dataTask3) + + challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + meanRanks <- ranking%>%consensus(method = "euclidean") + + ranking %>% + report(consensus=meanRanks, + title="Test Challenge", + file="testthat_multi_task_no_bootstrapping", + format="PDF", + clean=TRUE, + open=FALSE) 
+ + expect_true(file.exists("testthat_multi_task_no_bootstrapping.pdf")) + + # Clean up + if (file.exists("testthat_multi_task_no_bootstrapping.pdf")) { + file.remove("testthat_multi_task_no_bootstrapping.pdf") + } + +}) + +test_that("HTML report for multi-task data set without bootstrapping is created", { + dataTask1 <- cbind(task="T1", + rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1") + )) + dataTask2 <- cbind(task="T2", + rbind( + data.frame(algo="A1", value=0.2, case="C1"), + data.frame(algo="A2", value=0.3, case="C1") + )) + dataTask3 <- cbind(task="T3", + rbind( + data.frame(algo="A1", value=0.1, case="C1"), + data.frame(algo="A2", value=0.8, case="C1") + )) + + data <- rbind(dataTask1, dataTask2, dataTask3) + + challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + meanRanks <- ranking%>%consensus(method = "euclidean") + + ranking %>% + report(consensus=meanRanks, + title="Test Challenge", + file="testthat_multi_task_no_bootstrapping", + format="HTML", + clean=TRUE, + open=FALSE) + + expect_true(file.exists("testthat_multi_task_no_bootstrapping.html")) + + # Clean up + if (file.exists("testthat_multi_task_no_bootstrapping.html")) { + file.remove("testthat_multi_task_no_bootstrapping.html") + } + +}) + +test_that("Word report for multi-task data set without bootstrapping is created", { + dataTask1 <- cbind(task="T1", + rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1") + )) + dataTask2 <- cbind(task="T2", + rbind( + data.frame(algo="A1", value=0.2, case="C1"), + data.frame(algo="A2", value=0.3, case="C1") + )) + dataTask3 <- cbind(task="T3", + rbind( + data.frame(algo="A1", value=0.1, case="C1"), + data.frame(algo="A2", value=0.8, case="C1") + )) + + data <- rbind(dataTask1, dataTask2, dataTask3) + + challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + meanRanks <- ranking%>%consensus(method = "euclidean") + + ranking %>% + report(consensus=meanRanks, + title="Test Challenge", + file="testthat_multi_task_no_bootstrapping", + format="Word", + clean=TRUE, + open=FALSE) + + expect_true(file.exists("testthat_multi_task_no_bootstrapping.docx")) + + # Clean up + if (file.exists("testthat_multi_task_no_bootstrapping.docx")) { + file.remove("testthat_multi_task_no_bootstrapping.docx") + } + +}) + +test_that("PDF report for multi-task data set with bootstrapping is created", { + dataTask1 <- cbind(task="T1", + rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2") + )) + dataTask2 <- cbind(task="T2", + rbind( + data.frame(algo="A1", value=0.2, case="C1"), + data.frame(algo="A2", value=0.3, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.7, case="C2"), + data.frame(algo="A2", value=0.8, case="C2"), + data.frame(algo="A3", value=0.9, case="C2") + )) + dataTask3 <- cbind(task="T3", + rbind( + data.frame(algo="A1", value=0.1, case="C1"), + data.frame(algo="A2", value=0.2, case="C1"), + data.frame(algo="A3", value=0.3, case="C1"), + data.frame(algo="A1", value=0.6, 
case="C2"), + data.frame(algo="A2", value=0.7, case="C2"), + data.frame(algo="A3", value=0.8, case="C2") + )) + + data <- rbind(dataTask1, dataTask2, dataTask3) + + challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + meanRanks <- ranking%>%consensus(method = "euclidean") + + set.seed(1) + rankingBootstrapped <- ranking%>%bootstrap(nboot=10) + + rankingBootstrapped %>% + report(consensus=meanRanks, + title="Test Challenge", + file="testthat_multi_task_bootstrapping", + format="PDF", + clean=TRUE, + open=FALSE) + + expect_true(file.exists("testthat_multi_task_bootstrapping.pdf")) + + # Clean up + if (file.exists("testthat_multi_task_bootstrapping.pdf")) { + file.remove("testthat_multi_task_bootstrapping.pdf") + } + +}) + +test_that("HTML report for multi-task data set with bootstrapping is created", { + dataTask1 <- cbind(task="T1", + rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2") + )) + dataTask2 <- cbind(task="T2", + rbind( + data.frame(algo="A1", value=0.2, case="C1"), + data.frame(algo="A2", value=0.3, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.7, case="C2"), + data.frame(algo="A2", value=0.8, case="C2"), + data.frame(algo="A3", value=0.9, case="C2") + )) + dataTask3 <- cbind(task="T3", + rbind( + data.frame(algo="A1", value=0.1, case="C1"), + data.frame(algo="A2", value=0.2, case="C1"), + data.frame(algo="A3", value=0.3, case="C1"), + data.frame(algo="A1", value=0.6, case="C2"), + data.frame(algo="A2", value=0.7, case="C2"), + data.frame(algo="A3", value=0.8, case="C2") + )) + + data <- rbind(dataTask1, dataTask2, dataTask3) + + challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + meanRanks <- ranking%>%consensus(method = "euclidean") + + set.seed(1) + rankingBootstrapped <- ranking%>%bootstrap(nboot=10) + + rankingBootstrapped %>% + report(consensus=meanRanks, + title="Test Challenge", + file="testthat_multi_task_bootstrapping", + format="HTML", + clean=TRUE, + open=FALSE) + + expect_true(file.exists("testthat_multi_task_bootstrapping.html")) + + # Clean up + if (file.exists("testthat_multi_task_bootstrapping.html")) { + file.remove("testthat_multi_task_bootstrapping.html") + } + +}) + +test_that("Word report for multi-task data set with bootstrapping is created", { + dataTask1 <- cbind(task="T1", + rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2") + )) + dataTask2 <- cbind(task="T2", + rbind( + data.frame(algo="A1", value=0.2, case="C1"), + data.frame(algo="A2", value=0.3, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.7, case="C2"), + data.frame(algo="A2", value=0.8, case="C2"), + data.frame(algo="A3", value=0.9, case="C2") + )) + dataTask3 <- cbind(task="T3", + rbind( + data.frame(algo="A1", value=0.1, case="C1"), + data.frame(algo="A2", value=0.2, 
case="C1"), + data.frame(algo="A3", value=0.3, case="C1"), + data.frame(algo="A1", value=0.6, case="C2"), + data.frame(algo="A2", value=0.7, case="C2"), + data.frame(algo="A3", value=0.8, case="C2") + )) + + data <- rbind(dataTask1, dataTask2, dataTask3) + + challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + meanRanks <- ranking%>%consensus(method = "euclidean") + + set.seed(1) + rankingBootstrapped <- ranking%>%bootstrap(nboot=10) + + rankingBootstrapped %>% + report(consensus=meanRanks, + title="Test Challenge", + file="testthat_multi_task_bootstrapping", + format="Word", + clean=TRUE, + open=FALSE) + + expect_true(file.exists("testthat_multi_task_bootstrapping.docx")) + + # Clean up + if (file.exists("testthat_multi_task_bootstrapping.docx")) { + file.remove("testthat_multi_task_bootstrapping.docx") + } + +}) diff --git a/tests/testthat/test-significanceMap.R b/tests/testthat/test-significanceMap.R new file mode 100644 index 0000000..6f04755 --- /dev/null +++ b/tests/testthat/test-significanceMap.R @@ -0,0 +1,54 @@ +test_that("significance map for single-task data set has no title", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2")) + + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + actualPlot <- significanceMap(ranking)[[1]] + expect_is(actualPlot, "ggplot") + expect_equal(actualPlot$labels$title, NULL) +}) + +test_that("significance map for multi-task data set have titles", { + dataTask1 <- cbind(task="T1", + rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2") + )) + dataTask2 <- cbind(task="T2", + rbind( + data.frame(algo="A1", value=0.2, case="C1"), + data.frame(algo="A2", value=0.3, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.7, case="C2"), + data.frame(algo="A2", value=0.8, case="C2"), + data.frame(algo="A3", value=0.9, case="C2") + )) + + data <- rbind(dataTask1, dataTask2) + + challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + actualPlots <- significanceMap(ranking) + actualPlotTask1 <- actualPlots[[1]] + actualPlotTask2 <- actualPlots[[2]] + + expect_is(actualPlotTask1, "ggplot") + expect_equal(actualPlotTask1$labels$title, "T1") + + expect_is(actualPlotTask2, "ggplot") + expect_equal(actualPlotTask2$labels$title, "T2") +}) diff --git a/tests/testthat/test-stackedBarPlotStabilityByAlgorithm.R b/tests/testthat/test-stackedBarPlotStabilityByAlgorithm.R new file mode 100644 index 0000000..507dfa2 --- /dev/null +++ b/tests/testthat/test-stackedBarPlotStabilityByAlgorithm.R @@ -0,0 +1,52 @@ +test_that("stacked bar plot for visualizing ranking stability by algorithm raises error for single-task data set", { + data <- rbind( + data.frame(algo="A1", 
value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2")) + + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + set.seed(1) + rankingBootstrapped <- ranking%>%bootstrap(nboot=10) + + expect_error(stabilityByAlgorithmStacked(rankingBootstrapped), + "The stability of rankings by algorithm cannot be computed for less than two tasks.", fixed=TRUE) +}) + +test_that("stacked bar plot for visualizing ranking stability by algorithm returns one plot for multi-task data set", { + dataTask1 <- cbind(task="T1", + rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2") + )) + dataTask2 <- cbind(task="T2", + rbind( + data.frame(algo="A1", value=0.2, case="C1"), + data.frame(algo="A2", value=0.3, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.7, case="C2"), + data.frame(algo="A2", value=0.8, case="C2"), + data.frame(algo="A3", value=0.9, case="C2") + )) + + data <- rbind(dataTask1, dataTask2) + + challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + set.seed(1) + rankingBootstrapped <- ranking%>%bootstrap(nboot=10) + + actualPlot <- stabilityByAlgorithmStacked(rankingBootstrapped) + expect_is(actualPlot, "ggplot") +}) diff --git a/tests/testthat/test-subset.R b/tests/testthat/test-subset.R new file mode 100644 index 0000000..31622fb --- /dev/null +++ b/tests/testthat/test-subset.R @@ -0,0 +1,220 @@ +test_that("top 2 performing algorithms are extracted and data set is reduced respectively", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2")) + + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=mean, ties.method="min") + + rankingSubset <- subset(ranking, top=2) + + expectedRankingSubset <- rbind( + "A1" = data.frame(value_FUN = 0.5, rank = 1), + "A2" = data.frame(value_FUN = 0.35, rank = 2)) + + expect_equal(rankingSubset$matlist$T1, expectedRankingSubset) + + expect_equal(as.vector(rankingSubset$data$T1$algo), c("A1", "A2", "A1", "A2")) + expect_equal(as.vector(rankingSubset$data$T1$value), c(0.8, 0.6, 0.2, 0.1)) + expect_equal(as.vector(rankingSubset$data$T1$case), c("C1", "C1", "C2", "C2")) + expect_equal(as.vector(rankingSubset$data$T1$task), c("T1", "T1", "T1", "T1")) +}) + +test_that("extraction of subset raises error for multi-task data set", { + dataTask1 <- cbind(task="T1", + rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, 
case="C2"), + data.frame(algo="A3", value=0.0, case="C2") + )) + dataTask2 <- cbind(task="T2", + rbind( + data.frame(algo="A1", value=0.2, case="C1"), + data.frame(algo="A2", value=0.3, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.7, case="C2"), + data.frame(algo="A2", value=0.8, case="C2"), + data.frame(algo="A3", value=0.9, case="C2") + )) + + data <- rbind(dataTask1, dataTask2) + + challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=mean, ties.method="min") + + expect_error(subset(ranking, top=2), + "Subset of algorithms only sensible for single-task challenges.", fixed=TRUE) +}) + +test_that("extraction of subset returns all algorithms even when more are requested", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2")) + + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=mean, ties.method="min") + + rankingSubset <- subset(ranking, top=4) + + expectedRankingSubset <- rbind( + "A1" = data.frame(value_FUN = 0.5, rank = 1), + "A2" = data.frame(value_FUN = 0.35, rank = 2), + "A3" = data.frame(value_FUN = 0.2, rank = 3)) + + expect_equal(rankingSubset$matlist$T1, expectedRankingSubset) +}) + +test_that("extraction of subset returns more algorithms then requested when ties are present", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.8, case="C1"), + data.frame(algo="A3", value=0.8, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.2, case="C2"), + data.frame(algo="A3", value=0.2, case="C2")) + + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=mean, ties.method="min") + + rankingSubset <- subset(ranking, top=2) + + expectedRankingSubset <- rbind( + "A1" = data.frame(value_FUN = 0.5, rank = 1), + "A2" = data.frame(value_FUN = 0.5, rank = 1), + "A3" = data.frame(value_FUN = 0.5, rank = 1)) + + expect_equal(rankingSubset$matlist$T1, expectedRankingSubset) +}) + +test_that("top 2 performing algorithms are extracted from bootstrap ranking and data set is reduced respectively", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2")) + + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=mean, ties.method="min") + + set.seed(1) + rankingBootstrapped <- ranking%>%bootstrap(nboot=10) + + rankingBootstrappedSubset <- subset(rankingBootstrapped, top=2) + + expectedRankingSubset <- rbind( + "A1" = data.frame(value_FUN = 0.5, rank = 1), + "A2" = data.frame(value_FUN = 0.35, rank = 2)) + + expect_equal(rankingBootstrappedSubset$matlist$T1, expectedRankingSubset) + + expect_equal(as.vector(rankingBootstrappedSubset$data$T1$algo), c("A1", "A2", "A1", "A2")) + 
expect_equal(as.vector(rankingBootstrappedSubset$data$T1$value), c(0.8, 0.6, 0.2, 0.1)) + expect_equal(as.vector(rankingBootstrappedSubset$data$T1$case), c("C1", "C1", "C2", "C2")) + expect_equal(as.vector(rankingBootstrappedSubset$data$T1$task), c("T1", "T1", "T1", "T1")) + + expect_equal(dim(rankingBootstrappedSubset$bootsrappedRanks$T1), c(2, 10)) + expect_equal(dim(rankingBootstrappedSubset$bootsrappedAggregate$T1), c(2, 10)) +}) + +test_that("extraction of bootstrap ranking subset raises error for multi-task data set", { + dataTask1 <- cbind(task="T1", + rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2") + )) + dataTask2 <- cbind(task="T2", + rbind( + data.frame(algo="A1", value=0.2, case="C1"), + data.frame(algo="A2", value=0.3, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.7, case="C2"), + data.frame(algo="A2", value=0.8, case="C2"), + data.frame(algo="A3", value=0.9, case="C2") + )) + + data <- rbind(dataTask1, dataTask2) + + challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=mean, ties.method="min") + + set.seed(1) + rankingBootstrapped <- ranking%>%bootstrap(nboot=10) + + expect_error(subset(subset(rankingBootstrapped, top=2), top=2), + "Subset of algorithms only sensible for single-task challenges.", fixed=TRUE) +}) + +test_that("extraction of bootstrap ranking subset returns all algorithms even when more are requested", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2")) + + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=mean, ties.method="min") + + set.seed(1) + rankingBootstrapped <- ranking%>%bootstrap(nboot=10) + + rankingBootstrappedSubset <- subset(rankingBootstrapped, top=4) + + expectedRankingSubset <- rbind( + "A1" = data.frame(value_FUN = 0.5, rank = 1), + "A2" = data.frame(value_FUN = 0.35, rank = 2), + "A3" = data.frame(value_FUN = 0.2, rank = 3)) + + expect_equal(rankingBootstrappedSubset$matlist$T1, expectedRankingSubset) +}) + +test_that("extraction of bootstrap ranking subset returns more algorithms than requested when ties are present", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.8, case="C1"), + data.frame(algo="A3", value=0.8, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.2, case="C2"), + data.frame(algo="A3", value=0.2, case="C2")) + + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=mean, ties.method="min") + + set.seed(1) + rankingBootstrapped <- ranking%>%bootstrap(nboot=10) + + rankingBootstrappedSubset <- subset(rankingBootstrapped, top=2) + + expectedRankingSubset <- rbind( + "A1" = data.frame(value_FUN = 0.5, rank = 1), + "A2" = data.frame(value_FUN = 0.5, rank = 1), + "A3" = data.frame(value_FUN = 0.5, rank
= 1)) + + expect_equal(rankingBootstrappedSubset$matlist$T1, expectedRankingSubset) +}) diff --git a/tests/testthat/test-taskSubset.R b/tests/testthat/test-taskSubset.R new file mode 100644 index 0000000..21dd4cf --- /dev/null +++ b/tests/testthat/test-taskSubset.R @@ -0,0 +1,184 @@ +test_that("extraction of task subset works for multi-task data set", { + dataTask1 <- cbind(task="T1", + rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2") + )) + dataTask2 <- cbind(task="T2", + rbind( + data.frame(algo="A1", value=0.2, case="C1"), + data.frame(algo="A2", value=0.3, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.7, case="C2"), + data.frame(algo="A2", value=0.8, case="C2"), + data.frame(algo="A3", value=0.9, case="C2") + )) + + data <- rbind(dataTask1, dataTask2) + + challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=mean, ties.method="min") + + rankingSubset <- taskSubset(ranking, tasks=c("T2")) + + expect_equal(length(rankingSubset$matlist), 1) + expect_is(rankingSubset$matlist$T2, "data.frame") + + expect_equal(length(rankingSubset$data), 1) + expect_is(rankingSubset$data$T2, "data.frame") +}) + +test_that("extraction of task subset works for single-task data set", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2")) + + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=mean, ties.method="min") + + rankingSubset <- taskSubset(ranking, tasks=c("T1")) + + expect_equal(length(rankingSubset$matlist), 1) + expect_is(rankingSubset$matlist$T1, "data.frame") + + expect_equal(length(rankingSubset$data), 1) + expect_is(rankingSubset$data$T1, "data.frame") +}) + +test_that("extraction of task subset does not raise an error for invalid task name", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2")) + + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=mean, ties.method="min") + + rankingSubset <- taskSubset(ranking, tasks=c("T1x")) + + expect_equal(length(rankingSubset$matlist), 1) + expect_equal(rankingSubset$matlist$T1, NULL) + + expect_equal(length(rankingSubset$data), 1) + expect_equal(rankingSubset$data$T1, NULL) +}) + +test_that("extraction of task subset from bootstrap ranking works for multi-task data set", { + dataTask1 <- cbind(task="T1", + rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", 
value=0.0, case="C2") + )) + dataTask2 <- cbind(task="T2", + rbind( + data.frame(algo="A1", value=0.2, case="C1"), + data.frame(algo="A2", value=0.3, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.7, case="C2"), + data.frame(algo="A2", value=0.8, case="C2"), + data.frame(algo="A3", value=0.9, case="C2") + )) + + data <- rbind(dataTask1, dataTask2) + + challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=mean, ties.method="min") + + set.seed(1) + rankingBootstrapped <- ranking%>%bootstrap(nboot=10) + + rankingBootstrappedSubset <- taskSubset(rankingBootstrapped, tasks=c("T2")) + + expect_equal(length(rankingBootstrappedSubset$matlist), 1) + expect_is(rankingBootstrappedSubset$matlist$T2, "data.frame") + + expect_equal(length(rankingBootstrappedSubset$data), 1) + expect_is(rankingBootstrappedSubset$data$T2, "data.frame") + + expect_equal(length(rankingBootstrappedSubset$bootsrappedRanks), 1) + expect_is(rankingBootstrappedSubset$bootsrappedRanks$T2, "data.frame") + + expect_equal(length(rankingBootstrappedSubset$bootsrappedAggregate), 1) + expect_is(rankingBootstrappedSubset$bootsrappedAggregate$T2, "data.frame") +}) + +test_that("extraction of task subset from bootstrap ranking works for single-task data set", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2")) + + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=mean, ties.method="min") + + set.seed(1) + rankingBootstrapped <- ranking%>%bootstrap(nboot=10) + + rankingBootstrappedSubset <- taskSubset(rankingBootstrapped, tasks=c("T1")) + + expect_equal(length(rankingBootstrappedSubset$matlist), 1) + expect_is(rankingBootstrappedSubset$matlist$T1, "data.frame") + + expect_equal(length(rankingBootstrappedSubset$data), 1) + expect_is(rankingBootstrappedSubset$data$T1, "data.frame") + + expect_equal(length(rankingBootstrappedSubset$bootsrappedRanks), 1) + expect_is(rankingBootstrappedSubset$bootsrappedRanks$T1, "data.frame") + + expect_equal(length(rankingBootstrappedSubset$bootsrappedAggregate), 1) + expect_is(rankingBootstrappedSubset$bootsrappedAggregate$T1, "data.frame") +}) + +test_that("extraction of task subset from bootstrap ranking does not raise an error for invalid task name", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2")) + + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=mean, ties.method="min") + + set.seed(1) + rankingBootstrapped <- ranking%>%bootstrap(nboot=10) + + rankingBootstrappedSubset <- taskSubset(rankingBootstrapped, tasks=c("T1x")) + + expect_equal(length(rankingBootstrappedSubset$matlist), 1) + expect_equal(rankingBootstrappedSubset$matlist$T1, NULL) + + expect_equal(length(rankingBootstrappedSubset$data), 1) + expect_equal(rankingBootstrappedSubset$data$T1, 
NULL) + + expect_equal(length(rankingBootstrappedSubset$bootsrappedRanks), 1) + expect_equal(rankingBootstrappedSubset$bootsrappedRanks$T1, NULL) + + expect_equal(length(rankingBootstrappedSubset$bootsrappedAggregate), 1) + expect_equal(rankingBootstrappedSubset$bootsrappedAggregate$T1, NULL) +}) diff --git a/tests/testthat/test-violinPlot.R b/tests/testthat/test-violinPlot.R new file mode 100644 index 0000000..eee36f9 --- /dev/null +++ b/tests/testthat/test-violinPlot.R @@ -0,0 +1,52 @@ +test_that("violin plot for visualizing ranking stability returns one plot for single-task data set", { + data <- rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2")) + + challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + set.seed(1) + rankingBootstrapped <- ranking%>%bootstrap(nboot=10) + + actualPlot <- violin(rankingBootstrapped) + expect_is(actualPlot, "ggplot") +}) + +test_that("violin plot for visualizing ranking stability returns one plot for multi-task data set", { + dataTask1 <- cbind(task="T1", + rbind( + data.frame(algo="A1", value=0.8, case="C1"), + data.frame(algo="A2", value=0.6, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.2, case="C2"), + data.frame(algo="A2", value=0.1, case="C2"), + data.frame(algo="A3", value=0.0, case="C2") + )) + dataTask2 <- cbind(task="T2", + rbind( + data.frame(algo="A1", value=0.2, case="C1"), + data.frame(algo="A2", value=0.3, case="C1"), + data.frame(algo="A3", value=0.4, case="C1"), + data.frame(algo="A1", value=0.7, case="C2"), + data.frame(algo="A2", value=0.8, case="C2"), + data.frame(algo="A3", value=0.9, case="C2") + )) + + data <- rbind(dataTask1, dataTask2) + + challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE) + + ranking <- challenge%>%aggregateThenRank(FUN=median, ties.method="min") + + set.seed(1) + rankingBootstrapped <- ranking%>%bootstrap(nboot=10) + + actualPlot <- violin(rankingBootstrapped) + expect_is(actualPlot, "ggplot") +})