diff --git a/NAMESPACE b/NAMESPACE
index ce666ef..154e70a 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,211 +1,210 @@
 S3method("+",ggList)
 S3method(print,ggList)
 export(
   "aggregate.bootstrap", "aggregate.bootstrap.list", "aggregate.challenge", "aggregate.ranked.list",
   "aggregateThenRank",
   "as.challenge",
   "as.relation.challenge.incidence", "as.relation.ranked.list",
   "bootstrap", "bootstrap.ranked.list", #"bootstrap.default",
   "boxplot.ranked.list", "boxplot.bootstrap.list", "boxplot.comparedRanks.list",
   #"check_strict_preference",
   "compareRanks",    "compareRanks.ranked",    "compareRanks.ranked.list", # "compareRanks.default",
   "consensus",       "consensus.ranked.list", #"consensus.default",
   "Decision", "decision.challenge",
   "default_colors",
   "density.bootstrap.list",
   "dendrogram", "dendrogram.ranked.list",
   "extract.workflow",
   "kendall",    "kendall.bootstrap.list",
   # "merge.list",
   "lineplot.challenge",
   "methodsplot","methodsplot.challenge",
   "network", "plot.network",
   "podium","podium.ranked.list", "podium.challenge",#"podium.default",
   "print.aggregated", "print.aggregated.list", "print.comparedRanks",    "print.ranked", "print.ranked.list",
   #"quickmerge.list",
   "rank", "rank.aggregated.list", "rank.aggregatedRanks.list", "rank.challenge", #"rank.default",
   "rankFrequencies", "rankFrequencies.bootstrap",    "rankFrequencies.bootstrap.list", #"rankFrequencies.default",
   #"rankNA2",
   "rankThenAggregate",
   "rankingHeatmap", "rankingHeatmap.ranked.list", "relation_dissimilarity.ranked.list",
   "report", "report.bootstrap.list",
   "second",
   "select.if", "select.if.aggregated.list", "select.if.comparedRanks.list", "select.if.list", "select.if.ranked.list",
   "significanceMap",
   "spearmansFootrule", "spearmansWeightedFootrule",
   "splitby",
   "stability", "stabilityByAlgorithm", "stabilityByAlgorithmStacked","stabilityByTask",
   "stability.ranked.list", "relation_dissimilarity",
   "stabilityByAlgorithm.bootstrap.list",
   "stabilityByAlgorithmStacked.bootstrap.list",
   "stabilityByTask.bootstrap.list",
   "subset",
   "subset.aggregated.list",    "subset.comparedRanks.list", "subset.list", "subset.ranked.list", "subset.bootstrap.list", #"which.top",
   "test",    "test.challenge", "test.default",
   "testThenRank",
   "violin", "violin.bootstrap.list",
   "winner", "winner.bootstrap",    "winner.bootstrap.list", "winner.default", "winner.ranked",    "winner.ranked.list",
   "winnerFrequencies",    "winnerFrequencies.bootstrap", "winnerFrequencies.bootstrap.list",    "winnerFrequencies.default"
   )
 
 
 importFrom("dplyr", "bind_rows","group_by","summarise","select_if","filter","mutate","right_join","anti_join","ungroup","arrange","desc")
 importFrom("rlang",":=",".data","!!")
 importFrom("reshape2","melt", "acast")
 importFrom("utils", "capture.output", "methods")
 importFrom("plyr", "llply")
 importFrom("knitr", "kable")
 importFrom("tidyr", "complete","expand")
 importFrom("purrr", "%>%")
 importFrom("rmarkdown", "render","word_document","pdf_document","html_document")
 importFrom("viridisLite", "viridis","cividis")
 importFrom("ggplot2", "aes","aes_string","geom_abline", "geom_bar", "geom_boxplot", "geom_count", "geom_density", "geom_jitter",
     "geom_line", "geom_point", "geom_raster", "geom_step", "geom_text", "geom_violin","annotate","guide_legend",
     "geom_vline", "ggplot", "ggtitle","vars","xlab","ylab","scale_size_area","theme_get","rel","geom_hline","ggplot_build","scale_fill_manual",
     "scale_y_continuous","coord_cartesian", "element_text", "facet_wrap", "position_jitter", "stat", "stat_summary", "theme", "unit","guides","scale_fill_viridis_c",
     "theme_set", "theme_light", "scale_color_manual", "element_blank")
 importFrom("grDevices", "col2rgb", "gray", "rgb", "grey")
 importFrom("graphics", "abline", "axis", "barplot", "box", "layout",
                "legend", "par", "plot", "points", "segments","boxplot", "stripchart", "title", "grconvertX", "plot.new")
 importFrom("stats", "as.dist", "as.formula", "median", "p.adjust", "density",
                "quantile", "aggregate", "cor", "wilcox.test", "terms.formula", "complete.cases")
 importFrom("methods", "new")
 importFrom("relations","relation","as.relation",
     "relation_domain", "relation_incidence", "relation_is_asymmetric","relation_consensus","relation_ensemble",
     "relation_is_irreflexive", "relation_is_negatively_transitive",
     "relation_is_transitive", "relation_is_trichotomous", "relation_scores",
     "relation_violations","relation_dissimilarity")
 importFrom("graph", "addEdge")
 
 
 S3method(print, comparedRanks)
 S3method(print, aggregated)
 S3method(print, ranked)
 S3method(print, aggregated.list)
 S3method(print, ranked.list)
 
 S3method(aggregate, challenge)
 S3method(aggregate, ranked.list)
 S3method(aggregate, bootstrap.list)
 S3method(aggregate, bootstrap)
 
 S3method(test, default)
 S3method(test, challenge)
 
 S3method(Aggregate, default)
-S3method(Aggregate, data.frame)
 S3method(Aggregate, list)
 
 S3method(Rank, default)
 S3method(Rank, list)
 
 S3method(rank, default)
 S3method(rank, challenge)
 S3method(rank, aggregated.list)
 S3method(rank, aggregatedRanks.list)
 
 S3method(bootstrap, default)
 S3method(bootstrap, ranked.list)
 
 S3method(dendrogram, default)
 S3method(dendrogram, ranked.list)
 
 S3method(winner, default)
 S3method(winner, ranked)
 S3method(winner, ranked.list)
 S3method(winner, bootstrap)
 S3method(winner, bootstrap.list)
 
 S3method(rankFrequencies, default)
 S3method(rankFrequencies, bootstrap)
 S3method(rankFrequencies, bootstrap.list)
 
 S3method(winnerFrequencies, default)
 S3method(winnerFrequencies, bootstrap)
 S3method(winnerFrequencies, bootstrap.list)
 
 
 S3method(compareRanks,default)
 S3method(compareRanks,ranked)
 S3method(compareRanks,ranked.list)
 
 S3method(merge,list)
 
 S3method(melt,ranked.list)
 S3method(melt,aggregated.list)
 
 S3method(boxplot,ranked.list)
 S3method(boxplot,comparedRanks.list)
 S3method(boxplot,bootstrap.list)
 
 S3method(select.if,default)
 S3method(select.if,list)
 S3method(select.if,aggregated.list)
 S3method(select.if,ranked.list)
 S3method(select.if,comparedRanks.list)
 
 S3method(subset,list)
 S3method(subset,bootstrap.list)
 S3method(subset,aggregated.list)
 S3method(subset,ranked.list)
 S3method(subset,comparedRanks.list)
 
 S3method(podium,default)
 S3method(podium,challenge)
 S3method(podium,ranked.list)
 
 S3method(network,default)
 S3method(network,ranked.list)
 S3method(network,dist)
 S3method(plot,network)
 
 S3method(density,bootstrap.list)
 
 S3method(as.relation,challenge.incidence)
 S3method(as.relation,ranked.list)
 
 S3method(subset,bootstrap.list)
 S3method(subset,ranked.list)
 S3method(subset,list)
 S3method(subset,comparedRanks.list)
 S3method(subset,aggregated.list)
 
 S3method(decision,challenge)
 S3method(decision,default)
 
 S3method(lineplot,challenge)
 S3method(lineplot,default)
 
 S3method(methodsplot,challenge)
 S3method(methodsplot,default)
 
 S3method(significanceMap,data.frame)
 S3method(significanceMap,ranked.list)
 S3method(significanceMap,default)
 
 S3method(violin,bootstrap.list)
 S3method(violin,default)
 
 S3method(rankingHeatmap,ranked.list)
 S3method(rankingHeatmap,default)
 
 S3method(relation_dissimilarity,ranked.list)
 S3method(relation_dissimilarity,default)
 
 S3method(stabilityByTask,bootstrap.list)
 S3method(stabilityByTask,default)
 S3method(stability,default)
 S3method(stability,ranked.list)
 
 S3method(stabilityByAlgorithm,bootstrap.list)
 S3method(stabilityByAlgorithm,default)
 
 S3method(stabilityByAlgorithmStacked,bootstrap.list)
 S3method(stabilityByAlgorithmStacked,default)
 
 S3method(consensus,ranked.list)
 S3method(consensus,default)
 
 S3method(report,bootstrap.list)
 S3method(report,ranked.list)
 S3method(report,default)
diff --git a/R/Aggregate.R b/R/Aggregate.R
index 2a3f680..23b9d59 100644
--- a/R/Aggregate.R
+++ b/R/Aggregate.R
@@ -1,166 +1,96 @@
 Aggregate <- function(object,...) UseMethod("Aggregate")
 Aggregate.default <- function(object, ...) aggregate(object,...)  #stats::aggregate
 
-
-Aggregate.data.frame <-function(object,
-                                x,
-                                algorithm,
-                                FUN=mean,
-                                na.treat="na.rm", #can be na.rm, numeric value or function
-                                case,
-                                alpha=0.05, p.adjust.method="none",
-                                alternative="one.sided",
-                                test.fun=function(x,y) wilcox.test(x,
-                                                                   y,
-                                                                   alternative = alternative,
-                                                                   exact=FALSE,
-                                                                   paired = TRUE)$p.value,
-                                largeBetter=TRUE, # only needed for significance
-                                ...                      ){
-  call=match.call(expand.dots = T)
-  if (is.numeric(na.treat)) object[,x][is.na(object[,x])]=na.treat
-  else if (is.function(na.treat)) object[,x][is.na(object[,x])]=na.treat(object[,x][is.na(object[,x])])
-  else if (na.treat=="na.rm") object=object[!is.na(object[,x]),]
-  else stop("Argument 'na.treat' is invalid. It can be 'na.rm', numeric value or function.")
-
-  if (is.character(FUN) && FUN=="significance"){
-    if(missing(case)| missing(largeBetter)| missing(alpha)) stop("If FUN='significance' arguments case, largeBetter and alpha need to be given")
-    if (length(unique(object[[algorithm]]))<=1){
-      warning("only one ", algorithm, " available")
-      agg=data.frame()
-    } else {
-      agg = significance(object,
-                         x,
-                         algorithm,
-                         case,
-                         alpha,
-                         largeBetter,
-                         p.adjust.method = p.adjust.method,
-                         alternative = alternative,
-                         ...
-      )
-    }
-    isSignificance=TRUE
-  } else {
-    if (is.function(FUN)) FUNname <-gsub('\")',"",gsub('UseMethod(\"',"",deparse(functionBody(FUN)),fixed = T),fixed=T)
-    else if (is.character(FUN)) FUNname=FUN
-    if (is.character(FUN))  FUN=try(eval(parse(text=FUN)),silent = T)
-
-    if (!is.function(FUN)) stop("FUN has to be a function (possibly as character) or 'significance'")
-    agg <- aggregate(object[, x],
-                     by = list(object[, algorithm]),
-                     FUN = function(z) do.call(FUN, args = list(x = z))
-    )
-    names(agg)=c(algorithm,
-                 paste0(x,"_",
-                        FUNname))
-    rownames(agg)=agg[,1]
-    agg=agg[,-1,drop=F]
-    isSignificance=FALSE
-  }
-  res=list(FUN = . %>% (call),
-           FUN.list=list(FUN),
-           call=list(call),
-           data=object,
-           mat=agg,
-           isSignificance= isSignificance)
-
-  class(res)=c("aggregated",class(res))
-  res
-
-}
-
-
 Aggregate.list <-function(object,
                           x,
                           algorithm,
                           FUN = mean,
                           na.treat = "na.rm",
                           parallel = FALSE,
                           progress = "none",
                           case,
                           alpha = 0.05,
                           p.adjust.method = "none",
                           alternative = "one.sided",
                           test.fun = function(x, y) wilcox.test(x,
                                                                 y,
                                                                 alternative = alternative,
                                                                 exact = FALSE,
                                                                 paired = TRUE)$p.value,
                           largeBetter = TRUE,   # only needed for significance
                           ...            ) {
   call=match.call(expand.dots = T)
   if (is.character(FUN) && FUN=="significance"){
     if(missing(case)|  missing(largeBetter)|  missing(alpha)) stop("If FUN='significance' arguments case, largeBetter and alpha need to be given")
     matlist=llply(1:length(object),
                   function(id){
                     piece=object[[id]]
                     if (length(unique(piece[[algorithm]]))<=1){
-                      warning("only one ", algorithm, " available in element ", names(object)[id])
+                      warning("Only one algorithm available in task '", names(object)[id], "'.")
                       return(data.frame("prop_significance"=rep(NA,length(unique(piece[[algorithm]]))),
                                         row.names = unique(piece[[algorithm]])))
                     }
                     if (is.numeric(na.treat)) piece[,x][is.na(piece[,x])]=na.treat
                     else if (is.function(na.treat)) piece[,x][is.na(piece[,x])]=na.treat(piece[,x][is.na(piece[,x])])
                     else if (na.treat=="na.rm") piece=piece[!is.na(piece[,x]),]
                     else stop("Argument 'na.treat' is invalid. It can be 'na.rm', numeric value or function.")
 
                     xmean <- significance(piece,
                                           x,
                                           algorithm,
                                           case,
                                           alpha,
                                           p.adjust.method=p.adjust.method,
                                           largeBetter,
                                           alternative=alternative,
                                           ...)
                     class(xmean)=c("aggregated",
                                    class(xmean))
                     xmean
                     },
                   .parallel=parallel,
                   .progress=progress
     )
     isSignificance=TRUE
 
   } else {
     if (is.function(FUN)) FUNname <-gsub('\")',"",gsub('UseMethod(\"',"",deparse(functionBody(FUN)),fixed = T),fixed=T)
     else if (is.character(FUN)) FUNname=FUN
-    
+
     if (is.character(FUN)) FUN=try(eval(parse(text=FUN)),
                                    silent = T)
     if (!is.function(FUN)) stop("FUN has to be a function (possibly as character) or 'significance'")
 
     matlist=llply(object,
                   function(piece){
                     if (is.numeric(na.treat)) piece[,x][is.na(piece[,x])]=na.treat
                     else if (is.function(na.treat)) piece[,x][is.na(piece[,x])]=na.treat(piece[,x][is.na(piece[,x])])
                     else if (na.treat=="na.rm") piece=piece[!is.na(piece[,x]),]
                     else stop("Argument 'na.treat' is invalid. It can be 'na.rm', numeric value or function.")
 
                     xmean <- aggregate(piece[,x],
                                        by=list(piece[,algorithm]),
                                        FUN=function(z) do.call(FUN,args=list(x=z)))
                     names(xmean)=c(algorithm,
                                    paste0(x,"_",FUNname))
                     rownames(xmean)=xmean[,1]
                     xmean=xmean[,-1,drop=F]
                     xmean
                   },
                   .parallel=parallel,
                   .progress=progress
                   )
     isSignificance=FALSE
   }
   names(matlist)=names(object)
   res=list(FUN = . %>% (call),
            FUN.list=list(FUN),
            call=list(call),
            data=object,
            matlist=matlist,
            isSignificance=isSignificance
            )
 
   class(res)=c("aggregated.list",class(res))
   res
 }
diff --git a/R/Bootstrap.R b/R/Bootstrap.R
index 549c7c4..f85f99c 100644
--- a/R/Bootstrap.R
+++ b/R/Bootstrap.R
@@ -1,179 +1,160 @@
 bootstrap <- function(object,...) UseMethod("bootstrap")
 bootstrap.default <- function(object, ...) stop("not implemented for this class")
 
 
 bootstrap.ranked.list=function(object,
                                nboot,
                                parallel=FALSE,
                                progress="text",
                                ...){
   algorithm=attr(object$data,"algorithm")
   by=attr(object$data,"case")
 
   # exclude if only 1 data set or less than 3 algorithms
   tidy.data.id=sapply(object$data,
                       function(data.subset) {
                         ifelse((length(unique(data.subset[[by]]))==1 |  length(unique(data.subset[[algorithm]]))<=2 ),
                                yes=FALSE,
                                no=TRUE)
                         })
   tidy.data=object$data[tidy.data.id]
   tidy.matlist=object$matlist[tidy.data.id]
 
   res= llply(1:nboot,
              function(it){
                # draw 1 sample for each task
                bootDatalist = lapply(tidy.data, function(data.subset) {
                  index = unique(data.subset[[by]])
 
                  # bootIndex=sample(index,size=length(index),replace=TRUE)
                  # bootData=bind_rows(lapply(bootIndex,function(zz) data.subset[data.subset[[by]]==zz,]))
                  # faster:
                  bootIndex = data.frame(sample(index,
                                                size = length(index),
                                                replace = TRUE))
                  colnames(bootIndex) = by
                  bootData = merge(bootIndex,
                                   data.subset,
                                   by = by)
                  bootData
                })
                attr(bootDatalist, "inverseOrder") = attr(object$data, "inverseOrder")
                attr(bootDatalist, "algorithm") = attr(object$data, "algorithm")
                attr(bootDatalist, "case") = attr(object$data, "case")
                attr(bootDatalist, "check") = FALSE
                object$FUN(bootDatalist)$mat
              },
              .parallel = parallel,
              .progress = progress)
 
-
-
-  # rankmatlist=lapply(res[[1]],function(z) z[,"rank",drop=F])
-  # for (j in 2:length(res)){
-  #   rankmatlist=merge.list(rankmatlist,lapply(res[[j]],function(z) z[,"rank",drop=F]),by="row.names", suffixes = c(paste0(".",j-1),paste0(".",j)))
-  #   rankmatlist=lapply(rankmatlist, function(z) {
-  #     rownames(z)=z[,"Row.names"]
-  #     z=z[,-1]
-  #     })
-  # }
-  #
-  # aggmatlist=lapply(res[[1]],function(z) z[,-2,drop=F])
-  # for (j in 2:length(res)){
-  #   aggmatlist=merge.list(aggmatlist,lapply(res[[j]],function(z) z[,-2,drop=F]),by="row.names", suffixes = c(paste0(".",j-1),paste0(".",j)))
-  #   aggmatlist=lapply(aggmatlist, function(z) {
-  #     rownames(z)=z[,"Row.names"]
-  #     z=z[,-1]
-  #     })
-  # }
   rankmatlist = lapply(res[[1]],
                        function(z) z[, "rank", drop = F]
                        )
   for (j in 2:length(res)) {
     rankmatlist = quickmerge.list(rankmatlist,
                                   lapply(res[[j]],
                                          function(z)  z[, "rank", drop = F]))
   }
 
   aggmatlist = lapply(res[[1]],
                       function(z) z[, -2, drop = F])
   for (j in 2:length(res)) {
     aggmatlist = quickmerge.list(aggmatlist,
                                  lapply(res[[j]],
                                         function(z) z[, -2, drop = F]))
   }
 
   final=list(bootsrappedRanks=rankmatlist,
              bootsrappedAggregate=aggmatlist,
              data=object$data,
              matlist=tidy.matlist,
              FUN=object$FUN,
              FUN.list=object$FUN.list)
   class(final)=c("bootstrap.list")
   final
 }
 
 
 
 
 
 
 ####################################################################################################
 # deprecate following functions?
 
 
 
 rankFrequencies <- function(object,...) UseMethod("rankFrequencies")
 rankFrequencies.default <- function(object, ...) stop("not implemented for this class")
 
 rankFrequencies.bootstrap=function(object, who,...){
   if (is.data.frame(who)) who=rownames(who)
   if (length(who)==1){
     res=table(t(object$bootsrappedRanks[rownames(object$bootsrappedRanks)==who,]))
     cat("\n",who,"\n")
     print(res)
   } else {
     res=lapply(who, function(w){
       rr=table(t(object$bootsrappedRanks[rownames(object$bootsrappedRanks)==w,]))
     cat(w,"\n")
       print(rr)
       cat("\n")
       rr
     })
   }
   res=c(list(rankFrequencies=res),object)
   invisible(res)
 }
 
 rankFrequencies.bootstrap.list=function(object, who,...){
   if (is.data.frame(who)) who=rownames(who)
   res=lapply(object$bootsrappedRanks,function(bootMat){
     if (length(who)==1){
       res=table(t(bootMat[rownames(bootMat)==who,]))
       cat("\n",who,"\n")
       print(res)
     } else {
       res=lapply(who, function(w){
         rr=table(t(bootMat[rownames(bootMat)==w,]))
         cat(w,"\n")
         print(rr)
         cat("\n")
         rr
       })
     }
     res
   })
   res=c(list(rankFrequencies=res),object)
   invisible(res)
 }
 
 
 
 
 winnerFrequencies <- function(object,...) UseMethod("winnerFrequencies")
 winnerFrequencies.default <- function(object, ...) stop("not implemented for this class")
 
 # Achtung: bester rank muss ==1 sein und nicht z.B. 1.5
 winnerFrequencies.bootstrap=function(object,...){
   rankings_dicho=ifelse(object$bootsrappedRanks==1,1,0)
   winnerFrequencies=data.frame(winnerFrequency=rowSums(rankings_dicho),row.names = rownames(object$bootsrappedRanks))
   res=merge(object$mat,winnerFrequencies,by="row.names",...)
   rownames(res)=res[,1]
   res=res[,-1]
   # res=c(res=res,object)
   # class(res)="bootstrapResults"
   res
 }
 
 winnerFrequencies.bootstrap.list=function(object,...){
   res=lapply(1:length(object$bootsrappedRanks),function(id){
     rankings_dicho=ifelse(object$bootsrappedRanks[[id]]==1,1,0)
     winnerFrequencies=data.frame(winnerFrequency=rowSums(rankings_dicho),row.names = rownames(object$bootsrappedRanks[[id]]))
     res=merge(object$matlist[[id]],winnerFrequencies,by="row.names",...)
     rownames(res)=res[,1]
     res=res[,-1]
     res
   })
   names(res)=names(object$bootsrappedRanks)
   res
 }
diff --git a/R/merge.list.R b/R/merge.list.R
index ebfcf91..fa741b9 100644
--- a/R/merge.list.R
+++ b/R/merge.list.R
@@ -1,55 +1,43 @@
-# merge.list=function(x,y,by="row.names",suffixes = c(".1",".2"),...){
-#   if (is.list(x) & is.list(y)){
-#     if (!all.equal(names(x),names(y))) stop("list elements must have same names and lists must have same length")
-#     res=lapply(1:length(x), function(z){
-#       merge(x[[z]],y[[z]],by=by,suffixes=suffixes,...)
-#     })
-#     names(res)=names(x)
-#     res
-#     
-#   } else stop("Comparison of a list and a data.frame under construction")
-# }
-
 merge.list=function(x,y,by="row.names",suffixes = c(".1",".2"),...){
   if (is.list(x) & is.list(y)){
     #if (!all.equal(names(x),names(y))) stop("list elements must have same names and lists must have same length")
     common.elements=intersect(names(x),names(y))
-    
+
     res=lapply(common.elements, function(z){
       merge(x[[z]],y[[z]],by=by,suffixes=suffixes,...)
     })
     names(res)=common.elements
     res
-    
+
   } else stop("Comparison of a list and a data.frame under construction")
 }
-  
+
 quickmerge.list=function(x,y){
   if (is.list(x) & is.list(y)){
     #if (!all.equal(names(x),names(y))) stop("list elements must have same names and lists must have same length")
     common.elements=intersect(names(x),names(y))
-    
+
     res=lapply(common.elements, function(z){
       dat1=x[[z]]
       dat2=y[[z]]
       dat1=dat1[order(rownames(dat1)),,drop=F]
       dat2=dat2[order(rownames(dat2)),,drop=F]
       if (all(rownames(dat1)==rownames(dat2))) {
           qq=cbind(dat1,dat2)
           rownames(qq)=rownames(dat1)
           qq
       }
       else {
         id=intersect(rownames(dat1),rownames(dat2))
         dat1=dat1[match(id,rownames(dat1)),]
         dat2=dat2[match(id,rownames(dat2)),,drop=F]
         qq=cbind(dat1,dat2)
         rownames(qq)=rownames(dat1)
         qq
       }
     })
     names(res)=common.elements
     res
-    
+
   } else stop("Comparison of a list and a data.frame under construction")
 }
diff --git a/R/testBased.R b/R/testBased.R
index 4927e03..897ec79 100644
--- a/R/testBased.R
+++ b/R/testBased.R
@@ -1,210 +1,175 @@
 decision <- function(x,...) UseMethod("decision")
 decision.default <- function(x, ...) stop("not implemented for this class")
 
-decision.challenge=function(x,                             
-                            na.treat=NULL, #entweder na.rm, numeric value oder function
-                            alpha=0.05, 
+decision.challenge=function(x,
+                            na.treat=NULL, #  it can be 'na.rm', numeric value or function
+                            alpha=0.05,
                             p.adjust.method="none",
                             alternative="one.sided",
                             test.fun=function(x,y) wilcox.test(x,y,
                                                                alternative = alternative,exact=FALSE,
                                                                paired = TRUE)$p.value,
                             parallel=FALSE,
                             progress="none",...){
-  
+
   if (is.null(na.treat)){ #na.treat only optional if no missing values in data set
     if (!inherits(x,"list")){
       if (!any(is.na(x[,attr(x, "value")]))) na.treat="na.rm" # there are no missings so set na.treat by dummy "na.rm" has no effect
       else stop("Please specify na.treat in as.challenge()")
     } else {
-      if (!any(sapply(x, 
+      if (!any(sapply(x,
                       function(task) any(is.na(task[,attr(x, "value")]))))) na.treat="na.rm" # there are no missings so set na.treat by dummy "na.rm" has no effect
       else stop("Please specify na.treat in as.challenge()")
     }
   }
-  
-  
+
+
   if (alternative!="two.sided") alternative=ifelse(attr(x,"largeBetter"),
                                                    yes="greater",
                                                    no="less")
-  call=match.call(expand.dots = T)  
-  
+  call=match.call(expand.dots = T)
+
   object=x
   algorithm=attr(object,"algorithm")
   case=attr(object,"case")
   value=attr(object,"value")
-  largeBetter=attr(object,"largeBetter") 
+  largeBetter=attr(object,"largeBetter")
   if(missing(case)| missing(largeBetter)) stop("arguments case and alpha need to be given in as.challenge()")
-  
-  
+
+
   if (inherits(object,"list")){
-    matlist=llply(1:length(object), 
-                  function(id){ 
+    matlist=llply(1:length(object),
+                  function(id){
                     piece=object[[id]]
                     if (length(unique(piece[[algorithm]]))<=1){
                       warning("only one ", algorithm, " available in element ", names(object)[id])
-                    } 
+                    }
                     if (is.numeric(na.treat)) piece[,value][is.na(piece[,value])]=na.treat
                     else if (is.function(na.treat)) piece[,value][is.na(piece[,value])]=na.treat(piece[,value][is.na(piece[,value])])
                     else if (na.treat=="na.rm") piece=piece[!is.na(piece[,value]),]
                     mat=Decision(piece, value, algorithm, case, alpha, largeBetter,
                                  p.adjust.method=p.adjust.method,
                                  alternative=alternative,
                                  test.fun=test.fun)
                     mat=as.data.frame(mat)
                     mat[is.na(mat)]=0
                     mat=as.matrix(mat)
                     class(mat)=c(class(mat),"challenge.incidence")
                     mat
-                    
-                  }, 
+
+                  },
                   .parallel=parallel,
                   .progress=progress )
     names(matlist)=names(object)
     return(matlist)
   } else {
     if (length(unique(object[[algorithm]]))<=1){
       warning("only one ", algorithm, " available")
       matlist=(matrix(NA,1,1))
     } else {
-      mat=Decision(object, 
-                   value, 
-                   algorithm, 
-                   case, 
-                   alpha, 
+      mat=Decision(object,
+                   value,
+                   algorithm,
+                   case,
+                   alpha,
                    largeBetter,
                    p.adjust.method=p.adjust.method,
                    alternative=alternative,
                    test.fun=test.fun)
     }
     mat=as.data.frame(mat)
     mat[is.na(mat)]=0
     mat=as.matrix(mat)
     class(mat)=c(class(mat),"challenge.incidence")
     return(mat)
-    
+
   }
 }
 
 
 Decision=function(object,
                   value,
                   by,
                   case,
-                  alpha, 
+                  alpha,
                   largeBetter=FALSE,
                   p.adjust.method="none",
                   alternative="one.sided",
                   test.fun=function(x,y) wilcox.test(x,y,
                                                      alternative = alternative,exact=FALSE,
                                                      paired = TRUE)$p.value){
   algorithms=unique(object[[by]])
   if (length(unique(object[[case]]))==1){
-    warning("Only one test case in task")
-  } #else {
-    combinations=expand.grid(algorithms,algorithms)[,2:1]
-    combinations=combinations[apply(combinations,1,function(z) anyDuplicated(z)==0),] # remove i==j
-    
-    pvalues=sapply(1:nrow(combinations), function(it){ 
-      dat1=object[object[[by]]==combinations[it,1],]
-      dat2=object[object[[by]]==combinations[it,2],]
-      id=intersect(dat2[,case],dat1[,case])
-      dat1=dat1[match(id,dat1[,case]),value]
-      dat2=dat2[match(id,dat2[,case]),value]
-      test.fun(dat1,dat2)
-      
-    })
-    decisions=as.numeric(p.adjust(pvalues,
-                                  method=p.adjust.method)<= alpha)
-    res=cbind(combinations,decisions)
-    reshape2::acast(res,
-                    Var2~Var1,
-                    value.var="decisions")
-  #  }
-}
-
-
-# test.fun=function(x,y) suppressWarnings(frdAllPairsExactTest(list(x,y),p.adjust.method ="none")$p.value)
-# test.fun=function(x,y) wilcox.test(x,y,
-#                                    alternative = alternative,exact=FALSE,
-#                                    paired = TRUE)$p.value
+    warning("Only one case in task.")
+  }
 
+  combinations=expand.grid(algorithms,algorithms)[,2:1]
+  combinations=combinations[apply(combinations,1,function(z) anyDuplicated(z)==0),] # remove i==j
 
+  pvalues=sapply(1:nrow(combinations), function(it){
+    dat1=object[object[[by]]==combinations[it,1],]
+    dat2=object[object[[by]]==combinations[it,2],]
+    id=intersect(dat2[,case],dat1[,case])
+    dat1=dat1[match(id,dat1[,case]),value]
+    dat2=dat2[match(id,dat2[,case]),value]
+    test.fun(dat1,dat2)
 
+  })
+  decisions=as.numeric(p.adjust(pvalues,
+                                method=p.adjust.method)<= alpha)
+  res=cbind(combinations,decisions)
+  reshape2::acast(res,
+                  Var2~Var1,
+                  value.var="decisions")
+}
 
 
-as.relation.challenge.incidence=function(x, 
+as.relation.challenge.incidence=function(x,
                                          verbose = FALSE, ...) {
   r <- relation(incidence = x, ...)
-  
-  
-  # if ( x$type == "=" ) {
-  #   props <- check_indifference_preference(r)
-  #   class <- "indiffpref"
-  # }
-  # else {
+
   props <- check_strict_preference(r)
   class <- "strictpref"
   r$.Meta$is_decreasing <- FALSE
-  # }
-  
+
   r$.Meta <- c(r$.Meta,
                structure(props, names = sprintf("is_%s", names(props))))
-  
+
   if ( verbose ) {
-    # cat(sQuote(x$type), "preference relation:\n")
-    
     for ( p in names(props) ) {
       cat(sprintf("%s = %s:\n", p, props[[p]]))
       print(relation_violations(r, p, TRUE))
     }
   }
-  
+
   structure(r, class = c(class, class(r)))
 }
 
 check_strict_preference=
 function(x) {
   list(irreflexive = relation_is_irreflexive(x),
        asymmetric = relation_is_asymmetric(x),
        transitive = relation_is_transitive(x),
        negatively_transitive = relation_is_negatively_transitive(x),
        trichotomous = relation_is_trichotomous(x))
 }
 
-# library(plyr)
-# 
-# a=challenge_multi%>%decision.challenge()
-# relation(incidence=a[[1]])
-# as.relation.challenge.incidence(a[[1]])
-# aa=lapply(a,function(x) relation(incidence = x))
-# 
-#   relensemble= do.call(relation_ensemble,args = aa)
-# 
-# plot(relensemble)
-# 
-
 
 significance=function(object,
                       value,
                       algorithm,
                       case,
-                      alpha, 
-                      largeBetter=FALSE,...){
-  # algorithm=attr(object,"algorithm")
-  # case=attr(object,"case")
-  # largeBetter=attr(object,"largeBetter") 
-  # value=attr(object,"value")
-  
+                      alpha,
+                      largeBetter=FALSE,...) {
+
   xx=as.challenge(object,
                   value=value,
                   algorithm=algorithm,
                   case=case,
                   smallBetter = !largeBetter,
                   check=FALSE)
   a=decision.challenge(xx,...)
-  prop_significance=  rowSums(a)/(ncol(a)-1)
+  prop_significance=rowSums(a)/(ncol(a)-1)
   return(data.frame("prop_significance"=prop_significance,
                     row.names = names(prop_significance)))
 }
-
diff --git a/inst/appdir/reportMultiple.Rmd b/inst/appdir/reportMultiple.Rmd
index 050ee48..54bfa9e 100644
--- a/inst/appdir/reportMultiple.Rmd
+++ b/inst/appdir/reportMultiple.Rmd
@@ -1,409 +1,409 @@
 ---
 params:
   object: NA
   colors: NA
   name: NULL
   consensus: NA
   isMultiTask: NA
   bootstrappingEnabled: NA
   fig.format: NULL
   dpi: NULL
 title: "Benchmarking report for `r params$name` "
 author: "created by challengeR v`r packageVersion('challengeR')`"
 date: "`r Sys.setlocale('LC_TIME', 'English'); format(Sys.time(), '%d %B, %Y')`"
 editor_options: 
   chunk_output_type: console
 ---
 
 <!-- This text is outcommented -->
 <!-- R code chunks start with "```{r }" and end with "```" -->
 <!-- Please do not change anything inside of code chunks, otherwise any latex code is allowed -->
 
 <!-- inline code with `r 0` -->
 
 
 ```{r setup, include=FALSE}
 options(width=80)
 #out.format <- knitr::opts_knit$get("out.format")
 out.format <- knitr::opts_knit$get("rmarkdown.pandoc.to")
 
 img_template <- switch( out.format,
                      docx = list("img-params"=list(dpi=150,
                                                fig.width=6,
                                                fig.height=6,
                                                out.width="504px",
                                                out.height="504px")),
                      {
                        # default
                        list("img-params"=list( fig.width=7,
                                                fig.height = 3,
                                                dpi=300))
                      } )
 
 knitr::opts_template$set( img_template )
 
 knitr::opts_chunk$set(echo = F) # ,#fig.width=7,fig.height = 3,dpi=300,
 
 if (out.format != "docx") knitr::opts_chunk$set(fig.align = "center")
 if (!is.null(params$fig.format)) knitr::opts_chunk$set(dev = params$fig.format) # can be vector, e.g. fig.format=c('jpeg','png', 'pdf')
 if (!is.null(params$dpi)) knitr::opts_chunk$set(dpi = params$dpi) 
 
 theme_set(theme_light())
 
 isMultiTask = params$isMultiTask
 bootstrappingEnabled = params$bootstrappingEnabled
 
 ```
 
 
 ```{r }
 object = params$object
 if (isMultiTask) {
 ordering_consensus=names(params$consensus)
 } else
 {
   ordering_consensus=names(sort(t(object$matlist[[1]][,"rank",drop=F])["rank",]))
 }
 color.fun=params$colors
 
 ```
 
 ```{r }
 
 challenge_multiple=object$data
 ranking.fun=object$FUN
 
 cols_numbered=cols=color.fun(length(ordering_consensus))
 names(cols)=ordering_consensus
 names(cols_numbered)= paste(1:length(cols),names(cols))
 
 if (bootstrappingEnabled) {
   boot_object = params$object
   challenge_multiple=boot_object$data
 
   ranking.fun=boot_object$FUN
   object=challenge_multiple%>%ranking.fun
   object$FUN.list = boot_object$FUN.list
 
   object$fulldata=boot_object$fulldata  # only not NULL if subset of algorithms used
 
   cols_numbered=cols=color.fun(length(ordering_consensus))
   names(cols)=ordering_consensus
   names(cols_numbered)= paste(1:length(cols),names(cols))
 }
 
 
 ```
 
 This document presents a systematic report on the benchmark study "`r params$name`". Input data comprises raw metric values for all algorithms and cases. Generated plots are:
 
 ```{r, child=if (!isMultiTask && !bootstrappingEnabled) system.file("appdir", "overviewSingleTaskNoBootstrapping.Rmd", package="challengeR")}
 
 ```
 
 ```{r, child=if (!isMultiTask && bootstrappingEnabled) system.file("appdir", "overviewSingleTaskBootstrapping.Rmd", package="challengeR")}
 
 ```
 
 ```{r, child=if (isMultiTask && !bootstrappingEnabled) system.file("appdir", "overviewMultiTaskNoBootstrapping.Rmd", package="challengeR")}
 
 ```
 
 ```{r, child=if (isMultiTask && bootstrappingEnabled) system.file("appdir", "overviewMultiTaskBootstrapping.Rmd", package="challengeR")}
 
 ```
 
 Details can be found in Wiesenfarth et al. (2019).
 
 ```{r,results='asis'}
 
 if (isMultiTask) {
   cat("# Rankings\n")
 } else {
   cat("# Ranking")
 }
 
 ```
 
 
 Algorithms within a task are ranked according to the following ranking scheme:
 
 ```{r,results='asis'}
 a=(  lapply(object$FUN.list[1:2],function(x) {
                  if (!is.character(x)) return(paste0("aggregate using function ",
                                                      paste(gsub("UseMethod","",
                                                                 deparse(functionBody(x))),
                                                            collapse=" ")
                                                      ))
                  else if (x=="rank") return(x)
                  else return(paste0("aggregate using function ",x))
   }))
 cat("&nbsp; &nbsp; *",paste0(a,collapse=" then "),"*",sep="")
 
-if (is.character(object$FUN.list[[1]]) && object$FUN.list[[1]]=="significance") cat("\n\n Column 'prop.sign' is equal to the number of pairwise significant test results for a given algorithm divided by the number of algorithms.")
+if (is.character(object$FUN.list[[1]]) && object$FUN.list[[1]]=="significance") cat("\n\n Column 'prop_significance' is equal to the number of pairwise significant test results for a given algorithm divided by the number of algorithms.")
 ```
 
 ```{r,results='asis'}
 
 if (isMultiTask) {
   cat("Ranking for each task:\n")
   
   for (t in 1:length(object$matlist)){
     cat("\n",names(object$matlist)[t],": ")
     n.cases=nrow(challenge_multiple[[t]])/length(unique(challenge_multiple[[t]][[attr(challenge_multiple,"algorithm")]]))
 
     numberOfAlgorithms <- length(levels(challenge_multiple[[t]][[attr(challenge_multiple, "algorithm")]]))
     
     cat("\nThe analysis is based on",
         numberOfAlgorithms,
         "algorithms and",
         n.cases,
         "cases.",
         attr(object$data,"n.missing")[[t]], "missing cases have been found in the data set. ")
     
     if (nrow(attr(object$data,"missingData")[[t]])>0)  {
       if(attr(object$data,"n.missing")[[t]]==0 ) cat("However, ")
       else if(attr(object$data,"n.missing")[[t]]>0 ) cat("Additionally, ")
       cat("performance of not all algorithms has been observed for all cases in task '",
           names(object$matlist)[t],
           "'. Therefore, missings have been inserted in the following cases:")
       print(knitr::kable(as.data.frame(attr(object$data,"missingData")[[t]])))
     }
     
     if (nrow(attr(object$data,"missingData")[[t]])>0 | attr(object$data,"n.missing")[[t]]>0)  {
       if (is.numeric(attr(object$data,"na.treat"))) cat("All missings have been replaced by values of", attr(object$data,"na.treat"),".\n")
       else if (is.character(attr(object$data,"na.treat")) && attr(object$data,"na.treat")=="na.rm") cat("All missings have been removed.")
       else if (is.function(attr(object$data,"na.treat"))) {
         cat("Missings have been replaced using function ")
         print(attr(object$data,"na.treat"))
       }
       else if (is.character(object$FUN.list[[1]]) && object$FUN.list[[1]]=="rank") cat("Missings lead to the algorithm ranked last for the missing case.")
     }
     
     x=object$matlist[[t]]
     print(knitr::kable(x[order(x$rank),]))
   }
 } else {
 
   n.cases=nrow(challenge_multiple[[1]])/length(unique(challenge_multiple[[1]][[attr(challenge_multiple,"algorithm")]]))
   
   # Is subset of algorithms used?
   if (!is.null(object$fulldata[[1]])) {
     cat("The top ",
         length(levels(challenge_multiple[[1]][[attr(challenge_multiple, "algorithm")]])),
         " out of ",
         length(levels(object$fulldata[[1]][[attr(challenge_multiple, "algorithm")]])),
         " algorithms are considered.\n")
     cat("\nThe analysis is based on",
         n.cases,
         "cases. ")
   } else {
     cat("\nThe analysis is based on",
         length(levels(challenge_multiple[[1]][[attr(challenge_multiple, "algorithm")]])),
         "algorithms and",
         n.cases,
         "cases. ")
   }
 
   cat(attr(object$data,"n.missing")[[1]], "missing cases have been found in the data set. ")
 
   if (nrow(attr(object$data,"missingData")[[1]])>0)  {
     
     if(attr(object$data,"n.missing")[[1]]==0 ) cat("However, ")
     else if(attr(object$data,"n.missing")[[1]]>0 ) cat("Additionally, ")
     cat("performance of not all algorithms has been observed for all cases. Therefore, missings have been inserted in the following cases:")
     print(knitr::kable(as.data.frame(attr(object$data,"missingData")[[1]])))
    }
   
   if (nrow(attr(object$data,"missingData")[[1]])>0 | attr(object$data,"n.missing")[[1]]>0)  {
     if (is.numeric(attr(object$data,"na.treat"))) cat("All missings have been replaced by values of", attr(object$data,"na.treat"),".\n")
     else if (is.character(attr(object$data,"na.treat")) && attr(object$data,"na.treat")=="na.rm") cat("All missings have been removed.")
     else if (is.function(attr(object$data,"na.treat"))) {
       cat("Missings have been replaced using function ")
       print(attr(object$data,"na.treat"))
     }
       else if (is.character(object$FUN.list[[1]]) && object$FUN.list[[1]]=="rank") cat("Missings lead to the algorithm ranked last for the missing case.")
   }
   cat("\n\nRanking:")
   
   x=object$matlist[[1]]
   print(knitr::kable(x[order(x$rank),]))
 }
 
 ```
 
 \bigskip
 
 
 ```{r, child=if (isMultiTask) system.file("appdir", "consensusRanking.Rmd", package="challengeR")}
 
 ```
 
 
 # Visualization of raw assessment data
 
 ```{r,results='asis'}
 
 if (isMultiTask) {
   cat("The algorithms are ordered according to the computed ranks for each task.")
 }
 
 ```
 
 ## Dot- and boxplot
 
 *Dot- and boxplots* for visualizing raw assessment data separately for each algorithm. Boxplots representing descriptive statistics over all cases (median, quartiles and outliers) are combined with horizontally jittered dots representing individual cases.
 
 \bigskip
 
 ```{r boxplots}
 boxplot(object, size=.8)
 ```
 
 
 
 ## Podium plot
 *Podium plots* (see also Eugster et al., 2008) for visualizing raw assessment data. Upper part (spaghetti plot): Participating algorithms are color-coded, and each colored dot in the plot represents a metric value achieved with the respective algorithm. The actual metric value is encoded by the y-axis. Each podium (here: $p$=`r length(ordering_consensus)`) represents one possible rank, ordered from best (1) to last (here: `r length(ordering_consensus)`). The assignment of metric values (i.e. colored dots) to one of the podiums is based on the rank that the respective algorithm achieved on the corresponding case. Note that the plot part above each podium place is further subdivided into $p$ "columns", where each column represents one participating algorithm (here: $p=$ `r length(ordering_consensus)`).  Dots corresponding to identical cases are connected by a line, leading to the shown spaghetti structure. Lower part: Bar charts represent the relative frequency for each algorithm to achieve the rank encoded by the podium place. 
 
 ```{r, include=FALSE, fig.keep="none",dev=NULL}
 plot.new()
 algs=ordering_consensus
 l=legend("topright", 
          paste0(1:length(algs),": ",algs), 
          lwd = 1, cex=1.4,seg.len=1.1,
          title="Rank: Alg.",
          plot=F) 
 
 w <- grconvertX(l$rect$w, to='ndc') - grconvertX(0, to='ndc')
 h<- grconvertY(l$rect$h, to='ndc') - grconvertY(0, to='ndc')
 addy=max(grconvertY(l$rect$h,"user","inches"),6)
 ```
 
 
 ```{r podium,eval=T,fig.width=12, fig.height=addy}
 #c(bottom, left, top, right
 
 op<-par(pin=c(par()$pin[1],6),
         omd=c(0, 1-w, 0, 1),
         mar=c(par('mar')[1:3], 0)+c(-.5,0.5,-.5,0),
         cex.axis=1.5,
         cex.lab=1.5,
         cex.main=1.7)
 
 oh=grconvertY(l$rect$h,"user","lines")-grconvertY(6,"inches","lines")
 if (oh>0) par(oma=c(oh,0,0,0))
 
 
 set.seed(38)
 podium(object,
        col=cols,
        lines.show = T, lines.alpha = .4,
        dots.cex=.9,
        ylab="Metric value",
        layout.heights=c(1,.35),
        legendfn = function(algs, cols) {
                  legend(par('usr')[2], par('usr')[4], 
                  xpd=NA, 
                  paste0(1:length(algs),": ",algs), 
                  lwd = 1, col =  cols, 
                  bg = NA,
                  cex=1.4, seg.len=1.1,
                  title="Rank: Alg.") 
         }
       )
 par(op)
   
 ```
 
 
 ## Ranking heatmap
 *Ranking heatmaps* for visualizing raw assessment data. Each cell $\left( i, A_j \right)$ shows the absolute frequency of cases in which algorithm $A_j$ achieved rank $i$.
 
 \bigskip
 
 ```{r rankingHeatmap,fig.width=9, fig.height=9,out.width='70%'}
 rankingHeatmap(object)
 ```
 
 
 
 # Visualization of ranking stability
 
 ```{r, child=if (bootstrappingEnabled) system.file("appdir", "visualizationBlobPlots.Rmd", package="challengeR")}
 
 ```
 
 ```{r, child=if (bootstrappingEnabled) system.file("appdir", "visualizationViolinPlots.Rmd", package="challengeR")}
 
 ```
 
 
 ## *Significance maps* for visualizing ranking stability based on statistical significance
 
 *Significance maps* depict incidence matrices of
 pairwise significant test results for the one-sided Wilcoxon signed rank test at a 5\% significance level with adjustment for multiple testing according to Holm. Yellow shading indicates that metric values of the algorithm on the x-axis were significantly superior to those from the algorithm on the y-axis, blue color indicates no significant difference.
 
 
 \bigskip
 
 ```{r significancemap,fig.width=6, fig.height=6,out.width='200%'}
 significanceMap(object,alpha=0.05,p.adjust.method="holm")
 ```
 
 <!-- \subsubsection{Hasse diagram} -->
 
 <!-- ```{r single_stability_significance_hasse, fig.height=19} -->
 <!-- plot(relensemble) -->
 <!-- ``` -->
 
 
 
 
 ## Ranking robustness to ranking methods
 *Line plots* for visualizing rankings robustness across different ranking methods. Each algorithm is represented by one colored line. For each ranking method encoded on the x-axis, the height of the line represents the corresponding rank. Horizontal lines indicate identical ranks for all methods.
 
 \bigskip
 
 ```{r lineplot,fig.width=8, fig.height=6,out.width='95%'}
 if (length(object$matlist)<=6 &nrow((object$matlist[[1]]))<=10 ){
   methodsplot(challenge_multiple,
               ordering = ordering_consensus,
               na.treat=object$call[[1]][[1]]$na.treat) + 
     scale_color_manual(values=cols)
 }else {
   x=challenge_multiple
   for (subt in names(challenge_multiple)){
      dd=as.challenge(x[[subt]],
                      value=attr(x,"value"), 
                      algorithm=attr(x,"algorithm") ,
                      case=attr(x,"case"),
                      annotator = attr(x,"annotator"), 
                      by=attr(x,"by"),
                      smallBetter = !attr(x,"largeBetter"),
                      na.treat=object$call[[1]][[1]]$na.treat
                      )
  
     print(methodsplot(dd,
                       ordering = ordering_consensus) + 
             scale_color_manual(values=cols)
           )
   }
 }
 ```
 
 
 
 ```{r, child=if (isMultiTask) system.file("appdir", "visualizationAcrossTasks.Rmd", package="challengeR")}
 
 ```
 
 
 
 # References
 
 Wiesenfarth, M., Reinke, A., Landman, B.A., Cardoso, M.J., Maier-Hein, L. and Kopp-Schneider, A. (2019). Methods and open-source toolkit for analyzing and visualizing challenge results. *arXiv preprint arXiv:1910.05121*
 
 M. J. A. Eugster, T. Hothorn, and F. Leisch, “Exploratory
 and inferential analysis of benchmark experiments,”
 Institut fuer Statistik, Ludwig-Maximilians-Universitaet Muenchen, Germany, Technical Report 30,
 2008. [Online]. Available: http://epub.ub.uni-muenchen.de/4134/.
diff --git a/tests/testthat/test-aggregateThenRank.R b/tests/testthat/test-aggregateThenRank.R
index 9105c1c..1d4cd5c 100644
--- a/tests/testthat/test-aggregateThenRank.R
+++ b/tests/testthat/test-aggregateThenRank.R
@@ -1,346 +1,346 @@
-test_that("aggregate-than-rank by mean works with two algorithms for one case, small values are better", {
+test_that("aggregate-then-rank by mean works with two algorithms for one case, small values are better", {
   data <- rbind(
     data.frame(algo="A1", value=0.6, case="C1"),
     data.frame(algo="A2", value=0.8, case="C1"))
 
   challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE)
 
   ranking <- challenge%>%aggregateThenRank(FUN = mean)
 
   expectedRanking <- rbind(
     "A1" = data.frame(value_mean = 0.6, rank = 1),
     "A2" = data.frame(value_mean = 0.8, rank = 2))
 
   expect_equal(ranking$matlist$T1, expectedRanking)
 })
 
-test_that("aggregate-than-rank by mean works with two algorithms (reverse order) for one case, small values are better", {
+test_that("aggregate-then-rank by mean works with two algorithms (reverse order) for one case, small values are better", {
   data <- rbind(
             data.frame(algo = "A2", value = 0.8, case = "C1"),
             data.frame(algo = "A1", value = 0.6, case = "C1"))
 
   challenge <- as.challenge(data, taskName="T1", algorithm = "algo", case = "case", value = "value", smallBetter = TRUE)
 
   ranking <- challenge%>%aggregateThenRank(FUN = mean)
 
   expectedRanking <- rbind("A2" = data.frame(value_mean = 0.8, rank = 2),
                            "A1" = data.frame(value_mean = 0.6, rank = 1))
 
   expect_equal(ranking$matlist$T1, expectedRanking)
 })
 
-test_that("aggregate-than-rank by mean works with two algorithms for one case, large values are better", {
+test_that("aggregate-then-rank by mean works with two algorithms for one case, large values are better", {
   data <- rbind(
     data.frame(algo="A1", value=0.6, case="C1"),
     data.frame(algo="A2", value=0.8, case="C1"))
 
   challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = FALSE)
 
   ranking <- challenge%>%aggregateThenRank(FUN = mean)
 
   expectedRanking <- rbind(
     "A1" = data.frame(value_mean = 0.6, rank = 2),
     "A2" = data.frame(value_mean = 0.8, rank = 1))
 
   expect_equal(ranking$matlist$T1, expectedRanking)
 })
 
-test_that("aggregate-than-rank by mean works with two algorithms (reverse order) for one case, large values are better", {
+test_that("aggregate-then-rank by mean works with two algorithms (reverse order) for one case, large values are better", {
   data <- rbind(
     data.frame(algo = "A2", value = 0.8, case = "C1"),
     data.frame(algo = "A1", value = 0.6, case = "C1"))
 
   challenge <- as.challenge(data, taskName="T1", algorithm = "algo", case = "case", value = "value", smallBetter = FALSE)
 
   ranking <- challenge%>%aggregateThenRank(FUN = mean)
 
   expectedRanking <- rbind("A2" = data.frame(value_mean = 0.8, rank = 1),
                            "A1" = data.frame(value_mean = 0.6, rank = 2))
 
   expect_equal(ranking$matlist$T1, expectedRanking)
 })
 
-test_that("aggregate-than-rank raises error for invalid aggregation function", {
+test_that("aggregate-then-rank raises error for invalid aggregation function", {
   data <- rbind(
     data.frame(algo="A1", value=0.6, case="C1"),
     data.frame(algo="A2", value=0.6, case="C1"))
 
   challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE)
 
   expect_error(challenge%>%aggregateThenRank(FUN = meanx),
                "object 'meanx' not found", fixed = TRUE)
 })
 
-test_that("aggregate-than-rank by mean works with two algorithms for one case and 'min' as ties method", {
+test_that("aggregate-then-rank by mean works with two algorithms for one case and 'min' as ties method", {
   data <- rbind(
     data.frame(algo="A1", value=0.6, case="C1"),
     data.frame(algo="A2", value=0.6, case="C1"))
 
   challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE)
 
   ranking <- challenge%>%aggregateThenRank(FUN = mean, ties.method = "min")
 
   expectedRanking <- rbind(
     "A1" = data.frame(value_mean = 0.6, rank = 1),
     "A2" = data.frame(value_mean = 0.6, rank = 1))
 
   expect_equal(ranking$matlist$T1, expectedRanking)
 })
 
-test_that("aggregate-than-rank by mean works with two algorithms for one case and 'max' as ties method", {
+test_that("aggregate-then-rank by mean works with two algorithms for one case and 'max' as ties method", {
   data <- rbind(
     data.frame(algo="A1", value=0.6, case="C1"),
     data.frame(algo="A2", value=0.6, case="C1"))
 
   challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE)
 
   ranking <- challenge%>%aggregateThenRank(FUN = mean, ties.method = "max")
 
   expectedRanking <- rbind(
     "A1" = data.frame(value_mean = 0.6, rank = 2),
     "A2" = data.frame(value_mean = 0.6, rank = 2))
 
   expect_equal(ranking$matlist$T1, expectedRanking)
 })
 
-test_that("aggregate-than-rank raises error for invalid ties method", {
+test_that("aggregate-then-rank raises error for invalid ties method", {
   data <- rbind(
     data.frame(algo="A1", value=0.6, case="C1"),
     data.frame(algo="A2", value=0.6, case="C1"))
 
   challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE)
 
   expect_error(challenge%>%aggregateThenRank(FUN = mean, ties.method = "maxx"),
                "'arg' should be one of \"average\", \"first\", \"last\", \"random\", \"max\", \"min\"", fixed = TRUE)
 })
 
-test_that("aggregate-than-rank raises error for invalid ties method even when no ties present", {
+test_that("aggregate-then-rank raises error for invalid ties method even when no ties present", {
   data <- rbind(
     data.frame(algo="A1", value=0.6, case="C1"),
     data.frame(algo="A2", value=0.8, case="C1"))
 
   challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE)
 
   expect_error(challenge%>%aggregateThenRank(FUN = mean, ties.method = "maxx"),
                "'arg' should be one of \"average\", \"first\", \"last\", \"random\", \"max\", \"min\"", fixed = TRUE)
 })
 
-test_that("aggregate-than-rank by mean works with two algorithms for two cases", {
+test_that("aggregate-then-rank by mean works with two algorithms for two cases", {
   data <- rbind(
     data.frame(algo="A1", value=0.6, case="C1"),
     data.frame(algo="A1", value=0.4, case="C2"),
     data.frame(algo="A2", value=0.8, case="C1"),
     data.frame(algo="A2", value=1.0, case="C2"))
 
   challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE)
 
   ranking <- challenge%>%aggregateThenRank(FUN = mean)
 
   expectedRanking <- rbind(
     "A1" = data.frame(value_mean = 0.5, rank = 1),
     "A2" = data.frame(value_mean = 0.9, rank = 2))
 
   expect_equal(ranking$matlist$T1, expectedRanking)
 })
 
-test_that("aggregate-than-rank by median works with two algorithms for two cases", {
+test_that("aggregate-then-rank by median works with two algorithms for two cases", {
   data <- rbind(
     data.frame(algo="A1", value=0.6, case="C1"),
     data.frame(algo="A1", value=0.4, case="C2"),
     data.frame(algo="A2", value=0.8, case="C1"),
     data.frame(algo="A2", value=1.0, case="C2"))
 
   challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE)
 
   ranking <- challenge%>%aggregateThenRank(FUN = median)
 
   expectedRanking <- rbind(
     "A1" = data.frame(value_median = 0.5, rank = 1),
     "A2" = data.frame(value_median = 0.9, rank = 2))
 
   expect_equal(ranking$matlist$T1, expectedRanking)
 })
 
-test_that("aggregate-than-rank by mean works with one algorithm for one case", {
+test_that("aggregate-then-rank by mean works with one algorithm for one case", {
   data <- rbind(
     data.frame(algo="A1", value=0.6, case="C1"))
 
   challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE)
 
   ranking <- challenge%>%aggregateThenRank(FUN = mean)
 
   expectedRanking <- rbind(
     "A1" = data.frame(value_mean = 0.6, rank = 1))
 
   expect_equal(ranking$matlist$T1, expectedRanking)
 })
 
-test_that("aggregate-than-rank raises error when no NA treatment specified but NAs are contained", {
+test_that("aggregate-then-rank raises error when no NA treatment specified but NAs are contained", {
   data <- rbind(
     data.frame(algo="A1", value=NA, case="C1"),
     data.frame(algo="A2", value=0.8, case="C1"))
 
   challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = FALSE)
 
   expect_error(challenge%>%aggregateThenRank(FUN = mean),
                "argument \"na.treat\" is missing, with no default", fixed = TRUE)
 })
 
-test_that("aggregate-than-rank raises error when invalid NA treatment specified and NAs are contained", {
+test_that("aggregate-then-rank raises error when invalid NA treatment specified and NAs are contained", {
   data <- rbind(
     data.frame(algo="A1", value=NA, case="C1"),
     data.frame(algo="A2", value=0.8, case="C1"))
 
   challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = FALSE)
 
   expect_error(challenge%>%aggregateThenRank(FUN = mean, na.treat = "na.rmx"),
                "Argument 'na.treat' is invalid. It can be 'na.rm', numeric value or function.", fixed = TRUE)
 })
 
 test_that("specified NA treatment does not influence ranking when no NAs are contained", {
   data <- rbind(
     data.frame(algo="A1", value=0.6, case="C1"),
     data.frame(algo="A2", value=0.8, case="C1"))
 
   challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = FALSE)
 
   ranking <- challenge%>%aggregateThenRank(FUN = mean, na.treat = 0)
 
   expectedRanking <- rbind(
     "A1" = data.frame(value_mean = 0.6, rank = 2),
     "A2" = data.frame(value_mean = 0.8, rank = 1))
 
   expect_equal(ranking$matlist$T1, expectedRanking)
 })
 
 test_that("NAs are replaced by numeric value", {
   data <- rbind(
     data.frame(algo="A1", value=NA, case="C1"),
     data.frame(algo="A2", value=0.8, case="C1"))
 
   challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = FALSE)
 
   ranking <- challenge%>%aggregateThenRank(FUN = mean, na.treat = 0)
 
   expectedRanking <- rbind(
     "A1" = data.frame(value_mean = 0.0, rank = 2),
     "A2" = data.frame(value_mean = 0.8, rank = 1))
 
   expect_equal(ranking$matlist$T1, expectedRanking)
 })
 
 test_that("NAs are replaced by function value", {
   data <- rbind(
     data.frame(algo="A1", value=NA, case="C1"),
     data.frame(algo="A2", value=0.8, case="C1"))
 
   replacementFunction <- function(x) { -1 }
 
   challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = FALSE)
 
   ranking <- challenge%>%aggregateThenRank(FUN = mean, na.treat = replacementFunction)
 
   expectedRanking <- rbind(
     "A1" = data.frame(value_mean = -1.0, rank = 2),
     "A2" = data.frame(value_mean = 0.8, rank = 1))
 
   expect_equal(ranking$matlist$T1, expectedRanking)
 })
 
 test_that("NAs are removed", {
   data <- rbind(
     data.frame(algo="A1", value=NA, case="C1"),
     data.frame(algo="A2", value=0.8, case="C1"))
 
   challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = FALSE)
 
   ranking <- challenge%>%aggregateThenRank(FUN = mean, na.treat = "na.rm")
 
   expectedRanking <- rbind(
     "A2" = data.frame(value_mean = 0.8, rank = 1))
 
   expect_equal(ranking$matlist$T1, expectedRanking)
 })
 
-test_that("aggregate-than-rank by mean works for multi-task challenge (2 tasks in data set), no missing data", {
+test_that("aggregate-then-rank by mean works for multi-task challenge (2 tasks in data set), no missing data", {
   dataTask1 <- cbind(task="T1",
                      rbind(
                        data.frame(algo="A1", value=0.6, case="C1"),
                        data.frame(algo="A2", value=0.8, case="C1")
                      ))
 
   dataTask2 <- cbind(task="T2",
                      rbind(
                        data.frame(algo="A1", value=0.5, case="C1"),
                        data.frame(algo="A2", value=0.4, case="C1")
                      ))
 
   data <- rbind(dataTask1, dataTask2)
 
   challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter = TRUE)
 
   ranking <- challenge%>%aggregateThenRank(FUN = mean)
 
   expectedRankingTask1 <- rbind(
     "A1" = data.frame(value_mean = 0.6, rank = 1),
     "A2" = data.frame(value_mean = 0.8, rank = 2))
 
   expectedRankingTask2 <- rbind(
     "A1" = data.frame(value_mean = 0.5, rank = 2),
     "A2" = data.frame(value_mean = 0.4, rank = 1))
 
   expect_equal(ranking$matlist$T1, expectedRankingTask1)
   expect_equal(ranking$matlist$T2, expectedRankingTask2)
 })
 
 test_that("NAs are replaced by numeric value in multi-task challenge (2 tasks in data set)", {
   dataTask1 <- cbind(task="T1",
                      rbind(
                        data.frame(algo="A1", value=0.6, case="C1"),
                        data.frame(algo="A2", value=0.8, case="C1")
                      ))
 
   dataTask2 <- cbind(task="T2",
                      rbind(
                        data.frame(algo="A1", value=NA, case="C1"),
                        data.frame(algo="A2", value=0.4, case="C1")
                      ))
 
   data <- rbind(dataTask1, dataTask2)
 
   challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter = TRUE)
 
   ranking <- challenge%>%aggregateThenRank(FUN = mean, na.treat = 100)
 
   expectedRankingTask1 <- rbind(
     "A1" = data.frame(value_mean = 0.6, rank = 1),
     "A2" = data.frame(value_mean = 0.8, rank = 2))
 
   expectedRankingTask2 <- rbind(
     "A1" = data.frame(value_mean = 100.0, rank = 2),
     "A2" = data.frame(value_mean = 0.4, rank = 1))
 
   expect_equal(ranking$matlist$T1, expectedRankingTask1)
   expect_equal(ranking$matlist$T2, expectedRankingTask2)
 })
 
-test_that("aggregate-than-rank raises error when no NA treatment specified but NAs are contained in multi-task challenge (2 tasks in data set)", {
+test_that("aggregate-then-rank raises error when no NA treatment specified but NAs are contained in multi-task challenge (2 tasks in data set)", {
   dataTask1 <- cbind(task="T1",
                      rbind(
                        data.frame(algo="A1", value=0.6, case="C1"),
                        data.frame(algo="A2", value=0.8, case="C1")
                      ))
 
   dataTask2 <- cbind(task="T2",
                      rbind(
                        data.frame(algo="A1", value=NA, case="C1"),
                        data.frame(algo="A2", value=0.4, case="C1")
                      ))
 
   data <- rbind(dataTask1, dataTask2)
 
   challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter = TRUE)
 
   expect_error(challenge%>%aggregateThenRank(FUN = mean),
                "argument \"na.treat\" is missing, with no default", fixed = TRUE)
 })
diff --git a/tests/testthat/test-testThenRank.R b/tests/testthat/test-testThenRank.R
new file mode 100644
index 0000000..060cef5
--- /dev/null
+++ b/tests/testthat/test-testThenRank.R
@@ -0,0 +1,346 @@
+test_that("test-then-rank raises warning for one case", {
+  data <- rbind(
+    data.frame(algo="A1", value=0.6, case="C1"),
+    data.frame(algo="A2", value=0.8, case="C1"))
+
+  challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE)
+
+  expect_warning(ranking <- challenge%>%testThenRank(),
+                 "Only one case in task.", fixed = TRUE)
+
+  expectedRanking <- rbind(
+    "A1" = data.frame(prop_significance = 0, rank = 1),
+    "A2" = data.frame(prop_significance = 0, rank = 1))
+
+  expect_equal(ranking$matlist$T1, expectedRanking)
+})
+
+test_that("test-then-rank raises warning for one algorithm", {
+  data <- rbind(
+    data.frame(algo="A1", value=0.6, case="C1"))
+
+  challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE)
+
+  expect_warning(ranking <- challenge%>%testThenRank(),
+                 "Only one algorithm available in task 'T1'.", fixed = TRUE)
+})
+
+test_that("test-then-rank works with two algorithms, small values are better", {
+  data <- rbind(
+    data.frame(algo="A1", value=0.2, case="C1"),
+    data.frame(algo="A1", value=0.2, case="C2"),
+    data.frame(algo="A1", value=0.2, case="C3"),
+    data.frame(algo="A1", value=0.2, case="C4"),
+    data.frame(algo="A2", value=1.0, case="C1"),
+    data.frame(algo="A2", value=1.0, case="C2"),
+    data.frame(algo="A2", value=1.0, case="C3"),
+    data.frame(algo="A2", value=1.0, case="C4"))
+
+  challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE)
+
+  ranking <- challenge%>%testThenRank()
+
+  expectedRanking <- rbind(
+    "A1" = data.frame(prop_significance = 1, rank = 1),
+    "A2" = data.frame(prop_significance = 0, rank = 2))
+
+  expect_equal(ranking$matlist$T1, expectedRanking)
+})
+
+test_that("test-then-rank works with two algorithms, large values are better", {
+  data <- rbind(
+    data.frame(algo="A1", value=0.2, case="C1"),
+    data.frame(algo="A1", value=0.2, case="C2"),
+    data.frame(algo="A1", value=0.2, case="C3"),
+    data.frame(algo="A1", value=0.2, case="C4"),
+    data.frame(algo="A2", value=1.0, case="C1"),
+    data.frame(algo="A2", value=1.0, case="C2"),
+    data.frame(algo="A2", value=1.0, case="C3"),
+    data.frame(algo="A2", value=1.0, case="C4"))
+
+  challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = FALSE)
+
+  ranking <- challenge%>%testThenRank()
+
+  expectedRanking <- rbind(
+    "A1" = data.frame(prop_significance = 0, rank = 2),
+    "A2" = data.frame(prop_significance = 1, rank = 1))
+
+  expect_equal(ranking$matlist$T1, expectedRanking)
+})
+
+test_that("test-then-rank works for ties method 'max'", {
+  data <- rbind(
+    data.frame(algo="A1", value=0.6, case="C1"),
+    data.frame(algo="A1", value=0.6, case="C2"),
+    data.frame(algo="A2", value=0.8, case="C1"),
+    data.frame(algo="A2", value=0.8, case="C2"))
+
+  challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE)
+
+  ranking <- challenge%>%testThenRank(ties.method = "max")
+
+  expectedRanking <- rbind(
+    "A1" = data.frame(prop_significance = 0, rank = 2),
+    "A2" = data.frame(prop_significance = 0, rank = 2))
+
+  expect_equal(ranking$matlist$T1, expectedRanking)
+})
+
+test_that("test-then-rank raises error for invalid ties method", {
+  data <- rbind(
+    data.frame(algo="A1", value=0.6, case="C1"),
+    data.frame(algo="A1", value=0.6, case="C2"),
+    data.frame(algo="A2", value=0.8, case="C1"),
+    data.frame(algo="A2", value=0.8, case="C2"))
+
+  challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE)
+
+  expect_error(challenge%>%testThenRank(ties.method = "maxx"),
+               "'arg' should be one of \"average\", \"first\", \"last\", \"random\", \"max\", \"min\"", fixed = TRUE)
+})
+
+test_that("test-then-rank raises error for invalid ties method even when no ties present", {
+  data <- rbind(
+    data.frame(algo="A1", value=0.2, case="C1"),
+    data.frame(algo="A1", value=0.2, case="C2"),
+    data.frame(algo="A1", value=0.2, case="C3"),
+    data.frame(algo="A1", value=0.2, case="C4"),
+    data.frame(algo="A2", value=1.0, case="C1"),
+    data.frame(algo="A2", value=1.0, case="C2"),
+    data.frame(algo="A2", value=1.0, case="C3"),
+    data.frame(algo="A2", value=1.0, case="C4"))
+
+  challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE)
+
+  expect_error(challenge%>%testThenRank(ties.method = "maxx"),
+               "'arg' should be one of \"average\", \"first\", \"last\", \"random\", \"max\", \"min\"", fixed = TRUE)
+})
+
+test_that("test-then-rank raises error when no NA treatment specified but NAs are contained", {
+  data <- rbind(
+    data.frame(algo="A1", value=NA, case="C1"),
+    data.frame(algo="A1", value=0.2, case="C2"),
+    data.frame(algo="A1", value=0.2, case="C3"),
+    data.frame(algo="A1", value=0.2, case="C4"),
+    data.frame(algo="A2", value=1.0, case="C1"),
+    data.frame(algo="A2", value=1.0, case="C2"),
+    data.frame(algo="A2", value=1.0, case="C3"),
+    data.frame(algo="A2", value=1.0, case="C4"))
+
+  challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE)
+
+  expect_error(challenge%>%testThenRank(),
+               "argument \"na.treat\" is missing, with no default", fixed = TRUE)
+})
+
+test_that("test-then-rank raises error when invalid NA treatment specified and NAs are contained", {
+  data <- rbind(
+    data.frame(algo="A1", value=NA, case="C1"),
+    data.frame(algo="A1", value=0.2, case="C2"),
+    data.frame(algo="A1", value=0.2, case="C3"),
+    data.frame(algo="A1", value=0.2, case="C4"),
+    data.frame(algo="A2", value=1.0, case="C1"),
+    data.frame(algo="A2", value=1.0, case="C2"),
+    data.frame(algo="A2", value=1.0, case="C3"),
+    data.frame(algo="A2", value=1.0, case="C4"))
+
+  challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE)
+
+  expect_error(challenge%>%testThenRank(na.treat = "na.rmx"),
+               "Argument 'na.treat' is invalid. It can be 'na.rm', numeric value or function.", fixed = TRUE)
+})
+
+test_that("specified NA treatment does not influence ranking when no NAs are contained", {
+  data <- rbind(
+    data.frame(algo="A1", value=0.2, case="C1"),
+    data.frame(algo="A1", value=0.2, case="C2"),
+    data.frame(algo="A1", value=0.2, case="C3"),
+    data.frame(algo="A1", value=0.2, case="C4"),
+    data.frame(algo="A2", value=1.0, case="C1"),
+    data.frame(algo="A2", value=1.0, case="C2"),
+    data.frame(algo="A2", value=1.0, case="C3"),
+    data.frame(algo="A2", value=1.0, case="C4"))
+
+  challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE)
+
+  ranking <- challenge%>%testThenRank(na.treat = 0)
+
+  expectedRanking <- rbind(
+    "A1" = data.frame(prop_significance = 1, rank = 1),
+    "A2" = data.frame(prop_significance = 0, rank = 2))
+
+  expect_equal(ranking$matlist$T1, expectedRanking)
+})
+
+test_that("NAs are replaced by numeric value", {
+  data <- rbind(
+    data.frame(algo="A1", value=NA, case="C1"),
+    data.frame(algo="A1", value=0.2, case="C2"),
+    data.frame(algo="A1", value=0.2, case="C3"),
+    data.frame(algo="A1", value=0.2, case="C4"),
+    data.frame(algo="A2", value=1.0, case="C1"),
+    data.frame(algo="A2", value=1.0, case="C2"),
+    data.frame(algo="A2", value=1.0, case="C3"),
+    data.frame(algo="A2", value=1.0, case="C4"))
+
+  challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE)
+
+  ranking <- challenge%>%testThenRank(na.treat = 100.0)
+
+  expectedRanking <- rbind(
+    "A1" = data.frame(prop_significance = 0, rank = 1),
+    "A2" = data.frame(prop_significance = 0, rank = 1))
+
+  expect_equal(ranking$matlist$T1, expectedRanking)
+})
+
+test_that("NAs are replaced by function value", {
+  data <- rbind(
+    data.frame(algo="A1", value=NA, case="C1"),
+    data.frame(algo="A1", value=0.2, case="C2"),
+    data.frame(algo="A1", value=0.2, case="C3"),
+    data.frame(algo="A1", value=0.2, case="C4"),
+    data.frame(algo="A2", value=1.0, case="C1"),
+    data.frame(algo="A2", value=1.0, case="C2"),
+    data.frame(algo="A2", value=1.0, case="C3"),
+    data.frame(algo="A2", value=1.0, case="C4"))
+
+  replacementFunction <- function(x) { 0.0 }
+
+  challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE)
+
+  ranking <- challenge%>%testThenRank(na.treat = replacementFunction)
+
+  expectedRanking <- rbind(
+    "A1" = data.frame(prop_significance = 1, rank = 1),
+    "A2" = data.frame(prop_significance = 0, rank = 2))
+
+  expect_equal(ranking$matlist$T1, expectedRanking)
+})
+
+test_that("NAs are removed", {
+  data <- rbind(
+    data.frame(algo="A1", value=NA, case="C1"),
+    data.frame(algo="A1", value=0.2, case="C2"),
+    data.frame(algo="A1", value=0.2, case="C3"),
+    data.frame(algo="A1", value=0.2, case="C4"),
+    data.frame(algo="A2", value=1.0, case="C1"),
+    data.frame(algo="A2", value=1.0, case="C2"),
+    data.frame(algo="A2", value=1.0, case="C3"),
+    data.frame(algo="A2", value=1.0, case="C4"))
+
+  challenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter = TRUE)
+
+  ranking <- challenge%>%testThenRank(na.treat = "na.rm")
+
+  expectedRanking <- rbind(
+    "A1" = data.frame(prop_significance = 0, rank = 1),
+    "A2" = data.frame(prop_significance = 0, rank = 1))
+
+  expect_equal(ranking$matlist$T1, expectedRanking)
+})
+
+test_that("test-then-rank works for multi-task data set with no missing data", {
+  dataTask1 <- cbind(task="T1",
+                     rbind(
+                       data.frame(algo="A1", value=0.2, case="C1"),
+                       data.frame(algo="A1", value=0.2, case="C2"),
+                       data.frame(algo="A1", value=0.2, case="C3"),
+                       data.frame(algo="A1", value=0.2, case="C4"),
+                       data.frame(algo="A2", value=1.0, case="C1"),
+                       data.frame(algo="A2", value=1.0, case="C2"),
+                       data.frame(algo="A2", value=1.0, case="C3"),
+                       data.frame(algo="A2", value=1.0, case="C4")
+                     ))
+  dataTask2 <- cbind(task="T2",
+                     rbind(
+                       data.frame(algo="A1", value=0.6, case="C1"),
+                       data.frame(algo="A1", value=0.6, case="C2"),
+                       data.frame(algo="A2", value=0.8, case="C1"),
+                       data.frame(algo="A2", value=0.8, case="C2")
+                     ))
+
+  data <- rbind(dataTask1, dataTask2)
+
+  challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter = TRUE)
+
+  ranking <- challenge%>%testThenRank()
+
+  expectedRankingTask1 <- rbind(
+    "A1" = data.frame(prop_significance = 1, rank = 1),
+    "A2" = data.frame(prop_significance = 0, rank = 2))
+
+  expectedRankingTask2 <- rbind(
+    "A1" = data.frame(prop_significance = 0, rank = 1),
+    "A2" = data.frame(prop_significance = 0, rank = 1))
+
+  expect_equal(ranking$matlist$T1, expectedRankingTask1)
+  expect_equal(ranking$matlist$T2, expectedRankingTask2)
+})
+
+test_that("NAs are replaced by numeric value in multi-task data set", {
+  dataTask1 <- cbind(task="T1",
+                     rbind(
+                       data.frame(algo="A1", value=NA, case="C1"),
+                       data.frame(algo="A1", value=0.2, case="C2"),
+                       data.frame(algo="A1", value=0.2, case="C3"),
+                       data.frame(algo="A1", value=0.2, case="C4"),
+                       data.frame(algo="A2", value=1.0, case="C1"),
+                       data.frame(algo="A2", value=1.0, case="C2"),
+                       data.frame(algo="A2", value=1.0, case="C3"),
+                       data.frame(algo="A2", value=1.0, case="C4")
+                     ))
+  dataTask2 <- cbind(task="T2",
+                     rbind(
+                       data.frame(algo="A1", value=0.6, case="C1"),
+                       data.frame(algo="A1", value=0.6, case="C2"),
+                       data.frame(algo="A2", value=0.8, case="C1"),
+                       data.frame(algo="A2", value=0.8, case="C2")
+                     ))
+
+  data <- rbind(dataTask1, dataTask2)
+
+  challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter = TRUE)
+
+  ranking <- challenge%>%testThenRank(na.treat = 0)
+
+  expectedRankingTask1 <- rbind(
+    "A1" = data.frame(prop_significance = 1, rank = 1),
+    "A2" = data.frame(prop_significance = 0, rank = 2))
+
+  expectedRankingTask2 <- rbind(
+    "A1" = data.frame(prop_significance = 0, rank = 1),
+    "A2" = data.frame(prop_significance = 0, rank = 1))
+
+  expect_equal(ranking$matlist$T1, expectedRankingTask1)
+  expect_equal(ranking$matlist$T2, expectedRankingTask2)
+})
+
+test_that("test-then-rank raises error when no NA treatment specified but NAs are contained in multi-task data set", {
+  dataTask1 <- cbind(task="T1",
+                     rbind(
+                       data.frame(algo="A1", value=0.2, case="C1"),
+                       data.frame(algo="A1", value=0.2, case="C2"),
+                       data.frame(algo="A1", value=0.2, case="C3"),
+                       data.frame(algo="A1", value=0.2, case="C4"),
+                       data.frame(algo="A2", value=1.0, case="C1"),
+                       data.frame(algo="A2", value=1.0, case="C2"),
+                       data.frame(algo="A2", value=1.0, case="C3"),
+                       data.frame(algo="A2", value=1.0, case="C4")
+                     ))
+  dataTask2 <- cbind(task="T2",
+                     rbind(
+                       data.frame(algo="A1", value=0.6, case="C1"),
+                       data.frame(algo="A1", value=0.6, case="C2"),
+                       data.frame(algo="A2", value=NA, case="C1"),
+                       data.frame(algo="A2", value=0.8, case="C2")
+                     ))
+
+  data <- rbind(dataTask1, dataTask2)
+
+  challenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter = TRUE)
+
+  expect_error(challenge%>%testThenRank(),
+               "argument \"na.treat\" is missing, with no default", fixed = TRUE)
+})