diff --git a/R/Aggregate.R b/R/Aggregate.R index 23b9d59..2bf027c 100644 --- a/R/Aggregate.R +++ b/R/Aggregate.R @@ -1,96 +1,96 @@ Aggregate <- function(object,...) UseMethod("Aggregate") Aggregate.default <- function(object, ...) aggregate(object,...) #stats::aggregate Aggregate.list <-function(object, x, algorithm, FUN = mean, na.treat = "na.rm", parallel = FALSE, progress = "none", case, alpha = 0.05, p.adjust.method = "none", alternative = "one.sided", test.fun = function(x, y) wilcox.test(x, y, alternative = alternative, exact = FALSE, paired = TRUE)$p.value, - largeBetter = TRUE, # only needed for significance + smallBetter = FALSE, # only needed for significance ... ) { call=match.call(expand.dots = T) if (is.character(FUN) && FUN=="significance"){ - if(missing(case)| missing(largeBetter)| missing(alpha)) stop("If FUN='significance' arguments case, largeBetter and alpha need to be given") + if(missing(case)| missing(smallBetter)| missing(alpha)) stop("If FUN='significance' arguments case, smallBetter and alpha need to be given") matlist=llply(1:length(object), function(id){ piece=object[[id]] if (length(unique(piece[[algorithm]]))<=1){ warning("Only one algorithm available in task '", names(object)[id], "'.") return(data.frame("prop_significance"=rep(NA,length(unique(piece[[algorithm]]))), row.names = unique(piece[[algorithm]]))) } if (is.numeric(na.treat)) piece[,x][is.na(piece[,x])]=na.treat else if (is.function(na.treat)) piece[,x][is.na(piece[,x])]=na.treat(piece[,x][is.na(piece[,x])]) else if (na.treat=="na.rm") piece=piece[!is.na(piece[,x]),] else stop("Argument 'na.treat' is invalid. It can be 'na.rm', numeric value or function.") xmean <- significance(piece, x, algorithm, case, alpha, p.adjust.method=p.adjust.method, - largeBetter, + smallBetter, alternative=alternative, ...) class(xmean)=c("aggregated", class(xmean)) xmean }, .parallel=parallel, .progress=progress ) isSignificance=TRUE } else { if (is.function(FUN)) FUNname <-gsub('\")',"",gsub('UseMethod(\"',"",deparse(functionBody(FUN)),fixed = T),fixed=T) else if (is.character(FUN)) FUNname=FUN if (is.character(FUN)) FUN=try(eval(parse(text=FUN)), silent = T) if (!is.function(FUN)) stop("FUN has to be a function (possibly as character) or 'significance'") matlist=llply(object, function(piece){ if (is.numeric(na.treat)) piece[,x][is.na(piece[,x])]=na.treat else if (is.function(na.treat)) piece[,x][is.na(piece[,x])]=na.treat(piece[,x][is.na(piece[,x])]) else if (na.treat=="na.rm") piece=piece[!is.na(piece[,x]),] else stop("Argument 'na.treat' is invalid. It can be 'na.rm', numeric value or function.") xmean <- aggregate(piece[,x], by=list(piece[,algorithm]), FUN=function(z) do.call(FUN,args=list(x=z))) names(xmean)=c(algorithm, paste0(x,"_",FUNname)) rownames(xmean)=xmean[,1] xmean=xmean[,-1,drop=F] xmean }, .parallel=parallel, .progress=progress ) isSignificance=FALSE } names(matlist)=names(object) res=list(FUN = . %>% (call), FUN.list=list(FUN), call=list(call), data=object, matlist=matlist, isSignificance=isSignificance ) class(res)=c("aggregated.list",class(res)) res } diff --git a/R/Rank.R b/R/Rank.R index d7119eb..68cc837 100644 --- a/R/Rank.R +++ b/R/Rank.R @@ -1,74 +1,74 @@ Rank <- function(object,...) UseMethod("Rank") Rank.default <- function(object, ...) rank(object,...) 
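The `na.treat` dispatch inside `Aggregate.list` above accepts a numeric replacement value, a replacement function, or the string `"na.rm"`. A minimal, self-contained sketch of that behaviour (the `piece` data frame, its column names and the helper `treat_na` are illustrative only, not part of the package API):

```r
# Illustrative sketch of the na.treat dispatch used in Aggregate.list (toy data)
piece <- data.frame(algorithm = c("A1", "A2", "A3"),
                    x = c(0.8, NA, 0.6))

treat_na <- function(piece, x, na.treat) {
  if (is.numeric(na.treat))
    piece[, x][is.na(piece[, x])] <- na.treat                      # replace NAs by a constant
  else if (is.function(na.treat))
    piece[, x][is.na(piece[, x])] <- na.treat(piece[, x][is.na(piece[, x])])
  else if (identical(na.treat, "na.rm"))
    piece <- piece[!is.na(piece[, x]), ]                           # drop incomplete cases
  else
    stop("Argument 'na.treat' is invalid. It can be 'na.rm', a numeric value or a function.")
  piece
}

treat_na(piece, "x", 0)                                 # NA replaced by 0
treat_na(piece, "x", "na.rm")                           # row with NA removed
treat_na(piece, "x", function(z) rep(0.5, length(z)))   # function-based replacement
```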
#base::rank Rank.list <- function(object, x, annotator, ties.method="min", - largeBetter=FALSE, + smallBetter=TRUE, ...){ call=match.call(expand.dots = T) annotator.missing=missing(annotator) if (any(sapply(object, function(task) { (attr(object,"check") && - largeBetter && + smallBetter && any(is.na(task[[x]])) && min(task[[x]], na.rm=TRUE)==0) }) )) { message("There are missing metric values and metric values exactly equal to zero. Have some actually missing values been entered as zero in some instances? If yes, specify optional argument na.treat=0 in as.challenge().") } matlist=lapply(object, function(task){ if (annotator.missing){ res=bind_rows( lapply(split(task, task[[attr(object,"case")]]), function(task.case) cbind(task.case, rank=rankNA2(task.case[[x]], ties.method = ties.method, - largeBetter = largeBetter) + smallBetter = smallBetter) ) ) ) class(res)[2]="ranked" res } else { byAnnotator=split(task, as.list(task[,annotator])) temp=bind_rows( lapply(byAnnotator, function(annotator){ bind_rows( lapply(split(annotator, annotator[[attr(object,"case")]]), function(annotator.case) cbind(annotator.case, rank=rankNA2(annotator.case[[x]], ties.method = ties.method, - largeBetter = largeBetter) + smallBetter = smallBetter) ) ) ) } ) ) class(temp)[2]="ranked" temp } } ) res=list(FUN = . %>% (call), call=list(call), data=object, matlist=matlist) class(res)=c("ranked.list",class(res)) res } diff --git a/R/Rank.aggregated.list.R b/R/Rank.aggregated.list.R index 6d64223..d5d9637 100644 --- a/R/Rank.aggregated.list.R +++ b/R/Rank.aggregated.list.R @@ -1,73 +1,73 @@ rank.aggregated.list <-function(object, ties.method="min", - largeBetter, + smallBetter, ...){ - + call=match.call(expand.dots = F) - if (missing(largeBetter)){ - if (!is.null(attr(object$data,"largeBetter"))) largeBetter=attr(object$data,"largeBetter") - else stop("largeBetter has to be provided either in as.challenge() or rank()") + if (missing(smallBetter)){ + if (!is.null(attr(object$data,"smallBetter"))) smallBetter=attr(object$data,"smallBetter") + else stop("smallBetter has to be provided either in as.challenge() or rank()") + + if (object$isSignificance) smallBetter=FALSE # smallBetter already taken care of by one-sided test nature of significance + } - if (object$isSignificance) largeBetter=TRUE # smallBetter (largeBetter) already taken care of by one-sided test nature of signficance - } - call=call("rank.aggregated.list", object=call$object, ties.method=ties.method, - largeBetter=largeBetter) - + smallBetter=smallBetter) + matlist=object$matlist - matlist=lapply(matlist, + matlist=lapply(matlist, function(y){ if (nrow(y)>0) r=rankNA2(y[,ncol(y)], ties.method=ties.method, - largeBetter=largeBetter) + smallBetter=smallBetter) else r=NULL res=cbind(y,rank=r) res }) - + res=list(matlist=matlist, data=object$data, call=list(object$call,call), FUN = . %>% (object$FUN) %>% (call), FUN.list=c(object$FUN.list, "rank") ) class(res)=c("ranked.list",class(res)) res } rank.aggregatedRanks.list <-function(object, ties.method="min", ...){ - - call=match.call(expand.dots = F) + + call=match.call(expand.dots = F) call=call("rank.aggregatedRanks.list", object=call$object, ties.method=ties.method) matlist=object$matlist matlist=lapply(matlist, function(y){ if (nrow(y)>0) r=rankNA2(y[,ncol(y)], ties.method=ties.method, - largeBetter=FALSE) + smallBetter=TRUE) else r=NULL res=cbind(y,rank=r) res }) - + res=list(matlist=matlist, data=object$data, call=list(object$call,call), FUN = .
%>% (object$FUN) %>% (call), FUN.list=c(object$FUN.list, "rank") ) class(res)=c("ranked.list",class(res)) res res } diff --git a/R/aaggregate.R b/R/aaggregate.R index d1e8839..58f6628 100644 --- a/R/aaggregate.R +++ b/R/aaggregate.R @@ -1,167 +1,167 @@ test <- function(x,...) UseMethod("test") test.default <- function(x, ...) stop("not implemented for this class") test.challenge=function(x,...) aggregate.challenge(x=x, FUN="significance",...) #' Title #' #' @param x #' @param FUN #' @param na.treat #' @param alpha #' @param p.adjust.method #' @param parallel #' @param progress #' @param ... #' #' @return #' @export #' #' @examples aggregate.challenge=function(x, FUN=mean, na.treat, #either "na.rm", numeric value or function alpha=0.05, p.adjust.method="none",# only needed for significance parallel=FALSE, progress="none",...){ call=as.list(match.call()) if (missing(na.treat) && !is.null(attr(x,"na.treat"))) na.treat <- attr(x, "na.treat") - + if (missing(na.treat)){ #na.treat only optional if no missing values in data set if (!inherits(x,"list")){ if (!any(is.na(x[,attr(x, "value")]))) na.treat="na.rm" # there are no missings so set na.treat by dummy "na.rm" has no effect } else { if (!any(sapply(x, function(task) any(is.na(task[,attr(x, "value")]))))) na.treat="na.rm" # there are no missings so set na.treat by dummy "na.rm" has no effect } } else attr(x,"na.treat") <- na.treat res1=do.call("Aggregate",list(object=x, x=attr(x,"value"), algorithm=attr(x,"algorithm"), FUN=call$FUN, na.treat=na.treat, parallel=parallel, progress=progress, case=attr(x,"case"), alpha=alpha, p.adjust.method=p.adjust.method, - largeBetter=attr(x,"largeBetter") # only needed for significance + smallBetter=attr(x,"smallBetter") # only needed for significance )) call2=call("Aggregate", object=call$x, x=attr(x,"value"), algorithm=attr(x,"algorithm"), FUN=call$FUN, na.treat=na.treat, parallel=parallel,progress=progress, case=attr(x,"case"), alpha=alpha, p.adjust.method=p.adjust.method, - largeBetter=attr(x,"largeBetter") # only needed for significance + smallBetter=attr(x,"smallBetter") # only needed for significance ) if (inherits(x,"list")){ res=list(FUN = . %>% (call2), call=list(call2), FUN.list=list(FUN), data=x, matlist=res1$matlist, isSignificance=res1$isSignificance) class(res)=c("aggregated.list",class(res)) } else { res=list(FUN = . %>% (call2), call=list(call2), FUN.list=list(FUN), data=x, mat=res1$mat, isSignificance=res1$isSignificance) class(res)=c("aggregated",class(res)) } res } aggregate.ranked.list <-function(x, FUN=mean, ...){ call=match.call(expand.dots = F) call=call("aggregate.ranked.list", x=call$x, FUN=FUN) algorithm=attr(x$data,"algorithm") resmatlist=Aggregate.list(x$matlist, x="rank", algorithm=algorithm, FUN=FUN,...)$matlist resmatlist=lapply(resmatlist, function(z) as.data.frame(z)) res=list(matlist=resmatlist, call=c(x$call,call), data=x$data, FUN = . 
%>% (x$FUN) %>% (call), FUN.list=c(x$FUN.list,FUN) ) class(res)=c("aggregatedRanks.list",class(res)) res } aggregate.bootstrap.list <-function(x, what="metric", FUN=mean, ...){ call=match.call(expand.dots = T) if (is.character(FUN)) FUN=try(eval(parse(text=FUN)), silent = T) FUNname=as.character(call$FUN) if (!is.function(FUN)) stop("FUN has to be a function (possibly as character)") matlist=llply(1:length(x$bootsrappedRank), function(i.piece){ if (what=="ranks") xmean <- as.data.frame(apply(x$bootsrappedRank[[i.piece]],1,FUN=FUN)) else xmean <- as.data.frame(apply(x$bootsrappedAggregate[[i.piece]],1,FUN=FUN)) names(xmean)=paste0(what,"_",FUNname) xmean }) names(matlist)=names(x$bootsrappedRank) res=list(FUN = . %>% (call), call=list(call), data=x, matlist=matlist) class(res)=c("aggregated.list",class(res)) res } aggregate.bootstrap<-function(x,what="metric",FUN=mean, ... ){ call=match.call(expand.dots = T) if (is.character(FUN)) FUN=try(eval(parse(text=FUN)),silent = T) FUNname=as.character(call$FUN) if (!is.function(FUN)) stop("FUN has to be a function (possibly as character)") if (what=="ranks") xmean <- as.data.frame(apply(x$bootsrappedRank, 1, FUN=FUN)) else xmean <- as.data.frame(apply(x$bootsrappedAggregate, 1, FUN=FUN)) names(xmean)=paste0(what,"_",FUNname) res=list(FUN = . %>% (call), call=list(call), data=x, mat=xmean) class(res)=c("aggregated",class(res)) res } diff --git a/R/boxplot.R b/R/boxplot.R index 6fa8eab..e39f20c 100644 --- a/R/boxplot.R +++ b/R/boxplot.R @@ -1,80 +1,68 @@ boxplot.ranked.list=function(x, color="blue", jitter.width=0.25,...){ algo=attr(x$data,"algorithm") value=attr(x$data,"value") ranking=x x=x$data for (i in names(x)) { x[[i]][[algo]]=factor(x[[i]][[algo]], levels=rownames(ranking$matlist[[i]][order(ranking$matlist[[i]]$rank),])) } a=lapply(1:length(x),function(id){ ggplot(aes_string(algo,value),data=x[[id]])+ geom_jitter(position=position_jitter(width=jitter.width, height=0), color=color,...)+ geom_boxplot(outlier.shape = NA,fill=NA)+ ggtitle(names(x)[id]) + theme(axis.text.x=element_text(angle = -90, hjust = 0)) + xlab("Algorithm") + ylab("Metric value") }) # Remove title for single-task data set if (length(a) == 1) { a[[1]]$labels$title <- NULL } else { names(a) = names(x$matlist) } class(a) <- "ggList" a } boxplot.comparedRanks.list=function(x,...){ tau=sapply(x,function(z) z$tau) boxplot(tau,ylim=c(0,1.0),las=2, outline=FALSE, ylab="Kendall's tau",...) stripchart(tau, vertical = TRUE, method = "jitter", pch = 21, col = "blue", add=TRUE,...) } boxplot.bootstrap.list=function(x,...){ winner.noboot=winner.ranked.list(x) x2=winnerFrequencies(x) n.bootstraps= ncol(x$bootsrappedRanks[[1]]) perc_boot_Winner=lapply(1:length(x2),function(i){ x2.i=x2[[i]] winner.id=which(rownames(x2.i)%in%rownames(winner.noboot[[i]])) #could be multiple winners!!!! 100*x2.i[winner.id,3,drop=F]/n.bootstraps }) boxplot(unlist(perc_boot_Winner),ylim=c(0,100),las=2, outline=FALSE, ylab="% Bootstraps",xlab="Winner ranks 1", sub=paste(n.bootstraps,"Bootstraps"),...) stripchart(unlist(perc_boot_Winner), vertical = TRUE, method = "jitter", pch = 21, col = "blue", add=TRUE,...) 
} - - - - -# winnerFrequencies(bb) - # winner.datax=winner(datax, largeBetter=TRUE) #no bootstrap - # x2=numberRank1(datax,originalranking.datax,datax_boot) - # boot_W_1=x2[which(x2$algorithm_id==winner.datax),3] - # boot_NW_1=sum(x2[-which(x2$algorithm_id==winner.datax),3]>9) -# perc_boot_Winner=boot_W_1/Data$N_Bootstraps -# perc_boot_NotWinner=Data$Bootstrap_Rank1_NotWinner/(Data$N_Algo-1) - diff --git a/R/challengeR.R b/R/challengeR.R index fdd3a0a..c30253e 100644 --- a/R/challengeR.R +++ b/R/challengeR.R @@ -1,150 +1,150 @@ #' Title #' #' @param object #' @param value #' @param algorithm #' @param case #' @param taskName Optional for single-task data set that does not contain a task column. #' @param by The name of the column that contains the task identifiers. Required for multi-task data set. #' @param annotator #' @param smallBetter #' @param na.treat #' @param check #' #' @return #' @export #' #' @examples as.challenge=function(object, value, algorithm , case=NULL, taskName=NULL, by=NULL, annotator=NULL, smallBetter=FALSE, na.treat=NULL, # optional check=TRUE) { object=as.data.frame(object[,c(value, algorithm, case, by, annotator)]) # sanity checks if (check) { if (!is.null(by) && !is.null(taskName)) { warning("Argument 'taskName' is ignored for multi-task data set.") } # Add task column for data set without task column by using the specified task name. if (is.null(by) && !is.null(taskName)) { taskName <- trimws(taskName) if (taskName == "") { stop("Argument 'taskName' is empty.") } object <- cbind(task=taskName, object) by = "task" } # Add task column for data set without task column by using a dummy task name. if (is.null(by) && is.null(taskName)) { object <- cbind(task="dummyTask", object) by = "task" } object=splitby(object,by=by) object=lapply(object,droplevels) missingData = n.missing = list() for (task in names(object)) { if (!all(is.numeric(object[[task]][[value]]))) stop("Performance values must be numeric.") n.missing[[task]] <- sum(is.na(object[[task]][[value]])) # already missing before na.treat; for report if (n.missing[[task]]>0) message("Note: ", n.missing, " missing cases have been found in the data set.") # check for missing cases missingData[[task]]=object[[task]] %>% expand(!!as.symbol(algorithm), !!as.symbol(case))%>% anti_join(object[[task]], by=c( algorithm,case)) if (nrow(missingData[[task]])>0) { if (length(object) == 1 ) { # single task message("Performance of not all algorithms has been observed for all cases. Therefore, missings have been inserted in the following cases:") } else { # multi task message("Performance of not all algorithms has been observed for all cases in task '", task, "'. Therefore, missings have been inserted in the following cases:") } print(as.data.frame(missingData[[task]])) object[[task]]=as.data.frame(object[[task]] %>% complete(task, !!as.symbol(algorithm), !!as.symbol(case))) } # check duplicate cases all1=apply(table(object[[task]][[algorithm]], object[[task]][[case]]), 2, function(x) all(x==1)) if (!all(all1)) { n.duplicated <- sum(all1!=1) if (length(object) == 1 ) { # single task if (n.duplicated/length(all1) >= 1/5) { # at least a quarter of the cases is duplicated stop ("The following case(s) appear(s) more than once for the same algorithm. Please revise. ", "Or are you considering a multi-task challenge and forgot to specify argument 'by'?\n", "Case(s): ", paste(names(which(all1!=1)), collapse=", ") ) } else { stop ("The following case(s) appear(s) more than once for the same algorithm. 
Please revise.\n", "Case(s): ", paste(names(which(all1!=1)), collapse=", ") ) } } else { # multi task stop ("The following case(s) appear(s) more than once for the same algorithm in task '", task, "'. Please revise.\n", "Case(s): ", paste(names(which(all1!=1)), collapse=", ") ) } } if (!is.null(na.treat)) { if (is.numeric(na.treat)) object[[task]][,value][is.na(object[[task]][,value])]=na.treat else if (is.function(na.treat)) object[[task]][,value][is.na(object[[task]][,value])]=na.treat(object[[task]][,value][is.na(object[[task]][,value])]) else if (is.character(na.treat) && na.treat=="na.rm") object[[task]]=object[[task]][!is.na(object[[task]][,value]),] } } } if (check==TRUE && (any(sapply(missingData, function(x) nrow(x))>0) |any(n.missing>0))) { if (is.null(na.treat)) message("For aggregate-then-rank, na.treat will have to be specified. ", "For rank-then-aggregate, missings will implicitly lead to the algorithm ranked last for the missing test case." ) else if (is.numeric(na.treat)) message("All missings have been replaced by the value ", na.treat,".\n") else if (is.character(na.treat) && na.treat=="na.rm") message("All missings have been removed.") else if (is.function(na.treat)) { message("Missings have been replaced using function ") print(na.treat) } } if (check==TRUE){ attr(object,"n.missing")=n.missing attr(object,"missingData")=missingData } attr(object,"na.treat")=na.treat attr(object,"algorithm")=algorithm attr(object,"value")=value attr(object,"case")=case attr(object,"annotator")=annotator attr(object,"by")=by - attr(object,"largeBetter")=!smallBetter + attr(object,"smallBetter")=smallBetter attr(object,"check")=check class(object)=c("challenge", class(object)) object } diff --git a/R/methodsplot.R b/R/methodsplot.R index 5ab2757..c52cb6f 100644 --- a/R/methodsplot.R +++ b/R/methodsplot.R @@ -1,108 +1,108 @@ methodsplot <- function(x,...) UseMethod("methodsplot") methodsplot.default <- function(x, ...) stop("not implemented for this class") methodsplot.challenge=function(x, na.treat=NULL, methods=list(testBased=.%>%test() %>% rank(ties.method = "min"), meanThenRank= .%>% aggregate( FUN="mean") %>% rank(ties.method = "min"), medianThenRank=.%>% aggregate( FUN="median") %>% rank(ties.method = "min"), rankThenMean= .%>%rank(ties.method = "min") %>% aggregate( FUN="mean") %>%rank(ties.method = "min"), rankThenMedian=.%>%rank(ties.method = "min") %>% aggregate( FUN="median") %>%rank(ties.method = "min") ), ordering, ...) 
{ if (any(sapply(x, function(task) any(is.na(task[,attr(x, "value")]))))) { # only if missings present, else do nothing if (is.null(na.treat)) { warning("Please specify na.treat in as.challenge()") return(NULL) } else { xx = melt(x, id.vars=c(attr(x,"value"), attr(x,"algorithm") , attr(x,"case"), attr(x,"annotator"), attr(x,"by") )) x=as.challenge(xx, value=attr(x,"value"), algorithm=attr(x,"algorithm") , case=attr(x,"case"), by=attr(x,"by"), annotator = attr(x,"annotator"), - smallBetter = !attr(x,"largeBetter"), + smallBetter = attr(x,"smallBetter"), na.treat=na.treat) } } a=lapply(methods,function(fun) fun(x)) dat=melt(a,measure.vars="rank") colnames(dat)[4:5]=c("task","rankingMethod") if (missing(ordering)){ lev=sort(unique(dat$algorithm)) lab=lev } else { lev=ordering lab=lev } dat=dat%>% dplyr::rename(rank=.data$value)%>% mutate(rank=factor(.data$rank))%>% mutate(task=factor(.data$task))%>% mutate(algorithm=factor(.data$algorithm, levels=lev,labels = lab)) linePlot <- ggplot(data = dat) + aes(x = rankingMethod, y = rank, color=algorithm, group=algorithm ) + geom_line(size=1)+ xlab("Ranking method") + ylab("Rank")+ theme( strip.placement = "outside", axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1) ) # Create multi-panel plot with task names as titles for multi-task data set if (length(x) > 1) { linePlot <- linePlot + facet_wrap(~ task) } return(linePlot) } # methodsplot.ranked.list does not exist, use methodpsplot.challenge instead since consonsus ranking needed for ordering (or alphabetical ordering instead) #similar plot to methods plot, instead of across ranking methods across tasks lineplot <- function(x,...) UseMethod("lineplot") lineplot.default <- function(x, ...) stop("not implemented for this class") lineplot.challenge=function(x, ordering,...){ if (inherits(x,"list")) { dat=melt(x,measure.vars="rank") colnames(dat)[4]=c("task") if (missing(ordering)){ lev=sort(unique(dat$algorithm)) lab=lev } else { lev=ordering lab=paste(1:length(ordering),ordering) } dat=dat%>% dplyr::rename(rank=.data$value)%>% mutate(rank=factor(.data$rank))%>% mutate(task=factor(.data$task))%>% mutate(algorithm=factor(.data$algorithm, levels=lev,labels = lab)) ggplot(data = dat) + aes(x = task, y = rank, color=algorithm, group=algorithm ) + geom_line(size=1)+ theme( axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1) ) } else stop("Only applicable to multiple tasks") } diff --git a/R/podium.R b/R/podium.R index 66109ba..e5d769d 100644 --- a/R/podium.R +++ b/R/podium.R @@ -1,163 +1,163 @@ podium <- function(object,...) UseMethod("podium") podium.default <- function(object, ...) 
stop("not implemented for this class") podium.ranked.list=function(object, xlab = NULL, ylab = NULL, lines.show = TRUE, lines.alpha = 0.2, lines.lwd = 1, col, lines.col = col, dots.pch = 19, dots.cex = 1, places.lty = 2, places.col = 1, legendfn = function(algs, cols) { legend("topleft", algs, lwd = 1, col = cols, bg = "white") }, layout.heights=c(1,0.4), ...){ if (is.null(xlab)) xlab <- "Podium" if (is.null(ylab)) ylab <- "Performance" x=object$data podiumPlots <- length(names(x)) for (subt in names(x)) { ordering=t(object$matlist[[subt]][,"rank",drop=F])["rank",] if (missing(col)) col=default_colors(length(ordering), algorithms = names(ordering)) dd=as.challenge(x[[subt]], value=attr(x,"value"), algorithm=attr(x,"algorithm"), case=attr(x,"case"), by=attr(x, "by"), annotator = attr(x,"annotator"), - smallBetter = !attr(x,"largeBetter"), + smallBetter = attr(x,"smallBetter"), na.treat=object$call[[1]][[1]]$na.treat) podiumPlot <- podium(dd, ordering=ordering, xlab = xlab, ylab = ylab, lines.show = lines.show, lines.alpha = lines.alpha, lines.lwd = lines.lwd, col=col, lines.col = lines.col, dots.pch = dots.pch, dots.cex = dots.cex, places.lty = places.lty, places.col = places.col, legendfn = legendfn, layout.heights=layout.heights, ...) if (length(names(x)) > 1) { title(subt,outer=T,line=-3) } append(podiumPlots, podiumPlot) } } podium.challenge=function(object, ordering, xlab = NULL, ylab = NULL, lines.show = FALSE, lines.alpha = 0.2, lines.lwd = 1, col,lines.col = col, dots.pch = 19, dots.cex = 1, places.lty = 2, places.col = 1, legendfn = function(algs, cols) { legend("topleft", algs, lwd = 1, col = cols, bg = "white") }, layout.heights=c(1,0.4), ...) { ranking=object%>%rank( ties.method = "random" ) task <- ranking$matlist[[1]] dat=as.data.frame(table(task[[attr(object, "algorithm")]], task$rank, dnn=c("algorithm","rank")), responseName = "Count") form=as.formula(paste(attr(object,"case"), attr(object,"algorithm"), sep="~")) ranks=acast(task, form, value.var="rank") values=acast(task, form, value.var=attr(object, "value")) nranks=acast(dat, algorithm~rank, value.var="Count") nalgs <- ncol(ranks) algs <- colnames(ranks) barorder <- order(ordering) orderedAlgorithms= names(ordering)[barorder] ylim=range(task[[attr(object,"value")]], na.rm = TRUE) dotplotborders <- (0:nalgs) * nalgs dotplaces <- (1:nalgs) - 0.5 names(dotplaces) <- orderedAlgorithms linecols <- sapply(lines.col, function(c) { r <- col2rgb(c) rgb(r[1], r[2], r[3], alpha = round(255 * lines.alpha), maxColorValue = 255) }) opar <- par(no.readonly = TRUE) layout(matrix(c(1, 2), nrow = 2, byrow = TRUE), heights =layout.heights) mar <- par("mar") par(mar = c(0, mar[2], mar[3], mar[4])) plot(dotplotborders, rep(ylim[2], nalgs + 1), type = "n", ylim = ylim, ylab = ylab, xlab = "", axes = F) axis(1, at = dotplotborders, labels = NA, lwd = par("lwd")) axis(2, lwd = par("lwd")) box() abline(v = dotplotborders, lty = places.lty, col = places.col) linesegments <- function(x, y, ...) { n <- length(x) segments(x[-n], y[-n], x[-1], y[-1], ...) } drawthe <- function(fn, col, ...) { for (i in 1:nrow(values)) { r <- ranks[i, ] o <- order(r) performances <- (values[i, ])[o] places <- (dotplaces[names(r)] + ((r - 1) * nalgs))[o] fn(places, performances, col = col[names(r)[o]], ...) 
} } if (lines.show) drawthe(linesegments, linecols, lwd = lines.lwd) drawthe(points, col, pch = dots.pch, cex = dots.cex) legendfn(orderedAlgorithms, col[orderedAlgorithms]) par(mar = c(mar[1], mar[2], 0, mar[4])) barplot(nranks[barorder,], beside = TRUE, width = 1, axes = F, space = c(0, 0), border = NA, ylim = c(0, nrow(ranks)), names.arg = paste(1:nalgs, ".", sep = ""), col = col[orderedAlgorithms], xlab = xlab) axis(1, at = c(0, dotplotborders), labels = NA, lwd = par("lwd")) box() par(opar) } diff --git a/R/rankNA2.R b/R/rankNA2.R index be2120b..7c6828a 100644 --- a/R/rankNA2.R +++ b/R/rankNA2.R @@ -1,10 +1,10 @@ rankNA2 <- -function(x,ties.method="min",largeBetter=F){ - r=rank((-1)^(largeBetter)*x,ties.method=ties.method,na.last="keep") #xtfrm maybe faster alternative +function(x,ties.method="min",smallBetter=TRUE){ + r=rank((-1)^(!smallBetter)*x,ties.method=ties.method,na.last="keep") #xtfrm maybe faster alternative if (any(is.na(x))){ maxrank=ifelse(all(is.na(x)), yes=0, no=max(r,na.rm=TRUE)) - if (ties.method%in%c("min","random")) r[is.na(x)]<-maxrank+1 + if (ties.method%in%c("min","random")) r[is.na(x)]<-maxrank+1 if (ties.method=="average") r[is.na(x)]<-maxrank+mean(1:sum(is.na(x))) } r } diff --git a/R/rankingHeatmap.R b/R/rankingHeatmap.R index 588a550..82bfbcf 100644 --- a/R/rankingHeatmap.R +++ b/R/rankingHeatmap.R @@ -1,63 +1,63 @@ rankingHeatmap <- function(x,...) UseMethod("rankingHeatmap") rankingHeatmap.default <- function(x, ...) stop("not implemented for this class") rankingHeatmap.ranked.list=function (x,ties.method="min",...) { xx=x$data a=lapply(names(x$matlist),function(subt){ ordering=rownames(x$matlist[[subt]])[order(x$matlist[[subt]]$rank)] dd=as.challenge(xx[[subt]], value=attr(xx,"value"), algorithm=attr(xx,"algorithm") , case=attr(xx,"case"), by=attr(xx, "by"), annotator = attr(xx,"annotator"), - smallBetter = !attr(xx,"largeBetter"), + smallBetter = attr(xx,"smallBetter"), na.treat=x$call[[1]][[1]]$na.treat) rankingHeatmap(dd, ordering=ordering, ties.method=ties.method,...) + ggtitle(subt) }) # Remove title for single-task data set if (length(a) == 1) { a[[1]]$labels$title <- NULL } else { names(a) = names(x$matlist) } class(a) <- "ggList" a } rankingHeatmap.challenge=function(x, ordering, ties.method="min",...) { ranking=x%>%rank( ties.method = ties.method ) task <- ranking$matlist[[1]] dat=as.data.frame(table(task[[attr(x,"algorithm")]], task$rank, dnn=c("algorithm","rank")), responseName = "Count") dat$algorithm=factor(dat$algorithm, levels=ordering) ncases=length(unique(task[[attr(x,"case")]])) ggplot(dat)+ geom_raster(aes(algorithm, rank, fill= Count))+ geom_hline(yintercept = seq(1.5,max(task$rank)-.5,by=1), color=grey(.8),size=.3)+ geom_vline(xintercept = seq(1.5,length(unique(dat$algorithm))-.5,by=1), color=grey(.8),size=.3)+ scale_fill_viridis_c(direction = -1, limits=c(0,ncases) )+ theme(axis.text.x = element_text(angle = 90), aspect.ratio=1)+ xlab("Algorithm")+ ylab("Rank") } diff --git a/R/rrank.R b/R/rrank.R index 020d373..ce5ff6c 100644 --- a/R/rrank.R +++ b/R/rrank.R @@ -1,48 +1,46 @@ rank <- function(object,...) UseMethod("rank") rank.default <- function(object, ...) base::rank(object,...) 
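To make the new direction handling concrete: `rankNA2()` ranks the raw values when `smallBetter = TRUE`, negates them via `(-1)^(!smallBetter)` when `smallBetter = FALSE`, and pushes missing values to the worst rank. A small self-contained sketch mirroring that logic for `ties.method = "min"` (the values and the helper name `rank_sketch` are made up for illustration):

```r
# Illustrative check of the (-1)^(!smallBetter) sign flip used in rankNA2()
x <- c(0.2, 0.5, NA, 0.1)

rank_sketch <- function(x, smallBetter = TRUE, ties.method = "min") {
  r <- rank((-1)^(!smallBetter) * x, ties.method = ties.method, na.last = "keep")
  if (any(is.na(x))) {
    maxrank <- ifelse(all(is.na(x)), 0, max(r, na.rm = TRUE))
    r[is.na(x)] <- maxrank + 1   # missing values are ranked last
  }
  r
}

rank_sketch(x, smallBetter = TRUE)    # 2 3 4 1  (small metric values win)
rank_sketch(x, smallBetter = FALSE)   # 2 1 4 3  (large metric values win)
```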
#stats::aggregate rank.challenge=function(object, x, ties.method="min",...){ call=as.list(match.call()) if (!is.null(attr(object,"annotator"))) { call2=call("Rank", object=call$object, x=attr(object,"value"), annotator=c(attr(object,"annotator")), ties.method=ties.method, - largeBetter=attr(object,"largeBetter") + smallBetter=attr(object,"smallBetter") ) res1=do.call("Rank",list(object=object, x=attr(object,"value"), annotator=c(attr(object,"annotator")), ties.method=ties.method, - largeBetter=attr(object,"largeBetter") + smallBetter=attr(object,"smallBetter") )) } else { call2=call("Rank", object=call$object, x=attr(object,"value"), ties.method=ties.method, - largeBetter=attr(object,"largeBetter") + smallBetter=attr(object,"smallBetter") ) res1=do.call("Rank",list(object=object, x=attr(object,"value"), ties.method=ties.method, - largeBetter=attr(object,"largeBetter") + smallBetter=attr(object,"smallBetter") )) } res=list(FUN = . %>% (call2), call=list(call2), FUN.list=list("rank"), data=object, matlist=res1$matlist) class(res)=c("ranked.list",class(res)) res } - - diff --git a/R/testBased.R b/R/testBased.R index 897ec79..818e5a7 100644 --- a/R/testBased.R +++ b/R/testBased.R @@ -1,175 +1,175 @@ decision <- function(x,...) UseMethod("decision") decision.default <- function(x, ...) stop("not implemented for this class") decision.challenge=function(x, na.treat=NULL, # it can be 'na.rm', numeric value or function alpha=0.05, p.adjust.method="none", alternative="one.sided", test.fun=function(x,y) wilcox.test(x,y, alternative = alternative,exact=FALSE, paired = TRUE)$p.value, parallel=FALSE, progress="none",...){ if (is.null(na.treat)){ #na.treat only optional if no missing values in data set if (!inherits(x,"list")){ if (!any(is.na(x[,attr(x, "value")]))) na.treat="na.rm" # there are no missings so set na.treat by dummy "na.rm" has no effect else stop("Please specify na.treat in as.challenge()") } else { if (!any(sapply(x, function(task) any(is.na(task[,attr(x, "value")]))))) na.treat="na.rm" # there are no missings so set na.treat by dummy "na.rm" has no effect else stop("Please specify na.treat in as.challenge()") } } - if (alternative!="two.sided") alternative=ifelse(attr(x,"largeBetter"), + if (alternative!="two.sided") alternative=ifelse(!attr(x,"smallBetter"), yes="greater", no="less") call=match.call(expand.dots = T) object=x algorithm=attr(object,"algorithm") case=attr(object,"case") value=attr(object,"value") - largeBetter=attr(object,"largeBetter") - if(missing(case)| missing(largeBetter)) stop("arguments case and alpha need to be given in as.challenge()") + smallBetter=attr(object,"smallBetter") + if(missing(case)| missing(smallBetter)) stop("arguments case and smallBetter need to be given in as.challenge()") if (inherits(object,"list")){ matlist=llply(1:length(object), function(id){ piece=object[[id]] if (length(unique(piece[[algorithm]]))<=1){ warning("only one ", algorithm, " available in element ", names(object)[id]) } if (is.numeric(na.treat)) piece[,value][is.na(piece[,value])]=na.treat else if (is.function(na.treat)) piece[,value][is.na(piece[,value])]=na.treat(piece[,value][is.na(piece[,value])]) else if (na.treat=="na.rm") piece=piece[!is.na(piece[,value]),] - mat=Decision(piece, value, algorithm, case, alpha, largeBetter, + mat=Decision(piece, value, algorithm, case, alpha, smallBetter, p.adjust.method=p.adjust.method, alternative=alternative, test.fun=test.fun) mat=as.data.frame(mat) mat[is.na(mat)]=0 mat=as.matrix(mat) class(mat)=c(class(mat),"challenge.incidence") mat },
.parallel=parallel, .progress=progress ) names(matlist)=names(object) return(matlist) } else { if (length(unique(object[[algorithm]]))<=1){ warning("only one ", algorithm, " available") matlist=(matrix(NA,1,1)) } else { mat=Decision(object, value, algorithm, case, alpha, - largeBetter, + smallBetter, p.adjust.method=p.adjust.method, alternative=alternative, test.fun=test.fun) } mat=as.data.frame(mat) mat[is.na(mat)]=0 mat=as.matrix(mat) class(mat)=c(class(mat),"challenge.incidence") return(mat) } } Decision=function(object, value, by, case, alpha, - largeBetter=FALSE, + smallBetter=TRUE, p.adjust.method="none", alternative="one.sided", test.fun=function(x,y) wilcox.test(x,y, alternative = alternative,exact=FALSE, paired = TRUE)$p.value){ algorithms=unique(object[[by]]) if (length(unique(object[[case]]))==1){ warning("Only one case in task.") } combinations=expand.grid(algorithms,algorithms)[,2:1] combinations=combinations[apply(combinations,1,function(z) anyDuplicated(z)==0),] # remove i==j pvalues=sapply(1:nrow(combinations), function(it){ dat1=object[object[[by]]==combinations[it,1],] dat2=object[object[[by]]==combinations[it,2],] id=intersect(dat2[,case],dat1[,case]) dat1=dat1[match(id,dat1[,case]),value] dat2=dat2[match(id,dat2[,case]),value] test.fun(dat1,dat2) }) decisions=as.numeric(p.adjust(pvalues, method=p.adjust.method)<= alpha) res=cbind(combinations,decisions) reshape2::acast(res, Var2~Var1, value.var="decisions") } as.relation.challenge.incidence=function(x, verbose = FALSE, ...) { r <- relation(incidence = x, ...) props <- check_strict_preference(r) class <- "strictpref" r$.Meta$is_decreasing <- FALSE r$.Meta <- c(r$.Meta, structure(props, names = sprintf("is_%s", names(props)))) if ( verbose ) { for ( p in names(props) ) { cat(sprintf("%s = %s:\n", p, props[[p]])) print(relation_violations(r, p, TRUE)) } } structure(r, class = c(class, class(r))) } check_strict_preference= function(x) { list(irreflexive = relation_is_irreflexive(x), asymmetric = relation_is_asymmetric(x), transitive = relation_is_transitive(x), negatively_transitive = relation_is_negatively_transitive(x), trichotomous = relation_is_trichotomous(x)) } significance=function(object, value, algorithm, case, alpha, - largeBetter=FALSE,...) { + smallBetter=TRUE,...) { xx=as.challenge(object, value=value, algorithm=algorithm, case=case, - smallBetter = !largeBetter, + smallBetter = smallBetter, check=FALSE) a=decision.challenge(xx,...) prop_significance=rowSums(a)/(ncol(a)-1) return(data.frame("prop_significance"=prop_significance, row.names = names(prop_significance))) } diff --git a/R/wrapper.R b/R/wrapper.R index 14371b1..3a70371 100644 --- a/R/wrapper.R +++ b/R/wrapper.R @@ -1,23 +1,21 @@ aggregateThenRank=function(object,FUN,ties.method = "min",...){ - object %>% - aggregate(FUN=FUN,...) %>% + object %>% + aggregate(FUN=FUN,...) %>% rank(ties.method = ties.method) } testThenRank=function(object,FUN,ties.method = "min",...){ - object %>% - aggregate(FUN="significance",...) %>% + object %>% + aggregate(FUN="significance",...) %>% rank(ties.method = ties.method) } rankThenAggregate=function(object, FUN, ties.method = "min" ){ - object %>% - rank(ties.method = ties.method)%>% - aggregate(FUN=FUN) %>% - rank(ties.method = ties.method) #small rank is always best, i.e. largeBetter always FALSE + object %>% + rank(ties.method = ties.method)%>% + aggregate(FUN=FUN) %>% + rank(ties.method = ties.method) # small rank is always best, i.e. 
smallBetter always TRUE } - - diff --git a/inst/appdir/reportMultiple.Rmd b/inst/appdir/reportMultiple.Rmd index 54bfa9e..ca545c7 100644 --- a/inst/appdir/reportMultiple.Rmd +++ b/inst/appdir/reportMultiple.Rmd @@ -1,409 +1,409 @@ --- params: object: NA colors: NA name: NULL consensus: NA isMultiTask: NA bootstrappingEnabled: NA fig.format: NULL dpi: NULL title: "Benchmarking report for `r params$name` " author: "created by challengeR v`r packageVersion('challengeR')`" date: "`r Sys.setlocale('LC_TIME', 'English'); format(Sys.time(), '%d %B, %Y')`" editor_options: chunk_output_type: console --- ```{r setup, include=FALSE} options(width=80) #out.format <- knitr::opts_knit$get("out.format") out.format <- knitr::opts_knit$get("rmarkdown.pandoc.to") img_template <- switch( out.format, docx = list("img-params"=list(dpi=150, fig.width=6, fig.height=6, out.width="504px", out.height="504px")), { # default list("img-params"=list( fig.width=7, fig.height = 3, dpi=300)) } ) knitr::opts_template$set( img_template ) knitr::opts_chunk$set(echo = F) # ,#fig.width=7,fig.height = 3,dpi=300, if (out.format != "docx") knitr::opts_chunk$set(fig.align = "center") if (!is.null(params$fig.format)) knitr::opts_chunk$set(dev = params$fig.format) # can be vector, e.g. fig.format=c('jpeg','png', 'pdf') if (!is.null(params$dpi)) knitr::opts_chunk$set(dpi = params$dpi) theme_set(theme_light()) isMultiTask = params$isMultiTask bootstrappingEnabled = params$bootstrappingEnabled ``` ```{r } object = params$object if (isMultiTask) { ordering_consensus=names(params$consensus) } else { ordering_consensus=names(sort(t(object$matlist[[1]][,"rank",drop=F])["rank",])) } color.fun=params$colors ``` ```{r } challenge_multiple=object$data ranking.fun=object$FUN cols_numbered=cols=color.fun(length(ordering_consensus)) names(cols)=ordering_consensus names(cols_numbered)= paste(1:length(cols),names(cols)) if (bootstrappingEnabled) { boot_object = params$object challenge_multiple=boot_object$data ranking.fun=boot_object$FUN object=challenge_multiple%>%ranking.fun object$FUN.list = boot_object$FUN.list object$fulldata=boot_object$fulldata # only not NULL if subset of algorithms used cols_numbered=cols=color.fun(length(ordering_consensus)) names(cols)=ordering_consensus names(cols_numbered)= paste(1:length(cols),names(cols)) } ``` This document presents a systematic report on the benchmark study "`r params$name`". Input data comprises raw metric values for all algorithms and cases. Generated plots are: ```{r, child=if (!isMultiTask && !bootstrappingEnabled) system.file("appdir", "overviewSingleTaskNoBootstrapping.Rmd", package="challengeR")} ``` ```{r, child=if (!isMultiTask && bootstrappingEnabled) system.file("appdir", "overviewSingleTaskBootstrapping.Rmd", package="challengeR")} ``` ```{r, child=if (isMultiTask && !bootstrappingEnabled) system.file("appdir", "overviewMultiTaskNoBootstrapping.Rmd", package="challengeR")} ``` ```{r, child=if (isMultiTask && bootstrappingEnabled) system.file("appdir", "overviewMultiTaskBootstrapping.Rmd", package="challengeR")} ``` Details can be found in Wiesenfarth et al. (2019). 
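For orientation, a toy end-to-end call of the renamed interface that produces the objects such a report is rendered from (the data values and column names are made up; `as.challenge()` and `aggregateThenRank()` are the package functions shown in the diff above):

```r
# Toy sketch of the smallBetter workflow behind the report (illustrative data)
library(challengeR)

data <- rbind(
  data.frame(algo = "A1", value = 0.80, case = "C1"),
  data.frame(algo = "A2", value = 0.60, case = "C1"),
  data.frame(algo = "A1", value = 0.75, case = "C2"),
  data.frame(algo = "A2", value = 0.65, case = "C2"))

challenge <- as.challenge(data,
                          taskName = "T1",
                          algorithm = "algo",
                          case = "case",
                          value = "value",
                          smallBetter = FALSE)   # larger metric values are better here

ranking <- aggregateThenRank(challenge, FUN = mean, ties.method = "min")
ranking$matlist$T1   # ranking table for task T1
```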
```{r,results='asis'} if (isMultiTask) { cat("# Rankings\n") } else { cat("# Ranking") } ``` Algorithms within a task are ranked according to the following ranking scheme: ```{r,results='asis'} a=( lapply(object$FUN.list[1:2],function(x) { if (!is.character(x)) return(paste0("aggregate using function ", paste(gsub("UseMethod","", deparse(functionBody(x))), collapse=" ") )) else if (x=="rank") return(x) else return(paste0("aggregate using function ",x)) })) cat("    *",paste0(a,collapse=" then "),"*",sep="") if (is.character(object$FUN.list[[1]]) && object$FUN.list[[1]]=="significance") cat("\n\n Column 'prop_significance' is equal to the number of pairwise significant test results for a given algorithm divided by the number of algorithms.") ``` ```{r,results='asis'} if (isMultiTask) { cat("Ranking for each task:\n") for (t in 1:length(object$matlist)){ cat("\n",names(object$matlist)[t],": ") n.cases=nrow(challenge_multiple[[t]])/length(unique(challenge_multiple[[t]][[attr(challenge_multiple,"algorithm")]])) numberOfAlgorithms <- length(levels(challenge_multiple[[t]][[attr(challenge_multiple, "algorithm")]])) cat("\nThe analysis is based on", numberOfAlgorithms, "algorithms and", n.cases, "cases.", attr(object$data,"n.missing")[[t]], "missing cases have been found in the data set. ") if (nrow(attr(object$data,"missingData")[[t]])>0) { if(attr(object$data,"n.missing")[[t]]==0 ) cat("However, ") else if(attr(object$data,"n.missing")[[t]]>0 ) cat("Additionally, ") cat("performance of not all algorithms has been observed for all cases in task '", names(object$matlist)[t], "'. Therefore, missings have been inserted in the following cases:") print(knitr::kable(as.data.frame(attr(object$data,"missingData")[[t]]))) } if (nrow(attr(object$data,"missingData")[[t]])>0 | attr(object$data,"n.missing")[[t]]>0) { if (is.numeric(attr(object$data,"na.treat"))) cat("All missings have been replaced by values of", attr(object$data,"na.treat"),".\n") else if (is.character(attr(object$data,"na.treat")) && attr(object$data,"na.treat")=="na.rm") cat("All missings have been removed.") else if (is.function(attr(object$data,"na.treat"))) { cat("Missings have been replaced using function ") print(attr(object$data,"na.treat")) } else if (is.character(object$FUN.list[[1]]) && object$FUN.list[[1]]=="rank") cat("Missings lead to the algorithm ranked last for the missing case.") } x=object$matlist[[t]] print(knitr::kable(x[order(x$rank),])) } } else { n.cases=nrow(challenge_multiple[[1]])/length(unique(challenge_multiple[[1]][[attr(challenge_multiple,"algorithm")]])) # Is subset of algorithms used? if (!is.null(object$fulldata[[1]])) { cat("The top ", length(levels(challenge_multiple[[1]][[attr(challenge_multiple, "algorithm")]])), " out of ", length(levels(object$fulldata[[1]][[attr(challenge_multiple, "algorithm")]])), " algorithms are considered.\n") cat("\nThe analysis is based on", n.cases, "cases. ") } else { cat("\nThe analysis is based on", length(levels(challenge_multiple[[1]][[attr(challenge_multiple, "algorithm")]])), "algorithms and", n.cases, "cases. ") } cat(attr(object$data,"n.missing")[[1]], "missing cases have been found in the data set. ") if (nrow(attr(object$data,"missingData")[[1]])>0) { if(attr(object$data,"n.missing")[[1]]==0 ) cat("However, ") else if(attr(object$data,"n.missing")[[1]]>0 ) cat("Additionally, ") cat("performance of not all algorithms has been observed for all cases. 
Therefore, missings have been inserted in the following cases:") print(knitr::kable(as.data.frame(attr(object$data,"missingData")[[1]]))) } if (nrow(attr(object$data,"missingData")[[1]])>0 | attr(object$data,"n.missing")[[1]]>0) { if (is.numeric(attr(object$data,"na.treat"))) cat("All missings have been replaced by values of", attr(object$data,"na.treat"),".\n") else if (is.character(attr(object$data,"na.treat")) && attr(object$data,"na.treat")=="na.rm") cat("All missings have been removed.") else if (is.function(attr(object$data,"na.treat"))) { cat("Missings have been replaced using function ") print(attr(object$data,"na.treat")) } else if (is.character(object$FUN.list[[1]]) && object$FUN.list[[1]]=="rank") cat("Missings lead to the algorithm ranked last for the missing case.") } cat("\n\nRanking:") x=object$matlist[[1]] print(knitr::kable(x[order(x$rank),])) } ``` \bigskip ```{r, child=if (isMultiTask) system.file("appdir", "consensusRanking.Rmd", package="challengeR")} ``` # Visualization of raw assessment data ```{r,results='asis'} if (isMultiTask) { cat("The algorithms are ordered according to the computed ranks for each task.") } ``` ## Dot- and boxplot *Dot- and boxplots* for visualizing raw assessment data separately for each algorithm. Boxplots representing descriptive statistics over all cases (median, quartiles and outliers) are combined with horizontally jittered dots representing individual cases. \bigskip ```{r boxplots} boxplot(object, size=.8) ``` ## Podium plot *Podium plots* (see also Eugster et al., 2008) for visualizing raw assessment data. Upper part (spaghetti plot): Participating algorithms are color-coded, and each colored dot in the plot represents a metric value achieved with the respective algorithm. The actual metric value is encoded by the y-axis. Each podium (here: $p$=`r length(ordering_consensus)`) represents one possible rank, ordered from best (1) to last (here: `r length(ordering_consensus)`). The assignment of metric values (i.e. colored dots) to one of the podiums is based on the rank that the respective algorithm achieved on the corresponding case. Note that the plot part above each podium place is further subdivided into $p$ "columns", where each column represents one participating algorithm (here: $p=$ `r length(ordering_consensus)`). Dots corresponding to identical cases are connected by a line, leading to the shown spaghetti structure. Lower part: Bar charts represent the relative frequency for each algorithm to achieve the rank encoded by the podium place. 
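The per-case ranks that drive this podium assignment can also be inspected directly; a hedged sketch, assuming a `challenge` object created with `as.challenge()` as in the toy example further above:

```r
# Per-case ranks underlying the podium and ranking heatmap plots (sketch)
perCase <- rank(challenge, ties.method = "random")   # rank.challenge: one rank per algorithm and case
head(perCase$matlist$T1)
```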
```{r, include=FALSE, fig.keep="none",dev=NULL} plot.new() algs=ordering_consensus l=legend("topright", paste0(1:length(algs),": ",algs), lwd = 1, cex=1.4,seg.len=1.1, title="Rank: Alg.", plot=F) w <- grconvertX(l$rect$w, to='ndc') - grconvertX(0, to='ndc') h<- grconvertY(l$rect$h, to='ndc') - grconvertY(0, to='ndc') addy=max(grconvertY(l$rect$h,"user","inches"),6) ``` ```{r podium,eval=T,fig.width=12, fig.height=addy} #c(bottom, left, top, right op<-par(pin=c(par()$pin[1],6), omd=c(0, 1-w, 0, 1), mar=c(par('mar')[1:3], 0)+c(-.5,0.5,-.5,0), cex.axis=1.5, cex.lab=1.5, cex.main=1.7) oh=grconvertY(l$rect$h,"user","lines")-grconvertY(6,"inches","lines") if (oh>0) par(oma=c(oh,0,0,0)) set.seed(38) podium(object, col=cols, lines.show = T, lines.alpha = .4, dots.cex=.9, ylab="Metric value", layout.heights=c(1,.35), legendfn = function(algs, cols) { legend(par('usr')[2], par('usr')[4], xpd=NA, paste0(1:length(algs),": ",algs), lwd = 1, col = cols, bg = NA, cex=1.4, seg.len=1.1, title="Rank: Alg.") } ) par(op) ``` ## Ranking heatmap *Ranking heatmaps* for visualizing raw assessment data. Each cell $\left( i, A_j \right)$ shows the absolute frequency of cases in which algorithm $A_j$ achieved rank $i$. \bigskip ```{r rankingHeatmap,fig.width=9, fig.height=9,out.width='70%'} rankingHeatmap(object) ``` # Visualization of ranking stability ```{r, child=if (bootstrappingEnabled) system.file("appdir", "visualizationBlobPlots.Rmd", package="challengeR")} ``` ```{r, child=if (bootstrappingEnabled) system.file("appdir", "visualizationViolinPlots.Rmd", package="challengeR")} ``` ## *Significance maps* for visualizing ranking stability based on statistical significance *Significance maps* depict incidence matrices of pairwise significant test results for the one-sided Wilcoxon signed rank test at a 5\% significance level with adjustment for multiple testing according to Holm. Yellow shading indicates that metric values of the algorithm on the x-axis were significantly superior to those from the algorithm on the y-axis, blue color indicates no significant difference. \bigskip ```{r significancemap,fig.width=6, fig.height=6,out.width='200%'} significanceMap(object,alpha=0.05,p.adjust.method="holm") ``` ## Ranking robustness to ranking methods *Line plots* for visualizing rankings robustness across different ranking methods. Each algorithm is represented by one colored line. For each ranking method encoded on the x-axis, the height of the line represents the corresponding rank. Horizontal lines indicate identical ranks for all methods. \bigskip ```{r lineplot,fig.width=8, fig.height=6,out.width='95%'} if (length(object$matlist)<=6 &nrow((object$matlist[[1]]))<=10 ){ methodsplot(challenge_multiple, ordering = ordering_consensus, na.treat=object$call[[1]][[1]]$na.treat) + scale_color_manual(values=cols) }else { x=challenge_multiple for (subt in names(challenge_multiple)){ dd=as.challenge(x[[subt]], value=attr(x,"value"), algorithm=attr(x,"algorithm") , case=attr(x,"case"), annotator = attr(x,"annotator"), by=attr(x,"by"), - smallBetter = !attr(x,"largeBetter"), + smallBetter = attr(x,"smallBetter"), na.treat=object$call[[1]][[1]]$na.treat ) print(methodsplot(dd, ordering = ordering_consensus) + scale_color_manual(values=cols) ) } } ``` ```{r, child=if (isMultiTask) system.file("appdir", "visualizationAcrossTasks.Rmd", package="challengeR")} ``` # References Wiesenfarth, M., Reinke, A., Landman, B.A., Cardoso, M.J., Maier-Hein, L. and Kopp-Schneider, A. (2019). 
Methods and open-source toolkit for analyzing and visualizing challenge results. *arXiv preprint arXiv:1910.05121* M. J. A. Eugster, T. Hothorn, and F. Leisch, “Exploratory and inferential analysis of benchmark experiments,” Institut fuer Statistik, Ludwig-Maximilians-Universitaet Muenchen, Germany, Technical Report 30, 2008. [Online]. Available: http://epub.ub.uni-muenchen.de/4134/. diff --git a/tests/testthat/test-challenge.R b/tests/testthat/test-challenge.R index ea3a179..89b5643 100644 --- a/tests/testthat/test-challenge.R +++ b/tests/testthat/test-challenge.R @@ -1,578 +1,578 @@ test_that("empty attribute 'taskName' raises error for single-task challenge", { data <- rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C1")) expect_error(as.challenge(data, taskName="", algorithm="algo", case="case", value="value", smallBetter=FALSE), "Argument 'taskName' is empty.", fixed=TRUE) }) test_that("only whitespaces in attribute 'taskName' raises error for single-task challenge", { data <- rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C1")) expect_error(as.challenge(data, taskName=" ", algorithm="algo", case="case", value="value", smallBetter=FALSE), "Argument 'taskName' is empty.", fixed=TRUE) }) test_that("attributes are set for single-task challenge with specified task name", { data <- rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C1")) actualChallenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE) expect_equal(attr(actualChallenge, "annotator"), NULL) expect_equal(attr(actualChallenge, "by"), "task") - expect_equal(attr(actualChallenge, "largeBetter"), TRUE) + expect_equal(attr(actualChallenge, "smallBetter"), FALSE) expect_equal(attr(actualChallenge, "check"), TRUE) expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A2")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, 0.6)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C1")) expect_equal(as.vector(actualChallenge$T1$task), c("T1", "T1")) # expect that there's no attribute "task" expect_equal(attr(actualChallenge, "task"), NULL) expect_equal(attr(actualChallenge$T1, "task"), NULL) expect_equal(attr(actualChallenge$T2, "task"), NULL) }) test_that("attributes are set for single-task challenge with dummy task name", { data <- rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C1")) actualChallenge <- as.challenge(data, algorithm="algo", case="case", value="value", smallBetter=FALSE) expect_equal(attr(actualChallenge, "annotator"), NULL) expect_equal(attr(actualChallenge, "by"), "task") - expect_equal(attr(actualChallenge, "largeBetter"), TRUE) + expect_equal(attr(actualChallenge, "smallBetter"), FALSE) expect_equal(attr(actualChallenge, "check"), TRUE) expect_equal(as.vector(actualChallenge$dummyTask$algo), c("A1", "A2")) expect_equal(as.vector(actualChallenge$dummyTask$value), c(0.8, 0.6)) expect_equal(as.vector(actualChallenge$dummyTask$case), c("C1", "C1")) expect_equal(as.vector(actualChallenge$dummyTask$task), c("dummyTask", "dummyTask")) # expect that there's no attribute "task" expect_equal(attr(actualChallenge, "task"), NULL) expect_equal(attr(actualChallenge$dummyTask, "task"), NULL) expect_equal(attr(actualChallenge$dummyTask, "task"), NULL) }) test_that("leading and trailing whitespaces are trimmed for attribute 'taskName'", { data <- rbind( data.frame(algo="A1", value=0.8, 
case="C1"), data.frame(algo="A2", value=0.6, case="C1")) actualChallenge <- as.challenge(data, taskName=" T1 ", algorithm="algo", case="case", value="value", smallBetter=FALSE) expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A2")) }) test_that("attributes are set for multi-task challenge", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C1") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=0.2, case="C1"), data.frame(algo="A2", value=0.3, case="C1") )) data <- rbind(dataTask1, dataTask2) actualChallenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=TRUE) expect_equal(attr(actualChallenge, "annotator"), NULL) expect_equal(attr(actualChallenge, "by"), "task") - expect_equal(attr(actualChallenge, "largeBetter"), FALSE) + expect_equal(attr(actualChallenge, "smallBetter"), TRUE) expect_equal(attr(actualChallenge, "check"), TRUE) expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A2")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, 0.6)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C1")) expect_equal(as.vector(actualChallenge$T1$task), c("T1", "T1")) expect_equal(as.vector(actualChallenge$T2$algo), c("A1", "A2")) expect_equal(as.vector(actualChallenge$T2$value), c(0.2, 0.3)) expect_equal(as.vector(actualChallenge$T2$case), c("C1", "C1")) expect_equal(as.vector(actualChallenge$T2$task), c("T2", "T2")) # expect that there's no attribute "task" expect_equal(attr(actualChallenge, "task"), NULL) expect_equal(attr(actualChallenge$T1, "task"), NULL) expect_equal(attr(actualChallenge$T2, "task"), NULL) }) test_that("attributes are set for multi-task challenge with sanity check disabled", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C1") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=0.2, case="C1"), data.frame(algo="A2", value=0.3, case="C1") )) data <- rbind(dataTask1, dataTask2) actualChallenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=TRUE, check=FALSE) expect_equal(attr(actualChallenge, "annotator"), NULL) expect_equal(attr(actualChallenge, "by"), "task") - expect_equal(attr(actualChallenge, "largeBetter"), FALSE) + expect_equal(attr(actualChallenge, "smallBetter"), TRUE) expect_equal(attr(actualChallenge, "check"), FALSE) expect_equal(as.vector(actualChallenge$algo), c("A1", "A2", "A1", "A2")) expect_equal(as.vector(actualChallenge$value), c(0.8, 0.6, 0.2, 0.3)) expect_equal(as.vector(actualChallenge$case), c("C1", "C1", "C1", "C1")) expect_equal(as.vector(actualChallenge$task), c("T1", "T1", "T2", "T2")) }) test_that("attribute 'taskName' is ignored for multi-task challenge", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C1") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=0.2, case="C1"), data.frame(algo="A2", value=0.3, case="C1") )) data <- rbind(dataTask1, dataTask2) expect_warning(as.challenge(data, taskName="T1", by="task", algorithm="algo", case="case", value="value", smallBetter=TRUE), "Argument 'taskName' is ignored for multi-task data set.", fixed=TRUE) }) test_that("missing algorithm performances are added as NAs for single-task challenge", { data <- rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C2")) 
expect_message(actualChallenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE), "Performance of not all algorithms has been observed for all cases. Therefore, missings have been inserted in the following cases:", fixed=TRUE) expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A1", "A2", "A2")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, NA, NA, 0.6)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C2", "C1", "C2")) expect_equal(as.vector(actualChallenge$T1$task), c("T1", "T1", "T1", "T1")) }) test_that("multi-task data set containing one task is interpreted as single-task data set, missing algorithm performances are added", { data <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C2") )) # do not specify parameter "by" to interpret multi-task data set as single-task data set expect_message(actualChallenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE), "Performance of not all algorithms has been observed for all cases. Therefore, missings have been inserted in the following cases:", fixed=TRUE) expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A1", "A2", "A2")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, NA, NA, 0.6)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C2", "C1", "C2")) }) test_that("missing algorithm performances are added as NAs for multi-task challenge (2 tasks in data set)", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C2") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=0.2, case="C1"), data.frame(algo="A1", value=0.3, case="C2"), data.frame(algo="A2", value=0.4, case="C1") )) data <- rbind(dataTask1, dataTask2) expect_message(actualChallenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE), "Performance of not all algorithms has been observed for all cases in task 'T1'. 
Therefore, missings have been inserted in the following cases:", fixed=TRUE) expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A1", "A2", "A2")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, NA, NA, 0.6)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C2", "C1", "C2")) expect_equal(as.vector(actualChallenge$T1$task), c("T1", "T1", "T1", "T1")) expect_equal(as.vector(actualChallenge$T2$algo), c("A1", "A1", "A2", "A2")) expect_equal(as.vector(actualChallenge$T2$value), c(0.2, 0.3, 0.4, NA)) expect_equal(as.vector(actualChallenge$T2$case), c("C1", "C2", "C1", "C2")) expect_equal(as.vector(actualChallenge$T2$task), c("T2", "T2", "T2", "T2")) }) test_that("missing algorithm performances are not added as NA with sanity check disabled for single-task challenge", { data <- rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C2")) actualChallenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE, check=FALSE) expect_equal(as.vector(actualChallenge$algo), c("A1", "A2")) expect_equal(as.vector(actualChallenge$value), c(0.8, 0.6)) expect_equal(as.vector(actualChallenge$case), c("C1", "C2")) }) test_that("missing algorithm performances are not added as NA with sanity check disabled for multi-task challenge (2 tasks in data set)", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C2") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=0.2, case="C1"), data.frame(algo="A1", value=0.3, case="C2"), data.frame(algo="A2", value=0.4, case="C1") )) data <- rbind(dataTask1, dataTask2) actualChallenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE, check=FALSE) expect_equal(as.vector(actualChallenge$algo), c("A1", "A2", "A1", "A1", "A2")) expect_equal(as.vector(actualChallenge$value), c(0.8, 0.6, 0.2, 0.3, 0.4)) expect_equal(as.vector(actualChallenge$case), c("C1", "C2", "C1", "C2", "C1")) }) test_that("case cannot appear more than once per algorithm for single-task challenge", { data <- rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A1", value=0.8, case="C1")) expect_error(as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE), "The following case(s) appear(s) more than once for the same algorithm. Please revise. Or are you considering a multi-task challenge and forgot to specify argument 'by'?\nCase(s): C1", fixed=TRUE) }) test_that("multi-task data set containing one task is interpreted as single-task data set, case cannot appear more than once per algorithm", { data <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A1", value=0.8, case="C1") )) # do not specify parameter "by" to interpret multi-task data set as single-task data set expect_error(as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE), "The following case(s) appear(s) more than once for the same algorithm. Please revise. 
Or are you considering a multi-task challenge and forgot to specify argument 'by'?\nCase(s): C1", fixed=TRUE) }) test_that("case cannot appear more than once per algorithm for multi-task challenge (1 task in data set)", { data <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A1", value=0.8, case="C1") )) expect_error(as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE), "The following case(s) appear(s) more than once for the same algorithm. Please revise. Or are you considering a multi-task challenge and forgot to specify argument 'by'?\nCase(s): C1", fixed=TRUE) }) test_that("cases cannot appear more than once per algorithm for single-task challenge", { data <- rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.7, case="C1"), data.frame(algo="A1", value=0.5, case="C2"), data.frame(algo="A2", value=0.6, case="C2"), data.frame(algo="A2", value=0.6, case="C2")) expect_error(as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE), "The following case(s) appear(s) more than once for the same algorithm. Please revise. Or are you considering a multi-task challenge and forgot to specify argument 'by'?\nCase(s): C1, C2", fixed=TRUE) }) test_that("cases cannot appear more than once per algorithm for multi-task challenge (1 task in data set)", { data <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.7, case="C1"), data.frame(algo="A1", value=0.5, case="C2"), data.frame(algo="A2", value=0.6, case="C2"), data.frame(algo="A2", value=0.6, case="C2") )) expect_error(as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE), "The following case(s) appear(s) more than once for the same algorithm. Please revise. Or are you considering a multi-task challenge and forgot to specify argument 'by'?\nCase(s): C1, C2", fixed=TRUE) }) test_that("cases cannot appear more than once per algorithm for multi-task challenge (2 tasks in data set)", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1") # let T1 pass )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.7, case="C1"), data.frame(algo="A1", value=0.5, case="C2"), data.frame(algo="A2", value=0.6, case="C2"), data.frame(algo="A2", value=0.6, case="C2") )) data <- rbind(dataTask1, dataTask2) expect_error(as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE), "The following case(s) appear(s) more than once for the same algorithm in task 'T2'. Please revise.\nCase(s): C1, C2", fixed=TRUE) }) test_that("cases cannot appear more than once per algorithm when missing data was added for single-task challenge", { data <- rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C2"), data.frame(algo="A2", value=0.6, case="C2")) expect_error(as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE), "The following case(s) appear(s) more than once for the same algorithm. Please revise. 
Or are you considering a multi-task challenge and forgot to specify argument 'by'?\nCase(s): C1, C2", fixed=TRUE) }) test_that("user is notified of duplicate cases when multi-task data set is interpreted as single-task data set (2 tasks in data set)", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=0.8, case="C1") )) data <- rbind(dataTask1, dataTask2) # do not specify parameter "by" to interpret multi-task data set as single-task data set expect_error(as.challenge(data, taskName="New task", algorithm="algo", case="case", value="value", smallBetter=FALSE), "The following case(s) appear(s) more than once for the same algorithm. Please revise. Or are you considering a multi-task challenge and forgot to specify argument 'by'?\nCase(s): C1", fixed=TRUE) }) test_that("user is notified of missing algorithm performance when multi-task data set is interpreted as single-task data set (2 tasks in data set)", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A2", value=0.6, case="C2") )) data <- rbind(dataTask1, dataTask2) # do not specify parameter "by" to interpret multi-task data set as single-task data set expect_message(as.challenge(data, taskName="New task", algorithm="algo", case="case", value="value", smallBetter=FALSE), "Performance of not all algorithms has been observed for all cases. Therefore, missings have been inserted in the following cases:", fixed=TRUE) }) test_that("NAs are replaced by numeric value for single-task challenge", { data <- rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A1", value=NA, case="C2"), data.frame(algo="A2", value=0.6, case="C1"), data.frame(algo="A2", value=NA, case="C2")) actualChallenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat=0) expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A1", "A2", "A2")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, 0.0, 0.6, 0.0)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C2", "C1", "C2")) }) test_that("NAs are replaced by numeric value for multi-task challenge", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A1", value=NA, case="C2") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A2", value=NA, case="C1"), data.frame(algo="A2", value=0.5, case="C2") )) data <- rbind(dataTask1, dataTask2) actualChallenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat=0) expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A1")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, 0.0)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C2")) expect_equal(as.vector(actualChallenge$T2$algo), c("A2", "A2")) expect_equal(as.vector(actualChallenge$T2$value), c(0.0, 0.5)) expect_equal(as.vector(actualChallenge$T2$case), c("C1", "C2")) }) test_that("NAs are replaced by function value for single-task challenge", { data <- rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A1", value=NA, case="C2"), data.frame(algo="A2", value=0.6, case="C1"), data.frame(algo="A2", value=NA, case="C2")) replacementFunction <- function(x) { 2 } actualChallenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat=replacementFunction) 
expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A1", "A2", "A2")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, 2.0, 0.6, 2.0)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C2", "C1", "C2")) }) test_that("NAs are replaced by function value for multi-task challenge", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A1", value=NA, case="C2") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A2", value=NA, case="C1"), data.frame(algo="A2", value=0.5, case="C2") )) data <- rbind(dataTask1, dataTask2) replacementFunction <- function(x) { 2 } actualChallenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat=replacementFunction) expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A1")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, 2.0)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C2")) expect_equal(as.vector(actualChallenge$T2$algo), c("A2", "A2")) expect_equal(as.vector(actualChallenge$T2$value), c(2.0, 0.5)) expect_equal(as.vector(actualChallenge$T2$case), c("C1", "C2")) }) test_that("NAs are removed for single-task challenge", { data <- rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A1", value=NA, case="C2"), data.frame(algo="A2", value=0.6, case="C1"), data.frame(algo="A2", value=NA, case="C2")) actualChallenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat="na.rm") expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A2")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, 0.6)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C1")) }) test_that("NAs are removed for multi-task challenge", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A1", value=NA, case="C2") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A2", value=NA, case="C1"), data.frame(algo="A2", value=0.5, case="C2") )) data <- rbind(dataTask1, dataTask2) actualChallenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat="na.rm") expect_equal(as.vector(actualChallenge$T1$algo), c("A1")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8)) expect_equal(as.vector(actualChallenge$T1$case), c("C1")) expect_equal(as.vector(actualChallenge$T2$algo), c("A2")) expect_equal(as.vector(actualChallenge$T2$value), c(0.5)) expect_equal(as.vector(actualChallenge$T2$case), c("C2")) }) test_that("automatically added NAs are replaced by numeric value for single-task challenge", { data <- rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C2")) expect_message(actualChallenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat=0), "Performance of not all algorithms has been observed for all cases. 
Therefore, missings have been inserted in the following cases:", fixed=TRUE) expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A1", "A2", "A2")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, 0.0, 0.0, 0.6)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C2", "C1", "C2")) }) test_that("automatically added NAs are replaced by numeric value for multi-task challenge", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C2") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=0.2, case="C1"), data.frame(algo="A1", value=0.3, case="C2"), data.frame(algo="A2", value=0.4, case="C1") )) data <- rbind(dataTask1, dataTask2) expect_message(actualChallenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat=0), "Performance of not all algorithms has been observed for all cases in task 'T1'. Therefore, missings have been inserted in the following cases:", fixed=TRUE) expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A1", "A2", "A2")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, 0.0, 0.0, 0.6)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C2", "C1", "C2")) expect_equal(as.vector(actualChallenge$T2$algo), c("A1", "A1", "A2", "A2")) expect_equal(as.vector(actualChallenge$T2$value), c(0.2, 0.3, 0.4, 0.0)) expect_equal(as.vector(actualChallenge$T2$case), c("C1", "C2", "C1", "C2")) }) test_that("automatically added NAs are removed for single-task challenge", { data <- rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C2")) expect_message(actualChallenge <- as.challenge(data, taskName="T1", algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat="na.rm"), "Performance of not all algorithms has been observed for all cases. Therefore, missings have been inserted in the following cases:", fixed=TRUE) expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A2")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, 0.6)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C2")) }) test_that("automatically added NAs are removed for multi-task challenge", { dataTask1 <- cbind(task="T1", rbind( data.frame(algo="A1", value=0.8, case="C1"), data.frame(algo="A2", value=0.6, case="C2") )) dataTask2 <- cbind(task="T2", rbind( data.frame(algo="A1", value=0.2, case="C1"), data.frame(algo="A1", value=0.3, case="C2"), data.frame(algo="A2", value=0.4, case="C1") )) data <- rbind(dataTask1, dataTask2) expect_message(actualChallenge <- as.challenge(data, by="task", algorithm="algo", case="case", value="value", smallBetter=FALSE, na.treat="na.rm"), "Performance of not all algorithms has been observed for all cases in task 'T1'. 
Therefore, missings have been inserted in the following cases:", fixed=TRUE) - + expect_equal(as.vector(actualChallenge$T1$algo), c("A1", "A2")) expect_equal(as.vector(actualChallenge$T1$value), c(0.8, 0.6)) expect_equal(as.vector(actualChallenge$T1$case), c("C1", "C2")) expect_equal(as.vector(actualChallenge$T2$algo), c("A1", "A1", "A2")) expect_equal(as.vector(actualChallenge$T2$value), c(0.2, 0.3, 0.4)) expect_equal(as.vector(actualChallenge$T2$case), c("C1", "C2", "C1")) })
diff --git a/vignettes/Overview.Rmd b/vignettes/Overview.Rmd index fae9849..e374a91 100644 --- a/vignettes/Overview.Rmd +++ b/vignettes/Overview.Rmd @@ -1,370 +1,370 @@
---
title: "How to use challengeR"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{How to use challengeR}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```

# Introduction

This document gives an overview of the classes, methods and steps used in the tutorial scripts and aims to provide a deeper understanding of the analysis and visualization toolkit. The overview is divided into sections that follow the order of usage.

# Ranking configuration

Once the data has been loaded (either manually or from a .csv file), the first step is to create a challenge object. Then, the ranking method is chosen and configured.

## Define challenge object

Challenges can be single- or multi-task. We define a challenge task as a subproblem to be solved in the scope of a challenge for which a dedicated ranking/leaderboard is provided (if any). The assessment method (e.g. metric(s) applied) may vary across different tasks of a challenge.

For example, a segmentation challenge may comprise three tasks:

1) segmentation of the liver
2) segmentation of the kidney
3) segmentation of the spleen

In the context of the visualization toolkit, we differentiate between challenges that comprise only a single task ("single-task challenge") and challenges with multiple tasks, each containing different results and rankings ("multi-task challenge"). In the latter case, the report can be configured directly across all specified tasks by defining a task column in the data matrix.

The first step is to create a challenge object; the file "challengeR.R" is used for this purpose and is analysed in the following. The code below refers to the constructor:

```{r, eval=F, echo=T}
# challengeR.R
as.challenge=function(object, value, algorithm , case=NULL, taskName=NULL, by=NULL, annotator=NULL, smallBetter=FALSE, na.treat=NULL, check=TRUE)
```

Each parameter corresponds to:

- object: the object that will be returned, in this specific case the data set itself
- value: column corresponding to the values of the metric (only one metric is supported)
- algorithm: column corresponding to the algorithm identifiers
- case: column corresponding to the test case identifiers
- taskName: optional task name (string) for single-task challenges; it is displayed in plot titles
- by: (="task"), use it for a multi-task challenge. If the parameter is not specified, the challenge will automatically be interpreted as a single-task challenge.
- annotator: (currently not implemented) specify here if there is more than one annotator
- smallBetter: specify whether small metric values indicate better performance
- na.treat: (optional) specify how missing values (NA) are treated, e.g. set them to the worst possible metric value. It does not need to be specified if the user knows for sure that the data set contains no NAs, or if the data set contains NAs and rank-then-aggregate is applied.
- check: performs a sanity check if TRUE. The sanity check can be performed for both single- and multi-task challenges. It checks for missing algorithm performances and for test cases that appear more than once per algorithm.

An example of how to use it (for a multi-task challenge):

```{r, eval=F, echo=T}
# challengeR.R
challenge=as.challenge(data_matrix, value="value", algorithm="alg_name", case="case", by="task", smallBetter = FALSE)
```

! Take into account that for single-task challenges, the "by" parameter should not be configured !
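As a complement to the multi-task example above, a minimal single-task sketch might look as follows; the object name `data_matrix_single`, the column names and the choice na.treat=0 (which assumes that 0 is the worst possible metric value) are illustrative assumptions, not prescribed values:

```{r, eval=F, echo=T}
# Illustrative single-task sketch: taskName is given instead of "by",
# and NAs are replaced by 0 (assuming 0 is the worst possible metric value)
challenge_single=as.challenge(data_matrix_single,
                              taskName="T1",
                              value="value",
                              algorithm="alg_name",
                              case="case",
                              smallBetter=FALSE,
                              na.treat=0)
```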
For single-task challenges, if the data matrix contains a task column, it is easier to create a subset of the data matrix that only includes the values for that specific task:

```{r, eval=F, echo=T}
dataSubset=subset(data_matrix, task=="TASK_NAME")
```

In this way, "dataSubset" can then be used to create the challenge object.

## Configure ranking method

The files "wrapper.R", "aaggregate.R" and "Rank.aggregated.R" are used. In order to configure the ranking methods, the following parameters are considered:

- FUN: aggregation function, e.g. mean, median, min, max, or e.g. function(x) quantile(x, probs=0.05)
- na.treat: treatment of missing data / null values (it needs to be specified again here because it was an optional parameter when the challenge object was created): either "na.rm" to remove missing data, a numeric value (e.g. 0) to replace missings, or a function, e.g. function(x) min(x)
- ties.method: a character string specifying how ties (two items with the same rank) are treated, see ?base::rank or click on [*Strategies for assigning rankings*](https://en.wikipedia.org/wiki/Ranking#Strategies_for_assigning_rankings) for more details
- alpha: significance level (only for significance ranking)
- p.adjust.method: method for adjustment for multiple testing, see ?p.adjust

Different ranking methods are available:

#### Metric-based aggregation -> aggregateThenRank method

```{r, eval=F, echo=T}
# wrapper.R
aggregateThenRank=function(object,FUN,ties.method = "min",...){
  object %>%
    aggregate(FUN=FUN,...) %>%
    rank(ties.method = ties.method)
}
```

First, (object %>% aggregate), the metric values for each algorithm are aggregated across all cases using the specified aggregation function:

```{r, eval=F, echo=T}
# aaggregate.R
aggregate.challenge=function(x, FUN=mean, na.treat, alpha=0.05, p.adjust.method="none", parallel=FALSE, progress="none", ...)
```

-Second, (aggregate %>% rank), the aggregated metric values are converted into a ranking list, following the largeBetter argument defined above:
+Second, (aggregate %>% rank), the aggregated metric values are converted into a ranking list, following the smallBetter argument defined above:

```{r, eval=F, echo=T}
# Rank.aggregated.R
rank.aggregated <-function(object, ties.method="min",
-                          largeBetter,
+                          smallBetter,
                           ...)
```

An example of "aggregate-then-rank" use (taking the mean for aggregation):

```{r, eval=F, echo=T}
ranking=challenge%>%aggregateThenRank(FUN = mean,
                                      na.treat=0,
                                      ties.method = "min"
                                      )
```
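Other aggregation functions can be plugged in the same way; a sketch using the median and the 5% quantile mentioned in the parameter list above (both choices are purely illustrative):

```{r, eval=F, echo=T}
# Illustrative alternatives to the mean
ranking_median=challenge%>%aggregateThenRank(FUN = median, na.treat=0, ties.method = "min")
ranking_q05=challenge%>%aggregateThenRank(FUN = function(x) quantile(x, probs=0.05), na.treat=0, ties.method = "min")
```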
#### Case-based aggregation -> rankThenAggregate method

```{r, eval=F, echo=T}
# wrapper.R
rankThenAggregate=function(object, FUN, ties.method = "min" ){
  object %>%
    rank(ties.method = ties.method)%>%
    aggregate(FUN=FUN) %>%
    rank(ties.method = ties.method)
}
```

First, (object %>% rank), a ranking is created for each case across all algorithms. Missing values are assigned the worst rank:

```{r, eval=F, echo=T}
# rrank.R
rank.challenge=function(object, x, ties.method="min", ...)
```

Second, (rank %>% aggregate), the ranks per case are aggregated for each algorithm:

```{r, eval=F, echo=T}
# aaggregate.R
aggregate.ranked <-function(x, FUN=mean, ... )
```

Third, (aggregate %>% rank), the previously ranked and aggregated values are converted into a ranking list again:

```{r, eval=F, echo=T}
# Rank.aggregated.R
rank.aggregated <-function(object, ties.method="min",
-                          largeBetter,
+                          smallBetter,
                           ...)
```

An example of "rank-then-aggregate" with arguments as above (taking the mean for aggregation):

```{r, eval=F, echo=T}
ranking=challenge%>%rankThenAggregate(FUN = mean,
                                      ties.method = "min"
                                      )
```

#### Significance ranking -> testThenRank method

This method is similar to "aggregateThenRank", but uses a fixed "significance" aggregation function.

```{r, eval=F, echo=T}
# wrapper.R
testThenRank=function(object,FUN,ties.method = "min",...){
  object %>%
    aggregate(FUN="significance",...) %>%
    rank(ties.method = ties.method)
}
```

First, (object %>% aggregate), the metric values are aggregated across all cases. Here, a pairwise comparison between all algorithms is performed using statistical tests. For each algorithm, it is counted how often the algorithm is significantly superior to others; this count is saved as the aggregated value:

! No need to specify the function again, it is already set as "significance" !

```{r, eval=F, echo=T}
# aaggregate.R
aggregate.challenge=function(x, FUN="significance", na.treat, alpha=0.05, p.adjust.method="none", parallel=FALSE, progress="none", ...)
```

Second, (aggregate %>% rank), the aggregated values are converted into a ranking list:

```{r, eval=F, echo=T}
# Rank.aggregated.R
rank.aggregated <-function(object, ties.method="min",
-                          largeBetter,
+                          smallBetter,
                           ...)
```

An example for test-then-rank based on the Wilcoxon signed rank test:

```{r, eval=F, echo=T}
ranking=challenge%>%testThenRank(alpha=0.05,
                                 p.adjust.method="none",
                                 na.treat=0,
                                 ties.method = "min"
                                 )
```
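If an adjustment for multiple testing is desired, p.adjust.method accepts the methods listed in ?p.adjust; a sketch using the Holm procedure (the choice of "holm" here is illustrative):

```{r, eval=F, echo=T}
# Illustrative: test-then-rank with Holm adjustment for multiple testing
ranking_holm=challenge%>%testThenRank(alpha=0.05,
                                      p.adjust.method="holm",
                                      na.treat=0,
                                      ties.method = "min")
```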
# Uncertainty analysis (bootstrapping)

The assessment of the stability of rankings across different ranking methods with respect to both sampling variability and variability across tasks is of major importance. In order to investigate ranking stability, the bootstrap approach can be used for a given method. The procedure consists of:

1. Using the available data sets to generate N bootstrap datasets
2. Performing the ranking on each bootstrap dataset

The ranking strategy is performed repeatedly on each bootstrap sample. One bootstrap sample of a task with n test cases consists of n test cases randomly drawn with replacement from this task. A total of b of these bootstrap samples are drawn (e.g., b = 1000).

Bootstrap approaches can be evaluated in two ways: either the rankings for each bootstrap sample are evaluated for each algorithm, or the distribution of correlations or pairwise distances between the ranking list based on the full assessment data and the ranking lists based on the individual bootstrap samples is explored.

! Note that this step is optional; it can be omitted and the report generated directly. !

The following method is used to perform ranking on the generated bootstrap datasets:

```{r, eval=F, echo=T}
# Bootstrap.R
bootstrap.ranked=function(object, nboot, parallel=FALSE, progress="text", ...)
```

- nboot: number of bootstrap datasets to generate
- parallel: TRUE when using multiple CPUs
- progress: when set to "text", a progress bar indicating the progress of the bootstrapping is shown

An example of bootstrapping using multiple CPUs (8 CPUs):

```{r, eval=F, echo=T}
library(doParallel)
registerDoParallel(cores=8)
set.seed(1)
ranking_bootstrapped=ranking%>%bootstrap(nboot=1000, parallel=TRUE, progress = "none")
stopImplicitCluster()
```

# Report generation

Finally, the report is generated. For this last step, take into account whether the uncertainty analysis was performed. If the uncertainty analysis was not performed, use:

```{r, eval=F, echo=T}
# Report.R
report.ranked=function(object, file, title="", colors=default_colors, format="PDF", latex_engine="pdflatex", open=TRUE, ...)
```

If the uncertainty analysis was performed, use:

```{r, eval=F, echo=T}
# Report.R
report.bootstrap=function(object, file, title="", colors=default_colors, format="PDF", latex_engine="pdflatex", clean=TRUE, open=TRUE, ...)
```

The report can be generated in different formats; the parameters correspond to:

- file: name of the output file. If the output path is not specified, the working directory is used. If the file is specified but does not have a file extension, an extension is added automatically according to the output format given in *format*. If omitted, the report is created in a temporary folder with file name "report".
- title: title of the report
- colors: color coding of the algorithms across all figures. It can be changed, e.g. to colors=viridisLite::inferno, which is designed to be perceptually uniform, both in regular form and when converted to black-and-white, and to be perceivable by readers with the most common form of color blindness. See the package viridis for further similar functions.
- format: output format ("PDF", "HTML" or "Word")
- latex_engine: LaTeX engine for producing PDF output ("pdflatex", "lualatex", "xelatex")
- clean: optional. TRUE cleans intermediate files that are created during rendering; FALSE retains intermediate files, such as separate files for each figure.
- open: specifies whether the report is opened after generation

An example of how to generate the report for a *single-task* challenge:

```{r, eval=F, echo=T}
ranking_bootstrapped %>%
  report(title="singleTaskChallengeExample",
         file = "filename",
         format = "PDF",
         latex_engine="pdflatex",
         clean=TRUE
         )
```
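If the uncertainty analysis was skipped, a report can be generated directly from the ranking object in the same way, since report() is then expected to dispatch to report.ranked; the title and file name below are illustrative:

```{r, eval=F, echo=T}
# Illustrative: report generation without the optional bootstrap step
ranking %>%
  report(title="singleTaskChallengeExample",
         file = "filename",
         format = "PDF",
         latex_engine="pdflatex"
         )
```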
! Note that the code differs slightly for single- and multi-task challenges. !

For multi-task challenges, a consensus ranking (rank aggregation across tasks) has to be given additionally. Consensus relations “synthesize” the information in the elements of a relation ensemble into a single relation, often by minimizing a criterion function measuring how dissimilar consensus candidates are from the (elements of) the ensemble (the so-called “optimization approach”). The following method is used:

```{r, eval=F, echo=T}
# consensus.R
consensus.ranked.list=function(object, method, ...)
```

- method: consensus ranking method, see ?relation_consensus for the different methods to derive a consensus ranking.

An example of computing the ranking consensus across tasks, here the consensus ranking according to mean ranks across tasks:

```{r, eval=F, echo=T}
meanRanks=ranking%>%consensus(method = "euclidean")
```

Generate the report as above, but with the additional specification of the consensus ranking:

```{r, eval=F, echo=T}
ranking_bootstrapped %>%
  report(consensus=meanRanks,
         title="multiTaskChallengeExample",
         file = "filename",
         format = "PDF",
         latex_engine="pdflatex"
         )
```

# Features

- Reports for subsets (top list) of algorithms: Use e.g. `subset(ranking_bootstrapped, top=3) %>% report(...)` (or `subset(ranking, top=3) %>% report(...)` for a report without bootstrap results) to only show the top 3 algorithms according to the chosen ranking methods, where `ranking_bootstrapped` and `ranking` are objects as defined in the examples above. The line plot for ranking robustness can be used to check whether algorithms performing well under other ranking methods are excluded. Bootstrapping still takes the entire uncertainty into account. Podium plots and ranking heatmaps neglect excluded algorithms. This feature is only available for single-task challenges (for multi-task challenges it is not sensible because each task would contain a different set of algorithms).

- Reports for subsets of tasks: Use e.g. `subset(ranking_bootstrapped, tasks=c("task1", "task2", "task3")) %>% report(...)` to restrict the report to the tasks "task1", "task2" and "task3". You may want to recompute the consensus ranking before, using `meanRanks=subset(ranking, tasks=c("task1", "task2", "task3"))%>%consensus(method = "euclidean")`.

# Terms of use

Licensed under GPL-3. If you use this software for a publication, cite

Wiesenfarth, M., Reinke, A., Landman, B.A., Cardoso, M.J., Maier-Hein, L. and Kopp-Schneider, A. (2019). Methods and open-source toolkit for analyzing and visualizing challenge results. *arXiv preprint arXiv:1910.05121*