From 8945c1de30092e2ba5251870988356ff9a83546c Mon Sep 17 00:00:00 2001 From: Thomas Alexander Gerds Date: Wed, 5 Jun 2024 18:17:03 +0200 Subject: [PATCH] the great cleanup --- R/AUC.binary.R | 7 +- R/AUC.competing.risks.R | 16 +- R/AUC.survival.R | 18 +- R/Brier.binary.R | 8 +- R/Brier.competing.risks.R | 9 +- R/Brier.survival.R | 17 +- R/Score.R | 788 ++++++++++++++++--------------- R/crossvalPerf.bootcv.R | 72 +-- R/crossvalPerf.loob.AUC.R | 29 +- R/crossvalPerf.loob.Brier.R | 55 +-- R/getCensoringWeights.R | 29 +- R/getComparisons.R | 8 +- R/getPerformanceData.R | 51 +- R/getResponse.R | 11 +- man/Score.Rd | 19 +- man/plotCalibration.Rd | 2 +- slowtests/slowtest-score.R | 41 +- slowtests/test-score-bootstrap.R | 9 +- slowtests/test-score-crossval.R | 7 +- 19 files changed, 615 insertions(+), 581 deletions(-) diff --git a/R/AUC.binary.R b/R/AUC.binary.R index f3af571d..c55bac1b 100644 --- a/R/AUC.binary.R +++ b/R/AUC.binary.R @@ -1,11 +1,11 @@ ### AUC.binary.R --- #---------------------------------------------------------------------- -## Author: Thomas Alexander Gerds +## Author: Thomas Alexander Gerds and Johan Sebastian Ohlendorff ## Created: Jan 11 2022 (17:04) ## Version: -## Last-Updated: Jun 5 2024 (07:25) +## Last-Updated: Jun 5 2024 (14:50) ## By: Thomas Alexander Gerds -## Update #: 17 +## Update #: 19 #---------------------------------------------------------------------- ## ### Commentary: @@ -173,6 +173,7 @@ AUC.binary <- function(DT, res.cut <- list() for (i in 1:length(cutpoints)){ temp.TPR <- subset(temp.TPR.ic,cutpoints==cutpoints[i]) + # FIXME: merge on what? aucDT.temp <- merge(aucDT,temp.TPR) some.fun <- function(riskRegression_event,risk,TPR,FPR,PPV,NPV,Prisks,Prisks2,cut,N){ meanY <- mean(riskRegression_event) diff --git a/R/AUC.competing.risks.R b/R/AUC.competing.risks.R index 8c258e78..f8355190 100644 --- a/R/AUC.competing.risks.R +++ b/R/AUC.competing.risks.R @@ -3,9 +3,9 @@ ## Author: Thomas Alexander Gerds ## Created: Jan 11 2022 (17:06) ## Version: -## Last-Updated: Jun 5 2024 (07:24) +## Last-Updated: Jun 5 2024 (17:52) ## By: Thomas Alexander Gerds -## Update #: 37 +## Update #: 43 #---------------------------------------------------------------------- ## ### Commentary: @@ -81,7 +81,7 @@ AUC.competing.risks <- function(DT, den_FPR<-sum(ipcwControls1+ipcwControls2) indeces <- sindex(risk,cutpoints,comp = "greater",TRUE) res <- list() - # FIXME + # FIXME: the order is according to -risk but why? is that necessary? ordered <- order(riskRegression_time) ## can probably move this outside to improve computation time, for now keep it for (i in 1:length(cutpoints)){ den_PPV <- sum(ipcwCases[risk > cutpoints[i]]+ipcwControls1[risk > cutpoints[i]] + ipcwControls2[risk > cutpoints[i]]) @@ -210,8 +210,8 @@ AUC.competing.risks <- function(DT, sum((FP-c(0,FP[-N]))*((c(0,TP[-N])+TP)/2)) } score <- aucDT[nodups,list(AUC=AireTrap(FPR,TPR)),by=list(model,times)] - data.table::setkey(score,model,times) - aucDT <- merge(score,aucDT,all=TRUE) + aucDT <- merge(score,aucDT,by = c("model","times"),all=TRUE) + data.table::setkey(aucDT,model,times) if (se.fit[[1]]==1L){ aucDT[,nth.times:=as.numeric(factor(times))] @@ -230,16 +230,16 @@ AUC.competing.risks <- function(DT, conservative = conservative[[1]], cens.model = cens.model), by=list(model,times)] se.score <- aucDT[,list(se=sd(IF.AUC)/sqrt(N)),by=list(model,times)] - data.table::setkey(se.score,model,times) - score <- score[se.score] + score <- score[se.score,,on = c("model","times")] + data.table::setkey(score,model,times) if (se.fit==1L){ score[,lower:=pmax(0,AUC-qnorm(1-alpha/2)*se)] score[,upper:=pmin(1,AUC+qnorm(1-alpha/2)*se)] }else{ score[,se:=NULL] } + aucDT <- aucDT[score, ,on = c("model","times")] data.table::setkey(aucDT,model,times) - aucDT <- aucDT[score] if (keep.vcov[[1]] == TRUE){ output <- c(output,list(vcov=getVcov(aucDT,"IF.AUC",times=TRUE))) } diff --git a/R/AUC.survival.R b/R/AUC.survival.R index 3da3e2bf..382303d0 100644 --- a/R/AUC.survival.R +++ b/R/AUC.survival.R @@ -3,9 +3,9 @@ ## Author: Thomas Alexander Gerds ## Created: Jan 11 2022 (17:06) ## Version: -## Last-Updated: Jun 5 2024 (07:23) +## Last-Updated: Jun 5 2024 (17:51) ## By: Thomas Alexander Gerds -## Update #: 30 +## Update #: 45 #---------------------------------------------------------------------- ## ### Commentary: @@ -79,7 +79,7 @@ AUC.survival <- function(DT, den_FPR<-sum(ipcwControls) indeces <- sindex(risk,cutpoints,comp = "greater",TRUE) res <- list() - ## browser() + # FIXME ordered <- order(riskRegression_time) ## can probably move this outside to improve computation time, for now keep it SE.TPR <- SE.FPR <- SE.PPV <- SE.NPV <- NA for (i in 1:length(cutpoints)){ @@ -187,9 +187,8 @@ AUC.survival <- function(DT, }else{ output <- NULL } - - data.table::setkey(score,model,times) - aucDT <- merge(score,aucDT,all=TRUE) + aucDT <- merge(score,aucDT,by = c("model","times"),all=TRUE) + data.table::setkey(aucDT,model,times) if (se.fit[[1]]==1L){ aucDT[,nth.times:=as.numeric(factor(times))] ## compute influence function @@ -207,16 +206,17 @@ AUC.survival <- function(DT, conservative = conservative[[1]], cens.model = cens.model), by=list(model,times)] se.score <- aucDT[,list(se=sd(IF.AUC)/sqrt(N)),by=list(model,times)] - data.table::setkey(se.score,model,times) - score <- score[se.score] + score <- score[se.score,,on = c("model","times")] + data.table::setkey(score,model,times) if (se.fit==1L){ score[,lower:=pmax(0,AUC-qnorm(1-alpha/2)*se)] score[,upper:=pmin(1,AUC+qnorm(1-alpha/2)*se)] }else{ score[,se:=NULL] } + # join with auc and (se, lower, upper) if se.fit + aucDT <- score[aucDT,,on = c("model","times")] data.table::setkey(aucDT,model,times) - aucDT <- aucDT[score] if (keep.vcov[[1]] == TRUE){ output <- c(output,list(vcov=getVcov(aucDT,"IF.AUC",times=TRUE))) } diff --git a/R/Brier.binary.R b/R/Brier.binary.R index 842edfe4..ecdb5e10 100644 --- a/R/Brier.binary.R +++ b/R/Brier.binary.R @@ -3,9 +3,9 @@ ## Author: Thomas Alexander Gerds ## Created: Jan 11 2022 (17:03) ## Version: -## Last-Updated: Jun 5 2024 (07:25) +## Last-Updated: Jun 5 2024 (14:52) ## By: Thomas Alexander Gerds -## Update #: 8 +## Update #: 9 #---------------------------------------------------------------------- ## ### Commentary: @@ -51,8 +51,8 @@ Brier.binary <- function(DT, if (length(dolist)>0){ ## merge with Brier score data.table::setkey(DT,model) - data.table::setkey(score,model) - DT <- DT[score] + DT <- DT[score,,on = c("model")] + data.table::setkey(DT,model) if (se.fit[[1]]==TRUE){ contrasts.Brier <- DT[,getComparisons(data.table(x=Brier, IF=residuals, diff --git a/R/Brier.competing.risks.R b/R/Brier.competing.risks.R index 124323f0..51a27a4a 100644 --- a/R/Brier.competing.risks.R +++ b/R/Brier.competing.risks.R @@ -3,9 +3,9 @@ ## Author: Thomas Alexander Gerds ## Created: Jan 11 2022 (17:04) ## Version: -## Last-Updated: Jun 5 2024 (07:24) +## Last-Updated: Jun 5 2024 (14:43) ## By: Thomas Alexander Gerds -## Update #: 9 +## Update #: 10 #---------------------------------------------------------------------- ## ### Commentary: @@ -68,10 +68,9 @@ Brier.competing.risks <- function(DT, } data.table::setkey(score,model,times) if (length(dolist)>0){ - data.table::setkey(DT,model,times) ## merge with Brier score - DT <- DT[score] - data.table::setkey(score,model,times) + DT <- DT[score, ,on = c("model","times")] + data.table::setkey(DT,model,times) if (se.fit[[1]]==TRUE){ contrasts.Brier <- DT[,getComparisons(data.table(x=Brier,IF=IF.Brier,model=model), NF=NF, diff --git a/R/Brier.survival.R b/R/Brier.survival.R index 629482a1..b7e66617 100644 --- a/R/Brier.survival.R +++ b/R/Brier.survival.R @@ -3,9 +3,9 @@ ## Author: Thomas Alexander Gerds ## Created: Jan 11 2022 (17:04) ## Version: -## Last-Updated: Jun 5 2024 (07:24) +## Last-Updated: Jun 5 2024 (14:48) ## By: Thomas Alexander Gerds -## Update #: 9 +## Update #: 19 #---------------------------------------------------------------------- ## ### Commentary: @@ -32,8 +32,8 @@ Brier.survival <- function(DT, ...){ IC0=IPCW=nth.times=riskRegression_ID = riskRegression_time = riskRegression_status=times=raw.Residuals=risk=Brier=residuals=WTi=Wt=status=setorder=model=IF.Brier=data.table=sd=lower=qnorm=se=upper=NULL ## compute 0/1 outcome: - DT[riskRegression_time<=times & status==1,residuals:=(1-risk)^2/WTi] - DT[riskRegression_time<=times & status==0,residuals:=0] + DT[riskRegression_time<=times & riskRegression_status==1,residuals:=(1-risk)^2/WTi] + DT[riskRegression_time<=times & riskRegression_status==0,residuals:=0] DT[riskRegression_time>times,residuals:=(risk)^2/Wt] if (se.fit[[1]]==1L){ @@ -61,18 +61,17 @@ Brier.survival <- function(DT, data.table::setkey(score,model,times) if (length(dolist)>0L){ ## merge with Brier score + DT <- DT[score,,on = c("model","times")] data.table::setkey(DT,model,times) - ## data.table::setkey(score,model,times) - DT <- DT[score] if (se.fit[[1]]==TRUE){ - contrasts.Brier <- DT[,getComparisons(data.table(x=Brier,IF=IF.Brier,model=model), + contrasts.Brier <- DT[,getComparisons(dt = data.table(x=Brier,IF=IF.Brier,model=model), NF=NF, N=N, alpha=alpha, dolist=dolist, - se.fit=se.fit),by=list(times)] + se.fit=TRUE),by=list(times)] }else{ - contrasts.Brier <- DT[,getComparisons(data.table(x=Brier,model=model), + contrasts.Brier <- DT[,getComparisons(dt = data.table(x=Brier,model=model), NF=NF, N=N, alpha=alpha, diff --git a/R/Score.R b/R/Score.R index 54bd02fb..37b85699 100644 --- a/R/Score.R +++ b/R/Score.R @@ -96,10 +96,15 @@ ##' Cox regression (\code{"cox"}) both applied to the censored times. If the right hand side of \code{formula} does not specify covariates, ##' the Kaplan-Meier method is used even if this argument is set to \code{"cox"}. Also implemented is a template for users specifying other models to estimate the IPCW. Here the user ##' should be supply a function, taking as input a \code{"formula"} and \code{"data"}. This does come at the cost of only being able to calculate conservative confidence intervals. -##' @param split.method Method for cross-validation. Right now the only choices are \code{bootcv}, \code{cvk} and \code{loob}. In the first case, bootstrap learning sets -##' are drawn with our without replacement (argument \code{M}) from \code{data}. The data not included in the current bootstrap learning -##' set are used as validation set to compute the prediction performance. In the second case, k-fold cross-validation is performed. Note that k has to be an explicit number, e.g. 5 or 10, -##' when passing this as an argument. In the third case, leave-one-out bootstrap cross-validation is performed for the Brier score and leave-pair-out bootstrap cross-validation is performed for the AUC. +##' @param split.method Method for cross-validation. Right now the available choices are +##' \itemize{ +##' \item \code{"bootcv"} bootstrap cross-validation. Argument \code{B} controls the number of bootstraps. +##' \item \code{"cv5"} 5-fold cross-validation. Argument \code{B} controls how many times to repeat 5-fold crossvalidation. +##' \item \code{"cv10"} 10-fold cross-validation. Argument \code{B} controls how many times to repeat 10-fold crossvalidation. +##' \item \code{"cvk"} k-fold cross-validation for any value of k between 2 and N-1 where N is the sample size. Argument \code{B} controls how many times to repeat k-fold crossvalidation. +##' \item \code{"loob"} leave-one-out bootstrap cross-validation is performed for the Brier score and leave-pair-out bootstrap cross-validation is performed for the AUC. Argument \code{B} controls the number of bootstraps. +##' \item \code{"none"} no data splitting +##' } ##' @param B Number of bootstrap sets for cross-validation. \code{B} should be set to 1, when k-fold cross-validation is used. ##' @param M Size of subsamples for bootstrap cross-validation. If specified it ##' has to be an integer smaller than the size of \code{data}. @@ -160,7 +165,8 @@ #' obtained for different data splits during crossvalidation. ##' @param cutpoints If not \code{NULL}, estimates and standard errors of the TPR (True Positive Rate), ##' FPR (False Positive Rate), PPV (Positive Predictive Value), and NPV (Negative Predictive Value) -##' are given at the \code{cutpoints}. These values are saved in \code{object$AUC$res.cut}. +##' are given at the \code{cutpoints}. These values are saved in \code{object$AUC$res.cut}. +##' @param verbose Verbosity level. Set to '-1' or 'FALSE' to get complete silence. Set to '0' to quiet all messages and warnings but keep the progress.bar. Set to '1' to see warnings and to '2' (the default) to see both warnings and messages. ##' @param ... Named list containing additional arguments that are passed on to the \code{predictRisk} methods corresponding to object. See examples. ##' @return List with scores and assessments of contrasts, i.e., ##' tests and confidence limits for performance and difference in performance (AUC and Brier), @@ -519,6 +525,7 @@ Score.list <- function(object, roc.method='vertical', roc.grid=switch(roc.method,"vertical"=seq(0,1,.01),"horizontal"=seq(1,0,-.01)), cutpoints = NULL, + verbose = 2, ...){ se.conservative=IPCW=IF.AUC.conservative=IF.AUC0=IF.AUC=IC0=Brier=AUC=casecontrol=se=nth.times=riskRegression_time=riskRegression_time_status=riskRegression_ID=WTi=risk=IF.Brier=lower=upper=crossval=b=time=status=model=reference=p=model=pseudovalue=riskRegression_event=residuals=event=j=NULL @@ -551,7 +558,7 @@ Score.list <- function(object, if (!("Brier" %in% metrics)) metrics <- c(metrics,"Brier") if (!null.model) { null.model <- TRUE - warning("Value of argument 'null.model' ignored as the null model is needed to compute IPA/R^2.") + if (verbose>0) warning("Value of argument 'null.model' ignored as the null model is needed to compute IPA/R^2.") } ipa <- TRUE }else{ @@ -566,6 +573,9 @@ Score.list <- function(object, else { ROC <- FALSE } + if ((!missing(cutpoints)) && (!missing(split.method)) && tolower(split.method) != "none") + if (verbose>0) warning("Calculation of sensitivities, specificities and predictive values only implemented for argument split.method='none'.\n + Argument 'cutpoints' is ignored.") if ("Calibration" %in% plots) { ## add pseudo if needed if (!("pseudo" %in% cens.method)) cens.method <- c(cens.method,"pseudo") @@ -585,7 +595,7 @@ Score.list <- function(object, ### if (cens.model != "cox" && cens.model != "none" && cens.model != "KaplanMeier" && cens.model != "discrete"){ if (!conservative[[1]]){ - warning("For this model, we can't calculate the IF of the Survival function of the censoring distribution. \n Therefore, force conservative = TRUE") + if (verbose>0) warning("For this model, we can't calculate the IF of the Survival function of the censoring distribution. \n Therefore, force conservative = TRUE") conservative[[1]] <- TRUE } passed.args <- names(as.list(match.call())[-1]) @@ -648,9 +658,11 @@ c.f., Chapter 7, Section 5 in Gerds & Kattan 2021. Medical risk prediction model ## put change names of outcome if (response.type %in% c("survival","competing.risks")){ colnames(response) <- paste0("riskRegression_",colnames(response)) - # FIXME: check if order still correct data <- cbind(response,data) N <- as.numeric(NROW(data)) + # new order is only used when external validation data are not + # sorted in order to sort vectors/matrices with predicted risks + # in accordance with the response neworder <- data[,order(riskRegression_time,-riskRegression_status)] data.table::setorder(data,riskRegression_time,-riskRegression_status) }else{ @@ -662,7 +674,11 @@ c.f., Chapter 7, Section 5 in Gerds & Kattan 2021. Medical risk prediction model ## add riskRegression_ID variable for merging purposes and because output has long format ## if data.table data contains a variable N the code data[,riskRegression_ID:=1:N] does not ## work hence we do it like this - data[["riskRegression_ID"]]=1:N + response.names <- data.table::copy(names(response)) + data[,riskRegression_ID:=1:N] + response[,riskRegression_ID:=1:N] + setkey(response,riskRegression_ID) + setkey(data,riskRegression_ID) if (response.type=="survival") formula <- stats::update(formula,"prodlim::Hist(riskRegression_time,riskRegression_status)~.") if (response.type=="competing.risks") @@ -670,7 +686,6 @@ c.f., Chapter 7, Section 5 in Gerds & Kattan 2021. Medical risk prediction model ## stop("Dont know how to predict response of type ",response.type)) cens.type <- attr(response,"cens.type") if (is.null(cens.type)) cens.type <- "uncensored" - rm(response) # }}} # {{{ SplitMethod & parallel stuff @@ -869,7 +884,8 @@ c.f., Chapter 7, Section 5 in Gerds & Kattan 2021. Medical risk prediction model stop("Landmark updating not yet implemented.") } } else{ - if (!missing(times) && (!is.null(times)) && (times[1]!=FALSE)) warning("Function 'Score': Response type is not time-to-event: argument 'times' will be ignored.",call.=FALSE) + if (verbose>0) if (!missing(times) && (!is.null(times)) && (times[1]!=FALSE)) + warning("Function 'Score': Response type is not time-to-event: argument 'times' will be ignored.",call.=FALSE) times <- NULL NT <- 1 } @@ -882,9 +898,9 @@ c.f., Chapter 7, Section 5 in Gerds & Kattan 2021. Medical risk prediction model data=data, times=times, cens.model=cens.model, - response.type=response.type, influence.curve=getIC, - censoring.save.memory = censoring.save.memory) + censoring.save.memory = censoring.save.memory, + verbose = verbose) ##split.method$internal.name %in% c("noplan",".632+") ## if cens.model is marginal then IC is a matrix (ntimes,newdata) ## if cens.model is Cox then IC is an array (nlearn, ntimes, newdata) @@ -945,18 +961,19 @@ c.f., Chapter 7, Section 5 in Gerds & Kattan 2021. Medical risk prediction model cens.type=cens.type, object=object, object.classes=object.classes, - NT=NT) + NT=NT, + verbose = verbose) if (any(is.na(DT[["risk"]]))){ missing.predictions <- DT[,list("Missing.values"=sum(is.na(risk))),by=byvars] missing.predictions[,model:=factor(model,levels=mlevs,mlabels)] - warning("Missing values in the predicted risks. See `missing.predictions' in output list.") + if (verbose>0)warning("Missing values in the predicted risks. See `missing.predictions' in output list.") }else{ missing.predictions <- "None" } if (("Brier"%in% metrics) && (any(is.na(DT[["risk"]]))|| (max(DT[["risk"]])>1 || min(DT[["risk"]])<0))){ off.predictions <- DT[,list("missing.values"=sum(is.na(risk)),"negative.values"=sum(risk<0,na.rm=TRUE),"values.above.1"=sum(risk>1,na.rm=TRUE)),by=byvars] off.predictions[,model:=factor(model,levels=mlevs,mlabels)] - warning("Predicted values off the probability scale (negative or above 100%). See `off.predictions' in output list.\nOnly a problem for the Brier score, You can stop this warning by setting metrics='auc'.") + if (verbose>0) warning("Predicted values off the probability scale (negative or above 100%). See `off.predictions' in output list.\nOnly a problem for the Brier score, You can stop this warning by setting metrics='auc'.") }else{ off.predictions <- "None" } @@ -996,45 +1013,95 @@ c.f., Chapter 7, Section 5 in Gerds & Kattan 2021. Medical risk prediction model # {{{ Crossvalidation # {{{ bootstrap re-fitting and k-fold-CV -if (split.method$internal.name%in%c("BootCv","LeaveOneOutBoot","crossval")){ - if (missing(trainseeds)||is.null(trainseeds)){ - if (!missing(seed)) set.seed(seed) - if (split.method$internal.name == "crossval"){ - trainseeds <- sample(1:1000000,size=B*split.method$k,replace=FALSE) - }else{ - trainseeds <- sample(1:1000000,size=B,replace=FALSE) - } - } - if (parallel=="snow") exports <- c("data","split.method","Weights","N","trainseeds") else exports <- NULL - if (!is.null(progress.bar)){ - message("Running crossvalidation algorithm") - if (!(progress.bar %in% c(1,2,3))) progress.bar <- 3 - if (B==1 && split.method$internal.name == "crossval"){ - message(paste0("Fitting the models in ",split.method$k," learning datasets, then predicting the risks in validation datasets")) - pb <- txtProgressBar(max = split.method$k, style = progress.bar,width=20) - } else{ - message(paste0("Fitting the models in ",B," learning datasets, then predicting the risks in validation datasets")) - pb <- txtProgressBar(max = B, style = progress.bar,width=20) + if (split.method$internal.name%in%c("BootCv","LeaveOneOutBoot","crossval")){ + if (missing(trainseeds)||is.null(trainseeds)){ + if (!missing(seed)) set.seed(seed) + if (split.method$internal.name == "crossval"){ + trainseeds <- sample(1:1000000,size=B*split.method$k,replace=FALSE) + }else{ + trainseeds <- sample(1:1000000,size=B,replace=FALSE) + } } - } - `%dopar%` <- foreach::`%dopar%` - ## k-fold-CV - if (split.method$internal.name == "crossval"){ - DT.B <- foreach::foreach (b=1:B,.export=exports,.packages="data.table",.errorhandling=errorhandling) %dopar%{ - ## repetitions of k-fold to avoid Monte-Carlo error - index.b <- split.method$index(b) ## contains a sample of the numbers 1:k with replacement - if((B>1) && !is.null(progress.bar)){setTxtProgressBar(pb, b)} - DT.b <- rbindlist(lapply(1:split.method$k,function(fold){ - traindata=data[index.b!=fold] - testids <- index.b==fold # (1:N)[index.b!=fold] - if((B==1) && !is.null(progress.bar)){ - setTxtProgressBar(pb, fold) + ## if (parallel=="snow") + ## exports <- c("data","split.method","Weights","N","trainseeds") + ## else + exports <- NULL + verbose <- as.numeric(verbose[[1]]) + if (verbose >= 0){ + if (!is.null(progress.bar)){ + if (verbose>1) message("Running crossvalidation algorithm") + if (!(progress.bar %in% c(1,2,3))) progress.bar <- 3 + if (B==1 && split.method$internal.name == "crossval"){ + if (verbose>1) message(paste0("Fitting the models in ",split.method$k," learning datasets, then predicting the risks in validation datasets")) + pb <- txtProgressBar(max = split.method$k, style = progress.bar,width=20) + } else{ + if (verbose>1) message(paste0("Fitting the models in ",B," learning datasets, then predicting the risks in validation datasets")) + pb <- txtProgressBar(max = B, style = progress.bar,width=20) } - ## NOTE: subset.data.table preserves order ## So we need to use subset?? + } + }else{ + progress.bar <- NULL + } + `%dopar%` <- foreach::`%dopar%` + ## k-fold-CV + if (split.method$internal.name == "crossval"){ + DT.B <- foreach::foreach (b=1:B,.export=exports,.packages="data.table",.errorhandling=errorhandling) %dopar%{ + ## repetitions of k-fold to avoid Monte-Carlo error + index.b <- split.method$index(b) ## contains a sample of the numbers 1:k with replacement + if((B>1) && !is.null(progress.bar)){setTxtProgressBar(pb, b)} + DT.b <- rbindlist(lapply(1:split.method$k,function(fold){ + traindata=data[index.b!=fold] + testids <- index.b==fold # (1:N)[index.b!=fold] + if((B==1) && !is.null(progress.bar)){ + setTxtProgressBar(pb, fold) + } + ## NOTE: subset.data.table preserves order ## So we need to use subset?? + testdata <- subset(data,testids) + if (cens.type=="rightCensored"){ + testweights <- Weights + # Need to check what's expected that testids is here and below: + testweights$IPCW.subject.times <- subset(testweights$IPCW.subject.times,testids) + if (Weights$dim>0){ + testweights$IPCW.times <- subset(testweights$IPCW.times,testids) + } + } else { + testweights <- NULL + } + ## predicted risks of model trained without this fold + ## evaluated and added to this fold + DT.fold <- getPerformanceData(testdata=testdata, + testweights=testweights, + traindata=traindata, + trainseed=trainseeds[[(b-1)*split.method$k+fold]], + response.type=response.type, + times=times, + cause=cause, + neworder=NULL, + debug=debug, + levs=mlevs, + labels=mlabels, + predictRisk.args=predictRisk.args, + nullobject=nullobject, + cens.type=cens.type, + object=object, + object.classes=object.classes, + NT=NT) + return(DT.fold) + })) + DT.b[,b:=b] + DT.b + } + }else{# either LeaveOneOutBoot or BootCv + DT.B <- foreach::foreach (b=1:B,.export=exports,.packages="data.table",.errorhandling=errorhandling) %dopar%{ + if(!is.null(progress.bar)){setTxtProgressBar(pb, b)} + ## DT.B <- rbindlist(lapply(1:B,function(b){ + traindata=data[split.method$index(b)] + ## setkey(traindata,riskRegression_ID) + testids <- (match(1:N,unique(split.method$index(b)),nomatch=0)==0) + ## NOTE: subset.data.table preserves order testdata <- subset(data,testids) if (cens.type=="rightCensored"){ testweights <- Weights - # Need to check what's expected that testids is here and below: testweights$IPCW.subject.times <- subset(testweights$IPCW.subject.times,testids) if (Weights$dim>0){ testweights$IPCW.times <- subset(testweights$IPCW.times,testids) @@ -1042,178 +1109,223 @@ if (split.method$internal.name%in%c("BootCv","LeaveOneOutBoot","crossval")){ } else { testweights <- NULL } - ## predicted risks of model trained without this fold - ## evaluated and added to this fold - DT.fold <- getPerformanceData(testdata=testdata, - testweights=testweights, - traindata=traindata, - trainseed=trainseeds[[(b-1)*split.method$k+fold]], - response.type=response.type, - times=times, - cause=cause, - neworder=NULL, - debug=debug, - levs=mlevs, - labels=mlabels, - predictRisk.args=predictRisk.args, - nullobject=nullobject, - cens.type=cens.type, - object=object, - object.classes=object.classes, - NT=NT) - return(DT.fold) - })) - DT.b[,b:=b] - DT.b + DT.b <- getPerformanceData(testdata=testdata, + testweights=testweights, + traindata=traindata, + trainseed=trainseeds[[b]], + response.type=response.type, + times=times, + cause=cause, + neworder=NULL, + debug=debug, + levs=mlevs, + labels=mlabels, + predictRisk.args=predictRisk.args, + nullobject=nullobject, + cens.type=cens.type, + object=object, + object.classes=object.classes, + NT=NT) + DT.b[,b:=b] + DT.b + } } - }else{# either LeaveOneOutBoot or BootCv - DT.B <- foreach::foreach (b=1:B,.export=exports,.packages="data.table",.errorhandling=errorhandling) %dopar%{ - if(!is.null(progress.bar)){setTxtProgressBar(pb, b)} - ## DT.B <- rbindlist(lapply(1:B,function(b){ - traindata=data[split.method$index(b)] - ## setkey(traindata,riskRegression_ID) - testids <- (match(1:N,unique(split.method$index(b)),nomatch=0)==0) - ## NOTE: subset.data.table preserves order - testdata <- subset(data,testids) - if (cens.type=="rightCensored"){ - testweights <- Weights - testweights$IPCW.subject.times <- subset(testweights$IPCW.subject.times,testids) - if (Weights$dim>0){ - testweights$IPCW.times <- subset(testweights$IPCW.times,testids) + trycombine <- try(DT.B <- rbindlist(DT.B),silent=TRUE) + data.table::setkeyv(DT.B,c(byvars,"riskRegression_ID")) + if (inherits(trycombine,"try-error")){ + # error handling (because rbindlist is useless at reporting errors) + for (b in 1:B){ + if ("error" %in% class(DT.B[[b]]) || "simpleError" %in% "error" %in% class(DT.B[[b]])){ + stop(paste0("Errors occured in training models for crossvalidation. \n ", "Error found in iteration b = ",b,": \n ", DT.B[[b]])) } - } else { - testweights <- NULL } - ## print(class(traindata)) - ## print(names(traindata)) - DT.b <- getPerformanceData(testdata=testdata, - testweights=testweights, - traindata=traindata, - trainseed=trainseeds[[b]], + } + else if (nrow(DT.B)==0){ + stop("All training models failed. ") + } + + if (!is.null(progress.bar)){ + cat("\n") + } + if (any(is.na(DT.B[["risk"]]))){ + missing.predictions <- DT.B[,list("Missing.values"=sum(is.na(risk))),by=byvars] + if (verbose>0) warning("Missing values in the predicted risk. See `missing.predictions' in output list.") + } + if (("Brier"%in% metrics)&& (max(DT.B[["risk"]])>1 || min(DT.B[["risk"]])<0)){ + off.predictions <- DT.B[,list("negative.values"=sum(risk<0),"values.above.1"=sum(risk>1)),by=byvars] + if (verbose>0) warning("Values off the scale (either negative or above 100%) in the predicted risk. See `off.predictions' in output list.") + } + if (debug) message("setup data for cross-validation performance") + # }}} + # {{{ Leave-one-out bootstrap + ## start clause split.method$name=="LeaveOneOutBoot + if (split.method$internal.name =="crossval" && B == 1){ + crossvalPerf <- computePerformance(DT=DT.B, + N=N, + NT=NT, + NF=NF, + models=list(levels=mlevs,labels=mlabels), + response.type=response.type, + times=times, + jack=jack, + cens.type=cens.type, + cause=cause, + states=states, + alpha=alpha, + se.fit=se.fit, + conservative=conservative, + cens.model=cens.model, + keep.residuals=FALSE, + keep.vcov=FALSE, + keep.iid = FALSE, + dolist=dolist, + probs=probs, + metrics=metrics, + plots=plots, + summary=summary, + ibs=ibs, + ipa=ipa, + ROC=ROC, + MC=Weights$IC, + IC.data=Weights$IC.data, + breaks=NULL) + } + else if (split.method$name=="LeaveOneOutBoot" | split.method$internal.name =="crossval"){ ## Testing if the crossval works in this loop + if (verbose>1) message(paste0("Calculating the performance metrics in long format\nlevel-1 data with ", + NROW(DT.B), + " rows.", + ifelse(NROW(DT.B)>1000000, + " This may take a while ...", + " This should be fast ..."))) + crossvalPerf <- lapply(metrics, function(m){ + # either crossvalPerf.loob.AUC or crossvalPerf.loob.Brier + cvploob = paste0("crossvalPerf.loob.",m) + do.call(cvploob,list(times = times, + mlevs = mlevs, + se.fit = se.fit, + NT = NT, + response.type = response.type, + response.names = response.names, + cens.type = cens.type, + Weights = Weights, + split.method = split.method, + N = N, + B = B, + DT.B = DT.B, + dolist = dolist, + alpha = alpha, + byvars = byvars, + mlabels = mlabels, + ipa = ipa, + ibs = ibs, + keep.residuals = keep.residuals, + conservative = conservative, + cens.model = cens.model, + cause = cause)) + }) + names(crossvalPerf) <- metrics + ## copy paste from bootcv; same method to calculate ROC + if (ROC){ + ## implement ROC here + if (parallel=="snow") exports <- c("DT.B","N.b","cens.model") else exports <- NULL + if (!is.null(progress.bar)){ + if (!(progress.bar %in% c(1,2,3))) progress.bar <- 3 + pb1 <- txtProgressBar(max = B, style = progress.bar,width=20) + if (verbose>1) message(paste0("Calculating the ROC curve in ",B," validation data sets")) + } + crossval <- foreach::foreach(j=1:B,.export=exports,.packages="data.table",.errorhandling=errorhandling) %dopar%{ + DT.b <- DT.B[b==j] + N.b <- length(unique(DT.b[["riskRegression_ID"]])) + if(!is.null(progress.bar)){ + setTxtProgressBar(pb1, j) + } + computePerformance(DT=DT.b, + N=N.b, + NT=NT, + NF=NF, + models=list(levels=mlevs,labels=mlabels), response.type=response.type, times=times, - cause=cause, - neworder=NULL, - debug=debug, - levs=mlevs, - labels=mlabels, - predictRisk.args=predictRisk.args, - nullobject=nullobject, + jack=jack, cens.type=cens.type, - object=object, - object.classes=object.classes, - NT=NT) - DT.b[,b:=b] - DT.b - } - } - trycombine <- try(DT.B <- rbindlist(DT.B),silent=TRUE) - if (inherits(trycombine,"try-error")){ - # error handling (because rbindlist is useless at reporting errors) - for (b in 1:B){ - if ("error" %in% class(DT.B[[b]]) || "simpleError" %in% "error" %in% class(DT.B[[b]])){ - stop(paste0("Errors occured in training models for crossvalidation. \n ", "Error found in iteration b = ",b,": \n ", DT.B[[b]])) + cause=cause, + states=states, + alpha=alpha, + se.fit=FALSE, + conservative=TRUE, + cens.model=cens.model, + keep.residuals=FALSE, + keep.vcov=FALSE, + keep.iid =FALSE, + dolist=dolist, + probs=probs, + metrics="AUC", + plots=plots, + summary=summary, + ibs=ibs, + ipa=ipa, + ROC=ROC, + MC=Weights$IC, + IC.data=Weights$IC.data, + breaks=breaks) + } + cumROC <- lapply(crossval,function(x) x[["ROC"]][["plotframe"]]) + TPR=FPR=NULL + fancy.fun <- function(risk,TPR,FPR,roc.method,roc.grid){ + if (roc.method == "vertical"){ + temp <- stats::approx(x=FPR, + y=TPR, + xout=roc.grid, + ties=median, + yleft=0, + yright=1)$y + list(risk = rep(NA,length(roc.grid)),TPR=temp, FPR=roc.grid) + } + else { + temp <- stats::approx(x=TPR, + y=FPR, + xout=roc.grid, + ties=median, + yleft=0, #have to consider if yleft and yright have to be modified. + yright=1)$y + data.table(risk = rep(NA,length(roc.grid)),TPR=roc.grid, FPR=temp) + } + } + if (response.type == "binary"){ + cumROC <- lapply(cumROC,function(x) x[,fancy.fun(risk,TPR,FPR,roc.method=roc.method,roc.grid=roc.grid),by=list(model)]) + if (roc.method == "vertical"){ + cumROC <- rbindlist(cumROC)[,lapply(.SD, mean),by=list(model,FPR)] + } + else { + cumROC <- rbindlist(cumROC)[,lapply(.SD, mean),by=list(model,TPR)] + } + } + else { + cumROC <- lapply(cumROC,function(x) x[,fancy.fun(risk,TPR,FPR,roc.method=roc.method,roc.grid=roc.grid),by=list(model,times)]) + if (roc.method == "vertical"){ + cumROC <- rbindlist(cumROC)[,lapply(.SD, mean),by=list(model,times,FPR)] + } + else { + cumROC <- rbindlist(cumROC)[,lapply(.SD, mean),by=list(model,times,TPR)] + } + } + setorder(cumROC, cols = "TPR") + ROC.res <- list(plotframe=cumROC,plotmethod="ROC") + class(ROC.res) <- "scoreROC" + crossvalPerf[["ROC"]] <- ROC.res + } } - } - } - else if (nrow(DT.B)==0){ - stop("All training models failed. ") - } - - if (!is.null(progress.bar)){ - cat("\n") - } - if (any(is.na(DT.B[["risk"]]))){ - missing.predictions <- DT.B[,list("Missing.values"=sum(is.na(risk))),by=byvars] - warning("Missing values in the predicted risk. See `missing.predictions' in output list.") - } - if (("Brier"%in% metrics)&& (max(DT.B[["risk"]])>1 || min(DT.B[["risk"]])<0)){ - off.predictions <- DT.B[,list("negative.values"=sum(risk<0),"values.above.1"=sum(risk>1)),by=byvars] - warning("Values off the scale (either negative or above 100%) in the predicted risk. See `off.predictions' in output list.") - } - if (debug) message("setup data for cross-validation performance") - rr_vars <- grep("^riskRegression_",names(data)) - Response <- data[,rr_vars,with=FALSE] - Response.names <- names(Response) - Response.names <- Response.names[Response.names!="riskRegression_ID"] - setkey(Response,riskRegression_ID) - # }}} - # {{{ Leave-one-out bootstrap - ## start clause split.method$name=="LeaveOneOutBoot - if (split.method$internal.name =="crossval" && B == 1){ - crossvalPerf<-computePerformance(DT=DT.B, - N=N, - NT=NT, - NF=NF, - models=list(levels=mlevs,labels=mlabels), - response.type=response.type, - times=times, - jack=jack, - cens.type=cens.type, - cause=cause, - states=states, - alpha=alpha, - se.fit=se.fit, - conservative=conservative, - cens.model=cens.model, - keep.residuals=FALSE, - keep.vcov=FALSE, - keep.iid = FALSE, - dolist=dolist, - probs=probs, - metrics=metrics, - plots=plots, - summary=summary, - ibs=ibs, - ipa=ipa, - ROC=ROC, - MC=Weights$IC, - IC.data=Weights$IC.data, - breaks=NULL) - } - else if (split.method$name=="LeaveOneOutBoot" | split.method$internal.name =="crossval"){ ## Testing if the crossval works in this loop - message(paste0("Calculating the performance metrics in long format\nlevel-1 data with ", - NROW(DT.B), - " rows.", - ifelse(NROW(DT.B)>1000000, - " This may take a while ...", - " This should be fast ..."))) - crossvalPerf <- lapply(metrics, function(m){ - # either crossvalPerf.loob.AUC or crossvalPerf.loob.Brier - cvploop = paste0("crossvalPerf.loob.",m) - do.call(cvploop,list(times = times, - mlevs = mlevs, - se.fit = se.fit, - response.type = response.type, - NT = NT, - Response = Response, - cens.type = cens.type, - Weights = Weights, - split.method = split.method, - N = N, - B = B, - DT.B = DT.B, - data = data, - dolist = dolist, - alpha = alpha, - byvars = byvars, - mlabels = mlabels, - ipa = ipa, - ibs = ibs, - keep.residuals = keep.residuals, - conservative = conservative, - cens.model = cens.model, - cause = cause)) - }) - names(crossvalPerf) <- metrics - ## copy paste from bootcv; same method to calculate ROC - if (ROC){ + else if (split.method$name=="BootCv"){ + # {{{ bootcv ## implement ROC here if (parallel=="snow") exports <- c("DT.B","N.b","cens.model") else exports <- NULL if (!is.null(progress.bar)){ if (!(progress.bar %in% c(1,2,3))) progress.bar <- 3 pb1 <- txtProgressBar(max = B, style = progress.bar,width=20) - message(paste0("Calculating the ROC curve in ",B," validation data sets")) + if (verbose>1) message(paste0("Calculating the performance metrics in ",B," validation data sets")) + } + if (!("ROC" %in% plots)){ + breaks <- NULL } crossval <- foreach::foreach(j=1:B,.export=exports,.packages="data.table",.errorhandling=errorhandling) %dopar%{ DT.b <- DT.B[b==j] @@ -1238,10 +1350,10 @@ if (split.method$internal.name%in%c("BootCv","LeaveOneOutBoot","crossval")){ cens.model=cens.model, keep.residuals=FALSE, keep.vcov=FALSE, - keep.iid =FALSE, + keep.iid = FALSE, dolist=dolist, probs=probs, - metrics="AUC", + metrics=metrics, plots=plots, summary=summary, ibs=ibs, @@ -1251,193 +1363,99 @@ if (split.method$internal.name%in%c("BootCv","LeaveOneOutBoot","crossval")){ IC.data=Weights$IC.data, breaks=breaks) } - cumROC <- lapply(crossval,function(x) x[["ROC"]][["plotframe"]]) - TPR=FPR=NULL - fancy.fun <- function(risk,TPR,FPR,roc.method,roc.grid){ - if (roc.method == "vertical"){ - temp <- stats::approx(x=FPR, - y=TPR, - xout=roc.grid, - ties=median, - yleft=0, - yright=1)$y - list(risk = rep(NA,length(roc.grid)),TPR=temp, FPR=roc.grid) - } - else { - temp <- stats::approx(x=TPR, - y=FPR, - xout=roc.grid, - ties=median, - yleft=0, #have to consider if yleft and yright have to be modified. - yright=1)$y - data.table(risk = rep(NA,length(roc.grid)),TPR=roc.grid, FPR=temp) - } - } - if (response.type == "binary"){ - cumROC <- lapply(cumROC,function(x) x[,fancy.fun(risk,TPR,FPR,roc.method=roc.method,roc.grid=roc.grid),by=list(model)]) - if (roc.method == "vertical"){ - cumROC <- rbindlist(cumROC)[,lapply(.SD, mean),by=list(model,FPR)] - } - else { - cumROC <- rbindlist(cumROC)[,lapply(.SD, mean),by=list(model,TPR)] - } - } - else { - cumROC <- lapply(cumROC,function(x) x[,fancy.fun(risk,TPR,FPR,roc.method=roc.method,roc.grid=roc.grid),by=list(model,times)]) - if (roc.method == "vertical"){ - cumROC <- rbindlist(cumROC)[,lapply(.SD, mean),by=list(model,times,FPR)] - } - else { - cumROC <- rbindlist(cumROC)[,lapply(.SD, mean),by=list(model,times,TPR)] - } - } - setorder(cumROC, cols = "TPR") - ROC.res <- list(plotframe=cumROC,plotmethod="ROC") - class(ROC.res) <- "scoreROC" - crossvalPerf[["ROC"]] <- ROC.res - } - } - else if (split.method$name=="BootCv"){ - # {{{ bootcv - ## implement ROC here - if (parallel=="snow") exports <- c("DT.B","N.b","cens.model") else exports <- NULL - if (!is.null(progress.bar)){ - if (!(progress.bar %in% c(1,2,3))) progress.bar <- 3 - pb1 <- txtProgressBar(max = B, style = progress.bar,width=20) - message(paste0("Calculating the performance metrics in ",B," validation data sets")) - } - if (!("ROC" %in% plots)){ - breaks <- NULL - } - crossval <- foreach::foreach(j=1:B,.export=exports,.packages="data.table",.errorhandling=errorhandling) %dopar%{ - DT.b <- DT.B[b==j] - N.b <- length(unique(DT.b[["riskRegression_ID"]])) - if(!is.null(progress.bar)){ - setTxtProgressBar(pb1, j) + if (!is.null(progress.bar)){ + cat("\n") } - computePerformance(DT=DT.b, - N=N.b, - NT=NT, - NF=NF, - models=list(levels=mlevs,labels=mlabels), - response.type=response.type, - times=times, - jack=jack, - cens.type=cens.type, - cause=cause, - states=states, - alpha=alpha, - se.fit=FALSE, - conservative=TRUE, - cens.model=cens.model, - keep.residuals=FALSE, - keep.vcov=FALSE, - keep.iid = FALSE, - dolist=dolist, - probs=probs, - metrics=metrics, - plots=plots, - summary=summary, - ibs=ibs, - ipa=ipa, - ROC=ROC, - MC=Weights$IC, - IC.data=Weights$IC.data, - breaks=breaks) - } - if (!is.null(progress.bar)){ - cat("\n") - } - crossvalPerf <- lapply(metrics,function(m) crossvalPerf.bootcv(m,crossval,se.fit,keep.cv,byvars,alpha)) - names(crossvalPerf) <- metrics + crossvalPerf <- lapply(metrics,function(m) crossvalPerf.bootcv(m,crossval,se.fit,keep.cv,byvars,alpha)) + names(crossvalPerf) <- metrics - if ("ROC" %in% plots){ - cumROC <- lapply(crossval,function(x) x[["ROC"]][["plotframe"]]) - TPR=FPR=NULL - fancy.fun <- function(risk,TPR,FPR,roc.method,roc.grid){ - if (roc.method == "vertical"){ - temp <- stats::approx(x=FPR, - y=TPR, - xout=roc.grid, - ties=median, - yleft=0, - yright=1)$y - list(risk = rep(NA,length(roc.grid)),TPR=temp, FPR=roc.grid) - } - else { - temp <- stats::approx(x=TPR, - y=FPR, - xout=roc.grid, - ties=median, - yleft=0, #have to consider if yleft and yright have to be modified. - yright=1)$y - data.table(risk = rep(NA,length(roc.grid)),TPR=roc.grid, FPR=temp) + if ("ROC" %in% plots){ + cumROC <- lapply(crossval,function(x) x[["ROC"]][["plotframe"]]) + TPR=FPR=NULL + fancy.fun <- function(risk,TPR,FPR,roc.method,roc.grid){ + if (roc.method == "vertical"){ + temp <- stats::approx(x=FPR, + y=TPR, + xout=roc.grid, + ties=median, + yleft=0, + yright=1)$y + list(risk = rep(NA,length(roc.grid)),TPR=temp, FPR=roc.grid) + } + else { + temp <- stats::approx(x=TPR, + y=FPR, + xout=roc.grid, + ties=median, + yleft=0, #have to consider if yleft and yright have to be modified. + yright=1)$y + data.table(risk = rep(NA,length(roc.grid)),TPR=roc.grid, FPR=temp) + } } - } - if (response.type == "binary"){ - cumROC <- lapply(cumROC,function(x) x[,fancy.fun(risk,TPR,FPR,roc.method=roc.method,roc.grid=roc.grid),by=list(model)]) - if (roc.method == "vertical"){ - cumROC <- rbindlist(cumROC)[,lapply(.SD, mean),by=list(model,FPR)] + if (response.type == "binary"){ + cumROC <- lapply(cumROC,function(x) x[,fancy.fun(risk,TPR,FPR,roc.method=roc.method,roc.grid=roc.grid),by=list(model)]) + if (roc.method == "vertical"){ + cumROC <- rbindlist(cumROC)[,lapply(.SD, mean),by=list(model,FPR)] + } + else { + cumROC <- rbindlist(cumROC)[,lapply(.SD, mean),by=list(model,TPR)] + } } else { - cumROC <- rbindlist(cumROC)[,lapply(.SD, mean),by=list(model,TPR)] + cumROC <- lapply(cumROC,function(x) x[,fancy.fun(risk,TPR,FPR,roc.method=roc.method,roc.grid=roc.grid),by=list(model,times)]) + if (roc.method == "vertical"){ + cumROC <- rbindlist(cumROC)[,lapply(.SD, mean),by=list(model,times,FPR)] + } + else { + cumROC <- rbindlist(cumROC)[,lapply(.SD, mean),by=list(model,times,TPR)] + } } + setorder(cumROC, cols = "TPR") + ROC.res <- list(plotframe=cumROC,plotmethod="ROC") + class(ROC.res) <- "scoreROC" + crossvalPerf[["ROC"]] <- ROC.res } - else { - cumROC <- lapply(cumROC,function(x) x[,fancy.fun(risk,TPR,FPR,roc.method=roc.method,roc.grid=roc.grid),by=list(model,times)]) - if (roc.method == "vertical"){ - cumROC <- rbindlist(cumROC)[,lapply(.SD, mean),by=list(model,times,FPR)] - } - else { - cumROC <- rbindlist(cumROC)[,lapply(.SD, mean),by=list(model,times,TPR)] - } + if (ipa==TRUE){ + if (response.type=="binary") + crossvalPerf[["Brier"]][["score"]][,IPA:=1-Brier/Brier[model=="Null model"]] + else + crossvalPerf[["Brier"]][["score"]][,IPA:=1-Brier/Brier[model=="Null model"],by=times] } - setorder(cumROC, cols = "TPR") - ROC.res <- list(plotframe=cumROC,plotmethod="ROC") - class(ROC.res) <- "scoreROC" - crossvalPerf[["ROC"]] <- ROC.res } - if (ipa==TRUE){ - if (response.type=="binary") - crossvalPerf[["Brier"]][["score"]][,IPA:=1-Brier/Brier[model=="Null model"]] - else - crossvalPerf[["Brier"]][["score"]][,IPA:=1-Brier/Brier[model=="Null model"],by=times] - } - } - # }}} - # {{{ collect data for plotRisk - if ("risks"%in% summary){ - crossvalPerf[["risks"]]$score <- DT.B[,{ - c(.SD[1,Response.names,with=FALSE], - list(risk=mean(risk), - sd.risk=sd(risk), - oob=.N))},.SDcols=c(Response.names,"risk"),by=c(byvars,"riskRegression_ID")] - crossvalPerf[["risks"]]$score[,model:=factor(model,levels=mlevs,mlabels)] - setcolorder(crossvalPerf[["risks"]]$score,c("riskRegression_ID",byvars,Response.names,"risk","sd.risk")) - } - # }}} - # {{{ collect data for calibration plots - if ("Calibration" %in% plots){ - if (keep.residuals[[1]]==TRUE && split.method$name[[1]]=="LeaveOneOutBoot"){ - crossvalPerf[["Calibration"]]$plotframe <- crossvalPerf$Brier$Residuals[model!=0,] - } else{ - ## there are no residuals in this case. residuals are only available for LOOB! - crossvalPerf[["Calibration"]]$plotframe <- DT.B[model!=0,{ - c(.SD[1,Response.names,with=FALSE], + # }}} + # {{{ collect data for plotRisk + if ("risks"%in% summary){ + crossvalPerf[["risks"]]$score <- DT.B[,{ + c(.SD[1,response.names,with=FALSE], list(risk=mean(risk), - oob=.N))},.SDcols=c(Response.names,"risk"),by=c(byvars,"riskRegression_ID")] - setcolorder(crossvalPerf[["Calibration"]]$plotframe,c("riskRegression_ID",byvars,Response.names,"risk")) + sd.risk=sd(risk), + oob=.N))},.SDcols=c(response.names,"risk"),by=c(byvars,"riskRegression_ID")] + crossvalPerf[["risks"]]$score[,model:=factor(model,levels=mlevs,mlabels)] + setcolorder(crossvalPerf[["risks"]]$score,c("riskRegression_ID",byvars,response.names,"risk","sd.risk")) } - crossvalPerf[["Calibration"]]$plotframe[,model:=factor(model,levels=mlevs,mlabels)] - if (keep.residuals[[1]]==FALSE && split.method$name[[1]]=="LeaveOneOutBoot"){ - crossvalPerf$Brier$Residuals <- NULL + # }}} + # {{{ collect data for calibration plots + if ("Calibration" %in% plots){ + if (keep.residuals[[1]]==TRUE && split.method$name[[1]]=="LeaveOneOutBoot"){ + crossvalPerf[["Calibration"]]$plotframe <- crossvalPerf$Brier$Residuals[model!=0,] + } else{ + ## there are no residuals in this case. residuals are only available for LOOB! + crossvalPerf[["Calibration"]]$plotframe <- DT.B[model!=0,{ + c(.SD[1,response.names,with=FALSE], + list(risk=mean(risk), + oob=.N))},.SDcols=c(response.names,"risk"),by=c(byvars,"riskRegression_ID")] + setcolorder(crossvalPerf[["Calibration"]]$plotframe,c("riskRegression_ID",byvars,response.names,"risk")) + } + crossvalPerf[["Calibration"]]$plotframe[,model:=factor(model,levels=mlevs,mlabels)] + if (keep.residuals[[1]]==FALSE && split.method$name[[1]]=="LeaveOneOutBoot"){ + crossvalPerf$Brier$Residuals <- NULL + } + if (cens.type=="rightCensored") + crossvalPerf[["Calibration"]]$plotframe <- merge(jack,crossvalPerf[["Calibration"]]$plotframe,by=c("riskRegression_ID","times")) } - if (cens.type=="rightCensored") - crossvalPerf[["Calibration"]]$plotframe <- merge(jack,crossvalPerf[["Calibration"]]$plotframe,by=c("riskRegression_ID","times")) + # }}} } # }}} -} - # }}} # {{{ the output object if (split.method$internal.name=="noplan"){ if (keep.residuals==TRUE){ diff --git a/R/crossvalPerf.bootcv.R b/R/crossvalPerf.bootcv.R index 0fe766b0..1e66e582 100644 --- a/R/crossvalPerf.bootcv.R +++ b/R/crossvalPerf.bootcv.R @@ -1,41 +1,41 @@ # Function to calculate cross-validation performance crossvalPerf.bootcv <- function(m,crossval,se.fit,keep.cv,byvars,alpha){ - ## score - if (length(crossval[[1]][[m]]$score)>0){ - cv.score <- data.table::rbindlist(lapply(crossval,function(x){x[[m]]$score})) - if (se.fit==TRUE){ - bootcv.score <- cv.score[,data.table::data.table(mean(.SD[[m]],na.rm=TRUE), - se=sd(.SD[[m]],na.rm=TRUE), - lower=quantile(.SD[[m]],alpha/2,na.rm=TRUE), - upper=quantile(.SD[[m]],(1-alpha/2),na.rm=TRUE)),by=byvars,.SDcols=m] - data.table::setnames(bootcv.score,c(byvars,m,"se","lower","upper")) + ## score + if (length(crossval[[1]][[m]]$score)>0){ + cv.score <- data.table::rbindlist(lapply(crossval,function(x){x[[m]]$score})) + if (se.fit==TRUE){ + bootcv.score <- cv.score[,data.table::data.table(mean(.SD[[m]],na.rm=TRUE), + se=sd(.SD[[m]],na.rm=TRUE), + lower=quantile(.SD[[m]],alpha/2,na.rm=TRUE), + upper=quantile(.SD[[m]],(1-alpha/2),na.rm=TRUE)),by=byvars,.SDcols=m] + data.table::setnames(bootcv.score,c(byvars,m,"se","lower","upper")) + }else{ + bootcv.score <- cv.score[,data.table::data.table(mean(.SD[[m]],na.rm=TRUE)),by=byvars,.SDcols=m] + data.table::setnames(bootcv.score,c(byvars,m)) + } }else{ - bootcv.score <- cv.score[,data.table::data.table(mean(.SD[[m]],na.rm=TRUE)),by=byvars,.SDcols=m] - data.table::setnames(bootcv.score,c(byvars,m)) + cv.score <- NULL + bootcv.score <- NULL } - }else{ - cv.score <- NULL - bootcv.score <- NULL - } - ## contrasts - if (length(crossval[[1]][[m]]$contrasts)>0){ - cv.contrasts <- data.table::rbindlist(lapply(crossval,function(x){x[[m]]$contrasts})) - delta.m <- paste0("delta.",m) - bootcv.contrasts <- switch(as.character(se.fit), - "1"={cv.contrasts[,data.table::data.table(mean(.SD[[delta.m]],na.rm=TRUE), - lower=quantile(.SD[[delta.m]],alpha/2,na.rm=TRUE), - upper=quantile(.SD[[delta.m]],(1-alpha/2),na.rm=TRUE)),by=c(byvars,"reference"),.SDcols=c(delta.m)] - }, - "0"={ - bootcv.contrasts <- cv.contrasts[,data.table::data.table(mean(.SD[[delta.m]],na.rm=TRUE)),by=c(byvars,"reference"),.SDcols=delta.m] - }) - data.table::setnames(bootcv.contrasts,"V1",delta.m) - }else{ - cv.contrasts <- NULL - bootcv.contrasts <- NULL - } - out <- list(score=bootcv.score,contrasts=bootcv.contrasts) - if (keep.cv) - out <- c(out,list(cv.score=cv.score,cv.contrasts=cv.contrasts)) - out + ## contrasts + if (length(crossval[[1]][[m]]$contrasts)>0){ + cv.contrasts <- data.table::rbindlist(lapply(crossval,function(x){x[[m]]$contrasts})) + delta.m <- paste0("delta.",m) + bootcv.contrasts <- switch(as.character(as.numeric(se.fit)), + "1"={cv.contrasts[,data.table::data.table(mean(.SD[[delta.m]],na.rm=TRUE), + lower=quantile(.SD[[delta.m]],alpha/2,na.rm=TRUE), + upper=quantile(.SD[[delta.m]],(1-alpha/2),na.rm=TRUE)),by=c(byvars,"reference"),.SDcols=c(delta.m)] + }, + "0"={ + bootcv.contrasts <- cv.contrasts[,data.table::data.table(mean(.SD[[delta.m]],na.rm=TRUE)),by=c(byvars,"reference"),.SDcols=delta.m] + }) + data.table::setnames(bootcv.contrasts,"V1",delta.m) + }else{ + cv.contrasts <- NULL + bootcv.contrasts <- NULL + } + out <- list(score=bootcv.score,contrasts=bootcv.contrasts) + if (keep.cv) + out <- c(out,list(cv.score=cv.score,cv.contrasts=cv.contrasts)) + out } diff --git a/R/crossvalPerf.loob.AUC.R b/R/crossvalPerf.loob.AUC.R index 1ce2b08c..2309babf 100644 --- a/R/crossvalPerf.loob.AUC.R +++ b/R/crossvalPerf.loob.AUC.R @@ -3,9 +3,9 @@ ## Author: Thomas Alexander Gerds ## Created: Jun 4 2024 (09:20) ## Version: -## Last-Updated: Jun 4 2024 (15:08) +## Last-Updated: Jun 5 2024 (17:57) ## By: Thomas Alexander Gerds -## Update #: 11 +## Update #: 21 #---------------------------------------------------------------------- ## ### Commentary: @@ -17,16 +17,15 @@ crossvalPerf.loob.AUC <- function(times, mlevs, se.fit, - response.type, NT, - Response, + response.type, + response.names, cens.type, Weights, split.method, N, B, DT.B, - data, dolist, alpha, byvars, @@ -36,6 +35,7 @@ crossvalPerf.loob.AUC <- function(times, cause, # crossvalPerf.loob.Brier has more arguments ...) { + Response <- DT.B[,c(response.names,"riskRegression_ID"),with = FALSE][DT.B[,.I[1],by = "riskRegression_ID"]$V1] # initializing output bfold <- fold <- AUC <- riskRegression_event <- model <- b <- risk <- casecontrol <- IF.AUC <- IF.AUC0 <- se <- IF.AUC.conservative <- se.conservative <- lower <- upper <- NF <- reference <- riskRegression_event <- riskRegression_time <- riskRegression_status0 <- riskRegression_ID <- NULL if (response.type=="binary") { @@ -67,7 +67,8 @@ crossvalPerf.loob.AUC <- function(times, cause <- as.numeric(cause) rm(ind.mat) # first index for c++ function - first_hits <- sindex(Response[["riskRegression_time"]],times)-1 + if (response.type%in%c("survival","competing.risks")) + first_hits <- sindex(Response[["riskRegression_time"]],times)-1 for (s in 1:NT){ if (response.type=="binary"){ t <- 0 @@ -77,7 +78,7 @@ crossvalPerf.loob.AUC <- function(times, }else{ t <- times[s] if (response.type == "survival"){ - Response$riskRegression_status0 <- Response$riskRegression_status + Response[,riskRegression_status0 := riskRegression_status] } else { # competing risks oldEvent <- Response$riskRegression_event @@ -85,7 +86,7 @@ crossvalPerf.loob.AUC <- function(times, cause1ID <- oldEvent == 1 oldEvent[causeID] <- 1 oldEvent[cause1ID] <- cause - Response$riskRegression_status0 <- Response$riskRegression_status * oldEvent + Response[,riskRegression_status0 := riskRegression_status * oldEvent] # need a status variable with value # 0 = censored # 1 = cause of interest @@ -98,7 +99,7 @@ crossvalPerf.loob.AUC <- function(times, } # censoring weights if (cens.type=="rightCensored"){ - if (Weights$method!="cox"){ + if (Weights$method == "marginal"){ Wt <- Weights$IPCW.times[s] }else{ Wt <- Weights$IPCW.times[,s] @@ -111,8 +112,8 @@ crossvalPerf.loob.AUC <- function(times, w.controls <- weights[controls.index] n.cases <- sum(cases.index) n.controls <- sum(controls.index) - riskRegression_ID.case <- data[cases.index,]$riskRegression_ID - riskRegression_ID.controls <- data[controls.index,]$riskRegression_ID + riskRegression_ID.case <- Response[cases.index,]$riskRegression_ID + riskRegression_ID.controls <- Response[controls.index,]$riskRegression_ID muCase <- sum(w.cases) muControls <- sum(w.controls) Phi <- (1/N^2)*muCase*muControls @@ -136,7 +137,11 @@ crossvalPerf.loob.AUC <- function(times, } risk.mat <- matrix(NA,nrow=N,ncol=n.col) risk.mat[split.index] <- DTrisk - res <- aucLoobFun(riskRegression_ID.case,riskRegression_ID.controls,risk.mat,split.index,weights) + res <- aucLoobFun(riskRegression_ID.case, + riskRegression_ID.controls, + risk.mat, + split.index, + weights) ic0Case <- res[["ic0Case"]] ic0Control <- res[["ic0Control"]] warn <- res[["warn"]] diff --git a/R/crossvalPerf.loob.Brier.R b/R/crossvalPerf.loob.Brier.R index a0c5f0bd..321280ba 100644 --- a/R/crossvalPerf.loob.Brier.R +++ b/R/crossvalPerf.loob.Brier.R @@ -3,9 +3,9 @@ ## Author: Thomas Alexander Gerds ## Created: Jun 4 2024 (09:16) ## Version: -## Last-Updated: Jun 4 2024 (15:08) +## Last-Updated: Jun 5 2024 (18:00) ## By: Thomas Alexander Gerds -## Update #: 9 +## Update #: 37 #---------------------------------------------------------------------- ## ### Commentary: @@ -17,16 +17,15 @@ crossvalPerf.loob.Brier <- function(times, mlevs, se.fit, - response.type, NT, - Response, + response.type, + response.names, cens.type, Weights, split.method, N, B, DT.B, - data, dolist, alpha, byvars, @@ -37,7 +36,7 @@ crossvalPerf.loob.Brier <- function(times, conservative, cens.model, cause){ - riskRegression_status = riskRegression_time <- residuals <- risk <- WTi <- riskRegression_event <- riskRegression_event <- Brier <- IC0 <- nth.times <- IF.Brier <- lower <- se <- upper <- model <- NF <- IPCW <- response <- reference <- riskRegression_status0 <- IBS <- NULL + riskRegression_status = riskRegression_time <- residuals <- risk <- WTi <- riskRegression_event <- riskRegression_event <- Brier <- IC0 <- nth.times <- IF.Brier <- lower <- se <- upper <- model <- NF <- IPCW <- reference <- riskRegression_status0 <- IBS <- NULL ## sum across bootstrap samples where subject i is out of bag if (cens.type=="rightCensored"){ if (response.type=="survival"){ @@ -59,7 +58,13 @@ crossvalPerf.loob.Brier <- function(times, } ## for each individual sum the residuals of the bootstraps where this individual is out-of-bag ## divide by number of times out-off-bag later - DT.B <- DT.B[,data.table::data.table(risk=mean(risk),residuals=sum(residuals)),by=c(byvars,"riskRegression_ID")] + ## keep response and weights for re-merging later + if (response.type%in%c("survival","competing.risks")){ + DT.info <- DT.B[,c(response.names,"riskRegression_ID","WTi","Wt"),with = FALSE][DT.B[,.I[1],by = "riskRegression_ID"]$V1] + }else{ + DT.info <- DT.B[,c(response.names,"riskRegression_ID"),with = FALSE][DT.B[,.I[1],by = "riskRegression_ID"]$V1] + } + DT.B <- DT.B[,data.table::data.table(risk=mean(risk),residuals=sum(residuals)), by=c(byvars,"riskRegression_ID")] ## get denominator if (split.method$name=="LeaveOneOutBoot"){ ind.mat <- do.call("cbind",(lapply(1:B,split.method$index))) @@ -92,26 +97,10 @@ crossvalPerf.loob.Brier <- function(times, ## se.brier <- DT.B[,list(se=sd(IC0, na.rm=TRUE)/sqrt(N)),by=byvars] ## DT.B[,Brier:=NULL] if (cens.type[1]=="rightCensored" && !conservative){ - # merge with - ## FIXME HERE - rr_vars <- grep("^riskRegression_",names(data)) - DT.B <- data[,rr_vars,with=FALSE][DT.B,,on="riskRegression_ID"] + # merge with response, riskRegression_ID and weights + DT.B <- DT.info[DT.B,,on="riskRegression_ID"] DT.B[,nth.times:=as.numeric(factor(times))] - WW <- data.table(riskRegression_ID=1:N,WTi=Weights$IPCW.subject.times,key="riskRegression_ID") - ## merge WW with DT.B by riskRegression_ID, while retaining the order of DT.B - DT.B <- merge(DT.B,WW,by="riskRegression_ID") - ## DT.B[,WTi:=rep(Weights$IPCW.subject.times,NF+length(nullobject))] - if (Weights$method=="marginal"){ - Wt <- data.table(times=times,Wt=Weights$IPCW.times) - ## OBS: many digits in times may cause merge problems - DT.B <- merge(DT.B,Wt,by=c("times")) - }else{ - Wt <- data.table(times=rep(times,rep(N,NT)), - Wt=c(Weights$IPCW.times), - # FIXME HERE - riskRegression_ID=data$riskRegression_ID) - DT.B <- merge(DT.B,Wt,by=c("riskRegression_ID","times")) - } + data.table::setkeyv(DT.B,c(byvars,"riskRegression_ID")) if (cens.type=="uncensored"){ DT.B[,IF.Brier:= residuals] score.loob <- DT.B[,data.table(Brier=sum(residuals)/N,se=sd(residuals)/sqrt(N)),by=byvars] @@ -232,26 +221,20 @@ crossvalPerf.loob.Brier <- function(times, DT.B[,IPCW:=1/WTi] DT.B[riskRegression_time>=times,IPCW:=1/Wt] DT.B[riskRegression_time0){ output$contrasts[,model:=factor(model,levels=mlevs,mlabels)] output$contrasts[,reference:=factor(reference,levels=mlevs,mlabels)] - if (response.type%in%c("survival","competing.risks")) - setkey(output$score,model,times) - else - setkey(output$score,model) + data.table::setkeyv(output$score,byvars) } return(output) } diff --git a/R/getCensoringWeights.R b/R/getCensoringWeights.R index 33c06f16..1c3d4f6b 100644 --- a/R/getCensoringWeights.R +++ b/R/getCensoringWeights.R @@ -1,11 +1,10 @@ getCensoringWeights <- function(formula, data, - response, times, cens.model, - response.type, influence.curve=FALSE, - censoring.save.memory){ + censoring.save.memory, + verbose){ data <- copy(data) if((cens.model != "KaplanMeier")){ if (length(attr(terms(formula),"factors"))==0){ @@ -25,10 +24,9 @@ getCensoringWeights <- function(formula, IPCW.subject.times=IPCW.subject.times, method=cens.model,IC.data=NULL) if (influence.curve==TRUE){ - #Should not have to calculate IF for Nelson-AAlen - out <- c(out, - list(IC=NULL)) - + # Do not yet calculate IF for Nelson-AAlen + # this happens later + out <- c(out,list(IC=NULL)) # out <- c(out, # list(IC=IC_Nelson_Aalen_cens_time(time=data$riskRegression_time,status=data$riskRegression_status))) ## list(IC=data[,getInfluenceCurve.NelsonAalen(time=riskRegression_time,riskRegression_status=riskRegression_status)])) @@ -66,7 +64,6 @@ getCensoringWeights <- function(formula, minimumIncrement <- min(incrementsOfTime[incrementsOfTime > 0]) #need > 0 in case of ties IPCW.subject.times <- as.numeric(rms::survest(fit,newdata=wdata,times=Y-minimumIncrement/2,what='parallel',se.fit=FALSE)) #Y-minimumIncrement/2 # subject.times <- Y[(((Y<=max(times))*riskRegression_status)==1)] - out <- list(IPCW.times=IPCW.times,IPCW.subject.times=IPCW.subject.times,method=cens.model) if (influence.curve==TRUE){ TiMinus = Y-minimumIncrement/2 @@ -109,15 +106,15 @@ getCensoringWeights <- function(formula, stop("Use stratification with Cox instead. ") }, { - warning("Using other models (than Cox) for getting the censoring weights is under construction. ") + if (verbose>0)warning("Using other models (than Cox) for getting the censoring weights is under construction. ") vv <- all.vars(formula(delete.response(terms(formula)))) new.formula<-as.formula(paste0("Surv(riskRegression_time,riskRegression_status)",paste0("~",paste0(paste(vv,collapse = "+"))))) wdata <- copy(data) wdata[,riskRegression_status:=1-riskRegression_status] input <- list(formula=new.formula,data=wdata) - message("Fitting censoring model to data ...", appendLF = FALSE) + if (verbose>1)message("Fitting censoring model to data ...", appendLF = FALSE) fit <- do.call(cens.model,input) - message("done!") + if (verbose>1)message("done!") if (influence.curve){ if (is.null(data[["event"]])){ new.formula<-as.formula(paste0("Surv(riskRegression_time,riskRegression_status==1)",paste0("~",paste0(paste(vv,collapse = "+"))))) @@ -126,9 +123,9 @@ getCensoringWeights <- function(formula, new.formula<-as.formula(paste0("Surv(riskRegression_time,riskRegression_event==1)",paste0("~",paste0(paste(vv,collapse = "+"))))) } input <- list(formula=new.formula,data=wdata) - message("Fitting time model to data ...", appendLF = FALSE) + if (verbose>0)message("Fitting time model to data ...", appendLF = FALSE) fit.time <- do.call(cens.model,input) - message("done!") + if (verbose>0)message("done!") } else { fit.time <- NULL @@ -141,7 +138,11 @@ getCensoringWeights <- function(formula, # times.data.minus <- c(0,wdata$time[-length(wdata$time)]) #have to compute the weights for T_i minus not just Ti IPCW.subject.times <- diag(1-predictRisk(fit,wdata,wdata$time,1)) #computational problem with predictRisk IPCW.times <- 1-predictRisk(fit,wdata,times,1) - out <- list(IPCW.times=IPCW.times,IPCW.subject.times=IPCW.subject.times,method=cens.model,IC.data=IC.data) + out <- list( + IPCW.times=IPCW.times, + IPCW.subject.times=IPCW.subject.times, + method=cens.model, + IC.data=IC.data) }) out$dim <- ifelse(cens.model=="KaplanMeier" || cens.model == "marginal",0,1) out diff --git a/R/getComparisons.R b/R/getComparisons.R index 7e30fd5a..cedbf677 100644 --- a/R/getComparisons.R +++ b/R/getComparisons.R @@ -3,9 +3,9 @@ ## author: Thomas Alexander Gerds ## created: Jan 3 2016 (13:30) ## Version: -## last-updated: Jun 5 2024 (07:23) +## last-updated: Jun 5 2024 (07:40) ## By: Thomas Alexander Gerds -## Update #: 53 +## Update #: 54 #---------------------------------------------------------------------- ## ### Commentary: @@ -39,6 +39,10 @@ getComparisons <- function(dt,NF,N,alpha,dolist=NF:1,se.fit){ lower=lower, upper=upper, p=p) + }else{ + data.table(model=theta[model%in%g[-1]][["model"]], + reference=g[1], + delta=delta) } })) } else { diff --git a/R/getPerformanceData.R b/R/getPerformanceData.R index 8bb3627c..e6b42d8a 100644 --- a/R/getPerformanceData.R +++ b/R/getPerformanceData.R @@ -3,9 +3,9 @@ ## Author: Thomas Alexander Gerds ## Created: Feb 27 2022 (09:12) ## Version: -## Last-Updated: Jun 4 2024 (09:08) +## Last-Updated: Jun 5 2024 (18:02) ## By: Thomas Alexander Gerds -## Update #: 48 +## Update #: 66 #---------------------------------------------------------------------- ## ### Commentary: @@ -30,7 +30,8 @@ getPerformanceData <- function(testdata, cens.type, object, object.classes, - NT){ + NT, + verbose){ riskRegression_ID=model = risk = NULL # inherit everything else from parent frame: object, nullobject, NF, NT, times, cause, response.type, etc. Brier=IPA=IBS=NULL @@ -38,8 +39,8 @@ getPerformanceData <- function(testdata, N <- as.numeric(NROW(testdata)) # split data vertically into response and predictors X rr_vars <- grep("^riskRegression_",names(testdata)) - response <- testdata[,rr_vars,with=FALSE] - setkey(response,riskRegression_ID) + testresponse <- testdata[,rr_vars,with=FALSE] + data.table::setkey(testresponse,riskRegression_ID) X <- testdata[,-rr_vars,with=FALSE] if (debug) message("\nExtracted test set and prepared output object") # }}} @@ -67,11 +68,7 @@ getPerformanceData <- function(testdata, if (response.type=="binary"){ p <- do.call("predictRisk", c(list(object=c(object[[f]])),args))[neworder] }else{ - ## if(!is.null(include.times)){ ## remove columns at times beyond max time - ## p <- c(do.call("predictRisk",c(list(object=object[[f]][,include.times,drop=FALSE]),args))[neworder,]) - ## } else{ p <- c(do.call("predictRisk",c(list(object=object[[f]]),args))[neworder,]) - ## } } } else{ ## either binary or only one time point @@ -85,7 +82,7 @@ getPerformanceData <- function(testdata, model.f$call$data <- trainX trained.model <- try(eval(model.f$call),silent=TRUE) if (inherits(x=trained.model,what="try-error")){ - message(paste0("Failed to train the following model:")) + if (verbose>1)message(paste0("Failed to train the following model:")) try(eval(model.f$call),silent=FALSE) stop() } @@ -102,17 +99,18 @@ getPerformanceData <- function(testdata, } if (response.type%in%c("survival","competing.risks")){ out <- data.table(riskRegression_ID=testdata[["riskRegression_ID"]],model=f,risk=p,times=rep(times,rep(N,NT))) - setkey(out,model,times,riskRegression_ID) + byvars <- c("model","times") + data.table::setkey(out,model,times,"riskRegression_ID") out } else { out <- data.table(riskRegression_ID=testdata[["riskRegression_ID"]],model=f,risk=p) + byvars <- c("model") setkey(out,model,riskRegression_ID) out } })) if (any(is.na(pred$risk))) { - ## browser(skipCalls = 1) - message("Table of missing values in predicted risks:") + if (verbose>1)message("Table of missing values in predicted risks:") pred[,model:=factor(model,levels=levs,labels)] if (response.type[1] == "binary"){ print(pred[is.na(risk),data.table::data.table("sum(NA)" = .N),by = list(model)]) @@ -128,27 +126,30 @@ getPerformanceData <- function(testdata, if (cens.type=="rightCensored"){ Weights <- testweights ## add subject specific weights - set(response,j="WTi",value=Weights$IPCW.subject.times) + set(testresponse,j="WTi",value=Weights$IPCW.subject.times) } else { if (cens.type=="uncensored"){ Weights <- list(IPCW.times=rep(1,NT),IPCW.subject.times=matrix(1,ncol=NT,nrow=N)) Weights$method <- "marginal" - set(response,j="WTi",value=1) + set(testresponse,j="WTi",value=1) } else{ stop("Cannot handle this type of censoring.") } } ## add time point specific weights if (Weights$method=="marginal"){ - Wt <- data.table(times=times,Wt=Weights$IPCW.times) - ## OBS: many digits in times may cause merge problems - pred <- merge(pred,Wt,by=c("times")) - }else{ - Wt <- data.table(times=rep(times,rep(N,NT)), - Wt=c(Weights$IPCW.times), - riskRegression_ID=testdata$riskRegression_ID) - pred <- merge(pred,Wt,by=c("riskRegression_ID","times")) + Wt <- data.table(times = times,Wt=c(Weights$IPCW.times)) + pred <- Wt[pred,,on="times"] + data.table::setkey(pred,model,times,riskRegression_ID) + }else{ # here as many weights as there are subjects at each element of times + Wt <- rbindlist(lapply(1:length(times),function(s){ + data.table(riskRegression_ID = testresponse$riskRegression_ID, + times=rep(times[[s]],nrow(Weights$IPCW.times)), + Wt=Weights$IPCW.times[,s]) + })) + pred <- Wt[pred,,on=c("riskRegression_ID","times")] } + data.table::setkey(pred,model,times,riskRegression_ID) if (debug) message("merged the weights with input for performance metrics") } else { ## if (response.type=="binary") @@ -157,10 +158,10 @@ getPerformanceData <- function(testdata, if (debug) message("added weights to predictions") # }}} # {{{ merge with response - DT=merge(response,pred,by="riskRegression_ID") + DT=merge(testresponse,pred,by="riskRegression_ID") + data.table::setkey(DT,riskRegression_ID) DT } - #---------------------------------------------------------------------- ### getPerformanceData.R ends here diff --git a/R/getResponse.R b/R/getResponse.R index cccb32a0..26cecb78 100644 --- a/R/getResponse.R +++ b/R/getResponse.R @@ -53,11 +53,12 @@ getResponse <- function(formula,cause,data,vars){ if (responseNames[2]=="Surv"){ formula[[2]][[1]] <- as.name("Hist") } - response <- unclass(eval(formula[[2]],data)) - ## m <- stats::model.frame(formula=formula, - ## data=data, - ## na.action=na.fail) - ## response <- unclass(stats::model.response(m)) + temp <- unclass(eval(formula[[2]],data)) + response <- data.table::as.data.table(temp) + data.table::setattr(response,"states",attr(temp,"states")) + data.table::setattr(response,"event",attr(temp,"event")) + data.table::setattr(response,"model",attr(temp,"model")) + data.table::setattr(response,"cens.type",attr(temp,"cens.type")) } else{ stop("Cannot assign response type.") diff --git a/man/Score.Rd b/man/Score.Rd index f59bb219..88c21b8b 100644 --- a/man/Score.Rd +++ b/man/Score.Rd @@ -21,7 +21,6 @@ Score(object, ...) null.model = TRUE, se.fit = TRUE, conservative = FALSE, - multi.split.test = FALSE, conf.int = 0.95, contrasts = TRUE, probs = c(0, 0.25, 0.5, 0.75, 1), @@ -46,6 +45,7 @@ Score(object, ...) roc.grid = switch(roc.method, vertical = seq(0, 1, 0.01), horizontal = seq(1, 0, -0.01)), cutpoints = NULL, + verbose = 2, ... ) } @@ -112,8 +112,6 @@ variability of the estimate of the inverse probability of censoring weights when errors for prediction performance parameters. This can potentially reduce computation time and memory usage at a usually very small expense of a slightly higher standard error.} -\item{multi.split.test}{Logical or \code{0} or \code{1}. If \code{FALSE} or \code{0} do not calculate multi-split tests. This argument is ignored when \code{split.method} is "none".} - \item{conf.int}{Either logical or a numeric value between 0 and 1. In right censored data, confidence intervals are based on Blanche et al (see references). Setting \code{FALSE} prevents the computation of confidence intervals. \code{TRUE} computes 95 percent confidence @@ -148,10 +146,15 @@ Cox regression (\code{"cox"}) both applied to the censored times. If the right h the Kaplan-Meier method is used even if this argument is set to \code{"cox"}. Also implemented is a template for users specifying other models to estimate the IPCW. Here the user should be supply a function, taking as input a \code{"formula"} and \code{"data"}. This does come at the cost of only being able to calculate conservative confidence intervals.} -\item{split.method}{Method for cross-validation. Right now the only choices are \code{bootcv}, \code{cvk} and \code{loob}. In the first case, bootstrap learning sets -are drawn with our without replacement (argument \code{M}) from \code{data}. The data not included in the current bootstrap learning -set are used as validation set to compute the prediction performance. In the second case, k-fold cross-validation is performed. Note that k has to be an explicit number, e.g. 5 or 10, -when passing this as an argument. In the third case, leave-one-out bootstrap cross-validation is performed for the Brier score and leave-pair-out bootstrap cross-validation is performed for the AUC.} +\item{split.method}{Method for cross-validation. Right now the available choices are +\itemize{ +\item \code{"bootcv"} bootstrap cross-validation. Argument \code{B} controls the number of bootstraps. +\item \code{"cv5"} 5-fold cross-validation. Argument \code{B} controls how many times to repeat 5-fold crossvalidation. +\item \code{"cv10"} 10-fold cross-validation. Argument \code{B} controls how many times to repeat 10-fold crossvalidation. +\item \code{"cvk"} k-fold cross-validation for any value of k between 2 and N-1 where N is the sample size. Argument \code{B} controls how many times to repeat k-fold crossvalidation. +\item \code{"loob"} leave-one-out bootstrap cross-validation is performed for the Brier score and leave-pair-out bootstrap cross-validation is performed for the AUC. Argument \code{B} controls the number of bootstraps. +\item \code{"none"} no data splitting +}} \item{B}{Number of bootstrap sets for cross-validation. \code{B} should be set to 1, when k-fold cross-validation is used.} @@ -214,6 +217,8 @@ obtained for different data splits during crossvalidation.} \item{cutpoints}{If not \code{NULL}, estimates and standard errors of the TPR (True Positive Rate), FPR (False Positive Rate), PPV (Positive Predictive Value), and NPV (Negative Predictive Value) are given at the \code{cutpoints}. These values are saved in \code{object$AUC$res.cut}.} + +\item{verbose}{Verbosity level. Set to '-1' or 'FALSE' to get complete silence. Set to '0' to quiet all messages and warnings but keep the progress.bar. Set to '1' to see warnings and to '2' (the default) to see both warnings and messages.} } \value{ List with scores and assessments of contrasts, i.e., diff --git a/man/plotCalibration.Rd b/man/plotCalibration.Rd index b72c651d..91ed895e 100644 --- a/man/plotCalibration.Rd +++ b/man/plotCalibration.Rd @@ -9,7 +9,7 @@ plotCalibration( models, times, method = "nne", - cens.method, + cens.method = "local", round = 2, bandwidth = NULL, q = 10, diff --git a/slowtests/slowtest-score.R b/slowtests/slowtest-score.R index 620d7fb2..f46ef984 100644 --- a/slowtests/slowtest-score.R +++ b/slowtests/slowtest-score.R @@ -3,9 +3,9 @@ ## Author: Thomas Alexander Gerds ## Created: Dec 6 2020 (09:25) ## Version: -## Last-Updated: Sep 17 2022 (07:02) +## Last-Updated: Jun 5 2024 (18:13) ## By: Thomas Alexander Gerds -## Update #: 10 +## Update #: 13 #---------------------------------------------------------------------- ## ### Commentary: @@ -314,20 +314,31 @@ if (requireNamespace("pec",quietly=TRUE)){ ## predictRisk(fit.lrr,times=c(1,10,100,1000),newdata=Melanoma) fit.arr2 <- ARR(Hist(time,status)~thick+age,data=Melanoma,cause=1) fit.arr2a <- ARR(Hist(time,status)~tp(thick,power=1),data=Melanoma,cause=1) - fit.arr2b <- ARR(Hist(time,status)~timevar(thick),data=Melanoma,cause=1) - system.time(old <- pec(list(ARR=fit.arr2a,ARR.power=fit.arr2b,LRR=fit.lrr), - data=Melanoma, - formula=Hist(time,status)~1, - cause=1, B=10,split.method="none")) + ## fit.arr2b <- ARR(Hist(time,status)~timevar(thick),data=Melanoma,cause=1) + system.time(old <- pec(list( + ARR=fit.arr2a, + ## ARR.power=fit.arr2b, + LRR=fit.lrr), + data=Melanoma, + formula=Hist(time,status)~1, + times = c(500,1000,2000,3000), + exact = FALSE, + start = NULL, + cause=1, B=10,split.method="none")) ## predictRisk(fit.arr2a,newdata=Melanoma[1:10,],times=0) - system.time(new <- Score(list(ARR=fit.arr2a,ARR.power=fit.arr2b,LRR=fit.lrr), - data=Melanoma,conf.int=0, - times=c(0,sort(unique(Melanoma$time))), - metrics="brier",plots=NULL,summary=NULL, - formula=Hist(time,status)~1, - cause=1, B=10,split.method="none")) - nix <- lapply(1:4,function(m){ - expect_equal(ignore_attr=TRUE,new$Brier$score[model==names(new$models)[m]][["Brier"]], + system.time(new <- Score(list( + ARR=fit.arr2a, + ## ARR.power=fit.arr2b, + LRR=fit.lrr), + data=Melanoma,se.fit = FALSE, + ## times=c(0,sort(unique(Melanoma$time))), + metrics="brier",plots=NULL,summary=NULL, + formula=Hist(time,status)~1, + times = c(500,1000,2000,3000), + cause=1, B=10,split.method="none")) + nix <- lapply(1:3,function(m){ + expect_equal(ignore_attr=TRUE, + new$Brier$score[model==names(new$models)[m]][["Brier"]], old$AppErr[[names(old$AppErr)[[m]]]])}) }) } diff --git a/slowtests/test-score-bootstrap.R b/slowtests/test-score-bootstrap.R index 3cf1515f..a65485c7 100644 --- a/slowtests/test-score-bootstrap.R +++ b/slowtests/test-score-bootstrap.R @@ -6,14 +6,17 @@ library(randomForestSRC) test_that("loob survival",{ set.seed(8) - learndat=sampleData(200,outcome="survival") + ## learndat=sampleData(200,outcome="survival") + learndat=sampleData(38,outcome="survival") cox1a = coxph(Surv(time,event)~X6,data=learndat,x=TRUE,y=TRUE) cox2a = coxph(Surv(time,event)~X7+X8+X9,data=learndat,x=TRUE,y=TRUE) ## leave-one-out bootstrap set.seed(5) - loob.se0 <- Score(list("COX1"=cox1a,"COX2"=cox2a),formula=Surv(time,event)~1,data=learndat,times=5,split.method="loob",B=100,se.fit=FALSE,progress.bar=NULL) + loob.se0 <- Score(list("COX1"=cox1a,"COX2"=cox2a),formula=Surv(time,event)~1,data=learndat,times=5,split.method="loob",B=100,se.fit=FALSE,progress.bar=NULL,verbose = -1) set.seed(5) - loob.se1 <- Score(list("COX1"=cox1a,"COX2"=cox2a),formula=Surv(time,event)~1,data=learndat,times=5,split.method="loob",B=100,se.fit=TRUE,progress.bar=NULL) + loob.se1 <- Score(list("COX1"=cox1a,"COX2"=cox2a),formula=Surv(time,event)~1,data=learndat,times=5,split.method="loob",B=100,se.fit=TRUE,progress.bar=NULL,verbose = -1) + # covariate dependent censoring + loob.se1a <- Score(list("COX1"=cox1a,"COX2"=cox2a),formula=Surv(time,event)~X1,data=learndat,times=5,split.method="loob",B=100,se.fit=TRUE,progress.bar=NULL,verbose = -1) ## set.seed(5) ## loob.se1 <- Score(list("COX1"=cox1a,"COX2"=cox2a),formula=Surv(time,event)~1,data=learndat,times=5,split.method="loob",B=100,se.fit=TRUE,metrics="brier",conservative=TRUE) expect_equal(ignore_attr=TRUE,loob.se0$AUC$contrasts$delta,loob.se1$AUC$contrasts$delta) diff --git a/slowtests/test-score-crossval.R b/slowtests/test-score-crossval.R index 32607742..7e38e9bf 100644 --- a/slowtests/test-score-crossval.R +++ b/slowtests/test-score-crossval.R @@ -7,6 +7,8 @@ library(data.table) library(ranger) context("riskRegression") + + #does give some warnings probably, nothing too serious test.cv <- function(){ type.cvs <- c("cv5","loob", "bootcv") @@ -17,7 +19,8 @@ test.cv <- function(){ list(data = "survival", model = "coxph", response = "Surv(time,event)",left.extra = "x", right.extra = TRUE), list(data = "competing.risks", model = "CSC", response = "Hist(time,event)")) for (cv in type.cvs){ - for (typ in types){ + for (typ in types){ + browser() cat(paste0("Testing ", typ$data, " data with ",cv, " \n")) test_that(paste0("Testing ", typ$data, " data with ",cv),{ set.seed(18) @@ -116,4 +119,4 @@ test.cv2 <- function(){ } } -test.cv2() \ No newline at end of file +test.cv2()