Merge pull request #246 from OHDSI/develop

Release v3.5.0
OHDSI · Apr 18, 2024 · 4a10c69 · msuchard · Apr 20, 2024 · msuchard
2 parents 5049d54 + b8c964b
commit 4a10c69
Show file tree

Hide file tree

Showing 71 changed files with 410 additions and 80 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: FeatureExtraction
 Type: Package
 Title: Generating Features for a Cohort
-Version: 3.4.1
-Date: 2024-03-28
+Version: 3.5.0
+Date: 2024-04-18
 Authors@R: c(
   person("Martijn", "Schuemie", , "[email protected]", role = c("aut")),
   person("Marc", "Suchard", role = c("aut")),
@@ -44,6 +44,6 @@ VignetteBuilder: knitr
 URL: https://github.com/OHDSI/FeatureExtraction
 BugReports: https://github.com/OHDSI/FeatureExtraction/issues
 NeedsCompilation: no
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
 Encoding: UTF-8
 Language: en-US
diff --git a/NEWS.md b/NEWS.md
@@ -1,9 +1,20 @@
+FeatureExtraction 3.5.0
+=======================
+
+New Features:
+
+- Adds the ability to filter covariates by setting a minimum threshold for covariate mean (#174)
+
+Bug Fixes:
+
+- Table 1 - does not report correct subject count (#237) 
+
 FeatureExtraction 3.4.1
 =======================
 
 Bug Fixes:
 
-- Weely R-check fails (#239)
+- Weekly R-check fails (#239)
 - BigQuery error (#208)
 - Error when specifying 1 temporal window in temporalCovariateSettings (#200)
 - metaData aggregation issue (#195)

diff --git a/R/GetCovariates.R b/R/GetCovariates.R
@@ -59,6 +59,9 @@
 #'                               of the createCovariate functions, or a list of such objects.
 #' @param aggregated             Should aggregate statistics be computed instead of covariates per
 #'                               cohort entry?
+#' @param minCharacterizationMean The minimum mean value for characterization output. Values below this will be cut off from output. This 
+#'                                will help reduce the file size of the characterization output, but will remove information
+#'                                on covariates that have very low values. The default is 0.
 #'
 #' @return
 #' Returns an object of type \code{covariateData}, containing information on the covariates.
@@ -101,7 +104,8 @@ getDbCovariateData <- function(connectionDetails = NULL,
                                cohortIds = c(-1),
                                rowIdField = "subject_id",
                                covariateSettings,
-                               aggregated = FALSE) {
+                               aggregated = FALSE,
+                               minCharacterizationMean = 0) {
   if (is.null(connectionDetails) && is.null(connection)) {
     stop("Need to provide either connectionDetails or connection")
   }
@@ -115,6 +119,10 @@ getDbCovariateData <- function(connectionDetails = NULL,
     warning("cohortId argument has been deprecated, please use cohortIds")
     cohortIds <- cohortId
   }
+  errorMessages <- checkmate::makeAssertCollection()
+  minCharacterizationMean <- utils::type.convert(minCharacterizationMean, as.is = TRUE)
+  checkmate::assertNumeric(x = minCharacterizationMean, lower = 0, add = errorMessages)
+  checkmate::reportAssertions(collection = errorMessages)
   if (!is.null(connectionDetails)) {
     connection <- DatabaseConnector::connect(connectionDetails)
     on.exit(DatabaseConnector::disconnect(connection))
@@ -164,7 +172,8 @@ getDbCovariateData <- function(connectionDetails = NULL,
                      cdmVersion = cdmVersion,
                      rowIdField = rowIdField,
                      covariateSettings = covariateSettings[[i]],
-                     aggregated = aggregated)
+                     aggregated = aggregated,
+                     minCharacterizationMean = minCharacterizationMean)
         tempCovariateData <- do.call(eval(parse(text = fun)), args)
         if (is.null(covariateData)) {
           covariateData <- tempCovariateData

diff --git a/R/GetCovariatesFromOtherCohorts.R b/R/GetCovariatesFromOtherCohorts.R
@@ -22,7 +22,9 @@
 #' @param covariateSettings   An object of type \code{covariateSettings} as created using the
 #'                            \code{\link{createCohortBasedCovariateSettings}} or
 #'                            \code{\link{createCohortBasedTemporalCovariateSettings}} functions.
-#'
+#' @param minCharacterizationMean The minimum mean value for characterization output. Values below this will be cut off from output. This 
+#'                                will help reduce the file size of the characterization output, but will remove information
+#'                                on covariates that have very low values. The default is 0.
 #' @template GetCovarParams
 #'
 #' @export
@@ -35,7 +37,8 @@ getDbCohortBasedCovariatesData <- function(connection,
                                            cdmVersion = "5",
                                            rowIdField = "subject_id",
                                            covariateSettings,
-                                           aggregated = FALSE) {
+                                           aggregated = FALSE,
+                                           minCharacterizationMean = 0) {
   errorMessages <- checkmate::makeAssertCollection()
   checkmate::assertClass(connection, "DatabaseConnectorConnection", add = errorMessages)
   checkmate::assertCharacter(oracleTempSchema, len = 1, null.ok = TRUE, add = errorMessages)
@@ -46,6 +49,8 @@ getDbCohortBasedCovariatesData <- function(connection,
   checkmate::assertCharacter(rowIdField, len = 1, add = errorMessages)
   checkmate::assertClass(covariateSettings, "covariateSettings", add = errorMessages)
   checkmate::assertLogical(aggregated, len = 1, add = errorMessages)
+  minCharacterizationMean <- utils::type.convert(minCharacterizationMean, as.is = TRUE)
+  checkmate::assertNumeric(x = minCharacterizationMean, lower = 0, add = errorMessages)
   checkmate::reportAssertions(collection = errorMessages)
   if (!missing(cohortId)) { 
     warning("cohortId argument has been deprecated, please use cohortIds")
@@ -139,7 +144,8 @@ getDbCohortBasedCovariatesData <- function(connection,
     cdmVersion = cdmVersion,
     rowIdField = rowIdField,
     covariateSettings = detailledSettings,
-    aggregated = aggregated
+    aggregated = aggregated,
+    minCharacterizationMean = minCharacterizationMean
   )
 
   sql <- "TRUNCATE TABLE #covariate_cohort_ref; DROP TABLE #covariate_cohort_ref;"

diff --git a/R/GetDefaultCovariates.R b/R/GetDefaultCovariates.R
@@ -31,6 +31,9 @@
 #'                               it is a temp table, do not specify \code{targetDatabaseSchema}.
 #' @param targetCovariateRefTable (Optional) The name of the table where the covariate reference will be stored.
 #' @param targetAnalysisRefTable (Optional) The name of the table where the analysis reference will be stored.
+#' @param minCharacterizationMean The minimum mean value for characterization output. Values below this will be cut off from output. This 
+#'                                will help reduce the file size of the characterization output, but will remove information
+#'                                on covariates that have very low values. The default is 0.
 #'
 #' @template GetCovarParams
 #'
@@ -65,7 +68,8 @@ getDbDefaultCovariateData <- function(connection,
                                       targetCovariateTable,
                                       targetCovariateRefTable,
                                       targetAnalysisRefTable,
-                                      aggregated = FALSE) {
+                                      aggregated = FALSE,
+                                      minCharacterizationMean = 0) {
   if (!is(covariateSettings, "covariateSettings")) {
     stop("Covariate settings object not of type covariateSettings")
   }
@@ -79,6 +83,11 @@ getDbDefaultCovariateData <- function(connection,
     warning("cohortId argument has been deprecated, please use cohortIds")
     cohortIds <- cohortId
   }
+  errorMessages <- checkmate::makeAssertCollection()
+  minCharacterizationMean <- utils::type.convert(minCharacterizationMean, as.is = TRUE)
+  checkmate::assertNumeric(x = minCharacterizationMean, lower = 0, add = errorMessages)
+  checkmate::reportAssertions(collection = errorMessages)
+
   settings <- .toJson(covariateSettings)
   rJava::J("org.ohdsi.featureExtraction.FeatureExtraction")$init(system.file("", package = "FeatureExtraction"))
   json <- rJava::J("org.ohdsi.featureExtraction.FeatureExtraction")$createSql(settings, aggregated, cohortTable, rowIdField, rJava::.jarray(as.character(cohortIds)), cdmDatabaseSchema)
@@ -126,6 +135,7 @@ getDbDefaultCovariateData <- function(connection,
         andromedaTableName = "covariates",
         snakeCaseToCamelCase = TRUE
       )
+      filterCovariateDataCovariates(covariateData, "covariates", minCharacterizationMean)
     }
 
     # Continuous aggregated features
@@ -142,6 +152,7 @@ getDbDefaultCovariateData <- function(connection,
         andromedaTableName = "covariatesContinuous",
         snakeCaseToCamelCase = TRUE
       )
+      filterCovariateDataCovariates(covariateData, "covariatesContinuous", minCharacterizationMean)
     }
 
     # Covariate reference
@@ -273,3 +284,17 @@ getDbDefaultCovariateData <- function(connection,
     return(covariateData)
   }
 }
+
+#' Filters the covariateData covariates based on the given characterization mean value.
+#'
+#' Rows of the named table whose \code{averageValue} is below \code{minCharacterizationMean}
+#' are removed. Nothing is filtered when the table has no \code{averageValue} column or when
+#' the threshold is 0.
+#'
+#' @param covariateData The covariate data
+#' @param covariatesName The name of the covariates object inside the covariateData
+#' @param minCharacterizationMean The minimum mean value for characterization output. Values below this will be cut off from output. This
+#'                                will help reduce the file size of the characterization output, but will remove information
+#'                                on covariates that have very low values. The default is 0.
+#'
+#' @return
+#' The \code{covariateData} object, invisibly.
+filterCovariateDataCovariates <- function(covariateData, covariatesName, minCharacterizationMean = 0) {
+  hasAverageValue <- "averageValue" %in% colnames(covariateData[[covariatesName]])
+  if (hasAverageValue && minCharacterizationMean != 0) {
+    # The threshold is a minimum: only values strictly below it are cut off, so rows that are
+    # exactly at the threshold must be kept (hence >= rather than >).
+    covariateData[[covariatesName]] <- covariateData[[covariatesName]] %>%
+      dplyr::filter(.data$averageValue >= minCharacterizationMean)
+  }
+  # NOTE(review): the callers in this file ignore the return value, so the assignment above is
+  # presumably expected to update the caller's object by reference - confirm for the
+  # covariateData class in use. Returning the object also supports explicit reassignment.
+  invisible(covariateData)
+}
diff --git a/R/Table1.R b/R/Table1.R
@@ -562,18 +562,20 @@ createTable1 <- function(covariateData1,
 
   if (nrow(binaryTable) != 0) {
     if (comparison) {
+      populationSize1 <- getPopulationSize(covariateData1, cohortId1)
+      populationSize2 <- getPopulationSize(covariateData2, cohortId2)
       colnames(binaryTable) <- c(
         "Characteristic",
         "Count",
         paste0(
           "% (n = ",
-          formatCount(attr(covariateData1, "metaData")$populationSize),
+          formatCount(populationSize1),
           ")"
         ),
         "Count",
         paste0(
           "% (n = ",
-          formatCount(attr(covariateData2, "metaData")$populationSize),
+          formatCount(populationSize2),
           ")"
         ),
         "Std.Diff"
@@ -590,12 +592,13 @@ createTable1 <- function(covariateData1,
       binaryTable$count2 <- NULL
       binaryTable$percent2 <- NULL
       binaryTable$stdDiff <- NULL
+      populationSize1 <- getPopulationSize(covariateData1, cohortId1)
       colnames(binaryTable) <- c(
         "Characteristic",
         "Count",
         paste0(
           "% (n = ",
-          formatCount(attr(covariateData1, "metaData")$populationSize),
+          formatCount(populationSize1),
           ")"
         )
       )
@@ -722,3 +725,11 @@ createTable1CovariateSettings <- function(specifications = getDefaultTable1Speci
   covariateSettings$analyses <- analyses
   return(covariateSettings)
 }
+
+# Returns the population size stored in the "metaData" attribute of covariateData.
+#
+# covariateData: object carrying a "metaData" attribute with a populationSize element.
+# cohortId:      cohort ID to select the size for, or NULL to return populationSize as is.
+#
+# When populationSize is named and every requested cohort ID matches a name, the lookup is done
+# by name. A numeric cohort ID used directly as an index selects by position, which yields NA or
+# another cohort's size whenever the IDs are not 1..n in order. If the names do not match (or
+# there are none), the original positional indexing is kept as a fallback.
+getPopulationSize <- function(covariateData, cohortId) {
+  populationSize <- attr(covariateData, "metaData")$populationSize
+  if (is.null(cohortId)) {
+    return(populationSize)
+  }
+  sizeNames <- names(populationSize)
+  cohortKey <- as.character(cohortId)
+  if (!is.null(sizeNames) && all(cohortKey %in% sizeNames)) {
+    return(populationSize[cohortKey])
+  }
+  populationSize[cohortId]
+}
diff --git a/R/UnitTestHelperFunctions.R b/R/UnitTestHelperFunctions.R
@@ -59,7 +59,9 @@
 #'                               of the createCovariate functions, or a list of such objects.
 #' @param aggregated             Should aggregate statistics be computed instead of covariates per
 #'                               cohort entry?
-#'
+#' @param minCharacterizationMean The minimum mean value for characterization output. Values below this will be cut off from output. This 
+#'                                will help reduce the file size of the characterization output, but will remove information
+#'                                on covariates that have very low values. The default is 0.
 #' @return
 #' Returns an object of type \code{covariateData}, containing information on the covariates.
 #'
@@ -94,7 +96,8 @@
                                    cdmVersion = "5",
                                    rowIdField = "subject_id",
                                    covariateSettings,
-                                   aggregated = FALSE) {
+                                   aggregated = FALSE,
+                                   minCharacterizationMean = 0) {
   writeLines("Constructing length of observation covariates")
   if (covariateSettings$useLengthOfObs == FALSE) {
     return(NULL)

diff --git a/docs/404.html b/docs/404.html
diff --git a/docs/articles/CreatingCovariatesBasedOnOtherCohorts.html b/docs/articles/CreatingCovariatesBasedOnOtherCohorts.html
diff --git a/docs/articles/CreatingCovariatesUsingCohortAttributes.html b/docs/articles/CreatingCovariatesUsingCohortAttributes.html
diff --git a/docs/articles/CreatingCustomCovariateBuilders.html b/docs/articles/CreatingCustomCovariateBuilders.html
diff --git a/docs/articles/CreatingCustomCovariateBuildersKorean.html b/docs/articles/CreatingCustomCovariateBuildersKorean.html
diff --git a/docs/articles/UsingFeatureExtraction.html b/docs/articles/UsingFeatureExtraction.html
diff --git a/docs/articles/UsingFeatureExtractionKorean.html b/docs/articles/UsingFeatureExtractionKorean.html
diff --git a/docs/articles/index.html b/docs/articles/index.html