Skip to content

Commit

Permalink
Merge pull request #457 from OHDSI/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
katy-sadowski authored May 21, 2023
2 parents 162e709 + 7a2d0e5 commit 741f748
Show file tree
Hide file tree
Showing 63 changed files with 8,398 additions and 158 deletions.
6 changes: 3 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: DataQualityDashboard
Type: Package
Title: Execute and View Data Quality Checks on OMOP CDM Database
Version: 2.2.0
Date: 2023-05-05
Version: 2.3.0
Date: 2023-05-21
Authors@R: c(
person("Katy", "Sadowski", email = "[email protected]", role = c("aut", "cre")),
person("Clair", "Blacketer", role = c("aut")),
Expand All @@ -27,7 +27,7 @@ Imports:
dplyr,
jsonlite,
rJava,
SqlRender (>= 1.6.0),
SqlRender (>= 1.10.1),
plyr,
stringr,
rlang,
Expand Down
8 changes: 8 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Generated by roxygen2: do not edit by hand

export(convertJsonResultsFileCase)
export(executeDqChecks)
export(listDqChecks)
export(reEvaluateThresholds)
Expand All @@ -8,8 +9,14 @@ export(writeJsonResultsToCsv)
export(writeJsonResultsToTable)
import(DatabaseConnector)
import(magrittr)
importFrom(SqlRender,camelCaseToSnakeCase)
importFrom(SqlRender,snakeCaseToCamelCase)
importFrom(dplyr,case_when)
importFrom(dplyr,mutate)
importFrom(dplyr,rename_with)
importFrom(jsonlite,fromJSON)
importFrom(jsonlite,parse_json)
importFrom(jsonlite,toJSON)
importFrom(magrittr,"%>%")
importFrom(readr,read_csv)
importFrom(rlang,.data)
Expand All @@ -18,6 +25,7 @@ importFrom(stats,setNames)
importFrom(stringr,regex)
importFrom(stringr,str_detect)
importFrom(tidyselect,all_of)
importFrom(tools,file_path_sans_ext)
importFrom(utils,install.packages)
importFrom(utils,menu)
importFrom(utils,packageVersion)
Expand Down
16 changes: 15 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
DataQualityDashboard 2.3.0
==========================
This release includes:

### New features

- *New SQL-only Mode:* Setting `sqlOnly` and `sqlOnlyIncrementalInsert` to TRUE in `executeDqChecks` will return (but not run) a set of SQL queries that, when executed, will calculate the results of the DQ checks and insert them into a database table. Additionally, `sqlOnlyUnionCount` can be used to specify a number of SQL queries to union for each check type, allowing for parallel execution of these queries and potentially large performance gains. See the [SqlOnly vignette](https://ohdsi.github.io/DataQualityDashboard/articles/SqlOnly.html) for details
- *Results File Case Converter:* The new function `convertJsonResultsFileCase` can be used to convert the keys in a DQD results JSON file between snakecase and camelcase. This allows reading of v2.1.0+ JSON files in older DQD versions, and other conversions which may be necessary for secondary use of the DQD results file. See [function documentation](https://ohdsi.github.io/DataQualityDashboard/reference/convertJsonResultsFileCase.html) for details

### Bugfixes

- In the v2.1.0 release, all DQD variables were converted from snakecase to camelcase, including those in the results JSON file. This resulted in errors for users trying to view results files generated by older DQD versions in DQD v2.1.0+. This issue has now been fixed. `viewDqDashboard` will now automatically convert the case of pre-v2.1.0 results files to camelcase so that older results files may be viewed in v2.3.0+


DataQualityDashboard 2.2.0
==========================
This release includes:
Expand Down Expand Up @@ -60,7 +74,7 @@ This release includes:
- **withinVisitDates** looks at clinical facts and the visits they are associated with to make sure that the visit dates occur within one week on either side of the visit
- **plausibleUnitConceptIds** identifies records with invalid Unit_Concept_Ids by Measurement_Concept_Id

### outputFolder input paramater
### outputFolder input parameter

- The `outputFolder` parameter for the `executeDqChecks` function is now REQUIRED and no longer has a default value. **This may be a breaking change for users who have not specified this parameter in their script to run DQD.**

Expand Down
81 changes: 81 additions & 0 deletions R/convertResultsCase.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# Copyright 2023 Observational Health Data Sciences and Informatics
#
# This file is part of DataQualityDashboard
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#' @title Convert JSON results file case
#'
#' @description Convert a DQD JSON results file between camelcase and (all-caps) snakecase. Enables viewing of pre-v.2.1.0 results files in later DQD versions, and vice versa
#'
#' @param jsonFilePath Path to the JSON results file to be converted
#' @param writeToFile Whether or not to write the converted results back to a file (must be either TRUE or FALSE)
#' @param outputFolder The folder to output the converted JSON results file to (required when writeToFile is TRUE)
#' @param outputFile (OPTIONAL) File to write converted results JSON object to. Default is name of input file with a "_camel" or "_snake" postfix
#' @param targetCase Case into which the results file parameters should be converted (must be either "camel" or "snake")
#'
#' @returns DQD results object (a named list)
#'
#' @importFrom jsonlite fromJSON
#' @importFrom SqlRender snakeCaseToCamelCase camelCaseToSnakeCase
#' @importFrom dplyr rename_with
#' @importFrom tools file_path_sans_ext
#'
#' @export

convertJsonResultsFileCase <- function(
    jsonFilePath,
    writeToFile,
    outputFolder = NA,
    outputFile = "",
    targetCase) {
  # Validate targetCase strictly: it must be a single string, exactly "camel" or
  # "snake". The previous check (!any(targetCase %in% ...)) accepted invalid
  # vectors such as c("camel", "bogus"), which then failed confusingly later at
  # the scalar comparison targetCase == "camel".
  if (!(is.character(targetCase) && length(targetCase) == 1 && targetCase %in% c("camel", "snake"))) {
    stop("targetCase must be either 'camel' or 'snake'.")
  }
  # Reject NA as well: NA is logical, but `writeToFile && ...` below would
  # error with "missing value where TRUE/FALSE needed".
  stopifnot(is.logical(writeToFile), length(writeToFile) == 1, !is.na(writeToFile))
  if (writeToFile && is.na(outputFolder)) {
    stop("You must specify an output folder if writing to file.")
  }

  results <- jsonlite::fromJSON(jsonFilePath)

  # If the file is already in the requested case, return it unchanged.
  # numViolatedRows / NUM_VIOLATED_ROWS are used as sentinel column names.
  if ("numViolatedRows" %in% names(results$CheckResults) && targetCase == "camel") {
    warning("File is already in camelcase! No conversion will be performed.")
    return(results)
  }
  if ("NUM_VIOLATED_ROWS" %in% names(results$CheckResults) && targetCase == "snake") {
    warning("File is already in snakecase! No conversion will be performed.")
    return(results)
  }

  # Choose the name-conversion function. Snakecase output is upper-cased to
  # match the all-caps convention of pre-v2.1.0 results files.
  if (targetCase == "camel") {
    swapFunction <- SqlRender::snakeCaseToCamelCase
  } else {
    swapFunction <- function(x) {
      toupper(SqlRender::camelCaseToSnakeCase(x))
    }
  }

  # Rename all columns; checkId is deliberately left untouched in CheckResults.
  results$Metadata <- dplyr::rename_with(results$Metadata, swapFunction)
  results$CheckResults <- dplyr::rename_with(results$CheckResults, swapFunction, -c("checkId"))

  if (writeToFile) {
    if (nchar(outputFile) == 0) {
      # Default output name: <input file name without extension>_<targetCase>.json
      jsonFile <- tools::file_path_sans_ext(basename(jsonFilePath))
      outputFile <- paste0(jsonFile, "_", targetCase, ".json")
    }
    .writeResultsToJson(results, outputFolder, outputFile)
  }

  return(results)
}
20 changes: 17 additions & 3 deletions R/executeDqChecks.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,13 @@
#' @param numThreads The number of concurrent threads to use to execute the queries
#' @param cdmSourceName The name of the CDM data source
#' @param sqlOnly Should the SQLs be executed (FALSE) or just returned (TRUE)?
#' @param sqlOnlyUnionCount (OPTIONAL) In sqlOnlyIncrementalInsert mode, how many SQL commands to union in each query to insert check results into results table (can speed processing when queries done in parallel). Default is 1.
#' @param sqlOnlyIncrementalInsert (OPTIONAL) In sqlOnly mode, boolean to determine whether to generate SQL queries that insert check results and associated metadata into results table. Default is FALSE (for backwards compatibility to <= v2.2.0)
#' @param outputFolder The folder to output logs, SQL files, and JSON results file to
#' @param outputFile (OPTIONAL) File to write results JSON object
#' @param verboseMode Boolean to determine if the console will show all execution steps. Default is FALSE
#' @param writeToTable Boolean to indicate if the check results will be written to the dqdashboard_results table in the resultsDatabaseSchema. Default is TRUE
#' @param writeTableName The name of the results table. Defaults to `dqdashboard_results`.
#' @param writeTableName The name of the results table. Defaults to `dqdashboard_results`. Used when sqlOnly or writeToTable is TRUE.
#' @param writeToCsv Boolean to indicate if the check results will be written to a csv file. Default is FALSE
#' @param csvFile (OPTIONAL) CSV file to write results
#' @param checkLevels Choose which DQ check levels to execute. Default is all 3 (TABLE, FIELD, CONCEPT)
Expand Down Expand Up @@ -64,6 +66,8 @@ executeDqChecks <- function(connectionDetails,
cdmSourceName,
numThreads = 1,
sqlOnly = FALSE,
sqlOnlyUnionCount = 1,
sqlOnlyIncrementalInsert = FALSE,
outputFolder,
outputFile = "",
verboseMode = FALSE,
Expand Down Expand Up @@ -93,6 +97,8 @@ executeDqChecks <- function(connectionDetails,
stopifnot(is.character(cdmDatabaseSchema), is.character(resultsDatabaseSchema), is.numeric(numThreads))
stopifnot(is.character(cdmSourceName), is.logical(sqlOnly), is.character(outputFolder), is.logical(verboseMode))
stopifnot(is.logical(writeToTable), is.character(checkLevels))
stopifnot(is.numeric(sqlOnlyUnionCount) && sqlOnlyUnionCount > 0)
stopifnot(is.logical(sqlOnlyIncrementalInsert))
stopifnot(is.character(cohortDatabaseSchema), is.character(cohortTableName))

if (!all(checkLevels %in% c("TABLE", "FIELD", "CONCEPT"))) {
Expand Down Expand Up @@ -128,7 +134,10 @@ executeDqChecks <- function(connectionDetails,
metadata$dqdVersion <- as.character(packageVersion("DataQualityDashboard"))
DatabaseConnector::disconnect(connection)
} else {
metadata <- NA
metadata <- data.frame(
dqdVersion = as.character(packageVersion("DataQualityDashboard")),
cdmSourceName = cdmSourceName
)
}

# Setup output folder ------------------------------------------------------------------------------------------------------------
Expand Down Expand Up @@ -259,10 +268,14 @@ executeDqChecks <- function(connectionDetails,
connection,
cdmDatabaseSchema,
vocabDatabaseSchema,
resultsDatabaseSchema,
writeTableName,
cohortDatabaseSchema,
cohortTableName,
cohortDefinitionId,
outputFolder,
sqlOnlyUnionCount,
sqlOnlyIncrementalInsert,
sqlOnly,
progressBar = TRUE
)
Expand Down Expand Up @@ -310,9 +323,10 @@ executeDqChecks <- function(connectionDetails,
.writeResultsToJson(allResults, outputFolder, outputFile)

ParallelLogger::logInfo("Execution Complete")
} else {
.writeDDL(resultsDatabaseSchema, writeTableName, connectionDetails$dbms, outputFolder)
}


# write to table ----------------------------------------------------------------------

if (!sqlOnly && writeToTable) {
Expand Down
68 changes: 12 additions & 56 deletions R/listChecks.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,65 +35,21 @@ listDqChecks <- function(cdmVersion = "5.3", tableCheckThresholdLoc = "default",
sprintf("OMOP_CDMv%s_Check_Descriptions.csv", cdmVersion),
package = "DataQualityDashboard"
))
dqChecks$checkDescriptions <- as.data.frame(dqChecks$checkDescriptions)

dqChecks$tableChecks <- .readThresholdFile(
checkThresholdLoc = tableCheckThresholdLoc,
defaultLoc = sprintf("OMOP_CDMv%s_Table_Level.csv", cdmVersion)
)

if (tableCheckThresholdLoc == "default") {
dqChecks$tableChecks <-
read_csv(
system.file(
"csv",
sprintf("OMOP_CDMv%s_Table_Level.csv", cdmVersion),
package = "DataQualityDashboard"
),
na = c(" ", "")
)
dqChecks$tableChecks <- as.data.frame(dqChecks$tableChecks)
} else {
dqChecks$tableChecks <- read_csv(
tableCheckThresholdLoc,
na = c(" ", "")
)
dqChecks$tableChecks <- as.data.frame(dqChecks$tableChecks)
}
dqChecks$fieldChecks <- .readThresholdFile(
checkThresholdLoc = fieldCheckThresholdLoc,
defaultLoc = sprintf("OMOP_CDMv%s_Field_Level.csv", cdmVersion)
)

if (fieldCheckThresholdLoc == "default") {
dqChecks$fieldChecks <-
read_csv(
system.file(
"csv",
sprintf("OMOP_CDMv%s_Field_Level.csv", cdmVersion),
package = "DataQualityDashboard"
),
na = c(" ", "")
)
dqChecks$fieldChecks <- as.data.frame(dqChecks$fieldChecks)
} else {
dqChecks$fieldChecks <- read_csv(
fieldCheckThresholdLoc,
na = c(" ", "")
)
dqChecks$fieldChecks <- as.data.frame(dqChecks$fieldChecks)
}

if (conceptCheckThresholdLoc == "default") {
dqChecks$conceptChecks <-
read_csv(
system.file(
"csv",
sprintf("OMOP_CDMv%s_Concept_Level.csv", cdmVersion),
package = "DataQualityDashboard"
),
na = c(" ", "")
)
dqChecks$conceptChecks <- as.data.frame(dqChecks$conceptChecks)
} else {
dqChecks$conceptChecks <- read_csv(
conceptCheckThresholdLoc,
na = c(" ", "")
)
dqChecks$conceptChecks <- as.data.frame(dqChecks$conceptChecks)
}
dqChecks$conceptChecks <- .readThresholdFile(
checkThresholdLoc = conceptCheckThresholdLoc,
defaultLoc = sprintf("OMOP_CDMv%s_Concept_Level.csv", cdmVersion)
)

return(dqChecks)
}
38 changes: 32 additions & 6 deletions R/runCheck.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,14 @@
#' @param connection A connection for connecting to the CDM database using the DatabaseConnector::connect(connectionDetails) function.
#' @param cdmDatabaseSchema The fully qualified database name of the CDM schema
#' @param vocabDatabaseSchema The fully qualified database name of the vocabulary schema (default is to set it as the cdmDatabaseSchema)
#' @param resultsDatabaseSchema The fully qualified database name of the results schema
#' @param writeTableName The table to write DQD results to. Used when sqlOnly or writeToTable is TRUE.
#' @param cohortDatabaseSchema The schema where the cohort table is located.
#' @param cohortTableName The name of the cohort table.
#' @param cohortDefinitionId The cohort definition id for the cohort you wish to run the DQD on. The package assumes a standard OHDSI cohort table called 'Cohort'
#' @param outputFolder The folder to output logs and SQL files to
#' @param sqlOnlyUnionCount (OPTIONAL) How many SQL commands to union before inserting them into output table (speeds processing when queries done in parallel). Default is 1.
#' @param sqlOnlyIncrementalInsert (OPTIONAL) Boolean to determine whether to insert check results and associated metadata into the output table. Default is FALSE (for backwards compatibility to <= v2.2.0)
#' @param sqlOnly Should the SQLs be executed (FALSE) or just returned (TRUE)?
#'
#' @import magrittr
Expand All @@ -42,10 +46,14 @@
connection,
cdmDatabaseSchema,
vocabDatabaseSchema,
resultsDatabaseSchema,
writeTableName,
cohortDatabaseSchema,
cohortTableName,
cohortDefinitionId,
outputFolder,
sqlOnlyUnionCount,
sqlOnlyIncrementalInsert,
sqlOnly) {
ParallelLogger::logInfo(sprintf("Processing check description: %s", checkDescription$checkName))

Expand All @@ -62,10 +70,6 @@
cohort <- FALSE
}

if (sqlOnly) {
unlink(file.path(outputFolder, sprintf("%s.sql", checkDescription$checkName)))
}

if (nrow(checks) > 0) {
dfs <- apply(X = checks, MARGIN = 1, function(check) {
columns <- lapply(names(check), function(c) {
Expand All @@ -88,7 +92,19 @@

sql <- do.call(SqlRender::loadRenderTranslateSql, params)

if (sqlOnly) {
if (sqlOnly && sqlOnlyIncrementalInsert) {
checkQuery <- .createSqlOnlyQueries(
params,
check,
tableChecks,
fieldChecks,
conceptChecks,
sql,
connectionDetails,
checkDescription
)
data.frame(query = checkQuery)
} else if (sqlOnly) {
write(x = sql, file = file.path(
outputFolder,
sprintf("%s.sql", checkDescription$checkName)
Expand All @@ -105,7 +121,17 @@
)
}
})
do.call(rbind, dfs)

dfs <- do.call(rbind, dfs)

if (sqlOnlyIncrementalInsert) {
sqlToUnion <- dfs$query
if (length(sqlToUnion) > 0) {
.writeSqlOnlyQueries(sqlToUnion, sqlOnlyUnionCount, resultsDatabaseSchema, writeTableName, connectionDetails$dbms, outputFolder, checkDescription)
}
} else {
dfs
}
} else {
ParallelLogger::logWarn(paste0("Warning: Evaluation resulted in no checks: ", filterExpression))
data.frame()
Expand Down
Loading

0 comments on commit 741f748

Please sign in to comment.