Add tests to generate templates from data model (#26)

* Update .gitignore to ignore artifacts and creds * Add README for tests * Mirror config.yml for tests * Add generate tests for test suite * Ignore log files * Handle rate-limiting when generating templates * Add templates from test * Fix newline
gf-dcc · Feb 27, 2023 · 71e882d · 71e882d
1 parent d36303a
commit 71e882d
Show file tree

Hide file tree

Showing 12 changed files with 152 additions and 15 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1 +1,8 @@
-creds.json
+# Creds
+schematic_service_account_creds.json
+
+# Template files generated locally
+*.schema.json
+
+# Ignore test logs
+logs
diff --git a/config.yml b/config.yml
@@ -1,26 +1,34 @@
-# Do not change the 'definitions' section unless you know what you're doing
+# Do not change the "definitions" section unless you know what you're doing
 # Usage: schematic manifest -c config.yml get -dt ClinicalAssay -t "Test Manifest" --jsonld ./NF.jsonld -s
 definitions:
   synapse_config: ".synapseConfig"
-  creds_path: "credentials.json"
-  token_pickle: "token.pickle"
-  service_acct_creds: "creds.json"
+  service_acct_creds: "schematic_service_account_creds.json"
 
 synapse:
-  master_fileview: 'syn23643253'
-  manifest_folder: 'manifests'
-  manifest_filename: 'synapse_storage_manifest.csv'
-  token_creds: 'syn23643259'
-  service_acct_creds: 'syn25171627'
+  master_fileview: "syn28142805"
+  manifest_folder: "manifests"
+  manifest_basename: "synapse_storage_manifest"
+  manifest_filename: "synapse_storage_manifest.csv"
 
 manifest:
-  title: 'Genomics_Data_Manifest'
-  data_type: 'GenomicsAssay'
+  title: "Gray_Foundation_Manifest"
+  data_type: 
+
+  # Clinical Data
+    - "CohortCore"
+    - "CohortCoreRFC"
+    - "CohortCorePortal"
+
+  # File Metadata
+    - "ImagingLevel2"
+    - "scATAC-seqLevel1"
+    - "scRNA-seqLevel1"
+    - "scRNA-seqLevel3"
 
 model:
   input:
-    location: '~/NF.jsonld'
-    file_type: 'local'
+    location: "./GF.jsonld"
+    file_type: "local"
 
 style:
   google_manifest:
@@ -32,5 +40,5 @@ style:
       red: 1.0
       green: 1.0
       blue: 0.9019
-    master_template_id: '1LYS5qE4nV9jzcYw5sXwCza25slDfRA1CIg3cs-hCdpU'
+    master_template_id: "1LYS5qE4nV9jzcYw5sXwCza25slDfRA1CIg3cs-hCdpU"
     strict_validation: true
diff --git a/templates/CohortCore.csv b/templates/CohortCore.csv
@@ -0,0 +1 @@
+Lab ID,Sex,Race,Ethnicity,Age,BMI,Age at Menarche,Genetic Alteration,Menopause Status,Gravidity,Parity
diff --git a/templates/CohortCorePortal.csv b/templates/CohortCorePortal.csv
@@ -0,0 +1 @@
+Atlas Participant ID,Atlas Group
diff --git a/templates/CohortCoreRFC.csv b/templates/CohortCoreRFC.csv
@@ -0,0 +1 @@
+Class BRCA1,Class BRCA2,Menopause Cycle Stage,Primary Diagnosis,Previous Diagnosis,Tobacco Use,Pack Years Smoked,Alcohol Use,Drinks Per Week,Antibiotic Use
diff --git a/templates/ImagingLevel2.csv b/templates/ImagingLevel2.csv
@@ -0,0 +1,2 @@
+Component,Filename,File Format,Lab ID,Channel Metadata Filename,Imaging Assay Type,Protocol Link,Software and Version,Microscope,Objective,NominalMagnification,LensNA,WorkingDistance,WorkingDistanceUnit,Immersion,Pyramid,Zstack,Tseries,Passed QC,Comment,FOV number,FOVX,FOVXUnit,FOVY,FOVYUnit,Frame Averaging,Image ID,DimensionOrder,PhysicalSizeX,PhysicalSizeXUnit,PhysicalSizeY,PhysicalSizeYUnit,PhysicalSizeZ,PhysicalSizeZUnit,Pixels BigEndian,PlaneCount,SizeC,SizeT,SizeX,SizeY,SizeZ,PixelType
+ImagingLevel2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
diff --git a/templates/ScATAC-seqLevel1.csv b/templates/ScATAC-seqLevel1.csv
@@ -0,0 +1,2 @@
+Component,Filename,File Format,Lab ID,Nucleic Acid Source,Dissociation Method,Single Nucleus Buffer,Single Cell Isolation Method,Transposition Reaction,scATACseq Library Layout,Nucleus Identifier,Nuclei Barcode Length,Nuclei Barcode Read,scATACseq Read1,scATACseq Read2,scATACseq Read3,Library Construction Method,Sequencing Platform,Threshold for Minimum Passing Reads,Total Number of Passing Nuclei,Median Fraction of Reads in Peaks,Median Fraction of Reads in Annotated cis DNA Elements,Median Passing Read Percentage,Median Percentage of Mitochondrial Reads per Nucleus,Technical Replicate Group,Total Reads,Protocol Link,Empty Well Barcode,Peaks Calling Software,Well Index
+ScATAC-seqLevel1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
diff --git a/templates/ScRNA-seqLevel1.csv b/templates/ScRNA-seqLevel1.csv
@@ -0,0 +1,2 @@
+Component,Filename,File Format,Lab ID,Cryopreserved Cells in Sample,Single Cell Isolation Method,Dissociation Method,Library Construction Method,Read Indicator,Read1,Read2,End Bias,Reverse Transcription Primer,Spike In,Sequencing Platform,Total Number of Input Cells,Input Cells and Nuclei,Protocol Link,Technical Replicate Group,Empty Well Barcode,Feature Reference Id,Well Index
+ScRNA-seqLevel1,,,,,,,,,,,,,,,,,,,,,
diff --git a/templates/ScRNA-seqLevel2.csv b/templates/ScRNA-seqLevel2.csv
@@ -0,0 +1,2 @@
+Component,Filename,File Format,Parent File ID,scRNAseq Workflow Type,Workflow Version,scRNAseq Workflow Parameters Description,Workflow Link,Genomic Reference,Genomic Reference URL,Genome Annotation URL,Whitelist Cell Barcode File Link,Cell Barcode Tag,UMI Tag,Applied Hard Trimming
+ScRNA-seqLevel2,,,,,,,,,,,,,,
diff --git a/tests/README.md b/tests/README.md
@@ -0,0 +1,45 @@
+## Test documentation
+
+This describes the testing framework for the data model. 
+The different types of tests are organized by the subdirectory and are explained below.
+
+### Test generation of templates
+
+This means checking expectations that:
+1. Production templates can be generated at all with the version of `schematic` used. 
+2. Production templates look as expected when created (e.g. all the attributes are there).
+
+Issues in case #1 could mean that an old version of `schematic` is being used that doesn't support new rules, that there is a problem with the data model such as a missing key error, etc.
+Issues in case #2 could mean changes in template definitions inadvertently led to a different set of attributes than intended for the template.
+
+#### Test fixtures
+
+The fixtures needed are, of course, the data model, as well as the configuration to run, defined in `generate/config.json`.
+
+### Test validation of manifests against their templates
+
+This means checking expectations that:
+1. Manifest data that should pass are indeed passed by `schematic`.
+2. Manifest data that should fail are indeed caught by `schematic`. 
+
+Issues in case #1 lead to a poor experience for data contributors, who wouldn't appreciate spurious validation errors. Issues in case #2 lead to bad data.  
+
+#### Test fixtures
+
+In addition to the data model, some representative `.csv` manifests are needed. 
+The file `validate_config.json` defines the matrix of which manifests should be tested with their respective templates.
+
+### Test submission of manifests (TO DO)
+
+> Note: This will be more complicated than the other two test suites combined.
+
+This means checking that:
+1. Valid manifests can be submitted at all. There have been cases where valid manifests could not be submitted.
+2. Manifest data are transferred correctly to Synapse (e.g. no weird conversions of types or truncation of data). This requires querying the test data that has been transferred to Synapse.
+
+#### Test fixtures
+
+This uses the same fixtures above plus a definition of what data should look like in Synapse. 
+
+
+
diff --git a/tests/generate/config.json b/tests/generate/config.json
@@ -0,0 +1,37 @@
+[
+  {
+    "template": "CohortCore",
+    "title": "Cohort Core Data",
+    "comment": ""
+  },
+  {
+    "template": "CohortCoreRFC",
+    "title": "Cohort Core RFC Data",
+    "comment": ""
+  },
+  {
+    "template": "CohortCorePortal",
+    "title": "Cohort Core Portal Data",
+    "comment": ""
+  },
+  {
+    "template": "ImagingLevel2",
+    "title": "Imaging Level 2 Data",
+    "comment": ""
+  },
+  {
+    "template": "ScATAC-seqLevel1",
+    "title": "scATAC-seq Level 1 Data",
+    "comment": ""
+  },
+  {
+    "template": "ScRNA-seqLevel1",
+    "title": "scRNA-seq Level 1 Data",
+    "comment": ""
+  },
+  {
+    "template": "ScRNA-seqLevel2",
+    "title": "scRNA-seq Level 2 Data",
+    "comment": ""
+  }
+]
diff --git a/tests/generate/generate.sh b/tests/generate/generate.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+# Generate .csv/Excel templates, to be provided:
+# 1) for documentation, i.e. read-only templates are embedded in user docs
+# 2) as fallback for contribs who can't access GoogleSheets / DCA (maybe they're blocked by their institution, DCA is down, or just prefer being handed a .csv directly)
+
+# A config file is expected as the first arg.
+# Locally, run test from the root of the repo with `./tests/generate/generate.sh ./tests/generate/config.json`.
+# However, most runs use GitHub Actions.
+
+if [ $# -lt 1 ]; then
+  echo "Usage: $0 <config.json>" >&2
+  exit 1
+fi
+
+OUTPUT_DIR=templates
+LOG_DIR=logs
+
+# Read one entry per line so multi-word titles such as "Cohort Core Data" stay intact.
+# An unquoted ARRAY=($(...)) splits on every space, which misaligns titles with templates.
+TEMPLATES=()
+while IFS= read -r template; do TEMPLATES+=("$template"); done < <(jq -r '.[].template' "$1" | tr -d '"')
+TITLES=()
+while IFS= read -r title; do TITLES+=("$title"); done < <(jq -r '.[].title' "$1" | tr -d '"')
+
+mkdir -p "$OUTPUT_DIR" "$LOG_DIR"
+
+for i in "${!TEMPLATES[@]}"
+do
+  echo ">>>>>>> Getting ${TEMPLATES[$i]}"
+  # Quote every expansion so a title containing spaces reaches schematic as a single argument.
+  schematic manifest --config config.yml get -dt "${TEMPLATES[$i]}" --title "${TITLES[$i]}" -o "$OUTPUT_DIR/${TEMPLATES[$i]}.csv" | tee "$LOG_DIR/${TEMPLATES[$i]%.*}_log.txt"
+  # Short pause between requests; presumably this is the rate-limit handling.
+  sleep 0.5
+done
+
+# Clean up all the intermediates; -f avoids an error when none were produced.
+rm -f *.schema.json
+
+echo "Done!"
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Lab ID,Sex,Race,Ethnicity,Age,BMI,Age at Menarche,Genetic Alteration,Menopause Status,Gravidity,Parity
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Class BRCA1,Class BRCA2,Menopause Cycle Stage,Primary Diagnosis,Previous Diagnosis,Tobacco Use,Pack Years Smoked,Alcohol Use,Drinks Per Week,Antibiotic Use
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		Component,Filename,File Format,Lab ID,Channel Metadata Filename,Imaging Assay Type,Protocol Link,Software and Version,Microscope,Objective,NominalMagnification,LensNA,WorkingDistance,WorkingDistanceUnit,Immersion,Pyramid,Zstack,Tseries,Passed QC,Comment,FOV number,FOVX,FOVXUnit,FOVY,FOVYUnit,Frame Averaging,Image ID,DimensionOrder,PhysicalSizeX,PhysicalSizeXUnit,PhysicalSizeY,PhysicalSizeYUnit,PhysicalSizeZ,PhysicalSizeZUnit,Pixels BigEndian,PlaneCount,SizeC,SizeT,SizeX,SizeY,SizeZ,PixelType
		ImagingLevel2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		Component,Filename,File Format,Lab ID,Nucleic Acid Source,Dissociation Method,Single Nucleus Buffer,Single Cell Isolation Method,Transposition Reaction,scATACseq Library Layout,Nucleus Identifier,Nuclei Barcode Length,Nuclei Barcode Read,scATACseq Read1,scATACseq Read2,scATACseq Read3,Library Construction Method,Sequencing Platform,Threshold for Minimum Passing Reads,Total Number of Passing Nuclei,Median Fraction of Reads in Peaks,Median Fraction of Reads in Annotated cis DNA Elements,Median Passing Read Percentage,Median Percentage of Mitochondrial Reads per Nucleus,Technical Replicate Group,Total Reads,Protocol Link,Empty Well Barcode,Peaks Calling Software,Well Index
		ScATAC-seqLevel1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		Component,Filename,File Format,Lab ID,Cryopreserved Cells in Sample,Single Cell Isolation Method,Dissociation Method,Library Construction Method,Read Indicator,Read1,Read2,End Bias,Reverse Transcription Primer,Spike In,Sequencing Platform,Total Number of Input Cells,Input Cells and Nuclei,Protocol Link,Technical Replicate Group,Empty Well Barcode,Feature Reference Id,Well Index
		ScRNA-seqLevel1,,,,,,,,,,,,,,,,,,,,,
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		Component,Filename,File Format,Parent File ID,scRNAseq Workflow Type,Workflow Version,scRNAseq Workflow Parameters Description,Workflow Link,Genomic Reference,Genomic Reference URL,Genome Annotation URL,Whitelist Cell Barcode File Link,Cell Barcode Tag,UMI Tag,Applied Hard Trimming
		ScRNA-seqLevel2,,,,,,,,,,,,,,