From b6c4003c29c1dc5120616d00b9552df55d9a19c1 Mon Sep 17 00:00:00 2001 From: Michele Berselli Date: Wed, 14 Feb 2024 17:36:34 -0500 Subject: [PATCH 1/3] . --- pipeline_utils/__main__.py | 1 + pipeline_utils/lib/yaml_parser.py | 82 ++++++++++++++++--- pipeline_utils/pipeline_deploy.py | 9 +- pipeline_utils/schemas/yaml_file_format.py | 16 ++-- .../schemas/yaml_reference_genome.py | 32 ++++++++ .../portal_objects/reference_genome.yaml | 12 +++ .../repo_correct/portal_objects/software.yaml | 12 +++ tests/test_yaml_file_format.py | 6 +- tests/test_yaml_reference_genome.py | 36 ++++++++ tests/test_yaml_software.py | 18 +++- 10 files changed, 200 insertions(+), 24 deletions(-) create mode 100644 pipeline_utils/schemas/yaml_reference_genome.py create mode 100644 tests/repo_correct/portal_objects/reference_genome.yaml create mode 100644 tests/test_yaml_reference_genome.py diff --git a/pipeline_utils/__main__.py b/pipeline_utils/__main__.py index c2a8e9a..4b229e1 100644 --- a/pipeline_utils/__main__.py +++ b/pipeline_utils/__main__.py @@ -58,6 +58,7 @@ def main(args=None): pipeline_deploy_parser.add_argument('--post-software', action='store_true', help='POST|PATCH Software objects') pipeline_deploy_parser.add_argument('--post-file-format', action='store_true', help='POST|PATCH FileFormat objects') pipeline_deploy_parser.add_argument('--post-file-reference', action='store_true', help='POST|PATCH ReferenceFile objects') + pipeline_deploy_parser.add_argument('--post-reference-genome', action='store_true', help='POST|PATCH ReferenceGenome objects') pipeline_deploy_parser.add_argument('--post-workflow', action='store_true', help='POST|PATCH Workflow objects') pipeline_deploy_parser.add_argument('--post-metaworkflow', action='store_true', help='POST|PATCH MetaWorkflow objects') pipeline_deploy_parser.add_argument('--post-wfl', action='store_true', help='Upload Workflow Description files (.cwl, .wdl)') diff --git a/pipeline_utils/lib/yaml_parser.py b/pipeline_utils/lib/yaml_parser.py index 3f1f342..f664863 100644 --- a/pipeline_utils/lib/yaml_parser.py +++ b/pipeline_utils/lib/yaml_parser.py @@ -23,6 +23,7 @@ from pipeline_utils.schemas.yaml_software import yaml_software_schema from pipeline_utils.schemas.yaml_reference_file import yaml_reference_file_schema from pipeline_utils.schemas.yaml_file_format import yaml_file_format_schema +from pipeline_utils.schemas.yaml_reference_genome import yaml_reference_genome_schema ############################################################### @@ -118,8 +119,11 @@ class YAMLTemplate(object): METAWORKFLOW_TYPE_SCHEMA = 'MetaWorkflow' FILEFORMAT_TYPE_SCHEMA = 'FileFormat' REFERENCEFILE_TYPE_SCHEMA = 'ReferenceFile' + REFERENCEGENOME_TYPE_SCHEMA = 'ReferenceGenome' SOFTWARE_TYPE_SCHEMA = 'Software' VARIANT_TYPE_SCHEMA = "variant_type" + CODE_SCHEMA = 'code' + IDENTIFIER_SCHEMA = 'identifier' def __init__(self, data, schema): """Constructor method. @@ -145,17 +149,23 @@ def _clean_newline(self, line): line = line.replace('|', '') return line - def _link_title(self, name, version): + def _link_title(self, name, version=None): """Helper to create a "title" field. """ title = getattr(self, self.TITLE_SCHEMA, None) if title: - if version in title: - return title + if version: + if version in title: + return title + else: + return f'{title} [{version}]' else: - return f'{title} [{version}]' + return title else: - return f'{name.replace("_", " ")} [{version}]' + if version: + return f'{name.replace("_", " ")} [{version}]' + else: + return f'{name.replace("_", " ")}' def _string_consortia(self, consortia): """Helper to create a string from "consortia" field. @@ -549,7 +559,7 @@ def to_json( if getattr(self, self.SOURCE_URL_SCHEMA, None): sftwr_json[self.SOURCE_URL_SCHEMA] = self.source_url - sftwr_json[self.TITLE_SCHEMA] = self._link_title(self.name, version) + sftwr_json[self.TITLE_SCHEMA] = self._link_title(self.name) sftwr_json[self.ALIASES_SCHEMA] = [f'{self._string_consortia(consortia)}:{self.SOFTWARE_TYPE_SCHEMA}-{self.name}_{version}'] # uuid, accession if specified @@ -562,6 +572,10 @@ def to_json( if getattr(self, self.LICENSE_SCHEMA, None): sftwr_json[self.LICENSE_SCHEMA] = self.license + # code + if getattr(self, self.CODE_SCHEMA, None): + sftwr_json[self.CODE_SCHEMA] = self.code + return sftwr_json @@ -638,11 +652,10 @@ class YAMLFileFormat(YAMLTemplate): """ # schema constants - IDENTIFIER_SCHEMA = 'identifier' STANDARD_FILE_EXTENSION_SCHEMA = 'standard_file_extension' - # VALID_ITEM_TYPES_SCHEMA = 'valid_item_types' + VALID_ITEM_TYPES_SCHEMA = 'valid_item_types' EXTRA_FILE_FORMATS_SCHEMA = 'extra_file_formats' - # FILE_TYPES_SCHEMA = 'file_types' + FILE_TYPES_SCHEMA = 'file_types' def __init__(self, data): """Constructor method. @@ -672,7 +685,7 @@ def to_json( frmt_json[self.CONSORTIA_SCHEMA] = consortia frmt_json[self.DESCRIPTION_SCHEMA] = self.description frmt_json[self.STANDARD_FILE_EXTENSION_SCHEMA] = self.extension - # frmt_json[self.VALID_ITEM_TYPES_SCHEMA] = getattr(self, self.FILE_TYPES_SCHEMA, ['ReferenceFile', 'FileProcessed']) + frmt_json[self.VALID_ITEM_TYPES_SCHEMA] = getattr(self, self.FILE_TYPES_SCHEMA, ['ReferenceFile', 'OutputFile']) # check for secondary formats if getattr(self, self.SECONDARY_FORMATS_SCHEMA, None): frmt_json[self.EXTRA_FILE_FORMATS_SCHEMA] = getattr(self, self.SECONDARY_FORMATS_SCHEMA) @@ -685,3 +698,52 @@ def to_json( frmt_json[self.ACCESSION_SCHEMA] = self.accession return frmt_json + +############################################################### +# YAMLReferenceGenome, YAML ReferenceGenome +############################################################### +class YAMLReferenceGenome(YAMLTemplate): + """Class to work with YAML documents representing ReferenceGenome objects. + """ + + def __init__(self, data): + """Constructor method. + """ + super().__init__(data, yaml_reference_genome_schema) + # validate data with schema + self._validate() + # load attributes + for key, val in data.items(): + setattr(self, key, val) + + def to_json( + self, + submission_centers, # alias list + consortia # alias list + ): + """Function to build the corresponding object in JSON format. + """ + gen_json = {} + + # common metadata + gen_json[self.IDENTIFIER_SCHEMA] = self.name.lower() + gen_json[self.ALIASES_SCHEMA] = [f'{self._string_consortia(consortia)}:{self.REFERENCEGENOME_TYPE_SCHEMA}-{self.name}_{self.version}'] + gen_json[self.SUBMISSION_CENTERS_SCHEMA] = submission_centers + gen_json[self.CONSORTIA_SCHEMA] = consortia + gen_json[self.TITLE_SCHEMA] = self._link_title(self.name, self.version) + gen_json[self.CODE_SCHEMA] = self.code + + # uuid, accession if specified + if getattr(self, self.UUID_SCHEMA, None): + gen_json[self.UUID_SCHEMA] = self.uuid + if getattr(self, self.ACCESSION_SCHEMA, None): + gen_json[self.ACCESSION_SCHEMA] = self.accession + + # check linked files + if getattr(self, self.FILES_SCHEMA, None): + gen_json[self.FILES_SCHEMA] = [] + for file in self.files: + gen_json[self.FILES_SCHEMA].append( + f'{self._string_consortia(consortia)}:{self.REFERENCEFILE_TYPE_SCHEMA}-{file.replace("@", "_")}') + + return gen_json diff --git a/pipeline_utils/pipeline_deploy.py b/pipeline_utils/pipeline_deploy.py index 54944ed..deffeb4 100644 --- a/pipeline_utils/pipeline_deploy.py +++ b/pipeline_utils/pipeline_deploy.py @@ -76,6 +76,7 @@ def __init__(self, args, repo, version='VERSION', pipeline='PIPELINE'): 'Software': yaml_parser.YAMLSoftware, 'FileFormat': yaml_parser.YAMLFileFormat, 'ReferenceFile': yaml_parser.YAMLReferenceFile, + 'ReferenceGenome': yaml_parser.YAMLReferenceGenome, 'Workflow': yaml_parser.YAMLWorkflow, 'MetaWorkflow': yaml_parser.YAMLMetaWorkflow } @@ -84,10 +85,12 @@ def __init__(self, args, repo, version='VERSION', pipeline='PIPELINE'): 'Software': 'portal_objects/software.yaml', 'FileFormat': 'portal_objects/file_format.yaml', 'ReferenceFile': 'portal_objects/file_reference.yaml', + 'ReferenceGenome': 'portal_objects/reference_genome.yaml', # .yml files 'Software_yml': 'portal_objects/software.yml', 'FileFormat_yml': 'portal_objects/file_format.yml', 'ReferenceFile_yml': 'portal_objects/file_reference.yml', + 'ReferenceGenome_yml': 'portal_objects/reference_genome.yml', # folders 'Workflow': 'portal_objects/workflows', 'MetaWorkflow': 'portal_objects/metaworkflows', @@ -206,7 +209,7 @@ def _yaml_to_json(self, data_yaml, YAMLClass, **kwargs): def _post_patch_file(self, type): """ - 'Software', 'FileFormat', 'ReferenceFile' + 'Software', 'FileFormat', 'ReferenceFile', 'ReferenceGenome' """ logger.info(f'@ {type}...') @@ -414,6 +417,10 @@ def run_post_patch(self): if self.post_file_reference: self._post_patch_file('ReferenceFile') + # ReferenceGenome + if self.post_reference_genome: + self._post_patch_file('ReferenceGenome') + # Workflow if self.post_workflow: self._post_patch_folder('Workflow') diff --git a/pipeline_utils/schemas/yaml_file_format.py b/pipeline_utils/schemas/yaml_file_format.py index 023c5d1..3afe1fc 100644 --- a/pipeline_utils/schemas/yaml_file_format.py +++ b/pipeline_utils/schemas/yaml_file_format.py @@ -20,14 +20,14 @@ schema.DESCRIPTION: 'Extension of the FileFormat', schema.TYPE: schema.STRING }, - # 'file_types': { - # schema.DESCRIPTION: 'File types that can use the FileFormat', - # schema.TYPE: schema.ARRAY, - # schema.ITEMS: { - # schema.TYPE: schema.STRING, - # schema.PATTERN: 'ReferenceFile|FileProcessed|FileSubmitted|FileFastq' - # } - # }, + 'file_types': { + schema.DESCRIPTION: 'File types that can use the FileFormat', + schema.TYPE: schema.ARRAY, + schema.ITEMS: { + schema.TYPE: schema.STRING, + schema.PATTERN: 'ReferenceFile|FileProcessed|FileSubmitted|FileFastq' + } + }, 'status': { schema.TYPE: schema.STRING }, diff --git a/pipeline_utils/schemas/yaml_reference_genome.py b/pipeline_utils/schemas/yaml_reference_genome.py new file mode 100644 index 0000000..565c7c8 --- /dev/null +++ b/pipeline_utils/schemas/yaml_reference_genome.py @@ -0,0 +1,32 @@ +from pipeline_utils.schemas import schema + +yaml_reference_genome_schema = { + ## Schema ######################### + schema.SCHEMA: 'https://json-schema.org/draft/2020-12/schema', + schema.ID: '/schemas/YAMLReferenceGenome', + schema.TITLE: 'YAMLReferenceGenome', + schema.DESCRIPTION: 'Schema to validate a YAML description of a ReferenceGenome', + schema.TYPE: schema.OBJECT, + schema.PROPERTIES: { + 'name': { + schema.DESCRIPTION: 'Name of the ReferenceGenome', + schema.TYPE: schema.STRING + }, + 'version': { + schema.DESCRIPTION: 'Version of the ReferenceGenome', + schema.TYPE: schema.STRING + }, + 'code': { + schema.DESCRIPTION: 'Code for the ReferenceGenome', + schema.TYPE: schema.STRING + }, + 'files': { + schema.DESCRIPTION: 'Associated reference files', + schema.TYPE: schema.ARRAY, + schema.ITEMS: { + schema.TYPE: schema.STRING + } + } + }, + schema.REQUIRED: ['name', 'version', 'code'] +} diff --git a/tests/repo_correct/portal_objects/reference_genome.yaml b/tests/repo_correct/portal_objects/reference_genome.yaml new file mode 100644 index 0000000..a203198 --- /dev/null +++ b/tests/repo_correct/portal_objects/reference_genome.yaml @@ -0,0 +1,12 @@ +################################################################ +# GRCh38 Genome Reference +################################################################ +name: GRCh38 +version: GCA_000001405.15 +files: + - complete-reference-fasta-no-alt@GCA_000001405.15_GRCh38_no_decoy + - complete-reference-bwt-no-alt@GCA_000001405.15_GRCh38_no_decoy +# Required for displaying in the file name +code: GRCh38 +# This is required to sync with a previously generated object +uuid: e89937e6-80d3-4605-8dea-4a74c7981a9f diff --git a/tests/repo_correct/portal_objects/software.yaml b/tests/repo_correct/portal_objects/software.yaml index 4ea9c7d..c77a3c0 100644 --- a/tests/repo_correct/portal_objects/software.yaml +++ b/tests/repo_correct/portal_objects/software.yaml @@ -7,6 +7,7 @@ source_url: 'http:/broad' description: gatk software package category: - Aligner +code: pippo --- @@ -19,3 +20,14 @@ category: commit: 324ePT uuid: efdac7ec-7da3-4f23-9056-7a04abbc5e8b accession: GAPMKF1LL29K + +--- + +# Sentieon +name: Sentieon_BWA-MEM +code: sentieon_bwamem +version: '202308.01' +source_url: https://www.sentieon.com +category: + - Alignment +uuid: b42e44e5-a829-4687-aeff-65cd040b1528 diff --git a/tests/test_yaml_file_format.py b/tests/test_yaml_file_format.py index a9c920e..0ce9ce3 100644 --- a/tests/test_yaml_file_format.py +++ b/tests/test_yaml_file_format.py @@ -20,8 +20,8 @@ def test_file_format(): "submission_centers": ["hms-dbmi"], "consortia": ["cgap-core"], "standard_file_extension": "bam", - "status": "obsolete" - # "valid_item_types": ["ReferenceFile", "FileProcessed"] + "status": "obsolete", + "valid_item_types": ["ReferenceFile", "OutputFile"] }, { "accession": 'GAPFIXRDPDK1', @@ -32,7 +32,7 @@ def test_file_format(): "consortia": ["cgap-core"], "standard_file_extension": "bam.bai", "status": "released", - # "valid_item_types": ["ReferenceFile", "FileProcessed"], + "valid_item_types": ["ReferenceFile", "OutputFile"], "uuid": '1936f246-22e1-45dc-bb5c-9cfd55537fe9' } ] diff --git a/tests/test_yaml_reference_genome.py b/tests/test_yaml_reference_genome.py new file mode 100644 index 0000000..47b5b1b --- /dev/null +++ b/tests/test_yaml_reference_genome.py @@ -0,0 +1,36 @@ +################################################################# +# Libraries +################################################################# +import sys, os +import pytest +from pipeline_utils.lib import yaml_parser + +################################################################# +# Tests +################################################################# +def test_software(): + """ + """ + res = [ + {"code": "GRCh38", + "title": "GRCh38 [GCA_000001405.15]", + "consortia": ["cgap-core"], + "identifier": "grch38", + "submission_centers": ["hms-dbmi"], + "uuid": "e89937e6-80d3-4605-8dea-4a74c7981a9f", + "files": [ + "cgap-core:ReferenceFile-complete-reference-fasta-no-alt_GCA_000001405.15_GRCh38_no_decoy", + "cgap-core:ReferenceFile-complete-reference-bwt-no-alt_GCA_000001405.15_GRCh38_no_decoy" + ], + "aliases": ["cgap-core:ReferenceGenome-GRCh38_GCA_000001405.15"]} + + ] + + for i, d in enumerate(yaml_parser.load_yaml('tests/repo_correct/portal_objects/reference_genome.yaml')): + # creating JSON object + d_ = yaml_parser.YAMLReferenceGenome(d).to_json( + submission_centers=["hms-dbmi"], + consortia=["cgap-core"] + ) + # check + assert d_ == res[i] diff --git a/tests/test_yaml_software.py b/tests/test_yaml_software.py index f1b5cf0..7d9f5fa 100644 --- a/tests/test_yaml_software.py +++ b/tests/test_yaml_software.py @@ -21,7 +21,8 @@ def test_software(): "source_url": "http:/broad", "title": "gatk 4.1.2", "version": "4.1.2", - "category": ["Aligner"] + "category": ["Aligner"], + "code": "pippo" }, { "accession": "GAPMKF1LL29K", @@ -30,10 +31,23 @@ def test_software(): "submission_centers": ["hms-dbmi"], "name": "picard", "consortia": ["cgap-core"], - "title": "picard [324ePT]", + "title": "picard", "uuid": "efdac7ec-7da3-4f23-9056-7a04abbc5e8b", "category": ["Variant Caller"] + }, + { + "aliases": ["cgap-core:Software-Sentieon_BWA-MEM_202308.01"], + "version": "202308.01", + "submission_centers": ["hms-dbmi"], + "name": "Sentieon_BWA-MEM", + "consortia": ["cgap-core"], + "title": "Sentieon BWA-MEM", + "uuid": "b42e44e5-a829-4687-aeff-65cd040b1528", + "category": ["Alignment"], + "code": "sentieon_bwamem", + "source_url": "https://www.sentieon.com", } + ] for i, d in enumerate(yaml_parser.load_yaml('tests/repo_correct/portal_objects/software.yaml')): From af555061197f70b40247ae7f1a119fb644070cae Mon Sep 17 00:00:00 2001 From: Michele Berselli Date: Thu, 15 Feb 2024 11:23:50 -0500 Subject: [PATCH 2/3] . --- pipeline_utils/lib/yaml_parser.py | 2 +- tests/test_yaml_reference_genome.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pipeline_utils/lib/yaml_parser.py b/pipeline_utils/lib/yaml_parser.py index f664863..8c26e96 100644 --- a/pipeline_utils/lib/yaml_parser.py +++ b/pipeline_utils/lib/yaml_parser.py @@ -726,7 +726,7 @@ def to_json( gen_json = {} # common metadata - gen_json[self.IDENTIFIER_SCHEMA] = self.name.lower() + gen_json[self.IDENTIFIER_SCHEMA] = self.name gen_json[self.ALIASES_SCHEMA] = [f'{self._string_consortia(consortia)}:{self.REFERENCEGENOME_TYPE_SCHEMA}-{self.name}_{self.version}'] gen_json[self.SUBMISSION_CENTERS_SCHEMA] = submission_centers gen_json[self.CONSORTIA_SCHEMA] = consortia diff --git a/tests/test_yaml_reference_genome.py b/tests/test_yaml_reference_genome.py index 47b5b1b..ea22947 100644 --- a/tests/test_yaml_reference_genome.py +++ b/tests/test_yaml_reference_genome.py @@ -15,7 +15,7 @@ def test_software(): {"code": "GRCh38", "title": "GRCh38 [GCA_000001405.15]", "consortia": ["cgap-core"], - "identifier": "grch38", + "identifier": "GRCh38", "submission_centers": ["hms-dbmi"], "uuid": "e89937e6-80d3-4605-8dea-4a74c7981a9f", "files": [ From e49096020c3a2bb604229e4492d4d6588585a137 Mon Sep 17 00:00:00 2001 From: Michele Date: Tue, 20 Feb 2024 16:06:02 -0500 Subject: [PATCH 3/3] Update pipeline_utils/schemas/yaml_file_format.py Co-authored-by: drio18 <58236592+drio18@users.noreply.github.com> --- pipeline_utils/schemas/yaml_file_format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline_utils/schemas/yaml_file_format.py b/pipeline_utils/schemas/yaml_file_format.py index 3afe1fc..6f3a480 100644 --- a/pipeline_utils/schemas/yaml_file_format.py +++ b/pipeline_utils/schemas/yaml_file_format.py @@ -25,7 +25,7 @@ schema.TYPE: schema.ARRAY, schema.ITEMS: { schema.TYPE: schema.STRING, - schema.PATTERN: 'ReferenceFile|FileProcessed|FileSubmitted|FileFastq' + schema.PATTERN: 'ReferenceFile|OutputFile|AlignedReads|UnalignedReads|VariantCalls' } }, 'status': {