From 77a0ed0bd20c637a838e0368955f8c50ed03f5d8 Mon Sep 17 00:00:00 2001 From: Michele Berselli Date: Thu, 19 Oct 2023 13:49:12 -0400 Subject: [PATCH 01/18] initial refactoring to match SMaHT portal changes --- .github/workflows/main.yml | 6 +- README.md | 10 +- docs/conf.py | 4 +- docs/contribute_pipeline.rst | 2 +- docs/deploy_pipeline.rst | 30 +-- docs/index.rst | 4 +- docs/install.rst | 10 +- docs/pipeline_utils.rst | 10 +- docs/repo.rst | 2 +- docs/yaml_metaworkflow.rst | 5 - pipeline_utils/__main__.py | 14 +- pipeline_utils/lib/yaml_parser.py | 185 ++++++++++-------- pipeline_utils/pipeline_deploy.py | 8 +- pipeline_utils/schemas/yaml_file_format.py | 16 +- pipeline_utils/schemas/yaml_file_reference.py | 10 +- pipeline_utils/schemas/yaml_metaworkflow.py | 35 +++- pipeline_utils/schemas/yaml_software.py | 6 +- pipeline_utils/schemas/yaml_workflow.py | 6 +- pyproject.toml | 10 +- .../portal_objects/file_reference.yaml | 4 + .../metaworkflows/A_gatk-HC-GT.yaml | 4 + .../metaworkflows/B_minimal-gatk-HC-GT.yaml | 3 + .../portal_objects/metaworkflows/QC_test.yaml | 68 +++++++ .../repo_correct/portal_objects/software.yaml | 2 + .../portal_objects/workflows/A_gatk-HC.yaml | 2 + .../workflows/B_minimal-gatk-HC.yaml | 2 + .../portal_objects/file_reference.yaml | 4 + .../metaworkflows/A_gatk-HC-GT.yaml | 4 + .../metaworkflows/B_minimal-gatk-HC-GT.yaml | 3 + tests/repo_error/portal_objects/software.yaml | 2 + .../portal_objects/workflows/A_gatk-HC.yaml | 2 + .../workflows/B_minimal-gatk-HC.yaml | 2 + tests/test_yaml_file_format.py | 30 +-- tests/test_yaml_file_reference.py | 24 ++- tests/test_yaml_metaworkflow.py | 148 ++++++++++++-- tests/test_yaml_software.py | 22 ++- tests/test_yaml_workflow.py | 49 ++--- 37 files changed, 519 insertions(+), 229 deletions(-) create mode 100644 tests/repo_correct/portal_objects/metaworkflows/QC_test.yaml diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 59dec77..2dd71be 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,4 +1,4 @@ -# portal-pipeline-utils GA Workflow +# smaht-pipeline-utils GA Workflow name: CI @@ -17,7 +17,7 @@ on: jobs: # This workflow contains a single job called "test" test: - name: Test portal-pipeline-utils with Python ${{ matrix.python_version }} + name: Test smaht-pipeline-utils with Python ${{ matrix.python_version }} # The type of runner that the job will run on runs-on: ubuntu-22.04 @@ -33,7 +33,7 @@ jobs: with: python-version: ${{ matrix.python_version }} - - name: Test portal-pipeline-utils + - name: Test smaht-pipeline-utils run: | make configure make build diff --git a/README.md b/README.md index 52706a2..f7fb627 100644 --- a/README.md +++ b/README.md @@ -2,21 +2,21 @@ Utilities for deploying pipelines and interfacing with portal infrastructure. -For more information on available commands and how to contribute and deploy pipelines within the infrastructure check the extended [*documentation*](https://portal-pipeline-utils.readthedocs.io/en/latest/ "portal-pipeline-utils documentation"). +For more information on available commands and how to contribute and deploy pipelines within the infrastructure check the extended [*documentation*](https://smaht-pipeline-utils.readthedocs.io/en/latest/ "smaht-pipeline-utils documentation"). ## Install The software is python based. To install the software and the required packages, we recommend using a fresh virtual environment. Please refer to `pyproject.toml` for the supported Python versions. 
-The package is available on [*pypi*](https://pypi.org/project/portal-pipeline-utils "portal-pipeline-utils pypi"): +The package is available on [*pypi*](https://pypi.org/project/smaht-pipeline-utils "smaht-pipeline-utils pypi"): - pip install portal-pipeline-utils + pip install smaht-pipeline-utils To install from source: - git clone https://github.com/dbmi-bgm/portal-pipeline-utils.git - cd portal-pipeline-utils + git clone https://github.com/smaht-dac/smaht-pipeline-utils.git + cd smaht-pipeline-utils make configure make update make build diff --git a/docs/conf.py b/docs/conf.py index 9565327..d983ecb 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -17,9 +17,9 @@ # -- Project information ----------------------------------------------------- -project = 'portal-pipeline-utils' +project = 'smaht-pipeline-utils' copyright = '2021, HMS DBMI' -author = 'Michele Berselli, CGAP & SMaHT Team' +author = 'Michele Berselli, SMaHT Team' # The full version, including alpha/beta/rc tags release = '2.1.0' diff --git a/docs/contribute_pipeline.rst b/docs/contribute_pipeline.rst index ee8fc0b..afdcabf 100644 --- a/docs/contribute_pipeline.rst +++ b/docs/contribute_pipeline.rst @@ -177,4 +177,4 @@ Example Examples -------- -Real examples of implemented pipeline modules can be found linked as submodules in our main pipeline repository for the CGAP project here: https://github.com/dbmi-bgm/cgap-pipeline-main. +Real examples of implemented pipeline modules can be found linked as submodules in our main pipeline repository for the SMaHT project here: https://github.com/smaht-dac/main-pipelines. diff --git a/docs/deploy_pipeline.rst b/docs/deploy_pipeline.rst index 217fc2c..d110ce4 100644 --- a/docs/deploy_pipeline.rst +++ b/docs/deploy_pipeline.rst @@ -55,37 +55,37 @@ by running a test command: More information on how to setup Docker can be found in the `AWS Documentation `_. -We now need to install the ``pipeline_utils`` software to deploy the pipeline components. +We now need to install the ``smaht_pipeline_utils`` software to deploy the pipeline components. -Install pipeline_utils -====================== +Install smaht_pipeline_utils +============================ The software is Python-based. To install the software and the required packages, we recommend using a fresh virtual environment. -Please refer to `pyproject.toml `_ for the supported Python version. +Please refer to `pyproject.toml `_ for the supported Python version. We recommend using pyenv to manage virtual environments. Instructions for installing and using pyenv can be found `here `_. -Once the virtual environment is set up and activated, we can proceed to :ref:`install ` portal-pipeline-utils software. +Once the virtual environment is set up and activated, we can proceed to :ref:`install ` smaht-pipeline-utils software. .. code-block:: bash # Install from source - git clone https://github.com/dbmi-bgm/portal-pipeline-utils.git - cd portal-pipeline-utils + git clone https://github.com/smaht-dac/smaht-pipeline-utils.git + cd smaht-pipeline-utils make configure make update make build cd .. # Install from pypi - pip install portal-pipeline-utils + pip install smaht-pipeline-utils -To check that the software is correctly installed, try to run ``pipeline_utils``. -If installed from source, this command may fail with a bash "command not found" error, try ``poetry run pipeline_utils`` instead. +To check that the software is correctly installed, try to run ``smaht_pipeline_utils``. 
+If installed from source, this command may fail with a bash "command not found" error; try ``poetry run smaht_pipeline_utils`` instead.
 
 Set Up Credentials and Environmental Variables
 ==============================================
@@ -188,7 +188,7 @@ by the ``--repos`` argument.
 
 .. code-block:: bash
 
-    pipeline_utils pipeline_deploy \
+    smaht_pipeline_utils pipeline_deploy \
         --ff-env ${ENV_NAME} \
         --keydicts-json ${KEYDICTS_JSON} \
         --wfl-bucket ${WFL_BUCKET} \
@@ -214,10 +214,10 @@ The default is set to the ``main`` branch. The ``--local-build`` flag will preve
 
 *Note: we are working to enable more builders with a command line argument for which builder to use to deploy modules from different repositories through AWS CodeBuild.*
 
-Deploying CGAP Pipelines
-========================
+Deploying SMaHT Pipelines
+=========================
 
-CGAP pipelines are released as a complete package with a customized set up for automated deployment to the desired environment.
+SMaHT pipelines are released as a complete package with a customized setup for automated deployment to the desired environment.
 To deploy the pipelines run the following steps:
 
 1. Clone the main pipeline repository.
 The submodules will be empty and set to the current commits saved for the main branch.
 
 .. code-block:: bash
 
-    git clone https://github.com/dbmi-bgm/cgap-pipeline-main.git
+    git clone https://github.com/smaht-dac/main-pipelines.git
 
 2. Check out the desired version.
 This will set the submodules to the commits saved for that pipeline release.
@@ -241,7 +241,7 @@ The submodules will be set in detached state on their current commit.
 
     make pull
 
-4. Build smaht_pipeline_utils (optional).
+4. Build smaht_pipeline_utils (optional).
 This will build from source the latest version linked for the current release.
 
 .. code-block:: bash
diff --git a/docs/index.rst b/docs/index.rst
index a6c307a..65d81e4 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -2,9 +2,9 @@
 Portal Pipeline Utilities
 =========================
 
-Documentation for portal-pipeline-utils_, a collection of utilities for deploying pipelines and interfacing with portal infrastructure.
+Documentation for smaht-pipeline-utils_, a collection of utilities for deploying pipelines and interfacing with portal infrastructure.
 
-.. _portal-pipeline-utils: https://github.com/dbmi-bgm/portal-pipeline-utils
+.. _smaht-pipeline-utils: https://github.com/smaht-dac/smaht-pipeline-utils
 
 Contents
 ########
diff --git a/docs/install.rst b/docs/install.rst
index f057538..0fbf5e3 100644
--- a/docs/install.rst
+++ b/docs/install.rst
@@ -9,11 +9,11 @@ PyPI
 
 The package is available on pypi_:
 
-.. _pypi: https://pypi.org/project/portal-pipeline-utils
+.. _pypi: https://pypi.org/project/smaht-pipeline-utils
 
 .. code-block:: bash
 
-    pip install portal-pipeline-utils
+    pip install smaht-pipeline-utils
 
 Source
 ^^^^^^
 
 To install the latest version from source:
 
 .. code-block:: bash
 
-    git clone https://github.com/dbmi-bgm/portal-pipeline-utils.git
-    cd portal-pipeline-utils
+    git clone https://github.com/smaht-dac/smaht-pipeline-utils.git
+    cd smaht-pipeline-utils
     make configure
     make update
     make build
 
-Please refer to `pyproject.toml <https://github.com/dbmi-bgm/portal-pipeline-utils/blob/main/pyproject.toml>`_ for the supported Python version.
+Please refer to `pyproject.toml <https://github.com/smaht-dac/smaht-pipeline-utils/blob/main/pyproject.toml>`_ for the supported Python version.
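The alias scheme adopted throughout this refactor is worth spelling out: consortia replace the old single ``project`` as the alias namespace. A minimal standalone sketch (simplified helper names, not the parser module itself) of how ``_string_consortia`` in ``pipeline_utils/lib/yaml_parser.py`` builds the prefix for versioned portal objects:

    def string_consortia(consortia):
        # Deterministic prefix: sort the consortia list and join with "_"
        return '_'.join(sorted(consortia))

    def object_alias(consortia, object_type, name, version):
        # Alias pattern used for Workflow, MetaWorkflow, and Software objects
        return f'{string_consortia(consortia)}:{object_type}-{name}_{version}'

    # Matches the expected values in tests/test_yaml_metaworkflow.py:
    assert object_alias(['cgap-test', 'cgap-core'], 'MetaWorkflow',
                        'gatk-HC-pipeline', 'v1.0.0') == \
        'cgap-core_cgap-test:MetaWorkflow-gatk-HC-pipeline_v1.0.0'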
diff --git a/docs/pipeline_utils.rst b/docs/pipeline_utils.rst
index 979d921..301bdc6 100644
--- a/docs/pipeline_utils.rst
+++ b/docs/pipeline_utils.rst
@@ -1,6 +1,6 @@
-==============
-pipeline_utils
-==============
+====================
+smaht_pipeline_utils
+====================
 
 This is the entry point for a collection of utilities available as commands:
 
@@ -10,7 +10,7 @@ Usage:
 
 .. code-block:: bash
 
-    pipeline_utils [COMMAND] [ARGS]
+    smaht_pipeline_utils [COMMAND] [ARGS]
 
 .. _pipeline_deploy:
 
@@ -24,7 +24,7 @@ Usage:
 
 .. code-block:: bash
 
-    pipeline_utils pipeline_deploy --ff-env FF_ENV --repos REPO [REPO ...] [OPTIONAL ARGS]
+    smaht_pipeline_utils pipeline_deploy --ff-env FF_ENV --repos REPO [REPO ...] [OPTIONAL ARGS]
 
 **Arguments:**
 
diff --git a/docs/repo.rst b/docs/repo.rst
index 5a505be..dbc914f 100644
--- a/docs/repo.rst
+++ b/docs/repo.rst
@@ -56,4 +56,4 @@ Example ``foo_bar`` pipeline:
     ├── PIPELINE
     └── VERSION
 
-Real examples can be found linked as submodules in our pipelines repository for CGAP project here: https://github.com/dbmi-bgm/cgap-pipeline-main.
+Real examples can be found linked as submodules in our pipelines repository for the SMaHT project here: https://github.com/smaht-dac/main-pipelines.
diff --git a/docs/yaml_metaworkflow.rst b/docs/yaml_metaworkflow.rst
index 9909100..3159a65 100644
--- a/docs/yaml_metaworkflow.rst
+++ b/docs/yaml_metaworkflow.rst
@@ -20,11 +20,6 @@ Template
   name:
   description:
 
-  # All the following fields are optional and provided as example,
-  # can be expanded to anything accepted by the schema
-  # https://github.com/dbmi-bgm/cgap-portal/tree/master/src/encoded/schemas
-  proband_only:
-
   ## General arguments ########################################
   # Pipeline input, reference files, and general arguments
   # define all arguments for the pipeline here
diff --git a/pipeline_utils/__main__.py b/pipeline_utils/__main__.py
index d1eff02..c3ea156 100644
--- a/pipeline_utils/__main__.py
+++ b/pipeline_utils/__main__.py
@@ -20,8 +20,8 @@
 # Variables
 PIPELINE_DEPLOY = 'pipeline_deploy'
-CGAP_ALIAS = 'cgap-core'
-DBMI_ALIAS = 'hms-dbmi'
+CONSORTIA_ALIAS = 'CONSORTIUM'
+SUBMISSION_CENTERS_ALIAS = 'SUBMISSION_CENTER'
 KEYS_ALIAS = '~/.cgap-keys.json'
 MAIN_ALIAS = 'main'
 BUILDER_ALIAS = '-pipeline-builder'
@@ -32,7 +32,7 @@ def main(args=None):
     '''Command line wrapper around available commands.
''' # Adding parser and subparsers - parser = argparse.ArgumentParser(prog='pipeline_utils', description='Collection of utilities for deploying pipelines and interfacing with portal infrastructure') + parser = argparse.ArgumentParser(prog='smaht_pipeline_utils', description='Collection of utilities for deploying pipelines and interfacing with portal infrastructure') subparsers = parser.add_subparsers(dest='func', metavar="") # Add pipeline_deploy to subparsers @@ -50,10 +50,10 @@ def main(args=None): pipeline_deploy_parser.add_argument('--wfl-bucket', required=False, help='Bucket to use for upload of Workflow Description files') pipeline_deploy_parser.add_argument('--account', required=False, help='AWS account to use for deployment') pipeline_deploy_parser.add_argument('--region', required=False, help='AWS account region to use for deployment') - pipeline_deploy_parser.add_argument('--project', required=False, help=f'Project to use for deployment [{CGAP_ALIAS}]', - default=CGAP_ALIAS) - pipeline_deploy_parser.add_argument('--institution', required=False, help=f'Institution to use for deployment [{DBMI_ALIAS}]', - default=DBMI_ALIAS) + pipeline_deploy_parser.add_argument('--consortia', required=False, nargs='+', help='List of consortia to use for deployment', + default=[CONSORTIA_ALIAS]) + pipeline_deploy_parser.add_argument('--submission-centers', required=False, nargs='+', help='List of centers to use for deployment', + default=[SUBMISSION_CENTERS_ALIAS]) pipeline_deploy_parser.add_argument('--post-software', action='store_true', help='POST|PATCH Software objects') pipeline_deploy_parser.add_argument('--post-file-format', action='store_true', help='POST|PATCH FileFormat objects') diff --git a/pipeline_utils/lib/yaml_parser.py b/pipeline_utils/lib/yaml_parser.py index 49f796e..c4fe156 100644 --- a/pipeline_utils/lib/yaml_parser.py +++ b/pipeline_utils/lib/yaml_parser.py @@ -91,9 +91,10 @@ class YAMLTemplate(object): NAME_SCHEMA = 'name' TITLE_SCHEMA = 'title' DESCRIPTION_SCHEMA = 'description' + CATEGORY_SCHEMA = 'category' ALIASES_SCHEMA = 'aliases' - PROJECT_SCHEMA = 'project' - INSTITUTION_SCHEMA = 'institution' + CONSORTIA_SCHEMA = 'consortia' + SUBMISSION_CENTERS_SCHEMA = 'submission_centers' VERSION_SCHEMA = 'version' ACCESSION_SCHEMA = 'accession' UUID_SCHEMA = 'uuid' @@ -101,6 +102,7 @@ class YAMLTemplate(object): ARGUMENT_FORMAT_SCHEMA = 'argument_format' ARGUMENT_NAME_SCHEMA = 'argument_name' VALUE_TYPE_SCHEMA = 'value_type' + VALUE_SCHEMA = 'value' WORKFLOW_ARGUMENT_NAME_SCHEMA = 'workflow_argument_name' INPUT_SCHEMA = 'input' STATUS_SCHEMA = 'status' @@ -108,8 +110,6 @@ class YAMLTemplate(object): SECONDARY_FORMATS_SCHEMA = 'secondary_formats' FILE_FORMAT_SCHEMA = 'file_format' SECONDARY_FILE_FORMATS_SCHEMA = 'secondary_file_formats' - INSTITUTIONS_SCHEMA = 'institutions' - PROJECTS_SCHEMA = 'projects' FILE_SCHEMA = 'file' FILES_SCHEMA = 'files' PARAMETER_SCHEMA = 'parameter' @@ -156,16 +156,10 @@ def _link_title(self, name, version): else: return f'{name.replace("_", " ")} [{version}]' - def _link_institution(self, institution): - """Helper to create an "institution" field. + def _string_consortia(self, consortia): + """Helper to create a string from "consortia" field. """ - return f'/{self.INSTITUTIONS_SCHEMA}/{institution}/' - - def _link_project(self, project): - """Helper to create a "project" field. 
- """ - return f'/{self.PROJECTS_SCHEMA}/{project}/' - + return '_'.join(sorted(consortia)) ############################################################### # YAMLWorkflow, YAML Workflow @@ -188,8 +182,6 @@ class YAMLWorkflow(YAMLTemplate): HTML_SCHEMA = 'html' JSON_SCHEMA = 'json' TABLE_SCHEMA = 'table' - APP_NAME_SCHEMA = 'app_name' - APP_VERSION_SCHEMA = 'app_version' SOFTWARE_SCHEMA = 'software' ARGUMENTS_SCHEMA = 'arguments' QC_TYPE_SCHEMA = 'qc_type' @@ -212,7 +204,7 @@ def __init__(self, data): self._validate() # load attributes for key, val in data.items(): - if key in [self.DESCRIPTION_SCHEMA, self.TITLE_SCHEMA]: + if key in [self.DESCRIPTION_SCHEMA]: val = self._clean_newline(val) setattr(self, key, val) @@ -302,8 +294,8 @@ def _arguments_output(self): def to_json( self, version, - institution, # alias - project, # alias + submission_centers, # alias list + consortia, # alias list wflbucket_url ): """Function to build the corresponding object in JSON format. @@ -311,29 +303,22 @@ def to_json( wfl_json = {} # common metadata - wfl_json[self.APP_NAME_SCHEMA] = self.name # name - wfl_json[self.APP_VERSION_SCHEMA] = version # version - wfl_json[self.NAME_SCHEMA] = f'{self.name}_{version}' + wfl_json[self.VERSION_SCHEMA] = version # version + wfl_json[self.NAME_SCHEMA] = self.name wfl_json[self.TITLE_SCHEMA] = self._link_title(self.name, version) - wfl_json[self.ALIASES_SCHEMA] = [f'{project}:{self.WORKFLOW_TYPE_SCHEMA}-{wfl_json[self.NAME_SCHEMA]}'] - wfl_json[self.INSTITUTION_SCHEMA] = self._link_institution(institution) - wfl_json[self.PROJECT_SCHEMA] = self._link_project(project) + wfl_json[self.ALIASES_SCHEMA] = [f'{self._string_consortia(consortia)}:{self.WORKFLOW_TYPE_SCHEMA}-{self.name}_{version}'] + wfl_json[self.CATEGORY_SCHEMA] = self.category + wfl_json[self.SUBMISSION_CENTERS_SCHEMA] = submission_centers + wfl_json[self.CONSORTIA_SCHEMA] = consortia wfl_json[self.DESCRIPTION_SCHEMA] = self.description - wfl_json[self.SOFTWARE_SCHEMA] = [f'{project}:{self.SOFTWARE_TYPE_SCHEMA}-{s.replace("@", "_")}' for s in getattr(self, self.SOFTWARE_SCHEMA, [])] + wfl_json[self.SOFTWARE_SCHEMA] = [f'{self._string_consortia(consortia)}:{self.SOFTWARE_TYPE_SCHEMA}-{s.replace("@", "_")}' for s in getattr(self, self.SOFTWARE_SCHEMA, [])] wfl_json[self.ARGUMENTS_SCHEMA] = self._arguments_input() + self._arguments_output() - # workflow language (TODO) - # we need to improve tibanna to have a unique general key for this - language = self.runner.get('language') - if not language or language.lower() == 'cwl': - wfl_json['cwl_directory_url_v1'] = wflbucket_url - wfl_json['cwl_main_filename'] = self.runner['main'] - wfl_json['cwl_child_filenames'] = self.runner.get('child', []) - elif language.lower() == 'wdl': - wfl_json['wdl_directory_url'] = wflbucket_url - wfl_json['wdl_main_filename'] = self.runner['main'] - wfl_json['wdl_child_filenames'] = self.runner.get('child', []) - wfl_json['workflow_language'] = 'wdl' + # workflow language and description files + wfl_json['language'] = self.runner['language'].upper() + wfl_json['directory_url'] = wflbucket_url + wfl_json['main_file_name'] = self.runner['main'] + wfl_json['child_file_names'] = self.runner.get('child', []) # uuid, accession if specified if getattr(self, self.UUID_SCHEMA, None): @@ -360,7 +345,17 @@ class YAMLMetaWorkflow(YAMLTemplate): CONFIG_SCHEMA = 'config' DEPENDENCIES_SCHEMA = 'dependencies' SHARDS_SCHEMA = 'shards' - PROBAND_ONLY_SCHEMA = 'proband_only' + QC_THRESHOLDS_SCHEMA = 'qc_thresholds' + OVERALL_QUALITY_STATUS_RULE_SCHEMA = 
'overall_quality_status_rule' + ID_SCHEMA = 'id' + METRIC_SCHEMA = 'metric' + OPERATOR_SCHEMA = 'operator' + PASS_TARGET_SCHEMA = 'pass_target' + WARN_TARGET_SCHEMA = 'warn_target' + USE_AS_QC_FLAG_SCHEMA = 'use_as_qc_flag' + RULE_SCHEMA = 'rule' + FLAG_SCHEMA = 'flag' + QC_RULE_SCHEMA = 'qc_rule' def __init__(self, data): """Constructor method. @@ -370,11 +365,11 @@ def __init__(self, data): self._validate() # load attributes for key, val in data.items(): - if key in [self.DESCRIPTION_SCHEMA, self.TITLE_SCHEMA]: + if key in [self.DESCRIPTION_SCHEMA]: val = self._clean_newline(val) setattr(self, key, val) - def _arguments(self, input, project): + def _arguments(self, input, consortia): """Helper to parse arguments and map to expected JSON structure. """ arguments = [] @@ -387,7 +382,7 @@ def _arguments(self, input, project): if type == self.PARAMETER_SCHEMA: argument_[self.VALUE_TYPE_SCHEMA] = format for k, v in values.items(): - if k != self.ARGUMENT_TYPE_SCHEMA: + if k not in [self.ARGUMENT_TYPE_SCHEMA, self.QC_RULE_SCHEMA]: # handle files specifications, TODO # this system could be improved in how the schema works and deals with types # @@ -399,29 +394,54 @@ def _arguments(self, input, project): # - bar@v3 # need to convert to: # files: [ - # {file: ':FileReference-foo_v1'} + # {file: ':FileReference-foo_v1'} # ] # ----- or ------- # files: [ - # {file: ':FileReference-foo_v1', dimension: '0'}, - # {file: ':FileReference-bar_v3', dimension: '1'} + # {file: ':FileReference-foo_v1', dimension: '0'}, + # {file: ':FileReference-bar_v3', dimension: '1'} # ] if k == self.FILES_SCHEMA: v_ = [] for i, name_ in enumerate(v): - v_.append({self.FILE_SCHEMA: f'{project}:{self.FILEREFERENCE_TYPE_SCHEMA}-{name_.replace("@", "_")}', + v_.append({self.FILE_SCHEMA: f'{self._string_consortia(consortia)}:{self.FILEREFERENCE_TYPE_SCHEMA}-{name_.replace("@", "_")}', self.DIMENSION_SCHEMA: str(i)}) # remove DIMENSION_SCHEMA field if only one file if len(v_) == 1: del v_[0][self.DIMENSION_SCHEMA] argument_.setdefault(k, v_) + elif k == self.QC_THRESHOLDS_SCHEMA: + v_ = { + self.QC_THRESHOLDS_SCHEMA: [], + self.OVERALL_QUALITY_STATUS_RULE_SCHEMA: values[self.QC_RULE_SCHEMA] + } + for id, rule in v.items(): + metric, operator, pass_target, warn_target = rule[self.RULE_SCHEMA].split('|') + flag = rule.get(self.FLAG_SCHEMA) + # convert to float if number + try: pass_target = float(pass_target) + except ValueError: pass + try: warn_target = float(warn_target) + except ValueError: pass + # format rule + rule_ = { + self.ID_SCHEMA: id, + self.METRIC_SCHEMA: metric, + self.OPERATOR_SCHEMA: operator, + self.PASS_TARGET_SCHEMA: pass_target, + self.WARN_TARGET_SCHEMA: warn_target + } + if flag: # add use as flag if present + rule_[self.USE_AS_QC_FLAG_SCHEMA] = flag + v_[self.QC_THRESHOLDS_SCHEMA].append(rule_) + argument_.setdefault(self.VALUE_SCHEMA, v_) else: argument_.setdefault(k, v) arguments.append(argument_) return arguments - def _workflows(self, version, project): + def _workflows(self, version, consortia): """Helper to parse workflow definitions and map to expected JSON structure. 
""" workflows = [] @@ -435,9 +455,9 @@ def _workflows(self, version, project): # basic JSON workflow structure workflow_ = { self.NAME_SCHEMA: name, - self.WORKFLOW_SCHEMA: f'{project}:{self.WORKFLOW_TYPE_SCHEMA}-{name.split("@")[0]}_{version_}', + self.WORKFLOW_SCHEMA: f'{self._string_consortia(consortia)}:{self.WORKFLOW_TYPE_SCHEMA}-{name.split("@")[0]}_{version_}', # remove unique tag after @ to create the right alias to link - self.INPUT_SCHEMA: self._arguments(values[self.INPUT_SCHEMA], project), + self.INPUT_SCHEMA: self._arguments(values[self.INPUT_SCHEMA], consortia), self.CONFIG_SCHEMA: values[self.CONFIG_SCHEMA] } # file output can be optional @@ -457,8 +477,8 @@ def _workflows(self, version, project): def to_json( self, version, - institution, # alias - project # alias + submission_centers, # alias list + consortia # alias list ): """Function to build the corresponding object in JSON format. """ @@ -468,16 +488,13 @@ def to_json( metawfl_json[self.NAME_SCHEMA] = self.name metawfl_json[self.VERSION_SCHEMA] = version # version metawfl_json[self.TITLE_SCHEMA] = self._link_title(self.name, version) - metawfl_json[self.ALIASES_SCHEMA] = [f'{project}:{self.METAWORKFLOW_TYPE_SCHEMA}-{self.name}_{version}'] - metawfl_json[self.INSTITUTION_SCHEMA] = self._link_institution(institution) - metawfl_json[self.PROJECT_SCHEMA] = self._link_project(project) + metawfl_json[self.ALIASES_SCHEMA] = [f'{self._string_consortia(consortia)}:{self.METAWORKFLOW_TYPE_SCHEMA}-{self.name}_{version}'] + metawfl_json[self.CATEGORY_SCHEMA] = self.category + metawfl_json[self.SUBMISSION_CENTERS_SCHEMA] = submission_centers + metawfl_json[self.CONSORTIA_SCHEMA] = consortia metawfl_json[self.DESCRIPTION_SCHEMA] = self.description - metawfl_json[self.INPUT_SCHEMA] = self._arguments(self.input, project) - metawfl_json[self.WORKFLOWS_SCHEMA] = self._workflows(version, project) - - # proband_only field - if getattr(self, self.PROBAND_ONLY_SCHEMA, None): - metawfl_json[self.PROBAND_ONLY_SCHEMA] = self.proband_only + metawfl_json[self.INPUT_SCHEMA] = self._arguments(self.input, consortia) + metawfl_json[self.WORKFLOWS_SCHEMA] = self._workflows(version, consortia) # uuid, accession if specified if getattr(self, self.UUID_SCHEMA, None): @@ -507,14 +524,14 @@ def __init__(self, data): self._validate() # load attributes for key, val in data.items(): - if key in [self.DESCRIPTION_SCHEMA, self.TITLE_SCHEMA]: + if key in [self.DESCRIPTION_SCHEMA]: val = self._clean_newline(val) setattr(self, key, val) def to_json( self, - institution, # alias - project # alias + submission_centers, # alias list + consortia # alias list ): """Function to build the corresponding object in JSON format. 
""" @@ -522,8 +539,9 @@ def to_json( # common metadata sftwr_json[self.NAME_SCHEMA] = self.name - sftwr_json[self.INSTITUTION_SCHEMA] = self._link_institution(institution) - sftwr_json[self.PROJECT_SCHEMA] = self._link_project(project) + sftwr_json[self.SUBMISSION_CENTERS_SCHEMA] = submission_centers + sftwr_json[self.CONSORTIA_SCHEMA] = consortia + sftwr_json[self.CATEGORY_SCHEMA] = self.category if getattr(self, self.VERSION_SCHEMA, None): sftwr_json[self.VERSION_SCHEMA] = self.version @@ -538,7 +556,7 @@ def to_json( sftwr_json[self.SOURCE_URL_SCHEMA] = self.source_url sftwr_json[self.TITLE_SCHEMA] = self._link_title(self.name, version) - sftwr_json[self.ALIASES_SCHEMA] = [f'{project}:{self.SOFTWARE_TYPE_SCHEMA}-{self.name}_{version}'] + sftwr_json[self.ALIASES_SCHEMA] = [f'{self._string_consortia(consortia)}:{self.SOFTWARE_TYPE_SCHEMA}-{self.name}_{version}'] # uuid, accession if specified if getattr(self, self.UUID_SCHEMA, None): @@ -562,6 +580,8 @@ class YAMLFileReference(YAMLTemplate): # schema constants EXTRA_FILES_SCHEMA = 'extra_files' + DATA_CATEGORY_SCHEMA = 'data_category' + DATA_TYPE_SCHEMA = 'data_type' def __init__(self, data): """Constructor method. @@ -577,24 +597,26 @@ def __init__(self, data): def to_json( self, - institution, # alias - project # alias + submission_centers, # alias list + consortia # alias list ): """Function to build the corresponding object in JSON format. """ ref_json = {} # common metadata - ref_json[self.INSTITUTION_SCHEMA] = self._link_institution(institution) - ref_json[self.PROJECT_SCHEMA] = self._link_project(project) + ref_json[self.SUBMISSION_CENTERS_SCHEMA] = submission_centers + ref_json[self.CONSORTIA_SCHEMA] = consortia ref_json[self.DESCRIPTION_SCHEMA] = self.description ref_json[self.FILE_FORMAT_SCHEMA] = self.format - ref_json[self.ALIASES_SCHEMA] = [f'{project}:{self.FILEREFERENCE_TYPE_SCHEMA}-{self.name}_{self.version}'] + ref_json[self.ALIASES_SCHEMA] = [f'{self._string_consortia(consortia)}:{self.FILEREFERENCE_TYPE_SCHEMA}-{self.name}_{self.version}'] ref_json[self.EXTRA_FILES_SCHEMA] = getattr(self, self.SECONDARY_FILES_SCHEMA, []) ref_json[self.STATUS_SCHEMA] = getattr(self, self.STATUS_SCHEMA, None) # this will be used during post/patch, # if None: # - leave it as is if patch # - set to uploading if post + ref_json[self.DATA_CATEGORY_SCHEMA] = self.category + ref_json[self.DATA_TYPE_SCHEMA] = self.type # uuid, accession if specified if getattr(self, self.UUID_SCHEMA, None): @@ -617,10 +639,11 @@ class YAMLFileFormat(YAMLTemplate): """ # schema constants + IDENTIFIER_SCHEMA = 'identifier' STANDARD_FILE_EXTENSION_SCHEMA = 'standard_file_extension' - VALID_ITEM_TYPES_SCHEMA = 'valid_item_types' - EXTRAFILE_FORMATS_SCHEMA = 'extrafile_formats' - FILE_TYPES_SCHEMA = 'file_types' + # VALID_ITEM_TYPES_SCHEMA = 'valid_item_types' + EXTRA_FILE_FORMATS_SCHEMA = 'extra_file_formats' + # FILE_TYPES_SCHEMA = 'file_types' def __init__(self, data): """Constructor method. @@ -636,22 +659,22 @@ def __init__(self, data): def to_json( self, - institution, # alias - project # alias + submission_centers, # alias list + consortia # alias list ): """Function to build the corresponding object in JSON format. 
""" frmt_json = {} # common metadata - frmt_json[self.FILE_FORMAT_SCHEMA] = self.name - frmt_json[self.ALIASES_SCHEMA] = [f'{project}:{self.FILEFORMAT_TYPE_SCHEMA}-{self.name}'] - frmt_json[self.INSTITUTION_SCHEMA] = self._link_institution(institution) - frmt_json[self.PROJECT_SCHEMA] = self._link_project(project) + frmt_json[self.IDENTIFIER_SCHEMA] = self.name + frmt_json[self.ALIASES_SCHEMA] = [f'{self._string_consortia(consortia)}:{self.FILEFORMAT_TYPE_SCHEMA}-{self.name}'] + frmt_json[self.SUBMISSION_CENTERS_SCHEMA] = submission_centers + frmt_json[self.CONSORTIA_SCHEMA] = consortia frmt_json[self.DESCRIPTION_SCHEMA] = self.description frmt_json[self.STANDARD_FILE_EXTENSION_SCHEMA] = self.extension - frmt_json[self.VALID_ITEM_TYPES_SCHEMA] = getattr(self, self.FILE_TYPES_SCHEMA, ['FileReference', 'FileProcessed']) - frmt_json[self.EXTRAFILE_FORMATS_SCHEMA] = getattr(self, self.SECONDARY_FORMATS_SCHEMA, []) + # frmt_json[self.VALID_ITEM_TYPES_SCHEMA] = getattr(self, self.FILE_TYPES_SCHEMA, ['FileReference', 'FileProcessed']) + frmt_json[self.EXTRA_FILE_FORMATS_SCHEMA] = getattr(self, self.SECONDARY_FORMATS_SCHEMA, []) frmt_json[self.STATUS_SCHEMA] = getattr(self, self.STATUS_SCHEMA, 'shared') # uuid, accession if specified diff --git a/pipeline_utils/pipeline_deploy.py b/pipeline_utils/pipeline_deploy.py index 0d4e5d2..fba7ef0 100644 --- a/pipeline_utils/pipeline_deploy.py +++ b/pipeline_utils/pipeline_deploy.py @@ -221,8 +221,8 @@ def _post_patch_file(self, type): # creating JSON object d_ = self._yaml_to_json( d, self.object_[type], - institution=self.institution, - project=self.project + submission_centers=self.submission_centers, + consortia=self.consortia ) # post/patch object if d_: self._post_patch_json(d_, type) @@ -250,8 +250,8 @@ def _post_patch_folder(self, type): # creating _yaml_to_json **kwargs kwargs_ = { 'version': self.version, - 'institution': self.institution, - 'project': self.project + 'submission_centers': self.submission_centers, + 'consortia': self.consortia } if type == 'Workflow': kwargs_.setdefault( diff --git a/pipeline_utils/schemas/yaml_file_format.py b/pipeline_utils/schemas/yaml_file_format.py index ec904e0..190f714 100644 --- a/pipeline_utils/schemas/yaml_file_format.py +++ b/pipeline_utils/schemas/yaml_file_format.py @@ -20,14 +20,14 @@ schema.DESCRIPTION: 'Extension of the FileFormat', schema.TYPE: schema.STRING }, - 'file_types': { - schema.DESCRIPTION: 'File types that can use the FileFormat', - schema.TYPE: schema.ARRAY, - schema.ITEMS: { - schema.TYPE: schema.STRING, - schema.PATTERN: 'FileReference|FileProcessed|FileSubmitted|FileFastq' - } - }, + # 'file_types': { + # schema.DESCRIPTION: 'File types that can use the FileFormat', + # schema.TYPE: schema.ARRAY, + # schema.ITEMS: { + # schema.TYPE: schema.STRING, + # schema.PATTERN: 'FileReference|FileProcessed|FileSubmitted|FileFastq' + # } + # }, 'status': { schema.TYPE: schema.STRING }, diff --git a/pipeline_utils/schemas/yaml_file_reference.py b/pipeline_utils/schemas/yaml_file_reference.py index 7641d5c..9fc750b 100644 --- a/pipeline_utils/schemas/yaml_file_reference.py +++ b/pipeline_utils/schemas/yaml_file_reference.py @@ -20,6 +20,14 @@ schema.DESCRIPTION: 'Format of the FileReference', schema.TYPE: schema.STRING }, + 'category': { + schema.DESCRIPTION: 'Category of the FileReference', + schema.TYPE: schema.STRING + }, + 'type': { + schema.DESCRIPTION: 'Type of the FileReference', + schema.TYPE: schema.STRING + }, 'version': { schema.DESCRIPTION: 'Version of the FileReference', schema.TYPE: 
schema.STRING @@ -41,5 +49,5 @@ schema.TYPE: schema.STRING } }, - schema.REQUIRED: ['name', 'description', 'format', 'version'] + schema.REQUIRED: ['name', 'description', 'format', 'category', 'type', 'version'] } diff --git a/pipeline_utils/schemas/yaml_metaworkflow.py b/pipeline_utils/schemas/yaml_metaworkflow.py index ab4ebaa..98d4733 100644 --- a/pipeline_utils/schemas/yaml_metaworkflow.py +++ b/pipeline_utils/schemas/yaml_metaworkflow.py @@ -22,6 +22,13 @@ schema.DESCRIPTION: 'Description of the MetaWorkflow', schema.TYPE: schema.STRING }, + 'category': { + schema.DESCRIPTION: 'Categories of the MetaWorkflow', + schema.TYPE: schema.ARRAY, + schema.ITEMS: { + schema.TYPE: schema.STRING + } + }, ## General input information ## 'input': { @@ -76,7 +83,7 @@ } } }, - schema.REQUIRED: ['name', 'description', 'input', 'workflows'], + schema.REQUIRED: ['name', 'description', 'category', 'input', 'workflows'], ## Sub-schemas #################### schema.DEFS: { @@ -87,7 +94,7 @@ schema.PROPERTIES: { 'argument_type': { schema.TYPE: schema.STRING, - schema.PATTERN: '^file\..+|^parameter\..+' + schema.PATTERN: '^file\\..+|^parameter\\..+' }, 'dimensionality': { schema.TYPE: schema.NUMBER @@ -96,7 +103,7 @@ schema.TYPE: schema.ARRAY, schema.ITEMS: { schema.TYPE: schema.STRING, - schema.PATTERN: '.+\@.+' # check for @ + schema.PATTERN: '.+\\@.+' # check for @ } }, 'source': { @@ -125,10 +132,30 @@ }, 'rename': { schema.TYPE: schema.STRING, - schema.PATTERN: '^formula\:.+' + schema.PATTERN: '^formula\\:.+' }, 'unzip': { schema.TYPE: schema.STRING + }, + 'qc_thresholds': { + schema.TYPE: schema.OBJECT, + schema.PATTERNPROPERTIES: { + '.+': { + schema.TYPE: schema.OBJECT, + schema.PROPERTIES: { + 'rule': { + schema.TYPE: schema.STRING, + schema.PATTERN: '^([^|]+\\|[^|]+\\|[^|]+\\|[^|]+)$' + }, + 'flag': { + schema.TYPE: schema.BOOLEAN + } + } + } + } + }, + 'qc_rule': { + schema.TYPE: schema.STRING } }, schema.REQUIRED: ['argument_type'] diff --git a/pipeline_utils/schemas/yaml_software.py b/pipeline_utils/schemas/yaml_software.py index 3afb69f..04eabfb 100644 --- a/pipeline_utils/schemas/yaml_software.py +++ b/pipeline_utils/schemas/yaml_software.py @@ -37,9 +37,13 @@ 'license': { schema.DESCRIPTION: 'License of the Software', schema.TYPE: schema.STRING + }, + 'category': { + schema.DESCRIPTION: 'Category of the Software', + schema.TYPE: schema.STRING } }, - schema.REQUIRED: ['name'], + schema.REQUIRED: ['name', 'category'], schema.ONEOF: [ {schema.REQUIRED: ['version']}, {schema.REQUIRED: ['commit']} diff --git a/pipeline_utils/schemas/yaml_workflow.py b/pipeline_utils/schemas/yaml_workflow.py index 3d7a19e..a9e151e 100644 --- a/pipeline_utils/schemas/yaml_workflow.py +++ b/pipeline_utils/schemas/yaml_workflow.py @@ -55,6 +55,10 @@ schema.PATTERN: '.+\@.+' # check for @ } }, + 'category': { + schema.DESCRIPTION: 'Category of the Workflow', + schema.TYPE: schema.STRING + }, ## Input information ########## 'input': { @@ -74,7 +78,7 @@ } } }, - schema.REQUIRED: ['name', 'description', 'runner', 'input', 'output'], + schema.REQUIRED: ['name', 'description', 'runner', 'category', 'input', 'output'], ## Sub-schemas #################### schema.DEFS: { diff --git a/pyproject.toml b/pyproject.toml index 54842e4..cad422b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,13 +1,13 @@ [tool.poetry] -name = "portal-pipeline-utils" +name = "smaht-pipeline-utils" version = "3.0.0" -description = "Utilities for deploying pipelines and interfacing with portal infrastructure." 
+description = "Utilities for deploying pipelines and interfacing with SMaHT portal infrastructure." authors = [ "Michele Berselli ", - "CGAP & SMaHT Team" + "SMaHT Team" ] license = "MIT" -repository = 'https://github.com/dbmi-bgm/portal-pipeline-utils' +repository = 'https://github.com/smaht-dac/smaht-pipeline-utils' readme = "README.md" classifiers = [ 'License :: OSI Approved :: MIT License', @@ -35,7 +35,7 @@ boto3-stubs = "^1.28.62" [tool.poetry.scripts] -pipeline_utils = "pipeline_utils.__main__:main" +smaht_pipeline_utils = "pipeline_utils.__main__:main" publish-to-pypi = "dcicutils.scripts.publish_to_pypi:main" diff --git a/tests/repo_correct/portal_objects/file_reference.yaml b/tests/repo_correct/portal_objects/file_reference.yaml index 2bc0101..daeec31 100644 --- a/tests/repo_correct/portal_objects/file_reference.yaml +++ b/tests/repo_correct/portal_objects/file_reference.yaml @@ -12,11 +12,15 @@ secondary_files: status: uploading uuid: 1936f246-22e1-45dc-bb5c-9cfd55537fe7 accession: GAPFIXRDPDK5 +category: Sequencing Reads +type: Unaligned Reads --- # hg38 fasta (MINIMAL) name: reference_genome +category: Sequencing Reads +type: Aligned Reads description: hg38 full reference genome plus decoy for CGAP, fasta format format: fa version: hg38 diff --git a/tests/repo_correct/portal_objects/metaworkflows/A_gatk-HC-GT.yaml b/tests/repo_correct/portal_objects/metaworkflows/A_gatk-HC-GT.yaml index 47d57a6..d48833f 100644 --- a/tests/repo_correct/portal_objects/metaworkflows/A_gatk-HC-GT.yaml +++ b/tests/repo_correct/portal_objects/metaworkflows/A_gatk-HC-GT.yaml @@ -18,6 +18,10 @@ input: argument_type: parameter.json value: ['SAMPLENAME'] +category: + - Alignment + - Format Conversion + workflows: ########################################## # gatk-HC diff --git a/tests/repo_correct/portal_objects/metaworkflows/B_minimal-gatk-HC-GT.yaml b/tests/repo_correct/portal_objects/metaworkflows/B_minimal-gatk-HC-GT.yaml index 22f312f..db788d3 100644 --- a/tests/repo_correct/portal_objects/metaworkflows/B_minimal-gatk-HC-GT.yaml +++ b/tests/repo_correct/portal_objects/metaworkflows/B_minimal-gatk-HC-GT.yaml @@ -7,6 +7,9 @@ description: Pipeline to run gatk-HC to call variants uuid: 1936f246-22e1-45dc-bb5c-9cfd55537fe9 accession: GAPFIXRDPDK1 +category: + - Variant Calling + input: input_vcf: argument_type: file.vcf diff --git a/tests/repo_correct/portal_objects/metaworkflows/QC_test.yaml b/tests/repo_correct/portal_objects/metaworkflows/QC_test.yaml new file mode 100644 index 0000000..df26442 --- /dev/null +++ b/tests/repo_correct/portal_objects/metaworkflows/QC_test.yaml @@ -0,0 +1,68 @@ + +# gatk-HC-GT-pipeline (MINIMAL) +# + uuid +# + accession +name: gatk-HC-pipeline +description: Pipeline to run gatk-HC to call variants +uuid: 1936f246-22e1-45dc-bb5c-9cfd55537fe9 +accession: GAPFIXRDPDK1 + +category: + - Variant Calling + +input: + input_vcf: + argument_type: file.vcf + + reference: + argument_type: file.fa + files: + - reference_genome@hg38 + - reference_bam@hg38 + + samples: + argument_type: parameter.json + + qc_ruleset_name_1: + argument_type: parameter.qc_ruleset + qc_thresholds: + c1: + rule: coverage|>=|100|80 + flag: True + c2: + rule: coverage|<=|200|180 + c3: + rule: coverage|>|80|3.3 + rl: + rule: read_length|==|PASS|NOT PASS + flag: True + qc_rule: ( {c1} and {c2} ) or not ( {c3} and {rl} ) + +workflows: + ########################################## + # gatk-HC + ########################################## + gatk-HC: + # gatk-HC input + input: + vcf: + argument_type: file.vcf + 
source_argument_name: input_vcf + + reference: + argument_type: file.fa + + samples: + argument_type: parameter.json + + qc_ruleset: + argument_type: parameter.qc_ruleset + source_argument_name: qc_ruleset_name_1 + # gatk-HC output + output: + HC_vcf: + file_type: hc-vcf + # gatk-HC config + config: + ebs_size: 2x + ec2_type: m.5xlarge diff --git a/tests/repo_correct/portal_objects/software.yaml b/tests/repo_correct/portal_objects/software.yaml index 9a0463b..0a20ca5 100644 --- a/tests/repo_correct/portal_objects/software.yaml +++ b/tests/repo_correct/portal_objects/software.yaml @@ -5,6 +5,7 @@ version: 4.1.2 title: gatk 4.1.2 source_url: 'http:/broad' description: gatk software package +category: Aligner --- @@ -12,6 +13,7 @@ description: gatk software package # + uuid # + accession name: picard +category: Variant Caller commit: 324ePT uuid: efdac7ec-7da3-4f23-9056-7a04abbc5e8b accession: GAPMKF1LL29K diff --git a/tests/repo_correct/portal_objects/workflows/A_gatk-HC.yaml b/tests/repo_correct/portal_objects/workflows/A_gatk-HC.yaml index 6c06db7..527beeb 100644 --- a/tests/repo_correct/portal_objects/workflows/A_gatk-HC.yaml +++ b/tests/repo_correct/portal_objects/workflows/A_gatk-HC.yaml @@ -3,6 +3,8 @@ name: gatk-HaplotypeCaller description: Run HaplotypeCaller from gatk package +category: Annotation + runner: language: wdl main: workflow_gatk-HaplotypeCaller-check.wdl diff --git a/tests/repo_correct/portal_objects/workflows/B_minimal-gatk-HC.yaml b/tests/repo_correct/portal_objects/workflows/B_minimal-gatk-HC.yaml index 7885f67..3aead30 100644 --- a/tests/repo_correct/portal_objects/workflows/B_minimal-gatk-HC.yaml +++ b/tests/repo_correct/portal_objects/workflows/B_minimal-gatk-HC.yaml @@ -19,3 +19,5 @@ output: uuid: 1936f246-22e1-45dc-bb5c-9cfd55537fe9 accession: GAPFIXRDPDK1 + +category: Feature Calling diff --git a/tests/repo_error/portal_objects/file_reference.yaml b/tests/repo_error/portal_objects/file_reference.yaml index cb47497..0748dd5 100644 --- a/tests/repo_error/portal_objects/file_reference.yaml +++ b/tests/repo_error/portal_objects/file_reference.yaml @@ -12,11 +12,15 @@ secondary_files: status: uploading uuid: 1936f246-22e1-45dc-bb5c-9cfd55537fe7 accession: GAPFIXRDPDK5 +category: Sequencing Reads +type: Unaligned Reads --- # hg38 fasta (MINIMAL) name: reference_genome +category: Sequencing Reads +type: Aligned Reads description: hg38 full reference genome plus decoy for CGAP, fasta format format: fa version: hg38 diff --git a/tests/repo_error/portal_objects/metaworkflows/A_gatk-HC-GT.yaml b/tests/repo_error/portal_objects/metaworkflows/A_gatk-HC-GT.yaml index 1cf0506..f84a8b8 100644 --- a/tests/repo_error/portal_objects/metaworkflows/A_gatk-HC-GT.yaml +++ b/tests/repo_error/portal_objects/metaworkflows/A_gatk-HC-GT.yaml @@ -19,6 +19,10 @@ input: argument_type: parameter.json value: ['SAMPLENAME'] +category: + - Alignment + - Format Conversion + workflows: ########################################## # gatk-HC diff --git a/tests/repo_error/portal_objects/metaworkflows/B_minimal-gatk-HC-GT.yaml b/tests/repo_error/portal_objects/metaworkflows/B_minimal-gatk-HC-GT.yaml index 22f312f..db788d3 100644 --- a/tests/repo_error/portal_objects/metaworkflows/B_minimal-gatk-HC-GT.yaml +++ b/tests/repo_error/portal_objects/metaworkflows/B_minimal-gatk-HC-GT.yaml @@ -7,6 +7,9 @@ description: Pipeline to run gatk-HC to call variants uuid: 1936f246-22e1-45dc-bb5c-9cfd55537fe9 accession: GAPFIXRDPDK1 +category: + - Variant Calling + input: input_vcf: argument_type: file.vcf diff --git 
a/tests/repo_error/portal_objects/software.yaml b/tests/repo_error/portal_objects/software.yaml index 0282749..306e926 100644 --- a/tests/repo_error/portal_objects/software.yaml +++ b/tests/repo_error/portal_objects/software.yaml @@ -5,6 +5,7 @@ name: gatk title: gatk 4.1.2 source_url: 'http:/broad' description: gatk software package +category: Aligner --- @@ -12,6 +13,7 @@ description: gatk software package # + uuid # + accession name: picard +category: Variant Caller commit: 324ePT uuid: efdac7ec-7da3-4f23-9056-7a04abbc5e8b accession: GAPMKF1LL29K diff --git a/tests/repo_error/portal_objects/workflows/A_gatk-HC.yaml b/tests/repo_error/portal_objects/workflows/A_gatk-HC.yaml index 6c06db7..527beeb 100644 --- a/tests/repo_error/portal_objects/workflows/A_gatk-HC.yaml +++ b/tests/repo_error/portal_objects/workflows/A_gatk-HC.yaml @@ -3,6 +3,8 @@ name: gatk-HaplotypeCaller description: Run HaplotypeCaller from gatk package +category: Annotation + runner: language: wdl main: workflow_gatk-HaplotypeCaller-check.wdl diff --git a/tests/repo_error/portal_objects/workflows/B_minimal-gatk-HC.yaml b/tests/repo_error/portal_objects/workflows/B_minimal-gatk-HC.yaml index fc0d14f..ba07ef7 100644 --- a/tests/repo_error/portal_objects/workflows/B_minimal-gatk-HC.yaml +++ b/tests/repo_error/portal_objects/workflows/B_minimal-gatk-HC.yaml @@ -20,3 +20,5 @@ output: uuid: 1936f246-22e1-45dc-bb5c-9cfd55537fe9 accession: GAPFIXRDPDK1 + +category: Feature Calling diff --git a/tests/test_yaml_file_format.py b/tests/test_yaml_file_format.py index e6a8ba7..bf33304 100644 --- a/tests/test_yaml_file_format.py +++ b/tests/test_yaml_file_format.py @@ -15,25 +15,25 @@ def test_file_format(): { "aliases": ["cgap-core:FileFormat-bam"], "description": "format to represent aligned reads", - "extrafile_formats": ["bai"], - "file_format": "bam", - "institution": "/institutions/hms-dbmi/", - "project": "/projects/cgap-core/", + "extra_file_formats": ["bai"], + "identifier": "bam", + "submission_centers": ["hms-dbmi"], + "consortia": ["cgap-core"], "standard_file_extension": "bam", - "status": "shared", - "valid_item_types": ["FileReference", "FileProcessed"] + "status": "shared" + # "valid_item_types": ["FileReference", "FileProcessed"] }, { "accession": 'GAPFIXRDPDK1', "aliases": ["cgap-core:FileFormat-bam_bai"], "description": "index for bam format", - "extrafile_formats": [], - "file_format": "bam_bai", - "institution": "/institutions/hms-dbmi/", - "project": "/projects/cgap-core/", + "extra_file_formats": [], + "identifier": "bam_bai", + "submission_centers": ["hms-dbmi"], + "consortia": ["cgap-core"], "standard_file_extension": "bam.bai", "status": "shared", - "valid_item_types": ["FileReference", "FileProcessed"], + # "valid_item_types": ["FileReference", "FileProcessed"], "uuid": '1936f246-22e1-45dc-bb5c-9cfd55537fe9' } ] @@ -41,8 +41,8 @@ def test_file_format(): for i, d in enumerate(yaml_parser.load_yaml('tests/repo_correct/portal_objects/file_format.yaml')): # creating JSON object d_ = yaml_parser.YAMLFileFormat(d).to_json( - institution='hms-dbmi', - project='cgap-core' + submission_centers=["hms-dbmi"], + consortia=["cgap-core"] ) # check assert d_ == res[i] @@ -55,8 +55,8 @@ def test_file_format_error(): try: # creating JSON object d_ = yaml_parser.YAMLFileFormat(d).to_json( - institution='hms-dbmi', - project='cgap-core' + submission_centers=["hms-dbmi"], + consortia=["cgap-core"] ) except yaml_parser.ValidationError as e: pass diff --git a/tests/test_yaml_file_reference.py b/tests/test_yaml_file_reference.py 
index 797a1ce..f5c0ea0 100644 --- a/tests/test_yaml_file_reference.py +++ b/tests/test_yaml_file_reference.py @@ -18,27 +18,31 @@ def test_file_reference(): "description": "hg38 full reference genome plus decoy for CGAP, fasta format", "extra_files": ["fa_fai", "dict"], "file_format": "fa", - "institution": "/institutions/hms-dbmi/", - "project": "/projects/cgap-core/", + "submission_centers": ["hms-dbmi"], + "consortia": ["cgap-core"], "status": "uploading", - "uuid": "1936f246-22e1-45dc-bb5c-9cfd55537fe7" + "uuid": "1936f246-22e1-45dc-bb5c-9cfd55537fe7", + "data_category": "Sequencing Reads", + "data_type": "Unaligned Reads" }, { "aliases": ["cgap-core:FileReference-reference_genome_hg38"], "description": "hg38 full reference genome plus decoy for CGAP, fasta format", "extra_files": [], "file_format": "fa", - "institution": "/institutions/hms-dbmi/", - "project": "/projects/cgap-core/", - "status": None + "submission_centers": ["hms-dbmi"], + "consortia": ["cgap-core"], + "status": None, + "data_category": "Sequencing Reads", + "data_type": "Aligned Reads" } ] for i, d in enumerate(yaml_parser.load_yaml('tests/repo_correct/portal_objects/file_reference.yaml')): # creating JSON object d_ = yaml_parser.YAMLFileReference(d).to_json( - institution='hms-dbmi', - project='cgap-core' + submission_centers=["hms-dbmi"], + consortia=["cgap-core"] ) # check assert d_ == res[i] @@ -51,8 +55,8 @@ def test_file_reference_error(): try: # creating JSON object d_ = yaml_parser.YAMLFileReference(d).to_json( - institution='hms-dbmi', - project='cgap-core' + submission_centers=["hms-dbmi"], + consortia=["cgap-core"] ) except yaml_parser.ValidationError as e: pass diff --git a/tests/test_yaml_metaworkflow.py b/tests/test_yaml_metaworkflow.py index e390579..fd4578e 100644 --- a/tests/test_yaml_metaworkflow.py +++ b/tests/test_yaml_metaworkflow.py @@ -16,6 +16,7 @@ def test_metaworkflow(): { "aliases": ["cgap-core:MetaWorkflow-gatk-HC-GT-pipeline_v1.0.0"], "description": "Pipeline to run gatk-HC and gatk-GT to call and genotype variants", + "category": ["Alignment", "Format Conversion"], "input": [ { "argument_name": "input_vcf", @@ -34,9 +35,9 @@ def test_metaworkflow(): "value_type": "json" } ], - "institution": "/institutions/hms-dbmi/", + "submission_centers": ["hms-dbmi", "smaht-dbmi"], "name": "gatk-HC-GT-pipeline", - "project": "/projects/cgap-core/", + "consortia": ["cgap-core"], "title": "gatk-HC and gatk-GT pipeline [v1.0.0]", "version": "v1.0.0", "workflows": [ @@ -115,7 +116,8 @@ def test_metaworkflow(): }, { "accession": "GAPFIXRDPDK1", - "aliases": ["cgap-core:MetaWorkflow-gatk-HC-pipeline_v1.0.0"], + "category": ["Variant Calling"], + "aliases": ["cgap-core_cgap-test:MetaWorkflow-gatk-HC-pipeline_v1.0.0"], "description": "Pipeline to run gatk-HC to call variants", "input": [ { @@ -125,8 +127,8 @@ def test_metaworkflow(): { "argument_name": "reference", "argument_type": "file", - "files": [{"dimension": "0", "file": "cgap-core:FileReference-reference_genome_hg38"}, - {"dimension": "1", "file": "cgap-core:FileReference-reference_bam_hg38"}] + "files": [{"dimension": "0", "file": "cgap-core_cgap-test:FileReference-reference_genome_hg38"}, + {"dimension": "1", "file": "cgap-core_cgap-test:FileReference-reference_bam_hg38"}] }, { "argument_name": "samples", @@ -134,9 +136,9 @@ def test_metaworkflow(): "value_type": "json" } ], - "institution": "/institutions/hms-dbmi/", + "submission_centers": ["hms-dbmi"], "name": "gatk-HC-pipeline", - "project": "/projects/cgap-core/", + "consortia": ["cgap-test", 
"cgap-core"], "title": "gatk-HC-pipeline [v1.0.0]", "uuid": "1936f246-22e1-45dc-bb5c-9cfd55537fe9", "version": "v1.0.0", @@ -168,7 +170,7 @@ def test_metaworkflow(): } ], "name": "gatk-HC", - "workflow": "cgap-core:Workflow-gatk-HC_v1.0.0" + "workflow": "cgap-core_cgap-test:Workflow-gatk-HC_v1.0.0" } ] } @@ -176,8 +178,8 @@ def test_metaworkflow(): for d in yaml_parser.load_yaml('tests/repo_correct/portal_objects/metaworkflows/A_gatk-HC-GT.yaml'): d_ = yaml_parser.YAMLMetaWorkflow(d).to_json( - institution='hms-dbmi', - project='cgap-core', + submission_centers=["hms-dbmi", "smaht-dbmi"], + consortia=["cgap-core"], version='v1.0.0' ) # check @@ -185,13 +187,131 @@ def test_metaworkflow(): for d in yaml_parser.load_yaml('tests/repo_correct/portal_objects/metaworkflows/B_minimal-gatk-HC-GT.yaml'): d_ = yaml_parser.YAMLMetaWorkflow(d).to_json( - institution='hms-dbmi', - project='cgap-core', + submission_centers=["hms-dbmi"], + consortia=["cgap-test", "cgap-core"], version='v1.0.0' ) # check assert d_ == res[1] +def test_qc_ruleset(): + """ + """ + res = { + "accession": "GAPFIXRDPDK1", + "category": ["Variant Calling"], + "aliases": ["cgap-core_cgap-test:MetaWorkflow-gatk-HC-pipeline_v1.0.0"], + "description": "Pipeline to run gatk-HC to call variants", + "input": [ + { + "argument_name": "input_vcf", + "argument_type": "file" + }, + { + "argument_name": "reference", + "argument_type": "file", + "files": [{"dimension": "0", "file": "cgap-core_cgap-test:FileReference-reference_genome_hg38"}, + {"dimension": "1", "file": "cgap-core_cgap-test:FileReference-reference_bam_hg38"}] + }, + { + "argument_name": "samples", + "argument_type": "parameter", + "value_type": "json" + }, + { + "argument_name": "qc_ruleset_name_1", + "argument_type": "parameter", + "value_type": "qc_ruleset", + "value": { + "qc_thresholds": [ + { + "id": "c1", + "metric": "coverage", + "operator": ">=", + "pass_target": 100.0, + "warn_target": 80.0, + "use_as_qc_flag": True + }, + { + "id": "c2", + "metric": "coverage", + "operator": "<=", + "pass_target": 200.0, + "warn_target": 180.0, + }, + { + "id": "c3", + "metric": "coverage", + "operator": ">", + "pass_target": 80.0, + "warn_target": 3.3 + }, + { + "id": "rl", + "metric": "read_length", + "operator": "==", + "pass_target": "PASS", + "warn_target": "NOT PASS", + "use_as_qc_flag": True + } + ], + "overall_quality_status_rule": "( {c1} and {c2} ) or not ( {c3} and {rl} )" + } + } + ], + "submission_centers": ["hms-dbmi"], + "name": "gatk-HC-pipeline", + "consortia": ["cgap-test", "cgap-core"], + "title": "gatk-HC-pipeline [v1.0.0]", + "uuid": "1936f246-22e1-45dc-bb5c-9cfd55537fe9", + "version": "v1.0.0", + "workflows": [ + { + "config": { + "ebs_size": "2x", + "ec2_type": "m.5xlarge" + }, + "custom_pf_fields": { + "HC_vcf": { + "file_type": "hc-vcf" + } + }, + "input": [ + { + "argument_name": "vcf", + "argument_type": "file", + "source_argument_name": "input_vcf" + }, + { + "argument_name": "reference", + "argument_type": "file" + }, + { + "argument_name": "samples", + "argument_type": "parameter", + "value_type": "json" + }, + { + 'argument_name': 'qc_ruleset', + 'argument_type': 'parameter', + 'value_type': 'qc_ruleset', + 'source_argument_name': 'qc_ruleset_name_1' + } + ], + "name": "gatk-HC", + "workflow": "cgap-core_cgap-test:Workflow-gatk-HC_v1.0.0" + } + ] + } + + for d in yaml_parser.load_yaml('tests/repo_correct/portal_objects/metaworkflows/QC_test.yaml'): + d_ = yaml_parser.YAMLMetaWorkflow(d).to_json( + submission_centers=["hms-dbmi"], + 
consortia=["cgap-test", "cgap-core"], + version='v1.0.0' + ) + # check + assert d_ == res def test_metaworkflow_error(): """ @@ -201,8 +321,8 @@ def test_metaworkflow_error(): for d in yaml_parser.load_yaml(fn): try: d_ = yaml_parser.YAMLMetaWorkflow(d).to_json( - institution='hms-dbmi', - project='cgap-core', + submission_centers=["hms-dbmi"], + consortia=["cgap-core"], version='v1.0.0' ) except yaml_parser.ValidationError as e: diff --git a/tests/test_yaml_software.py b/tests/test_yaml_software.py index acedf61..1f0204d 100644 --- a/tests/test_yaml_software.py +++ b/tests/test_yaml_software.py @@ -15,30 +15,32 @@ def test_software(): { "aliases": ["cgap-core:Software-gatk_4.1.2"], "description": "gatk software package", - "institution": "/institutions/hms-dbmi/", + "submission_centers": ["hms-dbmi"], "name": "gatk", - "project": "/projects/cgap-core/", + "consortia": ["cgap-core"], "source_url": "http:/broad", "title": "gatk 4.1.2", - "version": "4.1.2" + "version": "4.1.2", + "category": "Aligner" }, { "accession": "GAPMKF1LL29K", "aliases": ["cgap-core:Software-picard_324ePT"], "commit": "324ePT", - "institution": "/institutions/hms-dbmi/", + "submission_centers": ["hms-dbmi"], "name": "picard", - "project": "/projects/cgap-core/", + "consortia": ["cgap-core"], "title": "picard [324ePT]", - "uuid": "efdac7ec-7da3-4f23-9056-7a04abbc5e8b" + "uuid": "efdac7ec-7da3-4f23-9056-7a04abbc5e8b", + "category": "Variant Caller" } ] for i, d in enumerate(yaml_parser.load_yaml('tests/repo_correct/portal_objects/software.yaml')): # creating JSON object d_ = yaml_parser.YAMLSoftware(d).to_json( - institution='hms-dbmi', - project='cgap-core' + submission_centers=["hms-dbmi"], + consortia=["cgap-core"] ) # check assert d_ == res[i] @@ -51,8 +53,8 @@ def test_software_error(): try: # creating JSON object d_ = yaml_parser.YAMLSoftware(d).to_json( - institution='hms-dbmi', - project='cgap-core' + submission_centers=["hms-dbmi"], + consortia=["cgap-core"] ) except yaml_parser.ValidationError as e: pass diff --git a/tests/test_yaml_workflow.py b/tests/test_yaml_workflow.py index fcb79d5..4f31e77 100644 --- a/tests/test_yaml_workflow.py +++ b/tests/test_yaml_workflow.py @@ -15,8 +15,9 @@ def test_workflow(): res = [ { "aliases": ["cgap-core:Workflow-gatk-HaplotypeCaller_v1.0.0"], - "app_name": "gatk-HaplotypeCaller", - "app_version": "v1.0.0", + "name": "gatk-HaplotypeCaller", + "version": "v1.0.0", + "category": "Annotation", "arguments": [ { "argument_format": "bam", @@ -47,27 +48,27 @@ def test_workflow(): } ], "description": "Run HaplotypeCaller from gatk package", - "institution": "/institutions/hms-dbmi/", - "name": "gatk-HaplotypeCaller_v1.0.0", - "project": "/projects/cgap-core/", + "submission_centers": ["hms-dbmi"], + "consortia": ["cgap-core"], "software": [ "cgap-core:Software-gatk_4.2.1", "cgap-core:Software-vcf-tools_5A63Aa1" ], "title": "HaplotypeCaller plus integity-check [v1.0.0]", - "wdl_child_filenames": [ + "child_file_names": [ "gatk-HaplotypeCaller.wdl", "integrity-check.wdl" ], - "wdl_directory_url": "s3://BUCKETCWL/test_pipeline/v1.0.0", - "wdl_main_filename": "workflow_gatk-HaplotypeCaller-check.wdl", - "workflow_language": "wdl" + "directory_url": "s3://BUCKETCWL/test_pipeline/v1.0.0", + "main_file_name": "workflow_gatk-HaplotypeCaller-check.wdl", + "language": "WDL" }, { "accession": "GAPFIXRDPDK1", + "category": "Feature Calling", "aliases": ["cgap-core:Workflow-gatk-HaplotypeCaller_v1.0.0"], - "app_name": "gatk-HaplotypeCaller", - "app_version": "v1.0.0", + "name": 
"gatk-HaplotypeCaller", + "version": "v1.0.0", "arguments": [ { "argument_format": "bam", @@ -82,22 +83,22 @@ def test_workflow(): } ], "description": "Run HaplotypeCaller from gatk package", - "institution": "/institutions/hms-dbmi/", - "name": "gatk-HaplotypeCaller_v1.0.0", - "project": "/projects/cgap-core/", + "submission_centers": ["hms-dbmi"], + "consortia": ["cgap-core"], "software": [], "title": "gatk-HaplotypeCaller [v1.0.0]", - "cwl_child_filenames": [], - "cwl_directory_url_v1": "s3://BUCKETCWL/test_pipeline/v1.0.0", - "cwl_main_filename": "gatk-HaplotypeCaller-check.cwl", - "uuid": "1936f246-22e1-45dc-bb5c-9cfd55537fe9" + "child_file_names": [], + "directory_url": "s3://BUCKETCWL/test_pipeline/v1.0.0", + "main_file_name": "gatk-HaplotypeCaller-check.cwl", + "uuid": "1936f246-22e1-45dc-bb5c-9cfd55537fe9", + "language": "CWL" } ] for d in yaml_parser.load_yaml('tests/repo_correct/portal_objects/workflows/A_gatk-HC.yaml'): d_ = yaml_parser.YAMLWorkflow(d).to_json( - institution='hms-dbmi', - project='cgap-core', + submission_centers=["hms-dbmi"], + consortia=["cgap-core"], version='v1.0.0', wflbucket_url='s3://BUCKETCWL/test_pipeline/v1.0.0' ) @@ -106,8 +107,8 @@ def test_workflow(): for d in yaml_parser.load_yaml('tests/repo_correct/portal_objects/workflows/B_minimal-gatk-HC.yaml'): d_ = yaml_parser.YAMLWorkflow(d).to_json( - institution='hms-dbmi', - project='cgap-core', + submission_centers=["hms-dbmi"], + consortia=["cgap-core"], version='v1.0.0', wflbucket_url='s3://BUCKETCWL/test_pipeline/v1.0.0' ) @@ -123,8 +124,8 @@ def test_workflow_error(): for d in yaml_parser.load_yaml(fn): try: d_ = yaml_parser.YAMLWorkflow(d).to_json( - institution='hms-dbmi', - project='cgap-core', + submission_centers=["hms-dbmi"], + consortia=["cgap-core"], version='v1.0.0', wflbucket_url='s3://BUCKETCWL/test_pipeline/v1.0.0' ) From 3fc75e5f11aea4cb196747ec3823c72961bff153 Mon Sep 17 00:00:00 2001 From: Michele Berselli Date: Tue, 31 Oct 2023 10:46:53 -0400 Subject: [PATCH 02/18] category to array for all objects --- pipeline_utils/schemas/yaml_file_reference.py | 7 +++++-- pipeline_utils/schemas/yaml_software.py | 7 +++++-- pipeline_utils/schemas/yaml_workflow.py | 7 +++++-- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/pipeline_utils/schemas/yaml_file_reference.py b/pipeline_utils/schemas/yaml_file_reference.py index 9fc750b..a22b9d9 100644 --- a/pipeline_utils/schemas/yaml_file_reference.py +++ b/pipeline_utils/schemas/yaml_file_reference.py @@ -21,8 +21,11 @@ schema.TYPE: schema.STRING }, 'category': { - schema.DESCRIPTION: 'Category of the FileReference', - schema.TYPE: schema.STRING + schema.DESCRIPTION: 'Categories of the FileReference', + schema.TYPE: schema.ARRAY, + schema.ITEMS: { + schema.TYPE: schema.STRING + } }, 'type': { schema.DESCRIPTION: 'Type of the FileReference', diff --git a/pipeline_utils/schemas/yaml_software.py b/pipeline_utils/schemas/yaml_software.py index 04eabfb..17c44d4 100644 --- a/pipeline_utils/schemas/yaml_software.py +++ b/pipeline_utils/schemas/yaml_software.py @@ -39,8 +39,11 @@ schema.TYPE: schema.STRING }, 'category': { - schema.DESCRIPTION: 'Category of the Software', - schema.TYPE: schema.STRING + schema.DESCRIPTION: 'Categories of the Software', + schema.TYPE: schema.ARRAY, + schema.ITEMS: { + schema.TYPE: schema.STRING + } } }, schema.REQUIRED: ['name', 'category'], diff --git a/pipeline_utils/schemas/yaml_workflow.py b/pipeline_utils/schemas/yaml_workflow.py index a9e151e..2ca14f9 100644 --- a/pipeline_utils/schemas/yaml_workflow.py +++ 
b/pipeline_utils/schemas/yaml_workflow.py @@ -56,8 +56,11 @@ } }, 'category': { - schema.DESCRIPTION: 'Category of the Workflow', - schema.TYPE: schema.STRING + schema.DESCRIPTION: 'Categories of the Workflow', + schema.TYPE: schema.ARRAY, + schema.ITEMS: { + schema.TYPE: schema.STRING + } }, ## Input information ########## From fd77a9baf41d8d15dfd14eec6eaabbf944fcea6b Mon Sep 17 00:00:00 2001 From: Michele Berselli Date: Tue, 31 Oct 2023 11:13:41 -0400 Subject: [PATCH 03/18] updated tests to use list for all objects category and type fields --- pipeline_utils/__main__.py | 8 ++++---- pipeline_utils/schemas/yaml_file_reference.py | 7 +++++-- tests/repo_correct/portal_objects/file_format.yaml | 6 +++--- .../repo_correct/portal_objects/file_reference.yaml | 12 ++++++++---- tests/repo_correct/portal_objects/software.yaml | 6 ++++-- .../portal_objects/workflows/A_gatk-HC.yaml | 3 ++- .../portal_objects/workflows/B_minimal-gatk-HC.yaml | 3 ++- tests/repo_error/portal_objects/file_format.yaml | 6 +++--- tests/repo_error/portal_objects/file_reference.yaml | 12 ++++++++---- tests/repo_error/portal_objects/software.yaml | 6 ++++-- .../portal_objects/workflows/A_gatk-HC.yaml | 3 ++- .../portal_objects/workflows/B_minimal-gatk-HC.yaml | 3 ++- tests/test_yaml_file_reference.py | 8 ++++---- tests/test_yaml_software.py | 4 ++-- tests/test_yaml_workflow.py | 4 ++-- 15 files changed, 55 insertions(+), 36 deletions(-) diff --git a/pipeline_utils/__main__.py b/pipeline_utils/__main__.py index c3ea156..7c04d94 100644 --- a/pipeline_utils/__main__.py +++ b/pipeline_utils/__main__.py @@ -20,8 +20,8 @@ # Variables PIPELINE_DEPLOY = 'pipeline_deploy' -CONSORTIA_ALIAS = 'CONSORTIUM' -SUBMISSION_CENTERS_ALIAS = 'SUBMISSION_CENTER' +CONSORTIA_ALIAS = ['smaht_consortium'] +SUBMISSION_CENTERS_ALIAS = ['smaht_dac'] KEYS_ALIAS = '~/.cgap-keys.json' MAIN_ALIAS = 'main' BUILDER_ALIAS = '-pipeline-builder' @@ -51,9 +51,9 @@ def main(args=None): pipeline_deploy_parser.add_argument('--account', required=False, help='AWS account to use for deployment') pipeline_deploy_parser.add_argument('--region', required=False, help='AWS account region to use for deployment') pipeline_deploy_parser.add_argument('--consortia', required=False, nargs='+', help='List of consortia to use for deployment', - default=[CONSORTIA_ALIAS]) + default=CONSORTIA_ALIAS) pipeline_deploy_parser.add_argument('--submission-centers', required=False, nargs='+', help='List of centers to use for deployment', - default=[SUBMISSION_CENTERS_ALIAS]) + default=SUBMISSION_CENTERS_ALIAS) pipeline_deploy_parser.add_argument('--post-software', action='store_true', help='POST|PATCH Software objects') pipeline_deploy_parser.add_argument('--post-file-format', action='store_true', help='POST|PATCH FileFormat objects') diff --git a/pipeline_utils/schemas/yaml_file_reference.py b/pipeline_utils/schemas/yaml_file_reference.py index a22b9d9..a01758d 100644 --- a/pipeline_utils/schemas/yaml_file_reference.py +++ b/pipeline_utils/schemas/yaml_file_reference.py @@ -28,8 +28,11 @@ } }, 'type': { - schema.DESCRIPTION: 'Type of the FileReference', - schema.TYPE: schema.STRING + schema.DESCRIPTION: 'Types of the FileReference', + schema.TYPE: schema.ARRAY, + schema.ITEMS: { + schema.TYPE: schema.STRING + } }, 'version': { schema.DESCRIPTION: 'Version of the FileReference', diff --git a/tests/repo_correct/portal_objects/file_format.yaml b/tests/repo_correct/portal_objects/file_format.yaml index 52bb0df..81408ba 100644 --- a/tests/repo_correct/portal_objects/file_format.yaml +++ 
b/tests/repo_correct/portal_objects/file_format.yaml @@ -5,9 +5,9 @@ extension: bam description: format to represent aligned reads secondary_formats: - bai -file_types: - - FileReference - - FileProcessed +# file_types: +# - FileReference +# - FileProcessed status: shared --- diff --git a/tests/repo_correct/portal_objects/file_reference.yaml b/tests/repo_correct/portal_objects/file_reference.yaml index daeec31..c5ca6b1 100644 --- a/tests/repo_correct/portal_objects/file_reference.yaml +++ b/tests/repo_correct/portal_objects/file_reference.yaml @@ -12,15 +12,19 @@ secondary_files: status: uploading uuid: 1936f246-22e1-45dc-bb5c-9cfd55537fe7 accession: GAPFIXRDPDK5 -category: Sequencing Reads -type: Unaligned Reads +category: + - Sequencing Reads +type: + - Unaligned Reads --- # hg38 fasta (MINIMAL) name: reference_genome -category: Sequencing Reads -type: Aligned Reads +category: + - Sequencing Reads +type: + - Aligned Reads description: hg38 full reference genome plus decoy for CGAP, fasta format format: fa version: hg38 diff --git a/tests/repo_correct/portal_objects/software.yaml b/tests/repo_correct/portal_objects/software.yaml index 0a20ca5..4ea9c7d 100644 --- a/tests/repo_correct/portal_objects/software.yaml +++ b/tests/repo_correct/portal_objects/software.yaml @@ -5,7 +5,8 @@ version: 4.1.2 title: gatk 4.1.2 source_url: 'http:/broad' description: gatk software package -category: Aligner +category: + - Aligner --- @@ -13,7 +14,8 @@ category: Aligner # + uuid # + accession name: picard -category: Variant Caller +category: + - Variant Caller commit: 324ePT uuid: efdac7ec-7da3-4f23-9056-7a04abbc5e8b accession: GAPMKF1LL29K diff --git a/tests/repo_correct/portal_objects/workflows/A_gatk-HC.yaml b/tests/repo_correct/portal_objects/workflows/A_gatk-HC.yaml index 527beeb..5037466 100644 --- a/tests/repo_correct/portal_objects/workflows/A_gatk-HC.yaml +++ b/tests/repo_correct/portal_objects/workflows/A_gatk-HC.yaml @@ -3,7 +3,8 @@ name: gatk-HaplotypeCaller description: Run HaplotypeCaller from gatk package -category: Annotation +category: + - Annotation runner: language: wdl diff --git a/tests/repo_correct/portal_objects/workflows/B_minimal-gatk-HC.yaml b/tests/repo_correct/portal_objects/workflows/B_minimal-gatk-HC.yaml index 3aead30..9d382d4 100644 --- a/tests/repo_correct/portal_objects/workflows/B_minimal-gatk-HC.yaml +++ b/tests/repo_correct/portal_objects/workflows/B_minimal-gatk-HC.yaml @@ -20,4 +20,5 @@ output: uuid: 1936f246-22e1-45dc-bb5c-9cfd55537fe9 accession: GAPFIXRDPDK1 -category: Feature Calling +category: + - Feature Calling diff --git a/tests/repo_error/portal_objects/file_format.yaml b/tests/repo_error/portal_objects/file_format.yaml index 03ee21c..0aa0aed 100644 --- a/tests/repo_error/portal_objects/file_format.yaml +++ b/tests/repo_error/portal_objects/file_format.yaml @@ -6,9 +6,9 @@ extension: bam description: format to represent aligned reads secondary_formats: - bai -file_types: - - FileReference - - FileProcessed +# file_types: +# - FileReference +# - FileProcessed status: shared --- diff --git a/tests/repo_error/portal_objects/file_reference.yaml b/tests/repo_error/portal_objects/file_reference.yaml index 0748dd5..d910dee 100644 --- a/tests/repo_error/portal_objects/file_reference.yaml +++ b/tests/repo_error/portal_objects/file_reference.yaml @@ -12,15 +12,19 @@ secondary_files: status: uploading uuid: 1936f246-22e1-45dc-bb5c-9cfd55537fe7 accession: GAPFIXRDPDK5 -category: Sequencing Reads -type: Unaligned Reads +category: + - Sequencing Reads +type: + - Unaligned 
Reads --- # hg38 fasta (MINIMAL) name: reference_genome -category: Sequencing Reads -type: Aligned Reads +category: + - Sequencing Reads +type: + - Aligned Reads description: hg38 full reference genome plus decoy for CGAP, fasta format format: fa version: hg38 diff --git a/tests/repo_error/portal_objects/software.yaml b/tests/repo_error/portal_objects/software.yaml index 306e926..729f530 100644 --- a/tests/repo_error/portal_objects/software.yaml +++ b/tests/repo_error/portal_objects/software.yaml @@ -5,7 +5,8 @@ name: gatk title: gatk 4.1.2 source_url: 'http:/broad' description: gatk software package -category: Aligner +category: + - Aligner --- @@ -13,7 +14,8 @@ category: Aligner # + uuid # + accession name: picard -category: Variant Caller +category: + - Variant Caller commit: 324ePT uuid: efdac7ec-7da3-4f23-9056-7a04abbc5e8b accession: GAPMKF1LL29K diff --git a/tests/repo_error/portal_objects/workflows/A_gatk-HC.yaml b/tests/repo_error/portal_objects/workflows/A_gatk-HC.yaml index 527beeb..5037466 100644 --- a/tests/repo_error/portal_objects/workflows/A_gatk-HC.yaml +++ b/tests/repo_error/portal_objects/workflows/A_gatk-HC.yaml @@ -3,7 +3,8 @@ name: gatk-HaplotypeCaller description: Run HaplotypeCaller from gatk package -category: Annotation +category: + - Annotation runner: language: wdl diff --git a/tests/repo_error/portal_objects/workflows/B_minimal-gatk-HC.yaml b/tests/repo_error/portal_objects/workflows/B_minimal-gatk-HC.yaml index ba07ef7..afb7c17 100644 --- a/tests/repo_error/portal_objects/workflows/B_minimal-gatk-HC.yaml +++ b/tests/repo_error/portal_objects/workflows/B_minimal-gatk-HC.yaml @@ -21,4 +21,5 @@ output: uuid: 1936f246-22e1-45dc-bb5c-9cfd55537fe9 accession: GAPFIXRDPDK1 -category: Feature Calling +category: + - Feature Calling diff --git a/tests/test_yaml_file_reference.py b/tests/test_yaml_file_reference.py index f5c0ea0..07e92c2 100644 --- a/tests/test_yaml_file_reference.py +++ b/tests/test_yaml_file_reference.py @@ -22,8 +22,8 @@ def test_file_reference(): "consortia": ["cgap-core"], "status": "uploading", "uuid": "1936f246-22e1-45dc-bb5c-9cfd55537fe7", - "data_category": "Sequencing Reads", - "data_type": "Unaligned Reads" + "data_category": ["Sequencing Reads"], + "data_type": ["Unaligned Reads"] }, { "aliases": ["cgap-core:FileReference-reference_genome_hg38"], @@ -33,8 +33,8 @@ def test_file_reference(): "submission_centers": ["hms-dbmi"], "consortia": ["cgap-core"], "status": None, - "data_category": "Sequencing Reads", - "data_type": "Aligned Reads" + "data_category": ["Sequencing Reads"], + "data_type": ["Aligned Reads"] } ] diff --git a/tests/test_yaml_software.py b/tests/test_yaml_software.py index 1f0204d..f1b5cf0 100644 --- a/tests/test_yaml_software.py +++ b/tests/test_yaml_software.py @@ -21,7 +21,7 @@ def test_software(): "source_url": "http:/broad", "title": "gatk 4.1.2", "version": "4.1.2", - "category": "Aligner" + "category": ["Aligner"] }, { "accession": "GAPMKF1LL29K", @@ -32,7 +32,7 @@ def test_software(): "consortia": ["cgap-core"], "title": "picard [324ePT]", "uuid": "efdac7ec-7da3-4f23-9056-7a04abbc5e8b", - "category": "Variant Caller" + "category": ["Variant Caller"] } ] diff --git a/tests/test_yaml_workflow.py b/tests/test_yaml_workflow.py index 4f31e77..ef97d38 100644 --- a/tests/test_yaml_workflow.py +++ b/tests/test_yaml_workflow.py @@ -17,7 +17,7 @@ def test_workflow(): "aliases": ["cgap-core:Workflow-gatk-HaplotypeCaller_v1.0.0"], "name": "gatk-HaplotypeCaller", "version": "v1.0.0", - "category": "Annotation", + "category": 
["Annotation"], "arguments": [ { "argument_format": "bam", @@ -65,7 +65,7 @@ def test_workflow(): }, { "accession": "GAPFIXRDPDK1", - "category": "Feature Calling", + "category": ["Feature Calling"], "aliases": ["cgap-core:Workflow-gatk-HaplotypeCaller_v1.0.0"], "name": "gatk-HaplotypeCaller", "version": "v1.0.0", From 77fd92e433f35d6db2b642526e31b7071f6fdbda Mon Sep 17 00:00:00 2001 From: Michele Berselli Date: Wed, 1 Nov 2023 12:55:42 -0400 Subject: [PATCH 04/18] updated QC fields --- docs/yaml_workflow.rst | 35 +++--------- pipeline_utils/lib/yaml_parser.py | 57 ++++++------------- pipeline_utils/schemas/yaml_software.py | 2 +- pipeline_utils/schemas/yaml_workflow.py | 25 ++------ .../portal_objects/workflows/A_gatk-HC.yaml | 2 +- .../portal_objects/workflows/A_gatk-HC.yaml | 2 +- tests/test_yaml_workflow.py | 8 +-- 7 files changed, 37 insertions(+), 94 deletions(-) diff --git a/docs/yaml_workflow.rst b/docs/yaml_workflow.rst index 124eb2c..197a168 100644 --- a/docs/yaml_workflow.rst +++ b/docs/yaml_workflow.rst @@ -60,31 +60,16 @@ Template # QC output : - argument_type: qc. # qc_type, e.g. quality_metric_vcfcheck - # none can be used as - # if a qc_type is not defined - # quality_metric_generic can be used as - # to use the general qc_type instead of a custom one + argument_type: qc argument_to_be_attached_to: - # All the following fields are optional and provided as example, - # can be expanded to anything accepted by the schema - html: + # Fields to specify the output type + # either json or zipped folder json: - table: zipped: - # If the output is a zipped folder with multiple QC files, - # fields to define the target files inside the folder - html_in_zipped: - tables_in_zipped: - - - # Fields still used by tibanna that needs refactoring - # listing them as they are - qc_acl: # e.g. private - qc_unzip_from_ec2: # Report output : - argument_type: report. # report_type, e.g. file + argument_type: report General Fields Definition @@ -166,20 +151,18 @@ Definition of the type of the output. For a **file** output, the argument type is defined as ``file.``, where ```` is the format used by the file. ```` needs to match a file format that has been previously defined, see :ref:`File Format `. -For a **report** output, the argument type is defined as ``report.``, where ```` is the type of the report (e.g., file). +For a **report** output, the argument type is defined as ``report``. -For a **QC** (Quality Control) output, the argument type is defined as ``qc.``, where ```` is a ``qc_type`` defined in the schema, see `schemas `__. -While custom ``qc_type`` schemas are still supported for compatibility, we introduced a new generic type ``quality_metric_generic``. -We recommend to use this new type to implement QCs. +For a **QC** (Quality Control) output, the argument type is defined as ``qc``. -When using ``quality_metric_generic`` as a ``qc_type``, it is possible to generate two different types of output: a key-value pairs JSON file and a compressed file. +For a QC, it is possible to generate two different types of output: a key-value pairs JSON file and a compressed file. The JSON file can be used to create a summary report of the quality metrics generated by the QC process. The compressed file can be used to store the original output for the QC, including additional data or graphs. -Both the JSON file and compressed file will be attached to the file specified as target by ``argument_to_be_attached_to`` with a ``QualityMetricGeneric`` object. 
+Both the JSON file and compressed file will be attached to the file specified as target by ``argument_to_be_attached_to`` with a ``QualityMetric`` object.
 The content of the JSON file will be patched directly on the object, while the compressed file will be made available for download via a link.
 The output type can be specified by setting ``json: True`` or ``zipped: True`` in the QC output definition.
 
-Template for ``quality_metric_generic``:
+Template for key-value pairs JSON:
 
 .. code-block:: python
 
diff --git a/pipeline_utils/lib/yaml_parser.py b/pipeline_utils/lib/yaml_parser.py
index c4fe156..1eacdcf 100644
--- a/pipeline_utils/lib/yaml_parser.py
+++ b/pipeline_utils/lib/yaml_parser.py
@@ -171,7 +171,6 @@ class YAMLWorkflow(YAMLTemplate):
     # schema constants
     INPUT_FILE_SCHEMA = 'Input file'
     OUTPUT_PROCESSED_FILE_SCHEMA = 'Output processed file'
-    OUTPUT_QC_FILE_SCHEMA = 'Output QC file'
     GENERIC_QC_FILE_SCHEMA = 'Generic QC file'
     OUTPUT_REPORT_FILE_SCHEMA = 'Output report file'
     QC_SCHEMA = 'qc'
@@ -179,22 +178,11 @@ class YAMLWorkflow(YAMLTemplate):
     REPORT_SCHEMA = 'report'
     ARGUMENT_TO_BE_ATTACHED_TO_SCHEMA = 'argument_to_be_attached_to'
     ZIPPED_SCHEMA = 'zipped'
-    HTML_SCHEMA = 'html'
     JSON_SCHEMA = 'json'
-    TABLE_SCHEMA = 'table'
     SOFTWARE_SCHEMA = 'software'
     ARGUMENTS_SCHEMA = 'arguments'
-    QC_TYPE_SCHEMA = 'qc_type'
     QC_ZIPPED_SCHEMA = 'qc_zipped'
-    QC_HTML_SCHEMA = 'qc_html'
     QC_JSON_SCHEMA = 'qc_json'
-    QC_TABLE_SCHEMA = 'qc_table'
-    QC_ZIPPED_HTML_SCHEMA = 'qc_zipped_html'
-    QC_ZIPPED_TABLES_SCHEMA = 'qc_zipped_tables'
-    HTML_IN_ZIPPED_SCHEMA = 'html_in_zipped'
-    TABLES_IN_ZIPPED_SCHEMA = 'tables_in_zipped'
-    QC_ACL = 'qc_acl'
-    QC_UNZIP_FROM_EC2 = 'qc_unzip_from_ec2'
 
     def __init__(self, data):
         """Constructor method.
@@ -237,7 +225,16 @@ def _arguments_output(self):
         """
         arguments = []
         for name, values in self.output.items():
-            type, format = values[self.ARGUMENT_TYPE_SCHEMA].split('.')
+            # check if it is a file or qc or report argument
+            # if it is file it has a type and a format
+            # argument_type: file.<format>
+ # if it is qc or report only has type + # argument_type: qc | report + try: + type, format = values[self.ARGUMENT_TYPE_SCHEMA].split('.') + except ValueError: + type = values[self.ARGUMENT_TYPE_SCHEMA] + # create right argument schema according to type if type == self.FILE_SCHEMA: argument_type = self.OUTPUT_PROCESSED_FILE_SCHEMA argument_ = { @@ -247,41 +244,21 @@ def _arguments_output(self): self.SECONDARY_FILE_FORMATS_SCHEMA: values.get(self.SECONDARY_FILES_SCHEMA, []) } elif type == self.QC_SCHEMA: - # handle generic vs specific QC schema - if format == self.QUALITY_METRIC_GENERIC_SCHEMA: - argument_type = self.GENERIC_QC_FILE_SCHEMA - else: - argument_type = self.OUTPUT_QC_FILE_SCHEMA + argument_type = self.GENERIC_QC_FILE_SCHEMA # create base QC argument argument_ = { self.ARGUMENT_TYPE_SCHEMA: argument_type, self.WORKFLOW_ARGUMENT_NAME_SCHEMA: name, self.ARGUMENT_TO_BE_ATTACHED_TO_SCHEMA: values[self.ARGUMENT_TO_BE_ATTACHED_TO_SCHEMA], self.QC_ZIPPED_SCHEMA: values.get(self.ZIPPED_SCHEMA, False), - self.QC_HTML_SCHEMA: values.get(self.HTML_SCHEMA, False), self.QC_JSON_SCHEMA: values.get(self.JSON_SCHEMA, False), - self.QC_TABLE_SCHEMA: values.get(self.TABLE_SCHEMA, False) } - # handle edge case for missing or generic QC type - if format not in ['none', self.QUALITY_METRIC_GENERIC_SCHEMA]: - argument_[self.QC_TYPE_SCHEMA] = format - # create argument format for generic QCs (JSON or ZIP) - elif format == self.QUALITY_METRIC_GENERIC_SCHEMA: - if argument_[self.QC_JSON_SCHEMA]: - argument_[self.ARGUMENT_FORMAT_SCHEMA] = 'json' - else: - argument_[self.ARGUMENT_FORMAT_SCHEMA] = 'zip' - # quality controls, TODO - # these fields are bad, need to rework how QCs work - if values.get(self.HTML_IN_ZIPPED_SCHEMA): - argument_[self.QC_ZIPPED_HTML_SCHEMA] = values[self.HTML_IN_ZIPPED_SCHEMA] - if values.get(self.TABLES_IN_ZIPPED_SCHEMA): - argument_[self.QC_ZIPPED_TABLES_SCHEMA] = values[self.TABLES_IN_ZIPPED_SCHEMA] - if values.get(self.QC_ACL): - argument_[self.QC_ACL] = values[self.QC_ACL] - if values.get(self.QC_UNZIP_FROM_EC2): - argument_[self.QC_UNZIP_FROM_EC2] = values[self.QC_UNZIP_FROM_EC2] - elif type == self.REPORT_SCHEMA and format == self.FILE_SCHEMA: + # check if it is json or zip + if argument_[self.QC_JSON_SCHEMA]: + argument_[self.ARGUMENT_FORMAT_SCHEMA] = 'json' + else: + argument_[self.ARGUMENT_FORMAT_SCHEMA] = 'zip' + elif type == self.REPORT_SCHEMA: argument_type = self.OUTPUT_REPORT_FILE_SCHEMA argument_ = { self.ARGUMENT_TYPE_SCHEMA: argument_type, diff --git a/pipeline_utils/schemas/yaml_software.py b/pipeline_utils/schemas/yaml_software.py index 17c44d4..2750536 100644 --- a/pipeline_utils/schemas/yaml_software.py +++ b/pipeline_utils/schemas/yaml_software.py @@ -20,7 +20,7 @@ schema.DESCRIPTION: 'Source url of the Software', schema.TYPE: schema.STRING, schema.FORMAT: 'uri', - schema.PATTERN: '^https?\:.+' + schema.PATTERN: '^https?\\:.+' }, 'description': { schema.DESCRIPTION: 'Description of the Software', diff --git a/pipeline_utils/schemas/yaml_workflow.py b/pipeline_utils/schemas/yaml_workflow.py index 2ca14f9..98d87be 100644 --- a/pipeline_utils/schemas/yaml_workflow.py +++ b/pipeline_utils/schemas/yaml_workflow.py @@ -34,14 +34,14 @@ 'main': { schema.DESCRIPTION: 'Main description file', schema.TYPE: schema.STRING, - schema.PATTERN: '.+\.cwl|.+\.wdl' + schema.PATTERN: '.+\\.cwl|.+\\.wdl' }, 'child': { schema.DESCRIPTION: 'Supplementary description files used by main', schema.TYPE: schema.ARRAY, schema.ITEMS: { schema.TYPE: schema.STRING, - schema.PATTERN: 
'.+\.cwl|.+\.wdl' + schema.PATTERN: '.+\\.cwl|.+\\.wdl' } } }, @@ -52,7 +52,7 @@ schema.TYPE: schema.ARRAY, schema.ITEMS: { schema.TYPE: schema.STRING, - schema.PATTERN: '.+\@.+' # check for @ + schema.PATTERN: '.+\\@.+' # check for @ } }, 'category': { @@ -92,7 +92,7 @@ schema.PROPERTIES: { 'argument_type': { schema.TYPE: schema.STRING, - schema.PATTERN: '^file\..+|^parameter\..+|^qc\..+|^report\..+' + schema.PATTERN: '^file\\..+|^parameter\\..+|^qc$|^report$' }, 'secondary_files': { schema.TYPE: schema.ARRAY, @@ -108,7 +108,7 @@ schema.TYPE: schema.OBJECT, schema.PROPERTIES: { 'argument_type': { - schema.PATTERN: '^qc\..+' + schema.PATTERN: '^qc\\..+' } }, }, @@ -120,23 +120,8 @@ 'zipped': { schema.TYPE: schema.BOOLEAN }, - 'html': { - schema.TYPE: schema.BOOLEAN - }, 'json': { schema.TYPE: schema.BOOLEAN - }, - 'table': { - schema.TYPE: schema.BOOLEAN - }, - 'html_in_zipped': { - schema.TYPE: schema.STRING - }, - 'tables_in_zipped': { - schema.TYPE: schema.ARRAY, - schema.ITEMS: { - schema.TYPE: schema.STRING - } } }, schema.REQUIRED: ['argument_to_be_attached_to'] diff --git a/tests/repo_correct/portal_objects/workflows/A_gatk-HC.yaml b/tests/repo_correct/portal_objects/workflows/A_gatk-HC.yaml index 5037466..717dd5b 100644 --- a/tests/repo_correct/portal_objects/workflows/A_gatk-HC.yaml +++ b/tests/repo_correct/portal_objects/workflows/A_gatk-HC.yaml @@ -33,7 +33,7 @@ output: - vcf_gz_tbi vcfcheck: - argument_type: qc.quality_metric_vcfcheck + argument_type: qc argument_to_be_attached_to: output_vcf zipped: False json: True diff --git a/tests/repo_error/portal_objects/workflows/A_gatk-HC.yaml b/tests/repo_error/portal_objects/workflows/A_gatk-HC.yaml index 5037466..717dd5b 100644 --- a/tests/repo_error/portal_objects/workflows/A_gatk-HC.yaml +++ b/tests/repo_error/portal_objects/workflows/A_gatk-HC.yaml @@ -33,7 +33,7 @@ output: - vcf_gz_tbi vcfcheck: - argument_type: qc.quality_metric_vcfcheck + argument_type: qc argument_to_be_attached_to: output_vcf zipped: False json: True diff --git a/tests/test_yaml_workflow.py b/tests/test_yaml_workflow.py index ef97d38..baf3243 100644 --- a/tests/test_yaml_workflow.py +++ b/tests/test_yaml_workflow.py @@ -38,13 +38,11 @@ def test_workflow(): }, { "argument_to_be_attached_to": "output_vcf", - "argument_type": "Output QC file", - "qc_html": False, + "argument_type": "Generic QC file", "qc_json": True, - "qc_table": False, - "qc_type": "quality_metric_vcfcheck", "qc_zipped": False, - "workflow_argument_name": "vcfcheck" + "workflow_argument_name": "vcfcheck", + "argument_format": "json" } ], "description": "Run HaplotypeCaller from gatk package", From c61244cc46638a8ad674616de6f8c3e7ae403f3a Mon Sep 17 00:00:00 2001 From: Michele Berselli Date: Mon, 6 Nov 2023 10:37:17 -0500 Subject: [PATCH 05/18] removed empty lists from the objects --- pipeline_utils/__main__.py | 2 +- pipeline_utils/lib/yaml_parser.py | 22 ++++++++--- pipeline_utils/pipeline_deploy.py | 8 ++-- pipeline_utils/schemas/yaml_metaworkflow.py | 38 ++++++++++++------- .../metaworkflows/A_gatk-HC-GT.yaml | 10 ++++- .../metaworkflows/B_minimal-gatk-HC-GT.yaml | 6 ++- .../portal_objects/metaworkflows/QC_test.yaml | 6 ++- .../metaworkflows/A_gatk-HC-GT.yaml | 5 ++- .../metaworkflows/B_minimal-gatk-HC-GT.yaml | 6 ++- tests/test_yaml_file_format.py | 1 - tests/test_yaml_file_reference.py | 1 - tests/test_yaml_metaworkflow.py | 18 ++++++--- tests/test_yaml_workflow.py | 3 -- 13 files changed, 85 insertions(+), 41 deletions(-) diff --git a/pipeline_utils/__main__.py 
b/pipeline_utils/__main__.py index 7c04d94..9a952ce 100644 --- a/pipeline_utils/__main__.py +++ b/pipeline_utils/__main__.py @@ -20,7 +20,7 @@ # Variables PIPELINE_DEPLOY = 'pipeline_deploy' -CONSORTIA_ALIAS = ['smaht_consortium'] +CONSORTIA_ALIAS = ['smaht'] SUBMISSION_CENTERS_ALIAS = ['smaht_dac'] KEYS_ALIAS = '~/.cgap-keys.json' MAIN_ALIAS = 'main' diff --git a/pipeline_utils/lib/yaml_parser.py b/pipeline_utils/lib/yaml_parser.py index 1eacdcf..d40b7fc 100644 --- a/pipeline_utils/lib/yaml_parser.py +++ b/pipeline_utils/lib/yaml_parser.py @@ -240,9 +240,11 @@ def _arguments_output(self): argument_ = { self.ARGUMENT_FORMAT_SCHEMA: format, self.ARGUMENT_TYPE_SCHEMA: argument_type, - self.WORKFLOW_ARGUMENT_NAME_SCHEMA: name, - self.SECONDARY_FILE_FORMATS_SCHEMA: values.get(self.SECONDARY_FILES_SCHEMA, []) + self.WORKFLOW_ARGUMENT_NAME_SCHEMA: name } + # check for secondary files + if values.get(self.SECONDARY_FILES_SCHEMA): + argument_[self.SECONDARY_FILE_FORMATS_SCHEMA] = values.get(self.SECONDARY_FILES_SCHEMA) elif type == self.QC_SCHEMA: argument_type = self.GENERIC_QC_FILE_SCHEMA # create base QC argument @@ -288,14 +290,18 @@ def to_json( wfl_json[self.SUBMISSION_CENTERS_SCHEMA] = submission_centers wfl_json[self.CONSORTIA_SCHEMA] = consortia wfl_json[self.DESCRIPTION_SCHEMA] = self.description - wfl_json[self.SOFTWARE_SCHEMA] = [f'{self._string_consortia(consortia)}:{self.SOFTWARE_TYPE_SCHEMA}-{s.replace("@", "_")}' for s in getattr(self, self.SOFTWARE_SCHEMA, [])] + # check if software + if getattr(self, self.SOFTWARE_SCHEMA, None): + wfl_json[self.SOFTWARE_SCHEMA] = [f'{self._string_consortia(consortia)}:{self.SOFTWARE_TYPE_SCHEMA}-{s.replace("@", "_")}' for s in getattr(self, self.SOFTWARE_SCHEMA)] wfl_json[self.ARGUMENTS_SCHEMA] = self._arguments_input() + self._arguments_output() # workflow language and description files wfl_json['language'] = self.runner['language'].upper() wfl_json['directory_url'] = wflbucket_url wfl_json['main_file_name'] = self.runner['main'] - wfl_json['child_file_names'] = self.runner.get('child', []) + # check if child description files + if self.runner.get('child'): + wfl_json['child_file_names'] = self.runner.get('child') # uuid, accession if specified if getattr(self, self.UUID_SCHEMA, None): @@ -587,7 +593,9 @@ def to_json( ref_json[self.DESCRIPTION_SCHEMA] = self.description ref_json[self.FILE_FORMAT_SCHEMA] = self.format ref_json[self.ALIASES_SCHEMA] = [f'{self._string_consortia(consortia)}:{self.FILEREFERENCE_TYPE_SCHEMA}-{self.name}_{self.version}'] - ref_json[self.EXTRA_FILES_SCHEMA] = getattr(self, self.SECONDARY_FILES_SCHEMA, []) + # check for secondary files + if getattr(self, self.SECONDARY_FILES_SCHEMA, None): + ref_json[self.EXTRA_FILES_SCHEMA] = getattr(self, self.SECONDARY_FILES_SCHEMA) ref_json[self.STATUS_SCHEMA] = getattr(self, self.STATUS_SCHEMA, None) # this will be used during post/patch, # if None: # - leave it as is if patch @@ -651,7 +659,9 @@ def to_json( frmt_json[self.DESCRIPTION_SCHEMA] = self.description frmt_json[self.STANDARD_FILE_EXTENSION_SCHEMA] = self.extension # frmt_json[self.VALID_ITEM_TYPES_SCHEMA] = getattr(self, self.FILE_TYPES_SCHEMA, ['FileReference', 'FileProcessed']) - frmt_json[self.EXTRA_FILE_FORMATS_SCHEMA] = getattr(self, self.SECONDARY_FORMATS_SCHEMA, []) + # check for secondary formats + if getattr(self, self.SECONDARY_FORMATS_SCHEMA, None): + frmt_json[self.EXTRA_FILE_FORMATS_SCHEMA] = getattr(self, self.SECONDARY_FORMATS_SCHEMA) frmt_json[self.STATUS_SCHEMA] = getattr(self, self.STATUS_SCHEMA, 
'shared') # uuid, accession if specified diff --git a/pipeline_utils/pipeline_deploy.py b/pipeline_utils/pipeline_deploy.py index fba7ef0..791a028 100644 --- a/pipeline_utils/pipeline_deploy.py +++ b/pipeline_utils/pipeline_deploy.py @@ -275,7 +275,7 @@ def _post_patch_wfl(self, type='WFL'): filepath_ = f'{self.repo}/{self.filepath[type]}' upload_ = f'{filepath_}/upload' account_ = f'{self.account}.dkr.ecr.{self.region}.amazonaws.com' - update_ = { + auth_keys_ = { 'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': self.kms_key_id } @@ -316,10 +316,10 @@ def _post_patch_wfl(self, type='WFL'): line = line.replace('LICENSEID', self.sentieon_server) write_.write(line) # upload to s3 - extra_args = {'ACL': 'public-read'} # note that this is no longer public if using encryption! if self.kms_key_id: - extra_args.update(update_) - s3.meta.client.upload_file(upload_file_, self.wfl_bucket, s3_file_, ExtraArgs=extra_args) + s3.meta.client.upload_file(upload_file_, self.wfl_bucket, s3_file_, ExtraArgs=auth_keys_) + else: # no kms_key_id, ExtraArgs not needed + s3.meta.client.upload_file(upload_file_, self.wfl_bucket, s3_file_) logger.info('> Posted %s' % s3_file_) # delete file to allow tmp folder to be deleted at the end os.remove(upload_file_) diff --git a/pipeline_utils/schemas/yaml_metaworkflow.py b/pipeline_utils/schemas/yaml_metaworkflow.py index 98d4733..6e47896 100644 --- a/pipeline_utils/schemas/yaml_metaworkflow.py +++ b/pipeline_utils/schemas/yaml_metaworkflow.py @@ -168,30 +168,42 @@ 'description': { schema.TYPE: schema.STRING }, - 'linkto_location': { + # 'linkto_location': { + # schema.TYPE: schema.ARRAY, + # schema.ITEMS: { + # schema.TYPE: schema.STRING + # } + # }, + # 'file_type': { + # schema.TYPE: schema.STRING + # }, + 'data_category': { schema.TYPE: schema.ARRAY, schema.ITEMS: { schema.TYPE: schema.STRING } }, - 'file_type': { - schema.TYPE: schema.STRING - }, - 'higlass_file': { - schema.TYPE: schema.BOOLEAN - }, - 'variant_type': { - schema.TYPE: schema.STRING - }, - 'vcf_to_ingest': { - schema.TYPE: schema.BOOLEAN + 'data_type': { + schema.TYPE: schema.ARRAY, + schema.ITEMS: { + schema.TYPE: schema.STRING + } }, + # 'higlass_file': { + # schema.TYPE: schema.BOOLEAN + # }, + # 'variant_type': { + # schema.TYPE: schema.STRING + # }, + # 'vcf_to_ingest': { + # schema.TYPE: schema.BOOLEAN + # }, 's3_lifecycle_category': { schema.TYPE: schema.STRING, schema.PATTERN: 'short_term_access_long_term_archive|short_term_access|short_term_archive|long_term_access_long_term_archive|long_term_access|long_term_archive|no_storage|ignore' } }, - schema.REQUIRED: ['file_type'] + schema.REQUIRED: ['data_category', 'data_type'] } } } diff --git a/tests/repo_correct/portal_objects/metaworkflows/A_gatk-HC-GT.yaml b/tests/repo_correct/portal_objects/metaworkflows/A_gatk-HC-GT.yaml index d48833f..e6830b2 100644 --- a/tests/repo_correct/portal_objects/metaworkflows/A_gatk-HC-GT.yaml +++ b/tests/repo_correct/portal_objects/metaworkflows/A_gatk-HC-GT.yaml @@ -47,7 +47,10 @@ workflows: output: HC_vcf: description: output from gatk-HC - file_type: hc-vcf + data_category: + - Sequencing Reads + data_type: + - Unaligned Reads linkto_location: - SampleProcessing # gatk-HC config @@ -76,7 +79,10 @@ workflows: output: GT_vcf: description: output from gatk-GT - file_type: GT-vcf + data_category: + - Sequencing Reads + data_type: + - Unaligned Reads higlass_file: True variant_type: SNV # gatk-GT config diff --git a/tests/repo_correct/portal_objects/metaworkflows/B_minimal-gatk-HC-GT.yaml 
b/tests/repo_correct/portal_objects/metaworkflows/B_minimal-gatk-HC-GT.yaml index db788d3..1e0d92c 100644 --- a/tests/repo_correct/portal_objects/metaworkflows/B_minimal-gatk-HC-GT.yaml +++ b/tests/repo_correct/portal_objects/metaworkflows/B_minimal-gatk-HC-GT.yaml @@ -42,7 +42,11 @@ workflows: # gatk-HC output output: HC_vcf: - file_type: hc-vcf + description: hc-vcf + data_category: + - Sequencing Reads + data_type: + - Unaligned Reads # gatk-HC config config: ebs_size: 2x diff --git a/tests/repo_correct/portal_objects/metaworkflows/QC_test.yaml b/tests/repo_correct/portal_objects/metaworkflows/QC_test.yaml index df26442..eb54676 100644 --- a/tests/repo_correct/portal_objects/metaworkflows/QC_test.yaml +++ b/tests/repo_correct/portal_objects/metaworkflows/QC_test.yaml @@ -61,7 +61,11 @@ workflows: # gatk-HC output output: HC_vcf: - file_type: hc-vcf + description: hc-vcf + data_category: + - Sequencing Reads + data_type: + - Unaligned Reads # gatk-HC config config: ebs_size: 2x diff --git a/tests/repo_error/portal_objects/metaworkflows/A_gatk-HC-GT.yaml b/tests/repo_error/portal_objects/metaworkflows/A_gatk-HC-GT.yaml index f84a8b8..ca13039 100644 --- a/tests/repo_error/portal_objects/metaworkflows/A_gatk-HC-GT.yaml +++ b/tests/repo_error/portal_objects/metaworkflows/A_gatk-HC-GT.yaml @@ -48,7 +48,10 @@ workflows: output: HC_vcf: description: output from gatk-HC - file_type: hc-vcf + data_category: + - Sequencing Reads + data_type: + - Unaligned Reads linkto_location: - SampleProcessing # gatk-HC config diff --git a/tests/repo_error/portal_objects/metaworkflows/B_minimal-gatk-HC-GT.yaml b/tests/repo_error/portal_objects/metaworkflows/B_minimal-gatk-HC-GT.yaml index db788d3..1e0d92c 100644 --- a/tests/repo_error/portal_objects/metaworkflows/B_minimal-gatk-HC-GT.yaml +++ b/tests/repo_error/portal_objects/metaworkflows/B_minimal-gatk-HC-GT.yaml @@ -42,7 +42,11 @@ workflows: # gatk-HC output output: HC_vcf: - file_type: hc-vcf + description: hc-vcf + data_category: + - Sequencing Reads + data_type: + - Unaligned Reads # gatk-HC config config: ebs_size: 2x diff --git a/tests/test_yaml_file_format.py b/tests/test_yaml_file_format.py index bf33304..5655e04 100644 --- a/tests/test_yaml_file_format.py +++ b/tests/test_yaml_file_format.py @@ -27,7 +27,6 @@ def test_file_format(): "accession": 'GAPFIXRDPDK1', "aliases": ["cgap-core:FileFormat-bam_bai"], "description": "index for bam format", - "extra_file_formats": [], "identifier": "bam_bai", "submission_centers": ["hms-dbmi"], "consortia": ["cgap-core"], diff --git a/tests/test_yaml_file_reference.py b/tests/test_yaml_file_reference.py index 07e92c2..7cd9e81 100644 --- a/tests/test_yaml_file_reference.py +++ b/tests/test_yaml_file_reference.py @@ -28,7 +28,6 @@ def test_file_reference(): { "aliases": ["cgap-core:FileReference-reference_genome_hg38"], "description": "hg38 full reference genome plus decoy for CGAP, fasta format", - "extra_files": [], "file_format": "fa", "submission_centers": ["hms-dbmi"], "consortia": ["cgap-core"], diff --git a/tests/test_yaml_metaworkflow.py b/tests/test_yaml_metaworkflow.py index fd4578e..8125dc2 100644 --- a/tests/test_yaml_metaworkflow.py +++ b/tests/test_yaml_metaworkflow.py @@ -49,10 +49,11 @@ def test_metaworkflow(): "custom_pf_fields": { "HC_vcf": { "description": "output from gatk-HC", - "file_type": "hc-vcf", "linkto_location": [ "SampleProcessing" - ] + ], + "data_category": ["Sequencing Reads"], + "data_type": ["Unaligned Reads"] } }, "input": [ @@ -87,9 +88,10 @@ def test_metaworkflow(): 
"custom_pf_fields": { "GT_vcf": { "description": "output from gatk-GT", - "file_type": "GT-vcf", "higlass_file": True, - "variant_type": "SNV" + "variant_type": "SNV", + "data_category": ["Sequencing Reads"], + "data_type": ["Unaligned Reads"] } }, "input": [ @@ -150,7 +152,9 @@ def test_metaworkflow(): }, "custom_pf_fields": { "HC_vcf": { - "file_type": "hc-vcf" + "description": "hc-vcf", + "data_category": ["Sequencing Reads"], + "data_type": ["Unaligned Reads"] } }, "input": [ @@ -273,7 +277,9 @@ def test_qc_ruleset(): }, "custom_pf_fields": { "HC_vcf": { - "file_type": "hc-vcf" + "description": "hc-vcf", + "data_category": ["Sequencing Reads"], + "data_type": ["Unaligned Reads"] } }, "input": [ diff --git a/tests/test_yaml_workflow.py b/tests/test_yaml_workflow.py index baf3243..6c09c1a 100644 --- a/tests/test_yaml_workflow.py +++ b/tests/test_yaml_workflow.py @@ -76,16 +76,13 @@ def test_workflow(): { "argument_format": "vcf", "argument_type": "Output processed file", - "secondary_file_formats": [], "workflow_argument_name": "output_vcf" } ], "description": "Run HaplotypeCaller from gatk package", "submission_centers": ["hms-dbmi"], "consortia": ["cgap-core"], - "software": [], "title": "gatk-HaplotypeCaller [v1.0.0]", - "child_file_names": [], "directory_url": "s3://BUCKETCWL/test_pipeline/v1.0.0", "main_file_name": "gatk-HaplotypeCaller-check.cwl", "uuid": "1936f246-22e1-45dc-bb5c-9cfd55537fe9", From 15b0247ee586548e40e25dd73c6cc4da90bff791 Mon Sep 17 00:00:00 2001 From: Michele Berselli Date: Mon, 6 Nov 2023 14:27:06 -0500 Subject: [PATCH 06/18] . --- docs/yaml_metaworkflow.rst | 12 +++++----- pipeline_utils/lib/yaml_parser.py | 5 ++++ pipeline_utils/schemas/yaml_file_reference.py | 7 ++++++ pipeline_utils/schemas/yaml_metaworkflow.py | 24 +++++-------------- .../portal_objects/file_reference.yaml | 2 ++ .../portal_objects/file_reference.yaml | 2 ++ tests/test_yaml_file_reference.py | 3 ++- 7 files changed, 30 insertions(+), 25 deletions(-) diff --git a/docs/yaml_metaworkflow.rst b/docs/yaml_metaworkflow.rst index 3159a65..e38015e 100644 --- a/docs/yaml_metaworkflow.rst +++ b/docs/yaml_metaworkflow.rst @@ -121,15 +121,15 @@ Template # File output : - file_type: + data_category: + - + data_type: + - # All the following fields are optional and provided as example, # can be expanded to anything accepted by the schema description: - linkto_location: - - # Sample, SampleProcessing - higlass_file: - variant_type: # SNV, SV, CNV - vcf_to_ingest: + variant_types: + - s3_lifecycle_category: # short_term_access_long_term_archive, # short_term_access, short_term_archive, # long_term_access_long_term_archive, diff --git a/pipeline_utils/lib/yaml_parser.py b/pipeline_utils/lib/yaml_parser.py index d40b7fc..ec4e03c 100644 --- a/pipeline_utils/lib/yaml_parser.py +++ b/pipeline_utils/lib/yaml_parser.py @@ -119,6 +119,8 @@ class YAMLTemplate(object): FILEFORMAT_TYPE_SCHEMA = 'FileFormat' FILEREFERENCE_TYPE_SCHEMA = 'FileReference' SOFTWARE_TYPE_SCHEMA = 'Software' + VARIANT_TYPE_SCHEMA = "variant_type" + VARIANT_TYPES_SCHEMA = "variant_types" def __init__(self, data, schema): """Constructor method. 
@@ -602,6 +604,9 @@ def to_json( # - set to uploading if post ref_json[self.DATA_CATEGORY_SCHEMA] = self.category ref_json[self.DATA_TYPE_SCHEMA] = self.type + # variant_type + if getattr(self, self.VARIANT_TYPE_SCHEMA, None): + ref_json[self.VARIANT_TYPES_SCHEMA] = self.variant_type # uuid, accession if specified if getattr(self, self.UUID_SCHEMA, None): diff --git a/pipeline_utils/schemas/yaml_file_reference.py b/pipeline_utils/schemas/yaml_file_reference.py index a01758d..c7bbdb4 100644 --- a/pipeline_utils/schemas/yaml_file_reference.py +++ b/pipeline_utils/schemas/yaml_file_reference.py @@ -34,6 +34,13 @@ schema.TYPE: schema.STRING } }, + 'variant_type': { + schema.DESCRIPTION: 'Types of variants in FileReference', + schema.TYPE: schema.ARRAY, + schema.ITEMS: { + schema.TYPE: schema.STRING + } + }, 'version': { schema.DESCRIPTION: 'Version of the FileReference', schema.TYPE: schema.STRING diff --git a/pipeline_utils/schemas/yaml_metaworkflow.py b/pipeline_utils/schemas/yaml_metaworkflow.py index 6e47896..56adcf1 100644 --- a/pipeline_utils/schemas/yaml_metaworkflow.py +++ b/pipeline_utils/schemas/yaml_metaworkflow.py @@ -168,15 +168,6 @@ 'description': { schema.TYPE: schema.STRING }, - # 'linkto_location': { - # schema.TYPE: schema.ARRAY, - # schema.ITEMS: { - # schema.TYPE: schema.STRING - # } - # }, - # 'file_type': { - # schema.TYPE: schema.STRING - # }, 'data_category': { schema.TYPE: schema.ARRAY, schema.ITEMS: { @@ -189,15 +180,12 @@ schema.TYPE: schema.STRING } }, - # 'higlass_file': { - # schema.TYPE: schema.BOOLEAN - # }, - # 'variant_type': { - # schema.TYPE: schema.STRING - # }, - # 'vcf_to_ingest': { - # schema.TYPE: schema.BOOLEAN - # }, + 'variant_types': { + schema.TYPE: schema.ARRAY, + schema.ITEMS: { + schema.TYPE: schema.STRING + } + }, 's3_lifecycle_category': { schema.TYPE: schema.STRING, schema.PATTERN: 'short_term_access_long_term_archive|short_term_access|short_term_archive|long_term_access_long_term_archive|long_term_access|long_term_archive|no_storage|ignore' diff --git a/tests/repo_correct/portal_objects/file_reference.yaml b/tests/repo_correct/portal_objects/file_reference.yaml index c5ca6b1..73c1442 100644 --- a/tests/repo_correct/portal_objects/file_reference.yaml +++ b/tests/repo_correct/portal_objects/file_reference.yaml @@ -16,6 +16,8 @@ category: - Sequencing Reads type: - Unaligned Reads +variant_type: + - SNV --- diff --git a/tests/repo_error/portal_objects/file_reference.yaml b/tests/repo_error/portal_objects/file_reference.yaml index d910dee..bc64811 100644 --- a/tests/repo_error/portal_objects/file_reference.yaml +++ b/tests/repo_error/portal_objects/file_reference.yaml @@ -16,6 +16,8 @@ category: - Sequencing Reads type: - Unaligned Reads +variant_type: + - SNV --- diff --git a/tests/test_yaml_file_reference.py b/tests/test_yaml_file_reference.py index 7cd9e81..2de420b 100644 --- a/tests/test_yaml_file_reference.py +++ b/tests/test_yaml_file_reference.py @@ -23,7 +23,8 @@ def test_file_reference(): "status": "uploading", "uuid": "1936f246-22e1-45dc-bb5c-9cfd55537fe7", "data_category": ["Sequencing Reads"], - "data_type": ["Unaligned Reads"] + "data_type": ["Unaligned Reads"], + "variant_types": ["SNV"] }, { "aliases": ["cgap-core:FileReference-reference_genome_hg38"], From 91779b16ff53fc22c35ae6dc00337638c94f50e8 Mon Sep 17 00:00:00 2001 From: Michele Berselli Date: Wed, 15 Nov 2023 11:56:47 -0500 Subject: [PATCH 07/18] . 
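Docs-only change: the parameter templates in yaml_metaworkflow.rst and
yaml_workflow.rst now list the supported value types as string, integer,
float, array, object, and boolean, with values written as native YAML types
rather than string-encoded JSON. A minimal sketch of what the parser then
sees, assuming PyYAML is available (the argument names and values are taken
from the doc examples, not from the test fixtures):

    import yaml

    # Native YAML scalars and sequences parse to real Python types,
    # so parameter values no longer need to be encoded as strings.
    doc = yaml.safe_load("""
    a_float:
      argument_type: parameter.float
      value: 0.8
    a_string_array:
      argument_type: parameter.array
      value: ["DEL", "DUP"]
    """)

    assert isinstance(doc["a_float"]["value"], float)
    assert doc["a_string_array"]["value"] == ["DEL", "DUP"]
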
--- docs/yaml_metaworkflow.rst | 15 ++++++--------- docs/yaml_workflow.rst | 4 ++-- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/docs/yaml_metaworkflow.rst b/docs/yaml_metaworkflow.rst index e38015e..ca43649 100644 --- a/docs/yaml_metaworkflow.rst +++ b/docs/yaml_metaworkflow.rst @@ -37,7 +37,7 @@ Template # Parameter argument : - argument_type: parameter. # string, integer, float, json, boolean + argument_type: parameter. # string, integer, float, array, object, boolean # All the following fields are optional and provided as example, # can be expanded to anything accepted by the schema value: <...> @@ -234,7 +234,7 @@ Definition of the type of the argument. For a **file** argument, the argument type is defined as ``file.``, where ```` is the format used by the file. ```` needs to match a file format that has been previously defined, see :ref:`File Format `. -For a **parameter** argument, the argument type is defined as ``parameter.``, where ```` is the type of the value expected for the argument [string, integer, float, json, boolean]. +For a **parameter** argument, the argument type is defined as ``parameter.``, where ```` is the type of the value expected for the argument [string, integer, float, array, boolean, object]. files ^^^^^ @@ -248,24 +248,21 @@ value ^^^^^ This field can be used to assign a specific value to a **parameter** argument. -*Note*: As of now, the value needs to be always encoded as ````. -We are working to improve this and enable usage of real types. - Example .. code-block:: yaml a_float: argument_type: parameter.float - value: "0.8" + value: 0.8 an_integer: argument_type: parameter.integer - value: "1" + value: 1 a_string_array: - argument_type: parameter.json - value: "[\"DEL\", \"DUP\"]" + argument_type: parameter.array + value: ["DEL", "DUP"] Linking Fields ^^^^^^^^^^^^^^ diff --git a/docs/yaml_workflow.rst b/docs/yaml_workflow.rst index 197a168..ba63bd8 100644 --- a/docs/yaml_workflow.rst +++ b/docs/yaml_workflow.rst @@ -45,7 +45,7 @@ Template # Parameter argument : - argument_type: parameter. # string, integer, float, json, boolean + argument_type: parameter. # string, integer, float, array, boolean, object ## Output information ####################################### # Output files and quality controls @@ -135,7 +135,7 @@ Definition of the type of the argument. For a **file** argument, the argument type is defined as ``file.``, where ```` is the format used by the file. ```` needs to match a file format that has been previously defined, see :ref:`File Format `. -For a **parameter** argument, the argument type is defined as ``parameter.``, where ```` is the type of the value expected for the argument [string, integer, float, json, boolean]. +For a **parameter** argument, the argument type is defined as ``parameter.``, where ```` is the type of the value expected for the argument [string, integer, float, array, boolean, object]. .. _output_a: From 179fffbfb6e5d749d955774434f420442e2357c6 Mon Sep 17 00:00:00 2001 From: Michele Berselli Date: Thu, 16 Nov 2023 11:20:15 -0500 Subject: [PATCH 08/18] . 
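This keeps the portal key named variant_type rather than the variant_types
key introduced earlier in the series, so the YAML field name and the JSON
key posted to the portal now match, and the separate VARIANT_TYPES_SCHEMA
constant is dropped. A rough sketch of the optional-field pattern used by
to_json, with a simplified stand-in class rather than the real parser:

    class SketchTemplate:
        """Simplified stand-in for the optional-field logic in yaml_parser."""

        VARIANT_TYPE_SCHEMA = 'variant_type'

        def __init__(self, data):
            # attributes are loaded from the parsed YAML document
            for key, value in data.items():
                setattr(self, key, value)

        def to_json(self):
            json_ = {}
            # emit variant_type only when present in the source YAML,
            # mirroring the getattr(..., None) guards in to_json
            if getattr(self, self.VARIANT_TYPE_SCHEMA, None):
                json_[self.VARIANT_TYPE_SCHEMA] = self.variant_type
            return json_

    assert SketchTemplate({'variant_type': ['SNV']}).to_json() == {'variant_type': ['SNV']}
    assert SketchTemplate({}).to_json() == {}
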
--- pipeline_utils/lib/yaml_parser.py | 3 +-- pipeline_utils/schemas/yaml_metaworkflow.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pipeline_utils/lib/yaml_parser.py b/pipeline_utils/lib/yaml_parser.py index ec4e03c..93e8311 100644 --- a/pipeline_utils/lib/yaml_parser.py +++ b/pipeline_utils/lib/yaml_parser.py @@ -120,7 +120,6 @@ class YAMLTemplate(object): FILEREFERENCE_TYPE_SCHEMA = 'FileReference' SOFTWARE_TYPE_SCHEMA = 'Software' VARIANT_TYPE_SCHEMA = "variant_type" - VARIANT_TYPES_SCHEMA = "variant_types" def __init__(self, data, schema): """Constructor method. @@ -606,7 +605,7 @@ def to_json( ref_json[self.DATA_TYPE_SCHEMA] = self.type # variant_type if getattr(self, self.VARIANT_TYPE_SCHEMA, None): - ref_json[self.VARIANT_TYPES_SCHEMA] = self.variant_type + ref_json[self.VARIANT_TYPE_SCHEMA] = self.variant_type # uuid, accession if specified if getattr(self, self.UUID_SCHEMA, None): diff --git a/pipeline_utils/schemas/yaml_metaworkflow.py b/pipeline_utils/schemas/yaml_metaworkflow.py index 56adcf1..743e1b2 100644 --- a/pipeline_utils/schemas/yaml_metaworkflow.py +++ b/pipeline_utils/schemas/yaml_metaworkflow.py @@ -180,7 +180,7 @@ schema.TYPE: schema.STRING } }, - 'variant_types': { + 'variant_type': { schema.TYPE: schema.ARRAY, schema.ITEMS: { schema.TYPE: schema.STRING From fe4e8cae5642421833d010e035737b9501a6a67c Mon Sep 17 00:00:00 2001 From: Michele Berselli Date: Thu, 16 Nov 2023 11:35:38 -0500 Subject: [PATCH 09/18] . --- .../portal_objects/metaworkflows/A_gatk-HC-GT.yaml | 3 ++- tests/test_yaml_file_reference.py | 2 +- tests/test_yaml_metaworkflow.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/repo_correct/portal_objects/metaworkflows/A_gatk-HC-GT.yaml b/tests/repo_correct/portal_objects/metaworkflows/A_gatk-HC-GT.yaml index e6830b2..1fc09d7 100644 --- a/tests/repo_correct/portal_objects/metaworkflows/A_gatk-HC-GT.yaml +++ b/tests/repo_correct/portal_objects/metaworkflows/A_gatk-HC-GT.yaml @@ -84,7 +84,8 @@ workflows: data_type: - Unaligned Reads higlass_file: True - variant_type: SNV + variant_type: + - SNV # gatk-GT config config: ebs_size: 3x diff --git a/tests/test_yaml_file_reference.py b/tests/test_yaml_file_reference.py index 2de420b..95e71c2 100644 --- a/tests/test_yaml_file_reference.py +++ b/tests/test_yaml_file_reference.py @@ -24,7 +24,7 @@ def test_file_reference(): "uuid": "1936f246-22e1-45dc-bb5c-9cfd55537fe7", "data_category": ["Sequencing Reads"], "data_type": ["Unaligned Reads"], - "variant_types": ["SNV"] + "variant_type": ["SNV"] }, { "aliases": ["cgap-core:FileReference-reference_genome_hg38"], diff --git a/tests/test_yaml_metaworkflow.py b/tests/test_yaml_metaworkflow.py index 8125dc2..1d599f3 100644 --- a/tests/test_yaml_metaworkflow.py +++ b/tests/test_yaml_metaworkflow.py @@ -89,7 +89,7 @@ def test_metaworkflow(): "GT_vcf": { "description": "output from gatk-GT", "higlass_file": True, - "variant_type": "SNV", + "variant_type": ["SNV"], "data_category": ["Sequencing Reads"], "data_type": ["Unaligned Reads"] } From 5c542253e1b84c7f9e48451b8dbb29f2308c8846 Mon Sep 17 00:00:00 2001 From: Michele Berselli Date: Thu, 16 Nov 2023 12:25:39 -0500 Subject: [PATCH 10/18] changed FileReference to ReferenceFile --- docs/yaml_file_format.rst | 8 ------ pipeline_utils/__main__.py | 2 +- pipeline_utils/lib/yaml_parser.py | 24 ++++++++-------- pipeline_utils/pipeline_deploy.py | 18 ++++++------ pipeline_utils/schemas/yaml_file_format.py | 2 +- ...le_reference.py => yaml_reference_file.py} | 28 
+++++++++---------- tests/test_check_schemas.py | 6 ++-- tests/test_yaml_file_format.py | 4 +-- tests/test_yaml_file_reference.py | 8 +++--- tests/test_yaml_metaworkflow.py | 10 +++---- 10 files changed, 51 insertions(+), 59 deletions(-) rename pipeline_utils/schemas/{yaml_file_reference.py => yaml_reference_file.py} (65%) diff --git a/docs/yaml_file_format.rst b/docs/yaml_file_format.rst index 44aaa8a..0f10970 100644 --- a/docs/yaml_file_format.rst +++ b/docs/yaml_file_format.rst @@ -25,8 +25,6 @@ Template # https://github.com/dbmi-bgm/cgap-portal/tree/master/src/encoded/schemas secondary_formats: - # bam, fastq, bwt, ... - file_types: - - # FileReference, FileProcessed, FileSubmitted status: # shared @@ -57,9 +55,3 @@ secondary_formats ----------------- List of secondary ```` available for the file format. Each ```` needs to match a file format that has been previously defined. - -file_types ----------- -File types that can use the file format. -List of ````. The possible values are ``FileReference``, ``FileProcessed`` and ``FileSubmitted``. -Default value if not specified is ``FileReference`` and ``FileProcessed``. diff --git a/pipeline_utils/__main__.py b/pipeline_utils/__main__.py index 9a952ce..c2a8e9a 100644 --- a/pipeline_utils/__main__.py +++ b/pipeline_utils/__main__.py @@ -57,7 +57,7 @@ def main(args=None): pipeline_deploy_parser.add_argument('--post-software', action='store_true', help='POST|PATCH Software objects') pipeline_deploy_parser.add_argument('--post-file-format', action='store_true', help='POST|PATCH FileFormat objects') - pipeline_deploy_parser.add_argument('--post-file-reference', action='store_true', help='POST|PATCH FileReference objects') + pipeline_deploy_parser.add_argument('--post-file-reference', action='store_true', help='POST|PATCH ReferenceFile objects') pipeline_deploy_parser.add_argument('--post-workflow', action='store_true', help='POST|PATCH Workflow objects') pipeline_deploy_parser.add_argument('--post-metaworkflow', action='store_true', help='POST|PATCH MetaWorkflow objects') pipeline_deploy_parser.add_argument('--post-wfl', action='store_true', help='Upload Workflow Description files (.cwl, .wdl)') diff --git a/pipeline_utils/lib/yaml_parser.py b/pipeline_utils/lib/yaml_parser.py index 93e8311..d1d71ff 100644 --- a/pipeline_utils/lib/yaml_parser.py +++ b/pipeline_utils/lib/yaml_parser.py @@ -21,7 +21,7 @@ from pipeline_utils.schemas.yaml_workflow import yaml_workflow_schema from pipeline_utils.schemas.yaml_metaworkflow import yaml_metaworkflow_schema from pipeline_utils.schemas.yaml_software import yaml_software_schema -from pipeline_utils.schemas.yaml_file_reference import yaml_file_reference_schema +from pipeline_utils.schemas.yaml_reference_file import yaml_reference_file_schema from pipeline_utils.schemas.yaml_file_format import yaml_file_format_schema @@ -117,7 +117,7 @@ class YAMLTemplate(object): WORKFLOW_TYPE_SCHEMA = 'Workflow' METAWORKFLOW_TYPE_SCHEMA = 'MetaWorkflow' FILEFORMAT_TYPE_SCHEMA = 'FileFormat' - FILEREFERENCE_TYPE_SCHEMA = 'FileReference' + REFERENCEFILE_TYPE_SCHEMA = 'ReferenceFile' SOFTWARE_TYPE_SCHEMA = 'Software' VARIANT_TYPE_SCHEMA = "variant_type" @@ -378,17 +378,17 @@ def _arguments(self, input, consortia): # - bar@v3 # need to convert to: # files: [ - # {file: ':FileReference-foo_v1'} + # {file: ':ReferenceFile-foo_v1'} # ] # ----- or ------- # files: [ - # {file: ':FileReference-foo_v1', dimension: '0'}, - # {file: ':FileReference-bar_v3', dimension: '1'} + # {file: ':ReferenceFile-foo_v1', dimension: '0'}, + # {file: 
':ReferenceFile-bar_v3', dimension: '1'} # ] if k == self.FILES_SCHEMA: v_ = [] for i, name_ in enumerate(v): - v_.append({self.FILE_SCHEMA: f'{self._string_consortia(consortia)}:{self.FILEREFERENCE_TYPE_SCHEMA}-{name_.replace("@", "_")}', + v_.append({self.FILE_SCHEMA: f'{self._string_consortia(consortia)}:{self.REFERENCEFILE_TYPE_SCHEMA}-{name_.replace("@", "_")}', self.DIMENSION_SCHEMA: str(i)}) # remove DIMENSION_SCHEMA field if only one file if len(v_) == 1: @@ -556,10 +556,10 @@ def to_json( ############################################################### -# YAMLFileReference, YAML FileReference +# YAMLReferenceFile, YAML ReferenceFile ############################################################### -class YAMLFileReference(YAMLTemplate): - """Class to work with YAML documents representing FileReference objects. +class YAMLReferenceFile(YAMLTemplate): + """Class to work with YAML documents representing ReferenceFile objects. """ # schema constants @@ -570,7 +570,7 @@ class YAMLFileReference(YAMLTemplate): def __init__(self, data): """Constructor method. """ - super().__init__(data, yaml_file_reference_schema) + super().__init__(data, yaml_reference_file_schema) # validate data with schema self._validate() # load attributes @@ -593,7 +593,7 @@ def to_json( ref_json[self.CONSORTIA_SCHEMA] = consortia ref_json[self.DESCRIPTION_SCHEMA] = self.description ref_json[self.FILE_FORMAT_SCHEMA] = self.format - ref_json[self.ALIASES_SCHEMA] = [f'{self._string_consortia(consortia)}:{self.FILEREFERENCE_TYPE_SCHEMA}-{self.name}_{self.version}'] + ref_json[self.ALIASES_SCHEMA] = [f'{self._string_consortia(consortia)}:{self.REFERENCEFILE_TYPE_SCHEMA}-{self.name}_{self.version}'] # check for secondary files if getattr(self, self.SECONDARY_FILES_SCHEMA, None): ref_json[self.EXTRA_FILES_SCHEMA] = getattr(self, self.SECONDARY_FILES_SCHEMA) @@ -662,7 +662,7 @@ def to_json( frmt_json[self.CONSORTIA_SCHEMA] = consortia frmt_json[self.DESCRIPTION_SCHEMA] = self.description frmt_json[self.STANDARD_FILE_EXTENSION_SCHEMA] = self.extension - # frmt_json[self.VALID_ITEM_TYPES_SCHEMA] = getattr(self, self.FILE_TYPES_SCHEMA, ['FileReference', 'FileProcessed']) + # frmt_json[self.VALID_ITEM_TYPES_SCHEMA] = getattr(self, self.FILE_TYPES_SCHEMA, ['ReferenceFile', 'FileProcessed']) # check for secondary formats if getattr(self, self.SECONDARY_FORMATS_SCHEMA, None): frmt_json[self.EXTRA_FILE_FORMATS_SCHEMA] = getattr(self, self.SECONDARY_FORMATS_SCHEMA) diff --git a/pipeline_utils/pipeline_deploy.py b/pipeline_utils/pipeline_deploy.py index 791a028..9c6caa7 100644 --- a/pipeline_utils/pipeline_deploy.py +++ b/pipeline_utils/pipeline_deploy.py @@ -75,7 +75,7 @@ def __init__(self, args, repo, version='VERSION', pipeline='PIPELINE'): self.object_ = { 'Software': yaml_parser.YAMLSoftware, 'FileFormat': yaml_parser.YAMLFileFormat, - 'FileReference': yaml_parser.YAMLFileReference, + 'ReferenceFile': yaml_parser.YAMLReferenceFile, 'Workflow': yaml_parser.YAMLWorkflow, 'MetaWorkflow': yaml_parser.YAMLMetaWorkflow } @@ -83,11 +83,11 @@ def __init__(self, args, repo, version='VERSION', pipeline='PIPELINE'): # .yaml files 'Software': 'portal_objects/software.yaml', 'FileFormat': 'portal_objects/file_format.yaml', - 'FileReference': 'portal_objects/file_reference.yaml', + 'ReferenceFile': 'portal_objects/file_reference.yaml', # .yml files 'Software_yml': 'portal_objects/software.yml', 'FileFormat_yml': 'portal_objects/file_format.yml', - 'FileReference_yml': 'portal_objects/file_reference.yml', + 'ReferenceFile_yml': 
'portal_objects/file_reference.yml', # folders 'Workflow': 'portal_objects/workflows', 'MetaWorkflow': 'portal_objects/metaworkflows', @@ -141,11 +141,11 @@ def _post_patch_json(self, data_json, type): except Exception: is_patch = False - # Exception for uploading of FileReference objects + # Exception for uploading of ReferenceFile objects # status -> uploading, uploaded # default is None -> the status will not be updated during patch, # and set to uploading if post for the first time - if type == 'FileReference': + if type == 'ReferenceFile': # main status if data_json['status'] is None: if is_patch: @@ -199,7 +199,7 @@ def _yaml_to_json(self, data_yaml, YAMLClass, **kwargs): def _post_patch_file(self, type): """ - 'Software', 'FileFormat', 'FileReference' + 'Software', 'FileFormat', 'ReferenceFile' """ logger.info(f'@ {type}...') @@ -403,9 +403,9 @@ def run_post_patch(self): if self.post_file_format: self._post_patch_file('FileFormat') - # FileReference + # ReferenceFile if self.post_file_reference: - self._post_patch_file('FileReference') + self._post_patch_file('ReferenceFile') # Workflow if self.post_workflow: @@ -432,7 +432,7 @@ def main(args): For each repository a PostPatchRepo object is created to: - Create and POST|PATCH to database objects in JSON format for - Workflow, MetaWorkflow, FileReference, FileFormat, and Software components + Workflow, MetaWorkflow, ReferenceFile, FileFormat, and Software components - PUSH workflow descriptions to target S3 bucket - BUILD Docker images and PUSH to target ECR folder """ diff --git a/pipeline_utils/schemas/yaml_file_format.py b/pipeline_utils/schemas/yaml_file_format.py index 190f714..023c5d1 100644 --- a/pipeline_utils/schemas/yaml_file_format.py +++ b/pipeline_utils/schemas/yaml_file_format.py @@ -25,7 +25,7 @@ # schema.TYPE: schema.ARRAY, # schema.ITEMS: { # schema.TYPE: schema.STRING, - # schema.PATTERN: 'FileReference|FileProcessed|FileSubmitted|FileFastq' + # schema.PATTERN: 'ReferenceFile|FileProcessed|FileSubmitted|FileFastq' # } # }, 'status': { diff --git a/pipeline_utils/schemas/yaml_file_reference.py b/pipeline_utils/schemas/yaml_reference_file.py similarity index 65% rename from pipeline_utils/schemas/yaml_file_reference.py rename to pipeline_utils/schemas/yaml_reference_file.py index c7bbdb4..7e19a99 100644 --- a/pipeline_utils/schemas/yaml_file_reference.py +++ b/pipeline_utils/schemas/yaml_reference_file.py @@ -1,64 +1,64 @@ from pipeline_utils.schemas import schema -yaml_file_reference_schema = { +yaml_reference_file_schema = { ## Schema ######################### schema.SCHEMA: 'https://json-schema.org/draft/2020-12/schema', - schema.ID: '/schemas/YAMLFileReference', - schema.TITLE: 'YAMLFileReference', - schema.DESCRIPTION: 'Schema to validate a YAML description of a FileReference', + schema.ID: '/schemas/YAMLReferenceFile', + schema.TITLE: 'YAMLReferenceFile', + schema.DESCRIPTION: 'Schema to validate a YAML description of a ReferenceFile', schema.TYPE: schema.OBJECT, schema.PROPERTIES: { 'name': { - schema.DESCRIPTION: 'Name of the FileReference', + schema.DESCRIPTION: 'Name of the ReferenceFile', schema.TYPE: schema.STRING }, 'description': { - schema.DESCRIPTION: 'Description of the FileReference', + schema.DESCRIPTION: 'Description of the ReferenceFile', schema.TYPE: schema.STRING }, 'format': { - schema.DESCRIPTION: 'Format of the FileReference', + schema.DESCRIPTION: 'Format of the ReferenceFile', schema.TYPE: schema.STRING }, 'category': { - schema.DESCRIPTION: 'Categories of the FileReference', + 
schema.DESCRIPTION: 'Categories of the ReferenceFile', schema.TYPE: schema.ARRAY, schema.ITEMS: { schema.TYPE: schema.STRING } }, 'type': { - schema.DESCRIPTION: 'Types of the FileReference', + schema.DESCRIPTION: 'Types of the ReferenceFile', schema.TYPE: schema.ARRAY, schema.ITEMS: { schema.TYPE: schema.STRING } }, 'variant_type': { - schema.DESCRIPTION: 'Types of variants in FileReference', + schema.DESCRIPTION: 'Types of variants in ReferenceFile', schema.TYPE: schema.ARRAY, schema.ITEMS: { schema.TYPE: schema.STRING } }, 'version': { - schema.DESCRIPTION: 'Version of the FileReference', + schema.DESCRIPTION: 'Version of the ReferenceFile', schema.TYPE: schema.STRING }, 'status': { - schema.DESCRIPTION: 'Status of the upload of the FileReference', + schema.DESCRIPTION: 'Status of the upload of the ReferenceFile', schema.TYPE: schema.STRING, schema.PATTERN: 'uploading|uploaded' }, 'secondary_files': { - schema.DESCRIPTION: 'Secondary files for the FileReference', + schema.DESCRIPTION: 'Secondary files for the ReferenceFile', schema.TYPE: schema.ARRAY, schema.ITEMS: { schema.TYPE: schema.STRING } }, 'license': { - schema.DESCRIPTION: 'License of the FileReference', + schema.DESCRIPTION: 'License of the ReferenceFile', schema.TYPE: schema.STRING } }, diff --git a/tests/test_check_schemas.py b/tests/test_check_schemas.py index b7a5c8b..4a3bace 100644 --- a/tests/test_check_schemas.py +++ b/tests/test_check_schemas.py @@ -11,7 +11,7 @@ from pipeline_utils.schemas.yaml_workflow import yaml_workflow_schema from pipeline_utils.schemas.yaml_metaworkflow import yaml_metaworkflow_schema from pipeline_utils.schemas.yaml_software import yaml_software_schema -from pipeline_utils.schemas.yaml_file_reference import yaml_file_reference_schema +from pipeline_utils.schemas.yaml_reference_file import yaml_reference_file_schema from pipeline_utils.schemas.yaml_file_format import yaml_file_format_schema ############################################################### @@ -30,10 +30,10 @@ def test_yaml_file_format_schema(): """ Draft202012Validator.check_schema(yaml_file_format_schema) -def test_yaml_file_reference_schema(): +def test_yaml_reference_file_schema(): """ """ - Draft202012Validator.check_schema(yaml_file_reference_schema) + Draft202012Validator.check_schema(yaml_reference_file_schema) def test_yaml_software_schema(): """ diff --git a/tests/test_yaml_file_format.py b/tests/test_yaml_file_format.py index 5655e04..aa194e1 100644 --- a/tests/test_yaml_file_format.py +++ b/tests/test_yaml_file_format.py @@ -21,7 +21,7 @@ def test_file_format(): "consortia": ["cgap-core"], "standard_file_extension": "bam", "status": "shared" - # "valid_item_types": ["FileReference", "FileProcessed"] + # "valid_item_types": ["ReferenceFile", "FileProcessed"] }, { "accession": 'GAPFIXRDPDK1', @@ -32,7 +32,7 @@ def test_file_format(): "consortia": ["cgap-core"], "standard_file_extension": "bam.bai", "status": "shared", - # "valid_item_types": ["FileReference", "FileProcessed"], + # "valid_item_types": ["ReferenceFile", "FileProcessed"], "uuid": '1936f246-22e1-45dc-bb5c-9cfd55537fe9' } ] diff --git a/tests/test_yaml_file_reference.py b/tests/test_yaml_file_reference.py index 95e71c2..829181d 100644 --- a/tests/test_yaml_file_reference.py +++ b/tests/test_yaml_file_reference.py @@ -14,7 +14,7 @@ def test_file_reference(): res = [ { "accession": "GAPFIXRDPDK5", - "aliases": ["cgap-core:FileReference-reference_genome_hg38"], + "aliases": ["cgap-core:ReferenceFile-reference_genome_hg38"], "description": "hg38 full reference 
genome plus decoy for CGAP, fasta format", "extra_files": ["fa_fai", "dict"], "file_format": "fa", @@ -27,7 +27,7 @@ def test_file_reference(): "variant_type": ["SNV"] }, { - "aliases": ["cgap-core:FileReference-reference_genome_hg38"], + "aliases": ["cgap-core:ReferenceFile-reference_genome_hg38"], "description": "hg38 full reference genome plus decoy for CGAP, fasta format", "file_format": "fa", "submission_centers": ["hms-dbmi"], @@ -40,7 +40,7 @@ def test_file_reference(): for i, d in enumerate(yaml_parser.load_yaml('tests/repo_correct/portal_objects/file_reference.yaml')): # creating JSON object - d_ = yaml_parser.YAMLFileReference(d).to_json( + d_ = yaml_parser.YAMLReferenceFile(d).to_json( submission_centers=["hms-dbmi"], consortia=["cgap-core"] ) @@ -54,7 +54,7 @@ def test_file_reference_error(): for i, d in enumerate(yaml_parser.load_yaml('tests/repo_error/portal_objects/file_reference.yaml')): try: # creating JSON object - d_ = yaml_parser.YAMLFileReference(d).to_json( + d_ = yaml_parser.YAMLReferenceFile(d).to_json( submission_centers=["hms-dbmi"], consortia=["cgap-core"] ) diff --git a/tests/test_yaml_metaworkflow.py b/tests/test_yaml_metaworkflow.py index 1d599f3..02f7bf5 100644 --- a/tests/test_yaml_metaworkflow.py +++ b/tests/test_yaml_metaworkflow.py @@ -26,7 +26,7 @@ def test_metaworkflow(): { "argument_name": "reference", "argument_type": "file", - "files": [{"file": "cgap-core:FileReference-reference_genome_hg38"}] + "files": [{"file": "cgap-core:ReferenceFile-reference_genome_hg38"}] }, { "argument_name": "samples", @@ -129,8 +129,8 @@ def test_metaworkflow(): { "argument_name": "reference", "argument_type": "file", - "files": [{"dimension": "0", "file": "cgap-core_cgap-test:FileReference-reference_genome_hg38"}, - {"dimension": "1", "file": "cgap-core_cgap-test:FileReference-reference_bam_hg38"}] + "files": [{"dimension": "0", "file": "cgap-core_cgap-test:ReferenceFile-reference_genome_hg38"}, + {"dimension": "1", "file": "cgap-core_cgap-test:ReferenceFile-reference_bam_hg38"}] }, { "argument_name": "samples", @@ -214,8 +214,8 @@ def test_qc_ruleset(): { "argument_name": "reference", "argument_type": "file", - "files": [{"dimension": "0", "file": "cgap-core_cgap-test:FileReference-reference_genome_hg38"}, - {"dimension": "1", "file": "cgap-core_cgap-test:FileReference-reference_bam_hg38"}] + "files": [{"dimension": "0", "file": "cgap-core_cgap-test:ReferenceFile-reference_genome_hg38"}, + {"dimension": "1", "file": "cgap-core_cgap-test:ReferenceFile-reference_bam_hg38"}] }, { "argument_name": "samples", From 429a3daa09a12d8ef7e071fe9c427cf7695fe42f Mon Sep 17 00:00:00 2001 From: Michele Berselli Date: Thu, 16 Nov 2023 12:51:42 -0500 Subject: [PATCH 11/18] . 
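Guard the expansion of extra_files: a ReferenceFile that declares no secondary_files never receives an extra_files key from to_json, so the previous unconditional loop raised a KeyError on such objects. A minimal sketch of the guarded behavior, written as a hypothetical standalone helper rather than the real _post_patch_json method:

    # Hedged sketch; names mirror the patch below, but this helper is
    # illustrative and not part of the module.
    def expand_extra_files(data_json):
        # only expand when the key is present and non-empty
        if data_json.get('extra_files'):
            data_json['extra_files'] = [
                {'file_format': ext,
                 'status': data_json.get('status', 'uploaded')}
                for ext in data_json['extra_files']
            ]
        return data_json

    # no extra_files: payload passes through untouched
    assert expand_extra_files({'status': 'uploading'}) == {'status': 'uploading'}
    # with extra_files: each format expands to a dict carrying a status
    assert expand_extra_files(
        {'status': 'uploaded', 'extra_files': ['fa_fai', 'dict']}
    )['extra_files'] == [{'file_format': 'fa_fai', 'status': 'uploaded'},
                         {'file_format': 'dict', 'status': 'uploaded'}]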
--- pipeline_utils/pipeline_deploy.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/pipeline_utils/pipeline_deploy.py b/pipeline_utils/pipeline_deploy.py index 9c6caa7..e559d66 100644 --- a/pipeline_utils/pipeline_deploy.py +++ b/pipeline_utils/pipeline_deploy.py @@ -154,14 +154,15 @@ def _post_patch_json(self, data_json, type): data_json['status'] = 'uploading' # extra_files status - extra_files_ = [] - for ext in data_json['extra_files']: - ext_ = { - 'file_format': ext, - 'status': data_json.get('status', 'uploaded') - } - extra_files_.append(ext_) - data_json['extra_files'] = extra_files_ + if data_json.get('extra_files'): + extra_files_ = [] + for ext in data_json['extra_files']: + ext_ = { + 'file_format': ext, + 'status': data_json.get('status', 'uploaded') + } + extra_files_.append(ext_) + data_json['extra_files'] = extra_files_ ########################################################### if is_patch: From b32dfe4184539211acf65d62104fbac3caa2793e Mon Sep 17 00:00:00 2001 From: Michele Berselli Date: Mon, 20 Nov 2023 14:06:51 -0500 Subject: [PATCH 12/18] . --- pipeline_utils/lib/yaml_parser.py | 3 --- tests/test_yaml_metaworkflow.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/pipeline_utils/lib/yaml_parser.py b/pipeline_utils/lib/yaml_parser.py index d1d71ff..35c1165 100644 --- a/pipeline_utils/lib/yaml_parser.py +++ b/pipeline_utils/lib/yaml_parser.py @@ -390,9 +390,6 @@ def _arguments(self, input, consortia): for i, name_ in enumerate(v): v_.append({self.FILE_SCHEMA: f'{self._string_consortia(consortia)}:{self.REFERENCEFILE_TYPE_SCHEMA}-{name_.replace("@", "_")}', self.DIMENSION_SCHEMA: str(i)}) - # remove DIMENSION_SCHEMA field if only one file - if len(v_) == 1: - del v_[0][self.DIMENSION_SCHEMA] argument_.setdefault(k, v_) elif k == self.QC_THRESHOLDS_SCHEMA: v_ = { diff --git a/tests/test_yaml_metaworkflow.py b/tests/test_yaml_metaworkflow.py index 02f7bf5..60a57f2 100644 --- a/tests/test_yaml_metaworkflow.py +++ b/tests/test_yaml_metaworkflow.py @@ -26,7 +26,7 @@ def test_metaworkflow(): { "argument_name": "reference", "argument_type": "file", - "files": [{"file": "cgap-core:ReferenceFile-reference_genome_hg38"}] + "files": [{"file": "cgap-core:ReferenceFile-reference_genome_hg38", "dimension": "0"}] }, { "argument_name": "samples", From 4ef1bbdd54bad3b5d40baadc00726e0b5a76e02e Mon Sep 17 00:00:00 2001 From: Michele Berselli Date: Tue, 21 Nov 2023 13:50:07 -0500 Subject: [PATCH 13/18] added logger info to push through objects failing portal validation --- pipeline_utils/pipeline_deploy.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pipeline_utils/pipeline_deploy.py b/pipeline_utils/pipeline_deploy.py index e559d66..54944ed 100644 --- a/pipeline_utils/pipeline_deploy.py +++ b/pipeline_utils/pipeline_deploy.py @@ -165,10 +165,16 @@ def _post_patch_json(self, data_json, type): data_json['extra_files'] = extra_files_ ########################################################### - if is_patch: - ff_utils.patch_metadata(data_json, uuid, key=self.ff_key) - else: - ff_utils.post_metadata(data_json, type, key=self.ff_key) + try: + if is_patch: + ff_utils.patch_metadata(data_json, uuid, key=self.ff_key) + else: + ff_utils.post_metadata(data_json, type, key=self.ff_key) + except Exception as E: + # this will skip and report errors during patching and posting + logger.info('> FAILED PORTAL VALIDATION') + logger.info(E) + pass logger.info('> Posted %s' % data_json['aliases'][0]) From
88c44b845cf7d499121762cfb5754adebef5dfc0 Mon Sep 17 00:00:00 2001 From: Michele Berselli Date: Tue, 21 Nov 2023 14:47:17 -0500 Subject: [PATCH 14/18] . --- pipeline_utils/lib/yaml_parser.py | 2 +- tests/repo_correct/portal_objects/file_format.yaml | 2 +- tests/test_yaml_file_format.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pipeline_utils/lib/yaml_parser.py b/pipeline_utils/lib/yaml_parser.py index 35c1165..17e2a6e 100644 --- a/pipeline_utils/lib/yaml_parser.py +++ b/pipeline_utils/lib/yaml_parser.py @@ -663,7 +663,7 @@ def to_json( # check for secondary formats if getattr(self, self.SECONDARY_FORMATS_SCHEMA, None): frmt_json[self.EXTRA_FILE_FORMATS_SCHEMA] = getattr(self, self.SECONDARY_FORMATS_SCHEMA) - frmt_json[self.STATUS_SCHEMA] = getattr(self, self.STATUS_SCHEMA, 'shared') + frmt_json[self.STATUS_SCHEMA] = getattr(self, self.STATUS_SCHEMA, 'released') # uuid, accession if specified if getattr(self, self.UUID_SCHEMA, None): diff --git a/tests/repo_correct/portal_objects/file_format.yaml b/tests/repo_correct/portal_objects/file_format.yaml index 81408ba..80ab7b9 100644 --- a/tests/repo_correct/portal_objects/file_format.yaml +++ b/tests/repo_correct/portal_objects/file_format.yaml @@ -8,7 +8,7 @@ secondary_formats: # file_types: # - FileReference # - FileProcessed -status: shared +status: obsolete --- diff --git a/tests/test_yaml_file_format.py b/tests/test_yaml_file_format.py index aa194e1..a9c920e 100644 --- a/tests/test_yaml_file_format.py +++ b/tests/test_yaml_file_format.py @@ -20,7 +20,7 @@ def test_file_format(): "submission_centers": ["hms-dbmi"], "consortia": ["cgap-core"], "standard_file_extension": "bam", - "status": "shared" + "status": "obsolete" # "valid_item_types": ["ReferenceFile", "FileProcessed"] }, { @@ -31,7 +31,7 @@ def test_file_format(): "submission_centers": ["hms-dbmi"], "consortia": ["cgap-core"], "standard_file_extension": "bam.bai", - "status": "shared", + "status": "released", # "valid_item_types": ["ReferenceFile", "FileProcessed"], "uuid": '1936f246-22e1-45dc-bb5c-9cfd55537fe9' } From b847737e37150bcf8f7ceb6b1dbd45bb7fe8e6f3 Mon Sep 17 00:00:00 2001 From: Michele Berselli Date: Wed, 29 Nov 2023 10:22:24 -0500 Subject: [PATCH 15/18] . --- pipeline_utils/lib/yaml_parser.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pipeline_utils/lib/yaml_parser.py b/pipeline_utils/lib/yaml_parser.py index 17e2a6e..0f8ffca 100644 --- a/pipeline_utils/lib/yaml_parser.py +++ b/pipeline_utils/lib/yaml_parser.py @@ -390,6 +390,11 @@ def _arguments(self, input, consortia): for i, name_ in enumerate(v): v_.append({self.FILE_SCHEMA: f'{self._string_consortia(consortia)}:{self.REFERENCEFILE_TYPE_SCHEMA}-{name_.replace("@", "_")}', self.DIMENSION_SCHEMA: str(i)}) + # remove DIMENSION_SCHEMA field if only one file + # this is necessary so the file will be posted as a string and not a list + # having a list will prevent tibanna from creating the correct input for cwltool + if len(v_) == 1: + del v_[0][self.DIMENSION_SCHEMA] argument_.setdefault(k, v_) elif k == self.QC_THRESHOLDS_SCHEMA: v_ = { From e3abacee456fe7d240c4f756402ebfdaf20c96a0 Mon Sep 17 00:00:00 2001 From: Michele Berselli Date: Wed, 29 Nov 2023 10:26:34 -0500 Subject: [PATCH 16/18] .
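Restores the pre-[PATCH 12] expectation: with the dimension-stripping logic back in place since [PATCH 15], a files argument that expands to a single reference is posted without the dimension field. A hedged sketch of the mapping this test pins down, where expand_files is an illustrative stand-in for the _arguments internals and the consortia prefix is simplified to a plain string:

    def expand_files(names, consortia='cgap-core'):
        files = [
            {'file': f"{consortia}:ReferenceFile-{name.replace('@', '_')}",
             'dimension': str(i)}
            for i, name in enumerate(names)
        ]
        if len(files) == 1:
            # single file: drop 'dimension' so the value behaves as a
            # string input and not a one-element list (see [PATCH 15])
            del files[0]['dimension']
        return files

    assert expand_files(['reference_genome@hg38']) == [
        {'file': 'cgap-core:ReferenceFile-reference_genome_hg38'}]
    assert expand_files(['foo@v1', 'bar@v3'])[1] == {
        'file': 'cgap-core:ReferenceFile-bar_v3', 'dimension': '1'}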
--- tests/test_yaml_metaworkflow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_yaml_metaworkflow.py b/tests/test_yaml_metaworkflow.py index 60a57f2..02f7bf5 100644 --- a/tests/test_yaml_metaworkflow.py +++ b/tests/test_yaml_metaworkflow.py @@ -26,7 +26,7 @@ def test_metaworkflow(): { "argument_name": "reference", "argument_type": "file", - "files": [{"file": "cgap-core:ReferenceFile-reference_genome_hg38", "dimension": "0"}] + "files": [{"file": "cgap-core:ReferenceFile-reference_genome_hg38"}] }, { "argument_name": "samples", From 493d6969aa187228619a7727641616a5ae560d38 Mon Sep 17 00:00:00 2001 From: Michele Berselli Date: Mon, 4 Dec 2023 16:20:23 -0500 Subject: [PATCH 17/18] . --- CHANGELOG.rst | 6 +- LOG.md | 8 - README.md | 2 +- docs/conf.py | 4 +- docs/deploy_pipeline.rst | 20 +-- docs/functions.rst | 47 +----- docs/index.rst | 8 +- docs/pipeline_utils.rst | 8 +- docs/yaml_file_format.rst | 5 +- docs/yaml_file_reference.rst | 17 +- docs/yaml_metaworkflow.rst | 19 ++- docs/yaml_software.rst | 16 +- docs/yaml_workflow.rst | 17 +- pipeline_utils/lib/check_lines.py | 247 ------------------------------ pyproject.toml | 2 +- 15 files changed, 71 insertions(+), 355 deletions(-) delete mode 100644 LOG.md delete mode 100644 pipeline_utils/lib/check_lines.py diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f46ecfc..531261c 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,8 +4,6 @@ Change Log ========== -3.0.0 +0.0.1 ===== -* 2023-10-10 -* Added this CHANGELOG.rst file. -* Upgrade to Python 3.11. +* Initial release diff --git a/LOG.md b/LOG.md deleted file mode 100644 index fbaf79c..0000000 --- a/LOG.md +++ /dev/null @@ -1,8 +0,0 @@ -### Version Updates - -#### v2.1.0 - * Added support for updated QCs, to enable the new generic schema ``quality_metric_generic`` - - -#### v2.0.0 - * Initial release after major changes to support the new YAML format for portal objects diff --git a/README.md b/README.md index f7fb627..a2aa59a 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,6 @@ To install from source: make update make build -To check that the software is correctly installed, try to run `pipeline_utils`. If installed from source, this command may fail with a bash “command not found” error, try `poetry run pipeline_utils` instead. +To check that the software is correctly installed, try to run `smaht_pipeline_utils`. If installed from source, this command may fail with a bash “command not found” error, try `poetry run smaht_pipeline_utils` instead. See `make info` for details on make targets. diff --git a/docs/conf.py b/docs/conf.py index d983ecb..56edc83 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -18,11 +18,11 @@ # -- Project information ----------------------------------------------------- project = 'smaht-pipeline-utils' -copyright = '2021, HMS DBMI' +copyright = '2023, HMS DBMI' author = 'Michele Berselli, SMaHT Team' # The full version, including alpha/beta/rc tags -release = '2.1.0' +release = '0.0.1' # -- General configuration --------------------------------------------------- diff --git a/docs/deploy_pipeline.rst b/docs/deploy_pipeline.rst index d110ce4..fbb1ef3 100644 --- a/docs/deploy_pipeline.rst +++ b/docs/deploy_pipeline.rst @@ -142,7 +142,7 @@ Example of a key-pair entry: } } -```` is the namespace for the environment and can be found in the portal health page (e.g., cgap-wolf). +```` is the namespace for the environment and can be found in the portal health page (e.g., smaht-wolf). .. 
_account_vars: @@ -154,21 +154,21 @@ Finally we need to setup the information to identify the target environment to u .. code-block:: bash # Set the namespace of the target environment - # e.g., cgap-wolf + # e.g., smaht-wolf export ENV_NAME= # Set the bucket used to store the workflow description files - # e.g., cgap-biotest-main-application-tibanna-cwls + # e.g., smaht-wolf-application-tibanna-cwls export WFL_BUCKET= # Set the path to the keypair file with the portal credential export KEYDICTS_JSON=~/.cgap-keys.json - # Set up project and institution - # Project and institution need to correspond to metadata present on the portal - # e.g., cgap-core and hms-dbmi - export PROJECT= - export INSTITUTION= + # Set up consortia and submission centers + # consortia and submission_centers need to correspond to metadata present on the portal + # e.g., ['smaht'] and ['smaht_dac'] + export CONSORTIA= + export SUBMISSION_CENTERS= # If running sentieon code, # specify the address for the server that validates the software license @@ -194,8 +194,8 @@ by the ``--repos`` argument. --wfl-bucket ${WFL_BUCKET} \ --account ${AWS_ACCOUNT_NUMBER} \ --region ${TIBANNA_AWS_REGION} \ - --project ${PROJECT} \ - --institution ${INSTITUTION} \ + --consortia ${CONSORTIA} \ + --submission-centers ${SUBMISSION_CENTERS} \ --sentieon-server ${SENTIEON_LICENSE} \ --post-software \ --post-file-format \ diff --git a/docs/functions.rst b/docs/functions.rst index 49378d8..078930f 100644 --- a/docs/functions.rst +++ b/docs/functions.rst @@ -2,49 +2,4 @@ Functions ========= -Collection of utilities available as functions: - - - :ref:`check_lines ` - -.. _check_lines: - -check_lines -+++++++++++ - -*check_lines* function can be used to check that line counts are matching between the output of two steps where lines should not be dropped (i.e., any steps that modify without filtering), or between an output ``bam`` and the input ``fastq`` files. -Requires uuid for the *MetaWorkflowRun* object to check and ff_key to access the metadata on the portal. The steps to compare are specified as dictionaries, examples below. - -.. code-block:: python - - from pipeline_utils.lib import check_lines - - result = check_lines.check_lines(metawfr_uuid, ff_key, steps=steps_dict, fastqs=fastqs_dict) - - # metawfr_uuid - # -> uuid for MetaWorkflowRun object - - # ff_key - # -> key to authenticate on the portal - - ## steps_dict example - # steps_dict = { - # 'workflow_add-readgroups-check': { - # 'dependency': 'workflow_bwa-mem_no_unzip-check', - # 'output': 'bam_w_readgroups', - # 'output_match': 'raw_bam', - # 'key': 'Total Reads', - # 'key_match': 'Total Reads' - # }, - # ... - # } - - ## fastqs_dict example - # fastqs_dict = { - # 'workflow_bwa-mem_no_unzip-check': { - # 'output': 'raw_bam', - # 'input_match': ['fastq_R1', 'fastq_R2'], - # 'key': 'Total Reads', - # 'key_match': 'Total Sequences' - # }, - # ... - # } +In development. diff --git a/docs/index.rst b/docs/index.rst index 65d81e4..2c47eed 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,8 +1,8 @@ -========================= -Portal Pipeline Utilities -========================= +=============================== +SMaHT Portal Pipeline Utilities +=============================== -Documentation for smaht-pipeline-utils_, a collection of utilities for deploying pipelines and interfacing with portal infrastructure. +Documentation for smaht-pipeline-utils_, a collection of utilities for deploying pipelines and interfacing with SMaHT portal infrastructure. ..
_smaht-pipeline-utils: https://github.com/smaht-dac/smaht-pipeline-utils diff --git a/docs/pipeline_utils.rst b/docs/pipeline_utils.rst index 301bdc6..f087975 100644 --- a/docs/pipeline_utils.rst +++ b/docs/pipeline_utils.rst @@ -61,10 +61,10 @@ Usage: - AWS account to use for deployment * - *-\-region* - AWS account region to use for deployment - * - *-\-project* - - Project to use for deployment [cgap-core] - * - *-\-institution* - - Institution to use for deployment [hms-dbmi] + * - *-\-consortia* + - List of consortia to use for deployment [smaht] + * - *-\-submission-centers* + - List of centers to use for deployment [smaht_dac] * - *-\-post-software* - DEPLOY | UPDATE Software objects (.yaml or .yml) * - *-\-post-file-format* diff --git a/docs/yaml_file_format.rst b/docs/yaml_file_format.rst index 0f10970..531fe2d 100644 --- a/docs/yaml_file_format.rst +++ b/docs/yaml_file_format.rst @@ -22,10 +22,9 @@ Template # All the following fields are optional and provided as example, # can be expanded to anything accepted by the schema - # https://github.com/dbmi-bgm/cgap-portal/tree/master/src/encoded/schemas + # https://github.com/smaht-dac/smaht-portal/tree/main/src/encoded/schemas secondary_formats: - # bam, fastq, bwt, ... - status: # shared Fields Definition @@ -49,7 +48,7 @@ Description of the file format. Optional ^^^^^^^^ -All the following fields are optional and provided as example. Can be expanded to anything accepted by the schema, see `schemas `__. +All the following fields are optional and provided as example. Can be expanded to anything accepted by the schema, see `schemas `__. secondary_formats ----------------- diff --git a/docs/yaml_file_reference.rst b/docs/yaml_file_reference.rst index e30dd5c..ba961b3 100644 --- a/docs/yaml_file_reference.rst +++ b/docs/yaml_file_reference.rst @@ -21,9 +21,14 @@ Template format: # bam, fastq, bwt, ... version: + category: + - # Reference Genome, ... + type: + - # Reference Sequence, ... + # All the following fields are optional and provided as example, # can be expanded to anything accepted by the schema - # https://github.com/dbmi-bgm/cgap-portal/tree/master/src/encoded/schemas + # https://github.com/smaht-dac/smaht-portal/tree/main/src/encoded/schemas secondary_files: - # bam, fastq, bwt, ... status: # uploading, uploaded @@ -60,7 +65,7 @@ Version of the reference file. Optional ^^^^^^^^ -All the following fields are optional and provided as example. Can be expanded to anything accepted by the schema, see `schemas `__. +All the following fields are optional and provided as example. Can be expanded to anything accepted by the schema, see `schemas `__. secondary_files --------------- @@ -78,3 +83,11 @@ Most likely you don't want to set this field and just use the default logic auto license ------- License information. + +category +-------- +Categories for the reference file, see `schemas `__. + +type +---- +Types for the reference file, see `schemas `__. diff --git a/docs/yaml_metaworkflow.rst b/docs/yaml_metaworkflow.rst index ca43649..25e56de 100644 --- a/docs/yaml_metaworkflow.rst +++ b/docs/yaml_metaworkflow.rst @@ -20,6 +20,9 @@ Template name: description: + category: + - # Alignment, ... 
+ ## General arguments ######################################## # Pipeline input, reference files, and general arguments # define all arguments for the pipeline here @@ -61,7 +64,7 @@ Template # Allows to force a fixed shards structure ignoring # the input structure, scatter and gather dimensions #################################### - shards: [[], ..] # e.g., [['0'], ['1'], ['2']] + shards: [[], ..] # e.g., [['0'], ['1'], ['2']] ## Lock version #################### # Specific version to use @@ -81,7 +84,7 @@ Template # File argument : - argument_type: file. # bam, fastq, bwt ... + argument_type: file. # bam, fastq, bwt ... # Linking fields # These are optional fields # Check https://magma-suite.readthedocs.io/en/latest/meta-workflow.html @@ -157,6 +160,10 @@ description ----------- Description of the pipeline. +category +-------- +Categories for the pipeline, see `schemas `__. + input ----- Description of general input files and parameters for the pipeline. See :ref:`Input Definition `. @@ -167,11 +174,7 @@ Description of workflows that are steps of the pipeline. See :ref:`Workflows Def Optional ^^^^^^^^ -All the following fields are optional and provided as example. Can be expanded to anything accepted by the schema, see `schemas `__. - -title ------ -Title of the pipeline. +All the following fields are optional and provided as example. Can be expanded to anything accepted by the schema, see `schemas `__. .. _workflows: @@ -210,7 +213,7 @@ output Description of expected output files for the workflow. Each output is defined by its name. Additional subfields can be specified. -See `schemas `__. +See `schemas `__. Each output name needs to match an output name that has been previously defined in the corresponding workflow, see :ref:`Workflow `. diff --git a/docs/yaml_software.rst b/docs/yaml_software.rst index ccfa378..58adf6d 100644 --- a/docs/yaml_software.rst +++ b/docs/yaml_software.rst @@ -22,10 +22,12 @@ Template version: commit: + category: + - # Quality Control + # All the following fields are optional and provided as example, # can be expanded to anything accepted by the schema - # https://github.com/dbmi-bgm/cgap-portal/tree/master/src/encoded/schemas - title: + # https://github.com/smaht-dac/smaht-portal/tree/main/src/encoded/schemas source_url: description: license: # MIT, GPLv3, ... @@ -51,13 +53,13 @@ commit ------ Commit of the software. +category +-------- +Categories for the software, see `schemas `__. + Optional ^^^^^^^^ -All the following fields are optional and provided as example. Can be expanded to anything accepted by the schema, see `schemas `__. - -title ------ -Title for the software. +All the following fields are optional and provided as example. Can be expanded to anything accepted by the schema, see `schemas `__. source_url ---------- diff --git a/docs/yaml_workflow.rst b/docs/yaml_workflow.rst index ba63bd8..6d3ecf4 100644 --- a/docs/yaml_workflow.rst +++ b/docs/yaml_workflow.rst @@ -26,11 +26,12 @@ Template child: - # .cwl or .wdl file + category: + - # Annotation + # All the following fields are optional and provided as example, # can be expanded to anything accepted by the schema - # https://github.com/dbmi-bgm/cgap-portal/tree/master/src/encoded/schemas - title: - + # https://github.com/smaht-dac/smaht-portal/tree/main/src/encoded/schemas software: - @ @@ -99,6 +100,10 @@ Several subfields need to be specified: At the moment we support two standards, `Common Workflow Language `__ (CWL) and `Workflow Description Language `__ (WDL). 
+category +-------- +Categories for the workflow, see `schemas `__. + input ----- Description of input files and parameters for the workflow. See :ref:`Input Definition `. @@ -109,11 +114,7 @@ Description of expected outputs for the workflow. See :ref:`Output Definition `__. - -title ------ -Title of the workflow. +All the following fields are optional and provided as example. Can be expanded to anything accepted by the schema, see `schemas `__. software -------- diff --git a/pipeline_utils/lib/check_lines.py b/pipeline_utils/lib/check_lines.py deleted file mode 100644 index 7d26d2e..0000000 --- a/pipeline_utils/lib/check_lines.py +++ /dev/null @@ -1,247 +0,0 @@ -#!/usr/bin/env python3 - -################################################ -# -# check_lines -# implement a function to check line counts -# match between specified pipeline steps output -# -# Michele Berselli -# berselli.michele@gmail.com -# -################################################ - -################################################ -# Libraries -################################################ -import sys, os -from magma_ff.metawflrun import MetaWorkflowRun -from magma_ff import wfrutils -from dcicutils import ff_utils - -################################################ -# Variables -################################################ -steps_dict = { - # BAM - 'workflow_add-readgroups-check': { - 'dependency': 'workflow_bwa-mem_no_unzip-check', - 'output': 'bam_w_readgroups', - 'output_match': 'raw_bam', - 'key': 'Total Reads', - 'key_match': 'Total Reads' - }, - 'workflow_merge-bam-check': { - 'dependency': 'workflow_add-readgroups-check', - 'output': 'merged_bam', - 'output_match': 'bam_w_readgroups', - 'key': 'Total Reads', - 'key_match': 'Total Reads' - }, - 'workflow_picard-MarkDuplicates-check':{ - 'dependency': 'workflow_merge-bam-check', - 'output': 'dupmarked_bam', - 'output_match': 'merged_bam', - 'key': 'Total Reads', - 'key_match': 'Total Reads' - }, - 'workflow_sort-bam-check': { - 'dependency': 'workflow_picard-MarkDuplicates-check', - 'output': 'sorted_bam', - 'output_match': 'dupmarked_bam', - 'key': 'Total Reads', - 'key_match': 'Total Reads' - }, - 'workflow_gatk-ApplyBQSR-check': { - 'dependency': 'workflow_sort-bam-check', - 'output': 'recalibrated_bam', - 'output_match': 'sorted_bam', - 'key': 'Total Reads', - 'key_match': 'Total Reads' - }, - # VCF - 'workflow_samplegeno': { - 'dependency': 'workflow_gatk-GenotypeGVCFs-check', - 'output': 'samplegeno_vcf', - 'output_match': 'vcf', - 'key': 'Filtered Variants', - 'key_match': 'Filtered Variants' - }, - # 'workflow_vep-annot-check': { - # 'dependency': 'workflow_samplegeno', - # 'output': 'annotated_vcf', - # 'output_match': 'samplegeno_vcf', - # 'key': 'Total Variants Called', - # 'key_match': 'Filtered Variants' - # }, - 'workflow_granite-comHet-check': { - 'dependency': 'workflow_granite-filtering-check', - 'output': 'comHet_vcf', - 'output_match': 'merged_vcf', - 'key': 'Filtered Variants', - 'key_match': 'Filtered Variants' - }, - 'workflow_dbSNP_ID_fixer-check': { - 'dependency': 'workflow_granite-comHet-check', - 'output': 'vcf', - 'output_match': 'comHet_vcf', - 'key': 'Filtered Variants', - 'key_match': 'Filtered Variants' - }, - 'workflow_hg19lo_hgvsg-check': { - 'dependency': 'workflow_dbSNP_ID_fixer-check', - 'output': 'vcf', - 'output_match': 'vcf', - 'key': 'Filtered Variants', - 'key_match': 'Filtered Variants' - } - } - -fastqs_dict = { - 'workflow_bwa-mem_no_unzip-check': { - 'output': 'raw_bam', - 'input_match': ['fastq_R1', 
'fastq_R2'], - 'key': 'Total Reads', - 'key_match': 'Total Sequences' - } -} - -################################################ -# Functions -################################################ -################################################ -# check_lines -################################################ -def check_lines(metawfr_uuid, ff_key, steps=steps_dict, fastqs=fastqs_dict): - """ - """ - print('Meta Workflow:') - print(' -> ' + metawfr_uuid + '\n') - - # Get meta-workflow-run and create MetaWorkflowRun object - run_json = ff_utils.get_metadata(metawfr_uuid, add_on='?frame=raw&datastore=database', key=ff_key) - metawflrun_obj = MetaWorkflowRun(run_json) - - is_match = True - # Check fastqs - for _, run_obj in metawflrun_obj.runs.items(): - count, match_count = 0, 0 - if run_obj.name in fastqs: - if run_obj.status == 'completed': - # Get output count - for output in run_obj.output: - if output['argument_name'] == fastqs[run_obj.name]['output']: - output_uuid = output['file'] - qc_key = fastqs[run_obj.name]['key'] - count = int(get_count_qc(qc_key, output_uuid, ff_key)) - break - #end if - #end for - print('Shard:') - print(' -> ' + run_obj.shard_name + ', ' + str(count)) - - # Get input file to match from jobid - print('File/s to match:') - ffwr_obj = wfrutils.FFWfrUtils(env='env') - ffwr_obj._ff_key = ff_key - file_match = True - for file in ffwr_obj.wfr_metadata(run_obj.jobid)['input_files']: - if file['workflow_argument_name'] in fastqs[run_obj.name]['input_match']: - input_uuid = file['value']['uuid'] - qc_key = fastqs[run_obj.name]['key_match'] - match_count = int(get_count_fastqc(qc_key, input_uuid, ff_key)) - if not count == match_count: - is_match = False - file_match = False - #end if - print(' -> ' + file['workflow_argument_name'] + ', ' + str(match_count)) - #end if - #end for - print('Matching: ' + str(file_match) + '\n') - else: - print('Missing: ' + run_obj.name + '\n') - print('Completed: False\n') - return False - #end if - #end if - #end for - - # Check steps - for _, run_obj in metawflrun_obj.runs.items(): - count, total_count = 0, 0 - if run_obj.name in steps: - if run_obj.status == 'completed': - # Get output count - for output in run_obj.output: - if output['argument_name'] == steps[run_obj.name]['output']: - output_uuid = output['file'] - qc_key = steps[run_obj.name]['key'] - count = int(get_count_qc(qc_key, output_uuid, ff_key)) - break - #end if - #end for - print('Shard:') - print(' -> ' + run_obj.shard_name + ', ' + str(count)) - - # Get dependencies count - print('Shard/s to match (sum):') - for shard_name in run_obj.dependencies: - if shard_name.split(':')[0] == steps[run_obj.name]['dependency']: - run_obj_ = metawflrun_obj.runs[shard_name] - for output in run_obj_.output: - if output['argument_name'] == steps[run_obj.name]['output_match']: - output_uuid = output['file'] - qc_key = steps[run_obj.name]['key_match'] - count_ = int(get_count_qc(qc_key, output_uuid, ff_key)) - total_count += count_ - break - #end if - #end for - print(' -> ' + shard_name + ', ' + str(count_)) - #end if - #end for - print('Matching: ' + str(count == total_count) + '\n') - # Check counts match - if not count == total_count: - is_match = False - #end if - else: - print('Missing: ' + run_obj.name + '\n') - print('Completed: False\n') - return False - #end if - #end if - #end for - print('Completed: ' + str(is_match) + '\n') - return is_match -#end def - -def get_count_qc(qc_key, uuid, ff_key): - """ - """ - try: - res_json = ff_utils.get_metadata(uuid, 
add_on='?frame=raw&datastore=database', key=ff_key) - qc_uuid = res_json['quality_metric'] - qc_json = ff_utils.get_metadata(qc_uuid, add_on='?datastore=database', key=ff_key) - for qc in qc_json['quality_metric_summary']: - if qc['title'] == qc_key: - return qc['value'] - #end if - #end for - except KeyError: - return 0 - #end try -#end def - -def get_count_fastqc(qc_key, uuid, ff_key): - """ - """ - try: - res_json = ff_utils.get_metadata(uuid, add_on='?frame=raw&datastore=database', key=ff_key) - qc_uuid = res_json['quality_metric'] - qc_json = ff_utils.get_metadata(qc_uuid, add_on='?datastore=database', key=ff_key) - return qc_json[qc_key] - except KeyError: - return 0 - #end try -#end def diff --git a/pyproject.toml b/pyproject.toml index cad422b..56472c2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "smaht-pipeline-utils" -version = "3.0.0" +version = "0.0.1" description = "Utilities for deploying pipelines and interfacing with SMaHT portal infrastructure." authors = [ "Michele Berselli ", From a11d2d9f80e9f46bbef1e8d3757dad46492e81fe Mon Sep 17 00:00:00 2001 From: Michele Berselli Date: Mon, 4 Dec 2023 16:32:09 -0500 Subject: [PATCH 18/18] . --- docs/yaml_file_reference.rst | 4 ++-- docs/yaml_metaworkflow.rst | 2 +- docs/yaml_software.rst | 2 +- docs/yaml_workflow.rst | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/yaml_file_reference.rst b/docs/yaml_file_reference.rst index ba961b3..8ea8123 100644 --- a/docs/yaml_file_reference.rst +++ b/docs/yaml_file_reference.rst @@ -22,9 +22,9 @@ Template version: category: - - # Reference Genome, ... + - # Reference Genome, ... type: - - # Reference Sequence, ... + - # Reference Sequence, ... # All the following fields are optional and provided as example, # can be expanded to anything accepted by the schema diff --git a/docs/yaml_metaworkflow.rst b/docs/yaml_metaworkflow.rst index 25e56de..e3f2b53 100644 --- a/docs/yaml_metaworkflow.rst +++ b/docs/yaml_metaworkflow.rst @@ -21,7 +21,7 @@ Template description: category: - - # Alignment, ... + - # Alignment, ... ## General arguments ######################################## # Pipeline input, reference files, and general arguments diff --git a/docs/yaml_software.rst b/docs/yaml_software.rst index 58adf6d..058be6b 100644 --- a/docs/yaml_software.rst +++ b/docs/yaml_software.rst @@ -23,7 +23,7 @@ Template commit: category: - - # Quality Control + - # Quality Control # All the following fields are optional and provided as example, # can be expanded to anything accepted by the schema diff --git a/docs/yaml_workflow.rst b/docs/yaml_workflow.rst index 6d3ecf4..53b23a2 100644 --- a/docs/yaml_workflow.rst +++ b/docs/yaml_workflow.rst @@ -27,7 +27,7 @@ Template - # .cwl or .wdl file category: - - # Annotation + - # Annotation # All the following fields are optional and provided as example, # can be expanded to anything accepted by the schema
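Taken together, this series renames FileReference to ReferenceFile across the schemas, parser classes, deploy logic, and tests, and retargets the documentation at the SMaHT portal. A quick, hedged way to exercise the renamed path against the bundled fixtures, mirroring tests/test_yaml_file_reference.py (assumes a source checkout, run from the repository root):

    from pipeline_utils.lib import yaml_parser

    # parse the ReferenceFile fixtures and build portal JSON, as the tests do
    for d in yaml_parser.load_yaml(
            'tests/repo_correct/portal_objects/file_reference.yaml'):
        json_ = yaml_parser.YAMLReferenceFile(d).to_json(
            submission_centers=['hms-dbmi'],
            consortia=['cgap-core'],
        )
        # aliases are now minted under the ReferenceFile item type
        assert json_['aliases'][0].startswith('cgap-core:ReferenceFile-')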