diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 59dec77..2dd71be 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,4 +1,4 @@ -# portal-pipeline-utils GA Workflow +# smaht-pipeline-utils GA Workflow name: CI @@ -17,7 +17,7 @@ on: jobs: # This workflow contains a single job called "test" test: - name: Test portal-pipeline-utils with Python ${{ matrix.python_version }} + name: Test smaht-pipeline-utils with Python ${{ matrix.python_version }} # The type of runner that the job will run on runs-on: ubuntu-22.04 @@ -33,7 +33,7 @@ jobs: with: python-version: ${{ matrix.python_version }} - - name: Test portal-pipeline-utils + - name: Test smaht-pipeline-utils run: | make configure make build diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f46ecfc..531261c 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,8 +4,6 @@ Change Log ========== -3.0.0 +0.0.1 ===== -* 2023-10-10 -* Added this CHANGELOG.rst file. -* Upgrade to Python 3.11. +* Initial release diff --git a/LOG.md b/LOG.md deleted file mode 100644 index fbaf79c..0000000 --- a/LOG.md +++ /dev/null @@ -1,8 +0,0 @@ -### Version Updates - -#### v2.1.0 - * Added support for updated QCs, to enable the new generic schema ``quality_metric_generic`` - - -#### v2.0.0 - * Initial release after major changes to support the new YAML format for portal objects diff --git a/README.md b/README.md index 52706a2..a2aa59a 100644 --- a/README.md +++ b/README.md @@ -2,25 +2,25 @@ Utilities for deploying pipelines and interfacing with portal infrastructure. -For more information on available commands and how to contribute and deploy pipelines within the infrastructure check the extended [*documentation*](https://portal-pipeline-utils.readthedocs.io/en/latest/ "portal-pipeline-utils documentation"). +For more information on available commands and how to contribute and deploy pipelines within the infrastructure check the extended [*documentation*](https://smaht-pipeline-utils.readthedocs.io/en/latest/ "smaht-pipeline-utils documentation"). ## Install The software is python based. To install the software and the required packages, we recommend using a fresh virtual environment. Please refer to `pyproject.toml` for the supported Python versions. -The package is available on [*pypi*](https://pypi.org/project/portal-pipeline-utils "portal-pipeline-utils pypi"): +The package is available on [*pypi*](https://pypi.org/project/smaht-pipeline-utils "smaht-pipeline-utils pypi"): - pip install portal-pipeline-utils + pip install smaht-pipeline-utils To install from source: - git clone https://github.com/dbmi-bgm/portal-pipeline-utils.git - cd portal-pipeline-utils + git clone https://github.com/smaht-dac/smaht-pipeline-utils.git + cd smaht-pipeline-utils make configure make update make build -To check that the software is correctly installed, try to run `pipeline_utils`. If installed from source, this command may fail with a bash “command not found” error, try `poetry run pipeline_utils` instead. +To check that the software is correctly installed, try to run `smaht_pipeline_utils`. If installed from source, this command may fail with a bash “command not found” error, try `poetry run smaht_pipeline_utils` instead. See `make info` for details on make targets. 
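A quick way to sanity-check the entry-point rename described in the README hunk above: the console command is now `smaht_pipeline_utils` (previously `pipeline_utils`), and `pipeline_deploy` is its subcommand. This is only an illustrative sketch, assuming a PyPI install or a poetry-managed source checkout; the `--help` text comes from argparse and will vary with the installed version.

```bash
# installed from PyPI
pip install smaht-pipeline-utils
smaht_pipeline_utils --help
smaht_pipeline_utils pipeline_deploy --help

# installed from source, where the entry point may not be on PATH
poetry run smaht_pipeline_utils pipeline_deploy --help
```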
diff --git a/docs/conf.py b/docs/conf.py index 9565327..56edc83 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -17,12 +17,12 @@ # -- Project information ----------------------------------------------------- -project = 'portal-pipeline-utils' -copyright = '2021, HMS DBMI' -author = 'Michele Berselli, CGAP & SMaHT Team' +project = 'smaht-pipeline-utils' +copyright = '2023, HMS DBMI' +author = 'Michele Berselli, SMaHT Team' # The full version, including alpha/beta/rc tags -release = '2.1.0' +release = '0.0.1' # -- General configuration --------------------------------------------------- diff --git a/docs/contribute_pipeline.rst b/docs/contribute_pipeline.rst index ee8fc0b..afdcabf 100644 --- a/docs/contribute_pipeline.rst +++ b/docs/contribute_pipeline.rst @@ -177,4 +177,4 @@ Example Examples -------- -Real examples of implemented pipeline modules can be found linked as submodules in our main pipeline repository for the CGAP project here: https://github.com/dbmi-bgm/cgap-pipeline-main. +Real examples of implemented pipeline modules can be found linked as submodules in our main pipeline repository for the SMaHT project here: https://github.com/smaht-dac/main-pipelines. diff --git a/docs/deploy_pipeline.rst b/docs/deploy_pipeline.rst index 217fc2c..fbb1ef3 100644 --- a/docs/deploy_pipeline.rst +++ b/docs/deploy_pipeline.rst @@ -55,37 +55,37 @@ by running a test command: More information on how to setup Docker can be found in the `AWS Documentation `_. -We now need to install the ``pipeline_utils`` software to deploy the pipeline components. +We now need to install the ``smaht_pipeline_utils`` software to deploy the pipeline components. -Install pipeline_utils -====================== +Install smaht_pipeline_utils +============================ The software is Python-based. To install the software and the required packages, we recommend using a fresh virtual environment. -Please refer to `pyproject.toml `_ for the supported Python version. +Please refer to `pyproject.toml `_ for the supported Python version. We recommend using pyenv to manage virtual environments. Instructions for installing and using pyenv can be found `here `_. -Once the virtual environment is set up and activated, we can proceed to :ref:`install ` portal-pipeline-utils software. +Once the virtual environment is set up and activated, we can proceed to :ref:`install ` smaht-pipeline-utils software. .. code-block:: bash # Install from source - git clone https://github.com/dbmi-bgm/portal-pipeline-utils.git - cd portal-pipeline-utils + git clone https://github.com/smaht-dac/smaht-pipeline-utils.git + cd smaht-pipeline-utils make configure make update make build cd .. # Install from pypi - pip install portal-pipeline-utils + pip install smaht-pipeline-utils -To check that the software is correctly installed, try to run ``pipeline_utils``. -If installed from source, this command may fail with a bash "command not found" error, try ``poetry run pipeline_utils`` instead. +To check that the software is correctly installed, try to run ``smaht_pipeline_utils``. +If installed from source, this command may fail with a bash "command not found" error, try ``poetry run smaht_pipeline_utils`` instead. Set Up Credentials and Environmental Variables ============================================== @@ -142,7 +142,7 @@ Example of a key-pair entry: } } -```` is the namespace for the environment and can be found in the portal health page (e.g., cgap-wolf). 
+```` is the namespace for the environment and can be found in the portal health page (e.g., smaht-wolf). .. _account_vars: @@ -154,21 +154,21 @@ Finally we need to setup the information to identify the target environment to u .. code-block:: bash # Set the namespace of the target environment - # e.g., cgap-wolf + # e.g., smaht-wolf export ENV_NAME= # Set the bucket used to store the worklow description files - # e.g., cgap-biotest-main-application-tibanna-cwls + # e.g., smaht-wolf-application-tibanna-cwls export WFL_BUCKET= # Set the path to the keypair file with the portal credential export KEYDICTS_JSON=~/.cgap-keys.json - # Set up project and institution - # Project and institution need to correspond to metadata present on the portal - # e.g., cgap-core and hms-dbmi - export PROJECT= - export INSTITUTION= + # Set up consortia and submission centers + # consortia and submission_centers need to correspond to metadata present on the portal + # e.g., ['smaht'] and ['smaht_dac'] + export CONSORTIA= + export SUBMISSION_CENTERS= # If running sentieon code, # specify the address for the server that validate the software license @@ -188,14 +188,14 @@ by the ``--repos`` argument. .. code-block:: bash - pipeline_utils pipeline_deploy \ + smaht_pipeline_utils pipeline_deploy \ --ff-env ${ENV_NAME} \ --keydicts-json ${KEYDICTS_JSON} \ --wfl-bucket ${WFL_BUCKET} \ --account ${AWS_ACCOUNT_NUMBER} \ --region ${TIBANNA_AWS_REGION} \ - --project ${PROJECT} \ - --institution ${INSTITUTION} \ + --consortia ${CONSORTIA} \ + --submission-centers ${SUBMISSION_CENTERS} \ --sentieon-server ${SENTIEON_LICENSE} \ --post-software \ --post-file-format \ @@ -214,10 +214,10 @@ The default is set to the ``main`` branch. The ``--local-build`` flag will preve *Note: we are working to enable more builders with a command line argument for which builder to use to deploy modules from different repositories through AWS CodeBuild.* -Deploying CGAP Pipelines +Deploying SMaHT Pipelines ======================== -CGAP pipelines are released as a complete package with a customized set up for automated deployment to the desired environment. +SMaHT pipelines are released as a complete package with a customized set up for automated deployment to the desired environment. To deploy the pipelines run the following steps: 1. Clone the main pipeline repository. @@ -225,7 +225,7 @@ The submodules will be empty and set to the current commits saved for the main b .. code-block:: bash - git clone https://github.com/dbmi-bgm/cgap-pipeline-main.git + git clone https://github.com/smaht-dac/main-pipelines.git 2. Check out the desired version. This will set the submodules to the commits saved for that pipeline release. @@ -241,7 +241,7 @@ The submodules will be set in detached state on their current commit. make pull -4. Build pipeline_utils (optional). +4. Build smaht_pipeline_utils (optional). This will build from source the latest version linked for the current release. .. code-block:: bash diff --git a/docs/functions.rst b/docs/functions.rst index 49378d8..078930f 100644 --- a/docs/functions.rst +++ b/docs/functions.rst @@ -2,49 +2,4 @@ Functions ========= -Collection of utilities available as functions: - - - :ref:`check_lines ` - -.. _check_lines: - -check_lines -+++++++++++ - -*check_lines* function can be used to check that line counts are matching between the output of two steps where lines should not be dropped (i.e., any steps that modify without filtering), or between an output ``bam`` and the input ``fastq`` files. 
-Requires uuid for the *MetaWorkflowRun* object to check and ff_key to access the metadata on the portal. The steps to compare are specified as dictionaries, examples below. - -.. code-block:: python - - from pipeline_utils.lib import check_lines - - result = check_lines.check_lines(metawfr_uuid, ff_key, steps=steps_dict, fastqs=fastqs_dict) - - # metawfr_uuid - # -> uuid for MetaWorkflowRun object - - # ff_key - # -> key to authenticate on the portal - - ## steps_dict example - # steps_dict = { - # 'workflow_add-readgroups-check': { - # 'dependency': 'workflow_bwa-mem_no_unzip-check', - # 'output': 'bam_w_readgroups', - # 'output_match': 'raw_bam', - # 'key': 'Total Reads', - # 'key_match': 'Total Reads' - # }, - # ... - # } - - ## fastqs_dict example - # fastqs_dict = { - # 'workflow_bwa-mem_no_unzip-check': { - # 'output': 'raw_bam', - # 'input_match': ['fastq_R1', 'fastq_R2'], - # 'key': 'Total Reads', - # 'key_match': 'Total Sequences' - # }, - # ... - # } +In development. diff --git a/docs/index.rst b/docs/index.rst index a6c307a..2c47eed 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,10 +1,10 @@ -========================= -Portal Pipeline Utilities -========================= +=============================== +SMaHT Portal Pipeline Utilities +=============================== -Documentation for portal-pipeline-utils_, a collection of utilities for deploying pipelines and interfacing with portal infrastructure. +Documentation for smaht-pipeline-utils_, a collection of utilities for deploying pipelines and interfacing with SMaHT portal infrastructure. -.. _portal-pipeline-utils: https://github.com/dbmi-bgm/portal-pipeline-utils +.. _smaht-pipeline-utils: https://github.com/smaht-dac/smaht-pipeline-utils Contents ######## diff --git a/docs/install.rst b/docs/install.rst index f057538..0fbf5e3 100644 --- a/docs/install.rst +++ b/docs/install.rst @@ -9,11 +9,11 @@ PyPI The package is available on pypi_: -.. _pypi: https://pypi.org/project/portal-pipeline-utils +.. _pypi: https://pypi.org/project/smaht-pipeline-utils .. code-block:: bash - pip install portal-pipeline-utils + pip install smaht-pipeline-utils Source ^^^^^^ @@ -23,10 +23,10 @@ To install the latest version from source: .. code-block:: bash - git clone https://github.com/dbmi-bgm/portal-pipeline-utils.git - cd portal-pipeline-utils + git clone https://github.com/smaht-dac/smaht-pipeline-utils.git + cd smaht-pipeline-utils make configure make update make build -Please refer to `pyproject.toml `_ for the supported Python version. +Please refer to `pyproject.toml `_ for the supported Python version. diff --git a/docs/pipeline_utils.rst b/docs/pipeline_utils.rst index 979d921..f087975 100644 --- a/docs/pipeline_utils.rst +++ b/docs/pipeline_utils.rst @@ -1,6 +1,6 @@ -============== -pipeline_utils -============== +==================== +smaht_pipeline_utils +==================== This is the entry point for a collection of utilities available as commands: @@ -10,7 +10,7 @@ Usage: .. code-block:: bash - pipeline_utils [COMMAND] [ARGS] + smaht_pipeline_utils [COMMAND] [ARGS] .. _pipeline_deploy: @@ -24,7 +24,7 @@ Usage: .. code-block:: bash - pipeline_utils pipeline_deploy --ff-env FF_ENV --repos REPO [REPO ...] [OPTIONAL ARGS] + smaht_pipeline_utils pipeline_deploy --ff-env FF_ENV --repos REPO [REPO ...] 
[OPTIONAL ARGS] **Arguments:** @@ -61,10 +61,10 @@ Usage: - AWS account to use for deployment * - *-\-region* - AWS account region to use for deployment - * - *-\-project* - - Project to use for deployment [cgap-core] - * - *-\-institution* - - Institution to use for deployment [hms-dbmi] + * - *-\-consortia* + - List of consortia to use for deployment [smaht] + * - *-\-submission-centers* + - List of centers to use for deployment [smaht_dac] * - *-\-post-software* - DEPLOY | UPDATE Software objects (.yaml or .yml) * - *-\-post-file-format* diff --git a/docs/repo.rst b/docs/repo.rst index 5a505be..dbc914f 100644 --- a/docs/repo.rst +++ b/docs/repo.rst @@ -56,4 +56,4 @@ Example ``foo_bar`` pipeline: ├── PIPELINE └── VERSION -Real examples can be found linked as submodules in our pipelines repository for CGAP project here: https://github.com/dbmi-bgm/cgap-pipeline-main. +Real examples can be found linked as submodules in our pipelines repository for SMaHT project here: https://github.com/smaht-dac/main-pipelines. diff --git a/docs/yaml_file_format.rst b/docs/yaml_file_format.rst index 44aaa8a..531fe2d 100644 --- a/docs/yaml_file_format.rst +++ b/docs/yaml_file_format.rst @@ -22,12 +22,9 @@ Template # All the following fields are optional and provided as example, # can be expanded to anything accepted by the schema - # https://github.com/dbmi-bgm/cgap-portal/tree/master/src/encoded/schemas + # https://github.com/smaht-dac/smaht-portal/tree/main/src/encoded/schemas secondary_formats: - # bam, fastq, bwt, ... - file_types: - - # FileReference, FileProcessed, FileSubmitted - status: # shared Fields Definition @@ -51,15 +48,9 @@ Description of the file format. Optional ^^^^^^^^ -All the following fields are optional and provided as example. Can be expanded to anything accepted by the schema, see `schemas `__. +All the following fields are optional and provided as example. Can be expanded to anything accepted by the schema, see `schemas `__. secondary_formats ----------------- List of secondary ```` available for the file format. Each ```` needs to match a file format that has been previously defined. - -file_types ----------- -File types that can use the file format. -List of ````. The possible values are ``FileReference``, ``FileProcessed`` and ``FileSubmitted``. -Default value if not specified is ``FileReference`` and ``FileProcessed``. diff --git a/docs/yaml_file_reference.rst b/docs/yaml_file_reference.rst index e30dd5c..8ea8123 100644 --- a/docs/yaml_file_reference.rst +++ b/docs/yaml_file_reference.rst @@ -21,9 +21,14 @@ Template format: # bam, fastq, bwt, ... version: + category: + - # Reference Genome, ... + type: + - # Reference Sequence, ... + # All the following fields are optional and provided as example, # can be expanded to anything accepted by the schema - # https://github.com/dbmi-bgm/cgap-portal/tree/master/src/encoded/schemas + # https://github.com/smaht-dac/smaht-portal/tree/main/src/encoded/schemas secondary_files: - # bam, fastq, bwt, ... status: # uploading, uploaded @@ -60,7 +65,7 @@ Version of the reference file. Optional ^^^^^^^^ -All the following fields are optional and provided as example. Can be expanded to anything accepted by the schema, see `schemas `__. +All the following fields are optional and provided as example. Can be expanded to anything accepted by the schema, see `schemas `__. secondary_files --------------- @@ -78,3 +83,11 @@ Most likely you don't want to set this field and just use the default logic auto license ------- License information. 
+ +category +-------- +Categories for the reference file, see `schemas `__. + +type +---- +Types for the reference file, see `schemas `__. diff --git a/docs/yaml_metaworkflow.rst b/docs/yaml_metaworkflow.rst index 9909100..e3f2b53 100644 --- a/docs/yaml_metaworkflow.rst +++ b/docs/yaml_metaworkflow.rst @@ -20,10 +20,8 @@ Template name: description: - # All the following fields are optional and provided as example, - # can be expanded to anything accepted by the schema - # https://github.com/dbmi-bgm/cgap-portal/tree/master/src/encoded/schemas - proband_only: + category: + - # Alignment, ... ## General arguments ######################################## # Pipeline input, reference files, and general arguments @@ -42,7 +40,7 @@ Template # Parameter argument : - argument_type: parameter. # string, integer, float, json, boolean + argument_type: parameter. # string, integer, float, array, object, boolean # All the following fields are optional and provided as example, # can be expanded to anything accepted by the schema value: <...> @@ -66,7 +64,7 @@ Template # Allows to force a fixed shards structure ignoring # the input structure, scatter and gather dimensions #################################### - shards: [[], ..] # e.g., [['0'], ['1'], ['2']] + shards: [[], ..] # e.g., [['0'], ['1'], ['2']] ## Lock version #################### # Specific version to use @@ -86,7 +84,7 @@ Template # File argument : - argument_type: file. # bam, fastq, bwt ... + argument_type: file. # bam, fastq, bwt ... # Linking fields # These are optional fields # Check https://magma-suite.readthedocs.io/en/latest/meta-workflow.html @@ -126,15 +124,15 @@ Template # File output : - file_type: + data_category: + - + data_type: + - # All the following fields are optional and provided as example, # can be expanded to anything accepted by the schema description: - linkto_location: - - # Sample, SampleProcessing - higlass_file: - variant_type: # SNV, SV, CNV - vcf_to_ingest: + variant_types: + - s3_lifecycle_category: # short_term_access_long_term_archive, # short_term_access, short_term_archive, # long_term_access_long_term_archive, @@ -162,6 +160,10 @@ description ----------- Description of the pipeline. +category +-------- +Categories for the pipeline, see `schemas `__. + input ----- Description of general input files and parameters for the pipeline. See :ref:`Input Definition `. @@ -172,11 +174,7 @@ Description of workflows that are steps of the pipeline. See :ref:`Workflows Def Optional ^^^^^^^^ -All the following fields are optional and provided as example. Can be expanded to anything accepted by the schema, see `schemas `__. - -title ------ -Title of the pipeline. +All the following fields are optional and provided as example. Can be expanded to anything accepted by the schema, see `schemas `__. .. _workflows: @@ -215,7 +213,7 @@ output Description of expected output files for the workflow. Each output is defined by its name. Additional subfields can be specified. -See `schemas `__. +See `schemas `__. Each output name needs to match an output name that has been previously defined in the corresponding workflow, see :ref:`Workflow `. @@ -239,7 +237,7 @@ Definition of the type of the argument. For a **file** argument, the argument type is defined as ``file.``, where ```` is the format used by the file. ```` needs to match a file format that has been previously defined, see :ref:`File Format `. 
-For a **parameter** argument, the argument type is defined as ``parameter.``, where ```` is the type of the value expected for the argument [string, integer, float, json, boolean]. +For a **parameter** argument, the argument type is defined as ``parameter.``, where ```` is the type of the value expected for the argument [string, integer, float, array, boolean, object]. files ^^^^^ @@ -253,24 +251,21 @@ value ^^^^^ This field can be used to assign a specific value to a **parameter** argument. -*Note*: As of now, the value needs to be always encoded as ````. -We are working to improve this and enable usage of real types. - Example .. code-block:: yaml a_float: argument_type: parameter.float - value: "0.8" + value: 0.8 an_integer: argument_type: parameter.integer - value: "1" + value: 1 a_string_array: - argument_type: parameter.json - value: "[\"DEL\", \"DUP\"]" + argument_type: parameter.array + value: ["DEL", "DUP"] Linking Fields ^^^^^^^^^^^^^^ diff --git a/docs/yaml_software.rst b/docs/yaml_software.rst index ccfa378..058be6b 100644 --- a/docs/yaml_software.rst +++ b/docs/yaml_software.rst @@ -22,10 +22,12 @@ Template version: commit: + category: + - # Quality Control + # All the following fields are optional and provided as example, # can be expanded to anything accepted by the schema - # https://github.com/dbmi-bgm/cgap-portal/tree/master/src/encoded/schemas - title: + # https://github.com/smaht-dac/smaht-portal/tree/main/src/encoded/schemas source_url: description: license: # MIT, GPLv3, ... @@ -51,13 +53,13 @@ commit ------ Commit of the software. +category +-------- +Categories for the software, see `schemas `__. + Optional ^^^^^^^^ -All the following fields are optional and provided as example. Can be expanded to anything accepted by the schema, see `schemas `__. - -title ------ -Title for the software. +All the following fields are optional and provided as example. Can be expanded to anything accepted by the schema, see `schemas `__. source_url ---------- diff --git a/docs/yaml_workflow.rst b/docs/yaml_workflow.rst index 124eb2c..53b23a2 100644 --- a/docs/yaml_workflow.rst +++ b/docs/yaml_workflow.rst @@ -26,11 +26,12 @@ Template child: - # .cwl or .wdl file + category: + - # Annotation + # All the following fields are optional and provided as example, # can be expanded to anything accepted by the schema - # https://github.com/dbmi-bgm/cgap-portal/tree/master/src/encoded/schemas - title: - + # https://github.com/smaht-dac/smaht-portal/tree/main/src/encoded/schemas software: - @ @@ -45,7 +46,7 @@ Template # Parameter argument : - argument_type: parameter. # string, integer, float, json, boolean + argument_type: parameter. # string, integer, float, array, boolean, object ## Output information ####################################### # Output files and quality controls @@ -60,31 +61,16 @@ Template # QC output : - argument_type: qc. # qc_type, e.g. 
quality_metric_vcfcheck - # none can be used as - # if a qc_type is not defined - # quality_metric_generic can be used as - # to use the general qc_type instead of a custom one + argument_type: qc argument_to_be_attached_to: - # All the following fields are optional and provided as example, - # can be expanded to anything accepted by the schema - html: + # Fields to specify the output type + # either json or zipped folder json: - table: zipped: - # If the output is a zipped folder with multiple QC files, - # fields to define the target files inside the folder - html_in_zipped: - tables_in_zipped: - - - # Fields still used by tibanna that needs refactoring - # listing them as they are - qc_acl: # e.g. private - qc_unzip_from_ec2: # Report output : - argument_type: report. # report_type, e.g. file + argument_type: report General Fields Definition @@ -114,6 +100,10 @@ Several subfields need to be specified: At the moment we support two standards, `Common Workflow Language `__ (CWL) and `Workflow Description Language `__ (WDL). +category +-------- +Categories for the workflow, see `schemas `__. + input ----- Description of input files and parameters for the workflow. See :ref:`Input Definition `. @@ -124,11 +114,7 @@ Description of expected outputs for the workflow. See :ref:`Output Definition `__. - -title ------ -Title of the workflow. +All the following fields are optional and provided as example. Can be expanded to anything accepted by the schema, see `schemas `__. software -------- @@ -150,7 +136,7 @@ Definition of the type of the argument. For a **file** argument, the argument type is defined as ``file.``, where ```` is the format used by the file. ```` needs to match a file format that has been previously defined, see :ref:`File Format `. -For a **parameter** argument, the argument type is defined as ``parameter.``, where ```` is the type of the value expected for the argument [string, integer, float, json, boolean]. +For a **parameter** argument, the argument type is defined as ``parameter.``, where ```` is the type of the value expected for the argument [string, integer, float, array, boolean, object]. .. _output_a: @@ -166,20 +152,18 @@ Definition of the type of the output. For a **file** output, the argument type is defined as ``file.``, where ```` is the format used by the file. ```` needs to match a file format that has been previously defined, see :ref:`File Format `. -For a **report** output, the argument type is defined as ``report.``, where ```` is the type of the report (e.g., file). +For a **report** output, the argument type is defined as ``report``. -For a **QC** (Quality Control) output, the argument type is defined as ``qc.``, where ```` is a ``qc_type`` defined in the schema, see `schemas `__. -While custom ``qc_type`` schemas are still supported for compatibility, we introduced a new generic type ``quality_metric_generic``. -We recommend to use this new type to implement QCs. +For a **QC** (Quality Control) output, the argument type is defined as ``qc``. -When using ``quality_metric_generic`` as a ``qc_type``, it is possible to generate two different types of output: a key-value pairs JSON file and a compressed file. +For a QC, it is possible to generate two different types of output: a key-value pairs JSON file and a compressed file. The JSON file can be used to create a summary report of the quality metrics generated by the QC process. The compressed file can be used to store the original output for the QC, including additional data or graphs. 
-Both the JSON file and compressed file will be attached to the file specified as target by ``argument_to_be_attached_to`` with a ``QualityMetricGeneric`` object. +Both the JSON file and compressed file will be attached to the file specified as target by ``argument_to_be_attached_to`` with a ``QualityMetric`` object. The content of the JSON file will be patched directly on the object, while the compressed file will be made available for download via a link. The output type can be specified by setting ``json: True`` or ``zipped: True`` in the the QC output definition. -Template for ``quality_metric_generic``: +Template for key-value pairs JSON: .. code-block:: python diff --git a/pipeline_utils/__main__.py b/pipeline_utils/__main__.py index d1eff02..c2a8e9a 100644 --- a/pipeline_utils/__main__.py +++ b/pipeline_utils/__main__.py @@ -20,8 +20,8 @@ # Variables PIPELINE_DEPLOY = 'pipeline_deploy' -CGAP_ALIAS = 'cgap-core' -DBMI_ALIAS = 'hms-dbmi' +CONSORTIA_ALIAS = ['smaht'] +SUBMISSION_CENTERS_ALIAS = ['smaht_dac'] KEYS_ALIAS = '~/.cgap-keys.json' MAIN_ALIAS = 'main' BUILDER_ALIAS = '-pipeline-builder' @@ -32,7 +32,7 @@ def main(args=None): '''Command line wrapper around available commands. ''' # Adding parser and subparsers - parser = argparse.ArgumentParser(prog='pipeline_utils', description='Collection of utilities for deploying pipelines and interfacing with portal infrastructure') + parser = argparse.ArgumentParser(prog='smaht_pipeline_utils', description='Collection of utilities for deploying pipelines and interfacing with portal infrastructure') subparsers = parser.add_subparsers(dest='func', metavar="") # Add pipeline_deploy to subparsers @@ -50,14 +50,14 @@ def main(args=None): pipeline_deploy_parser.add_argument('--wfl-bucket', required=False, help='Bucket to use for upload of Workflow Description files') pipeline_deploy_parser.add_argument('--account', required=False, help='AWS account to use for deployment') pipeline_deploy_parser.add_argument('--region', required=False, help='AWS account region to use for deployment') - pipeline_deploy_parser.add_argument('--project', required=False, help=f'Project to use for deployment [{CGAP_ALIAS}]', - default=CGAP_ALIAS) - pipeline_deploy_parser.add_argument('--institution', required=False, help=f'Institution to use for deployment [{DBMI_ALIAS}]', - default=DBMI_ALIAS) + pipeline_deploy_parser.add_argument('--consortia', required=False, nargs='+', help='List of consortia to use for deployment', + default=CONSORTIA_ALIAS) + pipeline_deploy_parser.add_argument('--submission-centers', required=False, nargs='+', help='List of centers to use for deployment', + default=SUBMISSION_CENTERS_ALIAS) pipeline_deploy_parser.add_argument('--post-software', action='store_true', help='POST|PATCH Software objects') pipeline_deploy_parser.add_argument('--post-file-format', action='store_true', help='POST|PATCH FileFormat objects') - pipeline_deploy_parser.add_argument('--post-file-reference', action='store_true', help='POST|PATCH FileReference objects') + pipeline_deploy_parser.add_argument('--post-file-reference', action='store_true', help='POST|PATCH ReferenceFile objects') pipeline_deploy_parser.add_argument('--post-workflow', action='store_true', help='POST|PATCH Workflow objects') pipeline_deploy_parser.add_argument('--post-metaworkflow', action='store_true', help='POST|PATCH MetaWorkflow objects') pipeline_deploy_parser.add_argument('--post-wfl', action='store_true', help='Upload Workflow Description files (.cwl, .wdl)') diff --git 
a/pipeline_utils/lib/check_lines.py b/pipeline_utils/lib/check_lines.py deleted file mode 100644 index 7d26d2e..0000000 --- a/pipeline_utils/lib/check_lines.py +++ /dev/null @@ -1,247 +0,0 @@ -#!/usr/bin/env python3 - -################################################ -# -# check_lines -# implement a function to check line counts -# match between specified pipeline steps output -# -# Michele Berselli -# berselli.michele@gmail.com -# -################################################ - -################################################ -# Libraries -################################################ -import sys, os -from magma_ff.metawflrun import MetaWorkflowRun -from magma_ff import wfrutils -from dcicutils import ff_utils - -################################################ -# Variables -################################################ -steps_dict = { - # BAM - 'workflow_add-readgroups-check': { - 'dependency': 'workflow_bwa-mem_no_unzip-check', - 'output': 'bam_w_readgroups', - 'output_match': 'raw_bam', - 'key': 'Total Reads', - 'key_match': 'Total Reads' - }, - 'workflow_merge-bam-check': { - 'dependency': 'workflow_add-readgroups-check', - 'output': 'merged_bam', - 'output_match': 'bam_w_readgroups', - 'key': 'Total Reads', - 'key_match': 'Total Reads' - }, - 'workflow_picard-MarkDuplicates-check':{ - 'dependency': 'workflow_merge-bam-check', - 'output': 'dupmarked_bam', - 'output_match': 'merged_bam', - 'key': 'Total Reads', - 'key_match': 'Total Reads' - }, - 'workflow_sort-bam-check': { - 'dependency': 'workflow_picard-MarkDuplicates-check', - 'output': 'sorted_bam', - 'output_match': 'dupmarked_bam', - 'key': 'Total Reads', - 'key_match': 'Total Reads' - }, - 'workflow_gatk-ApplyBQSR-check': { - 'dependency': 'workflow_sort-bam-check', - 'output': 'recalibrated_bam', - 'output_match': 'sorted_bam', - 'key': 'Total Reads', - 'key_match': 'Total Reads' - }, - # VCF - 'workflow_samplegeno': { - 'dependency': 'workflow_gatk-GenotypeGVCFs-check', - 'output': 'samplegeno_vcf', - 'output_match': 'vcf', - 'key': 'Filtered Variants', - 'key_match': 'Filtered Variants' - }, - # 'workflow_vep-annot-check': { - # 'dependency': 'workflow_samplegeno', - # 'output': 'annotated_vcf', - # 'output_match': 'samplegeno_vcf', - # 'key': 'Total Variants Called', - # 'key_match': 'Filtered Variants' - # }, - 'workflow_granite-comHet-check': { - 'dependency': 'workflow_granite-filtering-check', - 'output': 'comHet_vcf', - 'output_match': 'merged_vcf', - 'key': 'Filtered Variants', - 'key_match': 'Filtered Variants' - }, - 'workflow_dbSNP_ID_fixer-check': { - 'dependency': 'workflow_granite-comHet-check', - 'output': 'vcf', - 'output_match': 'comHet_vcf', - 'key': 'Filtered Variants', - 'key_match': 'Filtered Variants' - }, - 'workflow_hg19lo_hgvsg-check': { - 'dependency': 'workflow_dbSNP_ID_fixer-check', - 'output': 'vcf', - 'output_match': 'vcf', - 'key': 'Filtered Variants', - 'key_match': 'Filtered Variants' - } - } - -fastqs_dict = { - 'workflow_bwa-mem_no_unzip-check': { - 'output': 'raw_bam', - 'input_match': ['fastq_R1', 'fastq_R2'], - 'key': 'Total Reads', - 'key_match': 'Total Sequences' - } -} - -################################################ -# Functions -################################################ -################################################ -# check_lines -################################################ -def check_lines(metawfr_uuid, ff_key, steps=steps_dict, fastqs=fastqs_dict): - """ - """ - print('Meta Workflow:') - print(' -> ' + metawfr_uuid + '\n') - - # Get meta-workflow-run 
and create MetaWorkflowRun object - run_json = ff_utils.get_metadata(metawfr_uuid, add_on='?frame=raw&datastore=database', key=ff_key) - metawflrun_obj = MetaWorkflowRun(run_json) - - is_match = True - # Check fastqs - for _, run_obj in metawflrun_obj.runs.items(): - count, match_count = 0, 0 - if run_obj.name in fastqs: - if run_obj.status == 'completed': - # Get output count - for output in run_obj.output: - if output['argument_name'] == fastqs[run_obj.name]['output']: - output_uuid = output['file'] - qc_key = fastqs[run_obj.name]['key'] - count = int(get_count_qc(qc_key, output_uuid, ff_key)) - break - #end if - #end for - print('Shard:') - print(' -> ' + run_obj.shard_name + ', ' + str(count)) - - # Get input file to match from jobid - print('File/s to match:') - ffwr_obj = wfrutils.FFWfrUtils(env='env') - ffwr_obj._ff_key = ff_key - file_match = True - for file in ffwr_obj.wfr_metadata(run_obj.jobid)['input_files']: - if file['workflow_argument_name'] in fastqs[run_obj.name]['input_match']: - input_uuid = file['value']['uuid'] - qc_key = fastqs[run_obj.name]['key_match'] - match_count = int(get_count_fastqc(qc_key, input_uuid, ff_key)) - if not count == match_count: - is_match = False - file_match = False - #end if - print(' -> ' + file['workflow_argument_name'] + ', ' + str(match_count)) - #end if - #end for - print('Matching: ' + str(file_match) + '\n') - else: - print('Missing: ' + run_obj.name + '\n') - print('Completed: False\n') - return False - #end if - #end if - #end for - - # Check steps - for _, run_obj in metawflrun_obj.runs.items(): - count, total_count = 0, 0 - if run_obj.name in steps: - if run_obj.status == 'completed': - # Get output count - for output in run_obj.output: - if output['argument_name'] == steps[run_obj.name]['output']: - output_uuid = output['file'] - qc_key = steps[run_obj.name]['key'] - count = int(get_count_qc(qc_key, output_uuid, ff_key)) - break - #end if - #end for - print('Shard:') - print(' -> ' + run_obj.shard_name + ', ' + str(count)) - - # Get dependencies count - print('Shard/s to match (sum):') - for shard_name in run_obj.dependencies: - if shard_name.split(':')[0] == steps[run_obj.name]['dependency']: - run_obj_ = metawflrun_obj.runs[shard_name] - for output in run_obj_.output: - if output['argument_name'] == steps[run_obj.name]['output_match']: - output_uuid = output['file'] - qc_key = steps[run_obj.name]['key_match'] - count_ = int(get_count_qc(qc_key, output_uuid, ff_key)) - total_count += count_ - break - #end if - #end for - print(' -> ' + shard_name + ', ' + str(count_)) - #end if - #end for - print('Matching: ' + str(count == total_count) + '\n') - # Check counts match - if not count == total_count: - is_match = False - #end if - else: - print('Missing: ' + run_obj.name + '\n') - print('Completed: False\n') - return False - #end if - #end if - #end for - print('Completed: ' + str(is_match) + '\n') - return is_match -#end def - -def get_count_qc(qc_key, uuid, ff_key): - """ - """ - try: - res_json = ff_utils.get_metadata(uuid, add_on='?frame=raw&datastore=database', key=ff_key) - qc_uuid = res_json['quality_metric'] - qc_json = ff_utils.get_metadata(qc_uuid, add_on='?datastore=database', key=ff_key) - for qc in qc_json['quality_metric_summary']: - if qc['title'] == qc_key: - return qc['value'] - #end if - #end for - except KeyError: - return 0 - #end try -#end def - -def get_count_fastqc(qc_key, uuid, ff_key): - """ - """ - try: - res_json = ff_utils.get_metadata(uuid, add_on='?frame=raw&datastore=database', key=ff_key) - qc_uuid = 
res_json['quality_metric'] - qc_json = ff_utils.get_metadata(qc_uuid, add_on='?datastore=database', key=ff_key) - return qc_json[qc_key] - except KeyError: - return 0 - #end try -#end def diff --git a/pipeline_utils/lib/yaml_parser.py b/pipeline_utils/lib/yaml_parser.py index 49f796e..0f8ffca 100644 --- a/pipeline_utils/lib/yaml_parser.py +++ b/pipeline_utils/lib/yaml_parser.py @@ -21,7 +21,7 @@ from pipeline_utils.schemas.yaml_workflow import yaml_workflow_schema from pipeline_utils.schemas.yaml_metaworkflow import yaml_metaworkflow_schema from pipeline_utils.schemas.yaml_software import yaml_software_schema -from pipeline_utils.schemas.yaml_file_reference import yaml_file_reference_schema +from pipeline_utils.schemas.yaml_reference_file import yaml_reference_file_schema from pipeline_utils.schemas.yaml_file_format import yaml_file_format_schema @@ -91,9 +91,10 @@ class YAMLTemplate(object): NAME_SCHEMA = 'name' TITLE_SCHEMA = 'title' DESCRIPTION_SCHEMA = 'description' + CATEGORY_SCHEMA = 'category' ALIASES_SCHEMA = 'aliases' - PROJECT_SCHEMA = 'project' - INSTITUTION_SCHEMA = 'institution' + CONSORTIA_SCHEMA = 'consortia' + SUBMISSION_CENTERS_SCHEMA = 'submission_centers' VERSION_SCHEMA = 'version' ACCESSION_SCHEMA = 'accession' UUID_SCHEMA = 'uuid' @@ -101,6 +102,7 @@ class YAMLTemplate(object): ARGUMENT_FORMAT_SCHEMA = 'argument_format' ARGUMENT_NAME_SCHEMA = 'argument_name' VALUE_TYPE_SCHEMA = 'value_type' + VALUE_SCHEMA = 'value' WORKFLOW_ARGUMENT_NAME_SCHEMA = 'workflow_argument_name' INPUT_SCHEMA = 'input' STATUS_SCHEMA = 'status' @@ -108,8 +110,6 @@ class YAMLTemplate(object): SECONDARY_FORMATS_SCHEMA = 'secondary_formats' FILE_FORMAT_SCHEMA = 'file_format' SECONDARY_FILE_FORMATS_SCHEMA = 'secondary_file_formats' - INSTITUTIONS_SCHEMA = 'institutions' - PROJECTS_SCHEMA = 'projects' FILE_SCHEMA = 'file' FILES_SCHEMA = 'files' PARAMETER_SCHEMA = 'parameter' @@ -117,8 +117,9 @@ class YAMLTemplate(object): WORKFLOW_TYPE_SCHEMA = 'Workflow' METAWORKFLOW_TYPE_SCHEMA = 'MetaWorkflow' FILEFORMAT_TYPE_SCHEMA = 'FileFormat' - FILEREFERENCE_TYPE_SCHEMA = 'FileReference' + REFERENCEFILE_TYPE_SCHEMA = 'ReferenceFile' SOFTWARE_TYPE_SCHEMA = 'Software' + VARIANT_TYPE_SCHEMA = "variant_type" def __init__(self, data, schema): """Constructor method. @@ -156,16 +157,10 @@ def _link_title(self, name, version): else: return f'{name.replace("_", " ")} [{version}]' - def _link_institution(self, institution): - """Helper to create an "institution" field. + def _string_consortia(self, consortia): + """Helper to create a string from "consortia" field. """ - return f'/{self.INSTITUTIONS_SCHEMA}/{institution}/' - - def _link_project(self, project): - """Helper to create a "project" field. 
- """ - return f'/{self.PROJECTS_SCHEMA}/{project}/' - + return '_'.join(sorted(consortia)) ############################################################### # YAMLWorkflow, YAML Workflow @@ -177,7 +172,6 @@ class YAMLWorkflow(YAMLTemplate): # schema constants INPUT_FILE_SCHEMA = 'Input file' OUTPUT_PROCESSED_FILE_SCHEMA = 'Output processed file' - OUTPUT_QC_FILE_SCHEMA = 'Output QC file' GENERIC_QC_FILE_SCHEMA = 'Generic QC file' OUTPUT_REPORT_FILE_SCHEMA = 'Output report file' QC_SCHEMA = 'qc' @@ -185,24 +179,11 @@ class YAMLWorkflow(YAMLTemplate): REPORT_SCHEMA = 'report' ARGUMENT_TO_BE_ATTACHED_TO_SCHEMA = 'argument_to_be_attached_to' ZIPPED_SCHEMA = 'zipped' - HTML_SCHEMA = 'html' JSON_SCHEMA = 'json' - TABLE_SCHEMA = 'table' - APP_NAME_SCHEMA = 'app_name' - APP_VERSION_SCHEMA = 'app_version' SOFTWARE_SCHEMA = 'software' ARGUMENTS_SCHEMA = 'arguments' - QC_TYPE_SCHEMA = 'qc_type' QC_ZIPPED_SCHEMA = 'qc_zipped' - QC_HTML_SCHEMA = 'qc_html' QC_JSON_SCHEMA = 'qc_json' - QC_TABLE_SCHEMA = 'qc_table' - QC_ZIPPED_HTML_SCHEMA = 'qc_zipped_html' - QC_ZIPPED_TABLES_SCHEMA = 'qc_zipped_tables' - HTML_IN_ZIPPED_SCHEMA = 'html_in_zipped' - TABLES_IN_ZIPPED_SCHEMA = 'tables_in_zipped' - QC_ACL = 'qc_acl' - QC_UNZIP_FROM_EC2 = 'qc_unzip_from_ec2' def __init__(self, data): """Constructor method. @@ -212,7 +193,7 @@ def __init__(self, data): self._validate() # load attributes for key, val in data.items(): - if key in [self.DESCRIPTION_SCHEMA, self.TITLE_SCHEMA]: + if key in [self.DESCRIPTION_SCHEMA]: val = self._clean_newline(val) setattr(self, key, val) @@ -245,51 +226,42 @@ def _arguments_output(self): """ arguments = [] for name, values in self.output.items(): - type, format = values[self.ARGUMENT_TYPE_SCHEMA].split('.') + # check if it is a file or qc or report argument + # if it is file it has a type and a format + # argument_type: file. 
+ # if it is qc or report only has type + # argument_type: qc | report + try: + type, format = values[self.ARGUMENT_TYPE_SCHEMA].split('.') + except ValueError: + type = values[self.ARGUMENT_TYPE_SCHEMA] + # create right argument schema according to type if type == self.FILE_SCHEMA: argument_type = self.OUTPUT_PROCESSED_FILE_SCHEMA argument_ = { self.ARGUMENT_FORMAT_SCHEMA: format, self.ARGUMENT_TYPE_SCHEMA: argument_type, - self.WORKFLOW_ARGUMENT_NAME_SCHEMA: name, - self.SECONDARY_FILE_FORMATS_SCHEMA: values.get(self.SECONDARY_FILES_SCHEMA, []) + self.WORKFLOW_ARGUMENT_NAME_SCHEMA: name } + # check for secondary files + if values.get(self.SECONDARY_FILES_SCHEMA): + argument_[self.SECONDARY_FILE_FORMATS_SCHEMA] = values.get(self.SECONDARY_FILES_SCHEMA) elif type == self.QC_SCHEMA: - # handle generic vs specific QC schema - if format == self.QUALITY_METRIC_GENERIC_SCHEMA: - argument_type = self.GENERIC_QC_FILE_SCHEMA - else: - argument_type = self.OUTPUT_QC_FILE_SCHEMA + argument_type = self.GENERIC_QC_FILE_SCHEMA # create base QC argument argument_ = { self.ARGUMENT_TYPE_SCHEMA: argument_type, self.WORKFLOW_ARGUMENT_NAME_SCHEMA: name, self.ARGUMENT_TO_BE_ATTACHED_TO_SCHEMA: values[self.ARGUMENT_TO_BE_ATTACHED_TO_SCHEMA], self.QC_ZIPPED_SCHEMA: values.get(self.ZIPPED_SCHEMA, False), - self.QC_HTML_SCHEMA: values.get(self.HTML_SCHEMA, False), self.QC_JSON_SCHEMA: values.get(self.JSON_SCHEMA, False), - self.QC_TABLE_SCHEMA: values.get(self.TABLE_SCHEMA, False) } - # handle edge case for missing or generic QC type - if format not in ['none', self.QUALITY_METRIC_GENERIC_SCHEMA]: - argument_[self.QC_TYPE_SCHEMA] = format - # create argument format for generic QCs (JSON or ZIP) - elif format == self.QUALITY_METRIC_GENERIC_SCHEMA: - if argument_[self.QC_JSON_SCHEMA]: - argument_[self.ARGUMENT_FORMAT_SCHEMA] = 'json' - else: - argument_[self.ARGUMENT_FORMAT_SCHEMA] = 'zip' - # quality controls, TODO - # these fields are bad, need to rework how QCs work - if values.get(self.HTML_IN_ZIPPED_SCHEMA): - argument_[self.QC_ZIPPED_HTML_SCHEMA] = values[self.HTML_IN_ZIPPED_SCHEMA] - if values.get(self.TABLES_IN_ZIPPED_SCHEMA): - argument_[self.QC_ZIPPED_TABLES_SCHEMA] = values[self.TABLES_IN_ZIPPED_SCHEMA] - if values.get(self.QC_ACL): - argument_[self.QC_ACL] = values[self.QC_ACL] - if values.get(self.QC_UNZIP_FROM_EC2): - argument_[self.QC_UNZIP_FROM_EC2] = values[self.QC_UNZIP_FROM_EC2] - elif type == self.REPORT_SCHEMA and format == self.FILE_SCHEMA: + # check if it is json or zip + if argument_[self.QC_JSON_SCHEMA]: + argument_[self.ARGUMENT_FORMAT_SCHEMA] = 'json' + else: + argument_[self.ARGUMENT_FORMAT_SCHEMA] = 'zip' + elif type == self.REPORT_SCHEMA: argument_type = self.OUTPUT_REPORT_FILE_SCHEMA argument_ = { self.ARGUMENT_TYPE_SCHEMA: argument_type, @@ -302,8 +274,8 @@ def _arguments_output(self): def to_json( self, version, - institution, # alias - project, # alias + submission_centers, # alias list + consortia, # alias list wflbucket_url ): """Function to build the corresponding object in JSON format. 
@@ -311,29 +283,26 @@ def to_json( wfl_json = {} # common metadata - wfl_json[self.APP_NAME_SCHEMA] = self.name # name - wfl_json[self.APP_VERSION_SCHEMA] = version # version - wfl_json[self.NAME_SCHEMA] = f'{self.name}_{version}' + wfl_json[self.VERSION_SCHEMA] = version # version + wfl_json[self.NAME_SCHEMA] = self.name wfl_json[self.TITLE_SCHEMA] = self._link_title(self.name, version) - wfl_json[self.ALIASES_SCHEMA] = [f'{project}:{self.WORKFLOW_TYPE_SCHEMA}-{wfl_json[self.NAME_SCHEMA]}'] - wfl_json[self.INSTITUTION_SCHEMA] = self._link_institution(institution) - wfl_json[self.PROJECT_SCHEMA] = self._link_project(project) + wfl_json[self.ALIASES_SCHEMA] = [f'{self._string_consortia(consortia)}:{self.WORKFLOW_TYPE_SCHEMA}-{self.name}_{version}'] + wfl_json[self.CATEGORY_SCHEMA] = self.category + wfl_json[self.SUBMISSION_CENTERS_SCHEMA] = submission_centers + wfl_json[self.CONSORTIA_SCHEMA] = consortia wfl_json[self.DESCRIPTION_SCHEMA] = self.description - wfl_json[self.SOFTWARE_SCHEMA] = [f'{project}:{self.SOFTWARE_TYPE_SCHEMA}-{s.replace("@", "_")}' for s in getattr(self, self.SOFTWARE_SCHEMA, [])] + # check if software + if getattr(self, self.SOFTWARE_SCHEMA, None): + wfl_json[self.SOFTWARE_SCHEMA] = [f'{self._string_consortia(consortia)}:{self.SOFTWARE_TYPE_SCHEMA}-{s.replace("@", "_")}' for s in getattr(self, self.SOFTWARE_SCHEMA)] wfl_json[self.ARGUMENTS_SCHEMA] = self._arguments_input() + self._arguments_output() - # workflow language (TODO) - # we need to improve tibanna to have a unique general key for this - language = self.runner.get('language') - if not language or language.lower() == 'cwl': - wfl_json['cwl_directory_url_v1'] = wflbucket_url - wfl_json['cwl_main_filename'] = self.runner['main'] - wfl_json['cwl_child_filenames'] = self.runner.get('child', []) - elif language.lower() == 'wdl': - wfl_json['wdl_directory_url'] = wflbucket_url - wfl_json['wdl_main_filename'] = self.runner['main'] - wfl_json['wdl_child_filenames'] = self.runner.get('child', []) - wfl_json['workflow_language'] = 'wdl' + # workflow language and description files + wfl_json['language'] = self.runner['language'].upper() + wfl_json['directory_url'] = wflbucket_url + wfl_json['main_file_name'] = self.runner['main'] + # check if child description files + if self.runner.get('child'): + wfl_json['child_file_names'] = self.runner.get('child') # uuid, accession if specified if getattr(self, self.UUID_SCHEMA, None): @@ -360,7 +329,17 @@ class YAMLMetaWorkflow(YAMLTemplate): CONFIG_SCHEMA = 'config' DEPENDENCIES_SCHEMA = 'dependencies' SHARDS_SCHEMA = 'shards' - PROBAND_ONLY_SCHEMA = 'proband_only' + QC_THRESHOLDS_SCHEMA = 'qc_thresholds' + OVERALL_QUALITY_STATUS_RULE_SCHEMA = 'overall_quality_status_rule' + ID_SCHEMA = 'id' + METRIC_SCHEMA = 'metric' + OPERATOR_SCHEMA = 'operator' + PASS_TARGET_SCHEMA = 'pass_target' + WARN_TARGET_SCHEMA = 'warn_target' + USE_AS_QC_FLAG_SCHEMA = 'use_as_qc_flag' + RULE_SCHEMA = 'rule' + FLAG_SCHEMA = 'flag' + QC_RULE_SCHEMA = 'qc_rule' def __init__(self, data): """Constructor method. @@ -370,11 +349,11 @@ def __init__(self, data): self._validate() # load attributes for key, val in data.items(): - if key in [self.DESCRIPTION_SCHEMA, self.TITLE_SCHEMA]: + if key in [self.DESCRIPTION_SCHEMA]: val = self._clean_newline(val) setattr(self, key, val) - def _arguments(self, input, project): + def _arguments(self, input, consortia): """Helper to parse arguments and map to expected JSON structure. 
""" arguments = [] @@ -387,7 +366,7 @@ def _arguments(self, input, project): if type == self.PARAMETER_SCHEMA: argument_[self.VALUE_TYPE_SCHEMA] = format for k, v in values.items(): - if k != self.ARGUMENT_TYPE_SCHEMA: + if k not in [self.ARGUMENT_TYPE_SCHEMA, self.QC_RULE_SCHEMA]: # handle files specifications, TODO # this system could be improved in how the schema works and deals with types # @@ -399,29 +378,56 @@ def _arguments(self, input, project): # - bar@v3 # need to convert to: # files: [ - # {file: ':FileReference-foo_v1'} + # {file: ':ReferenceFile-foo_v1'} # ] # ----- or ------- # files: [ - # {file: ':FileReference-foo_v1', dimension: '0'}, - # {file: ':FileReference-bar_v3', dimension: '1'} + # {file: ':ReferenceFile-foo_v1', dimension: '0'}, + # {file: ':ReferenceFile-bar_v3', dimension: '1'} # ] if k == self.FILES_SCHEMA: v_ = [] for i, name_ in enumerate(v): - v_.append({self.FILE_SCHEMA: f'{project}:{self.FILEREFERENCE_TYPE_SCHEMA}-{name_.replace("@", "_")}', + v_.append({self.FILE_SCHEMA: f'{self._string_consortia(consortia)}:{self.REFERENCEFILE_TYPE_SCHEMA}-{name_.replace("@", "_")}', self.DIMENSION_SCHEMA: str(i)}) # remove DIMENSION_SCHEMA field if only one file + # this is necessary so the file will be posted as a string and not a list + # having a list will break tibanna creating the correct input for cwltool if len(v_) == 1: del v_[0][self.DIMENSION_SCHEMA] argument_.setdefault(k, v_) + elif k == self.QC_THRESHOLDS_SCHEMA: + v_ = { + self.QC_THRESHOLDS_SCHEMA: [], + self.OVERALL_QUALITY_STATUS_RULE_SCHEMA: values[self.QC_RULE_SCHEMA] + } + for id, rule in v.items(): + metric, operator, pass_target, warn_target = rule[self.RULE_SCHEMA].split('|') + flag = rule.get(self.FLAG_SCHEMA) + # convert to float if number + try: pass_target = float(pass_target) + except ValueError: pass + try: warn_target = float(warn_target) + except ValueError: pass + # format rule + rule_ = { + self.ID_SCHEMA: id, + self.METRIC_SCHEMA: metric, + self.OPERATOR_SCHEMA: operator, + self.PASS_TARGET_SCHEMA: pass_target, + self.WARN_TARGET_SCHEMA: warn_target + } + if flag: # add use as flag if present + rule_[self.USE_AS_QC_FLAG_SCHEMA] = flag + v_[self.QC_THRESHOLDS_SCHEMA].append(rule_) + argument_.setdefault(self.VALUE_SCHEMA, v_) else: argument_.setdefault(k, v) arguments.append(argument_) return arguments - def _workflows(self, version, project): + def _workflows(self, version, consortia): """Helper to parse workflow definitions and map to expected JSON structure. """ workflows = [] @@ -435,9 +441,9 @@ def _workflows(self, version, project): # basic JSON workflow structure workflow_ = { self.NAME_SCHEMA: name, - self.WORKFLOW_SCHEMA: f'{project}:{self.WORKFLOW_TYPE_SCHEMA}-{name.split("@")[0]}_{version_}', + self.WORKFLOW_SCHEMA: f'{self._string_consortia(consortia)}:{self.WORKFLOW_TYPE_SCHEMA}-{name.split("@")[0]}_{version_}', # remove unique tag after @ to create the right alias to link - self.INPUT_SCHEMA: self._arguments(values[self.INPUT_SCHEMA], project), + self.INPUT_SCHEMA: self._arguments(values[self.INPUT_SCHEMA], consortia), self.CONFIG_SCHEMA: values[self.CONFIG_SCHEMA] } # file output can be optional @@ -457,8 +463,8 @@ def _workflows(self, version, project): def to_json( self, version, - institution, # alias - project # alias + submission_centers, # alias list + consortia # alias list ): """Function to build the corresponding object in JSON format. 
""" @@ -468,16 +474,13 @@ def to_json( metawfl_json[self.NAME_SCHEMA] = self.name metawfl_json[self.VERSION_SCHEMA] = version # version metawfl_json[self.TITLE_SCHEMA] = self._link_title(self.name, version) - metawfl_json[self.ALIASES_SCHEMA] = [f'{project}:{self.METAWORKFLOW_TYPE_SCHEMA}-{self.name}_{version}'] - metawfl_json[self.INSTITUTION_SCHEMA] = self._link_institution(institution) - metawfl_json[self.PROJECT_SCHEMA] = self._link_project(project) + metawfl_json[self.ALIASES_SCHEMA] = [f'{self._string_consortia(consortia)}:{self.METAWORKFLOW_TYPE_SCHEMA}-{self.name}_{version}'] + metawfl_json[self.CATEGORY_SCHEMA] = self.category + metawfl_json[self.SUBMISSION_CENTERS_SCHEMA] = submission_centers + metawfl_json[self.CONSORTIA_SCHEMA] = consortia metawfl_json[self.DESCRIPTION_SCHEMA] = self.description - metawfl_json[self.INPUT_SCHEMA] = self._arguments(self.input, project) - metawfl_json[self.WORKFLOWS_SCHEMA] = self._workflows(version, project) - - # proband_only field - if getattr(self, self.PROBAND_ONLY_SCHEMA, None): - metawfl_json[self.PROBAND_ONLY_SCHEMA] = self.proband_only + metawfl_json[self.INPUT_SCHEMA] = self._arguments(self.input, consortia) + metawfl_json[self.WORKFLOWS_SCHEMA] = self._workflows(version, consortia) # uuid, accession if specified if getattr(self, self.UUID_SCHEMA, None): @@ -507,14 +510,14 @@ def __init__(self, data): self._validate() # load attributes for key, val in data.items(): - if key in [self.DESCRIPTION_SCHEMA, self.TITLE_SCHEMA]: + if key in [self.DESCRIPTION_SCHEMA]: val = self._clean_newline(val) setattr(self, key, val) def to_json( self, - institution, # alias - project # alias + submission_centers, # alias list + consortia # alias list ): """Function to build the corresponding object in JSON format. """ @@ -522,8 +525,9 @@ def to_json( # common metadata sftwr_json[self.NAME_SCHEMA] = self.name - sftwr_json[self.INSTITUTION_SCHEMA] = self._link_institution(institution) - sftwr_json[self.PROJECT_SCHEMA] = self._link_project(project) + sftwr_json[self.SUBMISSION_CENTERS_SCHEMA] = submission_centers + sftwr_json[self.CONSORTIA_SCHEMA] = consortia + sftwr_json[self.CATEGORY_SCHEMA] = self.category if getattr(self, self.VERSION_SCHEMA, None): sftwr_json[self.VERSION_SCHEMA] = self.version @@ -538,7 +542,7 @@ def to_json( sftwr_json[self.SOURCE_URL_SCHEMA] = self.source_url sftwr_json[self.TITLE_SCHEMA] = self._link_title(self.name, version) - sftwr_json[self.ALIASES_SCHEMA] = [f'{project}:{self.SOFTWARE_TYPE_SCHEMA}-{self.name}_{version}'] + sftwr_json[self.ALIASES_SCHEMA] = [f'{self._string_consortia(consortia)}:{self.SOFTWARE_TYPE_SCHEMA}-{self.name}_{version}'] # uuid, accession if specified if getattr(self, self.UUID_SCHEMA, None): @@ -554,19 +558,21 @@ def to_json( ############################################################### -# YAMLFileReference, YAML FileReference +# YAMLReferenceFile, YAML ReferenceFile ############################################################### -class YAMLFileReference(YAMLTemplate): - """Class to work with YAML documents representing FileReference objects. +class YAMLReferenceFile(YAMLTemplate): + """Class to work with YAML documents representing ReferenceFile objects. """ # schema constants EXTRA_FILES_SCHEMA = 'extra_files' + DATA_CATEGORY_SCHEMA = 'data_category' + DATA_TYPE_SCHEMA = 'data_type' def __init__(self, data): """Constructor method. 
""" - super().__init__(data, yaml_file_reference_schema) + super().__init__(data, yaml_reference_file_schema) # validate data with schema self._validate() # load attributes @@ -577,24 +583,31 @@ def __init__(self, data): def to_json( self, - institution, # alias - project # alias + submission_centers, # alias list + consortia # alias list ): """Function to build the corresponding object in JSON format. """ ref_json = {} # common metadata - ref_json[self.INSTITUTION_SCHEMA] = self._link_institution(institution) - ref_json[self.PROJECT_SCHEMA] = self._link_project(project) + ref_json[self.SUBMISSION_CENTERS_SCHEMA] = submission_centers + ref_json[self.CONSORTIA_SCHEMA] = consortia ref_json[self.DESCRIPTION_SCHEMA] = self.description ref_json[self.FILE_FORMAT_SCHEMA] = self.format - ref_json[self.ALIASES_SCHEMA] = [f'{project}:{self.FILEREFERENCE_TYPE_SCHEMA}-{self.name}_{self.version}'] - ref_json[self.EXTRA_FILES_SCHEMA] = getattr(self, self.SECONDARY_FILES_SCHEMA, []) + ref_json[self.ALIASES_SCHEMA] = [f'{self._string_consortia(consortia)}:{self.REFERENCEFILE_TYPE_SCHEMA}-{self.name}_{self.version}'] + # check for secondary files + if getattr(self, self.SECONDARY_FILES_SCHEMA, None): + ref_json[self.EXTRA_FILES_SCHEMA] = getattr(self, self.SECONDARY_FILES_SCHEMA) ref_json[self.STATUS_SCHEMA] = getattr(self, self.STATUS_SCHEMA, None) # this will be used during post/patch, # if None: # - leave it as is if patch # - set to uploading if post + ref_json[self.DATA_CATEGORY_SCHEMA] = self.category + ref_json[self.DATA_TYPE_SCHEMA] = self.type + # variant_type + if getattr(self, self.VARIANT_TYPE_SCHEMA, None): + ref_json[self.VARIANT_TYPE_SCHEMA] = self.variant_type # uuid, accession if specified if getattr(self, self.UUID_SCHEMA, None): @@ -617,10 +630,11 @@ class YAMLFileFormat(YAMLTemplate): """ # schema constants + IDENTIFIER_SCHEMA = 'identifier' STANDARD_FILE_EXTENSION_SCHEMA = 'standard_file_extension' - VALID_ITEM_TYPES_SCHEMA = 'valid_item_types' - EXTRAFILE_FORMATS_SCHEMA = 'extrafile_formats' - FILE_TYPES_SCHEMA = 'file_types' + # VALID_ITEM_TYPES_SCHEMA = 'valid_item_types' + EXTRA_FILE_FORMATS_SCHEMA = 'extra_file_formats' + # FILE_TYPES_SCHEMA = 'file_types' def __init__(self, data): """Constructor method. @@ -636,23 +650,25 @@ def __init__(self, data): def to_json( self, - institution, # alias - project # alias + submission_centers, # alias list + consortia # alias list ): """Function to build the corresponding object in JSON format. 
""" frmt_json = {} # common metadata - frmt_json[self.FILE_FORMAT_SCHEMA] = self.name - frmt_json[self.ALIASES_SCHEMA] = [f'{project}:{self.FILEFORMAT_TYPE_SCHEMA}-{self.name}'] - frmt_json[self.INSTITUTION_SCHEMA] = self._link_institution(institution) - frmt_json[self.PROJECT_SCHEMA] = self._link_project(project) + frmt_json[self.IDENTIFIER_SCHEMA] = self.name + frmt_json[self.ALIASES_SCHEMA] = [f'{self._string_consortia(consortia)}:{self.FILEFORMAT_TYPE_SCHEMA}-{self.name}'] + frmt_json[self.SUBMISSION_CENTERS_SCHEMA] = submission_centers + frmt_json[self.CONSORTIA_SCHEMA] = consortia frmt_json[self.DESCRIPTION_SCHEMA] = self.description frmt_json[self.STANDARD_FILE_EXTENSION_SCHEMA] = self.extension - frmt_json[self.VALID_ITEM_TYPES_SCHEMA] = getattr(self, self.FILE_TYPES_SCHEMA, ['FileReference', 'FileProcessed']) - frmt_json[self.EXTRAFILE_FORMATS_SCHEMA] = getattr(self, self.SECONDARY_FORMATS_SCHEMA, []) - frmt_json[self.STATUS_SCHEMA] = getattr(self, self.STATUS_SCHEMA, 'shared') + # frmt_json[self.VALID_ITEM_TYPES_SCHEMA] = getattr(self, self.FILE_TYPES_SCHEMA, ['ReferenceFile', 'FileProcessed']) + # check for secondary formats + if getattr(self, self.SECONDARY_FORMATS_SCHEMA, None): + frmt_json[self.EXTRA_FILE_FORMATS_SCHEMA] = getattr(self, self.SECONDARY_FORMATS_SCHEMA) + frmt_json[self.STATUS_SCHEMA] = getattr(self, self.STATUS_SCHEMA, 'released') # uuid, accession if specified if getattr(self, self.UUID_SCHEMA, None): diff --git a/pipeline_utils/pipeline_deploy.py b/pipeline_utils/pipeline_deploy.py index 0d4e5d2..54944ed 100644 --- a/pipeline_utils/pipeline_deploy.py +++ b/pipeline_utils/pipeline_deploy.py @@ -75,7 +75,7 @@ def __init__(self, args, repo, version='VERSION', pipeline='PIPELINE'): self.object_ = { 'Software': yaml_parser.YAMLSoftware, 'FileFormat': yaml_parser.YAMLFileFormat, - 'FileReference': yaml_parser.YAMLFileReference, + 'ReferenceFile': yaml_parser.YAMLReferenceFile, 'Workflow': yaml_parser.YAMLWorkflow, 'MetaWorkflow': yaml_parser.YAMLMetaWorkflow } @@ -83,11 +83,11 @@ def __init__(self, args, repo, version='VERSION', pipeline='PIPELINE'): # .yaml files 'Software': 'portal_objects/software.yaml', 'FileFormat': 'portal_objects/file_format.yaml', - 'FileReference': 'portal_objects/file_reference.yaml', + 'ReferenceFile': 'portal_objects/file_reference.yaml', # .yml files 'Software_yml': 'portal_objects/software.yml', 'FileFormat_yml': 'portal_objects/file_format.yml', - 'FileReference_yml': 'portal_objects/file_reference.yml', + 'ReferenceFile_yml': 'portal_objects/file_reference.yml', # folders 'Workflow': 'portal_objects/workflows', 'MetaWorkflow': 'portal_objects/metaworkflows', @@ -141,11 +141,11 @@ def _post_patch_json(self, data_json, type): except Exception: is_patch = False - # Exception for uploading of FileReference objects + # Exception for uploading of ReferenceFile objects # status -> uploading, uploaded # default is None -> the status will not be updated during patch, # and set to uploading if post for the first time - if type == 'FileReference': + if type == 'ReferenceFile': # main status if data_json['status'] is None: if is_patch: @@ -154,20 +154,27 @@ def _post_patch_json(self, data_json, type): data_json['status'] = 'uploading' # extra_files status - extra_files_ = [] - for ext in data_json['extra_files']: - ext_ = { - 'file_format': ext, - 'status': data_json.get('status', 'uploaded') - } - extra_files_.append(ext_) - data_json['extra_files'] = extra_files_ + if data_json.get('extra_files'): + extra_files_ = [] + for ext in 
data_json['extra_files']: + ext_ = { + 'file_format': ext, + 'status': data_json.get('status', 'uploaded') + } + extra_files_.append(ext_) + data_json['extra_files'] = extra_files_ ########################################################### - if is_patch: - ff_utils.patch_metadata(data_json, uuid, key=self.ff_key) - else: - ff_utils.post_metadata(data_json, type, key=self.ff_key) + try: + if is_patch: + ff_utils.patch_metadata(data_json, uuid, key=self.ff_key) + else: + ff_utils.post_metadata(data_json, type, key=self.ff_key) + except Exception as E: + # this will skip and report errors during patching and posting + logger.info('> FAILED PORTAL VALIDATION') + logger.info(E) + pass logger.info('> Posted %s' % data_json['aliases'][0]) @@ -199,7 +206,7 @@ def _yaml_to_json(self, data_yaml, YAMLClass, **kwargs): def _post_patch_file(self, type): """ - 'Software', 'FileFormat', 'FileReference' + 'Software', 'FileFormat', 'ReferenceFile' """ logger.info(f'@ {type}...') @@ -221,8 +228,8 @@ def _post_patch_file(self, type): # creating JSON object d_ = self._yaml_to_json( d, self.object_[type], - institution=self.institution, - project=self.project + submission_centers=self.submission_centers, + consortia=self.consortia ) # post/patch object if d_: self._post_patch_json(d_, type) @@ -250,8 +257,8 @@ def _post_patch_folder(self, type): # creating _yaml_to_json **kwargs kwargs_ = { 'version': self.version, - 'institution': self.institution, - 'project': self.project + 'submission_centers': self.submission_centers, + 'consortia': self.consortia } if type == 'Workflow': kwargs_.setdefault( @@ -275,7 +282,7 @@ def _post_patch_wfl(self, type='WFL'): filepath_ = f'{self.repo}/{self.filepath[type]}' upload_ = f'{filepath_}/upload' account_ = f'{self.account}.dkr.ecr.{self.region}.amazonaws.com' - update_ = { + auth_keys_ = { 'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': self.kms_key_id } @@ -316,10 +323,10 @@ def _post_patch_wfl(self, type='WFL'): line = line.replace('LICENSEID', self.sentieon_server) write_.write(line) # upload to s3 - extra_args = {'ACL': 'public-read'} # note that this is no longer public if using encryption! 
if self.kms_key_id: - extra_args.update(update_) - s3.meta.client.upload_file(upload_file_, self.wfl_bucket, s3_file_, ExtraArgs=extra_args) + s3.meta.client.upload_file(upload_file_, self.wfl_bucket, s3_file_, ExtraArgs=auth_keys_) + else: # no kms_key_id, ExtraArgs not needed + s3.meta.client.upload_file(upload_file_, self.wfl_bucket, s3_file_) logger.info('> Posted %s' % s3_file_) # delete file to allow tmp folder to be deleted at the end os.remove(upload_file_) @@ -403,9 +410,9 @@ def run_post_patch(self): if self.post_file_format: self._post_patch_file('FileFormat') - # FileReference + # ReferenceFile if self.post_file_reference: - self._post_patch_file('FileReference') + self._post_patch_file('ReferenceFile') # Workflow if self.post_workflow: @@ -432,7 +439,7 @@ def main(args): For each repository a PostPatchRepo object is created to: - Create and POST|PATCH to database objects in JSON format for - Workflow, MetaWorkflow, FileReference, FileFormat, and Software components + Workflow, MetaWorkflow, ReferenceFile, FileFormat, and Software components - PUSH workflow descriptions to target S3 bucket - BUILD Docker images and PUSH to target ECR folder """ diff --git a/pipeline_utils/schemas/yaml_file_format.py b/pipeline_utils/schemas/yaml_file_format.py index ec904e0..023c5d1 100644 --- a/pipeline_utils/schemas/yaml_file_format.py +++ b/pipeline_utils/schemas/yaml_file_format.py @@ -20,14 +20,14 @@ schema.DESCRIPTION: 'Extension of the FileFormat', schema.TYPE: schema.STRING }, - 'file_types': { - schema.DESCRIPTION: 'File types that can use the FileFormat', - schema.TYPE: schema.ARRAY, - schema.ITEMS: { - schema.TYPE: schema.STRING, - schema.PATTERN: 'FileReference|FileProcessed|FileSubmitted|FileFastq' - } - }, + # 'file_types': { + # schema.DESCRIPTION: 'File types that can use the FileFormat', + # schema.TYPE: schema.ARRAY, + # schema.ITEMS: { + # schema.TYPE: schema.STRING, + # schema.PATTERN: 'ReferenceFile|FileProcessed|FileSubmitted|FileFastq' + # } + # }, 'status': { schema.TYPE: schema.STRING }, diff --git a/pipeline_utils/schemas/yaml_file_reference.py b/pipeline_utils/schemas/yaml_file_reference.py deleted file mode 100644 index 7641d5c..0000000 --- a/pipeline_utils/schemas/yaml_file_reference.py +++ /dev/null @@ -1,45 +0,0 @@ -from pipeline_utils.schemas import schema - -yaml_file_reference_schema = { - ## Schema ######################### - schema.SCHEMA: 'https://json-schema.org/draft/2020-12/schema', - schema.ID: '/schemas/YAMLFileReference', - schema.TITLE: 'YAMLFileReference', - schema.DESCRIPTION: 'Schema to validate a YAML description of a FileReference', - schema.TYPE: schema.OBJECT, - schema.PROPERTIES: { - 'name': { - schema.DESCRIPTION: 'Name of the FileReference', - schema.TYPE: schema.STRING - }, - 'description': { - schema.DESCRIPTION: 'Description of the FileReference', - schema.TYPE: schema.STRING - }, - 'format': { - schema.DESCRIPTION: 'Format of the FileReference', - schema.TYPE: schema.STRING - }, - 'version': { - schema.DESCRIPTION: 'Version of the FileReference', - schema.TYPE: schema.STRING - }, - 'status': { - schema.DESCRIPTION: 'Status of the upload of the FileReference', - schema.TYPE: schema.STRING, - schema.PATTERN: 'uploading|uploaded' - }, - 'secondary_files': { - schema.DESCRIPTION: 'Secondary files for the FileReference', - schema.TYPE: schema.ARRAY, - schema.ITEMS: { - schema.TYPE: schema.STRING - } - }, - 'license': { - schema.DESCRIPTION: 'License of the FileReference', - schema.TYPE: schema.STRING - } - }, - schema.REQUIRED: ['name', 
'description', 'format', 'version'] -} diff --git a/pipeline_utils/schemas/yaml_metaworkflow.py b/pipeline_utils/schemas/yaml_metaworkflow.py index ab4ebaa..743e1b2 100644 --- a/pipeline_utils/schemas/yaml_metaworkflow.py +++ b/pipeline_utils/schemas/yaml_metaworkflow.py @@ -22,6 +22,13 @@ schema.DESCRIPTION: 'Description of the MetaWorkflow', schema.TYPE: schema.STRING }, + 'category': { + schema.DESCRIPTION: 'Categories of the MetaWorkflow', + schema.TYPE: schema.ARRAY, + schema.ITEMS: { + schema.TYPE: schema.STRING + } + }, ## General input information ## 'input': { @@ -76,7 +83,7 @@ } } }, - schema.REQUIRED: ['name', 'description', 'input', 'workflows'], + schema.REQUIRED: ['name', 'description', 'category', 'input', 'workflows'], ## Sub-schemas #################### schema.DEFS: { @@ -87,7 +94,7 @@ schema.PROPERTIES: { 'argument_type': { schema.TYPE: schema.STRING, - schema.PATTERN: '^file\..+|^parameter\..+' + schema.PATTERN: '^file\\..+|^parameter\\..+' }, 'dimensionality': { schema.TYPE: schema.NUMBER @@ -96,7 +103,7 @@ schema.TYPE: schema.ARRAY, schema.ITEMS: { schema.TYPE: schema.STRING, - schema.PATTERN: '.+\@.+' # check for @ + schema.PATTERN: '.+\\@.+' # check for @ } }, 'source': { @@ -125,10 +132,30 @@ }, 'rename': { schema.TYPE: schema.STRING, - schema.PATTERN: '^formula\:.+' + schema.PATTERN: '^formula\\:.+' }, 'unzip': { schema.TYPE: schema.STRING + }, + 'qc_thresholds': { + schema.TYPE: schema.OBJECT, + schema.PATTERNPROPERTIES: { + '.+': { + schema.TYPE: schema.OBJECT, + schema.PROPERTIES: { + 'rule': { + schema.TYPE: schema.STRING, + schema.PATTERN: '^([^|]+\\|[^|]+\\|[^|]+\\|[^|]+)$' + }, + 'flag': { + schema.TYPE: schema.BOOLEAN + } + } + } + } + }, + 'qc_rule': { + schema.TYPE: schema.STRING } }, schema.REQUIRED: ['argument_type'] @@ -141,30 +168,30 @@ 'description': { schema.TYPE: schema.STRING }, - 'linkto_location': { + 'data_category': { schema.TYPE: schema.ARRAY, schema.ITEMS: { schema.TYPE: schema.STRING } }, - 'file_type': { - schema.TYPE: schema.STRING - }, - 'higlass_file': { - schema.TYPE: schema.BOOLEAN + 'data_type': { + schema.TYPE: schema.ARRAY, + schema.ITEMS: { + schema.TYPE: schema.STRING + } }, 'variant_type': { - schema.TYPE: schema.STRING - }, - 'vcf_to_ingest': { - schema.TYPE: schema.BOOLEAN + schema.TYPE: schema.ARRAY, + schema.ITEMS: { + schema.TYPE: schema.STRING + } }, 's3_lifecycle_category': { schema.TYPE: schema.STRING, schema.PATTERN: 'short_term_access_long_term_archive|short_term_access|short_term_archive|long_term_access_long_term_archive|long_term_access|long_term_archive|no_storage|ignore' } }, - schema.REQUIRED: ['file_type'] + schema.REQUIRED: ['data_category', 'data_type'] } } } diff --git a/pipeline_utils/schemas/yaml_reference_file.py b/pipeline_utils/schemas/yaml_reference_file.py new file mode 100644 index 0000000..7e19a99 --- /dev/null +++ b/pipeline_utils/schemas/yaml_reference_file.py @@ -0,0 +1,66 @@ +from pipeline_utils.schemas import schema + +yaml_reference_file_schema = { + ## Schema ######################### + schema.SCHEMA: 'https://json-schema.org/draft/2020-12/schema', + schema.ID: '/schemas/YAMLReferenceFile', + schema.TITLE: 'YAMLReferenceFile', + schema.DESCRIPTION: 'Schema to validate a YAML description of a ReferenceFile', + schema.TYPE: schema.OBJECT, + schema.PROPERTIES: { + 'name': { + schema.DESCRIPTION: 'Name of the ReferenceFile', + schema.TYPE: schema.STRING + }, + 'description': { + schema.DESCRIPTION: 'Description of the ReferenceFile', + schema.TYPE: schema.STRING + }, + 'format': { + 
schema.DESCRIPTION: 'Format of the ReferenceFile', + schema.TYPE: schema.STRING + }, + 'category': { + schema.DESCRIPTION: 'Categories of the ReferenceFile', + schema.TYPE: schema.ARRAY, + schema.ITEMS: { + schema.TYPE: schema.STRING + } + }, + 'type': { + schema.DESCRIPTION: 'Types of the ReferenceFile', + schema.TYPE: schema.ARRAY, + schema.ITEMS: { + schema.TYPE: schema.STRING + } + }, + 'variant_type': { + schema.DESCRIPTION: 'Types of variants in ReferenceFile', + schema.TYPE: schema.ARRAY, + schema.ITEMS: { + schema.TYPE: schema.STRING + } + }, + 'version': { + schema.DESCRIPTION: 'Version of the ReferenceFile', + schema.TYPE: schema.STRING + }, + 'status': { + schema.DESCRIPTION: 'Status of the upload of the ReferenceFile', + schema.TYPE: schema.STRING, + schema.PATTERN: 'uploading|uploaded' + }, + 'secondary_files': { + schema.DESCRIPTION: 'Secondary files for the ReferenceFile', + schema.TYPE: schema.ARRAY, + schema.ITEMS: { + schema.TYPE: schema.STRING + } + }, + 'license': { + schema.DESCRIPTION: 'License of the ReferenceFile', + schema.TYPE: schema.STRING + } + }, + schema.REQUIRED: ['name', 'description', 'format', 'category', 'type', 'version'] +} diff --git a/pipeline_utils/schemas/yaml_software.py b/pipeline_utils/schemas/yaml_software.py index 3afb69f..2750536 100644 --- a/pipeline_utils/schemas/yaml_software.py +++ b/pipeline_utils/schemas/yaml_software.py @@ -20,7 +20,7 @@ schema.DESCRIPTION: 'Source url of the Software', schema.TYPE: schema.STRING, schema.FORMAT: 'uri', - schema.PATTERN: '^https?\:.+' + schema.PATTERN: '^https?\\:.+' }, 'description': { schema.DESCRIPTION: 'Description of the Software', @@ -37,9 +37,16 @@ 'license': { schema.DESCRIPTION: 'License of the Software', schema.TYPE: schema.STRING + }, + 'category': { + schema.DESCRIPTION: 'Categories of the Software', + schema.TYPE: schema.ARRAY, + schema.ITEMS: { + schema.TYPE: schema.STRING + } } }, - schema.REQUIRED: ['name'], + schema.REQUIRED: ['name', 'category'], schema.ONEOF: [ {schema.REQUIRED: ['version']}, {schema.REQUIRED: ['commit']} diff --git a/pipeline_utils/schemas/yaml_workflow.py b/pipeline_utils/schemas/yaml_workflow.py index 3d7a19e..98d87be 100644 --- a/pipeline_utils/schemas/yaml_workflow.py +++ b/pipeline_utils/schemas/yaml_workflow.py @@ -34,14 +34,14 @@ 'main': { schema.DESCRIPTION: 'Main description file', schema.TYPE: schema.STRING, - schema.PATTERN: '.+\.cwl|.+\.wdl' + schema.PATTERN: '.+\\.cwl|.+\\.wdl' }, 'child': { schema.DESCRIPTION: 'Supplementary description files used by main', schema.TYPE: schema.ARRAY, schema.ITEMS: { schema.TYPE: schema.STRING, - schema.PATTERN: '.+\.cwl|.+\.wdl' + schema.PATTERN: '.+\\.cwl|.+\\.wdl' } } }, @@ -52,7 +52,14 @@ schema.TYPE: schema.ARRAY, schema.ITEMS: { schema.TYPE: schema.STRING, - schema.PATTERN: '.+\@.+' # check for @ + schema.PATTERN: '.+\\@.+' # check for @ + } + }, + 'category': { + schema.DESCRIPTION: 'Categories of the Workflow', + schema.TYPE: schema.ARRAY, + schema.ITEMS: { + schema.TYPE: schema.STRING } }, @@ -74,7 +81,7 @@ } } }, - schema.REQUIRED: ['name', 'description', 'runner', 'input', 'output'], + schema.REQUIRED: ['name', 'description', 'runner', 'category', 'input', 'output'], ## Sub-schemas #################### schema.DEFS: { @@ -85,7 +92,7 @@ schema.PROPERTIES: { 'argument_type': { schema.TYPE: schema.STRING, - schema.PATTERN: '^file\..+|^parameter\..+|^qc\..+|^report\..+' + schema.PATTERN: '^file\\..+|^parameter\\..+|^qc$|^report$' }, 'secondary_files': { schema.TYPE: schema.ARRAY, @@ -101,7 +108,7 @@ schema.TYPE: 
schema.OBJECT, schema.PROPERTIES: { 'argument_type': { - schema.PATTERN: '^qc\..+' + schema.PATTERN: '^qc\\..+' } }, }, @@ -113,23 +120,8 @@ 'zipped': { schema.TYPE: schema.BOOLEAN }, - 'html': { - schema.TYPE: schema.BOOLEAN - }, 'json': { schema.TYPE: schema.BOOLEAN - }, - 'table': { - schema.TYPE: schema.BOOLEAN - }, - 'html_in_zipped': { - schema.TYPE: schema.STRING - }, - 'tables_in_zipped': { - schema.TYPE: schema.ARRAY, - schema.ITEMS: { - schema.TYPE: schema.STRING - } } }, schema.REQUIRED: ['argument_to_be_attached_to'] diff --git a/pyproject.toml b/pyproject.toml index 54842e4..56472c2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,13 +1,13 @@ [tool.poetry] -name = "portal-pipeline-utils" -version = "3.0.0" -description = "Utilities for deploying pipelines and interfacing with portal infrastructure." +name = "smaht-pipeline-utils" +version = "0.0.1" +description = "Utilities for deploying pipelines and interfacing with SMaHT portal infrastructure." authors = [ "Michele Berselli ", - "CGAP & SMaHT Team" + "SMaHT Team" ] license = "MIT" -repository = 'https://github.com/dbmi-bgm/portal-pipeline-utils' +repository = 'https://github.com/smaht-dac/smaht-pipeline-utils' readme = "README.md" classifiers = [ 'License :: OSI Approved :: MIT License', @@ -35,7 +35,7 @@ boto3-stubs = "^1.28.62" [tool.poetry.scripts] -pipeline_utils = "pipeline_utils.__main__:main" +smaht_pipeline_utils = "pipeline_utils.__main__:main" publish-to-pypi = "dcicutils.scripts.publish_to_pypi:main" diff --git a/tests/repo_correct/portal_objects/file_format.yaml b/tests/repo_correct/portal_objects/file_format.yaml index 52bb0df..80ab7b9 100644 --- a/tests/repo_correct/portal_objects/file_format.yaml +++ b/tests/repo_correct/portal_objects/file_format.yaml @@ -5,10 +5,10 @@ extension: bam description: format to represent aligned reads secondary_formats: - bai -file_types: - - FileReference - - FileProcessed -status: shared +# file_types: +# - FileReference +# - FileProcessed +status: obsolete --- diff --git a/tests/repo_correct/portal_objects/file_reference.yaml b/tests/repo_correct/portal_objects/file_reference.yaml index 2bc0101..73c1442 100644 --- a/tests/repo_correct/portal_objects/file_reference.yaml +++ b/tests/repo_correct/portal_objects/file_reference.yaml @@ -12,11 +12,21 @@ secondary_files: status: uploading uuid: 1936f246-22e1-45dc-bb5c-9cfd55537fe7 accession: GAPFIXRDPDK5 +category: + - Sequencing Reads +type: + - Unaligned Reads +variant_type: + - SNV --- # hg38 fasta (MINIMAL) name: reference_genome +category: + - Sequencing Reads +type: + - Aligned Reads description: hg38 full reference genome plus decoy for CGAP, fasta format format: fa version: hg38 diff --git a/tests/repo_correct/portal_objects/metaworkflows/A_gatk-HC-GT.yaml b/tests/repo_correct/portal_objects/metaworkflows/A_gatk-HC-GT.yaml index 47d57a6..1fc09d7 100644 --- a/tests/repo_correct/portal_objects/metaworkflows/A_gatk-HC-GT.yaml +++ b/tests/repo_correct/portal_objects/metaworkflows/A_gatk-HC-GT.yaml @@ -18,6 +18,10 @@ input: argument_type: parameter.json value: ['SAMPLENAME'] +category: + - Alignment + - Format Conversion + workflows: ########################################## # gatk-HC @@ -43,7 +47,10 @@ workflows: output: HC_vcf: description: output from gatk-HC - file_type: hc-vcf + data_category: + - Sequencing Reads + data_type: + - Unaligned Reads linkto_location: - SampleProcessing # gatk-HC config @@ -72,9 +79,13 @@ workflows: output: GT_vcf: description: output from gatk-GT - file_type: GT-vcf + data_category: + - 
Sequencing Reads + data_type: + - Unaligned Reads higlass_file: True - variant_type: SNV + variant_type: + - SNV # gatk-GT config config: ebs_size: 3x diff --git a/tests/repo_correct/portal_objects/metaworkflows/B_minimal-gatk-HC-GT.yaml b/tests/repo_correct/portal_objects/metaworkflows/B_minimal-gatk-HC-GT.yaml index 22f312f..1e0d92c 100644 --- a/tests/repo_correct/portal_objects/metaworkflows/B_minimal-gatk-HC-GT.yaml +++ b/tests/repo_correct/portal_objects/metaworkflows/B_minimal-gatk-HC-GT.yaml @@ -7,6 +7,9 @@ description: Pipeline to run gatk-HC to call variants uuid: 1936f246-22e1-45dc-bb5c-9cfd55537fe9 accession: GAPFIXRDPDK1 +category: + - Variant Calling + input: input_vcf: argument_type: file.vcf @@ -39,7 +42,11 @@ workflows: # gatk-HC output output: HC_vcf: - file_type: hc-vcf + description: hc-vcf + data_category: + - Sequencing Reads + data_type: + - Unaligned Reads # gatk-HC config config: ebs_size: 2x diff --git a/tests/repo_correct/portal_objects/metaworkflows/QC_test.yaml b/tests/repo_correct/portal_objects/metaworkflows/QC_test.yaml new file mode 100644 index 0000000..eb54676 --- /dev/null +++ b/tests/repo_correct/portal_objects/metaworkflows/QC_test.yaml @@ -0,0 +1,72 @@ + +# gatk-HC-GT-pipeline (MINIMAL) +# + uuid +# + accession +name: gatk-HC-pipeline +description: Pipeline to run gatk-HC to call variants +uuid: 1936f246-22e1-45dc-bb5c-9cfd55537fe9 +accession: GAPFIXRDPDK1 + +category: + - Variant Calling + +input: + input_vcf: + argument_type: file.vcf + + reference: + argument_type: file.fa + files: + - reference_genome@hg38 + - reference_bam@hg38 + + samples: + argument_type: parameter.json + + qc_ruleset_name_1: + argument_type: parameter.qc_ruleset + qc_thresholds: + c1: + rule: coverage|>=|100|80 + flag: True + c2: + rule: coverage|<=|200|180 + c3: + rule: coverage|>|80|3.3 + rl: + rule: read_length|==|PASS|NOT PASS + flag: True + qc_rule: ( {c1} and {c2} ) or not ( {c3} and {rl} ) + +workflows: + ########################################## + # gatk-HC + ########################################## + gatk-HC: + # gatk-HC input + input: + vcf: + argument_type: file.vcf + source_argument_name: input_vcf + + reference: + argument_type: file.fa + + samples: + argument_type: parameter.json + + qc_ruleset: + argument_type: parameter.qc_ruleset + source_argument_name: qc_ruleset_name_1 + # gatk-HC output + output: + HC_vcf: + description: hc-vcf + data_category: + - Sequencing Reads + data_type: + - Unaligned Reads + # gatk-HC config + config: + ebs_size: 2x + ec2_type: m.5xlarge diff --git a/tests/repo_correct/portal_objects/software.yaml b/tests/repo_correct/portal_objects/software.yaml index 9a0463b..4ea9c7d 100644 --- a/tests/repo_correct/portal_objects/software.yaml +++ b/tests/repo_correct/portal_objects/software.yaml @@ -5,6 +5,8 @@ version: 4.1.2 title: gatk 4.1.2 source_url: 'http:/broad' description: gatk software package +category: + - Aligner --- @@ -12,6 +14,8 @@ description: gatk software package # + uuid # + accession name: picard +category: + - Variant Caller commit: 324ePT uuid: efdac7ec-7da3-4f23-9056-7a04abbc5e8b accession: GAPMKF1LL29K diff --git a/tests/repo_correct/portal_objects/workflows/A_gatk-HC.yaml b/tests/repo_correct/portal_objects/workflows/A_gatk-HC.yaml index 6c06db7..717dd5b 100644 --- a/tests/repo_correct/portal_objects/workflows/A_gatk-HC.yaml +++ b/tests/repo_correct/portal_objects/workflows/A_gatk-HC.yaml @@ -3,6 +3,9 @@ name: gatk-HaplotypeCaller description: Run HaplotypeCaller from gatk package +category: + - Annotation + runner: 
language: wdl main: workflow_gatk-HaplotypeCaller-check.wdl @@ -30,7 +33,7 @@ output: - vcf_gz_tbi vcfcheck: - argument_type: qc.quality_metric_vcfcheck + argument_type: qc argument_to_be_attached_to: output_vcf zipped: False json: True diff --git a/tests/repo_correct/portal_objects/workflows/B_minimal-gatk-HC.yaml b/tests/repo_correct/portal_objects/workflows/B_minimal-gatk-HC.yaml index 7885f67..9d382d4 100644 --- a/tests/repo_correct/portal_objects/workflows/B_minimal-gatk-HC.yaml +++ b/tests/repo_correct/portal_objects/workflows/B_minimal-gatk-HC.yaml @@ -19,3 +19,6 @@ output: uuid: 1936f246-22e1-45dc-bb5c-9cfd55537fe9 accession: GAPFIXRDPDK1 + +category: + - Feature Calling diff --git a/tests/repo_error/portal_objects/file_format.yaml b/tests/repo_error/portal_objects/file_format.yaml index 03ee21c..0aa0aed 100644 --- a/tests/repo_error/portal_objects/file_format.yaml +++ b/tests/repo_error/portal_objects/file_format.yaml @@ -6,9 +6,9 @@ extension: bam description: format to represent aligned reads secondary_formats: - bai -file_types: - - FileReference - - FileProcessed +# file_types: +# - FileReference +# - FileProcessed status: shared --- diff --git a/tests/repo_error/portal_objects/file_reference.yaml b/tests/repo_error/portal_objects/file_reference.yaml index cb47497..bc64811 100644 --- a/tests/repo_error/portal_objects/file_reference.yaml +++ b/tests/repo_error/portal_objects/file_reference.yaml @@ -12,11 +12,21 @@ secondary_files: status: uploading uuid: 1936f246-22e1-45dc-bb5c-9cfd55537fe7 accession: GAPFIXRDPDK5 +category: + - Sequencing Reads +type: + - Unaligned Reads +variant_type: + - SNV --- # hg38 fasta (MINIMAL) name: reference_genome +category: + - Sequencing Reads +type: + - Aligned Reads description: hg38 full reference genome plus decoy for CGAP, fasta format format: fa version: hg38 diff --git a/tests/repo_error/portal_objects/metaworkflows/A_gatk-HC-GT.yaml b/tests/repo_error/portal_objects/metaworkflows/A_gatk-HC-GT.yaml index 1cf0506..ca13039 100644 --- a/tests/repo_error/portal_objects/metaworkflows/A_gatk-HC-GT.yaml +++ b/tests/repo_error/portal_objects/metaworkflows/A_gatk-HC-GT.yaml @@ -19,6 +19,10 @@ input: argument_type: parameter.json value: ['SAMPLENAME'] +category: + - Alignment + - Format Conversion + workflows: ########################################## # gatk-HC @@ -44,7 +48,10 @@ workflows: output: HC_vcf: description: output from gatk-HC - file_type: hc-vcf + data_category: + - Sequencing Reads + data_type: + - Unaligned Reads linkto_location: - SampleProcessing # gatk-HC config diff --git a/tests/repo_error/portal_objects/metaworkflows/B_minimal-gatk-HC-GT.yaml b/tests/repo_error/portal_objects/metaworkflows/B_minimal-gatk-HC-GT.yaml index 22f312f..1e0d92c 100644 --- a/tests/repo_error/portal_objects/metaworkflows/B_minimal-gatk-HC-GT.yaml +++ b/tests/repo_error/portal_objects/metaworkflows/B_minimal-gatk-HC-GT.yaml @@ -7,6 +7,9 @@ description: Pipeline to run gatk-HC to call variants uuid: 1936f246-22e1-45dc-bb5c-9cfd55537fe9 accession: GAPFIXRDPDK1 +category: + - Variant Calling + input: input_vcf: argument_type: file.vcf @@ -39,7 +42,11 @@ workflows: # gatk-HC output output: HC_vcf: - file_type: hc-vcf + description: hc-vcf + data_category: + - Sequencing Reads + data_type: + - Unaligned Reads # gatk-HC config config: ebs_size: 2x diff --git a/tests/repo_error/portal_objects/software.yaml b/tests/repo_error/portal_objects/software.yaml index 0282749..729f530 100644 --- a/tests/repo_error/portal_objects/software.yaml +++ 
b/tests/repo_error/portal_objects/software.yaml @@ -5,6 +5,8 @@ name: gatk title: gatk 4.1.2 source_url: 'http:/broad' description: gatk software package +category: + - Aligner --- @@ -12,6 +14,8 @@ description: gatk software package # + uuid # + accession name: picard +category: + - Variant Caller commit: 324ePT uuid: efdac7ec-7da3-4f23-9056-7a04abbc5e8b accession: GAPMKF1LL29K diff --git a/tests/repo_error/portal_objects/workflows/A_gatk-HC.yaml b/tests/repo_error/portal_objects/workflows/A_gatk-HC.yaml index 6c06db7..717dd5b 100644 --- a/tests/repo_error/portal_objects/workflows/A_gatk-HC.yaml +++ b/tests/repo_error/portal_objects/workflows/A_gatk-HC.yaml @@ -3,6 +3,9 @@ name: gatk-HaplotypeCaller description: Run HaplotypeCaller from gatk package +category: + - Annotation + runner: language: wdl main: workflow_gatk-HaplotypeCaller-check.wdl @@ -30,7 +33,7 @@ output: - vcf_gz_tbi vcfcheck: - argument_type: qc.quality_metric_vcfcheck + argument_type: qc argument_to_be_attached_to: output_vcf zipped: False json: True diff --git a/tests/repo_error/portal_objects/workflows/B_minimal-gatk-HC.yaml b/tests/repo_error/portal_objects/workflows/B_minimal-gatk-HC.yaml index fc0d14f..afb7c17 100644 --- a/tests/repo_error/portal_objects/workflows/B_minimal-gatk-HC.yaml +++ b/tests/repo_error/portal_objects/workflows/B_minimal-gatk-HC.yaml @@ -20,3 +20,6 @@ output: uuid: 1936f246-22e1-45dc-bb5c-9cfd55537fe9 accession: GAPFIXRDPDK1 + +category: + - Feature Calling diff --git a/tests/test_check_schemas.py b/tests/test_check_schemas.py index b7a5c8b..4a3bace 100644 --- a/tests/test_check_schemas.py +++ b/tests/test_check_schemas.py @@ -11,7 +11,7 @@ from pipeline_utils.schemas.yaml_workflow import yaml_workflow_schema from pipeline_utils.schemas.yaml_metaworkflow import yaml_metaworkflow_schema from pipeline_utils.schemas.yaml_software import yaml_software_schema -from pipeline_utils.schemas.yaml_file_reference import yaml_file_reference_schema +from pipeline_utils.schemas.yaml_reference_file import yaml_reference_file_schema from pipeline_utils.schemas.yaml_file_format import yaml_file_format_schema ############################################################### @@ -30,10 +30,10 @@ def test_yaml_file_format_schema(): """ Draft202012Validator.check_schema(yaml_file_format_schema) -def test_yaml_file_reference_schema(): +def test_yaml_reference_file_schema(): """ """ - Draft202012Validator.check_schema(yaml_file_reference_schema) + Draft202012Validator.check_schema(yaml_reference_file_schema) def test_yaml_software_schema(): """ diff --git a/tests/test_yaml_file_format.py b/tests/test_yaml_file_format.py index e6a8ba7..a9c920e 100644 --- a/tests/test_yaml_file_format.py +++ b/tests/test_yaml_file_format.py @@ -15,25 +15,24 @@ def test_file_format(): { "aliases": ["cgap-core:FileFormat-bam"], "description": "format to represent aligned reads", - "extrafile_formats": ["bai"], - "file_format": "bam", - "institution": "/institutions/hms-dbmi/", - "project": "/projects/cgap-core/", + "extra_file_formats": ["bai"], + "identifier": "bam", + "submission_centers": ["hms-dbmi"], + "consortia": ["cgap-core"], "standard_file_extension": "bam", - "status": "shared", - "valid_item_types": ["FileReference", "FileProcessed"] + "status": "obsolete" + # "valid_item_types": ["ReferenceFile", "FileProcessed"] }, { "accession": 'GAPFIXRDPDK1', "aliases": ["cgap-core:FileFormat-bam_bai"], "description": "index for bam format", - "extrafile_formats": [], - "file_format": "bam_bai", - "institution": "/institutions/hms-dbmi/", - 
"project": "/projects/cgap-core/", + "identifier": "bam_bai", + "submission_centers": ["hms-dbmi"], + "consortia": ["cgap-core"], "standard_file_extension": "bam.bai", - "status": "shared", - "valid_item_types": ["FileReference", "FileProcessed"], + "status": "released", + # "valid_item_types": ["ReferenceFile", "FileProcessed"], "uuid": '1936f246-22e1-45dc-bb5c-9cfd55537fe9' } ] @@ -41,8 +40,8 @@ def test_file_format(): for i, d in enumerate(yaml_parser.load_yaml('tests/repo_correct/portal_objects/file_format.yaml')): # creating JSON object d_ = yaml_parser.YAMLFileFormat(d).to_json( - institution='hms-dbmi', - project='cgap-core' + submission_centers=["hms-dbmi"], + consortia=["cgap-core"] ) # check assert d_ == res[i] @@ -55,8 +54,8 @@ def test_file_format_error(): try: # creating JSON object d_ = yaml_parser.YAMLFileFormat(d).to_json( - institution='hms-dbmi', - project='cgap-core' + submission_centers=["hms-dbmi"], + consortia=["cgap-core"] ) except yaml_parser.ValidationError as e: pass diff --git a/tests/test_yaml_file_reference.py b/tests/test_yaml_file_reference.py index 797a1ce..829181d 100644 --- a/tests/test_yaml_file_reference.py +++ b/tests/test_yaml_file_reference.py @@ -14,31 +14,35 @@ def test_file_reference(): res = [ { "accession": "GAPFIXRDPDK5", - "aliases": ["cgap-core:FileReference-reference_genome_hg38"], + "aliases": ["cgap-core:ReferenceFile-reference_genome_hg38"], "description": "hg38 full reference genome plus decoy for CGAP, fasta format", "extra_files": ["fa_fai", "dict"], "file_format": "fa", - "institution": "/institutions/hms-dbmi/", - "project": "/projects/cgap-core/", + "submission_centers": ["hms-dbmi"], + "consortia": ["cgap-core"], "status": "uploading", - "uuid": "1936f246-22e1-45dc-bb5c-9cfd55537fe7" + "uuid": "1936f246-22e1-45dc-bb5c-9cfd55537fe7", + "data_category": ["Sequencing Reads"], + "data_type": ["Unaligned Reads"], + "variant_type": ["SNV"] }, { - "aliases": ["cgap-core:FileReference-reference_genome_hg38"], + "aliases": ["cgap-core:ReferenceFile-reference_genome_hg38"], "description": "hg38 full reference genome plus decoy for CGAP, fasta format", - "extra_files": [], "file_format": "fa", - "institution": "/institutions/hms-dbmi/", - "project": "/projects/cgap-core/", - "status": None + "submission_centers": ["hms-dbmi"], + "consortia": ["cgap-core"], + "status": None, + "data_category": ["Sequencing Reads"], + "data_type": ["Aligned Reads"] } ] for i, d in enumerate(yaml_parser.load_yaml('tests/repo_correct/portal_objects/file_reference.yaml')): # creating JSON object - d_ = yaml_parser.YAMLFileReference(d).to_json( - institution='hms-dbmi', - project='cgap-core' + d_ = yaml_parser.YAMLReferenceFile(d).to_json( + submission_centers=["hms-dbmi"], + consortia=["cgap-core"] ) # check assert d_ == res[i] @@ -50,9 +54,9 @@ def test_file_reference_error(): for i, d in enumerate(yaml_parser.load_yaml('tests/repo_error/portal_objects/file_reference.yaml')): try: # creating JSON object - d_ = yaml_parser.YAMLFileReference(d).to_json( - institution='hms-dbmi', - project='cgap-core' + d_ = yaml_parser.YAMLReferenceFile(d).to_json( + submission_centers=["hms-dbmi"], + consortia=["cgap-core"] ) except yaml_parser.ValidationError as e: pass diff --git a/tests/test_yaml_metaworkflow.py b/tests/test_yaml_metaworkflow.py index e390579..02f7bf5 100644 --- a/tests/test_yaml_metaworkflow.py +++ b/tests/test_yaml_metaworkflow.py @@ -16,6 +16,7 @@ def test_metaworkflow(): { "aliases": ["cgap-core:MetaWorkflow-gatk-HC-GT-pipeline_v1.0.0"], "description": 
"Pipeline to run gatk-HC and gatk-GT to call and genotype variants", + "category": ["Alignment", "Format Conversion"], "input": [ { "argument_name": "input_vcf", @@ -25,7 +26,7 @@ def test_metaworkflow(): { "argument_name": "reference", "argument_type": "file", - "files": [{"file": "cgap-core:FileReference-reference_genome_hg38"}] + "files": [{"file": "cgap-core:ReferenceFile-reference_genome_hg38"}] }, { "argument_name": "samples", @@ -34,9 +35,9 @@ def test_metaworkflow(): "value_type": "json" } ], - "institution": "/institutions/hms-dbmi/", + "submission_centers": ["hms-dbmi", "smaht-dbmi"], "name": "gatk-HC-GT-pipeline", - "project": "/projects/cgap-core/", + "consortia": ["cgap-core"], "title": "gatk-HC and gatk-GT pipeline [v1.0.0]", "version": "v1.0.0", "workflows": [ @@ -48,10 +49,11 @@ def test_metaworkflow(): "custom_pf_fields": { "HC_vcf": { "description": "output from gatk-HC", - "file_type": "hc-vcf", "linkto_location": [ "SampleProcessing" - ] + ], + "data_category": ["Sequencing Reads"], + "data_type": ["Unaligned Reads"] } }, "input": [ @@ -86,9 +88,10 @@ def test_metaworkflow(): "custom_pf_fields": { "GT_vcf": { "description": "output from gatk-GT", - "file_type": "GT-vcf", "higlass_file": True, - "variant_type": "SNV" + "variant_type": ["SNV"], + "data_category": ["Sequencing Reads"], + "data_type": ["Unaligned Reads"] } }, "input": [ @@ -115,7 +118,8 @@ def test_metaworkflow(): }, { "accession": "GAPFIXRDPDK1", - "aliases": ["cgap-core:MetaWorkflow-gatk-HC-pipeline_v1.0.0"], + "category": ["Variant Calling"], + "aliases": ["cgap-core_cgap-test:MetaWorkflow-gatk-HC-pipeline_v1.0.0"], "description": "Pipeline to run gatk-HC to call variants", "input": [ { @@ -125,8 +129,8 @@ def test_metaworkflow(): { "argument_name": "reference", "argument_type": "file", - "files": [{"dimension": "0", "file": "cgap-core:FileReference-reference_genome_hg38"}, - {"dimension": "1", "file": "cgap-core:FileReference-reference_bam_hg38"}] + "files": [{"dimension": "0", "file": "cgap-core_cgap-test:ReferenceFile-reference_genome_hg38"}, + {"dimension": "1", "file": "cgap-core_cgap-test:ReferenceFile-reference_bam_hg38"}] }, { "argument_name": "samples", @@ -134,9 +138,9 @@ def test_metaworkflow(): "value_type": "json" } ], - "institution": "/institutions/hms-dbmi/", + "submission_centers": ["hms-dbmi"], "name": "gatk-HC-pipeline", - "project": "/projects/cgap-core/", + "consortia": ["cgap-test", "cgap-core"], "title": "gatk-HC-pipeline [v1.0.0]", "uuid": "1936f246-22e1-45dc-bb5c-9cfd55537fe9", "version": "v1.0.0", @@ -148,7 +152,9 @@ def test_metaworkflow(): }, "custom_pf_fields": { "HC_vcf": { - "file_type": "hc-vcf" + "description": "hc-vcf", + "data_category": ["Sequencing Reads"], + "data_type": ["Unaligned Reads"] } }, "input": [ @@ -168,7 +174,7 @@ def test_metaworkflow(): } ], "name": "gatk-HC", - "workflow": "cgap-core:Workflow-gatk-HC_v1.0.0" + "workflow": "cgap-core_cgap-test:Workflow-gatk-HC_v1.0.0" } ] } @@ -176,8 +182,8 @@ def test_metaworkflow(): for d in yaml_parser.load_yaml('tests/repo_correct/portal_objects/metaworkflows/A_gatk-HC-GT.yaml'): d_ = yaml_parser.YAMLMetaWorkflow(d).to_json( - institution='hms-dbmi', - project='cgap-core', + submission_centers=["hms-dbmi", "smaht-dbmi"], + consortia=["cgap-core"], version='v1.0.0' ) # check @@ -185,13 +191,133 @@ def test_metaworkflow(): for d in yaml_parser.load_yaml('tests/repo_correct/portal_objects/metaworkflows/B_minimal-gatk-HC-GT.yaml'): d_ = yaml_parser.YAMLMetaWorkflow(d).to_json( - institution='hms-dbmi', - 
project='cgap-core', + submission_centers=["hms-dbmi"], + consortia=["cgap-test", "cgap-core"], version='v1.0.0' ) # check assert d_ == res[1] +def test_qc_ruleset(): + """ + """ + res = { + "accession": "GAPFIXRDPDK1", + "category": ["Variant Calling"], + "aliases": ["cgap-core_cgap-test:MetaWorkflow-gatk-HC-pipeline_v1.0.0"], + "description": "Pipeline to run gatk-HC to call variants", + "input": [ + { + "argument_name": "input_vcf", + "argument_type": "file" + }, + { + "argument_name": "reference", + "argument_type": "file", + "files": [{"dimension": "0", "file": "cgap-core_cgap-test:ReferenceFile-reference_genome_hg38"}, + {"dimension": "1", "file": "cgap-core_cgap-test:ReferenceFile-reference_bam_hg38"}] + }, + { + "argument_name": "samples", + "argument_type": "parameter", + "value_type": "json" + }, + { + "argument_name": "qc_ruleset_name_1", + "argument_type": "parameter", + "value_type": "qc_ruleset", + "value": { + "qc_thresholds": [ + { + "id": "c1", + "metric": "coverage", + "operator": ">=", + "pass_target": 100.0, + "warn_target": 80.0, + "use_as_qc_flag": True + }, + { + "id": "c2", + "metric": "coverage", + "operator": "<=", + "pass_target": 200.0, + "warn_target": 180.0, + }, + { + "id": "c3", + "metric": "coverage", + "operator": ">", + "pass_target": 80.0, + "warn_target": 3.3 + }, + { + "id": "rl", + "metric": "read_length", + "operator": "==", + "pass_target": "PASS", + "warn_target": "NOT PASS", + "use_as_qc_flag": True + } + ], + "overall_quality_status_rule": "( {c1} and {c2} ) or not ( {c3} and {rl} )" + } + } + ], + "submission_centers": ["hms-dbmi"], + "name": "gatk-HC-pipeline", + "consortia": ["cgap-test", "cgap-core"], + "title": "gatk-HC-pipeline [v1.0.0]", + "uuid": "1936f246-22e1-45dc-bb5c-9cfd55537fe9", + "version": "v1.0.0", + "workflows": [ + { + "config": { + "ebs_size": "2x", + "ec2_type": "m.5xlarge" + }, + "custom_pf_fields": { + "HC_vcf": { + "description": "hc-vcf", + "data_category": ["Sequencing Reads"], + "data_type": ["Unaligned Reads"] + } + }, + "input": [ + { + "argument_name": "vcf", + "argument_type": "file", + "source_argument_name": "input_vcf" + }, + { + "argument_name": "reference", + "argument_type": "file" + }, + { + "argument_name": "samples", + "argument_type": "parameter", + "value_type": "json" + }, + { + 'argument_name': 'qc_ruleset', + 'argument_type': 'parameter', + 'value_type': 'qc_ruleset', + 'source_argument_name': 'qc_ruleset_name_1' + } + ], + "name": "gatk-HC", + "workflow": "cgap-core_cgap-test:Workflow-gatk-HC_v1.0.0" + } + ] + } + + for d in yaml_parser.load_yaml('tests/repo_correct/portal_objects/metaworkflows/QC_test.yaml'): + d_ = yaml_parser.YAMLMetaWorkflow(d).to_json( + submission_centers=["hms-dbmi"], + consortia=["cgap-test", "cgap-core"], + version='v1.0.0' + ) + # check + assert d_ == res def test_metaworkflow_error(): """ @@ -201,8 +327,8 @@ def test_metaworkflow_error(): for d in yaml_parser.load_yaml(fn): try: d_ = yaml_parser.YAMLMetaWorkflow(d).to_json( - institution='hms-dbmi', - project='cgap-core', + submission_centers=["hms-dbmi"], + consortia=["cgap-core"], version='v1.0.0' ) except yaml_parser.ValidationError as e: diff --git a/tests/test_yaml_software.py b/tests/test_yaml_software.py index acedf61..f1b5cf0 100644 --- a/tests/test_yaml_software.py +++ b/tests/test_yaml_software.py @@ -15,30 +15,32 @@ def test_software(): { "aliases": ["cgap-core:Software-gatk_4.1.2"], "description": "gatk software package", - "institution": "/institutions/hms-dbmi/", + "submission_centers": ["hms-dbmi"], "name": 
"gatk", - "project": "/projects/cgap-core/", + "consortia": ["cgap-core"], "source_url": "http:/broad", "title": "gatk 4.1.2", - "version": "4.1.2" + "version": "4.1.2", + "category": ["Aligner"] }, { "accession": "GAPMKF1LL29K", "aliases": ["cgap-core:Software-picard_324ePT"], "commit": "324ePT", - "institution": "/institutions/hms-dbmi/", + "submission_centers": ["hms-dbmi"], "name": "picard", - "project": "/projects/cgap-core/", + "consortia": ["cgap-core"], "title": "picard [324ePT]", - "uuid": "efdac7ec-7da3-4f23-9056-7a04abbc5e8b" + "uuid": "efdac7ec-7da3-4f23-9056-7a04abbc5e8b", + "category": ["Variant Caller"] } ] for i, d in enumerate(yaml_parser.load_yaml('tests/repo_correct/portal_objects/software.yaml')): # creating JSON object d_ = yaml_parser.YAMLSoftware(d).to_json( - institution='hms-dbmi', - project='cgap-core' + submission_centers=["hms-dbmi"], + consortia=["cgap-core"] ) # check assert d_ == res[i] @@ -51,8 +53,8 @@ def test_software_error(): try: # creating JSON object d_ = yaml_parser.YAMLSoftware(d).to_json( - institution='hms-dbmi', - project='cgap-core' + submission_centers=["hms-dbmi"], + consortia=["cgap-core"] ) except yaml_parser.ValidationError as e: pass diff --git a/tests/test_yaml_workflow.py b/tests/test_yaml_workflow.py index fcb79d5..6c09c1a 100644 --- a/tests/test_yaml_workflow.py +++ b/tests/test_yaml_workflow.py @@ -15,8 +15,9 @@ def test_workflow(): res = [ { "aliases": ["cgap-core:Workflow-gatk-HaplotypeCaller_v1.0.0"], - "app_name": "gatk-HaplotypeCaller", - "app_version": "v1.0.0", + "name": "gatk-HaplotypeCaller", + "version": "v1.0.0", + "category": ["Annotation"], "arguments": [ { "argument_format": "bam", @@ -37,37 +38,35 @@ def test_workflow(): }, { "argument_to_be_attached_to": "output_vcf", - "argument_type": "Output QC file", - "qc_html": False, + "argument_type": "Generic QC file", "qc_json": True, - "qc_table": False, - "qc_type": "quality_metric_vcfcheck", "qc_zipped": False, - "workflow_argument_name": "vcfcheck" + "workflow_argument_name": "vcfcheck", + "argument_format": "json" } ], "description": "Run HaplotypeCaller from gatk package", - "institution": "/institutions/hms-dbmi/", - "name": "gatk-HaplotypeCaller_v1.0.0", - "project": "/projects/cgap-core/", + "submission_centers": ["hms-dbmi"], + "consortia": ["cgap-core"], "software": [ "cgap-core:Software-gatk_4.2.1", "cgap-core:Software-vcf-tools_5A63Aa1" ], "title": "HaplotypeCaller plus integity-check [v1.0.0]", - "wdl_child_filenames": [ + "child_file_names": [ "gatk-HaplotypeCaller.wdl", "integrity-check.wdl" ], - "wdl_directory_url": "s3://BUCKETCWL/test_pipeline/v1.0.0", - "wdl_main_filename": "workflow_gatk-HaplotypeCaller-check.wdl", - "workflow_language": "wdl" + "directory_url": "s3://BUCKETCWL/test_pipeline/v1.0.0", + "main_file_name": "workflow_gatk-HaplotypeCaller-check.wdl", + "language": "WDL" }, { "accession": "GAPFIXRDPDK1", + "category": ["Feature Calling"], "aliases": ["cgap-core:Workflow-gatk-HaplotypeCaller_v1.0.0"], - "app_name": "gatk-HaplotypeCaller", - "app_version": "v1.0.0", + "name": "gatk-HaplotypeCaller", + "version": "v1.0.0", "arguments": [ { "argument_format": "bam", @@ -77,27 +76,24 @@ def test_workflow(): { "argument_format": "vcf", "argument_type": "Output processed file", - "secondary_file_formats": [], "workflow_argument_name": "output_vcf" } ], "description": "Run HaplotypeCaller from gatk package", - "institution": "/institutions/hms-dbmi/", - "name": "gatk-HaplotypeCaller_v1.0.0", - "project": "/projects/cgap-core/", - "software": [], + 
"submission_centers": ["hms-dbmi"], + "consortia": ["cgap-core"], "title": "gatk-HaplotypeCaller [v1.0.0]", - "cwl_child_filenames": [], - "cwl_directory_url_v1": "s3://BUCKETCWL/test_pipeline/v1.0.0", - "cwl_main_filename": "gatk-HaplotypeCaller-check.cwl", - "uuid": "1936f246-22e1-45dc-bb5c-9cfd55537fe9" + "directory_url": "s3://BUCKETCWL/test_pipeline/v1.0.0", + "main_file_name": "gatk-HaplotypeCaller-check.cwl", + "uuid": "1936f246-22e1-45dc-bb5c-9cfd55537fe9", + "language": "CWL" } ] for d in yaml_parser.load_yaml('tests/repo_correct/portal_objects/workflows/A_gatk-HC.yaml'): d_ = yaml_parser.YAMLWorkflow(d).to_json( - institution='hms-dbmi', - project='cgap-core', + submission_centers=["hms-dbmi"], + consortia=["cgap-core"], version='v1.0.0', wflbucket_url='s3://BUCKETCWL/test_pipeline/v1.0.0' ) @@ -106,8 +102,8 @@ def test_workflow(): for d in yaml_parser.load_yaml('tests/repo_correct/portal_objects/workflows/B_minimal-gatk-HC.yaml'): d_ = yaml_parser.YAMLWorkflow(d).to_json( - institution='hms-dbmi', - project='cgap-core', + submission_centers=["hms-dbmi"], + consortia=["cgap-core"], version='v1.0.0', wflbucket_url='s3://BUCKETCWL/test_pipeline/v1.0.0' ) @@ -123,8 +119,8 @@ def test_workflow_error(): for d in yaml_parser.load_yaml(fn): try: d_ = yaml_parser.YAMLWorkflow(d).to_json( - institution='hms-dbmi', - project='cgap-core', + submission_centers=["hms-dbmi"], + consortia=["cgap-core"], version='v1.0.0', wflbucket_url='s3://BUCKETCWL/test_pipeline/v1.0.0' )