From 493d6969aa187228619a7727641616a5ae560d38 Mon Sep 17 00:00:00 2001 From: Michele Berselli Date: Mon, 4 Dec 2023 16:20:23 -0500 Subject: [PATCH] . --- CHANGELOG.rst | 6 +- LOG.md | 8 - README.md | 2 +- docs/conf.py | 4 +- docs/deploy_pipeline.rst | 20 +-- docs/functions.rst | 47 +----- docs/index.rst | 8 +- docs/pipeline_utils.rst | 8 +- docs/yaml_file_format.rst | 5 +- docs/yaml_file_reference.rst | 17 +- docs/yaml_metaworkflow.rst | 19 ++- docs/yaml_software.rst | 16 +- docs/yaml_workflow.rst | 17 +- pipeline_utils/lib/check_lines.py | 247 ------------------------------ pyproject.toml | 2 +- 15 files changed, 71 insertions(+), 355 deletions(-) delete mode 100644 LOG.md delete mode 100644 pipeline_utils/lib/check_lines.py diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f46ecfc..531261c 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,8 +4,6 @@ Change Log ========== -3.0.0 +0.0.1 ===== -* 2023-10-10 -* Added this CHANGELOG.rst file. -* Upgrade to Python 3.11. +* Initial release diff --git a/LOG.md b/LOG.md deleted file mode 100644 index fbaf79c..0000000 --- a/LOG.md +++ /dev/null @@ -1,8 +0,0 @@ -### Version Updates - -#### v2.1.0 - * Added support for updated QCs, to enable the new generic schema ``quality_metric_generic`` - - -#### v2.0.0 - * Initial release after major changes to support the new YAML format for portal objects diff --git a/README.md b/README.md index f7fb627..a2aa59a 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,6 @@ To install from source: make update make build -To check that the software is correctly installed, try to run `pipeline_utils`. If installed from source, this command may fail with a bash “command not found” error, try `poetry run pipeline_utils` instead. +To check that the software is correctly installed, try to run `smaht_pipeline_utils`. If installed from source, this command may fail with a bash “command not found” error, try `poetry run smaht_pipeline_utils` instead. See `make info` for details on make targets. diff --git a/docs/conf.py b/docs/conf.py index d983ecb..56edc83 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -18,11 +18,11 @@ # -- Project information ----------------------------------------------------- project = 'smaht-pipeline-utils' -copyright = '2021, HMS DBMI' +copyright = '2023, HMS DBMI' author = 'Michele Berselli, SMaHT Team' # The full version, including alpha/beta/rc tags -release = '2.1.0' +release = '0.0.1' # -- General configuration --------------------------------------------------- diff --git a/docs/deploy_pipeline.rst b/docs/deploy_pipeline.rst index d110ce4..fbb1ef3 100644 --- a/docs/deploy_pipeline.rst +++ b/docs/deploy_pipeline.rst @@ -142,7 +142,7 @@ Example of a key-pair entry: } } -```` is the namespace for the environment and can be found in the portal health page (e.g., cgap-wolf). +```` is the namespace for the environment and can be found in the portal health page (e.g., smaht-wolf). .. _account_vars: @@ -154,21 +154,21 @@ Finally we need to setup the information to identify the target environment to u .. code-block:: bash # Set the namespace of the target environment - # e.g., cgap-wolf + # e.g., smaht-wolf export ENV_NAME= # Set the bucket used to store the worklow description files - # e.g., cgap-biotest-main-application-tibanna-cwls + # e.g., smaht-wolf-application-tibanna-cwls export WFL_BUCKET= # Set the path to the keypair file with the portal credential export KEYDICTS_JSON=~/.cgap-keys.json - # Set up project and institution - # Project and institution need to correspond to metadata present on the portal - # e.g., cgap-core and hms-dbmi - export PROJECT= - export INSTITUTION= + # Set up consortia and submission centers + # consortia and submission_centers need to correspond to metadata present on the portal + # e.g., ['smaht'] and ['smaht_dac'] + export CONSORTIA= + export SUBMISSION_CENTERS= # If running sentieon code, # specify the address for the server that validate the software license @@ -194,8 +194,8 @@ by the ``--repos`` argument. --wfl-bucket ${WFL_BUCKET} \ --account ${AWS_ACCOUNT_NUMBER} \ --region ${TIBANNA_AWS_REGION} \ - --project ${PROJECT} \ - --institution ${INSTITUTION} \ + --consortia ${CONSORTIA} \ + --submission-centers ${SUBMISSION_CENTERS} \ --sentieon-server ${SENTIEON_LICENSE} \ --post-software \ --post-file-format \ diff --git a/docs/functions.rst b/docs/functions.rst index 49378d8..078930f 100644 --- a/docs/functions.rst +++ b/docs/functions.rst @@ -2,49 +2,4 @@ Functions ========= -Collection of utilities available as functions: - - - :ref:`check_lines ` - -.. _check_lines: - -check_lines -+++++++++++ - -*check_lines* function can be used to check that line counts are matching between the output of two steps where lines should not be dropped (i.e., any steps that modify without filtering), or between an output ``bam`` and the input ``fastq`` files. -Requires uuid for the *MetaWorkflowRun* object to check and ff_key to access the metadata on the portal. The steps to compare are specified as dictionaries, examples below. - -.. code-block:: python - - from pipeline_utils.lib import check_lines - - result = check_lines.check_lines(metawfr_uuid, ff_key, steps=steps_dict, fastqs=fastqs_dict) - - # metawfr_uuid - # -> uuid for MetaWorkflowRun object - - # ff_key - # -> key to authenticate on the portal - - ## steps_dict example - # steps_dict = { - # 'workflow_add-readgroups-check': { - # 'dependency': 'workflow_bwa-mem_no_unzip-check', - # 'output': 'bam_w_readgroups', - # 'output_match': 'raw_bam', - # 'key': 'Total Reads', - # 'key_match': 'Total Reads' - # }, - # ... - # } - - ## fastqs_dict example - # fastqs_dict = { - # 'workflow_bwa-mem_no_unzip-check': { - # 'output': 'raw_bam', - # 'input_match': ['fastq_R1', 'fastq_R2'], - # 'key': 'Total Reads', - # 'key_match': 'Total Sequences' - # }, - # ... - # } +In development. diff --git a/docs/index.rst b/docs/index.rst index 65d81e4..2c47eed 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,8 +1,8 @@ -========================= -Portal Pipeline Utilities -========================= +=============================== +SMaHT Portal Pipeline Utilities +=============================== -Documentation for smaht-pipeline-utils_, a collection of utilities for deploying pipelines and interfacing with portal infrastructure. +Documentation for smaht-pipeline-utils_, a collection of utilities for deploying pipelines and interfacing with SMaHT portal infrastructure. .. _smaht-pipeline-utils: https://github.com/smaht-dac/smaht-pipeline-utils diff --git a/docs/pipeline_utils.rst b/docs/pipeline_utils.rst index 301bdc6..f087975 100644 --- a/docs/pipeline_utils.rst +++ b/docs/pipeline_utils.rst @@ -61,10 +61,10 @@ Usage: - AWS account to use for deployment * - *-\-region* - AWS account region to use for deployment - * - *-\-project* - - Project to use for deployment [cgap-core] - * - *-\-institution* - - Institution to use for deployment [hms-dbmi] + * - *-\-consortia* + - List of consortia to use for deployment [smaht] + * - *-\-submission-centers* + - List of centers to use for deployment [smaht_dac] * - *-\-post-software* - DEPLOY | UPDATE Software objects (.yaml or .yml) * - *-\-post-file-format* diff --git a/docs/yaml_file_format.rst b/docs/yaml_file_format.rst index 0f10970..531fe2d 100644 --- a/docs/yaml_file_format.rst +++ b/docs/yaml_file_format.rst @@ -22,10 +22,9 @@ Template # All the following fields are optional and provided as example, # can be expanded to anything accepted by the schema - # https://github.com/dbmi-bgm/cgap-portal/tree/master/src/encoded/schemas + # https://github.com/smaht-dac/smaht-portal/tree/main/src/encoded/schemas secondary_formats: - # bam, fastq, bwt, ... - status: # shared Fields Definition @@ -49,7 +48,7 @@ Description of the file format. Optional ^^^^^^^^ -All the following fields are optional and provided as example. Can be expanded to anything accepted by the schema, see `schemas `__. +All the following fields are optional and provided as example. Can be expanded to anything accepted by the schema, see `schemas `__. secondary_formats ----------------- diff --git a/docs/yaml_file_reference.rst b/docs/yaml_file_reference.rst index e30dd5c..ba961b3 100644 --- a/docs/yaml_file_reference.rst +++ b/docs/yaml_file_reference.rst @@ -21,9 +21,14 @@ Template format: # bam, fastq, bwt, ... version: + category: + - # Reference Genome, ... + type: + - # Reference Sequence, ... + # All the following fields are optional and provided as example, # can be expanded to anything accepted by the schema - # https://github.com/dbmi-bgm/cgap-portal/tree/master/src/encoded/schemas + # https://github.com/smaht-dac/smaht-portal/tree/main/src/encoded/schemas secondary_files: - # bam, fastq, bwt, ... status: # uploading, uploaded @@ -60,7 +65,7 @@ Version of the reference file. Optional ^^^^^^^^ -All the following fields are optional and provided as example. Can be expanded to anything accepted by the schema, see `schemas `__. +All the following fields are optional and provided as example. Can be expanded to anything accepted by the schema, see `schemas `__. secondary_files --------------- @@ -78,3 +83,11 @@ Most likely you don't want to set this field and just use the default logic auto license ------- License information. + +category +-------- +Categories for the reference file, see `schemas `__. + +type +---- +Types for the reference file, see `schemas `__. diff --git a/docs/yaml_metaworkflow.rst b/docs/yaml_metaworkflow.rst index ca43649..25e56de 100644 --- a/docs/yaml_metaworkflow.rst +++ b/docs/yaml_metaworkflow.rst @@ -20,6 +20,9 @@ Template name: description: + category: + - # Alignment, ... + ## General arguments ######################################## # Pipeline input, reference files, and general arguments # define all arguments for the pipeline here @@ -61,7 +64,7 @@ Template # Allows to force a fixed shards structure ignoring # the input structure, scatter and gather dimensions #################################### - shards: [[], ..] # e.g., [['0'], ['1'], ['2']] + shards: [[], ..] # e.g., [['0'], ['1'], ['2']] ## Lock version #################### # Specific version to use @@ -81,7 +84,7 @@ Template # File argument : - argument_type: file. # bam, fastq, bwt ... + argument_type: file. # bam, fastq, bwt ... # Linking fields # These are optional fields # Check https://magma-suite.readthedocs.io/en/latest/meta-workflow.html @@ -157,6 +160,10 @@ description ----------- Description of the pipeline. +category +-------- +Categories for the pipeline, see `schemas `__. + input ----- Description of general input files and parameters for the pipeline. See :ref:`Input Definition `. @@ -167,11 +174,7 @@ Description of workflows that are steps of the pipeline. See :ref:`Workflows Def Optional ^^^^^^^^ -All the following fields are optional and provided as example. Can be expanded to anything accepted by the schema, see `schemas `__. - -title ------ -Title of the pipeline. +All the following fields are optional and provided as example. Can be expanded to anything accepted by the schema, see `schemas `__. .. _workflows: @@ -210,7 +213,7 @@ output Description of expected output files for the workflow. Each output is defined by its name. Additional subfields can be specified. -See `schemas `__. +See `schemas `__. Each output name needs to match an output name that has been previously defined in the corresponding workflow, see :ref:`Workflow `. diff --git a/docs/yaml_software.rst b/docs/yaml_software.rst index ccfa378..58adf6d 100644 --- a/docs/yaml_software.rst +++ b/docs/yaml_software.rst @@ -22,10 +22,12 @@ Template version: commit: + category: + - # Quality Control + # All the following fields are optional and provided as example, # can be expanded to anything accepted by the schema - # https://github.com/dbmi-bgm/cgap-portal/tree/master/src/encoded/schemas - title: + # https://github.com/smaht-dac/smaht-portal/tree/main/src/encoded/schemas source_url: description: license: # MIT, GPLv3, ... @@ -51,13 +53,13 @@ commit ------ Commit of the software. +category +-------- +Categories for the software, see `schemas `__. + Optional ^^^^^^^^ -All the following fields are optional and provided as example. Can be expanded to anything accepted by the schema, see `schemas `__. - -title ------ -Title for the software. +All the following fields are optional and provided as example. Can be expanded to anything accepted by the schema, see `schemas `__. source_url ---------- diff --git a/docs/yaml_workflow.rst b/docs/yaml_workflow.rst index ba63bd8..6d3ecf4 100644 --- a/docs/yaml_workflow.rst +++ b/docs/yaml_workflow.rst @@ -26,11 +26,12 @@ Template child: - # .cwl or .wdl file + category: + - # Annotation + # All the following fields are optional and provided as example, # can be expanded to anything accepted by the schema - # https://github.com/dbmi-bgm/cgap-portal/tree/master/src/encoded/schemas - title: - + # https://github.com/smaht-dac/smaht-portal/tree/main/src/encoded/schemas software: - @ @@ -99,6 +100,10 @@ Several subfields need to be specified: At the moment we support two standards, `Common Workflow Language `__ (CWL) and `Workflow Description Language `__ (WDL). +category +-------- +Categories for the workflow, see `schemas `__. + input ----- Description of input files and parameters for the workflow. See :ref:`Input Definition `. @@ -109,11 +114,7 @@ Description of expected outputs for the workflow. See :ref:`Output Definition `__. - -title ------ -Title of the workflow. +All the following fields are optional and provided as example. Can be expanded to anything accepted by the schema, see `schemas `__. software -------- diff --git a/pipeline_utils/lib/check_lines.py b/pipeline_utils/lib/check_lines.py deleted file mode 100644 index 7d26d2e..0000000 --- a/pipeline_utils/lib/check_lines.py +++ /dev/null @@ -1,247 +0,0 @@ -#!/usr/bin/env python3 - -################################################ -# -# check_lines -# implement a function to check line counts -# match between specified pipeline steps output -# -# Michele Berselli -# berselli.michele@gmail.com -# -################################################ - -################################################ -# Libraries -################################################ -import sys, os -from magma_ff.metawflrun import MetaWorkflowRun -from magma_ff import wfrutils -from dcicutils import ff_utils - -################################################ -# Variables -################################################ -steps_dict = { - # BAM - 'workflow_add-readgroups-check': { - 'dependency': 'workflow_bwa-mem_no_unzip-check', - 'output': 'bam_w_readgroups', - 'output_match': 'raw_bam', - 'key': 'Total Reads', - 'key_match': 'Total Reads' - }, - 'workflow_merge-bam-check': { - 'dependency': 'workflow_add-readgroups-check', - 'output': 'merged_bam', - 'output_match': 'bam_w_readgroups', - 'key': 'Total Reads', - 'key_match': 'Total Reads' - }, - 'workflow_picard-MarkDuplicates-check':{ - 'dependency': 'workflow_merge-bam-check', - 'output': 'dupmarked_bam', - 'output_match': 'merged_bam', - 'key': 'Total Reads', - 'key_match': 'Total Reads' - }, - 'workflow_sort-bam-check': { - 'dependency': 'workflow_picard-MarkDuplicates-check', - 'output': 'sorted_bam', - 'output_match': 'dupmarked_bam', - 'key': 'Total Reads', - 'key_match': 'Total Reads' - }, - 'workflow_gatk-ApplyBQSR-check': { - 'dependency': 'workflow_sort-bam-check', - 'output': 'recalibrated_bam', - 'output_match': 'sorted_bam', - 'key': 'Total Reads', - 'key_match': 'Total Reads' - }, - # VCF - 'workflow_samplegeno': { - 'dependency': 'workflow_gatk-GenotypeGVCFs-check', - 'output': 'samplegeno_vcf', - 'output_match': 'vcf', - 'key': 'Filtered Variants', - 'key_match': 'Filtered Variants' - }, - # 'workflow_vep-annot-check': { - # 'dependency': 'workflow_samplegeno', - # 'output': 'annotated_vcf', - # 'output_match': 'samplegeno_vcf', - # 'key': 'Total Variants Called', - # 'key_match': 'Filtered Variants' - # }, - 'workflow_granite-comHet-check': { - 'dependency': 'workflow_granite-filtering-check', - 'output': 'comHet_vcf', - 'output_match': 'merged_vcf', - 'key': 'Filtered Variants', - 'key_match': 'Filtered Variants' - }, - 'workflow_dbSNP_ID_fixer-check': { - 'dependency': 'workflow_granite-comHet-check', - 'output': 'vcf', - 'output_match': 'comHet_vcf', - 'key': 'Filtered Variants', - 'key_match': 'Filtered Variants' - }, - 'workflow_hg19lo_hgvsg-check': { - 'dependency': 'workflow_dbSNP_ID_fixer-check', - 'output': 'vcf', - 'output_match': 'vcf', - 'key': 'Filtered Variants', - 'key_match': 'Filtered Variants' - } - } - -fastqs_dict = { - 'workflow_bwa-mem_no_unzip-check': { - 'output': 'raw_bam', - 'input_match': ['fastq_R1', 'fastq_R2'], - 'key': 'Total Reads', - 'key_match': 'Total Sequences' - } -} - -################################################ -# Functions -################################################ -################################################ -# check_lines -################################################ -def check_lines(metawfr_uuid, ff_key, steps=steps_dict, fastqs=fastqs_dict): - """ - """ - print('Meta Workflow:') - print(' -> ' + metawfr_uuid + '\n') - - # Get meta-workflow-run and create MetaWorkflowRun object - run_json = ff_utils.get_metadata(metawfr_uuid, add_on='?frame=raw&datastore=database', key=ff_key) - metawflrun_obj = MetaWorkflowRun(run_json) - - is_match = True - # Check fastqs - for _, run_obj in metawflrun_obj.runs.items(): - count, match_count = 0, 0 - if run_obj.name in fastqs: - if run_obj.status == 'completed': - # Get output count - for output in run_obj.output: - if output['argument_name'] == fastqs[run_obj.name]['output']: - output_uuid = output['file'] - qc_key = fastqs[run_obj.name]['key'] - count = int(get_count_qc(qc_key, output_uuid, ff_key)) - break - #end if - #end for - print('Shard:') - print(' -> ' + run_obj.shard_name + ', ' + str(count)) - - # Get input file to match from jobid - print('File/s to match:') - ffwr_obj = wfrutils.FFWfrUtils(env='env') - ffwr_obj._ff_key = ff_key - file_match = True - for file in ffwr_obj.wfr_metadata(run_obj.jobid)['input_files']: - if file['workflow_argument_name'] in fastqs[run_obj.name]['input_match']: - input_uuid = file['value']['uuid'] - qc_key = fastqs[run_obj.name]['key_match'] - match_count = int(get_count_fastqc(qc_key, input_uuid, ff_key)) - if not count == match_count: - is_match = False - file_match = False - #end if - print(' -> ' + file['workflow_argument_name'] + ', ' + str(match_count)) - #end if - #end for - print('Matching: ' + str(file_match) + '\n') - else: - print('Missing: ' + run_obj.name + '\n') - print('Completed: False\n') - return False - #end if - #end if - #end for - - # Check steps - for _, run_obj in metawflrun_obj.runs.items(): - count, total_count = 0, 0 - if run_obj.name in steps: - if run_obj.status == 'completed': - # Get output count - for output in run_obj.output: - if output['argument_name'] == steps[run_obj.name]['output']: - output_uuid = output['file'] - qc_key = steps[run_obj.name]['key'] - count = int(get_count_qc(qc_key, output_uuid, ff_key)) - break - #end if - #end for - print('Shard:') - print(' -> ' + run_obj.shard_name + ', ' + str(count)) - - # Get dependencies count - print('Shard/s to match (sum):') - for shard_name in run_obj.dependencies: - if shard_name.split(':')[0] == steps[run_obj.name]['dependency']: - run_obj_ = metawflrun_obj.runs[shard_name] - for output in run_obj_.output: - if output['argument_name'] == steps[run_obj.name]['output_match']: - output_uuid = output['file'] - qc_key = steps[run_obj.name]['key_match'] - count_ = int(get_count_qc(qc_key, output_uuid, ff_key)) - total_count += count_ - break - #end if - #end for - print(' -> ' + shard_name + ', ' + str(count_)) - #end if - #end for - print('Matching: ' + str(count == total_count) + '\n') - # Check counts match - if not count == total_count: - is_match = False - #end if - else: - print('Missing: ' + run_obj.name + '\n') - print('Completed: False\n') - return False - #end if - #end if - #end for - print('Completed: ' + str(is_match) + '\n') - return is_match -#end def - -def get_count_qc(qc_key, uuid, ff_key): - """ - """ - try: - res_json = ff_utils.get_metadata(uuid, add_on='?frame=raw&datastore=database', key=ff_key) - qc_uuid = res_json['quality_metric'] - qc_json = ff_utils.get_metadata(qc_uuid, add_on='?datastore=database', key=ff_key) - for qc in qc_json['quality_metric_summary']: - if qc['title'] == qc_key: - return qc['value'] - #end if - #end for - except KeyError: - return 0 - #end try -#end def - -def get_count_fastqc(qc_key, uuid, ff_key): - """ - """ - try: - res_json = ff_utils.get_metadata(uuid, add_on='?frame=raw&datastore=database', key=ff_key) - qc_uuid = res_json['quality_metric'] - qc_json = ff_utils.get_metadata(qc_uuid, add_on='?datastore=database', key=ff_key) - return qc_json[qc_key] - except KeyError: - return 0 - #end try -#end def diff --git a/pyproject.toml b/pyproject.toml index cad422b..56472c2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "smaht-pipeline-utils" -version = "3.0.0" +version = "0.0.1" description = "Utilities for deploying pipelines and interfacing with SMaHT portal infrastructure." authors = [ "Michele Berselli ",