diff --git a/CHANGELOG.rst b/CHANGELOG.rst index aadc032df..e2a273ee4 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,13 @@ smaht-portal Change Log ---------- +0.122.0 +======= +`PR 277 DSA Release `_ +* In `commands/release_file.py` and `commands/create_annotated_filenames.py`: + * Assay and sequencer code values are set to XX for DSA fasta files and chain files + * For Supplementary Files, use `haplotype`, `target_assembly`, and `source_assembly` properties to create annotated filenames for chain and fasta files + 0.121.0 ======= `PR 300 SN Remove basecalling ` diff --git a/pyproject.toml b/pyproject.toml index f876be138..6c7abd793 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "encoded" -version = "0.121.0" +version = "0.122.0" description = "SMaHT Data Analysis Portal" authors = ["4DN-DCIC Team "] license = "MIT" diff --git a/src/encoded/commands/create_annotated_filenames.py b/src/encoded/commands/create_annotated_filenames.py index 9ecbfb005..c1cca47d5 100644 --- a/src/encoded/commands/create_annotated_filenames.py +++ b/src/encoded/commands/create_annotated_filenames.py @@ -22,6 +22,8 @@ supplementary_file as supp_file_utils, tissue as tissue_utils, tissue_sample as tissue_sample_utils, + donor_specific_assembly as dsa_utils, + reference_genome as rg_utils ) from encoded.item_utils.constants import file as file_constants from encoded.item_utils.utils import RequestHandler @@ -32,6 +34,7 @@ FILENAME_SEPARATOR = "-" ANALYSIS_INFO_SEPARATOR = "_" CHAIN_FILE_INFO_SEPARATOR = "To" +DSA_INFO_VALUE = "DSA" RNA_DATA_CATEGORY = "RNA Quantification" GENE_DATA_TYPE = "Gene Expression" @@ -98,6 +101,8 @@ class AssociatedItems: tissue_samples: List[Dict[str, Any]] tissues: List[Dict[str, Any]] donors: List[Dict[str, Any]] + target_assembly: Dict[str, Any] + source_assembly: Dict[str, Any] def get_associated_items( @@ -116,6 +121,8 @@ def get_associated_items( reference_genome = get_reference_genome(file, request_handler) 
gene_annotations = get_gene_annotations(file, request_handler) donor_specific_assembly = get_donor_specific_assembly(file, request_handler) + target_assembly = get_target_assembly(file, request_handler) + source_assembly = get_source_assembly(file, request_handler) if donor_specific_assembly: file_sets=get_derived_from_file_sets(file, request_handler) else: @@ -146,6 +153,8 @@ def get_associated_items( tissues=tissues, cell_lines=cell_lines, donors=donors, + target_assembly=target_assembly, + source_assembly=source_assembly ) @@ -215,6 +224,20 @@ def get_reference_genome( return get_item(file_utils.get_reference_genome(file), request_handler) +def get_target_assembly( + file: Dict[str, Any], request_handler: RequestHandler +) -> Union[None, Dict[str, Any]]: + """Get target assembly for file.""" + return get_item(supp_file_utils.get_target_assembly(file), request_handler) + + +def get_source_assembly( + file: Dict[str, Any], request_handler: RequestHandler +) -> Union[None, Dict[str, Any]]: + """Get source assembly for file.""" + return get_item(supp_file_utils.get_source_assembly(file), request_handler) + + def get_gene_annotations( file: Dict[str, Any], request_handler: RequestHandler ) -> Dict[str, Any]: @@ -432,7 +455,9 @@ def get_annotated_filename( associated_items.donors, associated_items.sample_sources ) sequencing_and_assay_codes = get_sequencing_and_assay_codes( - associated_items.sequencers, associated_items.assays + associated_items.file, + associated_items.sequencers, + associated_items.assays ) sequencing_center_code = get_sequencing_center_code( associated_items.sequencing_center @@ -444,7 +469,10 @@ def get_annotated_filename( associated_items.software, associated_items.reference_genome, associated_items.gene_annotations, - associated_items.file_format + associated_items.file_format, + associated_items.target_assembly, + associated_items.source_assembly, + associated_items.donor_specific_assembly, ) errors = collect_errors( project_id, @@ -767,14 
+795,20 @@ def get_sex_abbreviation(sex: str) -> str: def get_sequencing_and_assay_codes( + file: Dict[str, Any], sequencers: List[Dict[str], Any], assays: List[Dict[str], Any], ) -> FilenamePart: - """Get sequencing and assay codes for file.""" + """Get sequencing and assay codes for file. + + Falls back to XX for Genome Assembly and Reference Conversion files lacking exactly one sequencer code and one assay code. + """ sequencing_codes = get_sequencing_codes(sequencers) assay_codes = get_assay_codes(assays) if len(sequencing_codes) == 1 and len(assay_codes) == 1: return get_filename_part(value=f"{sequencing_codes[0]}{assay_codes[0]}") + elif supp_file_utils.is_genome_assembly(file) or supp_file_utils.is_reference_conversion(file): + return get_filename_part(value="XX") errors = [] if not sequencing_codes: errors.append("No sequencing code found") @@ -819,6 +853,9 @@ def get_analysis( reference_genome: Dict[str, Any], gene_annotations: Dict[str, Any], file_extension: Dict[str, Any], + target_assembly: Dict[str, Any], + source_assembly: Dict[str, Any], + donor_specific_assembly: Dict[str, Any], ) -> FilenamePart: """Get analysis info for file. 
@@ -829,19 +866,23 @@ def get_analysis( reference_genome_code = item_utils.get_code(reference_genome) gene_annotation_code = get_annotations_and_versions(gene_annotations) transcript_info_code = get_rna_seq_tsv_value(file, file_extension) + haplotype_code = get_haplotype_value(file, file_extension, donor_specific_assembly) + chain_code = get_chain_file_value(file, target_assembly, source_assembly, file_extension) value = get_analysis_value( software_and_versions, reference_genome_code, gene_annotation_code, - transcript_info_code + transcript_info_code, + chain_code, + haplotype_code ) - if file_format_utils.is_chain_file(file_extension): - value = f"{value}{ANALYSIS_INFO_SEPARATOR}{get_chain_file_value(file)}" errors = get_analysis_errors( file, reference_genome_code, gene_annotation_code, transcript_info_code, + chain_code, + haplotype_code, file_extension, ) if errors: @@ -858,6 +899,8 @@ def get_analysis_errors( reference_genome_code: str, gene_annotation_code: str, transcript_info_code: str, + chain_code: str, + haplotype_code: str, file_extension: Dict[str, Any] ) -> List[str]: """Get analysis errors for file by file type.""" @@ -876,6 +919,9 @@ def get_analysis_errors( errors.append("No gene annotation code found") elif file_format_utils.is_tsv_file(file_extension) and not transcript_info_code: errors.append("No gene or isoform code found") + if file_format_utils.is_chain_file(file_extension): + if not chain_code: + errors.append("No target or source assembly found for chain conversion ") return errors @@ -883,12 +929,14 @@ def get_analysis_value( software_and_versions: str, reference_genome_code: str, gene_annotation_code: str, - transcript_info_code: str + transcript_info_code: str, + chain_code: str, + haplotype_code: str, ) -> str: """Get analysis value for filename.""" to_write = [ string - for string in [software_and_versions, reference_genome_code, gene_annotation_code, transcript_info_code] + for string in [software_and_versions, 
reference_genome_code, gene_annotation_code, transcript_info_code, chain_code, haplotype_code] if string ] return ANALYSIS_INFO_SEPARATOR.join(to_write) @@ -951,8 +999,7 @@ def get_annotation_codes_missing_versions( def get_software_and_versions(software: List[Dict[str, Any]]) -> str: """Get software and accompanying versions for file. - Currently only looking for software items with codes, as these are - expected to be the software used for naming. + Currently looking for software items with codes, as these are expected to be the software used for naming. """ software_with_codes = get_software_with_codes(software) if not software_with_codes: @@ -979,7 +1026,9 @@ def get_software_with_versions( return [item for item in software_items if item_utils.get_version(item)] -def get_software_and_versions_string(software_items: List[Dict[str, Any]]) -> str: +def get_software_and_versions_string( + software_items: List[Dict[str, Any]] + ) -> str: """Get string representation of software and versions.""" sorted_software_items = sorted(software_items, key=item_utils.get_code) return ANALYSIS_INFO_SEPARATOR.join( @@ -1002,11 +1051,36 @@ def get_software_codes_missing_versions( ] -def get_chain_file_value(file: Dict[str, Any]) -> str: - """Get reference conversion direction for chain files.""" - target_assembly=supp_file_utils.get_target_assembly(file) - source_assembly=supp_file_utils.get_source_assembly(file) - return CHAIN_FILE_INFO_SEPARATOR.join([source_assembly,target_assembly]) +def get_chain_file_value( + file: Dict[str, Any], + target_assembly: Dict[str, Any], + source_assembly: Dict[str, Any], + file_extension: Dict[str, Any] + ) -> str: + """Get genome conversion direction for chain files.""" + if file_format_utils.is_chain_file(file_extension): + target_value = "" + source_value = "" + if target_assembly and source_assembly: + target_value = DSA_INFO_VALUE if dsa_utils.is_donor_specific_assembly(target_assembly) else item_utils.get_code(target_assembly) + 
source_value = DSA_INFO_VALUE if dsa_utils.is_donor_specific_assembly(source_assembly) else item_utils.get_code(source_assembly) + if target_value and source_value: + return CHAIN_FILE_INFO_SEPARATOR.join([source_value,target_value]) + return "" + + +def get_haplotype_value( + file: Dict[str, Any], + file_extension: Dict[str, Any], + donor_specific_assembly: Dict[str, Any] + ): + """Get haplotype value for fasta file.""" + if file_format_utils.is_fasta_file(file_extension): + if (haplotype := supp_file_utils.get_haplotype(file)): + return haplotype + elif donor_specific_assembly: + return DSA_INFO_VALUE + return "" def get_rna_seq_tsv_value(file: Dict[str, Any], file_extension: Dict[str, Any]) -> str: diff --git a/src/encoded/commands/release_file.py b/src/encoded/commands/release_file.py index 0ceebf4ea..7b9aec3a7 100644 --- a/src/encoded/commands/release_file.py +++ b/src/encoded/commands/release_file.py @@ -24,6 +24,7 @@ sample_source as sample_source_utils, submitted_file as submitted_file_utils, tissue as tissue_utils, + supplementary_file as supp_file_utils ) from encoded.item_utils.constants import ( file as file_constants, @@ -198,12 +199,15 @@ def get_file_sets_from_file(self) -> List[dict]: ) mwfrs = ff_utils.search_metadata(search_filter, key=self.key) if len(mwfrs) != 1: - self.print_error_and_exit( - ( - f"Expected exactly one associated MetaWorkflowRun, got" - f" {len(mwfrs)}: {search_filter}" + if not supp_file_utils.is_genome_assembly(self.file) and not supp_file_utils.is_reference_conversion(self.file): + self.print_error_and_exit( + ( + f"Expected exactly one associated MetaWorkflowRun, got" + f" {len(mwfrs)}: {search_filter}" + ) ) - ) + else: + return [] mwfr = mwfrs[0] file_sets = meta_workflow_run_utils.get_file_sets(mwfr) # Might need to be more general in the future @@ -238,6 +242,7 @@ def prepare( self.add_release_items_to_patchdict( self.quality_metrics_zips, "Compressed QC metrics file" ) + 
self.add_release_items_to_patchdict(self.file_sets, "FileSet") self.add_release_items_to_patchdict(self.sequencings, "Sequencing") self.add_release_items_to_patchdict(self.libraries, "Library") @@ -354,17 +359,16 @@ def add_file_patchdict(self, dataset: str) -> None: item_utils.get_accession(file_set) for file_set in self.file_sets ] annotated_filename_info = self.get_annotated_filename_info() - # Add file to file set and set status to released patch_body = { item_constants.UUID: item_utils.get_uuid(self.file), item_constants.STATUS: item_constants.STATUS_RELEASED, - file_constants.FILE_SETS: file_set_accessions, file_constants.DATASET: dataset, file_constants.ACCESS_STATUS: access_status, file_constants.ANNOTATED_FILENAME: annotated_filename_info.filename, } - + if file_set_accessions: + patch_body[file_constants.FILE_SETS] = file_set_accessions # Take the extra files from the annotated filename object if available. # They will have the correct filenames if annotated_filename_info.patch_dict: @@ -454,6 +458,12 @@ def get_access_status(self, dataset: str) -> str: file_constants.DATA_CATEGORY_SOMATIC_VARIANT_CALLS: ( file_constants.ACCESS_STATUS_OPEN ), + file_constants.DATA_CATEGORY_GENOME_ASSEMBLY: ( + file_constants.ACCESS_STATUS_OPEN + ), + file_constants.DATA_CATEGORY_GENOME_CONVERSION: ( + file_constants.ACCESS_STATUS_OPEN + ), file_constants.DATA_CATEGORY_RNA_QUANTIFICATION: ( file_constants.ACCESS_STATUS_OPEN ) @@ -468,6 +478,12 @@ def get_access_status(self, dataset: str) -> str: file_constants.DATA_CATEGORY_SOMATIC_VARIANT_CALLS: ( file_constants.ACCESS_STATUS_PROTECTED ), + file_constants.DATA_CATEGORY_GENOME_ASSEMBLY: ( + file_constants.ACCESS_STATUS_PROTECTED + ), + file_constants.DATA_CATEGORY_GENOME_CONVERSION: ( + file_constants.ACCESS_STATUS_PROTECTED + ), file_constants.DATA_CATEGORY_RNA_QUANTIFICATION: ( file_constants.ACCESS_STATUS_OPEN ) @@ -482,6 +498,12 @@ def get_access_status(self, dataset: str) -> str: 
file_constants.DATA_CATEGORY_SOMATIC_VARIANT_CALLS: ( file_constants.ACCESS_STATUS_OPEN ), + file_constants.DATA_CATEGORY_GENOME_ASSEMBLY: ( + file_constants.ACCESS_STATUS_PROTECTED + ), + file_constants.DATA_CATEGORY_GENOME_CONVERSION: ( + file_constants.ACCESS_STATUS_PROTECTED + ), file_constants.DATA_CATEGORY_RNA_QUANTIFICATION: ( file_constants.ACCESS_STATUS_OPEN ) diff --git a/src/encoded/item_utils/constants/file.py b/src/encoded/item_utils/constants/file.py index 054199cb7..a0599150b 100644 --- a/src/encoded/item_utils/constants/file.py +++ b/src/encoded/item_utils/constants/file.py @@ -6,6 +6,8 @@ DATA_CATEGORY_GERMLINE_VARIANT_CALLS = "Germline Variant Calls" DATA_CATEGORY_SEQUENCING_READS = "Sequencing Reads" DATA_CATEGORY_SOMATIC_VARIANT_CALLS = "Somatic Variant Calls" +DATA_CATEGORY_GENOME_CONVERSION = "Reference Conversion" +DATA_CATEGORY_GENOME_ASSEMBLY = "Genome Assembly" DATA_CATEGORY_RNA_QUANTIFICATION = "RNA Quantification" DATASET = "dataset" EXTRA_FILES = "extra_files" diff --git a/src/encoded/item_utils/donor_specific_assembly.py b/src/encoded/item_utils/donor_specific_assembly.py index 3a714fafc..e07f68938 100644 --- a/src/encoded/item_utils/donor_specific_assembly.py +++ b/src/encoded/item_utils/donor_specific_assembly.py @@ -3,9 +3,15 @@ from .utils import RequestHandler, get_property_value_from_identifier, get_property_values_from_identifiers from . 
import ( - file as file_utils + file as file_utils, + item as item_utils ) +def is_donor_specific_assembly(properties: Dict[str, Any]) -> bool: + """Check if item is a donor specific assembly.""" + return item_utils.get_type(properties) == "DonorSpecificAssembly" + + def get_file_format_id(request_handler: RequestHandler,identifier: str): """Return identifier of file_format for file.""" return get_property_value_from_identifier(request_handler,identifier,file_utils.get_file_format) diff --git a/src/encoded/item_utils/file_format.py b/src/encoded/item_utils/file_format.py index 8583c6180..c57e1fe10 100644 --- a/src/encoded/item_utils/file_format.py +++ b/src/encoded/item_utils/file_format.py @@ -13,5 +13,9 @@ def is_chain_file(properties: Dict[str, Any]) -> bool: return get_standard_file_extension(properties) in ["chain.gz","chain"] +def is_fasta_file(properties: Dict[str, Any]) -> bool: + return get_standard_file_extension(properties) in ["fa","fasta"] + + def is_tsv_file(properties: Dict[str, Any]) -> bool: return get_standard_file_extension(properties) == "tsv" diff --git a/src/encoded/item_utils/reference_genome.py b/src/encoded/item_utils/reference_genome.py index bf1573df7..730d46d88 100644 --- a/src/encoded/item_utils/reference_genome.py +++ b/src/encoded/item_utils/reference_genome.py @@ -1,6 +1,7 @@ from typing import Any, Dict from . 
import item as item_utils -def is_dsa(properties: Dict[str, Any]) -> bool: + +def is_reference_genome(properties: Dict[str, Any]) -> bool: - """Check if item is a DonorSpecificAssembly.""" + """Check if item is a ReferenceGenome.""" - return item_utils.get_type(properties) == "DonorSpecificAssembly" \ No newline at end of file + return item_utils.get_type(properties) == "ReferenceGenome" \ No newline at end of file diff --git a/src/encoded/item_utils/supplementary_file.py b/src/encoded/item_utils/supplementary_file.py index 6f83a8a97..83dce7427 100644 --- a/src/encoded/item_utils/supplementary_file.py +++ b/src/encoded/item_utils/supplementary_file.py @@ -21,6 +21,16 @@ def is_chain_file(properties: Dict[str, Any],request_handler: RequestHandler): return file_utils.get_file_extension(properties,request_handler) == "chain.gz" +def is_genome_assembly(properties: Dict[str, Any]): + """Check if data category is Genome Assembly""" + return "Genome Assembly" in file_utils.get_data_category(properties) + + +def is_reference_conversion(properties: Dict[str, Any]): + """Check if data category is Reference Conversion""" + return "Reference Conversion" in file_utils.get_data_category(properties) + + def get_donor_specific_assembly(properties: Dict[str, Any]) -> Union[str, Dict[str, Any]]: """Get donor-specific assembly from properties.""" return properties.get("donor_specific_assembly", "") @@ -40,6 +50,11 @@ def get_reference_genome(properties: Dict[str, Any]) -> Union[str, Dict[str, Any return properties.get("reference_genome", "") +def get_haplotype(properties: Dict[str, Any]) -> Union[str, Dict[str, Any], None]: + """Get haplotype from properties.""" + return properties.get("haplotype", "") + + def get_derived_from( properties: Dict[str, Any], request_handler: Optional[RequestHandler] = None ) -> List[Union[str, Dict[str, Any]]]: @@ -64,16 +79,3 @@ def get_derived_from_file_sets( file_utils.get_file_sets, ) return properties.get("file_sets", []) - - -def get_dsa_software( - properties: Dict[str, Any], request_handler: 
Optional[RequestHandler] = None -) -> List[Union[str, Dict[str, Any]]]: - """Get software from donor-specific assembly associated with file.""" - if request_handler: - return get_property_value_from_identifier( - request_handler, - get_donor_specific_assembly(properties), - dsa_utils.get_software, - ) - return [] \ No newline at end of file diff --git a/src/encoded/schemas/donor_specific_assembly.json b/src/encoded/schemas/donor_specific_assembly.json index f88249b62..4c1f508e6 100644 --- a/src/encoded/schemas/donor_specific_assembly.json +++ b/src/encoded/schemas/donor_specific_assembly.json @@ -23,6 +23,9 @@ { "$ref": "mixins.json#/attribution" }, + { + "$ref": "mixins.json#/code" + }, { "$ref": "mixins.json#/description" }, diff --git a/src/encoded/schemas/file.json b/src/encoded/schemas/file.json index bc1216437..9a62d20da 100644 --- a/src/encoded/schemas/file.json +++ b/src/encoded/schemas/file.json @@ -130,6 +130,7 @@ "Genome Region", "Germline Variant Calls", "Quality Control", + "Genome Assembly", "Reference Genome", "Reference Conversion", "Reference Transcriptome", @@ -154,6 +155,7 @@ "In Silico Generated", "Index", "Reference Sequence", + "DSA", "Chain File", "Sequence Interval", "Statistics", diff --git a/src/encoded/schemas/supplementary_file.json b/src/encoded/schemas/supplementary_file.json index a8bce5edf..f5c37b6c3 100644 --- a/src/encoded/schemas/supplementary_file.json +++ b/src/encoded/schemas/supplementary_file.json @@ -94,8 +94,10 @@ "description": "Haplotype name for the file (for fasta files).", "type": "string", "suggested_enum": [ - "Hap1", - "Hap2" + "hapX", + "hapY", + "hapX1", + "hapX2" ] }, "target_assembly": { diff --git a/src/encoded/tests/data/workbook-inserts/donor_specific_assembly.json b/src/encoded/tests/data/workbook-inserts/donor_specific_assembly.json index f0844259e..c4c2a8c6a 100644 --- a/src/encoded/tests/data/workbook-inserts/donor_specific_assembly.json +++ 
b/src/encoded/tests/data/workbook-inserts/donor_specific_assembly.json @@ -3,6 +3,7 @@ "uuid": "12a3228f-f46c-4fbf-a19c-d233bf7d20d8", "submitted_id": "TEST_DONOR-SPECIFIC-ASSEMBLY_HELA", "title": "Hela_DSA", + "code": "Hela_DSA", "derived_from": [ "TEST_UNALIGNED-READS_HELA-FASTQ-R2" ], diff --git a/src/encoded/tests/data/workbook-inserts/supplementary_file.json b/src/encoded/tests/data/workbook-inserts/supplementary_file.json index cf3456e6f..be794dc4f 100644 --- a/src/encoded/tests/data/workbook-inserts/supplementary_file.json +++ b/src/encoded/tests/data/workbook-inserts/supplementary_file.json @@ -26,10 +26,10 @@ "uuid": "f1074e68-5c24-462b-909d-472356e5a15f", "submitted_id": "TEST_SUPPLEMENTARY-FILE_HELA_FASTA", "data_category": [ - "Reference Genome" + "Genome Assembly" ], "data_type": [ - "Reference Sequence" + "DSA" ], "filename": "test_hela.fasta", "file_format": "FASTA", diff --git a/src/encoded/tests/test_annotated_filename.py b/src/encoded/tests/test_annotated_filename.py index 398c12845..2fbc32ea3 100644 --- a/src/encoded/tests/test_annotated_filename.py +++ b/src/encoded/tests/test_annotated_filename.py @@ -468,6 +468,8 @@ def test_get_donor_sex_and_age_parts( assert_filename_part_matches(result, expected, errors) +SOME_FILE = {"data_category": ["Aligned Reads"]} +REFERENCE_FILE = {"data_category": ["Genome Assembly"]} SEQUENCER_CODE = "A" SOME_SEQUENCER = {"code": SEQUENCER_CODE} ANOTHER_SEQUENCER = {"code": "B"} @@ -477,25 +479,27 @@ def test_get_donor_sex_and_age_parts( @pytest.mark.parametrize( - "sequencers,assays,expected,errors", + "file,sequencers,assays,expected,errors", [ - ([], [], "", True), - ([SOME_SEQUENCER], [], "", True), - ([], [SOME_ASSAY], "", True), - ([SOME_SEQUENCER], [SOME_ASSAY], f"{SEQUENCER_CODE}{ASSAY_CODE}", False), - ([SOME_SEQUENCER, ANOTHER_SEQUENCER], [SOME_ASSAY], "", True), - ([SOME_SEQUENCER], [SOME_ASSAY, ANOTHER_ASSAY], "", True), - ([SOME_SEQUENCER, SOME_ITEM], [SOME_ASSAY], "", True), + (SOME_FILE,[], [], "", 
True), + (SOME_FILE,[SOME_SEQUENCER], [], "", True), + (SOME_FILE,[], [SOME_ASSAY], "", True), + (SOME_FILE,[SOME_SEQUENCER], [SOME_ASSAY], f"{SEQUENCER_CODE}{ASSAY_CODE}", False), + (SOME_FILE,[SOME_SEQUENCER, ANOTHER_SEQUENCER], [SOME_ASSAY], "", True), + (REFERENCE_FILE,[SOME_SEQUENCER, ANOTHER_SEQUENCER], [SOME_ASSAY, ANOTHER_ASSAY], "XX", False), + (SOME_FILE,[SOME_SEQUENCER], [SOME_ASSAY, ANOTHER_ASSAY], "", True), + (SOME_FILE,[SOME_SEQUENCER, SOME_ITEM], [SOME_ASSAY], "", True), ], ) def test_get_sequencing_and_assay_codes( + file: Dict[str, Any], sequencers: List[Dict[str, Any]], assays: List[Dict[str, Any]], expected: str, errors: bool, ) -> None: """Test sequencing and assay codes retrieval for annotated filenames.""" - result = get_sequencing_and_assay_codes(sequencers, assays) + result = get_sequencing_and_assay_codes(file, sequencers, assays) assert_filename_part_matches(result, expected, errors) @@ -526,7 +530,10 @@ def test_get_sequencing_center_code( ANOTHER_SOFTWARE_VERSION = "2.3.4" ANOTHER_SOFTWARE = {"code": ANOTHER_SOFTWARE_CODE, "version": ANOTHER_SOFTWARE_VERSION} REFERENCE_GENOME_CODE = "GRCh38" -TARGET_GENOME_CODE = "HELA_DSA" +DSA_CODE = "Hela_DSA" +DSA_VALUE = "DSA" +HAPLOTYPE_CODE = "hapX" + GENE_ANNOTATION_CODE = "gencode" GENE_ANNOTATION_VERSION = "v45" SOME_REFERENCE_GENOME = {"code": REFERENCE_GENOME_CODE} @@ -534,10 +541,31 @@ def test_get_sequencing_center_code( SOME_UNALIGNED_READS = {"data_type": ["Unaligned Reads"]} SOME_ALIGNED_READS = {"data_type": ["Aligned Reads"]} RNA_ALIGNED_READS = {"data_type": ["Aligned Reads"], "data_category": ["RNA Quantification"]} + +SOME_TARGET_ASSEMBLY = { + "@type": ["ReferenceGenome"], + "code": REFERENCE_GENOME_CODE +} +SOME_SOURCE_ASSEMBLY = { + "@type": ["DonorSpecificAssembly"], + "code": DSA_CODE +} SOME_CHAIN_FILE = { - "data_type": ["SupplementaryFile"], - "source_assembly": REFERENCE_GENOME_CODE, - "target_assembly": TARGET_GENOME_CODE + "data_category": ["Reference Conversion"], + 
"data_type": ["Chain File"], + "source_assembly": DSA_CODE, + "target_assembly": REFERENCE_GENOME_CODE +} +SOME_FASTA_FILE = { + "data_type": ["DSA"], + "data_category": ["Genome Assembly"], + "donor_specific_assembly": "Some_DSA", + "haplotype": HAPLOTYPE_CODE +} + +ANOTHER_FASTA_FILE = { + "data_category": ["Genome Assembly"], + "data_type": ["Reference Sequence"], } SOME_TSV_FILE = { "data_type": ["Gene Expression"], @@ -570,6 +598,11 @@ def test_get_sequencing_center_code( "standard_file_extension": "chain.gz", "valid_item_types": ["SupplementaryFile"] } +FASTA_FILE_EXTENSION = { + "identifier": "FASTA", + "standard_file_extension": "fa", + "valid_item_types": ["SupplementaryFile"] +} TSV_FILE_EXTENSION = { "identifier": "TSV", "standard_file_extension": "tsv", @@ -578,38 +611,58 @@ def test_get_sequencing_center_code( @pytest.mark.parametrize( - "file,software,reference_genome,annotation,file_extension,expected,errors", + "file,software,reference_genome,annotation,file_extension,target_assembly,source_assembly,dsa,expected,errors", [ - ({}, [], {}, {}, {},"" , True), - (SOME_UNALIGNED_READS, [], {}, {}, SOME_FILE_EXTENSION,DEFAULT_ABSENT_FIELD, False), + ({}, [], {}, {}, {}, {}, {}, {}, "" , True), + (SOME_UNALIGNED_READS, [], {}, {}, SOME_FILE_EXTENSION, {}, {}, {}, DEFAULT_ABSENT_FIELD, False), ( SOME_UNALIGNED_READS, [SOME_SOFTWARE], {}, {}, SOME_FILE_EXTENSION, + {}, + {}, + {}, f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}", False, ), - (SOME_UNALIGNED_READS, [SOME_SOFTWARE], SOME_REFERENCE_GENOME, {}, SOME_FILE_EXTENSION, "", True), - (SOME_ALIGNED_READS, [], {}, {}, {},"", True), - (SOME_ALIGNED_READS, [SOME_SOFTWARE], {}, {}, SOME_FILE_EXTENSION, "", True), + (SOME_UNALIGNED_READS, [SOME_SOFTWARE], SOME_REFERENCE_GENOME, {}, SOME_FILE_EXTENSION, {}, {}, {}, "", True), + (SOME_ALIGNED_READS, [], {}, {}, {}, {}, {}, {}, "", True), + (SOME_ALIGNED_READS, [SOME_SOFTWARE], {}, {}, SOME_FILE_EXTENSION, {}, {}, {}, "", True), ( SOME_ALIGNED_READS, [SOME_SOFTWARE], 
SOME_REFERENCE_GENOME, {}, SOME_FILE_EXTENSION, + {}, + {}, + {}, f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}", False, ), - (SOME_SOMATIC_VARIANT_CALLS, [SOME_SOFTWARE], SOME_REFERENCE_GENOME, {}, VCF_FILE_EXTENSION, f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}", False), + ( + SOME_SOMATIC_VARIANT_CALLS, + [SOME_SOFTWARE], + SOME_REFERENCE_GENOME, + {}, + VCF_FILE_EXTENSION, + {}, + {}, + {}, + f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}", + False + ), ( SOME_VARIANT_CALLS, [SOME_SOFTWARE], SOME_REFERENCE_GENOME, {}, VCF_FILE_EXTENSION, + {}, + {}, + {}, f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}", False, ), @@ -619,6 +672,9 @@ def test_get_sequencing_center_code( SOME_REFERENCE_GENOME, {}, SOME_FILE_EXTENSION, + {}, + {}, + {}, f"{ANOTHER_SOFTWARE_CODE}_{ANOTHER_SOFTWARE_VERSION}_{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}", False, ), @@ -628,6 +684,9 @@ def test_get_sequencing_center_code( SOME_REFERENCE_GENOME, {}, SOME_FILE_EXTENSION, + {}, + {}, + {}, f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}", False, ), @@ -637,7 +696,46 @@ def test_get_sequencing_center_code( {}, {}, CHAIN_FILE_EXTENSION, - f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}To{TARGET_GENOME_CODE}", + SOME_TARGET_ASSEMBLY, + SOME_SOURCE_ASSEMBLY, + SOME_SOURCE_ASSEMBLY, + f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{DSA_VALUE}To{REFERENCE_GENOME_CODE}", + False, + ), + ( + SOME_CHAIN_FILE, + [SOME_SOFTWARE, SOME_ITEM], + {}, + {}, + CHAIN_FILE_EXTENSION, + {}, + {}, + SOME_SOURCE_ASSEMBLY, + "", + True, + ), + ( + SOME_FASTA_FILE, + [SOME_SOFTWARE, SOME_ITEM], + {}, + {}, + FASTA_FILE_EXTENSION, + {}, + {}, + SOME_SOURCE_ASSEMBLY, + f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{HAPLOTYPE_CODE}", + False, + ), + ( + ANOTHER_FASTA_FILE, + [SOME_SOFTWARE, SOME_ITEM], + {}, + {}, + FASTA_FILE_EXTENSION, + {}, + {}, + {}, + f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}", False, ), ( @@ -646,8 +744,11 @@ def 
test_get_sequencing_center_code( SOME_REFERENCE_GENOME, SOME_GENE_ANNOTATION, TSV_FILE_EXTENSION, + {}, + {}, + {}, f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}_{GENE_ANNOTATION_CODE}_{GENE_ANNOTATION_VERSION}_gene", - False + False, ), ( SOME_ISOFORM_TSV_FILE, @@ -655,6 +756,9 @@ def test_get_sequencing_center_code( SOME_REFERENCE_GENOME, SOME_GENE_ANNOTATION, TSV_FILE_EXTENSION, + {}, + {}, + {}, f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}_{GENE_ANNOTATION_CODE}_{GENE_ANNOTATION_VERSION}_isoform", False ), @@ -664,6 +768,9 @@ def test_get_sequencing_center_code( SOME_REFERENCE_GENOME, SOME_GENE_ANNOTATION, TSV_FILE_EXTENSION, + {}, + {}, + {}, "", True ), @@ -673,6 +780,9 @@ def test_get_sequencing_center_code( SOME_REFERENCE_GENOME, SOME_GENE_ANNOTATION, SOME_FILE_EXTENSION, + {}, + {}, + {}, f"{SOFTWARE_CODE}_{SOFTWARE_VERSION}_{REFERENCE_GENOME_CODE}_{GENE_ANNOTATION_CODE}_{GENE_ANNOTATION_VERSION}", False ), @@ -682,6 +792,9 @@ def test_get_sequencing_center_code( SOME_REFERENCE_GENOME, {}, SOME_FILE_EXTENSION, + {}, + {}, + {}, "", True ) @@ -693,11 +806,14 @@ def test_get_analysis( reference_genome: Dict[str, Any], annotation: Dict[str, Any], file_extension: Dict[str, Any], + target_assembly: Dict[str, Any], + source_assembly: Dict[str, Any], + dsa: Dict[str, Any], expected: str, errors: bool, ) -> None: """Test analysis info retrieval for annotated filenames.""" - result = get_analysis(file, software, reference_genome, annotation, file_extension) + result = get_analysis(file, software, reference_genome, annotation, file_extension, target_assembly, source_assembly, dsa) assert_filename_part_matches(result, expected, errors) @@ -719,7 +835,8 @@ def test_get_analysis( ], ) def test_get_software_and_versions( - software: List[Dict[str, Any]], expected: str + software: List[Dict[str, Any]], + expected: str ) -> None: """Test software names and versions retrieval.""" result = get_software_and_versions(software) diff --git 
a/src/encoded/tests/test_file_release.py b/src/encoded/tests/test_file_release.py index 49ec907fa..3c762e023 100644 --- a/src/encoded/tests/test_file_release.py +++ b/src/encoded/tests/test_file_release.py @@ -6,7 +6,11 @@ from .utils import get_search from ..commands.release_file import FileRelease -from ..item_utils import file as file_utils, item as item_utils +from ..item_utils import ( + file as file_utils, + item as item_utils, + supplementary_file as supp_file_utils +) from ..item_utils.utils import RequestHandler @@ -35,7 +39,8 @@ def test_file_release(es_testapp: TestApp, workbook: None) -> None: identifier = item_utils.get_uuid(file) file_release = FileRelease({}, identifier) file_release.prepare(dataset) - assert file_release.file_sets + if not supp_file_utils.is_reference_conversion(file) and not supp_file_utils.is_genome_assembly(file): + assert file_release.file_sets assert file_release.libraries assert file_release.assays assert file_release.sequencings