smaht-dac · sarahgonicholson · Dec 20, 2024 · Oct 16, 2024 · Oct 16, 2024 · Oct 16, 2024
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -7,6 +7,13 @@ smaht-portal
 Change Log
 ----------
 
+0.122.0
+=======
+`PR 277 DSA Release <https://github.com/smaht-dac/smaht-portal/pull/277>`_
+* In `commands/release_file.py` and `commands/create_annotated_filenames.py`:
+  * Assay and sequencer codes are set to `XX` for DSA fasta files and chain files
+  * For Supplementary Files, use the `haplotype`, `target_assembly`, and `source_assembly` properties to create annotated filenames for chain and fasta files
+
 0.121.0
 =======
 `PR 300 SN Remove basecalling <https://github.com/smaht-dac/smaht-portal/pull/300>`

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "encoded"
-version = "0.121.0"
+version = "0.122.0"
 description = "SMaHT Data Analysis Portal"
 authors = ["4DN-DCIC Team <[email protected]>"]
 license = "MIT"

diff --git a/src/encoded/commands/create_annotated_filenames.py b/src/encoded/commands/create_annotated_filenames.py
@@ -22,6 +22,8 @@
     supplementary_file as supp_file_utils,
     tissue as tissue_utils,
     tissue_sample as tissue_sample_utils,
+    donor_specific_assembly as dsa_utils,
+    reference_genome as rg_utils
 )
 from encoded.item_utils.constants import file as file_constants
 from encoded.item_utils.utils import RequestHandler
@@ -32,6 +34,7 @@
 FILENAME_SEPARATOR = "-"
 ANALYSIS_INFO_SEPARATOR = "_"
 CHAIN_FILE_INFO_SEPARATOR = "To"
+DSA_INFO_VALUE = "DSA"
 
 RNA_DATA_CATEGORY = "RNA Quantification"
 GENE_DATA_TYPE = "Gene Expression"
@@ -98,6 +101,8 @@ class AssociatedItems:
     tissue_samples: List[Dict[str, Any]]
     tissues: List[Dict[str, Any]]
     donors: List[Dict[str, Any]]
+    target_assembly: Dict[str, Any]
+    source_assembly: Dict[str, Any]
 
 
 def get_associated_items(
@@ -116,6 +121,8 @@ def get_associated_items(
     reference_genome = get_reference_genome(file, request_handler)
     gene_annotations = get_gene_annotations(file, request_handler)
     donor_specific_assembly = get_donor_specific_assembly(file, request_handler)
+    target_assembly = get_target_assembly(file, request_handler)
+    source_assembly = get_source_assembly(file, request_handler)
     if donor_specific_assembly:
         file_sets=get_derived_from_file_sets(file, request_handler)
     else:
@@ -146,6 +153,8 @@ def get_associated_items(
         tissues=tissues,
         cell_lines=cell_lines,
         donors=donors,
+        target_assembly=target_assembly,
+        source_assembly=source_assembly
     )
 
 
@@ -215,6 +224,20 @@ def get_reference_genome(
     return get_item(file_utils.get_reference_genome(file), request_handler)
 
 
+def get_target_assembly(
+    file: Dict[str, Any], request_handler: RequestHandler
+) -> Union[None, Dict[str, Any]]:
+    """Get the target assembly item referenced by the file, if any."""
+    return get_item(supp_file_utils.get_target_assembly(file), request_handler)
+
+
+def get_source_assembly(
+    file: Dict[str, Any], request_handler: RequestHandler
+) -> Union[None, Dict[str, Any]]:
+    """Get the source assembly item referenced by the file, if any."""
+    return get_item(supp_file_utils.get_source_assembly(file), request_handler)
+
+
 def get_gene_annotations(
     file: Dict[str, Any], request_handler: RequestHandler
 ) -> Dict[str, Any]:
@@ -432,7 +455,9 @@ def get_annotated_filename(
         associated_items.donors, associated_items.sample_sources
     )
     sequencing_and_assay_codes = get_sequencing_and_assay_codes(
-        associated_items.sequencers, associated_items.assays
+        associated_items.file, 
+        associated_items.sequencers, 
+        associated_items.assays
     )
     sequencing_center_code = get_sequencing_center_code(
         associated_items.sequencing_center
@@ -444,7 +469,10 @@ def get_annotated_filename(
         associated_items.software,
         associated_items.reference_genome,
         associated_items.gene_annotations,
-        associated_items.file_format
+        associated_items.file_format,
+        associated_items.target_assembly,
+        associated_items.source_assembly,
+        associated_items.donor_specific_assembly,
     )
     errors = collect_errors(
         project_id,
@@ -767,14 +795,20 @@ def get_sex_abbreviation(sex: str) -> str:
 
 
 def get_sequencing_and_assay_codes(
+    file: Dict[str, Any],
     sequencers: List[Dict[str], Any],
     assays: List[Dict[str], Any],
 ) -> FilenamePart:
-    """Get sequencing and assay codes for file."""
+    """Get sequencing and assay codes for file.
+
+    Returns XX for Genome Assembly and Reference Conversion files.
+    """
     sequencing_codes = get_sequencing_codes(sequencers)
     assay_codes = get_assay_codes(assays)
     if len(sequencing_codes) == 1 and len(assay_codes) == 1:
         return get_filename_part(value=f"{sequencing_codes[0]}{assay_codes[0]}")
+    elif supp_file_utils.is_genome_assembly(file) or supp_file_utils.is_reference_conversion(file):
+        return get_filename_part(value="XX")
     errors = []
     if not sequencing_codes:
         errors.append("No sequencing code found")
@@ -819,6 +853,9 @@ def get_analysis(
     reference_genome: Dict[str, Any],
     gene_annotations: Dict[str, Any],
     file_extension: Dict[str, Any],
+    target_assembly: Dict[str, Any],
+    source_assembly: Dict[str, Any],
+    donor_specific_assembly: Dict[str, Any],
 ) -> FilenamePart:
     """Get analysis info for file.
 
@@ -829,19 +866,23 @@ def get_analysis(
     reference_genome_code = item_utils.get_code(reference_genome)
     gene_annotation_code = get_annotations_and_versions(gene_annotations)
     transcript_info_code = get_rna_seq_tsv_value(file, file_extension)
+    haplotype_code = get_haplotype_value(file, file_extension, donor_specific_assembly)
+    chain_code = get_chain_file_value(file, target_assembly, source_assembly, file_extension)
     value = get_analysis_value(
         software_and_versions,
         reference_genome_code,
         gene_annotation_code,
-        transcript_info_code
+        transcript_info_code,
+        chain_code,
+        haplotype_code
     )
-    if file_format_utils.is_chain_file(file_extension):
-        value = f"{value}{ANALYSIS_INFO_SEPARATOR}{get_chain_file_value(file)}"
     errors = get_analysis_errors(
         file,
         reference_genome_code,
         gene_annotation_code,
         transcript_info_code,
+        chain_code,
+        haplotype_code,
         file_extension,
     )
     if errors:
@@ -858,6 +899,8 @@ def get_analysis_errors(
     reference_genome_code: str,
     gene_annotation_code: str,
     transcript_info_code:  str,
+    chain_code: str,
+    haplotype_code: str,
     file_extension: Dict[str, Any]
 ) -> List[str]:
     """Get analysis errors for file by file type."""
@@ -876,19 +919,24 @@ def get_analysis_errors(
             errors.append("No gene annotation code found")
         elif file_format_utils.is_tsv_file(file_extension) and not transcript_info_code:
             errors.append("No gene or isoform code found")
+    if file_format_utils.is_chain_file(file_extension):
+        if not chain_code:
+            errors.append("No target or source assembly found for chain conversion ")
     return errors
 
 
 def get_analysis_value(
     software_and_versions: str,
     reference_genome_code: str,
     gene_annotation_code: str,
-    transcript_info_code: str
+    transcript_info_code: str,
+    chain_code: str,
+    haplotype_code: str,
 ) -> str:
     """Get analysis value for filename."""
     to_write = [
         string
-        for string in [software_and_versions, reference_genome_code, gene_annotation_code, transcript_info_code]
+        for string in [software_and_versions, reference_genome_code, gene_annotation_code, transcript_info_code, chain_code, haplotype_code]
         if string
     ]
     return ANALYSIS_INFO_SEPARATOR.join(to_write)
@@ -951,8 +999,7 @@ def get_annotation_codes_missing_versions(
 def get_software_and_versions(software: List[Dict[str, Any]]) -> str:
     """Get software and accompanying versions for file.
 
-    Currently only looking for software items with codes, as these are
-    expected to be the software used for naming.
+    Currently looking for software items with codes, as these are expected to be the software used for naming.
     """
     software_with_codes = get_software_with_codes(software)
     if not software_with_codes:
@@ -979,7 +1026,9 @@ def get_software_with_versions(
     return [item for item in software_items if item_utils.get_version(item)]
 
 
-def get_software_and_versions_string(software_items: List[Dict[str, Any]]) -> str:
+def get_software_and_versions_string(
+        software_items: List[Dict[str, Any]]
+    ) -> str:
     """Get string representation of software and versions."""
     sorted_software_items = sorted(software_items, key=item_utils.get_code)
     return ANALYSIS_INFO_SEPARATOR.join(
@@ -1002,11 +1051,36 @@ def get_software_codes_missing_versions(
     ]
 
 
-def get_chain_file_value(file: Dict[str, Any]) -> str:
-    """Get reference conversion direction for chain files."""
-    target_assembly=supp_file_utils.get_target_assembly(file)
-    source_assembly=supp_file_utils.get_source_assembly(file)
-    return CHAIN_FILE_INFO_SEPARATOR.join([source_assembly,target_assembly])
+def get_chain_file_value(
+        file: Dict[str, Any],
+        target_assembly: Dict[str, Any],
+        source_assembly: Dict[str, Any],
+        file_extension: Dict[str, Any]
+    ) -> str:
+    """Get conversion direction for chain files as <source>To<target> (DSA items use DSA_INFO_VALUE); empty string if unavailable."""
+    if file_format_utils.is_chain_file(file_extension):
+        target_value = ""
+        source_value = ""
+        if target_assembly and source_assembly:
+            target_value = DSA_INFO_VALUE if dsa_utils.is_donor_specific_assembly(target_assembly) else item_utils.get_code(target_assembly)
+            source_value = DSA_INFO_VALUE if dsa_utils.is_donor_specific_assembly(source_assembly) else item_utils.get_code(source_assembly)
+        if target_value and source_value:
+            return CHAIN_FILE_INFO_SEPARATOR.join([source_value,target_value])
+    return ""
+
+
+def get_haplotype_value(
+        file: Dict[str, Any],
+        file_extension: Dict[str, Any],
+        donor_specific_assembly: Dict[str, Any]
+    ) -> str:
+    """Get haplotype value for fasta files, falling back to DSA_INFO_VALUE for DSA files; empty string otherwise."""
+    if file_format_utils.is_fasta_file(file_extension):
+        if (haplotype := supp_file_utils.get_haplotype(file)):
+            return haplotype
+        elif donor_specific_assembly:
+            return DSA_INFO_VALUE
+    return ""
 
 
 def get_rna_seq_tsv_value(file: Dict[str, Any], file_extension: Dict[str, Any]) -> str:

diff --git a/src/encoded/commands/release_file.py b/src/encoded/commands/release_file.py
@@ -24,6 +24,7 @@
     sample_source as sample_source_utils,
     submitted_file as submitted_file_utils,
     tissue as tissue_utils,
+    supplementary_file as supp_file_utils
 )
 from encoded.item_utils.constants import (
     file as file_constants,
@@ -198,12 +199,15 @@ def get_file_sets_from_file(self) -> List[dict]:
         )
         mwfrs = ff_utils.search_metadata(search_filter, key=self.key)
         if len(mwfrs) != 1:
-            self.print_error_and_exit(
-                (
-                    f"Expected exactly one associated MetaWorkflowRun, got"
-                    f" {len(mwfrs)}: {search_filter}"
+            if not supp_file_utils.is_genome_assembly(self.file) and not supp_file_utils.is_reference_conversion(self.file):
+                self.print_error_and_exit(
+                    (
+                        f"Expected exactly one associated MetaWorkflowRun, got"
+                        f" {len(mwfrs)}: {search_filter}"
+                    )
                 )
-            )
+            else:
+                return []
         mwfr = mwfrs[0]
         file_sets = meta_workflow_run_utils.get_file_sets(mwfr)
         # Might need to be more general in the future
@@ -238,6 +242,7 @@ def prepare(
         self.add_release_items_to_patchdict(
             self.quality_metrics_zips, "Compressed QC metrics file"
         )
+
         self.add_release_items_to_patchdict(self.file_sets, "FileSet")
         self.add_release_items_to_patchdict(self.sequencings, "Sequencing")
         self.add_release_items_to_patchdict(self.libraries, "Library")
@@ -354,17 +359,16 @@ def add_file_patchdict(self, dataset: str) -> None:
             item_utils.get_accession(file_set) for file_set in self.file_sets
         ]
         annotated_filename_info = self.get_annotated_filename_info()
-
         # Add file to file set and set status to released
         patch_body = {
             item_constants.UUID: item_utils.get_uuid(self.file),
             item_constants.STATUS: item_constants.STATUS_RELEASED,
-            file_constants.FILE_SETS: file_set_accessions,
             file_constants.DATASET: dataset,
             file_constants.ACCESS_STATUS: access_status,
             file_constants.ANNOTATED_FILENAME: annotated_filename_info.filename,
         }
-
+        if file_set_accessions:
+            patch_body[file_constants.FILE_SETS] = file_set_accessions
         # Take the extra files from the annotated filename object if available.
         # They will have the correct filenames
         if annotated_filename_info.patch_dict:
@@ -454,6 +458,12 @@ def get_access_status(self, dataset: str) -> str:
                 file_constants.DATA_CATEGORY_SOMATIC_VARIANT_CALLS: (
                     file_constants.ACCESS_STATUS_OPEN
                 ),
+                file_constants.DATA_CATEGORY_GENOME_ASSEMBLY: (
+                    file_constants.ACCESS_STATUS_OPEN
+                ),
+                file_constants.DATA_CATEGORY_GENOME_CONVERSION: (
+                    file_constants.ACCESS_STATUS_OPEN
+                ),
                 file_constants.DATA_CATEGORY_RNA_QUANTIFICATION: (
                     file_constants.ACCESS_STATUS_OPEN
                 )
@@ -468,6 +478,12 @@ def get_access_status(self, dataset: str) -> str:
                 file_constants.DATA_CATEGORY_SOMATIC_VARIANT_CALLS: (
                     file_constants.ACCESS_STATUS_PROTECTED
                 ),
+                file_constants.DATA_CATEGORY_GENOME_ASSEMBLY: (
+                    file_constants.ACCESS_STATUS_PROTECTED
+                ),
+                file_constants.DATA_CATEGORY_GENOME_CONVERSION: (
+                    file_constants.ACCESS_STATUS_PROTECTED
+                ),
                 file_constants.DATA_CATEGORY_RNA_QUANTIFICATION: (
                     file_constants.ACCESS_STATUS_OPEN
                 )
@@ -482,6 +498,12 @@ def get_access_status(self, dataset: str) -> str:
                 file_constants.DATA_CATEGORY_SOMATIC_VARIANT_CALLS: (
                     file_constants.ACCESS_STATUS_OPEN
                 ),
+                file_constants.DATA_CATEGORY_GENOME_ASSEMBLY: (
+                    file_constants.ACCESS_STATUS_PROTECTED
+                ),
+                file_constants.DATA_CATEGORY_GENOME_CONVERSION: (
+                    file_constants.ACCESS_STATUS_PROTECTED
+                ),
                 file_constants.DATA_CATEGORY_RNA_QUANTIFICATION: (
                     file_constants.ACCESS_STATUS_OPEN
                 )

diff --git a/src/encoded/item_utils/constants/file.py b/src/encoded/item_utils/constants/file.py
@@ -6,6 +6,8 @@
 DATA_CATEGORY_GERMLINE_VARIANT_CALLS = "Germline Variant Calls"
 DATA_CATEGORY_SEQUENCING_READS = "Sequencing Reads"
 DATA_CATEGORY_SOMATIC_VARIANT_CALLS = "Somatic Variant Calls"
+DATA_CATEGORY_GENOME_CONVERSION = "Reference Conversion"
+DATA_CATEGORY_GENOME_ASSEMBLY = "Genome Assembly"
 DATA_CATEGORY_RNA_QUANTIFICATION = "RNA Quantification"
 DATASET = "dataset"
 EXTRA_FILES = "extra_files"

diff --git a/src/encoded/item_utils/donor_specific_assembly.py b/src/encoded/item_utils/donor_specific_assembly.py
@@ -3,9 +3,15 @@
 
 from .utils import RequestHandler, get_property_value_from_identifier, get_property_values_from_identifiers
 from . import (
-    file as file_utils
+    file as file_utils,
+    item as item_utils
 )
 
+def is_donor_specific_assembly(properties: Dict[str, Any]) -> bool:
+    """Check if the item's type is DonorSpecificAssembly."""
+    return item_utils.get_type(properties) == "DonorSpecificAssembly"
+
+
 def get_file_format_id(request_handler: RequestHandler,identifier: str):
     """Return identifier of file_format for file."""
     return get_property_value_from_identifier(request_handler,identifier,file_utils.get_file_format)

diff --git a/src/encoded/item_utils/file_format.py b/src/encoded/item_utils/file_format.py
@@ -13,5 +13,9 @@ def is_chain_file(properties: Dict[str, Any]) -> bool:
     return get_standard_file_extension(properties) in ["chain.gz","chain"]
 
 
+def is_fasta_file(properties: Dict[str, Any]) -> bool:  # True when the standard file extension is "fa" or "fasta"
+    return get_standard_file_extension(properties) in ["fa","fasta"]
+
+
 def is_tsv_file(properties: Dict[str, Any]) -> bool:
     return get_standard_file_extension(properties) == "tsv"