smaht-dac · sarahgonicholson · Dec 20, 2024 · Oct 16, 2024 · Oct 16, 2024 · Oct 16, 2024
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -7,11 +7,16 @@ smaht-portal
 Change Log
 ----------
 
-`PR 266: Node v20 Upgrade <https://github.com/smaht-dac/smaht-portal/pull/266>`_
+0.105.1
+=======
+`PR 277: DSA Release <https://github.com/smaht-dac/smaht-portal/pull/277>`_
+* In `commands/release_file.py` and `commands/create_annotated_filenames.py`:
+  * Assay and sequencer codes are set to XX for DSA fasta files and chain files
+  * For Supplementary Files, use lowercased `title` instead of `code` for the software part of the annotated filename
 
 0.105.0
-=====
-
+=======
+`PR 266: Node v20 Upgrade <https://github.com/smaht-dac/smaht-portal/pull/266>`_
 * Node 18 to 20 upgrade including GitHub actions
 
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "encoded"
-version = "0.105.0"
+version = "0.105.1"
 description = "SMaHT Data Analysis Portal"
 authors = ["4DN-DCIC Team <[email protected]>"]
 license = "MIT"

diff --git a/src/encoded/commands/create_annotated_filenames.py b/src/encoded/commands/create_annotated_filenames.py
@@ -32,7 +32,7 @@
 FILENAME_SEPARATOR = "-"
 ANALYSIS_INFO_SEPARATOR = "_"
 CHAIN_FILE_INFO_SEPARATOR = "To"
-
+DSA_INFO_VALUE = "dsa"
 
 DEFAULT_PROJECT_ID = constants.PRODUCTION_PREFIX
 DEFAULT_ABSENT_FIELD = "X"
@@ -419,7 +419,9 @@ def get_annotated_filename(
         associated_items.donors, associated_items.sample_sources
     )
     sequencing_and_assay_codes = get_sequencing_and_assay_codes(
-        associated_items.sequencers, associated_items.assays
+        associated_items.file, 
+        associated_items.sequencers, 
+        associated_items.assays
     )
     sequencing_center_code = get_sequencing_center_code(
         associated_items.sequencing_center
@@ -750,14 +752,20 @@ def get_sex_abbreviation(sex: str) -> str:
 
 
 def get_sequencing_and_assay_codes(
+    file: Dict[str, Any],
     sequencers: List[Dict[str], Any],
     assays: List[Dict[str], Any],
 ) -> FilenamePart:
-    """Get sequencing and assay codes for file."""
+    """Get sequencing and assay codes for file.
+
+    Reference Genome/Conversion files lacking a single code pair get XX.
+    """
     sequencing_codes = get_sequencing_codes(sequencers)
     assay_codes = get_assay_codes(assays)
     if len(sequencing_codes) == 1 and len(assay_codes) == 1:
         return get_filename_part(value=f"{sequencing_codes[0]}{assay_codes[0]}")
+    elif supp_file_utils.is_reference_conversion(file) or supp_file_utils.is_reference_genome(file):
+        return get_filename_part(value="XX")
     errors = []
     if not sequencing_codes:
         errors.append("No sequencing code found")
@@ -807,7 +815,7 @@ def get_analysis(
     Some error handling here for missing data by file type, but not
     exhaustive and allowing for some flexibility in what is expected.
     """
-    software_and_versions = get_software_and_versions(software)
+    software_and_versions = get_software_and_versions(file, software)
     reference_genome_code = item_utils.get_code(reference_genome)
     errors = get_analysis_errors(file, reference_genome_code)
     if errors:
@@ -816,7 +824,9 @@ def get_analysis(
         software_and_versions, reference_genome_code
     )
     if file_format_utils.is_chain_file(file_extension):
-        value = f"{value}{ANALYSIS_INFO_SEPARATOR}{get_chain_file_value(file)}"
+        value = ANALYSIS_INFO_SEPARATOR.join([value,get_chain_file_value(file)]) if value else get_chain_file_value(file)
+    if file_format_utils.is_fasta_file(file_extension):  # append DSA tag; no separator if value is empty
+        value = ANALYSIS_INFO_SEPARATOR.join(filter(None, [value, DSA_INFO_VALUE]))
     if not value:
         if file_utils.is_unaligned_reads(file):  # Think this is the only case (?)
             return get_filename_part(value=DEFAULT_ABSENT_FIELD)
@@ -853,18 +863,20 @@ def get_analysis_value(
     return ANALYSIS_INFO_SEPARATOR.join(to_write)
 
 
-def get_software_and_versions(software: List[Dict[str, Any]]) -> str:
+def get_software_and_versions(file: Dict[str, Any], software: List[Dict[str, Any]]) -> str:
     """Get software and accompanying versions for file.
 
-    Currently only looking for software items with codes, as these are
-    expected to be the software used for naming.
+    Currently looking for software items with codes, as these are expected to be the software used for naming, with the exception of SupplementaryFile items, where lower case title is used.
     """
-    software_with_codes = get_software_with_codes(software)
+    if supp_file_utils.is_supplementary_file(file):
+        software_with_codes = get_software_with_title(software)
+    else:
+        software_with_codes = get_software_with_codes(software)
     if not software_with_codes:
         return ""
     software_with_codes_and_versions = get_software_with_versions(software_with_codes)
     if len(software_with_codes) == len(software_with_codes_and_versions):
-        return get_software_and_versions_string(software_with_codes_and_versions)
+        return get_software_and_versions_string(file, software_with_codes_and_versions)
     missing_versions = get_software_codes_missing_versions(software_with_codes)
     logger.warning(f"Missing versions for software items: {missing_versions}.")
     return ""
@@ -877,23 +889,43 @@ def get_software_with_codes(
     return [item for item in software_items if item_utils.get_code(item)]
 
 
+def get_software_with_title(
+    software_items: List[Dict[str, Any]]
+) -> List[Dict[str, Any]]:
+    """Get software items with title."""
+    return [item for item in software_items if item_utils.get_title(item)]
+
+
 def get_software_with_versions(
     software_items: List[Dict[str, Any]]
 ) -> List[Dict[str, Any]]:
     """Get software items with versions."""
     return [item for item in software_items if item_utils.get_version(item)]
 
 
-def get_software_and_versions_string(software_items: List[Dict[str, Any]]) -> str:
+def get_software_and_versions_string(
+        file: Dict[str, Any],
+        software_items: List[Dict[str, Any]]
+    ) -> str:
     """Get string representation of software and versions."""
-    sorted_software_items = sorted(software_items, key=item_utils.get_code)
-    return ANALYSIS_INFO_SEPARATOR.join(
-        [
-            f"{item_utils.get_code(item)}{ANALYSIS_INFO_SEPARATOR}"
-            f"{item_utils.get_version(item)}"
-            for item in sorted_software_items
-        ]
-    )
+    if supp_file_utils.is_supplementary_file(file):
+        sorted_software_items = sorted(software_items, key=item_utils.get_title)
+        return ANALYSIS_INFO_SEPARATOR.join(
+            [
+                f"{item_utils.get_title(item).lower()}{ANALYSIS_INFO_SEPARATOR}"
+                f"{item_utils.get_version(item)}"
+                for item in sorted_software_items
+            ]
+        )
+    else:
+        sorted_software_items = sorted(software_items, key=item_utils.get_code)
+        return ANALYSIS_INFO_SEPARATOR.join(
+            [
+                f"{item_utils.get_code(item)}{ANALYSIS_INFO_SEPARATOR}"
+                f"{item_utils.get_version(item)}"
+                for item in sorted_software_items
+            ]
+        )
 
 
 def get_software_codes_missing_versions(

diff --git a/src/encoded/commands/release_file.py b/src/encoded/commands/release_file.py
@@ -24,6 +24,7 @@
     sample_source as sample_source_utils,
     submitted_file as submitted_file_utils,
     tissue as tissue_utils,
+    supplementary_file as supp_file_utils
 )
 from encoded.item_utils.constants import (
     file as file_constants,
@@ -198,12 +199,15 @@ def get_file_sets_from_file(self) -> List[dict]:
         )
         mwfrs = ff_utils.search_metadata(search_filter, key=self.key)
         if len(mwfrs) != 1:
-            self.print_error_and_exit(
-                (
-                    f"Expected exactly one associated MetaWorkflowRun, got"
-                    f" {len(mwfrs)}: {search_filter}"
+            if not supp_file_utils.is_reference_genome(self.file) and not supp_file_utils.is_reference_conversion(self.file):
+                self.print_error_and_exit(
+                    (
+                        f"Expected exactly one associated MetaWorkflowRun, got"
+                        f" {len(mwfrs)}: {search_filter}"
+                    )
                 )
-            )
+            else:
+                return []
         mwfr = mwfrs[0]
         file_sets = meta_workflow_run_utils.get_file_sets(mwfr)
         # Might need to be more general in the future
@@ -354,17 +358,16 @@ def add_file_patchdict(self, dataset: str) -> None:
             item_utils.get_accession(file_set) for file_set in self.file_sets
         ]
         annotated_filename_info = self.get_annotated_filename_info()
-
         # Add file to file set and set status to released
         patch_body = {
             item_constants.UUID: item_utils.get_uuid(self.file),
             item_constants.STATUS: item_constants.STATUS_RELEASED,
-            file_constants.FILE_SETS: file_set_accessions,
             file_constants.DATASET: dataset,
             file_constants.ACCESS_STATUS: access_status,
             file_constants.ANNOTATED_FILENAME: annotated_filename_info.filename,
         }
-
+        if not supp_file_utils.is_reference_conversion(self.file) and not supp_file_utils.is_reference_genome(self.file):
+            patch_body[file_constants.FILE_SETS] = file_set_accessions  # link the file to its file sets
         # Take the extra files from the annotated filename object if available.
         # They will have the correct filenames
         if annotated_filename_info.patch_dict:
@@ -454,6 +457,12 @@ def get_access_status(self, dataset: str) -> str:
                 file_constants.DATA_CATEGORY_SOMATIC_VARIANT_CALLS: (
                     file_constants.ACCESS_STATUS_OPEN
                 ),
+                file_constants.DATA_CATEGORY_REFERENCE_GENOME: (
+                    file_constants.ACCESS_STATUS_OPEN
+                ),
+                file_constants.DATA_CATEGORY_REFERENCE_CONVERSION: (
+                    file_constants.ACCESS_STATUS_OPEN
+                )
             },
             IPSC: {
                 file_constants.DATA_CATEGORY_SEQUENCING_READS: (
@@ -465,6 +474,12 @@ def get_access_status(self, dataset: str) -> str:
                 file_constants.DATA_CATEGORY_SOMATIC_VARIANT_CALLS: (
                     file_constants.ACCESS_STATUS_PROTECTED
                 ),
+                file_constants.DATA_CATEGORY_REFERENCE_GENOME: (
+                    file_constants.ACCESS_STATUS_PROTECTED
+                ),
+                file_constants.DATA_CATEGORY_REFERENCE_CONVERSION: (
+                    file_constants.ACCESS_STATUS_PROTECTED
+                )
             },
             self.TISSUE: {
                 file_constants.DATA_CATEGORY_SEQUENCING_READS: (
@@ -476,6 +491,12 @@ def get_access_status(self, dataset: str) -> str:
                 file_constants.DATA_CATEGORY_SOMATIC_VARIANT_CALLS: (
                     file_constants.ACCESS_STATUS_OPEN
                 ),
+                file_constants.DATA_CATEGORY_REFERENCE_GENOME: (
+                    file_constants.ACCESS_STATUS_PROTECTED
+                ),
+                file_constants.DATA_CATEGORY_REFERENCE_CONVERSION: (
+                    file_constants.ACCESS_STATUS_PROTECTED
+                )
             },
         }
         if dataset in [

diff --git a/src/encoded/item_utils/constants/file.py b/src/encoded/item_utils/constants/file.py
@@ -6,6 +6,8 @@
 DATA_CATEGORY_GERMLINE_VARIANT_CALLS = "Germline Variant Calls"
 DATA_CATEGORY_SEQUENCING_READS = "Sequencing Reads"
 DATA_CATEGORY_SOMATIC_VARIANT_CALLS = "Somatic Variant Calls"
+DATA_CATEGORY_REFERENCE_GENOME = "Reference Genome"
+DATA_CATEGORY_REFERENCE_CONVERSION = "Reference Conversion"
 DATASET = "dataset"
 EXTRA_FILES = "extra_files"
 FILE_SETS = "file_sets"

diff --git a/src/encoded/item_utils/file_format.py b/src/encoded/item_utils/file_format.py
@@ -10,4 +10,9 @@ def get_other_allowed_extensions(properties: Dict[str, Any]) -> str:
 
 
 def is_chain_file(properties: Dict[str, Any]) -> bool:
-    return get_standard_file_extension(properties) == "chain.gz"
+    return get_standard_file_extension(properties) in ("chain.gz", "chain")  # gzipped or plain chain
+
+
+def is_fasta_file(properties: Dict[str, Any]) -> bool:
+    return get_standard_file_extension(properties) in ("fa", "fasta")  # either fasta spelling
+
diff --git a/src/encoded/item_utils/supplementary_file.py b/src/encoded/item_utils/supplementary_file.py
@@ -21,6 +21,16 @@ def is_chain_file(properties: Dict[str, Any],request_handler: RequestHandler):
     return file_utils.get_file_extension(properties,request_handler) == "chain.gz"
 
 
+def is_reference_genome(properties: Dict[str, Any]) -> bool:
+    """Return True if "Reference Genome" is found in the file's data category value."""
+    return "Reference Genome" in file_utils.get_data_category(properties)
+
+
+def is_reference_conversion(properties: Dict[str, Any]) -> bool:
+    """Return True if "Reference Conversion" is found in the file's data category value."""
+    return "Reference Conversion" in file_utils.get_data_category(properties)
+
+
 def get_donor_specific_assembly(properties: Dict[str, Any]) -> Union[str, Dict[str, Any]]:
     """Get donor-specific assembly from properties."""
     return properties.get("donor_specific_assembly", "")
@@ -64,16 +74,3 @@ def get_derived_from_file_sets(
             file_utils.get_file_sets,
         )
     return properties.get("file_sets", [])  
-
-
-def get_dsa_software(
-        properties: Dict[str, Any], request_handler: Optional[RequestHandler] = None 
-) -> List[Union[str, Dict[str, Any]]]:
-    """Get software from donor-specific assembly associated with file."""
-    if request_handler:
-        return get_property_value_from_identifier(
-            request_handler,
-            get_donor_specific_assembly(properties),
-            dsa_utils.get_software,
-        )
-    return []