Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SN DSA Release #277

Merged
merged 30 commits into from
Dec 20, 2024
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
608dea2
release file for fa
sarahgonicholson Oct 16, 2024
eea955e
fix chainf iles
sarahgonicholson Oct 16, 2024
84d2567
update changelog
sarahgonicholson Oct 16, 2024
0027520
Merge up-to-date with main
sarahgonicholson Oct 16, 2024
4188794
Remove debugger
sarahgonicholson Oct 16, 2024
b33acc7
Fix tests
sarahgonicholson Oct 16, 2024
038b37b
fix changelog
sarahgonicholson Oct 16, 2024
a4a551d
Fix reference genome access status
sarahgonicholson Oct 17, 2024
a509010
Add test for multiple assays and sequencers
sarahgonicholson Oct 17, 2024
f03446b
Fix chain format
sarahgonicholson Oct 17, 2024
3afa0fa
refactor data category check
sarahgonicholson Oct 17, 2024
7113233
use software title for supplementary files
sarahgonicholson Oct 17, 2024
acbc80e
fix software test
sarahgonicholson Oct 17, 2024
bbe7344
update software description
sarahgonicholson Oct 17, 2024
b4394ba
use supp file utils
sarahgonicholson Oct 17, 2024
73fb220
Don't patch fileset if reference file
sarahgonicholson Oct 17, 2024
4de4f84
fix file release test
sarahgonicholson Oct 18, 2024
c06087d
Address comments
sarahgonicholson Oct 18, 2024
18ff9d5
Merge up-to-date with main
sarahgonicholson Oct 18, 2024
5497661
merge up-to-date with main
sarahgonicholson Nov 15, 2024
603900e
update dsa annotated_filenames and tests
sarahgonicholson Nov 15, 2024
ca981b3
Merge up-to-date with main
sarahgonicholson Dec 2, 2024
2ded982
Merge up-to-date with main
sarahgonicholson Dec 3, 2024
d215fa7
Merge up-to-date with main
sarahgonicholson Dec 12, 2024
7202cf9
Fix chain file test
sarahgonicholson Dec 13, 2024
55fcbe2
Update changelog
sarahgonicholson Dec 13, 2024
2170b5b
Update data categories
sarahgonicholson Dec 13, 2024
f122f62
Update data_type and data_category for DSA
sarahgonicholson Dec 17, 2024
6ce7da7
Merge up-to-date with main
sarahgonicholson Dec 17, 2024
d9edfa4
Change dsa to DSA
sarahgonicholson Dec 20, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,16 @@ smaht-portal
Change Log
----------

`PR 266: Node v20 Upgrade <https://github.com/smaht-dac/smaht-portal/pull/266>`_
0.105.1
=======
`PR 277: DSA Release <https://github.com/smaht-dac/smaht-portal/pull/277>`_
* In `commands/release_file.py` and `commands/create_annotated_filenames.py`:
* Assay and sequencer code values are set to XX for DSA fasta files and chain files
* For Supplementary Files, use `title` instead of `code` for the software part of the annotated filename

0.105.0
=====

=======
`PR 266: Node v20 Upgrade <https://github.com/smaht-dac/smaht-portal/pull/266>`_
* Node 18 to 20 upgrade including GitHub actions


Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "encoded"
version = "0.105.0"
version = "0.105.1"
description = "SMaHT Data Analysis Portal"
authors = ["4DN-DCIC Team <[email protected]>"]
license = "MIT"
Expand Down
70 changes: 51 additions & 19 deletions src/encoded/commands/create_annotated_filenames.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
FILENAME_SEPARATOR = "-"
ANALYSIS_INFO_SEPARATOR = "_"
CHAIN_FILE_INFO_SEPARATOR = "To"

DSA_INFO_VALUE = "dsa"

DEFAULT_PROJECT_ID = constants.PRODUCTION_PREFIX
DEFAULT_ABSENT_FIELD = "X"
Expand Down Expand Up @@ -419,7 +419,9 @@ def get_annotated_filename(
associated_items.donors, associated_items.sample_sources
)
sequencing_and_assay_codes = get_sequencing_and_assay_codes(
associated_items.sequencers, associated_items.assays
associated_items.file,
associated_items.sequencers,
associated_items.assays
)
sequencing_center_code = get_sequencing_center_code(
associated_items.sequencing_center
Expand Down Expand Up @@ -750,14 +752,20 @@ def get_sex_abbreviation(sex: str) -> str:


def get_sequencing_and_assay_codes(
file: List[Dict[str, Any]],
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this correct? Is it a list?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixing!

sequencers: List[Dict[str], Any],
assays: List[Dict[str], Any],
) -> FilenamePart:
"""Get sequencing and assay codes for file."""
"""Get sequencing and assay codes for file.

Returns XX for Reference Genome and Reference Conversion files.
"""
sequencing_codes = get_sequencing_codes(sequencers)
assay_codes = get_assay_codes(assays)
if len(sequencing_codes) == 1 and len(assay_codes) == 1:
return get_filename_part(value=f"{sequencing_codes[0]}{assay_codes[0]}")
elif supp_file_utils.is_reference_conversion(file) or supp_file_utils.is_reference_genome(file):
return get_filename_part(value="XX")
errors = []
if not sequencing_codes:
errors.append("No sequencing code found")
Expand Down Expand Up @@ -807,7 +815,7 @@ def get_analysis(
Some error handling here for missing data by file type, but not
exhaustive and allowing for some flexibility in what is expected.
"""
software_and_versions = get_software_and_versions(software)
software_and_versions = get_software_and_versions(file, software)
reference_genome_code = item_utils.get_code(reference_genome)
errors = get_analysis_errors(file, reference_genome_code)
if errors:
Expand All @@ -816,7 +824,9 @@ def get_analysis(
software_and_versions, reference_genome_code
)
if file_format_utils.is_chain_file(file_extension):
value = f"{value}{ANALYSIS_INFO_SEPARATOR}{get_chain_file_value(file)}"
value = ANALYSIS_INFO_SEPARATOR.join([value,get_chain_file_value(file)]) if value else get_chain_file_value(file)
if file_format_utils.is_fasta_file(file_extension):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not quite sure I understand. You are only testing for a fasta file here. Do you always want to append `dsa` in that case?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great point. I'll add a specification for only DSA fasta files

value = f"{value}{ANALYSIS_INFO_SEPARATOR}{DSA_INFO_VALUE}"
if not value:
if file_utils.is_unaligned_reads(file): # Think this is the only case (?)
return get_filename_part(value=DEFAULT_ABSENT_FIELD)
Expand Down Expand Up @@ -853,18 +863,20 @@ def get_analysis_value(
return ANALYSIS_INFO_SEPARATOR.join(to_write)


def get_software_and_versions(software: List[Dict[str, Any]]) -> str:
def get_software_and_versions(file: Dict[str, Any], software: List[Dict[str, Any]]) -> str:
"""Get software and accompanying versions for file.

Currently only looking for software items with codes, as these are
expected to be the software used for naming.
Currently looking for software items with codes, as these are expected to be the software used for naming, with the exception of SupplementaryFile items, where lower case title is used.
"""
software_with_codes = get_software_with_codes(software)
if supp_file_utils.is_supplementary_file(file):
software_with_codes = get_software_with_title(software)
else:
software_with_codes = get_software_with_codes(software)
if not software_with_codes:
return ""
software_with_codes_and_versions = get_software_with_versions(software_with_codes)
if len(software_with_codes) == len(software_with_codes_and_versions):
return get_software_and_versions_string(software_with_codes_and_versions)
return get_software_and_versions_string(file, software_with_codes_and_versions)
missing_versions = get_software_codes_missing_versions(software_with_codes)
logger.warning(f"Missing versions for software items: {missing_versions}.")
return ""
Expand All @@ -877,23 +889,43 @@ def get_software_with_codes(
return [item for item in software_items if item_utils.get_code(item)]


def get_software_with_title(
    software_items: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """Return the software items for which a title can be retrieved.

    Items for which `item_utils.get_title` gives a falsy result are
    dropped; the relative order of the remaining items is preserved.
    """
    titled_items = []
    for software_item in software_items:
        if item_utils.get_title(software_item):
            titled_items.append(software_item)
    return titled_items


def get_software_with_versions(
    software_items: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """Return the software items for which a version can be retrieved.

    Items for which `item_utils.get_version` gives a falsy result are
    dropped; the relative order of the remaining items is preserved.
    """
    # filter() keeps exactly the items whose version lookup is truthy.
    return list(filter(item_utils.get_version, software_items))


def get_software_and_versions_string(software_items: List[Dict[str, Any]]) -> str:
def get_software_and_versions_string(
file: Dict[str, Any],
software_items: List[Dict[str, Any]]
) -> str:
"""Get string representation of software and versions."""
sorted_software_items = sorted(software_items, key=item_utils.get_code)
return ANALYSIS_INFO_SEPARATOR.join(
[
f"{item_utils.get_code(item)}{ANALYSIS_INFO_SEPARATOR}"
f"{item_utils.get_version(item)}"
for item in sorted_software_items
]
)
if supp_file_utils.is_supplementary_file(file):
sorted_software_items = sorted(software_items, key=item_utils.get_title)
return ANALYSIS_INFO_SEPARATOR.join(
[
f"{item_utils.get_title(item).lower()}{ANALYSIS_INFO_SEPARATOR}"
f"{item_utils.get_version(item)}"
for item in sorted_software_items
]
)
else:
sorted_software_items = sorted(software_items, key=item_utils.get_code)
return ANALYSIS_INFO_SEPARATOR.join(
[
f"{item_utils.get_code(item)}{ANALYSIS_INFO_SEPARATOR}"
f"{item_utils.get_version(item)}"
for item in sorted_software_items
]
)


def get_software_codes_missing_versions(
Expand Down
37 changes: 29 additions & 8 deletions src/encoded/commands/release_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
sample_source as sample_source_utils,
submitted_file as submitted_file_utils,
tissue as tissue_utils,
supplementary_file as supp_file_utils
)
from encoded.item_utils.constants import (
file as file_constants,
Expand Down Expand Up @@ -198,12 +199,15 @@ def get_file_sets_from_file(self) -> List[dict]:
)
mwfrs = ff_utils.search_metadata(search_filter, key=self.key)
if len(mwfrs) != 1:
self.print_error_and_exit(
(
f"Expected exactly one associated MetaWorkflowRun, got"
f" {len(mwfrs)}: {search_filter}"
if not supp_file_utils.is_reference_genome(self.file) and not supp_file_utils.is_reference_conversion(self.file):
self.print_error_and_exit(
(
f"Expected exactly one associated MetaWorkflowRun, got"
f" {len(mwfrs)}: {search_filter}"
)
)
)
else:
return []
mwfr = mwfrs[0]
file_sets = meta_workflow_run_utils.get_file_sets(mwfr)
# Might need to be more general in the future
Expand Down Expand Up @@ -354,17 +358,16 @@ def add_file_patchdict(self, dataset: str) -> None:
item_utils.get_accession(file_set) for file_set in self.file_sets
]
annotated_filename_info = self.get_annotated_filename_info()

# Add file to file set and set status to released
patch_body = {
item_constants.UUID: item_utils.get_uuid(self.file),
item_constants.STATUS: item_constants.STATUS_RELEASED,
file_constants.FILE_SETS: file_set_accessions,
file_constants.DATASET: dataset,
file_constants.ACCESS_STATUS: access_status,
file_constants.ANNOTATED_FILENAME: annotated_filename_info.filename,
}

if not supp_file_utils.is_reference_conversion(self.file) and not supp_file_utils.is_reference_genome(self.file):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are making this check already in get_file_sets_from_file. Should we just test here if file_set_accessions is empty?

patch_body[file_constants.FILE_SET] = file_set_accessions
# Take the extra files from the annotated filename object if available.
# They will have the correct filenames
if annotated_filename_info.patch_dict:
Expand Down Expand Up @@ -454,6 +457,12 @@ def get_access_status(self, dataset: str) -> str:
file_constants.DATA_CATEGORY_SOMATIC_VARIANT_CALLS: (
file_constants.ACCESS_STATUS_OPEN
),
file_constants.DATA_CATEGORY_REFERENCE_GENOME: (
file_constants.ACCESS_STATUS_OPEN
),
file_constants.DATA_CATEGORY_REFERENCE_CONVERSION: (
file_constants.ACCESS_STATUS_OPEN
)
},
IPSC: {
file_constants.DATA_CATEGORY_SEQUENCING_READS: (
Expand All @@ -465,6 +474,12 @@ def get_access_status(self, dataset: str) -> str:
file_constants.DATA_CATEGORY_SOMATIC_VARIANT_CALLS: (
file_constants.ACCESS_STATUS_PROTECTED
),
file_constants.DATA_CATEGORY_REFERENCE_GENOME: (
file_constants.ACCESS_STATUS_PROTECTED
),
file_constants.DATA_CATEGORY_REFERENCE_CONVERSION: (
file_constants.ACCESS_STATUS_PROTECTED
)
},
self.TISSUE: {
file_constants.DATA_CATEGORY_SEQUENCING_READS: (
Expand All @@ -476,6 +491,12 @@ def get_access_status(self, dataset: str) -> str:
file_constants.DATA_CATEGORY_SOMATIC_VARIANT_CALLS: (
file_constants.ACCESS_STATUS_OPEN
),
file_constants.DATA_CATEGORY_REFERENCE_GENOME: (
file_constants.ACCESS_STATUS_PROTECTED
),
file_constants.DATA_CATEGORY_REFERENCE_CONVERSION: (
file_constants.ACCESS_STATUS_PROTECTED
)
},
}
if dataset in [
Expand Down
2 changes: 2 additions & 0 deletions src/encoded/item_utils/constants/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
DATA_CATEGORY_GERMLINE_VARIANT_CALLS = "Germline Variant Calls"
DATA_CATEGORY_SEQUENCING_READS = "Sequencing Reads"
DATA_CATEGORY_SOMATIC_VARIANT_CALLS = "Somatic Variant Calls"
DATA_CATEGORY_REFERENCE_GENOME = "Reference Genome"
DATA_CATEGORY_REFERENCE_CONVERSION = "Reference Conversion"
DATASET = "dataset"
EXTRA_FILES = "extra_files"
FILE_SETS = "file_sets"
Expand Down
7 changes: 6 additions & 1 deletion src/encoded/item_utils/file_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,9 @@ def get_other_allowed_extensions(properties: Dict[str, Any]) -> str:


def is_chain_file(properties: Dict[str, Any]) -> bool:
return get_standard_file_extension(properties) == "chain.gz"
return get_standard_file_extension(properties) in ["chain.gz","chain"]


def is_fasta_file(properties: Dict[str, Any]) -> bool:
    """Report whether the standard file extension is "fa" or "fasta"."""
    extension = get_standard_file_extension(properties)
    return extension in ("fa", "fasta")

23 changes: 10 additions & 13 deletions src/encoded/item_utils/supplementary_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,16 @@ def is_chain_file(properties: Dict[str, Any],request_handler: RequestHandler):
return file_utils.get_file_extension(properties,request_handler) == "chain.gz"


def is_reference_genome(properties: Dict[str, Any]):
    """Report whether "Reference Genome" is among the data category values.

    The membership test is applied to whatever
    `file_utils.get_data_category` returns for the given properties.
    """
    data_category = file_utils.get_data_category(properties)
    return "Reference Genome" in data_category


def is_reference_conversion(properties: Dict[str, Any]):
    """Report whether "Reference Conversion" is among the data category values.

    The membership test is applied to whatever
    `file_utils.get_data_category` returns for the given properties.
    """
    data_category = file_utils.get_data_category(properties)
    return "Reference Conversion" in data_category


def get_donor_specific_assembly(properties: Dict[str, Any]) -> Union[str, Dict[str, Any]]:
"""Get donor-specific assembly from properties."""
return properties.get("donor_specific_assembly", "")
Expand Down Expand Up @@ -64,16 +74,3 @@ def get_derived_from_file_sets(
file_utils.get_file_sets,
)
return properties.get("file_sets", [])


def get_dsa_software(
properties: Dict[str, Any], request_handler: Optional[RequestHandler] = None
) -> List[Union[str, Dict[str, Any]]]:
"""Get software from donor-specific assembly associated with file."""
if request_handler:
return get_property_value_from_identifier(
request_handler,
get_donor_specific_assembly(properties),
dsa_utils.get_software,
)
return []
Loading
Loading