-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
SN DSA Release #277
SN DSA Release #277
Changes from 16 commits
608dea2
eea955e
84d2567
0027520
4188794
b33acc7
038b37b
a4a551d
a509010
f03446b
3afa0fa
7113233
acbc80e
bbe7344
b4394ba
73fb220
4de4f84
c06087d
18ff9d5
5497661
603900e
ca981b3
2ded982
d215fa7
7202cf9
55fcbe2
2170b5b
f122f62
6ce7da7
d9edfa4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
[tool.poetry] | ||
name = "encoded" | ||
version = "0.105.0" | ||
version = "0.105.1" | ||
description = "SMaHT Data Analysis Portal" | ||
authors = ["4DN-DCIC Team <[email protected]>"] | ||
license = "MIT" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -32,7 +32,7 @@ | |
FILENAME_SEPARATOR = "-" | ||
ANALYSIS_INFO_SEPARATOR = "_" | ||
CHAIN_FILE_INFO_SEPARATOR = "To" | ||
|
||
DSA_INFO_VALUE = "dsa" | ||
|
||
DEFAULT_PROJECT_ID = constants.PRODUCTION_PREFIX | ||
DEFAULT_ABSENT_FIELD = "X" | ||
|
@@ -419,7 +419,9 @@ def get_annotated_filename( | |
associated_items.donors, associated_items.sample_sources | ||
) | ||
sequencing_and_assay_codes = get_sequencing_and_assay_codes( | ||
associated_items.sequencers, associated_items.assays | ||
associated_items.file, | ||
associated_items.sequencers, | ||
associated_items.assays | ||
) | ||
sequencing_center_code = get_sequencing_center_code( | ||
associated_items.sequencing_center | ||
|
@@ -750,14 +752,20 @@ def get_sex_abbreviation(sex: str) -> str: | |
|
||
|
||
def get_sequencing_and_assay_codes( | ||
file: List[Dict[str, Any]], | ||
sequencers: List[Dict[str], Any], | ||
assays: List[Dict[str], Any], | ||
) -> FilenamePart: | ||
"""Get sequencing and assay codes for file.""" | ||
"""Get sequencing and assay codes for file. | ||
|
||
Returns XX for Reference Genome and Reference Conversion files. | ||
""" | ||
sequencing_codes = get_sequencing_codes(sequencers) | ||
assay_codes = get_assay_codes(assays) | ||
if len(sequencing_codes) == 1 and len(assay_codes) == 1: | ||
return get_filename_part(value=f"{sequencing_codes[0]}{assay_codes[0]}") | ||
elif supp_file_utils.is_reference_conversion(file) or supp_file_utils.is_reference_genome(file): | ||
return get_filename_part(value="XX") | ||
errors = [] | ||
if not sequencing_codes: | ||
errors.append("No sequencing code found") | ||
|
@@ -807,7 +815,7 @@ def get_analysis( | |
Some error handling here for missing data by file type, but not | ||
exhaustive and allowing for some flexibility in what is expected. | ||
""" | ||
software_and_versions = get_software_and_versions(software) | ||
software_and_versions = get_software_and_versions(file, software) | ||
reference_genome_code = item_utils.get_code(reference_genome) | ||
errors = get_analysis_errors(file, reference_genome_code) | ||
if errors: | ||
|
@@ -816,7 +824,9 @@ def get_analysis( | |
software_and_versions, reference_genome_code | ||
) | ||
if file_format_utils.is_chain_file(file_extension): | ||
value = f"{value}{ANALYSIS_INFO_SEPARATOR}{get_chain_file_value(file)}" | ||
value = ANALYSIS_INFO_SEPARATOR.join([value,get_chain_file_value(file)]) if value else get_chain_file_value(file) | ||
if file_format_utils.is_fasta_file(file_extension): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Reviewer: Not quite sure I understand. You are only testing for a FASTA file here. Do you always want to append the `DSA_INFO_VALUE` suffix? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Author: Great point. I'll add a specification for only DSA FASTA files. |
||
value = f"{value}{ANALYSIS_INFO_SEPARATOR}{DSA_INFO_VALUE}" | ||
if not value: | ||
if file_utils.is_unaligned_reads(file): # Think this is the only case (?) | ||
return get_filename_part(value=DEFAULT_ABSENT_FIELD) | ||
|
@@ -853,18 +863,20 @@ def get_analysis_value( | |
return ANALYSIS_INFO_SEPARATOR.join(to_write) | ||
|
||
|
||
def get_software_and_versions(software: List[Dict[str, Any]]) -> str: | ||
def get_software_and_versions(file: Dict[str, Any], software: List[Dict[str, Any]]) -> str: | ||
"""Get software and accompanying versions for file. | ||
|
||
Currently only looking for software items with codes, as these are | ||
expected to be the software used for naming. | ||
Currently looking for software items with codes, as these are expected to be the software used for naming, with the exception of SupplementaryFile items, where lower case title is used. | ||
""" | ||
software_with_codes = get_software_with_codes(software) | ||
if supp_file_utils.is_supplementary_file(file): | ||
software_with_codes = get_software_with_title(software) | ||
else: | ||
software_with_codes = get_software_with_codes(software) | ||
if not software_with_codes: | ||
return "" | ||
software_with_codes_and_versions = get_software_with_versions(software_with_codes) | ||
if len(software_with_codes) == len(software_with_codes_and_versions): | ||
return get_software_and_versions_string(software_with_codes_and_versions) | ||
return get_software_and_versions_string(file, software_with_codes_and_versions) | ||
missing_versions = get_software_codes_missing_versions(software_with_codes) | ||
logger.warning(f"Missing versions for software items: {missing_versions}.") | ||
return "" | ||
|
@@ -877,23 +889,43 @@ def get_software_with_codes( | |
return [item for item in software_items if item_utils.get_code(item)] | ||
|
||
|
||
def get_software_with_title( | ||
software_items: List[Dict[str, Any]] | ||
) -> List[Dict[str, Any]]: | ||
"""Get software items with title.""" | ||
return [item for item in software_items if item_utils.get_title(item)] | ||
|
||
|
||
def get_software_with_versions( | ||
software_items: List[Dict[str, Any]] | ||
) -> List[Dict[str, Any]]: | ||
"""Get software items with versions.""" | ||
return [item for item in software_items if item_utils.get_version(item)] | ||
|
||
|
||
def get_software_and_versions_string(software_items: List[Dict[str, Any]]) -> str: | ||
def get_software_and_versions_string( | ||
file: Dict[str, Any], | ||
software_items: List[Dict[str, Any]] | ||
) -> str: | ||
"""Get string representation of software and versions.""" | ||
sorted_software_items = sorted(software_items, key=item_utils.get_code) | ||
return ANALYSIS_INFO_SEPARATOR.join( | ||
[ | ||
f"{item_utils.get_code(item)}{ANALYSIS_INFO_SEPARATOR}" | ||
f"{item_utils.get_version(item)}" | ||
for item in sorted_software_items | ||
] | ||
) | ||
if supp_file_utils.is_supplementary_file(file): | ||
sorted_software_items = sorted(software_items, key=item_utils.get_title) | ||
return ANALYSIS_INFO_SEPARATOR.join( | ||
[ | ||
f"{item_utils.get_title(item).lower()}{ANALYSIS_INFO_SEPARATOR}" | ||
f"{item_utils.get_version(item)}" | ||
for item in sorted_software_items | ||
] | ||
) | ||
else: | ||
sorted_software_items = sorted(software_items, key=item_utils.get_code) | ||
return ANALYSIS_INFO_SEPARATOR.join( | ||
[ | ||
f"{item_utils.get_code(item)}{ANALYSIS_INFO_SEPARATOR}" | ||
f"{item_utils.get_version(item)}" | ||
for item in sorted_software_items | ||
] | ||
) | ||
|
||
|
||
def get_software_codes_missing_versions( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,6 +24,7 @@ | |
sample_source as sample_source_utils, | ||
submitted_file as submitted_file_utils, | ||
tissue as tissue_utils, | ||
supplementary_file as supp_file_utils | ||
) | ||
from encoded.item_utils.constants import ( | ||
file as file_constants, | ||
|
@@ -198,12 +199,15 @@ def get_file_sets_from_file(self) -> List[dict]: | |
) | ||
mwfrs = ff_utils.search_metadata(search_filter, key=self.key) | ||
if len(mwfrs) != 1: | ||
self.print_error_and_exit( | ||
( | ||
f"Expected exactly one associated MetaWorkflowRun, got" | ||
f" {len(mwfrs)}: {search_filter}" | ||
if not supp_file_utils.is_reference_genome(self.file) and not supp_file_utils.is_reference_conversion(self.file): | ||
self.print_error_and_exit( | ||
( | ||
f"Expected exactly one associated MetaWorkflowRun, got" | ||
f" {len(mwfrs)}: {search_filter}" | ||
) | ||
) | ||
) | ||
else: | ||
return [] | ||
mwfr = mwfrs[0] | ||
file_sets = meta_workflow_run_utils.get_file_sets(mwfr) | ||
# Might need to be more general in the future | ||
|
@@ -354,17 +358,16 @@ def add_file_patchdict(self, dataset: str) -> None: | |
item_utils.get_accession(file_set) for file_set in self.file_sets | ||
] | ||
annotated_filename_info = self.get_annotated_filename_info() | ||
|
||
# Add file to file set and set status to released | ||
patch_body = { | ||
item_constants.UUID: item_utils.get_uuid(self.file), | ||
item_constants.STATUS: item_constants.STATUS_RELEASED, | ||
file_constants.FILE_SETS: file_set_accessions, | ||
file_constants.DATASET: dataset, | ||
file_constants.ACCESS_STATUS: access_status, | ||
file_constants.ANNOTATED_FILENAME: annotated_filename_info.filename, | ||
} | ||
|
||
if not supp_file_utils.is_reference_conversion(self.file) and not supp_file_utils.is_reference_genome(self.file): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Reviewer: You are making this check already in [identifier lost in page extraction; presumably `get_file_sets_from_file`]. |
||
patch_body[file_constants.FILE_SET] = file_set_accessions | ||
# Take the extra files from the annotated filename object if available. | ||
# They will have the correct filenames | ||
if annotated_filename_info.patch_dict: | ||
|
@@ -454,6 +457,12 @@ def get_access_status(self, dataset: str) -> str: | |
file_constants.DATA_CATEGORY_SOMATIC_VARIANT_CALLS: ( | ||
file_constants.ACCESS_STATUS_OPEN | ||
), | ||
file_constants.DATA_CATEGORY_REFERENCE_GENOME: ( | ||
file_constants.ACCESS_STATUS_OPEN | ||
), | ||
file_constants.DATA_CATEGORY_REFERENCE_CONVERSION: ( | ||
file_constants.ACCESS_STATUS_OPEN | ||
) | ||
}, | ||
IPSC: { | ||
file_constants.DATA_CATEGORY_SEQUENCING_READS: ( | ||
|
@@ -465,6 +474,12 @@ def get_access_status(self, dataset: str) -> str: | |
file_constants.DATA_CATEGORY_SOMATIC_VARIANT_CALLS: ( | ||
file_constants.ACCESS_STATUS_PROTECTED | ||
), | ||
file_constants.DATA_CATEGORY_REFERENCE_GENOME: ( | ||
file_constants.ACCESS_STATUS_PROTECTED | ||
), | ||
file_constants.DATA_CATEGORY_REFERENCE_CONVERSION: ( | ||
file_constants.ACCESS_STATUS_PROTECTED | ||
) | ||
}, | ||
self.TISSUE: { | ||
file_constants.DATA_CATEGORY_SEQUENCING_READS: ( | ||
|
@@ -476,6 +491,12 @@ def get_access_status(self, dataset: str) -> str: | |
file_constants.DATA_CATEGORY_SOMATIC_VARIANT_CALLS: ( | ||
file_constants.ACCESS_STATUS_OPEN | ||
), | ||
file_constants.DATA_CATEGORY_REFERENCE_GENOME: ( | ||
file_constants.ACCESS_STATUS_PROTECTED | ||
), | ||
file_constants.DATA_CATEGORY_REFERENCE_CONVERSION: ( | ||
file_constants.ACCESS_STATUS_PROTECTED | ||
) | ||
}, | ||
} | ||
if dataset in [ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this correct? Is it a list?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixing!