Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SN DSA Release #277

Merged
merged 30 commits into from
Dec 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
608dea2
release file for fa
sarahgonicholson Oct 16, 2024
eea955e
fix chainf iles
sarahgonicholson Oct 16, 2024
84d2567
update changelog
sarahgonicholson Oct 16, 2024
0027520
Merge up-to-date with main
sarahgonicholson Oct 16, 2024
4188794
Remove debugger
sarahgonicholson Oct 16, 2024
b33acc7
Fix tests
sarahgonicholson Oct 16, 2024
038b37b
fix changelog
sarahgonicholson Oct 16, 2024
a4a551d
Fix reference genome access status
sarahgonicholson Oct 17, 2024
a509010
Add test for multiple assays and sequencers
sarahgonicholson Oct 17, 2024
f03446b
Fix chain format
sarahgonicholson Oct 17, 2024
3afa0fa
refactor data category check
sarahgonicholson Oct 17, 2024
7113233
use software title for supplementary files
sarahgonicholson Oct 17, 2024
acbc80e
fix software test
sarahgonicholson Oct 17, 2024
bbe7344
update software description
sarahgonicholson Oct 17, 2024
b4394ba
use supp file utils
sarahgonicholson Oct 17, 2024
73fb220
Don't patch fileset if reference file
sarahgonicholson Oct 17, 2024
4de4f84
fix file release test
sarahgonicholson Oct 18, 2024
c06087d
Address comments
sarahgonicholson Oct 18, 2024
18ff9d5
Merge up-to-date with main
sarahgonicholson Oct 18, 2024
5497661
merge up-to-date with main
sarahgonicholson Nov 15, 2024
603900e
update dsa annotated_filenames and tests
sarahgonicholson Nov 15, 2024
ca981b3
Merge up-to-date with main
sarahgonicholson Dec 2, 2024
2ded982
Merge up-to-date with main
sarahgonicholson Dec 3, 2024
d215fa7
Merge up-to-date with main
sarahgonicholson Dec 12, 2024
7202cf9
Fix chain file test
sarahgonicholson Dec 13, 2024
55fcbe2
Update changelog
sarahgonicholson Dec 13, 2024
2170b5b
Update data categories
sarahgonicholson Dec 13, 2024
f122f62
Update data_type and data_category for DSA
sarahgonicholson Dec 17, 2024
6ce7da7
Merge up-to-date with main
sarahgonicholson Dec 17, 2024
d9edfa4
Change dsa to DSA
sarahgonicholson Dec 20, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,13 @@ smaht-portal
Change Log
----------

0.122.0
=======
`PR 277 DSA Release <https://github.com/smaht-dac/smaht-portal/pull/277>`_
* In `commands/release_file.py` and `commands/create_annotated_filenames.py`:
* Assay and sequencer codes are set to XX for DSA fasta files and chain files
* For Supplementary Files, use the `haplotype`, `target_assembly`, and `source_assembly` properties to create annotated filenames for chain and fasta files

0.121.0
=======
`PR 300 SN Remove basecalling <https://github.com/smaht-dac/smaht-portal/pull/300>`_
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "encoded"
version = "0.121.0"
version = "0.122.0"
description = "SMaHT Data Analysis Portal"
authors = ["4DN-DCIC Team <[email protected]>"]
license = "MIT"
Expand Down
106 changes: 90 additions & 16 deletions src/encoded/commands/create_annotated_filenames.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
supplementary_file as supp_file_utils,
tissue as tissue_utils,
tissue_sample as tissue_sample_utils,
donor_specific_assembly as dsa_utils,
reference_genome as rg_utils
)
from encoded.item_utils.constants import file as file_constants
from encoded.item_utils.utils import RequestHandler
Expand All @@ -32,6 +34,7 @@
FILENAME_SEPARATOR = "-"
ANALYSIS_INFO_SEPARATOR = "_"
CHAIN_FILE_INFO_SEPARATOR = "To"
DSA_INFO_VALUE = "DSA"

RNA_DATA_CATEGORY = "RNA Quantification"
GENE_DATA_TYPE = "Gene Expression"
Expand Down Expand Up @@ -98,6 +101,8 @@ class AssociatedItems:
tissue_samples: List[Dict[str, Any]]
tissues: List[Dict[str, Any]]
donors: List[Dict[str, Any]]
target_assembly: Dict[str, Any]
source_assembly: Dict[str, Any]


def get_associated_items(
Expand All @@ -116,6 +121,8 @@ def get_associated_items(
reference_genome = get_reference_genome(file, request_handler)
gene_annotations = get_gene_annotations(file, request_handler)
donor_specific_assembly = get_donor_specific_assembly(file, request_handler)
target_assembly = get_target_assembly(file, request_handler)
source_assembly = get_source_assembly(file, request_handler)
if donor_specific_assembly:
file_sets=get_derived_from_file_sets(file, request_handler)
else:
Expand Down Expand Up @@ -146,6 +153,8 @@ def get_associated_items(
tissues=tissues,
cell_lines=cell_lines,
donors=donors,
target_assembly=target_assembly,
source_assembly=source_assembly
)


Expand Down Expand Up @@ -215,6 +224,20 @@ def get_reference_genome(
return get_item(file_utils.get_reference_genome(file), request_handler)


def get_target_assembly(
    file: Dict[str, Any], request_handler: RequestHandler
) -> Union[None, Dict[str, Any]]:
    """Fetch the target assembly item for a file.

    The value read from the file by `supp_file_utils.get_target_assembly`
    is handed to `get_item` together with the request handler, and the
    result of that lookup is returned unchanged.
    """
    target_assembly_reference = supp_file_utils.get_target_assembly(file)
    return get_item(target_assembly_reference, request_handler)


def get_source_assembly(
    file: Dict[str, Any], request_handler: RequestHandler
) -> Union[None, Dict[str, Any]]:
    """Fetch the source assembly item for a file.

    The value read from the file by `supp_file_utils.get_source_assembly`
    is handed to `get_item` together with the request handler, and the
    result of that lookup is returned unchanged.
    """
    source_assembly_reference = supp_file_utils.get_source_assembly(file)
    return get_item(source_assembly_reference, request_handler)


def get_gene_annotations(
file: Dict[str, Any], request_handler: RequestHandler
) -> Dict[str, Any]:
Expand Down Expand Up @@ -432,7 +455,9 @@ def get_annotated_filename(
associated_items.donors, associated_items.sample_sources
)
sequencing_and_assay_codes = get_sequencing_and_assay_codes(
associated_items.sequencers, associated_items.assays
associated_items.file,
associated_items.sequencers,
associated_items.assays
)
sequencing_center_code = get_sequencing_center_code(
associated_items.sequencing_center
Expand All @@ -444,7 +469,10 @@ def get_annotated_filename(
associated_items.software,
associated_items.reference_genome,
associated_items.gene_annotations,
associated_items.file_format
associated_items.file_format,
associated_items.target_assembly,
associated_items.source_assembly,
associated_items.donor_specific_assembly,
)
errors = collect_errors(
project_id,
Expand Down Expand Up @@ -767,14 +795,20 @@ def get_sex_abbreviation(sex: str) -> str:


def get_sequencing_and_assay_codes(
file: Dict[str, Any],
sequencers: List[Dict[str], Any],
assays: List[Dict[str], Any],
) -> FilenamePart:
"""Get sequencing and assay codes for file."""
"""Get sequencing and assay codes for file.

Returns XX for Genome Assembly and Reference Conversion files.
"""
sequencing_codes = get_sequencing_codes(sequencers)
assay_codes = get_assay_codes(assays)
if len(sequencing_codes) == 1 and len(assay_codes) == 1:
return get_filename_part(value=f"{sequencing_codes[0]}{assay_codes[0]}")
elif supp_file_utils.is_genome_assembly(file) or supp_file_utils.is_reference_conversion(file):
return get_filename_part(value="XX")
errors = []
if not sequencing_codes:
errors.append("No sequencing code found")
Expand Down Expand Up @@ -819,6 +853,9 @@ def get_analysis(
reference_genome: Dict[str, Any],
gene_annotations: Dict[str, Any],
file_extension: Dict[str, Any],
target_assembly: Dict[str, Any],
source_assembly: Dict[str, Any],
donor_specific_assembly: Dict[str, Any],
) -> FilenamePart:
"""Get analysis info for file.

Expand All @@ -829,19 +866,23 @@ def get_analysis(
reference_genome_code = item_utils.get_code(reference_genome)
gene_annotation_code = get_annotations_and_versions(gene_annotations)
transcript_info_code = get_rna_seq_tsv_value(file, file_extension)
haplotype_code = get_haplotype_value(file, file_extension, donor_specific_assembly)
chain_code = get_chain_file_value(file, target_assembly, source_assembly, file_extension)
value = get_analysis_value(
software_and_versions,
reference_genome_code,
gene_annotation_code,
transcript_info_code
transcript_info_code,
chain_code,
haplotype_code
)
if file_format_utils.is_chain_file(file_extension):
value = f"{value}{ANALYSIS_INFO_SEPARATOR}{get_chain_file_value(file)}"
errors = get_analysis_errors(
file,
reference_genome_code,
gene_annotation_code,
transcript_info_code,
chain_code,
haplotype_code,
file_extension,
)
if errors:
Expand All @@ -858,6 +899,8 @@ def get_analysis_errors(
reference_genome_code: str,
gene_annotation_code: str,
transcript_info_code: str,
chain_code: str,
haplotype_code: str,
file_extension: Dict[str, Any]
) -> List[str]:
"""Get analysis errors for file by file type."""
Expand All @@ -876,19 +919,24 @@ def get_analysis_errors(
errors.append("No gene annotation code found")
elif file_format_utils.is_tsv_file(file_extension) and not transcript_info_code:
errors.append("No gene or isoform code found")
if file_format_utils.is_chain_file(file_extension):
if not chain_code:
errors.append("No target or source assembly found for chain conversion ")
return errors


def get_analysis_value(
software_and_versions: str,
reference_genome_code: str,
gene_annotation_code: str,
transcript_info_code: str
transcript_info_code: str,
chain_code: str,
haplotype_code: str,
) -> str:
"""Get analysis value for filename."""
to_write = [
string
for string in [software_and_versions, reference_genome_code, gene_annotation_code, transcript_info_code]
for string in [software_and_versions, reference_genome_code, gene_annotation_code, transcript_info_code, chain_code, haplotype_code]
if string
]
return ANALYSIS_INFO_SEPARATOR.join(to_write)
Expand Down Expand Up @@ -951,8 +999,7 @@ def get_annotation_codes_missing_versions(
def get_software_and_versions(software: List[Dict[str, Any]]) -> str:
"""Get software and accompanying versions for file.

Currently only looking for software items with codes, as these are
expected to be the software used for naming.
Currently looking for software items with codes, as these are expected to be the software used for naming.
"""
software_with_codes = get_software_with_codes(software)
if not software_with_codes:
Expand All @@ -979,7 +1026,9 @@ def get_software_with_versions(
return [item for item in software_items if item_utils.get_version(item)]


def get_software_and_versions_string(software_items: List[Dict[str, Any]]) -> str:
def get_software_and_versions_string(
software_items: List[Dict[str, Any]]
) -> str:
"""Get string representation of software and versions."""
sorted_software_items = sorted(software_items, key=item_utils.get_code)
return ANALYSIS_INFO_SEPARATOR.join(
Expand All @@ -1002,11 +1051,36 @@ def get_software_codes_missing_versions(
]


def get_chain_file_value(file: Dict[str, Any]) -> str:
    """Build the conversion-direction label for a chain file.

    The label is the file's source assembly followed by its target
    assembly, joined with CHAIN_FILE_INFO_SEPARATOR.
    """
    # Read target first, then source, keeping the original call order.
    target = supp_file_utils.get_target_assembly(file)
    source = supp_file_utils.get_source_assembly(file)
    direction = (source, target)
    return CHAIN_FILE_INFO_SEPARATOR.join(direction)
def get_chain_file_value(
    file: Dict[str, Any],
    target_assembly: Dict[str, Any],
    source_assembly: Dict[str, Any],
    file_extension: Dict[str, Any]
) -> str:
    """Get genome conversion direction for chain files.

    Returns the source label and target label joined with
    CHAIN_FILE_INFO_SEPARATOR. An assembly that is a donor specific
    assembly is labelled DSA_INFO_VALUE; any other assembly is labelled
    with its code. An empty string is returned when the file extension is
    not a chain file, when either assembly is missing, or when either
    label is empty.

    `file` is accepted as part of the signature but is not read here.
    """
    if not file_format_utils.is_chain_file(file_extension):
        return ""
    if not (target_assembly and source_assembly):
        return ""
    # Target is resolved before source, as in the nested form of this check.
    target_value, source_value = (
        DSA_INFO_VALUE
        if dsa_utils.is_donor_specific_assembly(assembly)
        else item_utils.get_code(assembly)
        for assembly in (target_assembly, source_assembly)
    )
    if not (target_value and source_value):
        return ""
    return CHAIN_FILE_INFO_SEPARATOR.join([source_value, target_value])


def get_haplotype_value(
    file: Dict[str, Any],
    file_extension: Dict[str, Any],
    donor_specific_assembly: Dict[str, Any]
):
    """Get haplotype value for fasta file.

    For a fasta file extension, returns the file's haplotype when it has
    one, otherwise DSA_INFO_VALUE when a donor specific assembly is given.
    In every other case an empty string is returned.
    """
    # NOTE(review): assumes the DSA fallback only applies to fasta files,
    # i.e. it pairs with the haplotype check - confirm against the
    # original indentation.
    if not file_format_utils.is_fasta_file(file_extension):
        return ""
    haplotype = supp_file_utils.get_haplotype(file)
    if haplotype:
        return haplotype
    if donor_specific_assembly:
        return DSA_INFO_VALUE
    return ""


def get_rna_seq_tsv_value(file: Dict[str, Any], file_extension: Dict[str, Any]) -> str:
Expand Down
38 changes: 30 additions & 8 deletions src/encoded/commands/release_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
sample_source as sample_source_utils,
submitted_file as submitted_file_utils,
tissue as tissue_utils,
supplementary_file as supp_file_utils
)
from encoded.item_utils.constants import (
file as file_constants,
Expand Down Expand Up @@ -198,12 +199,15 @@ def get_file_sets_from_file(self) -> List[dict]:
)
mwfrs = ff_utils.search_metadata(search_filter, key=self.key)
if len(mwfrs) != 1:
self.print_error_and_exit(
(
f"Expected exactly one associated MetaWorkflowRun, got"
f" {len(mwfrs)}: {search_filter}"
if not supp_file_utils.is_genome_assembly(self.file) and not supp_file_utils.is_reference_conversion(self.file):
self.print_error_and_exit(
(
f"Expected exactly one associated MetaWorkflowRun, got"
f" {len(mwfrs)}: {search_filter}"
)
)
)
else:
return []
mwfr = mwfrs[0]
file_sets = meta_workflow_run_utils.get_file_sets(mwfr)
# Might need to be more general in the future
Expand Down Expand Up @@ -238,6 +242,7 @@ def prepare(
self.add_release_items_to_patchdict(
self.quality_metrics_zips, "Compressed QC metrics file"
)

self.add_release_items_to_patchdict(self.file_sets, "FileSet")
self.add_release_items_to_patchdict(self.sequencings, "Sequencing")
self.add_release_items_to_patchdict(self.libraries, "Library")
Expand Down Expand Up @@ -354,17 +359,16 @@ def add_file_patchdict(self, dataset: str) -> None:
item_utils.get_accession(file_set) for file_set in self.file_sets
]
annotated_filename_info = self.get_annotated_filename_info()

# Add file to file set and set status to released
patch_body = {
item_constants.UUID: item_utils.get_uuid(self.file),
item_constants.STATUS: item_constants.STATUS_RELEASED,
file_constants.FILE_SETS: file_set_accessions,
file_constants.DATASET: dataset,
file_constants.ACCESS_STATUS: access_status,
file_constants.ANNOTATED_FILENAME: annotated_filename_info.filename,
}

if file_set_accessions:
patch_body[file_constants.FILE_SETS] = file_set_accessions
# Take the extra files from the annotated filename object if available.
# They will have the correct filenames
if annotated_filename_info.patch_dict:
Expand Down Expand Up @@ -454,6 +458,12 @@ def get_access_status(self, dataset: str) -> str:
file_constants.DATA_CATEGORY_SOMATIC_VARIANT_CALLS: (
file_constants.ACCESS_STATUS_OPEN
),
file_constants.DATA_CATEGORY_GENOME_ASSEMBLY: (
file_constants.ACCESS_STATUS_OPEN
),
file_constants.DATA_CATEGORY_GENOME_CONVERSION: (
file_constants.ACCESS_STATUS_OPEN
),
file_constants.DATA_CATEGORY_RNA_QUANTIFICATION: (
file_constants.ACCESS_STATUS_OPEN
)
Expand All @@ -468,6 +478,12 @@ def get_access_status(self, dataset: str) -> str:
file_constants.DATA_CATEGORY_SOMATIC_VARIANT_CALLS: (
file_constants.ACCESS_STATUS_PROTECTED
),
file_constants.DATA_CATEGORY_GENOME_ASSEMBLY: (
file_constants.ACCESS_STATUS_PROTECTED
),
file_constants.DATA_CATEGORY_GENOME_CONVERSION: (
file_constants.ACCESS_STATUS_PROTECTED
),
file_constants.DATA_CATEGORY_RNA_QUANTIFICATION: (
file_constants.ACCESS_STATUS_OPEN
)
Expand All @@ -482,6 +498,12 @@ def get_access_status(self, dataset: str) -> str:
file_constants.DATA_CATEGORY_SOMATIC_VARIANT_CALLS: (
file_constants.ACCESS_STATUS_OPEN
),
file_constants.DATA_CATEGORY_GENOME_ASSEMBLY: (
file_constants.ACCESS_STATUS_PROTECTED
),
file_constants.DATA_CATEGORY_GENOME_CONVERSION: (
file_constants.ACCESS_STATUS_PROTECTED
),
file_constants.DATA_CATEGORY_RNA_QUANTIFICATION: (
file_constants.ACCESS_STATUS_OPEN
)
Expand Down
2 changes: 2 additions & 0 deletions src/encoded/item_utils/constants/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
DATA_CATEGORY_GERMLINE_VARIANT_CALLS = "Germline Variant Calls"
DATA_CATEGORY_SEQUENCING_READS = "Sequencing Reads"
DATA_CATEGORY_SOMATIC_VARIANT_CALLS = "Somatic Variant Calls"
DATA_CATEGORY_GENOME_CONVERSION = "Reference Conversion"
DATA_CATEGORY_GENOME_ASSEMBLY = "Genome Assembly"
DATA_CATEGORY_RNA_QUANTIFICATION = "RNA Quantification"
DATASET = "dataset"
EXTRA_FILES = "extra_files"
Expand Down
8 changes: 7 additions & 1 deletion src/encoded/item_utils/donor_specific_assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,15 @@

from .utils import RequestHandler, get_property_value_from_identifier, get_property_values_from_identifiers
from . import (
file as file_utils
file as file_utils,
item as item_utils
)

def is_donor_specific_assembly(properties: Dict[str, Any]) -> bool:
    """Tell whether the item's type is "DonorSpecificAssembly".

    The type is read with `item_utils.get_type`.
    """
    item_type = item_utils.get_type(properties)
    return item_type == "DonorSpecificAssembly"


def get_file_format_id(request_handler: RequestHandler,identifier: str):
    """Return identifier of file_format for file.

    Delegates to `get_property_value_from_identifier`, passing
    `file_utils.get_file_format` as the property getter for the item
    referenced by `identifier`.
    """
    file_format_getter = file_utils.get_file_format
    return get_property_value_from_identifier(
        request_handler,
        identifier,
        file_format_getter,
    )
Expand Down
4 changes: 4 additions & 0 deletions src/encoded/item_utils/file_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,9 @@ def is_chain_file(properties: Dict[str, Any]) -> bool:
return get_standard_file_extension(properties) in ["chain.gz","chain"]


def is_fasta_file(properties: Dict[str, Any]) -> bool:
    """Tell whether the standard file extension is "fa" or "fasta".

    The extension is read with `get_standard_file_extension`.
    """
    extension = get_standard_file_extension(properties)
    return extension in ("fa", "fasta")


def is_tsv_file(properties: Dict[str, Any]) -> bool:
    """Tell whether the standard file extension is "tsv".

    The extension is read with `get_standard_file_extension`.
    """
    extension = get_standard_file_extension(properties)
    return extension == "tsv"
Loading
Loading