Skip to content

Commit

Permalink
Release v12.3.0 (#391)
Browse files Browse the repository at this point in the history
* Make consortium releases = public releases, add dashboard plots/tables (#385)

* Remove null oncotree codes in consortium release

* Filter out flagged mutations

* Add to dashboard plot

* Correct queries

* Subset

* Update description

* Add figure

* Fix

* Fix

* Add to dashboard

* Fix text

* Add variant counts

* Add index

* Add non somatic mutation check

* Add headers

* Add SNVs that were annotated as DNP

* Fix

* Store as file

* Fix

* Fix

* Fix text

* Add spacing

* update

* Add

* Only upload subset

* Add in files to release

* Don't filter out blacklist variants

* Use mutation file

* Correct

* Edit

* Fix

* Fix

* Update genie/database_to_staging.py

* Fix columns

* Add

* Remove comments

* Push

* Add 'Not released' value (#389)

* Add Not released value

* Use stringIO, fix tests

* Calculate missing counts (#388)

* Calculate missing counts

* Fix

* Update genie/database_to_staging.py

* Add function to calculate missing variant counts

* No need for blindtext

* Fix

* Shuffle sections

* Fix

* Fix missing variant counts

* Fix section

* Fix

* Fix headers
  • Loading branch information
thomasyu888 authored Feb 10, 2021
1 parent 7536c4f commit 7be3db0
Show file tree
Hide file tree
Showing 6 changed files with 390 additions and 108 deletions.
2 changes: 1 addition & 1 deletion genie/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "12.2.0"
__version__ = "12.3.0"
84 changes: 70 additions & 14 deletions genie/database_to_staging.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,10 +243,12 @@ def configure_maf(mafdf, remove_variants, flagged_variants):
# common_variants = mafdf['FILTER'].astype(str).str.contains(
# "common_variant", na=False
# )
# Germline Filter
gnomad_cols = ["gnomAD_AFR_AF", 'gnomAD_AMR_AF', 'gnomAD_ASJ_AF',
'gnomAD_EAS_AF', 'gnomAD_FIN_AF', 'gnomAD_NFE_AF',
'gnomAD_OTH_AF', 'gnomAD_SAS_AF']
new_common_variants = mafdf.loc[
# location of germline variants
common_variants_idx = mafdf.loc[
:, gnomad_cols
].max(axis=1, skipna=True) > 0.0005

Expand All @@ -255,22 +257,77 @@ def configure_maf(mafdf, remove_variants, flagged_variants):
# Genome Nexus successfully annotated (vcf2maf does not have this column)
if mafdf.get("Annotation_Status") is None:
mafdf['Annotation_Status'] = "SUCCESS"
# Make sure to only get variants that were successfully annotated
success = mafdf['Annotation_Status'] == "SUCCESS"

mafdf = mafdf.loc[(~new_common_variants &
mafdf = mafdf.loc[(~common_variants_idx &
~to_remove_variants & success),]

fillnas = ['t_depth', 't_ref_count', 't_alt_count',
'n_depth', 'n_ref_count', 'n_alt_count']
for col in fillnas:
mafdf[col][mafdf[col].astype(str) == "."] = ""
n_depth_ind = mafdf['n_depth'].astype(str).isin(["NA", "0.0", "0"])
mafdf['Match_Norm_Seq_Allele2'][n_depth_ind] = ''
mafdf['Match_Norm_Seq_Allele1'][n_depth_ind] = ''
# May not need to do this because these columns are always
# returned as numerical values now
# fillnas = ['t_depth', 't_ref_count', 't_alt_count',
# 'n_depth', 'n_ref_count', 'n_alt_count']
# for col in fillnas:
# mafdf[col][mafdf[col].astype(str) == "."] = float('nan')
n_depth_ind = mafdf['n_depth'].astype(str).isin(["NA", "0.0", "0", 'nan'])
mafdf.loc[n_depth_ind, 'Match_Norm_Seq_Allele2'] = ''
mafdf.loc[n_depth_ind, 'Match_Norm_Seq_Allele1'] = ''
# Calculate missing t_depth, t_ref_count, t_alt_count
t_counts = calculate_missing_variant_counts(
depth=mafdf['t_depth'], alt_count=mafdf['t_alt_count'],
ref_count=mafdf['t_ref_count']
)
mafdf['t_depth'] = t_counts['depth']
mafdf['t_ref_count'] = t_counts['ref_count']
mafdf['t_alt_count'] = t_counts['alt_count']
# Calculate missing n_depth, n_ref_count, n_alt_count
n_counts = calculate_missing_variant_counts(
depth=mafdf['n_depth'], alt_count=mafdf['n_alt_count'],
ref_count=mafdf['n_ref_count']
)
mafdf['n_depth'] = n_counts['depth']
mafdf['n_ref_count'] = n_counts['ref_count']
mafdf['n_alt_count'] = n_counts['alt_count']

return mafdf


def calculate_missing_variant_counts(depth: pd.Series, alt_count: pd.Series,
                                     ref_count: pd.Series) -> dict:
    """Fill in missing allele counts using depth = ref_count + alt_count.

    The fills run in a fixed order (depth, then ref_count, then alt_count),
    so a value filled in an earlier step is used by the later ones.
    The series passed in are copied and never modified.

    Args:
        depth: Allele depth
        alt_count: Allele alt counts
        ref_count: Allele ref counts

    Returns:
        dict with keys 'depth', 'ref_count' and 'alt_count' mapping to the
        filled in series
    """
    # Work on private copies so the caller's data is untouched (this also
    # avoids SettingWithCopyWarning when the inputs are DataFrame columns)
    filled_depth = depth.copy()
    filled_alt = alt_count.copy()
    filled_ref = ref_count.copy()

    # Step 1: depth = ref_count + alt_count wherever depth is missing
    missing = filled_depth.isnull()
    filled_depth.loc[missing] = filled_ref[missing] + filled_alt[missing]

    # Step 2: ref_count = depth - alt_count wherever ref_count is missing
    missing = filled_ref.isnull()
    filled_ref[missing] = filled_depth[missing] - filled_alt[missing]

    # Step 3: alt_count = depth - ref_count wherever alt_count is missing
    missing = filled_alt.isnull()
    filled_alt[missing] = filled_depth[missing] - filled_ref[missing]

    return {
        'depth': filled_depth,
        'ref_count': filled_ref,
        'alt_count': filled_alt,
    }


def runMAFinBED(syn,
center_mappingdf,
test=False,
Expand Down Expand Up @@ -675,7 +732,6 @@ def store_maf_files(syn,
maf_ent = syn.get(centerMafSynIdsDf.id[0])
headerdf = pd.read_csv(maf_ent.path, sep="\t", comment="#", nrows=0)
column_order = headerdf.columns

for _, mafSynId in enumerate(centerMafSynIdsDf.id):
maf_ent = syn.get(mafSynId)
logger.info(maf_ent.path)
Expand Down Expand Up @@ -900,9 +956,9 @@ def store_clinical_files(syn,
if code.upper() in oncotree_dict.keys() else float('nan')
for code in clinicaldf['ONCOTREE_CODE']]

# All cancer types that are null should have null oncotree codes
clinicaldf['ONCOTREE_CODE'][
clinicaldf['CANCER_TYPE'].isnull()] = float('nan')
# All cancer types that are null contain deprecated oncotree codes
# And should be removed
clinicaldf = clinicaldf[~clinicaldf['CANCER_TYPE'].isnull()]
# Suggest using AGE_AT_SEQ_REPORT_DAYS instead so that the
# descriptions can match
clinicaldf['AGE_AT_SEQ_REPORT_DAYS'] = clinicaldf['AGE_AT_SEQ_REPORT']
Expand Down
Loading

0 comments on commit 7be3db0

Please sign in to comment.