Release v12.3.0 (#391)

* Make consortium releases = public releases, add dashboard plots/tables (#385) * Remove null oncotree codes in consortium release * Filter out flagged mutations * Add to dashboard plot * Correct queries * Subset * Update description * Add figure * Fix * Fix * Add to dashboard * Fix text * Add variant counts * Add index * Add non somatic mutation check * Add headers * Add SNVs that were annotated as DNP * Fix * Store as file * Fix * Fix * Fix text * Add spacing * update * Add * Only upload subset * Add in files to release * Don't filter out blacklist variants * Use mutation file * Correct * Edit * Fix * Fix * Update genie/database_to_staging.py * Fix columns * Add * Remove comments * Push * Add 'Not released' value (#389) * Add Not released value * Use stringIO, fix tests * Calculate missing counts (#388) * Calculate missing counts * Fix * Update genie/database_to_staging.py * Add function to calculate missing variant counts * No need for blindtext * Fix * Shuffle sections * Fix * Fix missing variant counts * Fix section * Fix * Fix headers
Sage-Bionetworks · Feb 10, 2021 · 7be3db0 · 7be3db0
1 parent 7536c4f
commit 7be3db0
Show file tree

Hide file tree

Showing 6 changed files with 390 additions and 108 deletions.
diff --git a/genie/__version__.py b/genie/__version__.py
@@ -1 +1 @@
-__version__ = "12.2.0"
+__version__ = "12.3.0"
diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py
@@ -243,10 +243,12 @@ def configure_maf(mafdf, remove_variants, flagged_variants):
     # common_variants = mafdf['FILTER'].astype(str).str.contains(
     #     "common_variant", na=False
     # )
+    # Germline Filter
     gnomad_cols = ["gnomAD_AFR_AF", 'gnomAD_AMR_AF', 'gnomAD_ASJ_AF',
                    'gnomAD_EAS_AF', 'gnomAD_FIN_AF', 'gnomAD_NFE_AF',
                    'gnomAD_OTH_AF', 'gnomAD_SAS_AF']
-    new_common_variants = mafdf.loc[
+    # location of germline variants
+    common_variants_idx = mafdf.loc[
         :, gnomad_cols
     ].max(axis=1, skipna=True) > 0.0005
 
@@ -255,22 +257,77 @@ def configure_maf(mafdf, remove_variants, flagged_variants):
     # Genome Nexus successfully annotated (vcf2maf does not have this column)
     if mafdf.get("Annotation_Status") is None:
         mafdf['Annotation_Status'] = "SUCCESS"
+    # Make sure to only get variants that were successfully annotated
     success = mafdf['Annotation_Status'] == "SUCCESS"
 
-    mafdf = mafdf.loc[(~new_common_variants &
+    mafdf = mafdf.loc[(~common_variants_idx &
                        ~to_remove_variants & success),]
-
-    fillnas = ['t_depth', 't_ref_count', 't_alt_count',
-               'n_depth', 'n_ref_count', 'n_alt_count']
-    for col in fillnas:
-        mafdf[col][mafdf[col].astype(str) == "."] = ""
-    n_depth_ind = mafdf['n_depth'].astype(str).isin(["NA", "0.0", "0"])
-    mafdf['Match_Norm_Seq_Allele2'][n_depth_ind] = ''
-    mafdf['Match_Norm_Seq_Allele1'][n_depth_ind] = ''
+    # May not need to do this because these columns are always
+    # returned as numerical values now
+    # fillnas = ['t_depth', 't_ref_count', 't_alt_count',
+    #            'n_depth', 'n_ref_count', 'n_alt_count']
+    # for col in fillnas:
+    #     mafdf[col][mafdf[col].astype(str) == "."] = float('nan')
+    n_depth_ind = mafdf['n_depth'].astype(str).isin(["NA", "0.0", "0", 'nan'])
+    mafdf.loc[n_depth_ind, 'Match_Norm_Seq_Allele2'] = ''
+    mafdf.loc[n_depth_ind, 'Match_Norm_Seq_Allele1'] = ''
+    # Calculate missing t_depth, t_ref_count, t_alt_count
+    t_counts = calculate_missing_variant_counts(
+        depth=mafdf['t_depth'], alt_count=mafdf['t_alt_count'],
+        ref_count=mafdf['t_ref_count']
+    )
+    mafdf['t_depth'] = t_counts['depth']
+    mafdf['t_ref_count'] = t_counts['ref_count']
+    mafdf['t_alt_count'] = t_counts['alt_count']
+    # Calculate missing n_depth, n_ref_count, n_alt_count
+    n_counts = calculate_missing_variant_counts(
+        depth=mafdf['n_depth'], alt_count=mafdf['n_alt_count'],
+        ref_count=mafdf['n_ref_count']
+    )
+    mafdf['n_depth'] = n_counts['depth']
+    mafdf['n_ref_count'] = n_counts['ref_count']
+    mafdf['n_alt_count'] = n_counts['alt_count']
 
     return mafdf
 
 
+def calculate_missing_variant_counts(depth: pd.Series, alt_count: pd.Series,
+                                     ref_count: pd.Series) -> dict:
+    """Fill in missing counts using depth = alt_count + ref_count.
+
+    Args:
+        depth: Allele depth; nulls are filled with ref_count + alt_count
+        alt_count: Alt allele counts; nulls are filled with depth - ref_count
+        ref_count: Ref allele counts; nulls are filled with depth - alt_count
+
+    Returns:
+        dict with keys 'depth', 'ref_count' and 'alt_count' (filled copies)
+
+    """
+    # Work on copies of the inputs to avoid SettingWithCopyWarning
+    depth = depth.copy()
+    alt_count = alt_count.copy()
+    ref_count = ref_count.copy()
+    # Step 1: depth = ref_count + alt_count, only where depth is null
+    null_depth = depth.isnull()
+    # The notation null_depth_ref means all the reference values for which
+    # depth is null (likewise null_depth_alt for the alt values)
+    null_depth_ref = ref_count[null_depth]
+    null_depth_alt = alt_count[null_depth]
+    depth.loc[null_depth] = null_depth_ref + null_depth_alt
+    # Step 2: ref_count = depth - alt_count (uses the depth filled above)
+    null_ref = ref_count.isnull()
+    null_ref_depth = depth[null_ref]
+    null_ref_alt = alt_count[null_ref]
+    ref_count[null_ref] = null_ref_depth - null_ref_alt
+    # Step 3: alt_count = depth - ref_count (uses the values filled above)
+    null_alt = alt_count.isnull()
+    null_alt_depth = depth[null_alt]
+    null_alt_ref = ref_count[null_alt]
+    alt_count[null_alt] = null_alt_depth - null_alt_ref
+    return {'depth': depth, 'ref_count': ref_count, 'alt_count': alt_count}
+
+
 def runMAFinBED(syn,
                 center_mappingdf,
                 test=False,
@@ -675,7 +732,6 @@ def store_maf_files(syn,
     maf_ent = syn.get(centerMafSynIdsDf.id[0])
     headerdf = pd.read_csv(maf_ent.path, sep="\t", comment="#", nrows=0)
     column_order = headerdf.columns
-
     for _, mafSynId in enumerate(centerMafSynIdsDf.id):
         maf_ent = syn.get(mafSynId)
         logger.info(maf_ent.path)
@@ -900,9 +956,9 @@ def store_clinical_files(syn,
         if code.upper() in oncotree_dict.keys() else float('nan')
         for code in clinicaldf['ONCOTREE_CODE']]
 
-    # All cancer types that are null should have null oncotree codes
-    clinicaldf['ONCOTREE_CODE'][
-        clinicaldf['CANCER_TYPE'].isnull()] = float('nan')
+    # All cancer types that are null contain deprecated oncotree codes
+    # And should be removed
+    clinicaldf = clinicaldf[~clinicaldf['CANCER_TYPE'].isnull()]
     # Suggest using AGE_AT_SEQ_REPORT_DAYS instead so that the
     # descriptions can match
     clinicaldf['AGE_AT_SEQ_REPORT_DAYS'] = clinicaldf['AGE_AT_SEQ_REPORT']