diff --git a/genie/__version__.py b/genie/__version__.py index 59c3246f..e4400ae6 100644 --- a/genie/__version__.py +++ b/genie/__version__.py @@ -1 +1 @@ -__version__ = "12.2.0" +__version__ = "12.3.0" diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index 7056280e..d7259901 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -243,10 +243,12 @@ def configure_maf(mafdf, remove_variants, flagged_variants): # common_variants = mafdf['FILTER'].astype(str).str.contains( # "common_variant", na=False # ) + # Germline Filter gnomad_cols = ["gnomAD_AFR_AF", 'gnomAD_AMR_AF', 'gnomAD_ASJ_AF', 'gnomAD_EAS_AF', 'gnomAD_FIN_AF', 'gnomAD_NFE_AF', 'gnomAD_OTH_AF', 'gnomAD_SAS_AF'] - new_common_variants = mafdf.loc[ + # location of germline variants + common_variants_idx = mafdf.loc[ :, gnomad_cols ].max(axis=1, skipna=True) > 0.0005 @@ -255,22 +257,77 @@ def configure_maf(mafdf, remove_variants, flagged_variants): # Genome Nexus successfully annotated (vcf2maf does not have this column) if mafdf.get("Annotation_Status") is None: mafdf['Annotation_Status'] = "SUCCESS" + # Make sure to only get variants that were successfully annotated success = mafdf['Annotation_Status'] == "SUCCESS" - mafdf = mafdf.loc[(~new_common_variants & + mafdf = mafdf.loc[(~common_variants_idx & ~to_remove_variants & success),] - - fillnas = ['t_depth', 't_ref_count', 't_alt_count', - 'n_depth', 'n_ref_count', 'n_alt_count'] - for col in fillnas: - mafdf[col][mafdf[col].astype(str) == "."] = "" - n_depth_ind = mafdf['n_depth'].astype(str).isin(["NA", "0.0", "0"]) - mafdf['Match_Norm_Seq_Allele2'][n_depth_ind] = '' - mafdf['Match_Norm_Seq_Allele1'][n_depth_ind] = '' + # May not need to do this because these columns are always + # returned as numerical values now + # fillnas = ['t_depth', 't_ref_count', 't_alt_count', + # 'n_depth', 'n_ref_count', 'n_alt_count'] + # for col in fillnas: + # mafdf[col][mafdf[col].astype(str) == "."] = float('nan') + 
n_depth_ind = mafdf['n_depth'].astype(str).isin(["NA", "0.0", "0", 'nan']) + mafdf.loc[n_depth_ind, 'Match_Norm_Seq_Allele2'] = '' + mafdf.loc[n_depth_ind, 'Match_Norm_Seq_Allele1'] = '' + # Calculate missing t_depth, t_ref_count, t_alt_count + t_counts = calculate_missing_variant_counts( + depth=mafdf['t_depth'], alt_count=mafdf['t_alt_count'], + ref_count=mafdf['t_ref_count'] + ) + mafdf['t_depth'] = t_counts['depth'] + mafdf['t_ref_count'] = t_counts['ref_count'] + mafdf['t_alt_count'] = t_counts['alt_count'] + # Calculate missing n_depth, n_ref_count, n_alt_count + n_counts = calculate_missing_variant_counts( + depth=mafdf['n_depth'], alt_count=mafdf['n_alt_count'], + ref_count=mafdf['n_ref_count'] + ) + mafdf['n_depth'] = n_counts['depth'] + mafdf['n_ref_count'] = n_counts['ref_count'] + mafdf['n_alt_count'] = n_counts['alt_count'] return mafdf +def calculate_missing_variant_counts(depth: pd.Series, alt_count: pd.Series, + ref_count: pd.Series) -> dict: + """Calculate missing counts. 
t_depth = t_alt_count + t_ref_count + + Args: + depth: Allele Depth + alt_count: Allele alt counts + ref_count: Allele ref counts + + Returns: + filled in depth, alt_count and ref_count values + + """ + # Avoid SettingWithCopyWarning + depth = depth.copy() + alt_count = alt_count.copy() + ref_count = ref_count.copy() + # t_depth = t_ref_count + t_alt_count + null_depth = depth.isnull() + # The notation null_depth_ref means all the reference values for which + # depth is NA + null_depth_ref = ref_count[null_depth] + null_depth_alt = alt_count[null_depth] + depth.loc[null_depth] = null_depth_ref + null_depth_alt + # t_ref_count = t_depth - t_alt_count + null_ref = ref_count.isnull() + null_ref_depth = depth[null_ref] + null_ref_alt = alt_count[null_ref] + ref_count[null_ref] = null_ref_depth - null_ref_alt + # t_alt_count = t_depth - t_ref_count + null_alt = alt_count.isnull() + null_alt_depth = depth[null_alt] + null_alt_ref = ref_count[null_alt] + alt_count[null_alt] = null_alt_depth - null_alt_ref + return {'depth': depth, 'ref_count': ref_count, 'alt_count': alt_count} + + def runMAFinBED(syn, center_mappingdf, test=False, @@ -675,7 +732,6 @@ def store_maf_files(syn, maf_ent = syn.get(centerMafSynIdsDf.id[0]) headerdf = pd.read_csv(maf_ent.path, sep="\t", comment="#", nrows=0) column_order = headerdf.columns - for _, mafSynId in enumerate(centerMafSynIdsDf.id): maf_ent = syn.get(mafSynId) logger.info(maf_ent.path) @@ -900,9 +956,9 @@ def store_clinical_files(syn, if code.upper() in oncotree_dict.keys() else float('nan') for code in clinicaldf['ONCOTREE_CODE']] - # All cancer types that are null should have null oncotree codes - clinicaldf['ONCOTREE_CODE'][ - clinicaldf['CANCER_TYPE'].isnull()] = float('nan') + # All cancer types that are null contain deprecated oncotree codes + # And should be removed + clinicaldf = clinicaldf[~clinicaldf['CANCER_TYPE'].isnull()] # Suggest using AGE_AT_SEQ_REPORT_DAYS instead so that the # descriptions can match 
clinicaldf['AGE_AT_SEQ_REPORT_DAYS'] = clinicaldf['AGE_AT_SEQ_REPORT'] diff --git a/genie_registry/clinical.py b/genie_registry/clinical.py index 4cbf934f..12a15608 100644 --- a/genie_registry/clinical.py +++ b/genie_registry/clinical.py @@ -1,9 +1,8 @@ """Clinical file format validation and processing""" import datetime -import os +from io import StringIO import logging -import subprocess -import yaml +import os import pandas as pd import synapseclient @@ -373,8 +372,8 @@ def _validate(self, clinicaldf, oncotree_link): Returns: Error message """ - total_error = "" - warning = "" + total_error = StringIO() + warning = StringIO() clinicaldf.columns = [col.upper() for col in clinicaldf.columns] clinicaldf = clinicaldf.fillna("") @@ -405,16 +404,18 @@ def _validate(self, clinicaldf, oncotree_link): process_functions.checkColExist(clinicaldf, sampleId) if not haveSampleColumn: - total_error += \ + total_error.write( "Sample Clinical File: Must have SAMPLE_ID column.\n" + ) else: if sum(clinicaldf[sampleId].duplicated()) > 0: - total_error += ( + total_error.write( "Sample Clinical File: No duplicated SAMPLE_ID " "allowed.\nIf there are no duplicated " "SAMPLE_IDs, and both sample and patient files are " "uploaded, then please check to make sure no duplicated " - "PATIENT_IDs exist in the patient clinical file.\n") + "PATIENT_IDs exist in the patient clinical file.\n" + ) # CHECK: PATIENT_ID patientId = "PATIENT_ID" # #CHECK: PATIENT_ID IN SAMPLE FILE @@ -422,8 +423,9 @@ def _validate(self, clinicaldf, oncotree_link): process_functions.checkColExist(clinicaldf, patientId) if not havePatientColumn: - total_error += \ + total_error.write( "Patient Clinical File: Must have PATIENT_ID column.\n" + ) # CHECK: within the sample file that the sample ids match # the patient ids @@ -435,27 +437,30 @@ def _validate(self, clinicaldf, oncotree_link): for sample, patient in zip(clinicaldf[sampleId], clinicaldf[patientId])]): - total_error += ( + total_error.write( "Sample Clinical 
File: PATIENT_ID's much be contained in " - "the SAMPLE_ID's (ex. SAGE-1 <-> SAGE-1-2)\n") + "the SAMPLE_ID's (ex. SAGE-1 <-> SAGE-1-2)\n" + ) # #CHECK: All samples must have associated patient data # (GENIE requires patient data) if not all(clinicaldf[patientId] != ""): - total_error += ( + total_error.write( "Patient Clinical File: All samples must have associated " "patient information and no null patient ids allowed. " "These samples are missing patient data: {}\n".format( ", ".join(clinicaldf[sampleId][ - clinicaldf[patientId] == ""]))) + clinicaldf[patientId] == ""])) + ) # CHECK: All patients should have associated sample data if not all(clinicaldf[sampleId] != ""): # ## MAKE WARNING FOR NOW### - warning += ( + warning.write( "Sample Clinical File: All patients must have associated " "sample information. These patients are missing sample " "data: {}\n".format( ", ".join(clinicaldf[patientId][ - clinicaldf[sampleId] == ""]))) + clinicaldf[sampleId] == ""])) + ) # CHECK: AGE_AT_SEQ_REPORT age = "AGE_AT_SEQ_REPORT" @@ -473,21 +478,24 @@ def _validate(self, clinicaldf, oncotree_link): if not all([process_functions.checkInt(i) for i in age_seq_report_df[age]]): - total_error += ( + total_error.write( "Sample Clinical File: Please double check your " "AGE_AT_SEQ_REPORT. It must be an integer, 'Unknown', " - "'>32485', '<6570'.\n") + "'>32485', '<6570'.\n" + ) else: age_seq_report_df[age] = age_seq_report_df[age].astype(int) median_age = age_seq_report_df[age].median() if median_age < 100: - total_error += ( + total_error.write( "Sample Clinical File: Please double check your " "AGE_AT_SEQ_REPORT. 
You may be reporting this value " - "in YEARS, please report in DAYS.\n") + "in YEARS, please report in DAYS.\n" + ) else: - total_error += \ + total_error.write( "Sample Clinical File: Must have AGE_AT_SEQ_REPORT column.\n" + ) # CHECK: ONCOTREE_CODE haveColumn = \ @@ -505,13 +513,14 @@ def _validate(self, clinicaldf, oncotree_link): if not all(oncotree_codes.isin(oncotree_mapping['ONCOTREE_CODE'])): unmapped_oncotrees = oncotree_codes[ ~oncotree_codes.isin(oncotree_mapping['ONCOTREE_CODE'])] - total_error += ( + total_error.write( "Sample Clinical File: Please double check that all your " "ONCOTREE CODES exist in the mapping. You have {} samples " "that don't map. These are the codes that " "don't map: {}\n".format( len(unmapped_oncotrees), - ",".join(set(unmapped_oncotrees)))) + ",".join(set(unmapped_oncotrees))) + ) # Should add the SEX mismatch into the dashboard file if process_functions.checkColExist(clinicaldf, "SEX") and \ 'oncotree_mapping_dict' in locals() and \ @@ -546,25 +555,27 @@ def _validate(self, clinicaldf, oncotree_link): wrongCodeSamples.append(sample) if len(wrongCodeSamples) > 0: - warning += ( + warning.write( "Sample Clinical File: Some SAMPLE_IDs have " "conflicting SEX and ONCOTREE_CODES: {}\n".format( - ",".join(wrongCodeSamples))) + ",".join(wrongCodeSamples)) + ) else: - total_error += \ + total_error.write( "Sample Clinical File: Must have ONCOTREE_CODE column.\n" + ) warn, error = process_functions.check_col_and_values( clinicaldf, "SAMPLE_TYPE", sampletype_mapping['CODE'].tolist(), "Sample Clinical File", required=True) - total_error += error + total_error.write(error) # CHECK: SEQ_ASSAY_ID haveColumn = process_functions.checkColExist(clinicaldf, "SEQ_ASSAY_ID") if haveColumn: if not all([i != "" for i in clinicaldf['SEQ_ASSAY_ID']]): - total_error += ( + total_error.write( "Sample Clinical File: Please double check your " "SEQ_ASSAY_ID columns, there are empty rows.\n" ) @@ -580,14 +591,16 @@ def _validate(self, clinicaldf, 
oncotree_link): if not seqassay.upper().startswith(self.center): invalid_seqassay.append(seqassay) if invalid_seqassay: - total_error += ( + total_error.write( "Sample Clinical File: Please make sure your " "SEQ_ASSAY_IDs start with your center " "abbreviation: {}.\n".format( - ", ".join(invalid_seqassay))) + ", ".join(invalid_seqassay)) + ) else: - total_error += \ + total_error.write( "Sample Clinical File: Must have SEQ_ASSAY_ID column.\n" + ) haveColumn = process_functions.checkColExist(clinicaldf, "SEQ_DATE") seq_date_error = ( @@ -608,27 +621,30 @@ def _validate(self, clinicaldf, oncotree_link): seqdate = clinicaldf['SEQ_DATE'][ clinicaldf['SEQ_DATE'] != 'Release'] if sum(clinicaldf['SEQ_DATE'] == '') > 0: - total_error += ( + total_error.write( "Sample Clinical File: Samples without SEQ_DATEs will " - "NOT be released.\n") + "NOT be released.\n" + ) try: if not seqdate.empty: seqdate.apply( lambda date: datetime.datetime.strptime(date, '%b-%Y')) if not seqdate.str.startswith( ("Jan", "Apr", "Jul", "Oct")).all(): - total_error += seq_date_error + total_error.write(seq_date_error) except ValueError: - total_error += seq_date_error + total_error.write(seq_date_error) else: - total_error += "Sample Clinical File: Must have SEQ_DATE column.\n" + total_error.write( + "Sample Clinical File: Must have SEQ_DATE column.\n" + ) # CHECK: BIRTH_YEAR error = _check_year(clinicaldf=clinicaldf, year_col="BIRTH_YEAR", filename="Patient Clinical File", allowed_string_values=['Unknown', '>89', '<18']) - total_error += error + total_error.write(error) # CHECK: YEAR DEATH error = _check_year(clinicaldf=clinicaldf, @@ -636,31 +652,36 @@ def _validate(self, clinicaldf, oncotree_link): filename="Patient Clinical File", allowed_string_values=['Unknown', 'Not Collected', 'Not Applicable', + "Not Released", '>89', '<18']) - total_error += error + total_error.write(error) # CHECK: YEAR CONTACT error = _check_year(clinicaldf=clinicaldf, year_col="YEAR_CONTACT", filename="Patient 
Clinical File", allowed_string_values=['Unknown', 'Not Collected', + "Not Released", '>89', '<18']) - total_error += error + total_error.write(error) # CHECK: INT CONTACT haveColumn = process_functions.checkColExist(clinicaldf, "INT_CONTACT") if haveColumn: if not all([process_functions.checkInt(i) for i in clinicaldf.INT_CONTACT if i not in - ['>32485', '<6570', 'Unknown', 'Not Collected']]): + ['>32485', '<6570', 'Unknown', 'Not Collected', + "Not Released"]]): - total_error += ( + total_error.write( "Patient Clinical File: Please double check your " "INT_CONTACT column, it must be an integer, '>32485', " - "'<6570', 'Unknown' or 'Not Collected'.\n") + "'<6570', 'Unknown', 'Not Released' or 'Not Collected'.\n" + ) else: - total_error += \ + total_error.write( "Patient Clinical File: Must have INT_CONTACT column.\n" + ) # INT DOD haveColumn = process_functions.checkColExist(clinicaldf, "INT_DOD") @@ -668,15 +689,18 @@ def _validate(self, clinicaldf, oncotree_link): if not all([process_functions.checkInt(i) for i in clinicaldf.INT_DOD if i not in ['>32485', '<6570', 'Unknown', - 'Not Collected', 'Not Applicable']]): + 'Not Collected', 'Not Applicable', "Not Released"]]): - total_error += ( + total_error.write( "Patient Clinical File: Please double check your INT_DOD " "column, it must be an integer, '>32485', '<6570', " - "'Unknown', 'Not Collected' or 'Not Applicable'.\n") + "'Unknown', 'Not Collected', 'Not Released' or " + "'Not Applicable'.\n" + ) else: - total_error += \ + total_error.write( "Patient Clinical File: Must have INT_DOD column.\n" + ) haveColumn = process_functions.checkColExist(clinicaldf, "DEAD") if haveColumn: @@ -684,56 +708,58 @@ def _validate(self, clinicaldf, oncotree_link): if not all([ str(i).upper() in ['TRUE', 'FALSE'] for i in clinicaldf.DEAD if i not in - ['Unknown', 'Not Collected']]): - total_error += ( + ['Unknown', 'Not Collected', "Not Released"]]): + total_error.write( "Patient Clinical File: Please double check your " - "DEAD 
column, it must be True, False, 'Unknown' or " - "'Not Collected'.\n") + "DEAD column, it must be True, False, 'Unknown', " + "'Not Released' or 'Not Collected'.\n" + ) else: - total_error += \ + total_error.write( "Patient Clinical File: Must have DEAD column.\n" + ) # CHECK: PRIMARY_RACE warn, error = process_functions.check_col_and_values( clinicaldf, "PRIMARY_RACE", race_mapping['CODE'].tolist(), "Patient Clinical File" ) - warning += warn - total_error += error + warning.write(warn) + total_error.write(error) # CHECK: SECONDARY_RACE warn, error = process_functions.check_col_and_values( clinicaldf, "SECONDARY_RACE", race_mapping['CODE'].tolist(), "Patient Clinical File" ) - warning += warn - total_error += error + warning.write(warn) + total_error.write(error) # CHECK: TERTIARY_RACE warn, error = process_functions.check_col_and_values( clinicaldf, "TERTIARY_RACE", race_mapping['CODE'].tolist(), "Patient Clinical File" ) - warning += warn - total_error += error + warning.write(warn) + total_error.write(error) # CHECK: SEX warn, error = process_functions.check_col_and_values( clinicaldf, "SEX", sex_mapping['CODE'].tolist(), "Patient Clinical File", required=True ) - warning += warn - total_error += error + warning.write(warn) + total_error.write(error) # CHECK: ETHNICITY warn, error = process_functions.check_col_and_values( clinicaldf, "ETHNICITY", ethnicity_mapping['CODE'].tolist(), "Patient Clinical File" ) - warning += warn - total_error += error + warning.write(warn) + total_error.write(error) - return total_error, warning + return total_error.getvalue(), warning.getvalue() def _get_dataframe(self, filePathList): clinicaldf = pd.read_csv(filePathList[0], sep="\t", comment="#") diff --git a/templates/dashboardTemplate.Rmd b/templates/dashboardTemplate.Rmd index 4259ef4e..e3e70981 100644 --- a/templates/dashboardTemplate.Rmd +++ b/templates/dashboardTemplate.Rmd @@ -263,8 +263,10 @@ if (is.null(this_bed)) { this_assays = 
as.character(unique(this_samples$SEQ_ASSAY_ID)) this_mut <- getFileDf("data_mutations_extended.txt", releaseFiles) assay_infodf = getFileDf("assay_information.txt", releaseFiles) -black_list_variants <- synTableQuery("select * from syn18459663", - includeRowIdAndRowVersion = F) +black_list_variants <- synTableQuery( + "select * from syn18459663 where filter_variant is true", + includeRowIdAndRowVersion = F +) black_list_variantsdf = black_list_variants$asDataFrame() # this_cna <- getFileDf("data_CNA.txt", releaseFiles) #this_fus <- getFileDf("data_fusions.txt", releaseFiles) @@ -403,21 +405,25 @@ For patient retractions submitted between these months, the records will be remo --- -## Blacklisted Variants -This is a count of how many blacklisted variants a center has. For instance, it is likely that the BRAF p.V600M mutations are likely false positives. If your center shows up in this table, please investigate your variants. If you have suggestions for variants that should be part of this list, please inform Sage Bionetworks. +## Flagged Mutations +This is a count of how many flagged mutations a center has. Most of these variants are potential +artifacts flagged by manual review of cBioPortal. 
Please inform Sage Bionetworks about: + +* Suggestions for variants that should be part of this list +* Any variant that shouldn't be part of this list ```{r blacklist} blacklist_variants = paste(black_list_variantsdf$Hugo_Symbol, black_list_variantsdf$HGVSp_Short) -subset_mut = this_mut[this_mut$Hugo_Symbol %in% black_list_variantsdf$Hugo_Symbol, - c("Hugo_Symbol","Center","HGVSp_Short")] +subset_mut = this_mut[this_mut$Hugo_Symbol %in% black_list_variantsdf$Hugo_Symbol, ] subset_mut$blacklist = paste(subset_mut$Hugo_Symbol, subset_mut$HGVSp_Short) subset_mut = subset_mut[subset_mut$blacklist %in% blacklist_variants,] -kable(table(subset_mut$Center, subset_mut$blacklist), +kable(table(subset_mut$blacklist, subset_mut$Center), caption = "Blacklist variant count") ``` + --- ## Distribution of Clinical Attributes @@ -426,7 +432,7 @@ Please use the below distributions to access the validity of your clinical data. ### Race Some european sites like GRCC, CRUK, and NKI do not collect race information from their patients. If your institution does not either, you should see empty barplots. -```{r} +```{r race} #Center X Race par(mar = c(10,3,3,1)) plotCenterXRace(this_patient) @@ -435,7 +441,7 @@ plotCenterXRace(this_patient) ### Ethnicity Some european sites like GRCC, CRUK, and NKI do not collect ethnicity information from their patients. If your institution does not either, you should see empty barplots. -```{r} +```{r ethnicity} #Center X Ethnicity par(mar = c(10,3,3,1)) plotCenterXEthnicity(this_patient) @@ -444,7 +450,7 @@ plotCenterXEthnicity(this_patient) ### Sex CRUK submits samples from breast cancer patients, so their samples are all female. -```{r} +```{r sex} #Center X Sex par(mar = c(10,3,3,1)) plotCenterXSex(this_patient) @@ -454,14 +460,14 @@ plotCenterXSex(this_patient) CHOP submits mainly pediatric cases, so their distribution is different from the rest. 
-```{r} +```{r age} #Center X Age par(mar = c(10,3,3,1)) plotCenterXAge(this_samples) ``` -```{r, include=F, echo=F} +```{r seqyear, include=F, echo=F} check_seq_year = "SEQ_YEAR" %in% colnames(this_samples) ``` @@ -474,6 +480,109 @@ check_seq_year = "SEQ_YEAR" %in% colnames(this_samples) +### Cancer Type (Null) +This will show NULL or UNKNOWN cancer types. + +```{r cancertype} +unknown_cancer_type = this_samples$CANCER_TYPE %in% c("UNKNOWN", '') +null_cancer_type = is.na(this_samples$CANCER_TYPE) + +na_cancer_type_subset = this_samples[unknown_cancer_type | null_cancer_type, ] +kable(table(na_cancer_type_subset$CANCER_TYPE, na_cancer_type_subset$CENTER)) +``` + + + +`r if(!is.null(this_samples$YEAR_CONTACT)) { "### Vital Status ('Missing' values)\nThese are the centers with 'missing' vital status values: 'Unknown', 'Not Collected' for:\n\n"}` + +`r if(!is.null(this_samples$YEAR_CONTACT)) { "* YEAR_CONTACT\n"}` + + +```{r yearcontact} +if(!is.null(this_samples$YEAR_CONTACT)) { + missing_vs = this_patient$YEAR_CONTACT %in% c("Unknown", 'Not Collected') + + missing_subset = this_patient[missing_vs, ] + kable(table(missing_subset$YEAR_CONTACT, missing_subset$CENTER)) +} +``` + +`r if(!is.null(this_samples$YEAR_CONTACT)) { "* INT_CONTACT\n"}` + +```{r intcontact} +if(!is.null(this_samples$YEAR_CONTACT)) { + + missing_vs = this_patient$INT_CONTACT %in% c("Unknown", 'Not Collected') + + missing_subset = this_patient[missing_vs, ] + kable(table(missing_subset$INT_CONTACT, missing_subset$CENTER)) +} +``` + +`r if(!is.null(this_samples$YEAR_CONTACT)) { "* YEAR_DEATH\n"}` + +```{r yeardeath} +if(!is.null(this_samples$YEAR_CONTACT)) { + + missing_vs = this_patient$YEAR_DEATH %in% c("Unknown", 'Not Collected') + + missing_subset = this_patient[missing_vs, ] + kable(table(missing_subset$YEAR_DEATH, missing_subset$CENTER)) +} +``` + +`r if(!is.null(this_samples$YEAR_CONTACT)) { "* INT_DOD\n"}` + +```{r intdod} +if(!is.null(this_samples$YEAR_CONTACT)) { + + missing_vs = 
this_patient$INT_DOD %in% c("Unknown", 'Not Collected') + + missing_subset = this_patient[missing_vs, ] + kable(table(missing_subset$INT_DOD, missing_subset$CENTER)) +} +``` + +`r if(!is.null(this_samples$YEAR_CONTACT)) { "* DEAD\n"}` + +```{r dead} +if(!is.null(this_samples$YEAR_CONTACT)) { + + missing_vs = this_patient$DEAD %in% c("Unknown", 'Not Collected') + + missing_subset = this_patient[missing_vs, ] + kable(table(missing_subset$DEAD, missing_subset$CENTER)) +} +``` + +--- + +## Missing Variant Counts +These are the sites with missing `t_depth`, `t_alt_count` and `t_ref_count` values. +```{r variantcounts, echo=F} +missing_t_depth_idx = is.na(this_mut$t_depth) +missing_t_ref_count_idx = is.na(this_mut$t_ref_count) +missing_t_alt_count_idx = is.na(this_mut$t_alt_count) + +missing_t_depth = table(this_mut$Center[missing_t_depth_idx]) +missing_t_ref_count = table(this_mut$Center[missing_t_ref_count_idx]) +missing_t_alt_count = table(this_mut$Center[missing_t_alt_count_idx]) +centers = unique(c(names(missing_t_depth), + names(missing_t_ref_count), + names(missing_t_alt_count))) +missing_counts = matrix(data=NA, nrow=length(centers), ncol=3, + dimnames = list(centers, + c("t_depth", "t_alt_count", + "t_ref_count"))) + + +missing_counts[names(missing_t_depth), 't_depth'] = missing_t_depth +missing_counts[names(missing_t_ref_count), 't_ref_count'] = missing_t_ref_count +missing_counts[names(missing_t_alt_count), 't_alt_count'] = missing_t_alt_count +missing_counts[is.na(missing_counts)] = 0 +kable(missing_counts) +``` + --- ## Top 5 most frequently mutated genes per pipeline @@ -501,13 +610,13 @@ filter_maf <- function(mafdf) { # Remove 5'Flank unless it is TERT mafdf[!flank_classification | tert_mutations, ] } -this_mut <- filter_maf(this_mut) +filtered_mafdf <- filter_maf(this_mut) ``` ```{r top_5_mutated, echo=FALSE} -mergeddf = merge.data.frame(this_mut[,c("Tumor_Sample_Barcode", "Hugo_Symbol")], +mergeddf = 
merge.data.frame(filtered_mafdf[,c("Tumor_Sample_Barcode", "Hugo_Symbol")], this_samples[,c("SAMPLE_ID", "SEQ_ASSAY_ID")], by.x = "Tumor_Sample_Barcode", by.y = "SAMPLE_ID") @@ -682,3 +791,89 @@ if (length(largePanels) > 1) { plotPanelOverlap(this_bed, largePanels) } ``` + +--- + +## Possible Non-center Related Data Issues + +This section includes QC issues that are mostly related to the Sage Bionetworks pipeline, Genome Nexus or _maybe_ center-related issues. +If empty, nothing is flagged. + +```{r, include=F, echo=F} +variant <- paste0(this_mut$Tumor_Sample_Barcode, "_", + this_mut$Chromosome, ":", + this_mut$Start_Position, "_", + this_mut$Reference_Allele, "/", + this_mut$Tumor_Seq_Allele2) +duplicated_idx = duplicated(variant) +duplicated_variants = sum(duplicated_idx) > 0 +``` + +`r if(duplicated_variants) {"---\n\n### Duplicated Variants\nThese duplicated variants are caused by the annotation pipeline as no duplicated variants are allowed in the maf or vcfs."}` + +```{r duplicated, echo=F} +if (duplicated_variants) { + kable(this_mut[duplicated_idx, c("Chromosome", "Start_Position", + "Reference_Allele", + "Tumor_Seq_Allele2", + "Tumor_Sample_Barcode")], + row.names=F) +} +``` + +```{r nonsomatic, echo=F, include=F} +non_somatic <- this_mut[this_mut$Reference_Allele==this_mut$Tumor_Seq_Allele2 & + this_mut$Reference_Allele==this_mut$Tumor_Seq_Allele1,] +if (nrow(non_somatic) > 0) { + write.csv(non_somatic[, c("Hugo_Symbol", "Chromosome", "Reference_Allele", + "Tumor_Seq_Allele1", "Tumor_Seq_Allele2", + "Tumor_Sample_Barcode")], + "non_somatic.csv", + row.names=F, quote=F) + non_somatic_ent = synStore(File("non_somatic.csv", parent=release_synid)) + unlink("non_somatic.csv") +} + +``` + +`r if(nrow(non_somatic) > 0) {paste0("---\n\n### Non-somatic Mutations\nThese mutations should be removed from the data.: ", "https://www.synapse.org/#!Synapse:", non_somatic_ent$properties$id)}` + + +```{r invalid_variant_type, echo=F, include=F} +is_dnp <- 
nchar(this_mut$Reference_Allele) == nchar(this_mut$Tumor_Seq_Allele2) & + nchar(this_mut$Reference_Allele)==2 & this_mut$Variant_Type == "DNP" +not_dnp <- this_mut[ + substr(this_mut$Reference_Allele[is_dnp], 1, 1) == substr(this_mut$Tumor_Seq_Allele2[is_dnp], 1, 1) | + substr(this_mut$Reference_Allele[is_dnp], 1, 2) == substr(this_mut$Tumor_Seq_Allele2[is_dnp], 1, 2), +] +if (nrow(not_dnp) > 0) { + write.csv(not_dnp[,c("Hugo_Symbol", "Chromosome", "Reference_Allele", + "Tumor_Seq_Allele2", "Tumor_Sample_Barcode", + "Variant_Type")], + "snv_as_dnp.csv", + row.names=F, quote=F) + dnp_ent = synStore(File("snv_as_dnp.csv", parent=release_synid)) + unlink("snv_as_dnp.csv") +} + +is_onp <- nchar(this_mut$Reference_Allele) == nchar(this_mut$Tumor_Seq_Allele2) & + nchar(this_mut$Reference_Allele) == 3 & this_mut$Variant_Type == "ONP" +not_onp <- this_mut[ + substr(this_mut$Reference_Allele[is_onp], 1, 1) == substr(this_mut$Tumor_Seq_Allele2[is_onp], 1, 1) | + substr(this_mut$Reference_Allele[is_onp], 1, 2) == substr(this_mut$Tumor_Seq_Allele2[is_onp], 1, 2), +] + +if (nrow(not_onp) > 0) { + write.csv(not_onp[, c("Hugo_Symbol", "Chromosome", "Reference_Allele", + "Tumor_Seq_Allele2", "Tumor_Sample_Barcode", + "Variant_Type")], + "snv_as_onp.csv", row.names=F, quote=F) + onp_ent = synStore(File("snv_as_onp.csv", parent=release_synid)) + unlink("snv_as_onp.csv") +} + +``` + +`r if(nrow(not_dnp) > 0) {paste0("---\n\n### SNV labelled as DNP\nThese are variants that have variant type DNP that should be SNV: ", "https://www.synapse.org/#!Synapse:", dnp_ent$properties$id)}` + +`r if(nrow(not_onp) > 0) {paste0("---\n\n### SNV labelled as ONP\nThese are variants that have variant type ONP that should be SNV: ", "https://www.synapse.org/#!Synapse:", onp_ent$properties$id)}` diff --git a/templates/data_guide_template.Rnw b/templates/data_guide_template.Rnw index 145682e1..ec3ebb6c 100644 --- a/templates/data_guide_template.Rnw +++ b/templates/data_guide_template.Rnw @@ -4,9 +4,8 @@ 
\usepackage{longtable} \usepackage{tocloft} \usepackage{amssymb} -\usepackage{blindtext} - \cftpagenumbersoff{section} - \cftpagenumbersoff{subsection} +\cftpagenumbersoff{section} +\cftpagenumbersoff{subsection} \usepackage{color} %May be necessary if you want to color links \usepackage{hyperref} \usepackage{float} diff --git a/tests/test_clinical.py b/tests/test_clinical.py index 0223d379..3af80b1e 100644 --- a/tests/test_clinical.py +++ b/tests/test_clinical.py @@ -406,18 +406,20 @@ def test_nonull__validate(): "'Unknown', '>89', '<18'.\n" "Patient Clinical File: Please double check your YEAR_DEATH " "column, it must be an integer in YYYY format <= {year} or " - "'Unknown', 'Not Collected', 'Not Applicable', '>89', '<18'.\n" + "'Unknown', 'Not Collected', 'Not Applicable', " + "'Not Released', '>89', '<18'.\n" "Patient Clinical File: Please double check your YEAR_CONTACT " "column, it must be an integer in YYYY format <= {year} or " - "'Unknown', 'Not Collected', '>89', '<18'.\n" + "'Unknown', 'Not Collected', 'Not Released', '>89', '<18'.\n" "Patient Clinical File: Please double check your INT_CONTACT " - "column, it must be an integer, '>32485', '<6570', 'Unknown' " - "or 'Not Collected'.\n" + "column, it must be an integer, '>32485', '<6570', 'Unknown', " + "'Not Released' or 'Not Collected'.\n" "Patient Clinical File: Please double check your INT_DOD " "column, it must be an integer, '>32485', '<6570', 'Unknown', " - "'Not Collected' or 'Not Applicable'.\n" + "'Not Collected', 'Not Released' or 'Not Applicable'.\n" "Patient Clinical File: Please double check your DEAD column, " - "it must be True, False, 'Unknown' or 'Not Collected'.\n" + "it must be True, False, 'Unknown', " + "'Not Released' or 'Not Collected'.\n" "Patient Clinical File: Please double check your PRIMARY_RACE " "column. 
This column must only be these values: 1, 2, 3, 4, 99\n" "Patient Clinical File: Please double check your SECONDARY_RACE " @@ -427,7 +429,8 @@ def test_nonull__validate(): "Patient Clinical File: Please double check your SEX column. " "This column must only be these values: 1, 2, 99\n" "Patient Clinical File: Please double check your ETHNICITY " - "column. This column must only be these values: 1, 2, 3, 4, 99\n") + "column. This column must only be these values: 1, 2, 3, 4, 99\n" + ) assert error == expected_errors.format( year=datetime.datetime.utcnow().year ) @@ -537,18 +540,20 @@ def test_errors__validate(): "'Unknown', '>89', '<18'.\n" "Patient Clinical File: Please double check your YEAR_DEATH " "column, it must be an integer in YYYY format <= {year} or " - "'Unknown', 'Not Collected', 'Not Applicable', '>89', '<18'.\n" + "'Unknown', 'Not Collected', 'Not Applicable', " + "'Not Released', '>89', '<18'.\n" "Patient Clinical File: Please double check your YEAR_CONTACT " "column, it must be an integer in YYYY format <= {year} or " - "'Unknown', 'Not Collected', '>89', '<18'.\n" + "'Unknown', 'Not Collected', 'Not Released', '>89', '<18'.\n" "Patient Clinical File: Please double check your INT_CONTACT " - "column, it must be an integer, '>32485', '<6570', 'Unknown' or " - "'Not Collected'.\n" + "column, it must be an integer, '>32485', '<6570', 'Unknown', " + "'Not Released' or 'Not Collected'.\n" "Patient Clinical File: Please double check your INT_DOD column, " "it must be an integer, '>32485', '<6570', 'Unknown', " - "'Not Collected' or 'Not Applicable'.\n" + "'Not Collected', 'Not Released' or 'Not Applicable'.\n" "Patient Clinical File: Please double check your DEAD column, " - "it must be True, False, 'Unknown' or 'Not Collected'.\n" + "it must be True, False, 'Unknown', " + "'Not Released' or 'Not Collected'.\n" "Patient Clinical File: Please double check your PRIMARY_RACE " "column. 
This column must only be these values: 1, 2, 3, 4, 99\n" "Patient Clinical File: Please double check your SECONDARY_RACE " @@ -558,7 +563,8 @@ def test_errors__validate(): "Patient Clinical File: Please double check your SEX column. " "This column must only be these values: 1, 2, 99\n" "Patient Clinical File: Please double check your ETHNICITY " - "column. This column must only be these values: 1, 2, 3, 4, 99\n") + "column. This column must only be these values: 1, 2, 3, 4, 99\n" + ) expectedWarnings = ( "Sample Clinical File: All patients must have associated sample " "information. These patients are missing sample data: ID6\n"