diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..1fd11da Binary files /dev/null and b/.DS_Store differ diff --git a/docker/.DS_Store b/docker/.DS_Store new file mode 100644 index 0000000..6c0f9e6 Binary files /dev/null and b/docker/.DS_Store differ diff --git a/docker/bcftools/Dockerfile b/docker/bcftools/Dockerfile new file mode 100644 index 0000000..678311d --- /dev/null +++ b/docker/bcftools/Dockerfile @@ -0,0 +1,34 @@ +# The Germline Genomics of Cancer (G2C) +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 + +# Simple Dockerfile for vanallenlab/bcftools + +#FROM ubuntu:22.04 +#FROM ubuntu:focal-20220531 +FROM continuumio/miniconda3 +MAINTAINER "Noah Fields " + + + + +# Install bcftools dependencies +RUN apt-get update && \ + apt-get install -y gcc make zlib1g-dev libbz2-dev liblzma-dev \ + libcurl4-openssl-dev libssl-dev libncurses5-dev && \ + apt-get clean + +# Install bcftools +#ARG BCFTOOLS_VERISON=1.19 +RUN wget https://github.com/samtools/bcftools/releases/download/1.19/bcftools-1.19.tar.bz2 && \ + tar -xvf bcftools-1.19.tar.bz2 && \ + cd bcftools-1.19 && \ + ./configure --prefix=/ && \ + make && \ + make install + +RUN rm -r bcftools-1.19 +RUN rm bcftools-1.19.tar.bz2 + +# Launch bash at runtime +CMD ["/bin/bash"] diff --git a/docker/charr/.DS_Store b/docker/charr/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/docker/charr/.DS_Store differ diff --git a/docker/charr/Dockerfile b/docker/charr/Dockerfile new file mode 100644 index 0000000..cbc62fe --- /dev/null +++ b/docker/charr/Dockerfile @@ -0,0 +1,58 @@ +# The Germline Genomics of Cancer (G2C) +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 + +# Simple Dockerfile for vanallenlab/g2c_pipeline:nf_graf_plink_alpha + +#FROM ubuntu:22.04 +#FROM ubuntu:focal-20220531 +FROM continuumio/miniconda3 +MAINTAINER "Noah Fields " + +# Create a conda environment and activate it +#RUN conda create --name myenv python=3.11 && \ +# conda activate myenv + + +# Install make, zlib, file, and git +RUN apt-get clean && \ + apt-get -qqy update && \ + apt-get -qqy install --fix-missing \ + apt-utils \ + default-libmysqlclient-dev \ + curl \ + vim \ + wget + +#RUN apt install -y python3 + + +# Clone NCBI Grafpop +RUN wget https://github.com/HTGenomeAnalysisUnit/SCE-VCF/releases/download/v0.1.2/sceVCF + +RUN chmod +x sceVCF +RUN mv sceVCF /bin/sceVCF + +RUN wget -O /bin/echtvar https://github.com/brentp/echtvar/releases/download/v0.1.9/echtvar \ + && chmod +x /bin/echtvar + +# Install bcftools dependencies +RUN apt-get update && \ + apt-get install -y gcc make zlib1g-dev libbz2-dev liblzma-dev \ + libcurl4-openssl-dev libssl-dev libncurses5-dev && \ + apt-get clean + +# Install bcftools +#ARG BCFTOOLS_VERISON=1.19 +RUN wget https://github.com/samtools/bcftools/releases/download/1.19/bcftools-1.19.tar.bz2 && \ + tar -xvf bcftools-1.19.tar.bz2 && \ + cd bcftools-1.19 && \ + ./configure --prefix=/ && \ + make && \ + make install + +RUN rm -r bcftools-1.19 +RUN rm bcftools-1.19.tar.bz2 + +# Launch bash at runtime +CMD ["/bin/bash"] diff --git a/docker/g2c-vep/.DS_Store b/docker/g2c-vep/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/docker/g2c-vep/.DS_Store differ diff --git a/docker/g2c_pipeline/Dockerfile b/docker/g2c_pipeline/Dockerfile index aee66cb..654152f 100644 --- a/docker/g2c_pipeline/Dockerfile +++ b/docker/g2c_pipeline/Dockerfile @@ -9,9 +9,7 @@ # It is based on the primary sv-pipeline image from 
GATK-SV, which is available here: # us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline -# And can be reviewed in more detail here: -# https://github.com/broadinstitute/gatk-sv/blob/main/dockerfiles/sv-pipeline/Dockerfile -ARG BASE_IMAGE=sv-pipeline:latest +ARG BASE_IMAGE=us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3 FROM $BASE_IMAGE as base # Aliases for convenience @@ -27,5 +25,9 @@ RUN cd /opt && \ cd pancan_germline_wgs && \ git checkout $REPO_HASH +# Install missing R libraries as necessary +RUN Rscript -e 'install.packages("argparse", repos="https://cran.rstudio.com")' + + # Launch bash as entrypoint CMD ["/bin/bash"] diff --git a/docker/g2c_ufc/Dockerfile b/docker/g2c_ufc/Dockerfile new file mode 100644 index 0000000..956fecc --- /dev/null +++ b/docker/g2c_ufc/Dockerfile @@ -0,0 +1,6 @@ +FROM condaforge/mambaforge:22.11.1-4 AS build +ENV CONDA_SUBDIR=linux-64 +RUN conda config --env --set subdir linux-64 && \ + mamba update --all && \ + mamba install pandas seaborn matplotlib +RUN pip install qqman \ No newline at end of file diff --git a/docker/g2c_ufc/STEP_1_Make_Batches.plot_qc.py b/docker/g2c_ufc/STEP_1_Make_Batches.plot_qc.py new file mode 100644 index 0000000..d314443 --- /dev/null +++ b/docker/g2c_ufc/STEP_1_Make_Batches.plot_qc.py @@ -0,0 +1,81 @@ +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt +import argparse + +def plot_qc_metrics(pre_filtered_df, post_filtered_df): + # Check if all columns of interest are present in both DataFrames + columns_of_interest = ['n_grafpop_snps', 'hq_het_rate', 'charr', 'inconsistent_ab_het_rate', + 'median_coverage', 'chrX_ploidy', 'chrY_ploidy'] + + missing_columns = [] + for col in columns_of_interest: + if col not in pre_filtered_df.columns: + missing_columns.append((args.pre_filtered, col)) + if col not in post_filtered_df.columns: + missing_columns.append((args.post_filtered, col)) + + if missing_columns: + for filename, column in missing_columns: + print(f"Error: {filename} doesn't contain the '{column}' column.") + return + + # Filter 'Post-filtered' DataFrame for rows where global_qc_pass is TRUE + print(post_filtered_df.head()) + post_filtered_df = post_filtered_df[post_filtered_df['global_qc_pass'] == True] + print(post_filtered_df[post_filtered_df['global_qc_pass'] == True].head()) + + # Define columns of interest + for metric in columns_of_interest: + # Create a violin plot for pre-filtered and post-filtered datasets + plt.figure(figsize=(10, 6)) + + # Violin plot + plt.subplot(1, 2, 1) # Subplot for violin plot + sns.violinplot(data=[pre_filtered_df[metric], post_filtered_df[metric]], palette="Set3") + plt.title(f'{metric} - Quality Control') + plt.xlabel('Dataset') + plt.ylabel(metric) + plt.xticks([0, 1], ['Pre-filtered', 'Post-filtered']) + + # Box plot + plt.subplot(1, 2, 2) # Subplot for box plot + sns.boxplot(data=[pre_filtered_df[metric], post_filtered_df[metric]], palette="Set3") + plt.title(f'{metric} - Quality Control') + plt.xlabel('Dataset') + plt.ylabel(metric) + plt.xticks([0, 1], ['Pre-filtered', 'Post-filtered']) + + # Adjust layout and spacing + plt.tight_layout() + + # Save the plot + plt.savefig(f'{metric}_qc.png') + plt.close() + + +def parse_arguments(): + # Create argument parser + parser = argparse.ArgumentParser(description="Plot quality control metrics") + # Add arguments + parser.add_argument("--pre-filtered", required=True, help="Path to pre-filtered TSV file") + parser.add_argument("--post-filtered", required=True, help="Path to 
post-filtered TSV file") + # Parse arguments + args = parser.parse_args() + + return args + +if __name__ == "__main__": + # Parse command-line arguments + args = parse_arguments() + + # Read pre-filtered and post-filtered TSV files into DataFrames + try: + pre_filtered_df = pd.read_csv(args.pre_filtered, sep='\t') + post_filtered_df = pd.read_csv(args.post_filtered, sep='\t') + except FileNotFoundError as e: + print(f"Error: {e.filename} not found.") + exit(1) + + # Call plot function with DataFrame arguments + plot_qc_metrics(pre_filtered_df, post_filtered_df) \ No newline at end of file diff --git a/docker/g2c_ufc/qqplot.py b/docker/g2c_ufc/qqplot.py new file mode 100644 index 0000000..cef6c59 --- /dev/null +++ b/docker/g2c_ufc/qqplot.py @@ -0,0 +1,42 @@ +from qqman import qqman +import pandas as pd +import matplotlib.pyplot as plt + +if __name__ == "__main__": + df_pancancer_assoc = pd.read_csv("/Users/noah/Desktop/UFC/saige_output/saige_gene_output.pancancer.txt",sep='\t') + df_pancancer_assoc = df_pancancer_assoc[(df_pancancer_assoc['Group'] == 'Pathogenic;PLP;Likely_pathogenic;HIGH') & (df_pancancer_assoc['max_MAF'] == 0.01)] + df_breast_assoc = pd.read_csv("/Users/noah/Desktop/UFC/saige_output/saige_gene_output.breast.txt",sep='\t') + df_breast_assoc = df_breast_assoc[(df_breast_assoc['Group'] == 'Pathogenic;PLP;Likely_pathogenic;HIGH') & (df_breast_assoc['max_MAF'] == 0.01)] + df_colorectal_assoc = pd.read_csv("/Users/noah/Desktop/UFC/saige_output/saige_gene_output.colorectal.txt",sep='\t') + df_colorectal_assoc = df_colorectal_assoc[(df_colorectal_assoc['Group'] == 'Pathogenic;PLP;Likely_pathogenic;HIGH') & (df_colorectal_assoc['max_MAF'] == 0.01)] + + df_lung_assoc = pd.read_csv("/Users/noah/Desktop/UFC/saige_output/saige_gene_output.lung.txt",sep='\t') + df_lung_assoc = df_lung_assoc[(df_lung_assoc['Group'] == 'Pathogenic;PLP;Likely_pathogenic;HIGH') & (df_lung_assoc['max_MAF'] == 0.01)] + df_melanoma_assoc = pd.read_csv("/Users/noah/Desktop/UFC/saige_output/saige_gene_output.melanoma.txt",sep='\t') + df_melanoma_assoc = df_melanoma_assoc[(df_melanoma_assoc['Group'] == 'Pathogenic;PLP;Likely_pathogenic;HIGH') & (df_melanoma_assoc['max_MAF'] == 0.01)] + df_prostate_assoc = pd.read_csv("/Users/noah/Desktop/UFC/saige_output/saige_gene_output.prostate.txt",sep='\t') + df_prostate_assoc = df_prostate_assoc[(df_prostate_assoc['Group'] == 'Pathogenic;PLP;Likely_pathogenic;HIGH') & (df_prostate_assoc['max_MAF'] == 0.01)] + + pancancer_p_vals = list(df_pancancer_assoc['Pvalue']) + breast_p_vals = list(df_breast_assoc['Pvalue']) + colorectal_p_vals = list(df_colorectal_assoc['Pvalue']) + lung_p_vals = list(df_lung_assoc['Pvalue']) + melanoma_p_vals = list(df_melanoma_assoc['Pvalue']) + prostate_p_vals = list(df_prostate_assoc['Pvalue']) + + figure, axes = plt.subplots(nrows=2, ncols=3, figsize = (8,6)) + + # qqman.qqplot("/Users/noah/Desktop/UFC/saige_output/saige_gene_output.breast.txt", ax=axes[0,0],title="From file") + qqman.qqplot(pancancer_p_vals, ax=axes[0,0],title="Pancancer Cancer") + qqman.qqplot(breast_p_vals, ax=axes[0,1],title="Breast Cancer") + qqman.qqplot(colorectal_p_vals, ax=axes[0,2],title="Colorectal Cancer") + qqman.qqplot(lung_p_vals, ax=axes[1,0],title="Lung Cancer") + qqman.qqplot(melanoma_p_vals, ax=axes[1,1],title="Melanoma Cancer") + qqman.qqplot(prostate_p_vals, ax=axes[1,2],title="Prostate Cancer") + # qqman.qqplot(df_assoc.P, ax=axes[1,0],title="From Series") + + figure.tight_layout() + plt.show() + #plt.savefig("./SubQQplot.png",format="png") + #plt.clf() + 
#plt.close() \ No newline at end of file diff --git a/docker/g2c_ufc/qqplot_broad_retreat.py b/docker/g2c_ufc/qqplot_broad_retreat.py new file mode 100644 index 0000000..3afa25f --- /dev/null +++ b/docker/g2c_ufc/qqplot_broad_retreat.py @@ -0,0 +1,44 @@ +from qqman import qqman +import pandas as pd +import matplotlib.pyplot as plt + +if __name__ == "__main__": + df_pancancer_assoc = pd.read_csv("/Users/noah/Desktop/UFC/saige_output/saige_gene_output.pancancer.txt", sep='\t') + df_pancancer_assoc1 = df_pancancer_assoc[ + (df_pancancer_assoc['Group'] == 'Pathogenic;PLP;Likely_pathogenic;HIGH') & + (df_pancancer_assoc['max_MAF'] == 0.01) + ] + df_pancancer_assoc2 = df_pancancer_assoc[ + (df_pancancer_assoc['Group'] == 'Pathogenic;PLP;Likely_pathogenic;HIGH;MODERATE') & + (df_pancancer_assoc['max_MAF'] == 0.01) + ] + + pancancer_p_vals = list(df_pancancer_assoc['Pvalue']) + + figure, axes = plt.subplots(nrows=1, ncols=2, figsize=(30, 15)) + + # First plot: High Impact Variants + qqman.qqplot(df_pancancer_assoc1.Pvalue, ax=axes[0]) + axes[0].set_title("PLP and High Impact Variants", fontsize=44, pad=30) + axes[0].tick_params(axis='both', labelsize=40) # Adjust tick size + axes[0].set_xlabel("Expected -log10(p)", fontsize=40, weight='bold') + axes[0].set_ylabel("Observed -log10(p)", fontsize=40, weight='bold') + + # Second plot: High/Moderate Impact Variants + qqman.qqplot(df_pancancer_assoc2.Pvalue, ax=axes[1]) + axes[1].set_title("PLP and High/Moderate Impact Variants", fontsize=44, pad=30) + axes[1].tick_params(axis='both', labelsize=40) # Adjust tick size + axes[1].set_xlabel("Expected -log10(p)", fontsize=40, weight='bold') + axes[1].set_ylabel("Observed -log10(p)", fontsize=40, weight='bold') + + # Add overarching title + figure.suptitle("Preliminary Gene-Based Rare Variant Association Testing on Chr22", fontsize=50, weight='bold') # Adjust y to bring title down + + # Add footnote with more space (lower the y position) + figure.text(0.5, 0.01, "*PLP = Pathogenic and/or Likely Pathogenic", ha='center', fontsize=30, style='italic') + + # Adjust spacing between plots + plt.subplots_adjust(wspace=0.8, top=0.83, bottom=0.2) # Adjust horizontal space between subplots + + # Save the figure + plt.savefig("/Users/noah/Desktop/UFC/plots/QQplot_BroadRetreat_2024.png", format="png", dpi=800) diff --git a/docker/graf/.DS_Store b/docker/graf/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/docker/graf/.DS_Store differ diff --git a/docker/graf/Dockerfile b/docker/graf/Dockerfile new file mode 100644 index 0000000..5cc4a90 --- /dev/null +++ b/docker/graf/Dockerfile @@ -0,0 +1,58 @@ +# The Germline Genomics of Cancer (G2C) +# Contact: Ryan Collins +# Distributed under the terms of the GNU GPL v2.0 + +# Simple Dockerfile for vanallenlab/g2c_pipeline:nf_graf_plink_alpha + +#FROM ubuntu:22.04 +#FROM ubuntu:focal-20220531 +FROM continuumio/miniconda3 +MAINTAINER "Noah Fields " + +# Create a conda environment and activate it +#RUN conda create --name myenv python=3.11 && \ +# conda activate myenv + + +# Install make, zlib, file, and git +RUN apt-get clean && \ + apt-get -qqy update && \ + apt-get -qqy install --fix-missing \ + apt-utils \ + default-libmysqlclient-dev \ + curl \ + vim \ + wget + +#RUN apt install -y python3 + +# Install Plink on Mac +#RUN wget \ +# https://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20231018.zip && \ +# unzip plink_linux_x86_64_20231018.zip + +# Print the current working directory +#RUN echo "Current working directory: $(pwd)" > log.txt + 
+# Clone NCBI Grafpop +RUN wget -O GrafPop1.0.tar.gz \ + https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/GetZip.cgi?zip_name=GrafPop1.0.tar.gz && \ + tar -xzvf GrafPop1.0.tar.gz + +# Move the grafpop executable +RUN mv grafpop /bin + +# Copy the AncInferSNPs.txt to the home directory +RUN cp data/AncInferSNPs.txt . + +# Remove all traces of hg19 +RUN rm data/TG_10_1000_g37* + +# Remove the tar.gz file for lighter Docker +RUN rm GrafPop1.0.tar.gz + +COPY infer_ancestry.py /opt/ +RUN chmod a+x /opt/infer_ancestry.py + +# Launch bash at runtime +CMD ["/bin/bash"] diff --git a/docker/graf/infer_ancestry.py b/docker/graf/infer_ancestry.py new file mode 100644 index 0000000..5b1ae1e --- /dev/null +++ b/docker/graf/infer_ancestry.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# The Germline Genomics of Cancer (G2C) +# Copyright (c) 2023-Present, Noah Fields and the Dana-Farber Cancer Institute +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 + +import sys +import csv + +def read_sample_data(filename): + # Dictionary to store the data + sample_data = {} + + # Read the text file and populate the dictionary + with open(filename, 'r') as txt_file: + lines = txt_file.readlines() + + # Skip lines until the header line + header_index = lines.index('Sample\t#SNPs\tGD1 (x)\tGD2 (y)\tGD3 (z)\tGD4\tE(%)\tF(%)\tA(%)\n') + + for line in lines[header_index + 1:]: + # Split the line into columns + columns = line.strip().split('\t') + + # Extract relevant data + sample = columns[0] + gd1 = float(columns[2]) + gd2 = float(columns[3]) + gd3 = float(columns[4]) + gd4 = float(columns[5]) + e_percent = float(columns[6]) + f_percent = float(columns[7]) + a_percent = float(columns[8]) + + # Store the values in the dictionary + sample_data[sample] = { + 'GD1': gd1, + 'GD2': gd2, + 'GD3': gd3, + 'GD4': gd4, + 'E(%)': e_percent, + 'F(%)': f_percent, + 'A(%)': a_percent + } + + return sample_data + +def classify_population(sample_data): + for sample, data in sample_data.items(): + e_percent = data['E(%)'] + f_percent = data['F(%)'] + a_percent = data['A(%)'] + gd1 = data['GD1'] + + if e_percent >= 90: + print(f"European\n1") + elif f_percent >= 95: + print(f"African\n2") + elif a_percent >= 95: + print(f"East-Asian\n3") + elif 40 <= f_percent < 95 and a_percent < 14: + print(f"African-American\n4") + elif f_percent < 50 and e_percent < 90 and a_percent < 14 and gd1 < 1.48: + print(f"Latin-American-1\n5") + elif a_percent >= 14 and f_percent >= 14: + print(f"Other\n9") + elif gd1 > 30 * data['GD4']**2 + 1.58 and f_percent < 14: + print(f"Asian-Pacific-Islander\n7") + elif gd1 + data['GD4'] < 1.525 and f_percent < 14: + print(f"Latin-American-2\n6") + elif data['GD4'] > 5 * (gd1 - 1.524)**2 + 0.0575: + print(f"South-Asian\n8") + else: + print(f"Unknown\n10") + +if __name__ == "__main__": + # Check if the filename is provided as a command-line argument + if len(sys.argv) != 2: + print("Usage: python script.py ") + sys.exit(1) + + filename = sys.argv[1] + data = read_sample_data(filename) + + # Classify the population based on the provided conditions + classify_population(data) diff --git a/docker/gsc_pytools/.DS_Store b/docker/gsc_pytools/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/docker/gsc_pytools/.DS_Store differ diff --git a/docker/gsc_pytools/Dockerfile b/docker/gsc_pytools/Dockerfile new file mode 100644 index 0000000..5dd7187 --- /dev/null +++ b/docker/gsc_pytools/Dockerfile @@ -0,0 +1,25 @@ +# The Germline Somatic Convergence 
(G2C) +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 +# Germline_Somatic Docker +# Use an Ubuntu base image +FROM ubuntu:latest +MAINTAINER "Noah Fields " + +# Install Python 3.9 and pip +RUN apt-get update && \ + apt-get install -y software-properties-common && \ + add-apt-repository ppa:deadsnakes/ppa && \ + apt-get update && \ + apt-get install -y python3.9 python3-pip python3.9-distutils && \ + apt-get clean + +# Install Python packages using pip +RUN python3.9 -m pip install numpy pandas firthlogist scipy seaborn statsmodels scikit-learn + +# Copy your utility script to the image +COPY gsc_util.py /opt/ +COPY *.py /opt/ + +# Set the default command to run at runtime +CMD ["/bin/bash"] diff --git a/docker/gsc_pytools/gsc_graph.py b/docker/gsc_pytools/gsc_graph.py new file mode 100644 index 0000000..561ecee --- /dev/null +++ b/docker/gsc_pytools/gsc_graph.py @@ -0,0 +1,349 @@ +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np +from sklearn.linear_model import LinearRegression +from sklearn.metrics import r2_score +from statsmodels.stats.multitest import fdrcorrection + +def plot_germline_frequencies(data, + hmf_col='germline_plp_frequency_HMF', + profile_col='germline_plp_frequency_PROFILE', + save_path="germline_snp_frequencies.png"): + + # Define the cancer types of interest + cancer_types = ['Breast', 'Prostate', 'Colorectal', 'Lung', 'Kidney', 'Pancancer'] + + # Drop rows where either the HMF or PROFILE frequency is missing (NaN) + data_clean = data.dropna(subset=[hmf_col, profile_col]) + data = data_clean + + # Filtering to keep only the specified columns + data = data[['germline_risk_allele', 'cancer_type', hmf_col, profile_col]] + + # Removing duplicates + data = data.drop_duplicates() + + # Create a figure with subplots + fig, axes = plt.subplots(2, 3, figsize=(18, 10)) + axes = axes.flatten() # Flatten the array for easy indexing + + # Iterate through each cancer type and create a subplot + for idx, cancer_type in enumerate(cancer_types): + + # Filter the DataFrame for the current cancer type + filtered_data = data[data['cancer_type'] == cancer_type] + print(f"{cancer_type}: has {len(data)} rows") + + + # Extract HMF and PROFILE frequencies + x = filtered_data[hmf_col].values.reshape(-1, 1) + y = filtered_data[profile_col].values + + # Fit a linear regression model + model = LinearRegression() + model.fit(x, y) + y_pred = model.predict(x) + r_squared = r2_score(y, y_pred) + + # Plot the data + axes[idx].scatter(x, y, alpha=0.5) + axes[idx].plot(x, y_pred, color='blue', label='Best Fit', linestyle='--') + + # Add y=x line + axes[idx].plot(x, x, color='black', linestyle='--', label='y = x') + + # Set titles and labels + axes[idx].set_title(f'{cancer_type} (R² = {r_squared:.2f})') + axes[idx].set_xlabel('Germline Allele Frequency HMF') + axes[idx].set_ylabel('Germline Allele Frequency PROFILE') + axes[idx].legend() + axes[idx].set_xlim(0, max(x.max(), y.max())) + axes[idx].set_ylim(0, max(x.max(), y.max())) + + plt.tight_layout() + plt.savefig(save_path) + plt.close() + +def plot_somatic_mutation_frequencies(df, + x_col="somatic_plp_frequency_HMF", + y_col="somatic_plp_frequency_PROFILE", + save_path="somatic_mutation_frequencies.png"): + # Specify the cancer types to include + cancer_types = ["Breast", "Prostate", "Colorectal", "Lung", "Kidney", "Pancancer"] + + # Drop rows where either the HMF or PROFILE frequency is missing (NaN) + df_clean = df.dropna(subset=[x_col, y_col]) + df = df_clean + + # Filtering to keep only the 
specified columns + df = df[['somatic_gene', 'cancer_type', x_col, y_col]] + + # Removing duplicates + df = df.drop_duplicates() + + # Set up the figure and axes + fig, axes = plt.subplots(2, 3, figsize=(18, 12)) + axes = axes.flatten() + + for i, cancer_type in enumerate(cancer_types): + # Filter data for the current cancer type + subset = df[df['cancer_type'] == cancer_type] + + # Prepare data for regression + X = subset[x_col].values.reshape(-1, 1) + y = subset[y_col].values + + # Fit linear regression + model = LinearRegression() + model.fit(X, y) + + # Get predictions and R² value + y_pred = model.predict(X) + r2 = r2_score(y, y_pred) + + # Plot the data + axes[i].scatter(X, y, alpha=0.6, label=cancer_type) + axes[i].plot(X, y_pred, linestyle='--', color='red', label='Best Fit') + axes[i].plot([X.min(), X.max()], [X.min(), X.max()], 'k--', label='y=x') # dashed line for y=x + axes[i].set_title(f"{cancer_type}: R² = {r2:.2f}") + axes[i].set_xlabel(x_col) + axes[i].set_ylabel(y_col) + axes[i].legend() + + # Adjust layout and save the figure + plt.tight_layout() + plt.savefig(save_path) + plt.close() + +def plot_volcano(df, + p_col="p_val_combined", + or_col="OR_combined", + criteria_col="criteria", + cancer_type_col="cancer_type", + germline_event="germline_risk_allele", + figure_title="Volcano Plot of Germline Somatic Convergences", + save_path="insert_path_here.png"): + + # Drop rows where either p-value or OR is NaN + df_clean = df.dropna(subset=[p_col, or_col]) + df = df_clean + + # Calculate -log10(p-value) and log2(OR) + df['log10_p'] = -np.log10(df[p_col]) + df['log2_OR'] = np.log2(df[or_col]) + + # Define shapes and colors + shapes = {'known_ppi': 'o', 'same_gene': 's', 'protein_complex': 'h', + 'ligand_receptor': '^', 'known_ppi;ligand_receptor': '*', + 'known_ppi;protein_complex': 'd'} + colors = {'Breast': 'red', 'Prostate': 'blue', 'Colorectal': 'green', + 'Lung': 'purple', 'Kidney': 'orange', 'Pancancer': 'black'} + + # Create figure and axis + fig, ax = plt.subplots(figsize=(10, 8)) + + # Plot points + for criteria_val, shape in shapes.items(): + for cancer_type, color in colors.items(): + subset = df[(df[criteria_col] == criteria_val) & + (df[cancer_type_col] == cancer_type) & + (df['log2_OR'] > -20)] + ax.scatter(subset['log2_OR'], subset['log10_p'], + marker=shape, color=color, alpha=0.7) + + # Add significance thresholds + bonferroni_threshold = -np.log10(0.05 / len(df)) + nominal_threshold = -np.log10(0.05) + rejected, pvals_corrected = fdrcorrection(df[p_col], alpha=0.05) + if rejected.any(): + fdr_threshold = -np.log10(pvals_corrected[rejected].max()) + ax.axhline(y=fdr_threshold, color='green', linestyle='--', label=f'FDR Threshold') + ax.axhline(y=bonferroni_threshold, color='red', linestyle='--', label=f'Bonferroni') + ax.axhline(y=nominal_threshold, color='blue', linestyle='--', label='Nominal') + + # Annotate points above Bonferroni + significant_points = df[df['log10_p'] > nominal_threshold] + for _, row in significant_points.iterrows(): + # Get the value from germline_event, fallback to 'germline_gene' if germline_event is NA or NaN + germline_value = row[germline_event] if pd.notna(row[germline_event]) else row['germline_gene'] + label_text = f"({germline_value}, {row['somatic_gene']})" + ax.text(row['log2_OR'], row['log10_p'], label_text, fontsize=8, ha='right') + + # # Add legends + # color_handles = [plt.Line2D([0], [0], marker='o', color='w', + # markerfacecolor=color, label=cancer) + # for cancer, color in colors.items()] + # color_legend = 
ax.legend(handles=color_handles, title="Cancer Type", + # bbox_to_anchor=(1.05, 1), loc='upper left') + # ax.add_artist(color_legend) + + # shape_handles = [plt.Line2D([0], [0], marker=shape, color='k', label=criteria) + # for criteria, shape in shapes.items()] + # shape_legend = ax.legend(handles=shape_handles, title="Criteria", + # bbox_to_anchor=(1.05, 0.75), loc='upper left') + # ax.add_artist(shape_legend) + + # Add vertical dashed line at x=0 + ax.axvline(x=0, color='gray', linestyle='--', alpha=0.7, label='x=0') + + # # # Dashed lines legend + # # line_handles = [ + # # plt.Line2D([0], [0], color='red', linestyle='--', label='Bonferroni'), + # # plt.Line2D([0], [0], color='blue', linestyle='--', label='Nominal'), + # # plt.Line2D([0], [0], color='green', linestyle='--', label='Benjamini-Hochberg'), + # # plt.Line2D([0], [0], color='gray', linestyle='--', label='x=0') + # # ] + + # # # Use a separate legend for lines + # # line_legend = ax.legend(handles=line_handles, title="Significance Lines", + # # bbox_to_anchor=(1.05, 0.5), loc='upper left') + # # ax.add_artist(line_legend) + + # Add point legends (color and shape combined) + color_handles = [ + plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=color, label=cancer) + for cancer, color in colors.items() + ] + shape_handles = [ + plt.Line2D([0], [0], marker=shape, color='k', label=criteria) + for criteria, shape in shapes.items() + ] + + # Add line legend for thresholds + line_handles = [ + plt.Line2D([0], [0], color='red', linestyle='--', label='Bonferroni'), + plt.Line2D([0], [0], color='blue', linestyle='--', label='Nominal'), + plt.Line2D([0], [0], color='green', linestyle='--', label='FDR Threshold'), + plt.Line2D([0], [0], color='gray', linestyle='--', label='x=0'), + ] + + # Create a combined legend outside the plot + fig.legend(handles=color_handles + shape_handles + line_handles, + title="Legend", loc='center right', bbox_to_anchor=(1.3, 0.5)) + + # Adjust layout to give room for the legend + plt.subplots_adjust(right=0.75) + + # Labels, title, and save + ax.set_xlabel('log2(OR)') + ax.set_ylabel('-log10(p)') + ax.set_title(figure_title) + plt.tight_layout() + fig.savefig(save_path) + plt.close(fig) + + + +def plot_p_values(data, + hmf_col='p_val_HMF', + profile_col='p_val_PROFILE', + save_path="p_val_comparison.png"): + """ + Function to plot -log10 transformed p-values for HMF and PROFILE across different cancer types. 
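Review note on the three dashed significance lines drawn in plot_volcano above: they come from different error-control models, and the FDR line is data-dependent. A self-contained sketch of how the cutoffs relate, using the same statsmodels `fdrcorrection` helper the script imports (the p-values here are illustrative only):

```python
import numpy as np
from statsmodels.stats.multitest import fdrcorrection

pvals = np.array([1e-6, 3e-3, 0.04, 0.2, 0.7])     # illustrative values only
nominal = 0.05                                      # per-test threshold
bonferroni = 0.05 / len(pvals)                      # family-wise threshold
rejected, qvals = fdrcorrection(pvals, alpha=0.05)  # Benjamini-Hochberg procedure
# plot_volcano draws its FDR line at the largest BH-adjusted p among the rejections:
fdr_line = qvals[rejected].max() if rejected.any() else None
print(nominal, bonferroni, fdr_line)
```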
+ """ + # Define the cancer types of interest + cancer_types = ['Breast', 'Prostate', 'Colorectal', 'Lung', 'Kidney', 'Pancancer'] + + # Drop rows where either the HMF or PROFILE p-value is missing (NaN) + data_clean = data.dropna(subset=[hmf_col, profile_col]) + data_clean = data_clean[(data_clean[hmf_col] > 0) & (data_clean[profile_col] > 0)] + data = data_clean + + # Transform p-values to -log10(p) + data['log_p_val_HMF'] = -np.log10(data[hmf_col]) + data['log_p_val_PROFILE'] = -np.log10(data[profile_col]) + + # Drop rows with non-finite values after transformation + data = data_clean[np.isfinite(data['log_p_val_HMF']) & np.isfinite(data['log_p_val_PROFILE'])] + + # Create a figure with subplots + fig, axes = plt.subplots(2, 3, figsize=(18, 10)) + axes = axes.flatten() # Flatten the array for easy indexing + + # Iterate through each cancer type and create a subplot + for idx, cancer_type in enumerate(cancer_types): + + # Filter the DataFrame for the current cancer type + filtered_data = data[data['cancer_type'] == cancer_type] + + # Extract transformed HMF and PROFILE p-values + x = filtered_data['log_p_val_HMF'].values.reshape(-1, 1) + y = filtered_data['log_p_val_PROFILE'].values + + # Fit a linear regression model + model = LinearRegression() + model.fit(x, y) + y_pred = model.predict(x) + r_squared = r2_score(y, y_pred) + + # Plot the data + axes[idx].scatter(x, y, alpha=0.5) + axes[idx].plot(x, y_pred, color='blue', label='Best Fit', linestyle='--') + + # Add y=x line + axes[idx].plot(x, x, color='black', linestyle='--', label='y = x') + + # Set titles and labels + axes[idx].set_title(f'{cancer_type} (R² = {r_squared:.2f})') + axes[idx].set_xlabel('-log10(p_val_HMF)') + axes[idx].set_ylabel('-log10(p_val_PROFILE)') + axes[idx].legend() + axes[idx].set_xlim(0, max(x.max(), y.max())) + axes[idx].set_ylim(0, max(x.max(), y.max())) + + plt.tight_layout() + plt.savefig(save_path) + plt.close() + +def plot_gwas_subplots(df, save_path='gwas_subplots.png'): + df = df[(df['relevant_cancer'] == 1) | (df['relevant_cancer'] == 2)] + # Convert p-values to -log10 scale for plotting + df['-log10_p_val_HMF'] = -np.log10(df['p_val_HMF']) + df['log2OR'] = np.log2(df['OR_HMF']) + # Extract chromosome position as integer from `germline_risk_allele` + df['chrom_position'] = df['germline_risk_allele'].apply(lambda x: int(x.split(':')[1].split('-')[0])) + + # Group by cancer_type, germline_gene, and somatic_gene + grouped = df.groupby(['cancer_type', 'germline_gene', 'somatic_gene']) + + # Calculate nominal and Bonferroni thresholds + nominal_threshold = -np.log10(0.05) + + # Set up the plot grid + num_plots = len(grouped) + fig, axes = plt.subplots(nrows=(num_plots // 3) + 1, ncols=3, figsize=(15, 5 * (num_plots // 3 + 1))) + axes = axes.flatten() + + for i, (name, group) in enumerate(grouped): + cancer_type, germline_gene, somatic_gene = name + print(cancer_type) + # Calculate Bonferroni threshold for the current group + bonferroni_threshold = -np.log10(0.05 / len(group)) + + # Plot on the i-th subplot + ax = axes[i] + ax.scatter(group['chrom_position'], group['-log10_p_val_HMF'], color='b', s=10, alpha=0.6) + #ax.scatter(group['chrom_position'], group['log2OR'], color='b', s=10, alpha=0.6) + ax.axhline(nominal_threshold, color='green', linestyle='--', label='Nominal p=0.05') + ax.axhline(bonferroni_threshold, color='red', linestyle='-', label='Bonferroni Corrected') + + # Set plot titles and labels + title = "" + if cancer_type == "Pancancer": + title = f"{germline_gene} Noncoding and {somatic_gene} 
Coding Convergence: Pancancer" + else: + title = f"{germline_gene} Noncoding and {somatic_gene} Coding Convergence: {cancer_type}" + ax.set_title(title, fontsize=10) + ax.set_xlabel("Chromosome Position") + ax.set_ylabel("-log10(p)") + ax.legend() + + # Remove any extra subplots in the grid + for j in range(i + 1, len(axes)): + fig.delaxes(axes[j]) + + # Adjust layout and save the figure + plt.tight_layout() + plt.savefig(save_path, dpi=300) + plt.show() + print(f"Figure saved to {save_path}") \ No newline at end of file diff --git a/docker/gsc_pytools/gsc_util.py b/docker/gsc_pytools/gsc_util.py new file mode 100644 index 0000000..7f777db --- /dev/null +++ b/docker/gsc_pytools/gsc_util.py @@ -0,0 +1,567 @@ +import pandas as pd +import statsmodels.api as sm +from scipy.stats import fisher_exact +import numpy as np +from firthlogist import FirthLogisticRegression +import scipy.stats as stats + + +# Assuming FirthLogisticRegression is already imported or defined elsewhere +# You may need to implement this if it's not available. + +def logistic_regression_with_fallback(df, cancer_type, germline_event, somatic_gene, covariates=['male', 'pca_1', 'pca_2', 'pca_3', 'pca_4']): + print(f"Cancer Type: {cancer_type}\n Germline Event: {germline_event} \n Somatic Gene: {somatic_gene}") + + # Filter based on cancer_type if it's not Pancancer + if cancer_type != "Pancancer": + df = df[df['cancer_type'] == cancer_type] + + # Check if the germline_event and somatic_gene are in the dataframe columns + if germline_event not in df.columns or somatic_gene not in df.columns: + print(f"Combination {germline_event} - {somatic_gene} not found in DataFrame") + return + + # If either column has no positive cases, skip + if df[germline_event].sum() == 0 or df[somatic_gene].sum() == 0: + return + + # Drop rows with NaN values in the columns of interest + df = df.dropna(subset=[germline_event, somatic_gene] + covariates) + + + + + # Try regular logistic regression + try: + # Prepare the predictor (X) and response (y) variables + X = df[[germline_event] + covariates] + y = df[somatic_gene] + + # Normalize somatic_gene values: treat values > 1 as 1 + y = y.apply(lambda x: 1 if x > 1 else x) + + # Add a constant to the predictors for logistic regression + X = sm.add_constant(X) + + model = sm.Logit(y, X) + result = model.fit(disp=False) + + # If the model converged, extract p-value, odds ratio, and confidence intervals + if result.mle_retvals['converged']: + print("Regular logistic regression converged successfully!") + + # Extract p-value and odds ratio for the germline_gene predictor + p_value = result.pvalues[germline_event] # Assuming germline_event is the first predictor after the constant + odds_ratio = np.exp(result.params[germline_event]) + + # Calculate the 95% confidence interval for the odds ratio + coef = result.params[germline_event] + std_err = result.bse[germline_event] + + # The confidence interval for the coefficient + conf_int_coef = [coef - 1.96 * std_err, coef + 1.96 * std_err] + + # Convert the confidence interval from log odds to odds ratio + conf_int_or = np.exp(conf_int_coef) + + # Return the results: odds ratio, p-value, and confidence intervals + return odds_ratio, p_value, conf_int_or[0], conf_int_or[1] + + else: + raise ValueError("Regular logistic regression failed to converge.") + + # Fallback to Firth logistic regression if regular logistic regression fails + except Exception as e: + print(f"Regular logistic regression failed: {e}") + print("Falling back to Firth logistic regression...") + + 
try: + # Prepare the predictor (X) and response (y) variables + X = df[[germline_event] + covariates] + y = df[somatic_gene] + + # Normalize somatic_gene values: treat values > 1 as 1 + y = y.apply(lambda x: 1 if x > 1 else x) + + # Fit Firth logistic regression (Assuming FirthLogisticRegression is defined) + model = FirthLogisticRegression() + model.fit(X, y) + + # Extract p-value and odds ratio for the germline_gene predictor + p_value = model.pvals_[0] # Assuming the predictor is the second column after the constant + odds_ratio = np.exp(model.coef_[0]) + + # Calculate confidence intervals + coef = model.coef_[0] + std_err = model.bse_[0] + conf_int_coef = [coef - 1.96 * std_err, coef + 1.96 * std_err] + conf_int_or = np.exp(conf_int_coef) + + return odds_ratio, p_value, conf_int_or[0], conf_int_or[1] + + except Exception as e: + print(f"Firth logistic regression failed: {e}") + return + + +def firth_logistic_regression(df, cancer_type, germline_event, somatic_gene,covariates=['male','pca_1','pca_2','pca_3','pca_4']): + print(f"Cancer Type: {cancer_type}\n Germline Event: {germline_event} \n Somatic Gene: {somatic_gene}") + if cancer_type != "Pancancer": + df = df[df['cancer_type'] == cancer_type] + + if germline_event not in df.columns or somatic_gene not in df.columns: + print(f"Combination {germline_event} - {somatic_gene} not found in DataFrame") + return + if df[germline_event].sum() == 0 or df[somatic_gene].sum() == 0: + return + + df= df.dropna(subset=[germline_event,somatic_gene] + covariates) + + try: + # Prepare the predictor (X) and response (y) variables + X = df[[germline_event] + covariates] + y = df[somatic_gene] + + # Normalize somatic_gene values: treat values > 1 as 1 + y = y.apply(lambda x: 1 if x > 1 else x) + + # Fit Logistic Regression Model + model = FirthLogisticRegression() + model.fit(X,y) + + # Extract p-value and odds ratio for the germline_gene predictor + p_value = model.pvals_[0] + odds_ratio = np.exp(model.coef_[0]) + + # Calculate the 95% confidence interval for the odds ratio + coef = model.coef_[0] + std_err = model.bse_[0] + + # The confidence interval for the coefficient + conf_int_coef = [coef - 1.96 * std_err, coef + 1.96 * std_err] + + # Convert the confidence interval from log odds to odds ratio + conf_int_or = np.exp(conf_int_coef) + + # Append results to the results array, including confidence intervals + return odds_ratio, p_value, conf_int_or[0], conf_int_or[1] + + except Exception as e: + print(f"Error processing combination {germline_event} - {somatic_gene}: {e}") + + +def find_filtered_allele_frequency(df, cancer_type, germline_event, somatic_gene,covariates=['male','pca_1','pca_2','pca_3','pca_4']): + """ + Calculate the allele frequency for a specified column in a DataFrame. + + Args: + df (pd.DataFrame): The input DataFrame. + column_name (str): The column name for which to calculate allele frequency. + + Returns: + float: The calculated allele frequency. 
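Review note: the Wald confidence-interval arithmetic above is duplicated across the regular-logistic and Firth branches. A minimal sketch of the shared step, assuming a coefficient on the log-odds scale and its standard error (`odds_ratio_ci` is a hypothetical helper, not in this PR):

```python
import numpy as np

def odds_ratio_ci(coef, std_err, z=1.96):
    """Exponentiate a log-odds coefficient and its 95% Wald interval to the OR scale."""
    return np.exp(coef), np.exp(coef - z * std_err), np.exp(coef + z * std_err)

# e.g. coef=0.40, SE=0.15 -> OR ~1.49 with 95% CI ~(1.11, 2.00)
```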
+ """ + if cancer_type != "Pancancer": + df = df[df['cancer_type'] == cancer_type] + + if germline_event not in df.columns or somatic_gene not in df.columns: + print(f"Combination {germline_event} - {somatic_gene} not found in DataFrame") + return + + df= df.dropna(subset=[germline_event,somatic_gene] + covariates) + + # Get the column values, excluding NaNs + column_values = df[germline_event].dropna() + + # Calculate the allele frequency + allele_frequency = column_values.sum() / (len(column_values) * 2) + + return allele_frequency,column_values.sum(),(len(column_values) * 2) + +# Get a allele frequency for the cancer_type at large +def find_allele_frequency(df, cancer_type, germline_event): + # Filter to Cancer type of interest + if cancer_type != "Pancancer": + df = df[df['cancer_type'] == cancer_type] + + # Make sure the snp is present in the df + if germline_event not in df.columns: + print(f"Combination {germline_event} not found in DataFrame") + return + + df= df.dropna(subset=[germline_event]) + + # Get the column values, excluding NaNs + column_values = df[germline_event].dropna() + + # Calculate the allele frequency + allele_frequency = column_values.sum() / (len(column_values) * 2) + + return allele_frequency,column_values.sum(),(len(column_values) * 2) + +# Get a mutation frequency for specific logistic regression (verify that all variables are there first) +def find_filtered_mutation_frequency(df, cancer_type, germline_event, somatic_gene, covariates=['male','pca_1','pca_2','pca_3','pca_4']): + # Filter to Cancer Type of Interest + if cancer_type != "Pancancer": + df = df[df['cancer_type'] == cancer_type] + + # Verify that we have the data we want + if germline_event not in df.columns or somatic_gene not in df.columns: + print(f"Combination {germline_event} - {somatic_gene} not found in DataFrame") + return + + df= df.dropna(subset=[germline_event,somatic_gene] + covariates) + + # Get the column values, excluding NaNs + column_values = df[somatic_gene].dropna() + + # Transform the column values: non-zero values become 1, zero values stay 0 + column_values = column_values.apply(lambda x: 1 if x != 0 else 0) + + # Calculate the allele frequency + mutation_frequency = column_values.sum() / len(column_values) + + return mutation_frequency,column_values.sum(),len(column_values) + +# Get a allele frequency for the cancer_type at large +def find_mutation_frequency(df, cancer_type, somatic_gene): + # Filter to Cancer Type of Interest + if cancer_type != "Pancancer": + df = df[df['cancer_type'] == cancer_type] + + # Verify that we have the data we want + if somatic_gene not in df.columns: + print(f"Combination {somatic_gene} not found in DataFrame") + return + + df= df.dropna(subset=[somatic_gene]) + + # Get the column values, excluding NaNs + column_values = df[somatic_gene].dropna() + + # Transform the column values: non-zero values become 1, zero values stay 0 + column_values = column_values.apply(lambda x: 1 if x != 0 else 0) + + # Calculate the allele frequency + mutation_frequency = column_values.sum() / len(column_values) + + return mutation_frequency,column_values.sum(),len(column_values) + +def merge_and_analyze_noncoding_coding(tsv1_path, tsv2_path, output_path='merged_with_combined_OR_and_p_values.tsv'): + """ + This function merges two TSV files, performs inverse variance analysis on odds ratios (ORs), + and combines p-values using Stouffer's Z-score method. + + Args: + tsv1_path (str): File path to the HMF TSV file. + tsv2_path (str): File path to the PROFILE TSV file. 
+        output_path (str): Path to save the output TSV file (default: 'merged_with_combined_OR_and_p_values.tsv').
+
+    Returns:
+        merged_df (DataFrame): The resulting DataFrame with combined OR and p-values.
+    """
+
+    # Step 1: Read the data into pandas DataFrames
+    df1 = pd.read_csv(tsv1_path, sep='\t')
+    df2 = pd.read_csv(tsv2_path, sep='\t')
+
+    # Step 2: Replace 'Renal' with 'Kidney' in the 'cancer_type' column in both DataFrames
+    df1['cancer_type'] = df1['cancer_type'].replace('Renal', 'Kidney')
+    df2['cancer_type'] = df2['cancer_type'].replace('Renal', 'Kidney')
+
+    # Step 3: Merge the DataFrames on the relevant columns, including 'criteria'
+    merged_df = pd.merge(df1, df2, how='outer',
+                         on=['criteria', 'cancer_type', 'germline_risk_allele', 'germline_gene', 'germline_context', 'somatic_gene', 'somatic_context', 'relevant_cancer'],
+                         suffixes=('_HMF', '_PROFILE'))
+
+    # Step 4: Calculate the variance of each log-OR from its 95% confidence interval (CI)
+    merged_df['variance_HMF'] = ((np.log(merged_df['ci_OR_high_HMF']) - np.log(merged_df['ci_OR_low_HMF'])) / (2 * 1.96)) ** 2
+    merged_df['variance_PROFILE'] = ((np.log(merged_df['ci_OR_high_PROFILE']) - np.log(merged_df['ci_OR_low_PROFILE'])) / (2 * 1.96)) ** 2
+
+    # Step 5: Perform inverse-variance weighting of the log-ORs
+    merged_df['log_OR_HMF'] = np.log(merged_df['OR_HMF'])
+    merged_df['log_OR_PROFILE'] = np.log(merged_df['OR_PROFILE'])
+
+    # Calculate combined log OR using inverse variance weighting
+    merged_df['log_OR_combined'] = (
+        (merged_df['log_OR_HMF'] / merged_df['variance_HMF'] + merged_df['log_OR_PROFILE'] / merged_df['variance_PROFILE']) /
+        (1 / merged_df['variance_HMF'] + 1 / merged_df['variance_PROFILE'])
+    )
+
+    # Convert combined log OR back to OR
+    merged_df['OR_final'] = np.exp(merged_df['log_OR_combined'])
+
+    # Step 6: Calculate the variance of the combined log-OR
+    merged_df['variance_OR_combined'] = 1 / (1 / merged_df['variance_HMF'] + 1 / merged_df['variance_PROFILE'])
+
+    # Step 7: Calculate the Z-score as log(OR_combined) divided by its standard error
+    merged_df['z_combined'] = np.log(merged_df['OR_final']) / np.sqrt(merged_df['variance_OR_combined'])
+
+    # Step 8: Calculate the two-sided p-value from the Z-score
+    merged_df['p_val_final'] = 2 * stats.norm.sf(np.abs(merged_df['z_combined']))
+
+    # Step 9: Save the merged DataFrame with combined OR and p-values
+    merged_df.to_csv(output_path, sep='\t', index=False)
+
+    # Return the merged DataFrame
+    return merged_df
+
+def filter_by_pvalues(input_tsv, output_tsv="filtered.tsv"):
+    """
+    Filter to rows where (p_val_HMF < 0.05 or p_val_PROFILE < 0.05)
+    and p_val_combined < 0.05, and write the filtered DataFrame to a new file.
+
+    Args:
+        input_tsv (str): Path to the input TSV file.
+        output_tsv (str): Path to save the filtered output TSV file.
+
+    Returns:
+        filtered_df (DataFrame): The filtered DataFrame.
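Review note: the fixed-effect inverse-variance weighting above can be sanity-checked on a single pair of cohort estimates; note the combined p-value is derived from the IVW z-score (a Wald test) rather than a separate Stouffer combination. A standalone sketch with invented ORs and CIs:

```python
import numpy as np
from scipy import stats

def ivw_combine(or1, lo1, hi1, or2, lo2, hi2):
    """Fixed-effect inverse-variance meta-analysis of two odds ratios."""
    var1 = ((np.log(hi1) - np.log(lo1)) / (2 * 1.96)) ** 2  # var of log-OR from its 95% CI
    var2 = ((np.log(hi2) - np.log(lo2)) / (2 * 1.96)) ** 2
    w1, w2 = 1 / var1, 1 / var2
    log_or = (w1 * np.log(or1) + w2 * np.log(or2)) / (w1 + w2)
    se = np.sqrt(1 / (w1 + w2))
    p = 2 * stats.norm.sf(abs(log_or / se))
    return np.exp(log_or), p

print(ivw_combine(1.8, 1.1, 2.9, 1.4, 0.9, 2.2))  # -> combined OR and two-sided p
```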
+ """ + + # Read the merged data + df = pd.read_csv(input_tsv, sep='\t') + + # Filter based on the conditions + filtered_df = df[ + ((df['p_val_HMF'] < 0.05) | (df['p_val_PROFILE'] < 0.05)) & + (df['p_val_combined'] < 0.05) + ] + + # Save the filtered DataFrame to a new TSV file + filtered_df.to_csv(output_tsv, sep='\t', index=False) + + # Return the filtered DataFrame + return filtered_df + +def analyze_data(convergence_table_path,genotype_table_path,germline_context,somatic_context,cohort,output_file): + # Step 1: Read in Dataframe containing convergences and Dataframe containing patient level data + convergences_df = pd.read_csv(convergence_table_path,sep='\t') + convergences_df = convergences_df[(convergences_df['germline_context'] == germline_context) & + (convergences_df['somatic_context'] == somatic_context)] + patient_df = pd.read_csv(genotype_table_path, sep='\t') + + # Initialize an empty list to store results + results = [] + + # Step 2: Initialize covariates + covariates = None + if cohort == "HMF": + covariates=['male','pca_1','pca_2','pca_3','age','TMB','tumorPurity','primary'] + elif cohort == "PROFILE": + covariates=['male','pca_1','pca_2','pca_3','pca_4','age','TMB','tumorPurity','primary','late_stage'] + + # Alternate slate of covariates for hormone sensitive cancers and pancancer + covariates_hsc = [cov for cov in covariates if cov != 'male'] + covariates_pancancer = covariates + ['Breast_Diagnosis','Colorectal_Diagnosis','Kidney_Diagnosis','Lung_Diagnosis','Prostate_Diagnosis'] + + # Step 3: Prepare Suffixes + germline_suffix = None + if germline_context == "coding": + germline_suffix = "-g" + else: + germline_suffix = "" + + somatic_suffix = None + if somatic_context == "coding": + somatic_suffix = "-s" + else: + somatic_suffix = "-ncs" + + # Step 4: Loop through each row of the DataFrame and apply logistic regression with firth fallback + for index, row in convergences_df.iterrows(): + germline_event = None + + # Extract necessary columns + if germline_context == "coding": + gwas_cancer_type = row[0] + germline_gene = row[1] + germline_context = row[2] + somatic_gene = row[3] + somatic_context = row[4] + criteria = row[5] + germline_event = germline_gene + else: + gwas_cancer_type = row[0] + germline_snp = row[1] + germline_gene = row[2] + germline_context = row[3] + somatic_gene = row[4] + somatic_context = row[5] + criteria = row[6] + germline_event = germline_snp + + for cancer_type in ["Breast","Colorectal","Prostate","Lung","Kidney","Pancancer"]: + relevant_cancer = 0 + if gwas_cancer_type.lower() == cancer_type.lower(): + relevant_cancer = 1 + elif cancer_type == "Pancancer": + relevant_cancer = 2 + + # Apply the functions to get metrics + if cancer_type == "Breast" or cancer_type == "Prostate": + regression_output = logistic_regression_with_fallback(patient_df, cancer_type, germline_event + germline_suffix, somatic_gene + somatic_suffix,covariates=covariates_hsc) + filtered_germline_output = find_filtered_allele_frequency(patient_df, cancer_type, germline_event + germline_suffix, somatic_gene + somatic_suffix, covariates=covariates_hsc) + filtered_somatic_output = find_filtered_mutation_frequency(patient_df, cancer_type, germline_event + germline_suffix, somatic_gene + somatic_suffix, covariates=covariates_hsc) + elif cancer_type == "Pancancer": + regression_output = logistic_regression_with_fallback(patient_df, cancer_type, germline_event + germline_suffix, somatic_gene + somatic_suffix,covariates=covariates_pancancer) + filtered_germline_output = 
find_filtered_allele_frequency(patient_df, cancer_type, germline_event + germline_suffix, somatic_gene + somatic_suffix, covariates=covariates_pancancer) + filtered_somatic_output = find_filtered_mutation_frequency(patient_df, cancer_type, germline_event + germline_suffix, somatic_gene + somatic_suffix, covariates=covariates_pancancer) + else: + regression_output = logistic_regression_with_fallback(patient_df, cancer_type, germline_event + germline_suffix, somatic_gene + somatic_suffix,covariates=covariates) + filtered_germline_output = find_filtered_allele_frequency(patient_df, cancer_type, germline_event + germline_suffix, somatic_gene + somatic_suffix, covariates=covariates) + filtered_somatic_output = find_filtered_mutation_frequency(patient_df, cancer_type, germline_event + germline_suffix, somatic_gene + somatic_suffix, covariates=covariates) + + germline_output = find_mutation_frequency(patient_df, cancer_type, germline_event + germline_suffix) + somatic_output = find_mutation_frequency(patient_df, cancer_type, somatic_gene + somatic_suffix) + + # Assign values using ternary operators for cleaner code + OR, p_val, ci_OR_low, ci_OR_high = regression_output if regression_output else (pd.NA, pd.NA, pd.NA, pd.NA) + + filtered_germline_plp_frequency, filtered_germline_plp_count, filtered_germline_sample_size = ( + filtered_germline_output if filtered_germline_output else (pd.NA, pd.NA, pd.NA) + ) + + germline_plp_frequency, germline_plp_count, germline_sample_size = ( + germline_output if germline_output else (pd.NA, pd.NA, pd.NA) + ) + + somatic_plp_frequency, somatic_plp_count, somatic_sample_size = ( + somatic_output if somatic_output else (pd.NA, pd.NA, pd.NA) + ) + + filtered_somatic_plp_frequency, filtered_somatic_plp_count, filtered_somatic_sample_size = ( + filtered_somatic_output if filtered_somatic_output else (pd.NA, pd.NA, pd.NA) + ) + + if germline_context == "coding": + # Append the results to the list + results.append([ + cancer_type, germline_gene, germline_context, + somatic_gene, somatic_context, criteria, + germline_plp_frequency, germline_plp_count, germline_sample_size, + filtered_germline_plp_frequency, filtered_germline_plp_count, filtered_germline_sample_size, + somatic_plp_frequency, somatic_plp_count, somatic_sample_size, + filtered_somatic_plp_frequency, filtered_somatic_plp_count, filtered_somatic_sample_size, + OR, p_val, ci_OR_low, ci_OR_high, relevant_cancer + ]) + else: + # Append the results to the list + results.append([ + cancer_type, germline_snp, germline_gene, germline_context, + somatic_gene, somatic_context, criteria, + germline_plp_frequency, germline_plp_count, germline_sample_size, + filtered_germline_plp_frequency, filtered_germline_plp_count, filtered_germline_sample_size, + somatic_plp_frequency, somatic_plp_count, somatic_sample_size, + filtered_somatic_plp_frequency, filtered_somatic_plp_count, filtered_somatic_sample_size, + OR, p_val, ci_OR_low, ci_OR_high, relevant_cancer + ]) + + if germline_context == "noncoding" and somatic_context == "coding": + cohort_suffix = cohort + else: + cohort_suffix = "final" + + if germline_context == "coding": + # Convert the results to a DataFrame + output_df = pd.DataFrame(results, columns=[ + 'cancer_type', 'germline_gene', 'germline_context', + 'somatic_gene', 'somatic_context', 'criteria', + f'germline_plp_frequency_{cohort}', f'germline_plp_count_{cohort}', f'germline_sample_size_{cohort}', + f'filtered_germline_plp_frequency_{cohort}', f'filtered_germline_plp_count_{cohort}', 
f'filtered_germline_sample_size_{cohort}', + f'somatic_plp_frequency_{cohort}', f'somatic_plp_count_{cohort}', f'somatic_sample_size_{cohort}', + f'filtered_somatic_plp_frequency_{cohort}', f'filtered_somatic_plp_count_{cohort}', f'filtered_somatic_sample_size_{cohort}', + f'OR_{cohort_suffix}', f'p_val_{cohort_suffix}', f'ci_OR_low_{cohort_suffix}', f'ci_OR_high_{cohort_suffix}', 'relevant_cancer' + ]) + else: + # Convert the results to a DataFrame + output_df = pd.DataFrame(results, columns=[ + 'cancer_type', 'germline_risk_allele','germline_gene', 'germline_context', + 'somatic_gene', 'somatic_context', 'criteria', + f'germline_plp_frequency_{cohort}', f'germline_plp_count_{cohort}', f'germline_sample_size_{cohort}', + f'filtered_germline_plp_frequency_{cohort}', f'filtered_germline_plp_count_{cohort}', f'filtered_germline_sample_size_{cohort}', + f'somatic_plp_frequency_{cohort}', f'somatic_plp_count_{cohort}', f'somatic_sample_size_{cohort}', + f'filtered_somatic_plp_frequency_{cohort}', f'filtered_somatic_plp_count_{cohort}', f'filtered_somatic_sample_size_{cohort}', + f'OR_{cohort_suffix}', f'p_val_{cohort_suffix}', f'ci_OR_low_{cohort_suffix}', f'ci_OR_high_{cohort_suffix}', 'relevant_cancer' + ]) + + # Eliminate duplicate rows based on all columns + output_df = output_df.drop_duplicates() + + # Write the DataFrame to a TSV file + output_df.to_csv(output_file, sep='\t', index=False) + +def process_and_merge_profile_tables(variant_table_pathway, gene_table_pathway): + """ + Process variant and gene tables by filtering columns based on suffixes, merging them, + and adjusting '-s' columns based on '-g' column values. + + Parameters: + - variant_table: DataFrame containing variant data. + - gene_table: DataFrame containing gene data. + + Returns: + - DataFrame: The processed and merged table. 
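Review note: the '-s' adjustment this docstring describes — zeroing a somatic hit when the same patient already carries the germline event — is easiest to see on a toy frame. A minimal sketch (the gene name and values are invented; column suffixes follow the repo's '-g'/'-s' convention):

```python
import pandas as pd

df = pd.DataFrame({
    'patient_id': [1, 2, 3],
    'BRCA2-g':    [1, 0, 1],  # germline carrier flag
    'BRCA2-s':    [2, 1, 0],  # somatic hit count
})
# Zero somatic counts in germline carriers, as the merge step below does:
df.loc[(df['BRCA2-g'] == 1) & (df['BRCA2-s'] > 0), 'BRCA2-s'] = 0
print(df)  # patient 1's BRCA2-s becomes 0; patients 2 and 3 are unchanged
```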
+ """ + + variant_table = pd.read_csv(variant_table_pathway,sep='\t') + gene_table = pd.read_csv(gene_table_pathway,sep='\t') + + # Step 1: Process column names + # Remove '-s' and '-g' suffixes for comparison + variant_columns = [col for col in variant_table.columns if col != 'patient_id' and (col.endswith('-s') or col.endswith('-g'))] + gene_columns = [col for col in gene_table.columns if col != 'patient_id' and (col.endswith('-s') or col.endswith('-g'))] + + variant_base_columns = [col.rsplit('-', 1)[0] for col in variant_columns] + gene_base_columns = [col.rsplit('-', 1)[0] for col in gene_columns] + + # Step 2: Find common base columns + common_columns = list(set(variant_base_columns).intersection(set(gene_base_columns))) + + # Step 3: Create filtered_gene_columns and filter gene_table + filtered_gene_columns = [col + '-g' for col in common_columns] + ['patient_id'] + filtered_gene_table = gene_table[filtered_gene_columns] + + # Step 4: Merge filtered_gene_table with variant_table + merged_table = variant_table.merge(filtered_gene_table, on='patient_id', how='inner') + + # Step 5: Adjust '-s' columns based on '-g' columns + conversion_stats = [] # To store stats for each column + for col in common_columns: + s_col = col + '-s' + g_col = col + '-g' + if s_col in merged_table.columns and g_col in merged_table.columns: + # Get initial count of values > 0 in the s_col + initial_s_col_count = (merged_table[s_col] > 0).sum() + + # Perform the conversion + merged_table.loc[(merged_table[g_col] == 1) & (merged_table[s_col] > 0), s_col] = 0 + + # Get the count of values converted to 0 + converted_count = initial_s_col_count - (merged_table[s_col] > 0).sum() + + # Calculate percentage of converted values + if initial_s_col_count > 0: + conversion_percentage = (converted_count / initial_s_col_count) * 100 + else: + conversion_percentage = 0.0 + + # Append stats to the list + conversion_stats.append({ + "s_col": s_col, + "g_col": g_col, + "converted_count": converted_count, + "initial_s_col_count": initial_s_col_count, + "conversion_percentage": conversion_percentage + }) + + # Print stats for each column + for stats in conversion_stats: + print(f"Column: {stats['s_col']} | " + f"Converted: {stats['converted_count']} out of {stats['initial_s_col_count']} " + f"({stats['conversion_percentage']:.2f}%)") + + + merged_table.to_csv("profile_table.tsv",sep='\t',index=False) + return + diff --git a/docker/hail_rf/Dockerfile b/docker/hail_rf/Dockerfile new file mode 100644 index 0000000..9c3b7cf --- /dev/null +++ b/docker/hail_rf/Dockerfile @@ -0,0 +1,15 @@ +# The Germline Genomics of Cancer (G2C) +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 + +# Simple Dockerfile for vanallenlab/hail_rf + + +FROM hailgenetics/hail:0.2.127.post1-py3.11 +MAINTAINER "Noah Fields " + + +COPY apply_rf_model.py /opt/ + +# Launch bash at runtime +CMD ["/bin/bash"] diff --git a/docker/hail_rf/apply_rf_model.py b/docker/hail_rf/apply_rf_model.py new file mode 100644 index 0000000..8f41cf3 --- /dev/null +++ b/docker/hail_rf/apply_rf_model.py @@ -0,0 +1,640 @@ +# noqa: D100 +#Adopted From Broad Institute: https://broadinstitute.github.io/gnomad_methods/_modules/index.html + +import json +import os +import logging +import pprint +from pprint import pformat +from typing import Dict, List, Optional, Tuple + +import hail as hl +import pandas as pd +import pyspark.sql +from pyspark.ml import Pipeline +from pyspark.ml.classification import RandomForestClassifier +from pyspark.ml.feature import 
IndexToString, StringIndexer, VectorAssembler +from pyspark.ml.functions import vector_to_array +from pyspark.sql import SparkSession +from pyspark.sql.functions import col # pylint: disable=no-name-in-module + +#from gnomad.utils.file_utils import file_exists + +logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +def run_rf_test( + mt: hl.MatrixTable, output: str = "/tmp" +) -> Tuple[pyspark.ml.PipelineModel, hl.Table]: + """ + Run a dummy test RF on a given MT. + + 1. Creates row annotations and labels to run model on + 2. Trains a RF pipeline model (including median imputation of missing values in created annotations) + 3. Saves the RF pipeline model + 4. Applies the model to the MT and prints features importance + + :param mt: Input MT + :param output: Output files prefix to save the RF model + :return: RF model and MatrixTable after applying RF model + """ + mt = mt.annotate_rows( + feature1=hl.rand_bool(0.1), + feature2=hl.rand_norm(0.0, 1.0), + feature3=hl.or_missing(hl.rand_bool(0.5), hl.rand_norm(0.0, 1.0)), + ) + + mt = mt.annotate_rows( + label=hl.if_else(mt["feature1"] & (mt["feature2"] > 0), "TP", "FP") + ) + ht = mt.rows() + + def f3stats(ht): + return ht.aggregate( + hl.struct( + n=hl.agg.count_where(hl.is_defined(ht["feature3"])), + med=hl.median(hl.agg.collect(ht["feature3"])), + ) + ) + + f3_before_imputation = f3stats(ht) + logger.info("Feature3 defined values before imputation: %d", f3_before_imputation.n) + logger.info("Feature3 median: %f", f3_before_imputation.med) + + features_to_impute = ["feature3"] + quantiles = get_columns_quantiles(ht, features_to_impute, [0.5]) + quantiles = {k: v[0] for k, v in quantiles.items()} + + logger.info("Features median:\n%s", [f"{k}: {v}\n" for k, v in quantiles.items()]) + ht = ht.annotate(**{f: hl.or_else(ht[f], quantiles[f]) for f in features_to_impute}) + ht = ht.annotate_globals(medians=quantiles) + + f3_after_imputation = f3stats(ht) + logger.info("Feature3 defined values after imputation: %d", f3_after_imputation.n) + logger.info("Feature3 median: %f", f3_after_imputation.med) + + ht = ht.select("label", "feature1", "feature2", "feature3") + + label = "label" + features = ["feature1", "feature2", "feature3"] + + rf_model = train_rf(ht, features, label) + save_model(rf_model, out_path=output + "/rf.model", overwrite=True) + rf_model = load_model(output + "/rf.model") + + return rf_model, apply_rf_model(ht, rf_model, features, label) + + + +def check_ht_fields_for_spark(ht: hl.Table, fields: List[str]) -> None: + """ + Check specified fields of a hail table for Spark DataFrame conversion (type and name). + + :param ht: input Table + :param fields: Fields to test + :return: None + """ + allowed_types = [ + hl.tfloat, + hl.tfloat32, + hl.tfloat64, + hl.tint, + hl.tint32, + hl.tint64, + hl.tstr, + hl.tbool, + ] + + bad_field_names = [c for c in fields if "." in c] + + bad_types = [ + c[0] + for c in ht.key_by().select(*fields).row.items() + if c[1].dtype not in allowed_types + ] + + if bad_field_names or bad_types: + raise ValueError( + "Only basic type fields can be converted from Hail to Spark. 
In addition," + " `.` are not allowed in field names in Spark.\n" + f"Offending fields (non basic type): {bad_types}" + f"Offending fields (bad field name): {', '.join(bad_field_names)}\n" + ) + + return + + + +def get_columns_quantiles( + ht: hl.Table, fields: List[str], quantiles: List[float], relative_error: int = 0.001 +) -> Dict[str, List[float]]: + """ + Compute approximate quantiles of specified numeric fields from non-missing values. Non-numeric fields are ignored. + + This function returns a Dict of column name -> list of quantiles in the same order specified. + If a column only has NAs, None is returned. + + :param ht: input HT + :param fields: list of features to impute. If none given, all numerical features with missing data are imputed + :param quantiles: list of quantiles to return (e.g. [0.5] would return the median) + :param relative_error: The relative error on the quantile approximation + :return: Dict of column -> quantiles + """ + check_ht_fields_for_spark(ht, fields) + + df = ht.key_by().select(*fields).to_spark() + + res = {} + for f in fields: + logger.info("Computing median for column: %s", f) + col_no_na = df.select(f).dropna() + if col_no_na.first() is not None: + res[f] = col_no_na.approxQuantile(str(f), quantiles, relative_error) + else: + res[f] = None + + return res + + + +def median_impute_features( + ht: hl.Table, strata: Optional[Dict[str, hl.expr.Expression]] = None +) -> hl.Table: + """ + Numerical features in the Table are median-imputed by Hail's `approx_median`. + + If a `strata` dict is given, imputation is done based on the median of of each stratification. + + The annotations that are added to the Table are + - feature_imputed - A row annotation indicating if each numerical feature was imputed or not. + - features_median - A global annotation containing the median of the numerical features. If `strata` is given, + this struct will also be broken down by the given strata. + - variants_by_strata - An additional global annotation with the variant counts by strata that will only be + added if imputing by a given `strata`. + + :param ht: Table containing all samples and features for median imputation. + :param strata: Whether to impute features median by specific strata (default False). + :return: Feature Table imputed using approximate median values. 
+ """ + logger.info("Computing feature medians for imputation of missing numeric values") + numerical_features = [ + k for k, v in ht.row.dtype.items() if v == hl.tint or v == hl.tfloat + ] + + median_agg_expr = hl.struct( + **{feature: hl.agg.approx_median(ht[feature]) for feature in numerical_features} + ) + + if strata: + ht = ht.annotate_globals( + feature_medians=ht.aggregate( + hl.agg.group_by(hl.tuple([ht[x] for x in strata]), median_agg_expr), + _localize=False, + ), + variants_by_strata=ht.aggregate( + hl.agg.counter(hl.tuple([ht[x] for x in strata])), _localize=False + ), + ) + feature_median_expr = ht.feature_medians[hl.tuple([ht[x] for x in strata])] + logger.info( + "Variant count by strata:\n%s", + "\n".join( + [ + "{}: {}".format(k, v) + for k, v in hl.eval(ht.variants_by_strata).items() + ] + ), + ) + + else: + ht = ht.annotate_globals( + feature_medians=ht.aggregate(median_agg_expr, _localize=False) + ) + feature_median_expr = ht.feature_medians + + ht = ht.annotate( + **{f: hl.or_else(ht[f], feature_median_expr[f]) for f in numerical_features}, + feature_imputed=hl.struct( + **{f: hl.is_missing(ht[f]) for f in numerical_features} + ), + ) + + return ht + + + +def ht_to_rf_df( + ht: hl.Table, features: List[str], label: str = None, index: str = None +) -> pyspark.sql.DataFrame: + """ + Create a Spark dataframe ready for RF from a HT. + + Rows with any missing features are dropped. + Missing labels are replaced with 'NA' + + .. note:: + + Only basic types are supported! + + :param ht: Input HT + :param features: Features that will be used for RF + :param label: Optional label column that will be predicted by RF + :param index: Optional index column to keep (E.g. for joining results back at later stage) + :return: Spark Dataframe + """ + cols_to_keep = features[:] + + if label: + cols_to_keep.append(label) + if index: + cols_to_keep.append(index) + + df = ht.key_by().select(*cols_to_keep).to_spark() + df = df.dropna(subset=features) + + if label: + df = df.fillna("NA", subset=label) + + return df + + + +def get_features_importance( + rf_pipeline: pyspark.ml.PipelineModel, rf_index: int = -2, assembler_index: int = -3 +) -> Dict[str, float]: + """ + Extract the features importance from a Pipeline model containing a RandomForestClassifier stage. + + :param rf_pipeline: Input pipeline + :param rf_index: index of the RandomForestClassifier stage + :param assembler_index: index of the VectorAssembler stage + :return: feature importance for each feature in the RF model + """ + feature_names = [ + x[: -len("_indexed")] if x.endswith("_indexed") else x + for x in rf_pipeline.stages[assembler_index].getInputCols() + ] + + return dict(zip(feature_names, rf_pipeline.stages[rf_index].featureImportances)) + + + +def get_labels(rf_pipeline: pyspark.ml.PipelineModel) -> List[str]: + """ + Return the labels from the StringIndexer stage at index 0 from an RF pipeline model. + + :param rf_pipeline: Input pipeline + :return: labels + """ + return rf_pipeline.stages[0].labels + + + +def test_model( + ht: hl.Table, + rf_model: pyspark.ml.PipelineModel, + features: List[str], + label: str, + prediction_col_name: str = "rf_prediction", +) -> List[hl.tstruct]: + """ + A wrapper to test a model on a set of examples with known labels. 
+ + 1) Runs the model on the data + 2) Prints confusion matrix and accuracy + 3) Returns confusion matrix as a list of struct + + :param ht: Input table + :param rf_model: RF Model + :param features: Columns containing features that were used in the model + :param label: Column containing label to be predicted + :param prediction_col_name: Where to store the prediction + :return: A list containing structs with {label, prediction, n} + """ + ht = apply_rf_model( + ht.filter(hl.is_defined(ht[label])), + rf_model, + features, + label, + prediction_col_name=prediction_col_name, + ) + + test_results = ( + ht.group_by(ht[prediction_col_name], ht[label]) + .aggregate(n=hl.agg.count()) + .collect() + ) + + # Print results + df = pd.DataFrame(test_results) + df = df.pivot(index=label, columns=prediction_col_name, values="n") + logger.info("Testing results:\n%s", pprint.pformat(df)) + logger.info( + "Accuracy: %f", + sum([x.n for x in test_results if x[label] == x[prediction_col_name]]) + / sum([x.n for x in test_results]), + ) + + return test_results + + + +def apply_rf_model( + ht: hl.Table, + rf_model: pyspark.ml.PipelineModel, + features: List[str], + label: str = None, + probability_col_name: str = "rf_probability", + prediction_col_name: str = "rf_prediction", +) -> hl.Table: + """ + Apply a Random Forest (RF) pipeline model to a Table and annotate the RF probabilities and predictions. + + :param ht: Input HT + :param rf_model: Random Forest pipeline model + :param features: List of feature columns in the pipeline. !Should match the model list of features! + :param label: Optional column containing labels. !Should match the model labels! + :param probability_col_name: Name of the column that will store the RF probabilities + :param prediction_col_name: Name of the column that will store the RF predictions + :return: Table with RF columns + """ + logger.info("Applying RF model.") + + check_fields = features[:] + if label: + check_fields.append(label) + check_ht_fields_for_spark(ht, check_fields) + + index_name = "rf_idx" + while index_name in ht.row: + index_name += "_tmp" + ht = ht.add_index(name=index_name) + + ht_keys = ht.key + ht = ht.key_by(index_name) + + df = ht_to_rf_df(ht, features, label, index_name) + + rf_df = rf_model.transform(df) + prob_cols = get_labels(rf_model) + + rf_df = rf_df.withColumn("probability", vector_to_array("probability")).select( + [index_name, "predictedLabel"] + + [col("probability")[i] for i in range(len(prob_cols))] + ) + new_colnames = [index_name, "predictedLabel"] + prob_cols + rf_df = rf_df.toDF(*new_colnames) + + # Note: SparkSession is needed to write DF to disk before converting to HT; + # the resulting HT sometimes has missing and/or duplicate rows without + # the intermediate write. + spark = SparkSession.builder.getOrCreate() + rf_df.write.mode("overwrite").save("rf_probs.parquet") + rf_df = spark.read.parquet("rf_probs.parquet") + + rf_ht = hl.Table.from_spark(rf_df) + + rf_ht = rf_ht.key_by(index_name) + ht = ht.annotate( + **{ + probability_col_name: {c: rf_ht[ht[index_name]][c] for c in prob_cols}, + prediction_col_name: rf_ht[ht[index_name]]["predictedLabel"], + } + ) + + ht = ht.key_by(*ht_keys) + ht = ht.drop(index_name) + + return ht + + + +def save_model( + rf_pipeline: pyspark.ml.PipelineModel, out_path: str, overwrite: bool = False +) -> None: + """ + Save a Random Forest pipeline model. 
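+
+    Example (illustrative; the bucket path is hypothetical):
+
+        save_model(rf_model, out_path="gs://my-bucket/rf.model", overwrite=True)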
+ + :param rf_pipeline: Pipeline to save + :param out_path: Output path + :param overwrite: If set, will overwrite existing file(s) at output location + :return: Nothing + """ + logger.info("Saving model to %s", out_path) + if overwrite: + rf_pipeline.write().overwrite().save(out_path) + else: + rf_pipeline.save(out_path) + + + +def load_model(input_path: str) -> pyspark.ml.PipelineModel: + """ + Load a Random Forest pipeline model. + + :param input_path: Location of model to load + :return: Random Forest pipeline model + """ + logger.info("Loading model from %s", input_path) + return pyspark.ml.PipelineModel.load(input_path) + + + +def train_rf( + ht: hl.Table, + features: List[str], + label: str, + num_trees: int = 500, + max_depth: int = 5, +) -> pyspark.ml.PipelineModel: + """ + Train a Random Forest (RF) pipeline model. + + :param ht: Input HT + :param features: List of columns to be used as features + :param label: Column containing the label to predict + :param num_trees: Number of trees to use + :param max_depth: Maximum tree depth + :return: Random Forest pipeline model + """ + logger.info( + "Training RF model using:\nfeatures: %s\nlabels: %s\nnum_trees:" + " %d\nmax_depth: %d", + ",".join(features), + label, + num_trees, + max_depth, + ) + + check_ht_fields_for_spark(ht, features + [label]) + + df = ht_to_rf_df(ht, features, label) + + label_indexer = ( + StringIndexer(inputCol=label, outputCol=label + "_indexed") + .setHandleInvalid("keep") + .fit(df) + ) + labels = label_indexer.labels + logger.info("Found labels: %s", labels) + + string_features = [x[0] for x in df.dtypes if x[0] != label and x[1] == "string"] + if string_features: + logger.info("Indexing string features: %s", ",".join(string_features)) + string_features_indexers = [ + StringIndexer(inputCol=x, outputCol=x + "_indexed") + .setHandleInvalid("keep") + .fit(df) + for x in string_features + ] + + assembler = VectorAssembler( + inputCols=[ + x[0] + "_indexed" if x[1] == "string" else x[0] + for x in df.dtypes + if x[0] != label + ], + outputCol="features", + ) + + rf = RandomForestClassifier( + labelCol=label + "_indexed", + featuresCol="features", + maxDepth=max_depth, + numTrees=num_trees, + ) + + label_converter = IndexToString( + inputCol="prediction", outputCol="predictedLabel", labels=labels + ) + + pipeline = Pipeline( + stages=[label_indexer] + + string_features_indexers + + [assembler, rf, label_converter] + ) + + # Train model + logger.info("Training RF model") + rf_model = pipeline.fit(df) + + feature_importance = get_features_importance(rf_model) + + logger.info( + "RF features importance:\n%s", + "\n".join([f"{f}: {i}" for f, i in feature_importance.items()]), + ) + + return rf_model + + + +def get_rf_runs(rf_json_fp: str) -> Dict: + """ + Load RF run data from JSON file. + + :param rf_json_fp: File path to rf json file. + :return: Dictionary containing the content of the JSON file, or an empty dictionary if the file wasn't found. + """ + if file_exists(rf_json_fp): + with hl.hadoop_open(rf_json_fp) as f: + return json.load(f) + else: + logger.warning( + "File %s could not be found. Returning empty RF run hash dict.", rf_json_fp + ) + return {} + + + +def get_run_data( + input_args: Dict[str, bool], + test_intervals: List[str], + features_importance: Dict[str, float], + test_results: List[hl.tstruct],) -> Dict: + """ + Create a Dict containing information about the RF input arguments and feature importance. 
+ + :param Dict of bool keyed by str input_args: Dictionary of model input arguments + :param List of str test_intervals: Intervals withheld from training to be used in testing + :param Dict of float keyed by str features_importance: Feature importance returned by the RF + :param List of struct test_results: Accuracy results from applying RF model to the test intervals + :return: Dict of RF information + """ + run_data = { + "input_args": input_args, + "features_importance": features_importance, + "test_intervals": test_intervals, + } + + if test_results is not None: + tps = 0 + total = 0 + for row in test_results: + values = list(row.values()) + # Note: values[0] is the TP/FP label and values[1] is the prediction + if values[0] == values[1]: + tps += values[2] + total += values[2] + run_data["test_results"] = [dict(x) for x in test_results] + run_data["test_accuracy"] = tps / total + + return run_data + + + +def pretty_print_runs( + runs: Dict, label_col: str = "rf_label", prediction_col_name: str = "rf_prediction") -> None: + """ + Print the information for the RF runs loaded from the json file storing the RF run hashes -> info. + + :param runs: Dictionary containing JSON input loaded from RF run file + :param label_col: Name of the RF label column + :param prediction_col_name: Name of the RF prediction column + :return: Nothing -- only prints information + """ + for run_hash, run_data in runs.items(): + print(f"\n=== {run_hash} ===") + testing_results = ( + run_data.pop("test_results") if "test_results" in run_data else None + ) + # print(testing_results) + print(json.dumps(run_data, sort_keys=True, indent=4, separators=(",", ": "))) + if testing_results is not None: + # Print results + res_pd = pd.DataFrame(testing_results) + res_pd = res_pd.pivot( + index=label_col, columns=prediction_col_name, values="n" + ) + logger.info("Testing results:\n%s", pformat(res_pd)) + + +def file_exists(fname: str) -> bool: + """ + Check whether a file exists. + + Supports either local or Google cloud (gs://) paths. + If the file is a Hail file (.ht, .mt, .bm, .parquet, .he, and .vds extensions), it + checks that _SUCCESS is present. + + :param fname: File name. + :return: Whether the file exists. 
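+
+    Example (illustrative; the bucket path is hypothetical): file_exists("gs://my-bucket/table.ht")
+    returns True only if gs://my-bucket/table.ht/_SUCCESS exists.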
+ """ + fext = os.path.splitext(fname)[1] + if fext in {".ht", ".mt", ".bm", ".parquet", ".he"}: + paths = [f"{fname}/_SUCCESS"] + elif fext == ".vds": + paths = [f"{fname}/reference_data/_SUCCESS", f"{fname}/variant_data/_SUCCESS"] + else: + paths = [fname] + + if fname.startswith("gs://"): + exists_func = hl.hadoop_exists + else: + exists_func = os.path.isfile + + exists = all([exists_func(p) for p in paths]) + + return exist \ No newline at end of file diff --git a/docker/pydata_stack/.DS_Store b/docker/pydata_stack/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/docker/pydata_stack/.DS_Store differ diff --git a/docker/pydata_stack/Dockerfile b/docker/pydata_stack/Dockerfile new file mode 100644 index 0000000..a680921 --- /dev/null +++ b/docker/pydata_stack/Dockerfile @@ -0,0 +1,18 @@ +# The Germline Genomics of Cancer (G2C) +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 + +# Simple Dockerfile for python3 w/ pandas + +FROM ubuntu:latest +MAINTAINER "Noah Fields " + + +# Install python packages dependencies +RUN apt-get update && \ + apt-get install -y python3-numpy python3-pandas \ + python3-scipy python3-seaborn && \ + apt-get clean + +# Launch bash at runtime +CMD ["/bin/bash"] diff --git a/figures/.DS_Store b/figures/.DS_Store new file mode 100644 index 0000000..f57dff7 Binary files /dev/null and b/figures/.DS_Store differ diff --git a/figures/charr_figures/HQ_HET_RATE.png b/figures/charr_figures/HQ_HET_RATE.png new file mode 100644 index 0000000..d2aed28 Binary files /dev/null and b/figures/charr_figures/HQ_HET_RATE.png differ diff --git a/figures/charr_figures/INCONSISTENT_AB_HET_RATE.png b/figures/charr_figures/INCONSISTENT_AB_HET_RATE.png new file mode 100644 index 0000000..c262953 Binary files /dev/null and b/figures/charr_figures/INCONSISTENT_AB_HET_RATE.png differ diff --git a/figures/charr_figures/charr.png b/figures/charr_figures/charr.png new file mode 100644 index 0000000..30e2c67 Binary files /dev/null and b/figures/charr_figures/charr.png differ diff --git a/figures/g2c_figures/Everyone_demographics.png b/figures/g2c_figures/Everyone_demographics.png new file mode 100644 index 0000000..745c361 Binary files /dev/null and b/figures/g2c_figures/Everyone_demographics.png differ diff --git a/figures/g2c_figures/biome_demographics.png b/figures/g2c_figures/biome_demographics.png new file mode 100644 index 0000000..72aeb19 Binary files /dev/null and b/figures/g2c_figures/biome_demographics.png differ diff --git a/figures/g2c_figures/cptac_demographics.png b/figures/g2c_figures/cptac_demographics.png new file mode 100644 index 0000000..450fc73 Binary files /dev/null and b/figures/g2c_figures/cptac_demographics.png differ diff --git a/figures/g2c_figures/gtex_demographics.png b/figures/g2c_figures/gtex_demographics.png new file mode 100644 index 0000000..fa04ab2 Binary files /dev/null and b/figures/g2c_figures/gtex_demographics.png differ diff --git a/figures/g2c_figures/hgsvc_demographics.png b/figures/g2c_figures/hgsvc_demographics.png new file mode 100644 index 0000000..771da1d Binary files /dev/null and b/figures/g2c_figures/hgsvc_demographics.png differ diff --git a/figures/g2c_figures/hmf_demographics.png b/figures/g2c_figures/hmf_demographics.png new file mode 100644 index 0000000..83feea1 Binary files /dev/null and b/figures/g2c_figures/hmf_demographics.png differ diff --git a/figures/g2c_figures/icgc_demographics.png b/figures/g2c_figures/icgc_demographics.png new file mode 100644 index 0000000..a3dba38 
Binary files /dev/null and b/figures/g2c_figures/icgc_demographics.png differ diff --git a/figures/g2c_figures/lcins_demographics.png b/figures/g2c_figures/lcins_demographics.png new file mode 100644 index 0000000..1cff8b1 Binary files /dev/null and b/figures/g2c_figures/lcins_demographics.png differ diff --git a/figures/g2c_figures/mesa_demographics.png b/figures/g2c_figures/mesa_demographics.png new file mode 100644 index 0000000..098e3be Binary files /dev/null and b/figures/g2c_figures/mesa_demographics.png differ diff --git a/figures/g2c_figures/proactive-core_demographics.png b/figures/g2c_figures/proactive-core_demographics.png new file mode 100644 index 0000000..c75ead4 Binary files /dev/null and b/figures/g2c_figures/proactive-core_demographics.png differ diff --git a/figures/g2c_figures/proactive-other_demographics.png b/figures/g2c_figures/proactive-other_demographics.png new file mode 100644 index 0000000..c7bc613 Binary files /dev/null and b/figures/g2c_figures/proactive-other_demographics.png differ diff --git a/figures/g2c_figures/ufc_demographics.png b/figures/g2c_figures/ufc_demographics.png new file mode 100644 index 0000000..8f4b8aa Binary files /dev/null and b/figures/g2c_figures/ufc_demographics.png differ diff --git a/scripts/.DS_Store b/scripts/.DS_Store new file mode 100644 index 0000000..facc943 Binary files /dev/null and b/scripts/.DS_Store differ diff --git a/scripts/aou_ufc_helpers/.DS_Store b/scripts/aou_ufc_helpers/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/scripts/aou_ufc_helpers/.DS_Store differ diff --git a/scripts/aou_ufc_helpers/VALab_germline_somatic.tsv b/scripts/aou_ufc_helpers/VALab_germline_somatic.tsv new file mode 100644 index 0000000..7227b8f --- /dev/null +++ b/scripts/aou_ufc_helpers/VALab_germline_somatic.tsv @@ -0,0 +1,265 @@ +#cancer germline_gene germline_context somatic_gene somatic_context criteria +breast AKAP9 coding HRAS coding known_ppi +breast AKAP9 coding IKZF3 coding known_ppi +breast ARRDC3 noncoding SMARCD1 coding known_ppi +breast BARD1 coding BARD1 coding same_gene +breast BRCA1 coding BRCA1 coding same_gene +breast BRCA1 coding ESR1 coding known_ppi +breast BRCA1 coding JAK2 coding known_ppi +breast BRCA1 coding UBR5 coding known_ppi +breast BRCA2 coding BRCA2 coding same_gene +breast C2CD6 noncoding IKZF3 coding known_ppi +breast CBX8 noncoding BAP1 coding known_ppi;protein_complex +breast CBX8 noncoding H3C2 coding known_ppi +breast CCNE1 noncoding CCNE1 coding same_gene +breast CCNE1 noncoding FBXW7 coding known_ppi +breast CHEK2 noncoding RB1 coding known_ppi +breast COL1A2 noncoding SMARCD1 coding known_ppi +breast COL8A1 noncoding FBLN2 coding known_ppi +breast CUX1 noncoding CUX1 coding same_gene +breast ERBB4 noncoding ABL2 coding known_ppi +breast ERBB4 noncoding ERBB2 coding known_ppi +breast ERBB4 noncoding ERBB3 coding known_ppi +breast ERBB4 noncoding ERBB4 coding same_gene +breast ESRRG noncoding ATM coding known_ppi +breast FGF10 noncoding FGFR1 coding ligand_receptor +breast FGF10 noncoding FGFR2 coding ligand_receptor +breast GRHL2 noncoding ESR1 coding known_ppi +breast GRHL2 noncoding GATA3 coding known_ppi +breast GRHL2 noncoding TRPS1 noncoding known_ppi +breast H2BC6 noncoding BAP1 coding known_ppi +breast H2BC6 noncoding RAD51B noncoding known_ppi +breast H4C13 noncoding CREBBP coding known_ppi +breast H4C13 noncoding H3C2 coding known_ppi;protein_complex +breast H4C13 noncoding RAD51B noncoding known_ppi +breast ITPR1 noncoding AKT1 coding known_ppi +breast KLF4 
noncoding KLF4 coding same_gene +breast KLHL38 noncoding IKZF3 coding known_ppi +breast MAGI3 noncoding PTEN coding known_ppi +breast MAPT noncoding AKT1 coding known_ppi +breast NOTCH2 noncoding NOTCH2 coding same_gene +breast OTUD7B noncoding CASP8 coding known_ppi +breast PALB2 coding KEAP1 coding known_ppi +breast PDE4D noncoding UBR5 coding known_ppi +breast PHLDA3 coding NTRK1 coding known_ppi +breast PIDD1 noncoding KEAP1 coding known_ppi +breast PKLR noncoding TP53 coding known_ppi +breast PLCL2 noncoding HRAS coding known_ppi +breast RB1 coding MDM4 coding known_ppi +breast RB1 coding RB1 coding same_gene +breast RBBP8 noncoding BARD1 coding protein_complex +breast RBBP8 noncoding BRCA1 coding known_ppi;protein_complex +breast RBBP8 noncoding BRCA2 coding known_ppi +breast RBBP8 noncoding MAP2K4 coding known_ppi +breast RBBP8 noncoding RB1 coding known_ppi +breast RNF115 noncoding MDM4 coding known_ppi +breast RTKN2 noncoding CLTC coding known_ppi +breast TACC2 noncoding GNAS coding known_ppi +breast TBX3 noncoding TBX3 coding same_gene +breast TGFBR2 noncoding CDH1 coding known_ppi +breast TP53 coding CREBBP coding known_ppi +breast TP53 coding EP300 coding known_ppi +breast TP53 coding MDM4 coding known_ppi;protein_complex +breast TP53 coding TP53 coding same_gene +breast TP53 coding UBR5 coding known_ppi +breast XBP1 noncoding XBP1 noncoding same_gene +breast XPNPEP3 noncoding UBR5 coding known_ppi +breast XRCC6 noncoding NOTCH1 coding known_ppi +colorectal APC coding APC coding same_gene +colorectal APC coding AXIN1 coding protein_complex +colorectal APC coding CTNNB1 coding known_ppi +colorectal BMP2 noncoding ACVR2A coding ligand_receptor +colorectal BMP2 noncoding BMPR1A coding known_ppi;ligand_receptor +colorectal BMP4 noncoding ACVR2A coding ligand_receptor +colorectal BMP4 noncoding BMPR1A coding known_ppi;ligand_receptor +colorectal BMP5 noncoding ACVR2A coding ligand_receptor +colorectal BMP5 noncoding BMPR1A coding ligand_receptor +colorectal BMP7 noncoding ACVR2A coding ligand_receptor +colorectal BMP7 noncoding BMPR1A coding ligand_receptor +colorectal COL4A2 noncoding ITGAV coding ligand_receptor +colorectal EIF3H noncoding ARHGEF10 coding known_ppi +colorectal EIF3H noncoding CTNND1 coding known_ppi +colorectal EIF3H noncoding EIF3E coding known_ppi;protein_complex +colorectal EIF3H noncoding TP53 coding known_ppi +colorectal EIF3H noncoding UBR5 coding known_ppi +colorectal FHL3 noncoding RAD21 coding known_ppi +colorectal GNA12 noncoding PPP2R1A coding known_ppi +colorectal HLA-G noncoding B2M coding known_ppi +colorectal HLA-G noncoding HLA-A coding known_ppi +colorectal KLF5 noncoding SMAD2 coding known_ppi +colorectal KLF5 noncoding SMAD4 coding known_ppi +colorectal LAMA5 noncoding ITGAV coding ligand_receptor +colorectal LAMC1 noncoding ITGAV coding ligand_receptor +colorectal LPAR1 noncoding KRAS coding known_ppi +colorectal LRP1 noncoding GRIN2A coding known_ppi +colorectal MLH1 coding MLH1 coding same_gene +colorectal MLH1 noncoding MLH1 coding same_gene +colorectal MSH2 coding MSH2 coding same_gene +colorectal MSH6 coding MSH6 coding same_gene +colorectal MYNN coding CDH1 coding known_ppi +colorectal MYNN coding CHD4 coding known_ppi +colorectal PKM noncoding GNAS coding known_ppi +colorectal PKM noncoding PCBP1 coding known_ppi +colorectal POLD1 coding POLD1 coding same_gene +colorectal POLD3 noncoding POLD1 coding known_ppi;protein_complex +colorectal RFC3 noncoding RAD17 coding protein_complex +colorectal RPL3 noncoding SMAD2 coding known_ppi 
+colorectal RPL3 noncoding SMAD4 coding known_ppi +colorectal RUNX2 noncoding SOX9 coding known_ppi +colorectal SH2B3 coding EGFR coding known_ppi +colorectal SH2B3 coding ERBB2 coding known_ppi +colorectal SH2B3 coding ERBB3 coding known_ppi +colorectal SMAD3 noncoding PARP4 coding known_ppi +colorectal SMAD3 noncoding PIK3R1 coding known_ppi +colorectal SMAD3 noncoding SMAD2 coding known_ppi;protein_complex +colorectal SMAD3 noncoding SMAD3 coding same_gene +colorectal SMAD3 noncoding SMAD4 coding known_ppi;protein_complex +colorectal SMAD3 noncoding TGIF1 coding known_ppi +colorectal SMAD9 noncoding ARID1B coding known_ppi +colorectal SMAD9 noncoding BAZ1A coding known_ppi +colorectal SMAD9 noncoding EIF3E coding known_ppi +colorectal SMAD9 noncoding MGAT1 noncoding known_ppi +colorectal SMAD9 noncoding SMAD2 coding known_ppi +colorectal SMAD9 noncoding SMAD4 coding known_ppi +colorectal SMC1B noncoding RAD21 coding protein_complex +colorectal TBX3 noncoding TBX3 coding same_gene +colorectal TCF7L2 noncoding CTNNB1 coding known_ppi +colorectal TCF7L2 noncoding TCF7L2 coding same_gene +colorectal TERT noncoding AKT1 coding known_ppi +colorectal TERT noncoding CTNNB1 coding known_ppi +colorectal TERT noncoding SMARCA4 coding known_ppi +colorectal TGFBR2 coding CDH1 coding known_ppi +colorectal TGFBR2 coding ETS2 noncoding known_ppi +colorectal TGFBR2 coding ITGAV coding known_ppi +colorectal TGFBR2 coding TGFBR2 coding same_gene +colorectal TNS3 noncoding ERBB2 coding known_ppi +colorectal TNS3 noncoding ERBB3 coding known_ppi +colorectal VTI1A noncoding VTI1A coding same_gene +colorectal ZNRF3 noncoding ZNRF3 coding same_gene +lung APOM noncoding KLF6 noncoding known_ppi +lung ATM coding ATM coding same_gene +lung BAZ1A noncoding BAZ1A coding same_gene +lung CHEK2 coding PER3 coding known_ppi +lung COPS2 noncoding CREBBP coding known_ppi +lung COPS2 noncoding CUL3 coding known_ppi +lung EGFR coding EGFR coding same_gene +lung EGFR coding ERBB2 coding known_ppi +lung EGFR coding ESR1 coding known_ppi +lung EGFR coding JAK2 coding known_ppi +lung EGFR coding PIK3R1 coding known_ppi +lung EGFR coding RASA1 coding known_ppi +lung FGF7 noncoding FGFR2 coding ligand_receptor +lung FGF7 noncoding FGFR2 noncoding ligand_receptor +lung H4C8 noncoding CREBBP coding known_ppi +lung HLA-C noncoding B2M coding known_ppi +lung HLA-C noncoding B2M noncoding known_ppi +lung HLA-C noncoding HLA-A coding known_ppi +lung HLA-C noncoding RNF213 coding known_ppi +lung HLA-DQA1 noncoding CD74 coding known_ppi +lung HLA-DRA noncoding CD74 coding known_ppi +lung HLA-G noncoding B2M coding known_ppi +lung HLA-G noncoding B2M noncoding known_ppi +lung HLA-G noncoding HLA-A coding known_ppi +lung MORF4L1 noncoding TRRAP coding protein_complex +lung NR1H4 noncoding ESR1 coding known_ppi +lung NRG1 noncoding ERBB2 coding known_ppi;ligand_receptor +lung NRG1 noncoding ERBB4 coding ligand_receptor +lung NRG1 noncoding NRG1 coding same_gene +lung PDS5B noncoding BRCA2 coding known_ppi +lung PDS5B noncoding RAD21 coding known_ppi +lung PPP1R18 coding TSC1 coding known_ppi +lung RB1 coding RB1 coding same_gene +lung SLK noncoding KEAP1 coding known_ppi +lung TNXB coding SDC4 coding ligand_receptor +lung VARS2 noncoding TP53 coding known_ppi +prostate ACVR2A noncoding ACVR2A coding same_gene +prostate ACVR2A noncoding BMP6 coding ligand_receptor +prostate ADAMTSL4 noncoding AKT1 coding known_ppi +prostate ADAMTSL4 noncoding RUNX1T1 coding known_ppi +prostate ATM coding ATM coding same_gene +prostate BMP7 noncoding ACVR2A 
coding ligand_receptor +prostate CASP8 noncoding CUL3 coding known_ppi +prostate CASP8 noncoding RAF1 coding known_ppi +prostate CDKN1B coding CDKN1B coding same_gene +prostate CDKN1B coding RHOA coding known_ppi +prostate CHEK2 coding RB1 coding known_ppi +prostate DPM1 noncoding ZBTB16 coding known_ppi +prostate ESR2 noncoding CASZ1 coding known_ppi +prostate FERMT2 noncoding CTNNB1 coding known_ppi +prostate FOXP1 noncoding FOXP1 coding same_gene +prostate GDF5 noncoding ACVR2A coding ligand_receptor +prostate GDF7 noncoding ACVR2A coding ligand_receptor +prostate GRB10 noncoding NRAS coding known_ppi +prostate HAUS6 noncoding GNAS coding known_ppi +prostate HIBADH noncoding TFRC coding known_ppi +prostate HLA-C noncoding RNF213 coding known_ppi +prostate IGF1R noncoding CTNNB1 coding known_ppi +prostate INHBB coding ACVR2A coding ligand_receptor +prostate INHBB noncoding ACVR2A coding ligand_receptor +prostate KLK2 noncoding KLK2 coding same_gene +prostate KLK3 noncoding KLK3 noncoding same_gene +prostate KNL1 noncoding EPS15 coding known_ppi +prostate LMNA noncoding KLF6 coding known_ppi +prostate LMNA noncoding RBFOX1 coding known_ppi +prostate MAP2K1 noncoding RAF1 coding known_ppi +prostate MAP3K1 noncoding BRAF coding known_ppi +prostate MDM4 noncoding RB1 coding known_ppi +prostate MYEOV noncoding MYEOV coding same_gene +prostate MYO6 noncoding CYLD coding known_ppi +prostate MYO16 noncoding PIK3R1 coding known_ppi +prostate NEDD8 noncoding CUL3 coding known_ppi +prostate NEDD8 noncoding CYLD coding known_ppi +prostate NRIP1 noncoding NRIP1 noncoding same_gene +prostate NUF2 noncoding DDX5 coding known_ppi +prostate NUF2 noncoding EPS15 coding known_ppi +prostate OTX1 noncoding FBLN2 coding known_ppi +prostate PPP1R7 noncoding HRAS coding known_ppi +prostate PPP2R2A noncoding LRP1B coding known_ppi +prostate PTEN coding PTEN coding same_gene +prostate RASSF3 noncoding TFRC coding known_ppi +prostate RNF43 noncoding RNF43 coding same_gene +prostate SKIL noncoding SMAD4 coding known_ppi +prostate SLC22A2 noncoding ETV5 coding known_ppi +prostate SMAD2 noncoding SMAD4 coding known_ppi;protein_complex +prostate SPEN noncoding SPEN coding same_gene +prostate SPOP coding CUL3 coding known_ppi +prostate SPOP coding SPOP coding same_gene +prostate STXBP1 noncoding NCOR1 coding known_ppi +prostate TCF7L2 noncoding CTNNB1 coding known_ppi +prostate TERT noncoding AKT1 coding known_ppi +prostate TERT noncoding CTNNB1 coding known_ppi +prostate TMPRSS2 coding TMPRSS2 coding same_gene +prostate TMPRSS2 coding TMPRSS2 noncoding same_gene +prostate TMPRSS2 noncoding TMPRSS2 coding same_gene +prostate TMPRSS2 noncoding TMPRSS2 noncoding same_gene +prostate TNFSF10 noncoding CUL3 coding known_ppi +prostate TNS3 noncoding ERBB3 coding known_ppi +prostate TOR1A noncoding CTNNB1 coding known_ppi +prostate TOR1A noncoding PIK3R1 coding known_ppi +prostate TOR1A noncoding PRDM1 coding known_ppi +prostate TP53 noncoding BRCA2 coding known_ppi +prostate TP53 noncoding TP53 coding same_gene +prostate TP53 noncoding XPO1 coding known_ppi +prostate UBE3A noncoding IDH1 coding known_ppi +prostate VEGFA noncoding U2AF1 coding known_ppi +prostate WBP11 noncoding DDX5 coding known_ppi +prostate WBP11 noncoding RUNX1T1 coding known_ppi +prostate ZBTB42 coding ZBTB16 coding known_ppi +prostate ZFHX3 noncoding ZFHX3 coding same_gene +renal ATM coding ATM coding same_gene +renal BAP1 coding ASXL2 coding known_ppi +renal BAP1 coding BAP1 coding same_gene +renal BAP1 coding NF1 coding known_ppi +renal DPF3 noncoding 
ARID1A coding protein_complex +renal DPF3 noncoding ARID1B coding protein_complex +renal DPF3 noncoding SMARCA4 coding protein_complex +renal FAF1 noncoding NUP214 coding known_ppi +renal MET coding MET coding same_gene +renal SCARB1 noncoding PLCG1 coding known_ppi +renal TSC1 coding TSC1 coding same_gene +renal TSC2 coding TSC2 coding same_gene +renal VHL coding CDKN2A coding known_ppi +renal VHL coding HIF1A coding known_ppi +renal VHL coding NF2 coding known_ppi +renal VHL coding PCBP1 coding known_ppi +renal VHL coding VHL coding same_gene diff --git a/scripts/aou_ufc_helpers/Write_JointGenotyping_Part2_Json.py b/scripts/aou_ufc_helpers/Write_JointGenotyping_Part2_Json.py new file mode 100644 index 0000000..90f5f03 --- /dev/null +++ b/scripts/aou_ufc_helpers/Write_JointGenotyping_Part2_Json.py @@ -0,0 +1,86 @@ +import os +import subprocess +import json + +sample_name_map="gs://fc-secure-d531c052-7b41-4dea-9e1d-22e648f6e228/ufc_jg/jg_part1/dfci-ufc.sample_map" + +# Construct the base path for the Google Cloud Storage bucket +bucket_path = os.getenv('WORKSPACE_BUCKET') + +# Function to list paths using gsutil and construct arrays +def process_directory(dir_path,suffix): + command = f"gsutil ls {bucket_path}/GnarlyJointGenotypingPart1-Output/{callset_name}/{dir_path}/*{suffix}" + paths = subprocess.check_output(command, shell=True).decode('utf-8').strip().split('\n') + return paths + +# List all directories under ${bucket_path}/GnarlyJointGenotypingPart1-Output/ +command = f"gsutil ls -d {bucket_path}/GnarlyJointGenotypingPart1-Output/*/" +callset_names = subprocess.check_output(command, shell=True).decode('utf-8').strip().split('\n') +callset_names = [name.split('/')[-2] for name in callset_names] + +# Instantiate Arrays +output_sites_only_vcf = [] +output_variant_filtered_vcf = [] +output_sites_only_vcf_idx = [] +output_variant_filtered_vcf_idx = [] + +# Loop through each callset_name +for callset_name in callset_names: + # Process each directory and populate respective arrays + sites_only_vcf = process_directory("sites_only_vcf",".vcf.gz") + variant_filtered_vcf = process_directory("variant_filtered_vcf",".vcf.gz") + + # Process each directory and populate respective arrays + sites_only_vcf_idx = process_directory("sites_only_vcf",".tbi") + variant_filtered_vcf_idx = process_directory("variant_filtered_vcf",".tbi") + + # Append paths to respective output lists + output_sites_only_vcf.append(sites_only_vcf) + output_variant_filtered_vcf.append(variant_filtered_vcf) + output_sites_only_vcf_idx.append(sites_only_vcf_idx) + output_variant_filtered_vcf_idx.append(variant_filtered_vcf_idx) +# JSON file path +output_json_file = "GnarlyJointGenotypingPart2.json" + +# Construct JSON content +json_content = { + "GnarlyJointGenotypingPart2.ApplyRecalibration.gatk_docker": "broadinstitute/gatk:4.4.0.0", + "GnarlyJointGenotypingPart2.CollectMetricsOnFullVcf.gatk_docker": "broadinstitute/gatk:4.4.0.0", + "GnarlyJointGenotypingPart2.CollectMetricsSharded.gatk-publ_docker": "broadinstitute/gatk:4.4.0.0", + "GnarlyJointGenotypingPart2.FinalGatherVcf.gatk_docker": "broadinstitute/gatk:4.4.0.0", + "GnarlyJointGenotypingPart2.GatherVariantCallingMetrics.gatk_docker": "broadinstitute/gatk:4.4.0.0", + "GnarlyJointGenotypingPart2.IndelsVariantRecalibrator.gatk_docker": "broadinstitute/gatk:4.4.0.0", + "GnarlyJointGenotypingPart2.SNPGatherTranches.gatk_docker": "broadinstitute/gatk:4.4.0.0", + "GnarlyJointGenotypingPart2.SNPsVariantRecalibratorCreateModel.gatk_docker": "broadinstitute/gatk:4.4.0.0", + 
"GnarlyJointGenotypingPart2.SNPsVariantRecalibratorScattered.gatk_docker": "broadinstitute/gatk:4.4.0.0", + "GnarlyJointGenotypingPart2.SitesOnlyGatherVcf.gatk_docker": "broadinstitute/gatk:4.4.0.0", + "GnarlyJointGenotypingPart2.axiomPoly_resource_vcf": "gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz", + "GnarlyJointGenotypingPart2.axiomPoly_resource_vcf_index": "gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz.tbi", + "GnarlyJointGenotypingPart2.callset_name": "dfci-ufc", + "GnarlyJointGenotypingPart2.dbsnp_resource_vcf": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf", + "GnarlyJointGenotypingPart2.dbsnp_resource_vcf_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx", + "GnarlyJointGenotypingPart2.eval_interval_list": "gs://gcp-public-data--broad-references/hg38/v0/wgs_evaluation_regions.hg38.interval_list", + "GnarlyJointGenotypingPart2.hapmap_resource_vcf": "gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz", + "GnarlyJointGenotypingPart2.hapmap_resource_vcf_index": "gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz.tbi", + "GnarlyJointGenotypingPart2.mills_resource_vcf": "gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", + "GnarlyJointGenotypingPart2.mills_resource_vcf_index": "gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi", + "GnarlyJointGenotypingPart2.omni_resource_vcf": "gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz", + "GnarlyJointGenotypingPart2.omni_resource_vcf_index": "gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz.tbi", + "GnarlyJointGenotypingPart2.one_thousand_genomes_resource_vcf": "gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz", + "GnarlyJointGenotypingPart2.one_thousand_genomes_resource_vcf_index": "gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi", + "GnarlyJointGenotypingPart2.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "GnarlyJointGenotypingPart2.sites_only_vcfs_index_nested": output_sites_only_vcf_idx, + "GnarlyJointGenotypingPart2.sites_only_vcfs_nested": output_sites_only_vcf, + "GnarlyJointGenotypingPart2.variant_filtered_vcfs_index_nested": output_variant_filtered_vcf_idx, + "GnarlyJointGenotypingPart2.variant_filtered_vcfs_nested": output_variant_filtered_vcf, + "GnarlyJointGenotypingPart2.sample_name_map": sample_name_map +} + +# Write JSON content to file without prettifying +with open(output_json_file, 'w') as f: + json.dump(json_content, f, separators=(',', ':')) + + # Add a newline character at the end of the file + f.write('\n') + +print(f"Output written to {output_json_file}") \ No newline at end of file diff --git a/scripts/aou_ufc_helpers/creating_gene_regions_of_interest_hg38 copy.sh b/scripts/aou_ufc_helpers/creating_gene_regions_of_interest_hg38 copy.sh new file mode 100644 index 0000000..c69ab51 --- /dev/null +++ b/scripts/aou_ufc_helpers/creating_gene_regions_of_interest_hg38 copy.sh @@ -0,0 +1,41 @@ +# Example code for creating gene regions of interest and subsetting AoU VCF + +# Given the following input files: +# 1. list (flat .txt file) of gene symbols of relevance: riaz_genes.list +# 2. 
list (flat .txt file) of sample IDs to retain: keep_samples.list +# 3. VCF of all variants from AoU: aou.vcf.gz (must also have .tbi index present locally) + +# The below can all be run on your local machine from the terminal + +# Download MANE GTF: +wget https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.3/MANE.GRCh38.v1.3.ensembl_genomic.gtf.gz + +# Subset MANE to genes of interest +zcat MANE.GRCh38.v1.3.ensembl_genomic.gtf.gz \ +| fgrep -wf riaz_genes.list \ +| bgzip -c \ +> MANE.GRCh38.v1.3.ensembl_genomic.subsetted.gtf.gz + +# Extract coordinates from subsetted GTF, pad by ±2kb to be conservative, and use BEDTools to merge these coordinates into unique regions +zcat MANE.GRCh38.v1.3.ensembl_genomic.subsetted.gtf.gz \ +| awk -v FS="\t" -v OFS="\t" -v buffer=2000 '{ print $1, $4-buffer, $5+buffer }' \ +| sort -Vk1,1 -k2,2n -k3,3n \ +| bedtools merge -i - \ +| bgzip -c \ +> riaz_genes.coordinates.bed.gz + +# Copy gene coordinates to your AoU workspace bucket +# You can find this by running "echo $WORKSPACE_BUCKET" within the AoU workbench terminal +#gsutil cp riaz_genes.coordinates.bed.gz gs://my_workspace_bucket/ + +# Subset variants to minimal set of interest for our purposes +# Note that this must be run in the AoU workspace terminal, not on your local machine +bcftools view \ + -O z -o aou.subsetted.vcf.gz \ + --regions-file riaz_genes.coordinates.bed.gz \ + --samples-file keep_samples.list \ + --min-ac 1 \ + aou.vcf.gz + +# Index output VCF for future queries +tabix -p vcf -f aou.subsetted.vcf.gz \ No newline at end of file diff --git a/scripts/aou_ufc_helpers/germline_CPGs_7_31_23.txt b/scripts/aou_ufc_helpers/germline_CPGs_7_31_23.txt new file mode 100644 index 0000000..f633707 --- /dev/null +++ b/scripts/aou_ufc_helpers/germline_CPGs_7_31_23.txt @@ -0,0 +1,149 @@ +value +ABCB11 +ACD +AIP +ALK +APC +ATM +ATR +AXIN2 +BAP1 +BARD1 +BLM +BMPR1A +BRAF +BRCA1 +BRCA2 +BRIP1 +BUB1B +CBL +CDC73 +CDH1 +CDK4 +CDKN1B +CDKN1C +CDKN2A +CDKN2B +CEBPA +CHEK2 +CYLD +DDB2 +DDX41 +DICER1 +DIS3L2 +DKC1 +EGFR +EPCAM +ERCC1 +ERCC2 +ERCC3 +ERCC4 +ERCC5 +ETV6 +EXT1 +EXT2 +FAH +FANCA +FANCC +FANCD2 +FANCE +FANCF +FANCG +FANCI +FANCL +FANCM +FH +FLCN +GATA2 +GPC3 +HFE +HMBS +HNF1A +HRAS +KIT +KRAS +LZTR1 +MAP2K1 +MAP2K2 +MAX +MEN1 +MET +MITF +MLH1 +MPL +MSH2 +MSH6 +MTAP +MUTYH +NBN +NF1 +NF2 +NHP2 +NRAS +NTHL1 +PALB2 +PDGFRA +PHOX2B +PMS2 +POLD1 +POLE +POLH +POT1 +PRF1 +PRKAR1A +PTCH1 +PTCH2 +PTEN +PTPN11 +RAD51C +RAD51D +RAF1 +RB1 +RECQL +RECQL4 +RET +RFWD3 +RHBDF2 +RTEL1 +RUNX1 +SBDS +SDHA +SDHAF2 +SDHB +SDHC +SDHD +SETBP1 +SH2D1A +SLC25A13 +SLX4 +SMAD4 +SMARCA4 +SMARCB1 +SMARCE1 +SOS1 +SPRTN +SRP72 +STAT3 +STK11 +SUFU +TERT +TGFBR1 +TINF2 +TMEM127 +TP53 +TRIM37 +TSC1 +TSC2 +TSHR +UROD +VHL +WRN +WT1 +XPA +XPC +XRCC3 +GEN1 +RAD51 +RAD54L +RNF43 +UBE2T diff --git a/scripts/aou_ufc_helpers/matching.py b/scripts/aou_ufc_helpers/matching.py new file mode 100644 index 0000000..884d6a3 --- /dev/null +++ b/scripts/aou_ufc_helpers/matching.py @@ -0,0 +1,110 @@ +import pandas as pd + +# Define file paths +control_file = 'superset_control_demographics.tsv' +case_file = 'case_demographics.tsv' +output_file = 'control_demographics_interim.tsv' + +# Read case and control files +control_df = pd.read_csv(control_file, sep='\t', index_col=0) +case_df = pd.read_csv(case_file, sep='\t', index_col=0) + +# Sort control and case data +sorted_control = control_df.sort_values(by=['race', 'ethnicity', 'year_of_birth', 'sex_at_birth']) +sorted_case = case_df.sort_values(by=['race', 'ethnicity', 'year_of_birth', 
'sex_at_birth'])
+
+# Initialize matched controls dictionary
+matched_controls = {}
+print(sorted_control.shape, sorted_case.shape)
+
+# Exact Matching
+for idx, case_row in sorted_case.iterrows():
+    match = sorted_control[
+        (sorted_control['race'] == case_row['race']) &
+        (sorted_control['ethnicity'] == case_row['ethnicity']) &
+        (sorted_control['year_of_birth'] == case_row['year_of_birth']) &
+        (sorted_control['sex_at_birth'] == case_row['sex_at_birth'])
+    ].head(1)
+    if not match.empty:
+        control_idx = match.index[0]
+        matched_controls[idx] = control_idx  # Case is key and control is value
+        sorted_control = sorted_control.drop(control_idx)  # Drops control value from controls df
+        sorted_case = sorted_case.drop(idx)
+
+#This is all four variables of interest.
+for bandwidth in range(1, 20):
+    for idx, case_row in sorted_case.iterrows():
+        #print(f"Bandwidth: {bandwidth}")
+        match = sorted_control[
+            (sorted_control['race'] == case_row['race']) &
+            (sorted_control['ethnicity'] == case_row['ethnicity']) &
+            (sorted_control['year_of_birth'].between(case_row['year_of_birth'] - bandwidth, case_row['year_of_birth'] + bandwidth)) &
+            (sorted_control['sex_at_birth'] == case_row['sex_at_birth'])
+        ].head(1)
+        if not match.empty:
+            control_idx = match.index[0]
+            matched_controls[idx] = control_idx  # Case is key and control is value
+            sorted_control = sorted_control.drop(control_idx)  # Drops control value from controls df
+            sorted_case = sorted_case.drop(idx)
+
+#This is just for race, age, and sex. We exclude ethnicity because that is often tied to race.
+for bandwidth in range(0, 20):
+    for idx, case_row in sorted_case.iterrows():
+        #print(f"Bandwidth: {bandwidth}")
+        match = sorted_control[
+            (sorted_control['race'] == case_row['race']) &
+            (sorted_control['year_of_birth'].between(case_row['year_of_birth'] - bandwidth, case_row['year_of_birth'] + bandwidth)) &
+            (sorted_control['sex_at_birth'] == case_row['sex_at_birth'])
+        ].head(1)
+        if not match.empty:
+            control_idx = match.index[0]
+            matched_controls[idx] = control_idx  # Case is key and control is value
+            sorted_control = sorted_control.drop(control_idx)  # Drops control value from controls df
+            sorted_case = sorted_case.drop(idx)
+
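+# Each remaining pass repeats the same select/drop pattern with a looser match.
+# A minimal refactor sketch (illustrative only; `greedy_match` is not used
+# anywhere in this script) of how one pass could be expressed:
+def greedy_match(mask_fn, bandwidths=range(0, 20)):
+    global sorted_control, sorted_case
+    for bandwidth in bandwidths:
+        for idx, case_row in sorted_case.iterrows():
+            match = sorted_control[mask_fn(case_row, bandwidth)].head(1)
+            if not match.empty:
+                control_idx = match.index[0]
+                matched_controls[idx] = control_idx  # Case is key and control is value
+                sorted_control = sorted_control.drop(control_idx)
+                sorted_case = sorted_case.drop(idx)
+
+#This is just for race and age. We are putting race ahead of sex for matching.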
+for bandwidth in range(0,20): + for idx, case_row in sorted_case.iterrows(): + match = sorted_control[ + (sorted_control['race'] == case_row['race']) & + (sorted_control['year_of_birth'].between(case_row['year_of_birth'] - bandwidth, case_row['year_of_birth'] + bandwidth)) + ].head(1) + if not match.empty: + control_idx = match.index[0] + matched_controls[idx] = control_idx #Case is key and control is value + sorted_control = sorted_control.drop(control_idx) #Drops control value from controls df + sorted_case = sorted_case.drop(idx) + +#This is just for sex and age once race and ethnicity have been used up +for bandwidth in range(0,20): + for idx, case_row in sorted_case.iterrows(): + match = sorted_control[ + (sorted_control['sex_at_birth'] == case_row['sex_at_birth']) & + (sorted_control['year_of_birth'].between(case_row['year_of_birth'] - bandwidth, case_row['year_of_birth'] + bandwidth)) + ].head(1) + if not match.empty: + control_idx = match.index[0] + matched_controls[idx] = control_idx #Case is key and control is value + sorted_control = sorted_control.drop(control_idx) #Drops control value from controls df + sorted_case = sorted_case.drop(idx) + +#We are only looking for year of birth as a last resort +#This is useful for when all of the other fields are missing +for bandwidth in range(0,20): + for idx, case_row in sorted_case.iterrows(): + match = sorted_control[(sorted_control['year_of_birth'].between(case_row['year_of_birth'] - bandwidth, case_row['year_of_birth'] +bandwidth)) + ].head(1) + if not match.empty: + control_idx = match.index[0] + matched_controls[idx] = control_idx #Case is key and control is value + sorted_control = sorted_control.drop(control_idx) #Drops control value from controls df + sorted_case = sorted_case.drop(idx) + + + + + +sorted_control.to_csv('unmatched.tsv',sep='\t') +sorted_case.to_csv('unmatchables.tsv',sep='\t') +print(len(sorted_case)) + +# Convert dictionary to DataFrame +df = pd.DataFrame(list(matched_controls.items()), columns=['person_id_case', 'person_id_control']) + +# Save DataFrame to TSV file +output_file = 'matched_controls.tsv' +df.to_csv(output_file, sep='\t', index=False) + +print("Exact matching done. 
Interim results saved to:", output_file) diff --git a/scripts/g2c_helpers/.DS_Store b/scripts/g2c_helpers/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/scripts/g2c_helpers/.DS_Store differ diff --git a/scripts/g2c_helpers/g2c_charr.py b/scripts/g2c_helpers/g2c_charr.py new file mode 100644 index 0000000..a81e105 --- /dev/null +++ b/scripts/g2c_helpers/g2c_charr.py @@ -0,0 +1,164 @@ +# The Germline Genomics of Cancer (G2C) +# Copyright (c) 2023-Present, Noah Fields and the Dana-Farber Cancer Institute +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 + +import pandas as pd +import gcsfs +from tqdm import tqdm +import matplotlib.pyplot as plt +import numpy as np +import random + + +cohorts = ['aou','biome','mesa','ufc','gtex','cptac','hgsvc','icgc','hmf','lcins','proactive-core','proactive-other'] +ancestries = ['European', 'Latin-American-1','Latin-American-2','South-Asian','Asian-Pacific-Islander','East-Asian','African', 'African-American','Unknown','Other'] +bar_colors = ['blue', 'red', 'pink', 'purple', 'darkgreen', 'lightgreen', 'yellow', 'orange', 'turquoise', 'brown'] + + +# Dictionary to keep track of statistics +charr_arr = [] +charr_reblocked_arr = [] +charr_dict = dict() +variable_of_interest = 'CHARR' #'INCONSISTENT_AB_HET_RATE' # + +def grab_charr_files(): + def list_charr_files(base_path, cohort): + # Create a GCS file system instance + gcs_filesystem = gcsfs.GCSFileSystem(project='terra-446ad1f7') # Replace with your actual project ID + + # Construct the path pattern for the specified cohort + cohort_path = f"{base_path}/{cohort}/charr/*" + + # List all files matching the pattern + charr_files = gcs_filesystem.glob(cohort_path) + + return charr_files + + # Specify the base path in the "gs://dfci-g2c-inputs" file system and the cohort + base_path = 'gs://dfci-g2c-inputs' + + charr_files = [] + for cohort in cohorts: + # Get a list of file pathways that have 'demographics' in them for the specified cohort + charr_files.extend([f"gs://{file}" for file in list_charr_files(base_path, cohort)]) + + return charr_files + +def grab_charr_reblocked_files(): + path = '/Users/noah/Desktop/ssi.tsv' + df = pd.read_csv(path,sep='\t') + charr_reblocked_files = list(df['contamination_reblocked']) + + return charr_reblocked_files + +# Function to process each row +def process_file(charr_file): + df = pd.read_csv(charr_file,sep='\t') + return float(df[variable_of_interest].iloc[0]) + +def process_cohorts(): + charr_files = grab_charr_files() + cohort_dict = dict() + cohort_dict['mesa'] = [file for file in charr_files if 'mesa' in file] + cohort_dict['ceph'] = [file for file in charr_files if 'ceph' in file] + cohort_dict['gtex'] = [file for file in charr_files if 'gtex' in file] + cohort_dict['icgc'] = [file for file in charr_files if 'icgc' in file] + cohort_dict['lcins'] = [file for file in charr_files if 'lcins' in file] + cohort_dict['hmf'] = [file for file in charr_files if 'hmf' in file] + cohort_dict['ufc'] = [file for file in charr_files if 'ufc' in file] + cohort_dict['cptac'] = [file for file in charr_files if 'cptac' in file] + cohort_dict['hgsvc'] = [file for file in charr_files if 'hgsvc' in file] + cohort_dict['biome'] = [file for file in charr_files if 'biome' in file] + cohort_dict['proactive-core'] = [file for file in charr_files if 'proactive-core' in file] + cohort_dict['proactive-other'] = [file for file in charr_files if 'proactive-other' in file] + + + + for key in cohort_dict: + for file in cohort_dict[key]: + 
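+            # Lazily create this cohort's list on the first file seen. Cohorts
+            # that match no files (e.g. 'ceph', which is absent from the
+            # `cohorts` list used by grab_charr_files) never get a key in
+            # charr_dict, so make_figure() assumes every plotted cohort had
+            # at least one CHARR file.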
if key not in charr_dict: + charr_dict[key]=[] + charr_dict[key].append(process_file(file)) + +def make_figure(): + def box_plot(data,position, label): + plt.boxplot(data,positions=[position], labels=[label]) + + + box_plot(charr_dict['mesa'],1,"MESA") + box_plot(charr_dict['ceph'],2,"CEPH") + box_plot(charr_dict['gtex'],3,"GTEX") + box_plot(charr_dict['icgc'],4,"ICGC") + box_plot(charr_dict['lcins'],5,"LCINS") + box_plot(charr_dict['hmf'],6,"HMF") + box_plot(charr_dict['ufc'],7,"UFC") + box_plot(charr_dict['cptac'],8,"CPTAC") + box_plot(charr_dict['hgsvc'],9,"HGSVC") + box_plot(charr_dict['biome'],10,"BIOME") + box_plot(charr_dict['proactive-core'],11,"P-CORE") + box_plot(charr_dict['proactive-other'],12,"P-OTHER") + + # Show the plot + # Add labels and title + plt.xlabel('Cohort') + plt.ylabel('CHARR') + plt.title(f'Box Plot of {variable_of_interest} values across Cohorts') + + # Add dotted lines and labels + if variable_of_interest == "CHARR": + plt.axhline(y=0.02, linestyle='--', color='gray', label='Subtle Contamination') + plt.axhline(y=0.03, linestyle='--', color='red', label='Suggestive of Contamination') + plt.legend() + if variable_of_interest == "INCONSISTENT_AB_HET_RATE": + plt.axhline(y=0.1, linestyle='--', color='gray', label='Warning') + plt.axhline(y=0.15, linestyle='--', color='red', label='Fail') + plt.legend() + # Create a wider figure + #plt.figure(figsize=(10, 6)) # Adjust width as needed + plt.savefig(f"../../figures/charr_figures/{variable_of_interest}.png") + +def compare_reblocked_raw(): + # Function to process each row + def process_file_compare(charr_file): + df = pd.read_csv(charr_file,sep='\t') + return float(df['CHARR'].iloc[0]) + def list_charr_files(base_path): + # Create a GCS file system instance + gcs_filesystem = gcsfs.GCSFileSystem(project='terra-446ad1f7') # Replace with your actual project ID + + # Construct the path pattern for the specified cohort + original_path = f"{base_path}/charr_original/*" + reblocked_path = f"{base_path}/charr/*" + + # List all files matching the pattern + original_files = sorted(gcs_filesystem.glob(original_path)) + reblocked_files = [file.replace('charr_original','charr') for file in original_files] + + return original_files,reblocked_files + + # Specify the base path in the "gs://dfci-g2c-inputs" file system and the cohort + base_path = 'gs://dfci-g2c-inputs/mesa' + original_files,reblocked_files = list_charr_files(base_path) + + coordinates = [] + for i in range(len(original_files)): + coordinates.append((process_file_compare("gs://" + original_files[i]),process_file_compare("gs://" + reblocked_files[i]))) + + # Extract the first and second integers from each tuple into separate arrays + x = np.array([t[0] for t in coordinates]) + y = np.array([t[1] for t in coordinates]) + + correlation_coefficient = np.corrcoef(x, y)[0,1] + r_squared = correlation_coefficient ** 2 + print(r_squared) + + + +if __name__ == '__main__': + #compare_reblocked_raw() + process_cohorts() + make_figure() + # main() + + diff --git a/scripts/g2c_helpers/g2c_demographics.py b/scripts/g2c_helpers/g2c_demographics.py new file mode 100644 index 0000000..3f41bae --- /dev/null +++ b/scripts/g2c_helpers/g2c_demographics.py @@ -0,0 +1,184 @@ +# The Germline Genomics of Cancer (G2C) +# Copyright (c) 2023-Present, Noah Fields and the Dana-Farber Cancer Institute +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 + +import pandas as pd +import gcsfs +from tqdm import tqdm +import matplotlib.pyplot as plt +import numpy as np +import 
random + + +cohorts = ['aou','biome','mesa','ufc','gtex','cptac','hgsvc','icgc','hmf','lcins','proactive-core','proactive-other'] +ancestries = ['European', 'Latin-American-1','Latin-American-2','South-Asian','Asian-Pacific-Islander','East-Asian','African', 'African-American','Unknown','Other'] +bar_colors = ['blue', 'red', 'pink', 'purple', 'darkgreen', 'lightgreen', 'yellow', 'orange', 'turquoise', 'brown'] + + +# Dictionary to keep track of statistics +cohort_stats = {} +cohort_sex_stats = {} +overall_stats = {} +overall_sex_stats = {} +cohort_count = {} +total_count = 0 + +def grab_demographics_files(): + def list_demographics_files(base_path, cohort): + # Create a GCS file system instance + gcs_filesystem = gcsfs.GCSFileSystem(project='terra-446ad1f7') # Replace with your actual project ID + + # Construct the path pattern for the specified cohort + cohort_path = f"{base_path}/{cohort}/demographics/*" + + # List all files matching the pattern + demographics_files = gcs_filesystem.glob(cohort_path) + + return demographics_files + + # Specify the base path in the "gs://dfci-g2c-inputs" file system and the cohort + base_path = 'gs://dfci-g2c-inputs' + + demographics_files = [] + for cohort in cohorts: + # Get a list of file pathways that have 'demographics' in them for the specified cohort + demographics_files.extend([f"gs://{file}" for file in list_demographics_files(base_path, cohort)]) + + return demographics_files + +# Function to process each row +def process_file(demo_file): + global total_count + temp_df = pd.read_csv(demo_file,sep='\t') + # Extract required columns + temp_df = temp_df[['ancestry', 'chrX_count', 'chrY_count']] + cohort = demo_file.split('/')[-3] + cohort_count[cohort] = cohort_count.get(cohort, 0) + 1 + total_count += 1 + # Update cohort-specific statistics + if cohort not in cohort_stats: + cohort_stats[cohort] = {} + cohort_sex_stats[cohort] = {} + + for _, temp_row in temp_df.iterrows(): + ancestry = temp_row['ancestry'] + chrX_count = int(temp_row['chrX_count']) + chrY_count = int(temp_row['chrY_count']) + sex = None + if chrX_count == 1 and chrY_count == 1: + sex = "Male" + elif chrX_count == 2 and chrY_count == 0: + sex = "Female" + else: + sex = "Aneuploidy" + + # Update cohort-specific statistics + if ancestry not in cohort_stats[cohort]: + cohort_stats[cohort][ancestry] = 0 + if sex not in cohort_sex_stats[cohort]: + cohort_sex_stats[cohort][sex] = 0 + + if ancestry not in overall_stats: + overall_stats[ancestry] = 0 + if sex not in overall_sex_stats: + overall_sex_stats[sex] = 0 + + + cohort_stats[cohort][ancestry] += 1 + cohort_sex_stats[cohort][sex] += 1 + overall_stats[ancestry] += 1 + overall_sex_stats[sex] += 1 + + + + +# Loop through the rows of the DataFrame +for demo_file in tqdm(grab_demographics_files()): + process_file(demo_file) + + +with open('g2c_demographics.txt', 'w') as file: + # Write cohort-specific statistics + file.write("=" * 30 + "\n") + for cohort, stats in cohort_stats.items(): + file.write(f"Cohort: {cohort}\n") + + for ancestry in cohort_stats[cohort]: + file.write(f"{ancestry}: {cohort_stats[cohort][ancestry]}\n") + file.write("- " * 30 + "\n") + for sex in cohort_sex_stats[cohort]: + file.write(f"{sex}: {cohort_sex_stats[cohort][sex]}\n") + file.write(f"Total: {cohort_count[cohort]}\n") + file.write("=" * 30 + "\n") + + # Write overall statistics + file.write("Overall Statistics\n") + for ancestry in overall_stats: + file.write(f"{ancestry}: {overall_stats[ancestry]}\n") + file.write("- " * 30 + "\n") + + # Write overall sex 
statistics + for sex in overall_sex_stats: + file.write(f"{sex}: {overall_sex_stats[sex]}\n") + + file.write(f"Total: {total_count}\n") + + +def create_pie_chart(cohort,cohort_stats): + values = list(cohort_stats[cohort].values()) + keys = list(cohort_stats[cohort].keys()) + + #labels = [f"{key}: {value}" for key, value in zip(keys, values)] + + # Calculate the total count for percentage calculation + total_count = sum(values) + + plt.pie(values,labels=keys,colors = bar_colors, autopct=lambda p: f'{p:.1f}%\n') + plt.title(cohort, fontdict={'fontsize': 16, 'fontweight': 'bold'}) + plt.savefig(f"../../figures/g2c_figures/{cohort}_demographics.png") + plt.clf() + +def create_total_pie_chart(overall_stats): + values = list(overall_stats.values()) + keys = list(overall_stats.keys()) + + + # Calculate the total count for percentage calculation + total_count = sum(values) + + plt.pie(values,labels=keys,colors = bar_colors, autopct=lambda p: f'{p:.1f}%\n') + plt.title("Everyone", fontdict={'fontsize': 16, 'fontweight': 'bold'}) + plt.savefig(f"../../figures/g2c_figures/Everyone_demographics.png") + plt.clf() + + +# # Create subplots +# fig, ax = plt.subplots(figsize=(14, 8)) + +# Create bar graphs for each cohort and total +for cohort in cohort_stats: + #print(cohort) + if cohort not in cohort_stats.keys(): + continue + #create_bar_graph(ax, cohort, cohort_stats, bar_width=0.2) + create_pie_chart(cohort,cohort_stats) +create_total_pie_chart(overall_stats) + +# # Add labels and title +# ax.set_xlabel('Ancestry Groups') +# ax.set_ylabel('Percentage') +# ax.set_title('Ancestry Distribution Across Cohorts') +# ax.set_xticks(np.arange(len(ancestries)) + 0.2) +# ax.set_xticklabels(ancestries) +# ax.legend() + +# # Save the plot to 'g2c_demographics.png' +# plt.savefig('g2c_demographics.png') + +# # Show the plot +# plt.show() + + + + diff --git a/scripts/g2c_helpers/g2c_demographics.txt b/scripts/g2c_helpers/g2c_demographics.txt new file mode 100644 index 0000000..0021491 --- /dev/null +++ b/scripts/g2c_helpers/g2c_demographics.txt @@ -0,0 +1,187 @@ +============================== +Cohort: biome +East-Asian: 107 +Latin-American-1: 539 +European: 1875 +African-American: 146 +Latin-American-2: 41 +Other: 58 +Asian-Pacific-Islander: 22 +South-Asian: 38 +Unknown: 13 +African: 3 +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Male: 1331 +Female: 1501 +Aneuploidy: 10 +Total: 2842 +============================== +Cohort: mesa +European: 1583 +Latin-American-1: 209 +Latin-American-2: 299 +East-Asian: 111 +African-American: 1004 +Asian-Pacific-Islander: 5 +Other: 44 +Unknown: 4 +African: 2 +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Male: 1519 +Female: 1738 +Aneuploidy: 4 +Total: 3261 +============================== +Cohort: ufc +European: 42 +Latin-American-1: 5 +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Male: 30 +Female: 17 +Total: 47 +============================== +Cohort: gtex +African-American: 113 +European: 733 +East-Asian: 7 +Latin-American-1: 13 +Latin-American-2: 13 +African: 1 +South-Asian: 3 +Other: 6 +Asian-Pacific-Islander: 3 +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Female: 302 +Male: 583 +Aneuploidy: 7 +Total: 892 +============================== +Cohort: cptac +East-Asian: 202 +European: 851 +Latin-American-2: 32 +South-Asian: 7 +Other: 2 +Unknown: 13 +Asian-Pacific-Islander: 7 +Latin-American-1: 12 +African-American: 19 +African: 1 +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Male: 602 +Female: 533 
+Aneuploidy: 11 +Total: 1146 +============================== +Cohort: hgsvc +European: 133 +Unknown: 7 +Latin-American-2: 46 +East-Asian: 60 +Latin-American-1: 11 +Other: 3 +African-American: 93 +African: 32 +South-Asian: 68 +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Male: 226 +Female: 224 +Aneuploidy: 3 +Total: 453 +============================== +Cohort: icgc +Latin-American-1: 17 +European: 1066 +African-American: 14 +South-Asian: 25 +Latin-American-2: 7 +Asian-Pacific-Islander: 5 +East-Asian: 221 +Unknown: 12 +Other: 1 +African: 6 +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Female: 446 +Male: 908 +Aneuploidy: 20 +Total: 1374 +============================== +Cohort: hmf +European: 4114 +Unknown: 86 +African-American: 36 +Latin-American-1: 91 +Asian-Pacific-Islander: 65 +East-Asian: 33 +Other: 22 +Latin-American-2: 14 +South-Asian: 19 +African: 3 +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Male: 2122 +Female: 2321 +Aneuploidy: 40 +Total: 4483 +============================== +Cohort: lcins +European: 213 +Latin-American-1: 9 +African-American: 4 +Latin-American-2: 1 +East-Asian: 3 +Asian-Pacific-Islander: 2 +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Female: 175 +Male: 56 +Aneuploidy: 1 +Total: 232 +============================== +Cohort: proactive-core +Latin-American-1: 78 +European: 1004 +Asian-Pacific-Islander: 8 +Other: 6 +African-American: 35 +East-Asian: 28 +Latin-American-2: 18 +South-Asian: 14 +Unknown: 6 +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Male: 488 +Female: 708 +Aneuploidy: 1 +Total: 1197 +============================== +Cohort: proactive-other +Latin-American-1: 32 +European: 252 +East-Asian: 29 +African: 1 +African-American: 20 +Latin-American-2: 17 +South-Asian: 5 +Asian-Pacific-Islander: 5 +Unknown: 3 +Other: 4 +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Female: 227 +Male: 140 +Aneuploidy: 1 +Total: 368 +============================== +Overall Statistics +East-Asian: 801 +Latin-American-1: 1016 +European: 11866 +African-American: 1484 +Latin-American-2: 488 +Other: 146 +Asian-Pacific-Islander: 122 +South-Asian: 179 +Unknown: 144 +African: 49 +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Male: 8005 +Female: 8192 +Aneuploidy: 98 +Total: 16295 diff --git a/scripts/g2c_helpers/json_converter.py b/scripts/g2c_helpers/json_converter.py new file mode 100644 index 0000000..53f57a3 --- /dev/null +++ b/scripts/g2c_helpers/json_converter.py @@ -0,0 +1,96 @@ +# The Germline Genomics of Cancer (G2C) +# Copyright (c) 2023-Present, Noah Fields and the Dana-Farber Cancer Institute +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 + +# The purpose of this script is to convert between json +# code from terra, put it into human readable form (that can be edited if necessary), +# and then convert to a json file that is ideal for AllofUs workbench. +# If you are starting from the original terra json, use the flag '-t', +# otherwise run w/ no flags. +import json +import argparse +import os +import re + +# The goal of this function is to turn terra json +# files into a human readable json w/ the suffix .hread.json +# that can be easily edited. 
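+# For reference, a minimal sketch of the same round trip using only the
+# json stdlib (assumes the Terra export is already valid JSON with no
+# ${...} placeholders or stringified booleans/arrays -- the string
+# handling below exists precisely because Terra configs break those
+# assumptions; pretty_print_json is an illustrative name, not used here):
+#
+#   import json
+#   def pretty_print_json(in_path, out_path):
+#       with open(in_path) as fh:
+#           cfg = json.load(fh)           # parse the raw export
+#       with open(out_path, 'w') as fh:
+#           json.dump(cfg, fh, indent=4)  # one key per line, hand-editable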
+def terra_to_readable_json(input_file, output_file):
+    line = open(input_file).readline().strip()[1:-1]
+    line = line.replace('\":\"','\" : \"')
+    line = re.sub(r'\${(.*?)}', r'\1', line)
+    line = line.replace('\"true\"','true')
+    line = line.replace('\"false\"','false')
+    line = line.replace('\"[','[').replace(']\"',']')
+
+    arr = re.split(r',(?![^\[]*\])', line)
+    arr = [s for s in arr if '\"\"' not in s]
+    arr = [s for s in arr if '[]' not in s]
+
+    with open(output_file, 'w') as f:
+        f.write('{\n')
+        for i, val in enumerate(arr):
+            val = re.sub(r'"(\d+(\.\d+)?)",?', r'\1', val) # turns quoted numbers into floats or ints if applicable
+            val = val.replace('\\',"")
+            # compare by index, not value, so a duplicated value cannot
+            # drop the comma before the true last entry
+            if i != len(arr) - 1:
+                f.write(f"\t{val},\n")
+            else:
+                f.write(f"\t{val}\n")
+
+        f.write('}')
+
+    print(f"Output written to: {output_file}")
+
+# This function takes in an input file and
+# writes to the directory of the output_file.
+# This takes a human readable json file (normal json)
+# and converts to a json file used by AllofUs workbench.
+def readable_to_aou_json(input_file, output_file):
+    try:
+        # Open the input file for reading
+        with open(input_file, 'r') as input_file_handle:
+            # Read the content from the input file
+            content = input_file_handle.read()
+
+        # Remove '\n' and '\t' characters from the content
+        content_without_newline_tabs = content.replace('\n', '').replace('\t', '')
+
+        # Open the output file for writing
+        with open(output_file, 'w') as output_file_handle:
+            # Write the modified content to the output file
+            output_file_handle.write(content_without_newline_tabs)
+
+        print(f"Conversion successful. Output written to {output_file}")
+
+    except Exception as e:
+        print(f"Error: {e}")
+
+# Example usage:
+# readable_to_aou_json("input.txt", "output.txt")
+
+def main():
+    parser = argparse.ArgumentParser(description='Convert JSON file keys to separate lines.')
+    parser.add_argument('input_file', help='Input JSON file')
+    parser.add_argument('-t', '--terra', action='store_true', help='If present, run both functions')
+
+    args = parser.parse_args()
+
+    if args.terra:
+        terra_to_readable_json(args.input_file, args.input_file.replace('.json','.hread.json'))
+        readable_to_aou_json(args.input_file.replace('.json','.hread.json'), args.input_file.replace('.json','.out.json'))
+    else:
+        if '.hread.json' in args.input_file:
+            readable_to_aou_json(args.input_file, args.input_file.replace('.hread.json','.out.json'))
+        else:
+            readable_to_aou_json(args.input_file, args.input_file.replace('.json','.out.json'))
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/g2c_helpers/process_cohort.sh b/scripts/g2c_helpers/process_cohort.sh
new file mode 100644
index 0000000..6a1156b
--- /dev/null
+++ b/scripts/g2c_helpers/process_cohort.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+while getopts "c:" flag; do
+  case $flag in
+    c) cohort=$OPTARG ;;
+    *) echo "Usage: $0 -c <cohort>"; exit 1 ;;
+  esac
+done
+
+if [ -z "$cohort" ]; then
+  echo "Cohort name is required. Use -c flag."
+ exit 1 +fi + +# Download the sample list file and store it as an array +samples=() +while IFS= read -r sample; do + samples+=("$sample") +done < <(gsutil cat "gs://dfci-g2c-inputs/sample-lists/${cohort}.samples.list") +output_file="${cohort}_consolidated_data.tsv" +echo -e "Sample\tCohort\tgrafpop_SNPs\tgrafpop_GD1\tgrafpop_GD2\tgrafpop_GD3\tpct_EUR\tpct_AFR\tpct_ASN\tchrX_count\tchrY_count\tancestry\tHQ_HOM\tHQ_HOM_RATE\tHQ_HET\tHQ_HET_RATE\tCHARR\tMEAN_REF_AB_HOM_ALT\tHETEROZYGOSITY_RATE\tINCONSISTENT_AB_HET_RATE\trd_median\trd_mean\tmanta_DEL_count\tmanta_DUP_count\tmanta_INS_count\tmanta_INV_count\tmanta_BND_count\tmelt_INS_count\twham_DEL_count\twham_DUP_count" > $output_file +# Process each sample +for sample in "${samples[@]}"; do + # Get the string output from the specified file + charr_output=$(gsutil cat "gs://dfci-g2c-inputs/${cohort}/charr/${sample}.txt" | cut -f2-9 | tail -n 1) + + demographics_output=$(gsutil cat "gs://dfci-g2c-inputs/${cohort}/demographics/${sample}.txt" | cut -f2-5,7-9,13-15 | sed -n '2p') + rd_median=$(gsutil cat "gs://dfci-g2c-inputs/${cohort}/gatk-sv/metrics/${sample}.raw-counts.tsv" | grep 'rd_q50' | cut -f2) + rd_mean=$(gsutil cat "gs://dfci-g2c-inputs/${cohort}/gatk-sv/metrics/${sample}.raw-counts.tsv" | grep 'rd_mean' | cut -f2) + + manta_DEL_count=$(gsutil cat "gs://dfci-g2c-inputs/${cohort}/gatk-sv/metrics/manta_${sample}.vcf.tsv" | grep 'vcf_DEL_count' | cut -f2) + manta_DUP_count=$(gsutil cat "gs://dfci-g2c-inputs/${cohort}/gatk-sv/metrics/manta_${sample}.vcf.tsv" | grep 'vcf_DUP_count' | cut -f2) + manta_INS_count=$(gsutil cat "gs://dfci-g2c-inputs/${cohort}/gatk-sv/metrics/manta_${sample}.vcf.tsv" | grep 'vcf_INS_count' | cut -f2) + manta_INV_count=$(gsutil cat "gs://dfci-g2c-inputs/${cohort}/gatk-sv/metrics/manta_${sample}.vcf.tsv" | grep 'vcf_INV_count' | cut -f2) + manta_BND_count=$(gsutil cat "gs://dfci-g2c-inputs/${cohort}/gatk-sv/metrics/manta_${sample}.vcf.tsv" | grep 'vcf_BND_count' | cut -f2) + + melt_INS_count=$(gsutil cat "gs://dfci-g2c-inputs/${cohort}/gatk-sv/metrics/melt_${sample}.vcf.tsv" | grep 'vcf_INS_count' | cut -f2) + wham_DEL_count=$(gsutil cat "gs://dfci-g2c-inputs/${cohort}/gatk-sv/metrics/wham_${sample}.vcf.tsv" | grep 'vcf_DEL_count' |cut -f2) + wham_DUP_count=$(gsutil cat "gs://dfci-g2c-inputs/${cohort}/gatk-sv/metrics/wham_${sample}.vcf.tsv" | grep 'vcf_DUP_count' |cut -f2) + # Append the string output to the sample name and print to test_output.txt + echo -e "${sample}\t${cohort}\t${demographics_output}\t${charr_output}\t${rd_median}\t${rd_mean}\t${manta_DEL_count}\t${manta_DUP_count}\t${manta_INS_count}\t${manta_INV_count}\t${manta_BND_count}\t${melt_INS_count}\t${wham_DEL_count}\t${wham_DUP_count}" >> $output_file +done + +echo "Processing complete. Results saved in ${output_file}." + diff --git a/scripts/gsc_helpers/.DS_Store b/scripts/gsc_helpers/.DS_Store new file mode 100644 index 0000000..87d18f3 Binary files /dev/null and b/scripts/gsc_helpers/.DS_Store differ diff --git a/scripts/gsc_helpers/Standardize_Hmf_Germline_Vcfs.wdl b/scripts/gsc_helpers/Standardize_Hmf_Germline_Vcfs.wdl new file mode 100644 index 0000000..85d5e5d --- /dev/null +++ b/scripts/gsc_helpers/Standardize_Hmf_Germline_Vcfs.wdl @@ -0,0 +1,67 @@ +# The Germline Genomics of Cancer (G2C) +# Copyright (c) 2023-Present, Noah Fields and the Dana-Farber Cancer Institute +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 + +# This code will standardize a HMF vcf. 
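+# For local spot checks, the same standardization can be approximated with
+# a plain bcftools pipeline (a sketch, not part of the workflow; assumes
+# bcftools >= 1.13 so that +fill-tags supports FORMAT/VAF, and FORMAT/AD
+# is kept here because +fill-tags derives VAF from it):
+#
+#   bcftools annotate -x INFO in.vcf.gz -Ou \
+#     | bcftools norm -m - -Ou \
+#     | bcftools filter -i 'AC > 0 & FILTER == "PASS"' -Oz -o tmp.vcf.gz
+#   bcftools +fill-tags tmp.vcf.gz -Oz -o out.vcf.gz -- -t FORMAT/VAF
+#   bcftools index -t out.vcf.gz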
+ + +version 1.0 + +task standardize{ + input { + File unstandardized_vcf + String id + } + Int default_disk_size = ceil((2 * size(unstandardized_vcf, "GB")) + 10) + + command <<< + # Step 1: Remove annotations (INFO fields) + bcftools view -h ~{unstandardized_vcf} | tail -n 1 | cut -f10 > sample_name.txt + bcftools annotate -x FORMAT/AD,INFO ~{unstandardized_vcf} -o temp.vcf.gz + + # Step 2: Keep only samples listed in sample_name.txt + bcftools view -S sample_name.txt temp.vcf.gz -o filtered_samples.vcf.gz + + # Step 3: Normalize the VCF file + bcftools norm -m - filtered_samples.vcf.gz -o normalized.vcf.gz + + # Step 4: Filter variants where 'AC > 0' and the filter passes QC + bcftools filter -i 'AC > 0 & FILTER == "PASS"' normalized.vcf.gz -o allele_count.vcf.gz + + # Step 5: + bcftools +fill-tags allele_count.vcf.gz -o ~{id}.vcf.gz -- -t FORMAT/VAF + + # Step 6: Index the vcf + bcftools index -t -o ~{id}.vcf.gz.tbi ~{id}.vcf.gz + >>> + + runtime { + docker: "vanallenlab/bcftools" + disks: "local-disk " + default_disk_size + " HDD" + } + output{ + File vcf = "~{id}.vcf.gz" + File vcf_idx = "~{id}.vcf.gz.tbi" + } +} + + + + +# Define workflow +workflow standardize_hmf { + input{ + File unstandardized_vcf + String id + } + call standardize { + input: + id = id, + unstandardized_vcf = unstandardized_vcf + } + output{ + File standardized_vcf = standardize.vcf + File standardized_vcf_idx = standardize.vcf_idx + } +} \ No newline at end of file diff --git a/scripts/gsc_helpers/annotate_noncoding_snps.VALab_germline_somatic_table.py b/scripts/gsc_helpers/annotate_noncoding_snps.VALab_germline_somatic_table.py new file mode 100644 index 0000000..42d5208 --- /dev/null +++ b/scripts/gsc_helpers/annotate_noncoding_snps.VALab_germline_somatic_table.py @@ -0,0 +1,71 @@ +# Read nc_c.map and create a mapping of germline genes to list of (chr:pos, risk_allele) tuples +gene_map = {} +with open("nc_c.map", "r") as map_file: + for line in map_file: + parts = line.strip().split("\t") + if len(parts) == 3: + chrom_pos, risk_allele, gene = parts + if gene in gene_map: + gene_map[gene].append((chrom_pos, risk_allele)) + else: + gene_map[gene] = [(chrom_pos, risk_allele)] + +# Read VALab_germline_somatic.tsv, filter rows, and create output +output = [] +with open("VALab_germline_somatic.tsv", "r") as input_file: + # Skip header + next(input_file) + for line in input_file: + parts = line.strip().split("\t") + if len(parts) >= 6 and parts[2] == "noncoding" and parts[4] == "coding": + cancer_type, germline_gene, _, somatic_gene, _, criteria = parts + if germline_gene in gene_map: + chrom_pos_risk_alleles = gene_map[germline_gene] + for chrom_pos, risk_allele in chrom_pos_risk_alleles: + output.append(f"{cancer_type}\t{chrom_pos}-{risk_allele}\t{germline_gene}\tnoncoding\t{somatic_gene}\tcoding\t{criteria}") + + +# Read VALab_germline_somatic.tsv, filter rows +with open("VALab_germline_somatic.tsv", "r") as input_file: + # Skip header + next(input_file) + for line in input_file: + parts = line.strip().split("\t") + if len(parts) >= 6 and parts[2] == "noncoding" and parts[4] == "noncoding": + cancer_type, germline_gene, _, somatic_gene, _, criteria = parts + if germline_gene in gene_map: + chrom_pos_risk_alleles = gene_map[germline_gene] + for chrom_pos, risk_allele in chrom_pos_risk_alleles: + output.append(f"{cancer_type}\t{chrom_pos}-{risk_allele}\t{germline_gene}\tnoncoding\t{somatic_gene}\tnoncoding\t{criteria}") + +# Read VALab_germline_somatic.tsv, filter rows +with open("VALab_germline_somatic.tsv", "r") 
as input_file: + # Skip header + next(input_file) + for line in input_file: + parts = line.strip().split("\t") + if len(parts) >= 6 and parts[2] == "coding" and parts[4] == "coding": + cancer_type, germline_gene, _, somatic_gene, _, criteria = parts + if germline_gene in gene_map: + chrom_pos_risk_alleles = gene_map[germline_gene] + for chrom_pos, risk_allele in chrom_pos_risk_alleles: + output.append(f"{cancer_type}\t{chrom_pos}-{risk_allele}\t{germline_gene}\tcoding\t{somatic_gene}\tcoding\t{criteria}") + +# Read VALab_germline_somatic.tsv, filter rows +with open("VALab_germline_somatic.tsv", "r") as input_file: + # Skip header + next(input_file) + for line in input_file: + parts = line.strip().split("\t") + if len(parts) >= 6 and parts[2] == "coding" and parts[4] == "noncoding": + cancer_type, germline_gene, _, somatic_gene, _, criteria = parts + if germline_gene in gene_map: + chrom_pos_risk_alleles = gene_map[germline_gene] + for chrom_pos, risk_allele in chrom_pos_risk_alleles: + output.append(f"{cancer_type}\t{chrom_pos}-{risk_allele}\t{germline_gene}\tcoding\t{somatic_gene}\tnoncoding\t{criteria}") + +# Write output to a new file +with open("VALab_germline_somatic.noncoding.tsv", "w") as output_file: + output_file.write("cancer\tchr:pos-risk_allele\tgermline_gene\tgermline_context\tsomatic_gene\tsomatic_context\tcriteria\n") + for line in output: + output_file.write(line + "\n") diff --git a/scripts/gsc_helpers/annotate_table.sh b/scripts/gsc_helpers/annotate_table.sh new file mode 100644 index 0000000..75ed51c --- /dev/null +++ b/scripts/gsc_helpers/annotate_table.sh @@ -0,0 +1,72 @@ +#!/bin/bash +COSMIC_TABLE="/Users/noah/Desktop/DFCI_Data/gsi/data/COSMIC.CGC.10_26_23.tsv" +CELLCHAT_TABLE="/Users/noah/Desktop/DFCI_Data/gsi/data/cellchat_db_formatted.csv" +VALab_germline_somatic_table="/Users/noah/Desktop/DFCI_Data/gsi/data/VALab_germline_somatic.tsv" +output_file="/Users/noah/Desktop/DFCI_Data/gsi/data/VALab_germline_somatic.annotated.tsv" + +# Function to check ONC status from COSMIC.CGC.10_26_23.tsv +get_onc_status() { + local gene="$1" + local oncogene="$(grep -m 1 "^$gene" "$COSMIC_TABLE" | cut -d $'\t' -f 15)" + + if grep -q "oncogene" <<< "$oncogene" && grep -q "TSG" <<< "$oncogene"; then + echo "ONC-TSG" + elif grep -q "oncogene" <<< "$oncogene"; then + echo "ONC" + elif grep -q "TSG" <<< "$oncogene"; then + echo "TSG" + else + echo "UNK" + fi +} + +# Function to check Signaling Scenario from cellchat_db_formatted.csv +get_signaling_scenario() { + local gene="$1" + + local signaling_scenario="" + + if grep -q -F "$gene" <(cut -d ',' -f4 "$CELLCHAT_TABLE");then + signaling_scenario="Ligand" + elif grep -q -F "$gene" <(cut -d ',' -f5 "$CELLCHAT_TABLE");then + signaling_scenario="Receptor" + else + signaling_scenario="UNK" + fi + + echo "$signaling_scenario" +} + +get_mutation_type() { + local gene="$1" + + local mutation_type="$(grep -m 1 "^$gene" "$COSMIC_TABLE" | cut -d $'\t' -f16)" + + if test -z "$mutation_type";then + echo "UNK" + else + echo "$mutation_type" + fi +} + +echo -e "cancer\tgermline_gene\tgermline_context\tgermline_onc_status\tgermline_mutations\tgermline_signaling\tsomatic_gene\tsomatic_context\tsomatic_onc_status\tsomatic_mutations\tsomatic_signaling\tcriteria" > "$output_file" +# Main script +while IFS=$'\t' read -r cancer germline_gene germline_context somatic_gene somatic_context criteria; do + # Get ONC status for germline gene + germline_onc_status=$(get_onc_status "$germline_gene") + + # Get ONC status for somatic gene + 
somatic_onc_status=$(get_onc_status "$somatic_gene")
+
+    # Get Signaling Scenario
+    germline_signaling_scenario=$(get_signaling_scenario "$germline_gene")
+    somatic_signaling_scenario=$(get_signaling_scenario "$somatic_gene")
+
+    #Get Mutation Type
+    germline_mutation_type=$(get_mutation_type "$germline_gene")
+    somatic_mutation_type=$(get_mutation_type "$somatic_gene")
+
+    # Print the updated line
+    echo -e "$cancer\t$germline_gene\t$germline_context\t$germline_onc_status\t$germline_mutation_type\t$germline_signaling_scenario\t$somatic_gene\t$somatic_context\t$somatic_onc_status\t$somatic_mutation_type\t$somatic_signaling_scenario\t$criteria" >> "$output_file"
+done < <(tail -n +2 "$VALab_germline_somatic_table")
+
diff --git a/scripts/gsc_helpers/check_sorted.sh b/scripts/gsc_helpers/check_sorted.sh
new file mode 100644
index 0000000..7c5f1de
--- /dev/null
+++ b/scripts/gsc_helpers/check_sorted.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+
+#Check if a vcf file is sorted
+
+# Define function to check if variants are properly sorted
+check_sorted() {
+    local vcf_file="$1"
+
+    # Extract chromosome and position from VCF file
+    bcftools query -f '%CHROM\t%POS\n' "${vcf_file}" > chrom_pos.txt
+
+    # Check if variants are properly sorted: chromosome in natural (version)
+    # order, then position numerically; a plain 'sort -c' compares lexically
+    # and would misorder e.g. chr10 before chr2
+    if sort -c -k1,1V -k2,2n chrom_pos.txt; then
+        echo "VCF file is properly sorted."
+    else
+        echo "VCF file is not properly sorted."
+    fi
+
+    # Clean up temporary file
+    #rm chrom_pos.txt
+}
+
+# Main script
+if [[ $# -eq 0 ]]; then
+    echo "Usage: $0 <vcf_file>"
+    exit 1
+fi
+
+vcf_file="$1"
+check_sorted "${vcf_file}"
\ No newline at end of file
diff --git a/scripts/gsc_helpers/coding_genes.flipped.coordinates.bed.gz b/scripts/gsc_helpers/coding_genes.flipped.coordinates.bed.gz
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/gsc_helpers/coding_genes.list b/scripts/gsc_helpers/coding_genes.list
new file mode 100644
index 0000000..7f0e8e5
--- /dev/null
+++ b/scripts/gsc_helpers/coding_genes.list
@@ -0,0 +1,139 @@
+ABL2
+ACVR2A
+AKAP9
+AKT1
+APC
+ARHGEF10
+ARID1A
+ARID1B
+ASXL2
+ATM
+AXIN1
+B2M
+BAP1
+BARD1
+BAZ1A
+BMP6
+BMPR1A
+BRAF
+BRCA1
+BRCA2
+CASP8
+CASZ1
+CCNE1
+CD74
+CDH1
+CDKN1B
+CDKN2A
+CHD4
+CHEK2
+CLTC
+CREBBP
+CTNNB1
+CTNND1
+CUL3
+CUX1
+CYLD
+DDX5
+EGFR
+EIF3E
+EP300
+EPS15
+ERBB2
+ERBB3
+ERBB4
+ESR1
+ETV5
+FBLN2
+FBXW7
+FGFR1
+FGFR2
+FOXP1
+GATA3
+GNAS
+GRIN2A
+H3C2
+HIF1A
+HLA-A
+HRAS
+IDH1
+IKZF3
+INHBB
+ITGAV
+JAK2
+KEAP1
+KLF4
+KLF6
+KLK2
+KRAS
+LRP1B
+MAP2K4
+MDM4
+MET
+MLH1
+MSH2
+MSH6
+MYEOV
+MYNN
+NCOR1
+NF1
+NF2
+NOTCH1
+NOTCH2
+NRAS
+NRG1
+NTRK1
+NUP214
+PALB2
+PARP4
+PCBP1
+PER3
+PHLDA3
+PIK3R1
+PLCG1
+POLD1
+PPP1R18
+PPP2R1A
+PRDM1
+PTEN
+RAD17
+RAD21
+RAF1
+RASA1
+RB1
+RBFOX1
+RHOA
+RNF213
+RNF43
+RUNX1T1
+SDC4
+SH2B3
+SMAD2
+SMAD3
+SMAD4
+SMARCA4
+SMARCD1
+SOX9
+SPEN
+SPOP
+TBX3
+TCF7L2
+TFRC
+TGFBR2
+TGIF1
+TMPRSS2
+TNXB
+TP53
+TRRAP
+TSC1
+TSC2
+U2AF1
+UBR5
+VHL
+VTI1A
+XPO1
+ZBTB16
+ZBTB42
+ZFHX3
+ZNRF3
+germline_gene
diff --git a/scripts/gsc_helpers/creating_gene_regions_of_interest_hg19.sh b/scripts/gsc_helpers/creating_gene_regions_of_interest_hg19.sh
new file mode 100644
index 0000000..74775c4
--- /dev/null
+++ b/scripts/gsc_helpers/creating_gene_regions_of_interest_hg19.sh
@@ -0,0 +1,27 @@
+# Example code for creating gene regions of interest and subsetting AoU VCF
+
+
+# The below can all be run on your local machine from the terminal
+
+# Download GENCODE GTF (GRCh37 mapping):
+wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_45/GRCh37_mapping/gencode.v45lift37.annotation.gtf.gz
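+# Quick sanity checks for the merged BED built below (a sketch; file names
+# match the outputs this script generates):
+#
+#   zcat coding_genes.flipped.coordinates.bed.gz | wc -l
+#   zcat coding_genes.flipped.coordinates.bed.gz \
+#     | awk -F'\t' '{ bp += $3 - $2 } END { print bp, "bp total" }'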
+gsc_table="/Users/noah/Desktop/DFCI_Data/gsi/data/VALab_germline_somatic.flipped.tsv"
+#cut -f2,3 "$gsc_table" | grep -v 'noncoding' | cut -f1 | sort | uniq > coding_genes.list
+# strip the header row before collecting gene symbols, as in the hg38 script
+cut -f2 "$gsc_table" | tail -n +2 | sort | uniq > coding_genes.list
+
+# Subset GENCODE to genes of interest
+zcat gencode.v45lift37.annotation.gtf.gz \
+| fgrep -wf coding_genes.list \
+| bgzip -c \
+> gencode.v45lift37.annotation.subsetted.gtf.gz
+
+# Extract coordinates from subsetted GTF, pad by ±2kb to be conservative, and use BEDTools to merge these coordinates into unique regions
+zcat gencode.v45lift37.annotation.subsetted.gtf.gz \
+| awk -v FS="\t" -v OFS="\t" -v buffer=2000 '{ print $1, $4-buffer, $5+buffer }' \
+| sort -Vk1,1 -k2,2n -k3,3n \
+| bedtools merge -i - \
+| bgzip -c \
+> coding_genes.flipped.coordinates.bed.gz
+
+rm coding_genes.list
+
diff --git a/scripts/gsc_helpers/creating_gene_regions_of_interest_hg38.sh b/scripts/gsc_helpers/creating_gene_regions_of_interest_hg38.sh
new file mode 100644
index 0000000..2d03e9c
--- /dev/null
+++ b/scripts/gsc_helpers/creating_gene_regions_of_interest_hg38.sh
@@ -0,0 +1,40 @@
+# Example code for creating gene regions of interest and subsetting AoU VCF
+
+# Given the following input files:
+# 1. list (flat .txt file) of gene symbols of relevance: riaz_genes.list
+# 2. list (flat .txt file) of sample IDs to retain: keep_samples.list
+# 3. VCF of all variants from AoU: aou.vcf.gz (must also have .tbi index present locally)
+
+# The below can all be run on your local machine from the terminal
+
+# Download MANE GTF:
+#wget https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.3/MANE.GRCh38.v1.3.ensembl_genomic.gtf.gz
+
+input_table="VALab_germline_somatic.tsv"
+prefix="gsc"
+cut -f2 $input_table | tail -n +2 | sort | uniq > "${prefix}_genes.list"
+
+# Subset MANE to genes of interest
+zcat MANE.GRCh38.v1.3.ensembl_genomic.gtf.gz \
+| fgrep -wf "${prefix}_genes.list" \
+> MANE.GRCh38.v1.3.ensembl_genomic.subsetted.gtf
+
+# Extract coordinates from subsetted GTF, pad by ±2kb to be conservative, and use BEDTools to merge these coordinates into unique regions
+cat MANE.GRCh38.v1.3.ensembl_genomic.subsetted.gtf \
+| awk -v FS="\t" -v OFS="\t" -v buffer=2000 '{ print $1, $4-buffer, $5+buffer }' \
+| sort -Vk1,1 -k2,2n -k3,3n \
+| bedtools merge -i - \
+> "${prefix}_genes.coordinates.bed"
+
+
+# Subset variants to minimal set of interest for our purposes
+# Note that this must be run in the AoU workspace terminal, not on your local machine
+# bcftools view \
+#   -O z -o aou.subsetted.vcf.gz \
+#   --regions-file riaz_genes.coordinates.bed.gz \
+#   --samples-file keep_samples.list \
+#   --min-ac 1 \
+#   aou.vcf.gz
+
+# Index output VCF for future queries
+# tabix -p vcf -f aou.subsetted.vcf.gz
\ No newline at end of file
diff --git a/scripts/gsc_helpers/get_felix_somatic_snps.sh b/scripts/gsc_helpers/get_felix_somatic_snps.sh
new file mode 100644
index 0000000..94fccd1
--- /dev/null
+++ b/scripts/gsc_helpers/get_felix_somatic_snps.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+# Download the zip file
+#wget https://www.science.org/doi/suppl/10.1126/science.abg5601/suppl_file/science.abg5601_tables_s2_to_s23.zip
+
+# Unzip the file
+#unzip science.abg5601_tables_s2_to_s23.zip
+#rm combined_data.tsv combined_data.tmp
+
+#get somatic noncoding portions
+gsc_table="/Users/noah/Desktop/DFCI_Data/gsi/data/VALab_germline_somatic.tsv"
+cut -f1,4,5 "$gsc_table" | grep 'noncoding' | cut -f2 > somatic_noncoding.list
+
+# Define the Python script to extract sheet names and process each sheet
+read -r -d 
'' PYTHON_SCRIPT <<'EOF' +import pandas as pd + +# Load the Excel file +excel_file = pd.ExcelFile('science.abg5601_tables_s2_to_s23/science.abg5601_tables_s2_to_s23.xlsx') + +# Loop through each sheet in the Excel file +for sheet_name in excel_file.sheet_names: + cancer_type = sheet_name.split()[-1] + cancer_type_set = set(['Breast','Colorectal','Prostate','Lung','Kidney']) + if cancer_type not in cancer_type_set: + continue + # Read the data from the sheet into a DataFrame + df = pd.read_excel(excel_file, sheet_name=sheet_name) + # Export the data to a TSV file + df.to_csv(f'{sheet_name.split()[-1]}.tsv', sep='\t', index=False) + #print(sheet_name.split()[-1]) +EOF + +# Execute the Python script +python3 -c "$PYTHON_SCRIPT" + +# Combine all TSV files into one +cat *.tsv | grep -E 'Breast|Colorectal|Prostate|Kidney|Lung' \ +| grep -v 'Table' \ +| awk -F'\t' '$3 != "coding"' \ +| cut -f1,2,7 | awk -F'\t' -v OFS='\t' '{ split($3, chr, "-"); print chr[1],chr[2], $1, $2 }' \ +| awk -F'\t' -v OFS='\t' '{ split($1, a, ":"); print a[1],a[2], $2, $3, $4 }' \ +| grep -Fwf somatic_noncoding.list | sort -n > combined_data.tmp + +#rm somatic_noncoding.list + +# Remove individual TSV files +rm Breast.tsv Prostate.tsv Lung.tsv Kidney.tsv Colorectal.tsv + +mv combined_data.tmp gsc_noncoding_snps_hg38.bed + diff --git a/scripts/gsc_helpers/noncoding_coding/combine_snps.sh b/scripts/gsc_helpers/noncoding_coding/combine_snps.sh new file mode 100644 index 0000000..9f159b9 --- /dev/null +++ b/scripts/gsc_helpers/noncoding_coding/combine_snps.sh @@ -0,0 +1,37 @@ +ref_table="../data/VALab_germline_somatic.tsv" + + +#Breast Section +grep 'breast' "$ref_table" | cut -f2,3 | grep 'noncoding' | cut -f1 > breast_noncoding_germline.tmp.list +cut -f3,10,15,18 breast_gwas_catalog_sig_filtered.annotated.tsv | tail -n +2 | grep -v 'coding_exon' | grep -Fwf breast_noncoding_germline.tmp.list | awk '{print $0 "\tbreast"}' > breast_noncoding_snps.tmp.tsv + +#Colorectal Section +grep 'colorectal' "$ref_table" | cut -f2,3 | grep 'noncoding' | cut -f1 > colorectal_noncoding_germline.tmp.list +cut -f2,9,14,17 colorectal_gwas_catalog_sig_filtered.annotated.tsv | tail -n +2 | grep -v 'coding_exon' | grep -Fwf colorectal_noncoding_germline.tmp.list | awk '{print $0 "\tcolorectal"}' > colorectal_noncoding_snps.tmp.tsv + +#Lung Section +grep 'lung' "$ref_table" | cut -f2,3 | grep 'noncoding' | cut -f1 > lung_noncoding_germline.tmp.list +cut -f2,9,14,17 lung_gwas_catalog_sig_filtered.annotated.tsv | tail -n +2 | grep -v 'coding_exon' | grep -Fwf lung_noncoding_germline.tmp.list | awk '{print $0 "\tlung"}' > lung_noncoding_snps.tmp.tsv + +#Kidney Section +grep 'renal' "$ref_table" | cut -f2,3 | grep 'noncoding' | cut -f1 > kidney_noncoding_germline.tmp.list +cut -f2,9,14,17 renal_gwas_catalog_sig_filtered.annotated.tsv | tail -n +2 | grep -v 'coding_exon' | grep -Fwf kidney_noncoding_germline.tmp.list | awk '{print $0 "\trenal"}' > kidney_noncoding_snps.tmp.tsv + +#Prostate Section +grep 'prostate' "$ref_table" | cut -f2,3 | grep 'noncoding' | cut -f1 > prostate_noncoding_germline.tmp.list +cut -f2,9,14,17 prostate_gwas_catalog_sig_filtered.annotated.tsv | tail -n +2 | grep -v 'coding_exon' | grep -Fwf prostate_noncoding_germline.tmp.list | awk '{print $0 "\tprostate"}' > prostate_noncoding_snps.tmp.tsv + +#New Prostate Section +cut -f5,11,26,27 new_pca_only_gwas_catalog_sig_filtered.annotated.tsv | tail -n +2 | grep -v 'coding_exon' | grep -Fwf prostate_noncoding_germline.tmp.list | awk '{print $2 "\t" $1 "\t" $3 "\t" $4 
"\tprostate"}' > new_prostate_noncoding_snps.tmp.tsv + +#combine all files +cat breast_noncoding_snps.tmp.tsv colorectal_noncoding_snps.tmp.tsv lung_noncoding_snps.tmp.tsv kidney_noncoding_snps.tmp.tsv prostate_noncoding_snps.tmp.tsv new_prostate_noncoding_snps.tmp.tsv | grep -v ',' | awk 'BEGIN {FS="\t"; OFS="\t"} {split($1, a, "-"); print $3 "\t" $2 "\t" a[2] "\t" $5}' | awk 'BEGIN {FS="\t"; OFS="\t"} {split($1, a, ":"); print "chr"a[1] "\t" a[2]-1 "\t" a[2] "\t" $2 "\t" $3 "\t" $4 "\t" $5}' | sort | uniq > pancan_gwas_catalog_sig_filtered.tmp.tsv + +grep 'HLA-' pancan_gwas_catalog_sig_filtered.tmp.tsv > HLA.tmp.tsv +grep -v '-' pancan_gwas_catalog_sig_filtered.tmp.tsv > pancan_gwas_catalog_sig_filtered.tmp2.tsv +cat HLA.tmp.tsv >> pancan_gwas_catalog_sig_filtered.tmp2.tsv + +sort -V pancan_gwas_catalog_sig_filtered.tmp2.tsv | uniq > pancan_gwas_catalog_sig_filtered.tsv + +#remove all tmp files +rm *tmp* diff --git a/scripts/gsc_helpers/noncoding_coding/create_tables.sh b/scripts/gsc_helpers/noncoding_coding/create_tables.sh new file mode 100644 index 0000000..30e64b0 --- /dev/null +++ b/scripts/gsc_helpers/noncoding_coding/create_tables.sh @@ -0,0 +1,74 @@ +# This file is meant to organize for logistic regression for noncoding-coding convergences +germline_somatic_file=$1 +non_coding_chr=$2 +non_coding_pos=$3 +somatic_gene=$4 +final_output_file="${non_coding_chr}.${non_coding_pos}.${somatic_gene}.tsv" +# Here is where we compile lists +grep "$somatic_gene" "$germline_somatic_file" | grep "$non_coding_chr $non_coding_pos" | grep 'HET' | cut -f1 | sort | uniq> apc_patients_het.list + +grep "$somatic_gene" "$germline_somatic_file" | grep "$non_coding_chr $non_coding_pos" | grep 'HOM' | cut -f1 | sort | uniq> apc_patients_risk_hom.list +cut -f1 "$germline_somatic_file" | sort | uniq > patients.list + +grep "$somatic_gene" "$germline_somatic_file" | cut -f1 | sort | uniq > apc_patients.list +grep -vFwf apc_patients_het.list "$germline_somatic_file" | grep -vFwf apc_patients_risk_hom.list | cut -f1 | sort | uniq > apc_patients_safe_hom.list +# Define input files +patients_file="patients.list" +apc_patients_file="apc_patients_risk_hom.list" +output_file="risk_hom_status.list" +tmp_file=$(mktemp) +# Loop through patients.list and check if each patient ID is in apc_patients_file + +# Create an array of patient IDs from apc_patients_file +while read -r patient_id; do + if grep -w "^$patient_id$" "$apc_patients_file" > "$tmp_file"; then + echo -e "1" >> "$output_file" # Patient is in apc_patients_file, write "1" + else + echo -e "0" >> "$output_file" # Patient is not in apc_patients_file, write "0" + fi +done < "$patients_file" + +# Define input files +apc_patients_file="apc_patients_het.list" +output_file="het_status.list" + +# Create an array of patient IDs from apc_patients_file +while read -r patient_id; do + if grep -w "^$patient_id$" "$apc_patients_file" > "$tmp_file"; then + echo -e "1" >> "$output_file" # Patient is in apc_patients_file, write "1" + else + echo -e "0" >> "$output_file" # Patient is not in apc_patients_file, write "0" + fi +done < "$patients_file" + + +# Define input files +apc_patients_file="apc_patients_safe_hom.list" +output_file="safe_hom_status.list" + +# Create an array of patient IDs from apc_patients_file +while read -r patient_id; do + if grep -w "^$patient_id$" "$apc_patients_file" > "$tmp_file"; then + echo -e "1" >> "$output_file" # Patient is in apc_patients_file, write "1" + else + echo -e "0" >> "$output_file" # Patient is not in apc_patients_file, write "0" 
+    fi
+done < "$patients_file"
+
+# Define input files
+apc_patients_file="apc_patients.list"
+output_file="driver.list"
+
+# Create an array of patient IDs from apc_patients_file
+while read -r patient_id; do
+    if grep -w "^$patient_id$" "$apc_patients_file" > "$tmp_file"; then
+        echo -e "1" >> "$output_file" # Patient is in apc_patients_file, write "1"
+    else
+        echo -e "0" >> "$output_file" # Patient is not in apc_patients_file, write "0"
+    fi
+done < "$patients_file"
+
+# Header is tab-separated to match the tab-delimited columns paste emits below
+echo -e "patients\trisk_hom\thet\tsafe_hom\tdriver_status" > "$final_output_file"
+paste patients.list risk_hom_status.list het_status.list safe_hom_status.list driver.list >> "$final_output_file"
+rm patients.list risk_hom_status.list het_status.list safe_hom_status.list driver.list
+rm apc_patients.list apc_patients_safe_hom.list apc_patients_het.list apc_patients_risk_hom.list
+rm "$tmp_file"
diff --git a/scripts/gsc_helpers/noncoding_coding/logistic_regression.py b/scripts/gsc_helpers/noncoding_coding/logistic_regression.py
new file mode 100644
index 0000000..e3c3e29
--- /dev/null
+++ b/scripts/gsc_helpers/noncoding_coding/logistic_regression.py
@@ -0,0 +1,44 @@
+import sys
+import numpy as np
+import pandas as pd
+from sklearn.linear_model import LogisticRegression
+
+def main(filename):
+    # Read the TSV file into a pandas DataFrame
+    df = pd.read_csv(filename, sep='\t')
+
+    # Define the features (X) and target (y)
+    X = df[['risk_hom', 'het', 'safe_hom']]
+    y = df['driver_status']
+
+    # Create the logistic regression model
+    model = LogisticRegression()
+
+    # Fit the model to the data
+    model.fit(X, y)
+
+    # Get the coefficients of the model
+    coefficients = model.coef_[0]
+
+    # Calculate the odds ratios (OR)
+    odds_ratios = np.exp(coefficients)
+
+    # Create a DataFrame to display results
+    results = pd.DataFrame({
+        'Feature': X.columns,
+        'Coefficient': coefficients,
+        'Odds Ratio': odds_ratios
+    })
+
+    # Report the fitted coefficients and odds ratios
+    print(results.to_string(index=False))
+
+if __name__ == "__main__":
+    # Check if the filename argument is provided
+    if len(sys.argv) < 2:
+        print("Usage: python logistic_regression.py <filename>")
+        sys.exit(1)
+
+    # Get the filename from command line argument
+    filename = sys.argv[1]
+
+    # Call the main function with the filename argument
+    main(filename)
diff --git a/scripts/gsc_helpers/vcf_to_tsv.py b/scripts/gsc_helpers/vcf_to_tsv.py
new file mode 100644
index 0000000..5a8b0f4
--- /dev/null
+++ b/scripts/gsc_helpers/vcf_to_tsv.py
@@ -0,0 +1,91 @@
+import pandas as pd
+import numpy as np
+
+# Function to parse VCF and create DataFrame
+def vcf_to_tsv(vcf_file, risk_snp_dict):
+    # Initialize lists and dictionaries
+    data = {}
+    sample_ids = []
+    variants = []
+
+    # Open VCF file and read line by line
+    with open(vcf_file, 'r') as f:
+        for line in f:
+            line = line.strip()
+
+            # Skip header lines starting with ##
+            if line.startswith('##'):
+                continue
+
+            # Process header line starting with #CHROM
+            elif line.startswith('#CHROM'):
+                fields = line.split('\t')
+                sample_ids = fields[9:]  # Extract sample IDs from header
+                continue
+
+            # Process variant data lines
+            fields = line.split('\t')
+            chrom = fields[0]
+            pos = fields[1]
+            ref = fields[3]
+            alt = fields[4]
+            allele = risk_snp_dict[f"{chrom}:{pos}"]
+            key = f"{chrom}:{pos}-{allele}"
+            variants.append(key)
+
+            # Process genotype information for each sample
+            for i, sample_id in enumerate(sample_ids):
+                genotype_info = fields[i + 9]  # Start index for sample genotype info
+                alleles = genotype_info.split(':')[0].split('/')
+                # Check for missing calls before converting allele codes to
+                # ints, so './.', './1', and '1/.' all become NaN
+                if '.' in alleles:
+                    value = np.nan
+                elif allele == ref:
+                    value = 2 - (int(alleles[0]) + int(alleles[1]))
+                elif allele == alt:
+                    value = int(alleles[0]) + int(alleles[1])
+                else:
+                    value = 0
+
+                if sample_id not in data:
+                    data[sample_id] = {}
+
+                data[sample_id][key] = value
+
+    # Create DataFrame
+    df = pd.DataFrame.from_dict(data, orient='index')
+    df.index.name = 'sample_id'
+    df.columns = variants
+
+    return df
+
+def tsv_to_dict(tsv_file):
+    data_dict = {}
+    with open(tsv_file, 'r') as f:
+        for line in f:
+            if line.startswith('#'):  # Skip header lines if any
+                continue
+            fields = line.strip().split('\t')
+            key = f"{fields[0]}:{fields[2]}"
+            value = fields[3]
+            data_dict[key] = value
+    return data_dict
+
+# Example usage
+vcf_file = 'PROFILE.noncoding.vcf'
+bed_file = 'noncoding_vars.hg19.bed'
+output_tsv = 'output.tsv'
+
+risk_snp_dict = tsv_to_dict(bed_file)
+df = vcf_to_tsv(vcf_file, risk_snp_dict)
+df.to_csv(output_tsv, sep='\t', na_rep='NaN')
diff --git a/scripts/ufc_helpers/.DS_Store b/scripts/ufc_helpers/.DS_Store
new file mode 100644
index 0000000..5008ddf
Binary files /dev/null and b/scripts/ufc_helpers/.DS_Store differ
diff --git a/scripts/ufc_helpers/.Rhistory b/scripts/ufc_helpers/.Rhistory
new file mode 100644
index 0000000..95edd18
--- /dev/null
+++ b/scripts/ufc_helpers/.Rhistory
@@ -0,0 +1,512 @@
+proband = read.delim("/Users/noah/Desktop/proband.counts.tsv")
+proband = read.delim("/Users/noah/Desktop/0450-B-1.counts.tsv")
+df <- read.table("/Users/noah/Desktop/0450-B-1.counts.tsv", header = FALSE, sep = "\t", col.names = c("Chromosome", "V2", "V3", "V4", "V5", "V6"), fill = TRUE)
+df2 <- read.table("/Users/noah/Desktop/proband.counts.tsv", header = FALSE, sep = "\t", col.names = c("Chromosome", "V2", "V3", "V4", "V5", "V6"), fill = TRUE)
+df2 <- read.table("/Users/noah/Desktop/proband.counts.tsv", header = FALSE, sep = "\t", col.names = c("Chromosome", "V2", "V3", "V4", "V5", "V6","V7"), fill = TRUE)
+View(df)
+View(df2)
+df <- read.table("/Users/noah/Desktop/0450-B-1.counts.tsv", header = FALSE, sep = "\t", col.names = c("Chromosome", "V2", "V3", "V4", "V5", "V6","V7"), fill = TRUE)
+View(df)
+View(df)
+pwd
+estimate_chromosomes()
+library(dplyr)
+estimate_chromosomes_processing <- function(file_path) {
+# Create an R data frame with chromosome numbers and lengths
+chromosome_data <- data.frame(
+Chromosome = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10",
+"chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20",
+"chr21", "chr22", "chrX", "chrY"),
+Length = c(248956422, 242193529, 198295559, 190214555, 181538259, 170805979, 159345973, 145138636,
+138394717, 133797422, 135086622, 133275309, 114364328, 107043718, 101991189, 90338345,
+81282511, 78077248, 59128983, 63025520, 48129895, 50735781, 156040895, 57227415)
+)
+# Read the TSV file into a data frame
+df <- read.table(file_path, header = FALSE, sep = "\t", col.names = c("Chromosome", "V2", "V3", "V4", "V5", "V6","V7"), fill = TRUE)
+# Keep only the first four columns
+df <- df[, 1:4]
+# Find the row index where the specified header pattern is located
+header_row <- grep("CONTIG", df$Chromosome)
+# Omit rows before the header row (if header is found)
+df2 <- df[(header_row +
1):nrow(df), ] +# Group by V1 and calculate the sum of V4 for each group +result_df <- df2 %>% +group_by(Chromosome) %>% +summarize(Sum_V4 = sum(as.numeric(V4))) +# Merge the two data frames based on the "Chromosome" column +merged_df <- merge(result_df, chromosome_data, by = "Chromosome") +# Calculate normalized read coverage +merged_df$norm_read_cov <- merged_df$Sum_V4 / merged_df$Length +# Return the merged data frame with normalized read coverage +return(merged_df) +} +estimate_chromosomes <- function(df) { +# Extract 'norm_read_cov' values for autosomes and sex chromosomes +autosomal_cov <- df$norm_read_cov[!df$Chromosome %in% c("chrX", "chrY")] +chrX_cov <- df$norm_read_cov[df$Chromosome == "chrX"] +chrY_cov <- df$norm_read_cov[df$Chromosome == "chrY"] +# Calculate mean of autosomal counts +autosomal_mean <- mean(autosomal_cov) +# Define values to compare proximity +zero_value <- 0 +one_copy_value <- autosomal_mean / 2 +two_copies_value <- autosomal_mean +three_copies_value <- 1.5 * autosomal_mean +# Find the closest value for chrX_cov +closest_chrX <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrX_cov)) +# Find the closest value for chrY_cov +closest_chrY <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrY_cov)) +# Return the estimates in the specified format +return(paste(closest_chrX - 1, closest_chrY - 1, sep = "-")) +} +# Extract the file path from the command-line arguments +file_path <- commandArgs(trailingOnly = TRUE)[1] +# Example usage +result_df <- estimate_chromosomes_processing(file_path) +estimate_chromosomes_processing("/Users/noah/Desktop/proband.counts.tsv") +result_df = estimate_chromosomes_processing("/Users/noah/Desktop/proband.counts.tsv") +result = estimate_chromosomes(result_df) +result_df = estimate_chromosomes_processing("/Users/noah/Desktop/0450-B-1.counts.tsv") +result = estimate_chromosomes(result_df) +df <- read.table("/Users/noah/Desktop/proband.short.counts.tsv", header = FALSE, sep = "\t", col.names = c("Chromosome", "V2", "V3", "V4", "V5", "V6","V7"), fill = TRUE) +View(df) +df <- df[, 1:4] +header_row <- grep("CONTIG", df$Chromosome) +df2 <- df[(header_row + 1):nrow(df), ] +result_df <- df2 %>% +group_by(Chromosome) %>% +summarize(Sum_V4 = sum(as.numeric(V4))) +library(dplyr) +result_df <- df2 %>% +group_by(Chromosome) %>% +summarize(Sum_V4 = sum(as.numeric(V4))) +merged_df <- merge(result_df, chromosome_data, by = "Chromosome") +chromosome_data <- data.frame( +Chromosome = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", +"chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", +"chr21", "chr22", "chrX", "chrY"), +Length = c(248956422, 242193529, 198295559, 190214555, 181538259, 170805979, 159345973, 145138636, +138394717, 133797422, 135086622, 133275309, 114364328, 107043718, 101991189, 90338345, +81282511, 78077248, 59128983, 63025520, 48129895, 50735781, 156040895, 57227415) +) +merged_df <- merge(result_df, chromosome_data, by = "Chromosome") +merged_df$norm_read_cov <- merged_df$Sum_V4 / merged_df$Length +View(merged_df) +View(df) +rm(list=ls()) +cat("hello") +library(dplyr, quietly = TRUE) +estimate_chromosomes_processing <- function(file_path) { +# Create an R data frame with chromosome numbers and lengths +chromosome_data <- data.frame( +Chromosome = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", +"chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", 
"chr18", "chr19", "chr20", +"chr21", "chr22", "chrX", "chrY"), +Length = c(248956422, 242193529, 198295559, 190214555, 181538259, 170805979, 159345973, 145138636, +138394717, 133797422, 135086622, 133275309, 114364328, 107043718, 101991189, 90338345, +81282511, 78077248, 59128983, 63025520, 48129895, 50735781, 156040895, 57227415) +) +# Read the TSV file into a data frame +df <- read.table(file_path, header = FALSE, sep = "\t", col.names = c("Chromosome", "V2", "V3", "V4","V5","V6","V7"), fill = TRUE) +# Keep only the first four columns +df <- df[, 1:4] +# Find the row index where the specified header pattern is located +header_row <- grep("CONTIG", df$Chromosome) +# Omit rows before the header row (if header is found) +df2 <- df[(header_row + 1):nrow(df), ] +# Group by V1 and calculate the sum of V4 for each group +result_df <- df2 %>% +group_by(Chromosome) %>% +summarize(Sum_V4 = sum(as.numeric(V4))) +# Merge the two data frames based on the "Chromosome" column +merged_df <- merge(result_df, chromosome_data, by = "Chromosome") +# Calculate normalized read coverage +merged_df$norm_read_cov <- merged_df$Sum_V4 / merged_df$Length +# Return the merged data frame with normalized read coverage +return(merged_df) +} +estimate_chromosomes <- function(df) { +# Extract 'norm_read_cov' values for autosomes and sex chromosomes +autosomal_cov <- df$norm_read_cov[!df$Chromosome %in% c("chrX", "chrY")] +chrX_cov <- df$norm_read_cov[df$Chromosome == "chrX"] +chrY_cov <- df$norm_read_cov[df$Chromosome == "chrY"] +cat("hi") +# Calculate mean of autosomal counts +autosomal_mean <- mean(autosomal_cov) +# Define values to compare proximity +zero_value <- 0 +one_copy_value <- autosomal_mean / 2 +two_copies_value <- autosomal_mean +three_copies_value <- 1.5 * autosomal_mean +# Find the closest value for chrX_cov +closest_chrX <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrX_cov)) +# Find the closest value for chrY_cov +closest_chrY <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrY_cov)) +# Return the estimates in the specified format +return(paste(closest_chrX - 1, closest_chrY - 1, sep = "-")) +} +# Extract the file path from the command-line arguments +#file_path <- commandArgs(trailingOnly = TRUE)[1] +file_path = "/Users/noah/Desktop/proband.short.counts.tsv" +# Example usage +result_df <- estimate_chromosomes_processing(file_path) +result <- estimate_chromosomes(result_df) +cat(result) +cat("Hi") +library(dplyr, quietly = TRUE) +estimate_chromosomes_processing <- function(file_path) { +# Create an R data frame with chromosome numbers and lengths +chromosome_data <- data.frame( +Chromosome = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", +"chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", +"chr21", "chr22", "chrX", "chrY"), +Length = c(248956422, 242193529, 198295559, 190214555, 181538259, 170805979, 159345973, 145138636, +138394717, 133797422, 135086622, 133275309, 114364328, 107043718, 101991189, 90338345, +81282511, 78077248, 59128983, 63025520, 48129895, 50735781, 156040895, 57227415) +) +# Read the TSV file into a data frame +df <- read.table(file_path, header = FALSE, sep = "\t", col.names = c("Chromosome", "V2", "V3", "V4","V5","V6","V7"), fill = TRUE) +# Keep only the first four columns +df <- df[, 1:4] +# Find the row index where the specified header pattern is located +header_row <- grep("CONTIG", df$Chromosome) +# Omit rows before the 
header row (if header is found) +df2 <- df[(header_row + 1):nrow(df), ] +# Group by V1 and calculate the sum of V4 for each group +result_df <- df2 %>% +group_by(Chromosome) %>% +summarize(Sum_V4 = sum(as.numeric(V4))) +# Merge the two data frames based on the "Chromosome" column +merged_df <- merge(result_df, chromosome_data, by = "Chromosome") +# Calculate normalized read coverage +merged_df$norm_read_cov <- merged_df$Sum_V4 / merged_df$Length +# Return the merged data frame with normalized read coverage +return(merged_df) +} +estimate_chromosomes <- function(df) { +# Extract 'norm_read_cov' values for autosomes and sex chromosomes +autosomal_cov <- df$norm_read_cov[!df$Chromosome %in% c("chrX", "chrY")] +chrX_cov <- df$norm_read_cov[df$Chromosome == "chrX"] +chrY_cov <- df$norm_read_cov[df$Chromosome == "chrY"] +# Calculate mean of autosomal counts +autosomal_mean <- mean(autosomal_cov) +# Define values to compare proximity +zero_value <- 0 +one_copy_value <- autosomal_mean / 2 +two_copies_value <- autosomal_mean +three_copies_value <- 1.5 * autosomal_mean +# Find the closest value for chrX_cov +closest_chrX <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrX_cov)) +# Find the closest value for chrY_cov +closest_chrY <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrY_cov)) +# Return the estimates in the specified format +return(paste(closest_chrX - 1, closest_chrY - 1, sep = "-")) +} +# Extract the file path from the command-line arguments +#file_path <- commandArgs(trailingOnly = TRUE)[1] +file_path = "/Users/noah/Desktop/0450-B-1.short.counts.tsv" +# Example usage +result_df <- estimate_chromosomes_processing(file_path) +result <- estimate_chromosomes(result_df) +cat(result) +cat("Hi") +library(dplyr, quietly = TRUE) +estimate_chromosomes_processing <- function(file_path) { +# Create an R data frame with chromosome numbers and lengths +chromosome_data <- data.frame( +Chromosome = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", +"chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", +"chr21", "chr22", "chrX", "chrY"), +Length = c(248956422, 242193529, 198295559, 190214555, 181538259, 170805979, 159345973, 145138636, +138394717, 133797422, 135086622, 133275309, 114364328, 107043718, 101991189, 90338345, +81282511, 78077248, 59128983, 63025520, 48129895, 50735781, 156040895, 57227415) +) +# Read the TSV file into a data frame +df <- read.table(file_path, header = FALSE, sep = "\t", col.names = c("Chromosome", "V2", "V3", "V4","V5","V6","V7"), fill = TRUE) +# Keep only the first four columns +df <- df[, 1:4] +# Find the row index where the specified header pattern is located +header_row <- grep("CONTIG", df$Chromosome) +# Omit rows before the header row (if header is found) +df2 <- df[(header_row + 1):nrow(df), ] +# Group by V1 and calculate the sum of V4 for each group +result_df <- df2 %>% +group_by(Chromosome) %>% +summarize(Sum_V4 = sum(as.numeric(V4))) +# Merge the two data frames based on the "Chromosome" column +merged_df <- merge(result_df, chromosome_data, by = "Chromosome") +# Calculate normalized read coverage +merged_df$norm_read_cov <- merged_df$Sum_V4 / merged_df$Length +# Return the merged data frame with normalized read coverage +return(merged_df) +} +estimate_chromosomes <- function(df) { +# Extract 'norm_read_cov' values for autosomes and sex chromosomes +autosomal_cov <- df$norm_read_cov[!df$Chromosome %in% c("chrX", 
"chrY")] +chrX_cov <- df$norm_read_cov[df$Chromosome == "chrX"] +chrY_cov <- df$norm_read_cov[df$Chromosome == "chrY"] +# Calculate mean of autosomal counts +autosomal_mean <- mean(autosomal_cov) +cat(chrX_cov,chrY_cov) +# Define values to compare proximity +zero_value <- 0 +one_copy_value <- autosomal_mean / 2 +two_copies_value <- autosomal_mean +three_copies_value <- 1.5 * autosomal_mean +# Find the closest value for chrX_cov +closest_chrX <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrX_cov)) +# Find the closest value for chrY_cov +closest_chrY <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrY_cov)) +# Return the estimates in the specified format +return(paste(closest_chrX - 1, closest_chrY - 1, sep = "-")) +} +# Extract the file path from the command-line arguments +#file_path <- commandArgs(trailingOnly = TRUE)[1] +file_path = "/Users/noah/Desktop/0450-B-1.short.counts.tsv" +# Example usage +result_df <- estimate_chromosomes_processing(file_path) +result <- estimate_chromosomes(result_df) +cat(result) +# Extract the file path from the command-line arguments +#file_path <- commandArgs(trailingOnly = TRUE)[1] +file_path = "/Users/noah/Desktop/0450-B-1.short.counts.tsv" +# Example usage +result_df <- estimate_chromosomes_processing(file_path) +View(result_df) +rm(list=ls()) +# Extract the file path from the command-line arguments +#file_path <- commandArgs(trailingOnly = TRUE)[1] +file_path = "/Users/noah/Desktop/0450-B-1.short.counts.tsv" +# Create an R data frame with chromosome numbers and lengths +chromosome_data <- data.frame( +Chromosome = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", +"chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", +"chr21", "chr22", "chrX", "chrY"), +Length = c(248956422, 242193529, 198295559, 190214555, 181538259, 170805979, 159345973, 145138636, +138394717, 133797422, 135086622, 133275309, 114364328, 107043718, 101991189, 90338345, +81282511, 78077248, 59128983, 63025520, 48129895, 50735781, 156040895, 57227415) +) +# Read the TSV file into a data frame +df <- read.table(file_path, header = FALSE, sep = "\t", col.names = c("Chromosome", "V2", "V3", "V4","V5","V6","V7"), fill = TRUE) +# Keep only the first four columns +df <- df[, 1:4] +# Find the row index where the specified header pattern is located +header_row <- grep("CONTIG", df$Chromosome) +# Omit rows before the header row (if header is found) +df2 <- df[(header_row + 1):nrow(df), ] +# Group by V1 and calculate the sum of V4 for each group +result_df <- df2 %>% +group_by(Chromosome) %>% +summarize(Sum_V4 = sum(as.numeric(V4))) +# Merge the two data frames based on the "Chromosome" column +merged_df <- merge(result_df, chromosome_data, by = "Chromosome") +# Calculate normalized read coverage +merged_df$norm_read_cov <- merged_df$Sum_V4 / merged_df$Length +View(merged_df) +rm(list=ls()) +# Extract the file path from the command-line arguments +#file_path <- commandArgs(trailingOnly = TRUE)[1] +file_path = "/Users/noah/Desktop/0450-B-1.counts.tsv" +# Create an R data frame with chromosome numbers and lengths +chromosome_data <- data.frame( +Chromosome = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", +"chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", +"chr21", "chr22", "chrX", "chrY"), +Length = c(248956422, 242193529, 198295559, 190214555, 181538259, 170805979, 159345973, 
145138636, +138394717, 133797422, 135086622, 133275309, 114364328, 107043718, 101991189, 90338345, +81282511, 78077248, 59128983, 63025520, 48129895, 50735781, 156040895, 57227415) +) +# Read the TSV file into a data frame +df <- read.table(file_path, header = FALSE, sep = "\t", col.names = c("Chromosome", "V2", "V3", "V4","V5","V6","V7"), fill = TRUE) +# Keep only the first four columns +df <- df[, 1:4] +# Find the row index where the specified header pattern is located +header_row <- grep("CONTIG", df$Chromosome) +# Omit rows before the header row (if header is found) +df2 <- df[(header_row + 1):nrow(df), ] +# Group by V1 and calculate the sum of V4 for each group +result_df <- df2 %>% +group_by(Chromosome) %>% +summarize(Sum_V4 = sum(as.numeric(V4))) +View(result_df) +# Merge the two data frames based on the "Chromosome" column +merged_df <- merge(result_df, chromosome_data, by = "Chromosome") +# Calculate normalized read coverage +merged_df$norm_read_cov <- merged_df$Sum_V4 / merged_df$Length +View(merged_df) +df = merged_df +# Extract 'norm_read_cov' values for autosomes and sex chromosomes +autosomal_cov <- df$norm_read_cov[!df$Chromosome %in% c("chrX", "chrY")] +chrX_cov <- df$norm_read_cov[df$Chromosome == "chrX"] +chrY_cov <- df$norm_read_cov[df$Chromosome == "chrY"] +# Calculate mean of autosomal counts +autosomal_mean <- mean(autosomal_cov) +cat(chrX_cov,chrY_cov) +# Define values to compare proximity +zero_value <- 0 +one_copy_value <- autosomal_mean / 2 +two_copies_value <- autosomal_mean +three_copies_value <- 1.5 * autosomal_mean +# Find the closest value for chrX_cov +closest_chrX <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrX_cov)) +# Find the closest value for chrY_cov +closest_chrY <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrY_cov)) +# Return the estimates in the specified format +return(paste(closest_chrX - 1, closest_chrY - 1, sep = "-")) +rm(list = ls()) +# Extract the file path from the command-line arguments +#file_path <- commandArgs(trailingOnly = TRUE)[1] +file_path = "/Users/noah/Desktop/proband.counts.tsv" +# Create an R data frame with chromosome numbers and lengths +chromosome_data <- data.frame( +Chromosome = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", +"chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", +"chr21", "chr22", "chrX", "chrY"), +Length = c(248956422, 242193529, 198295559, 190214555, 181538259, 170805979, 159345973, 145138636, +138394717, 133797422, 135086622, 133275309, 114364328, 107043718, 101991189, 90338345, +81282511, 78077248, 59128983, 63025520, 48129895, 50735781, 156040895, 57227415) +) +# Read the TSV file into a data frame +df <- read.table(file_path, header = FALSE, sep = "\t", col.names = c("Chromosome", "V2", "V3", "V4","V5","V6","V7"), fill = TRUE) +# Keep only the first four columns +df <- df[, 1:4] +# Find the row index where the specified header pattern is located +header_row <- grep("CONTIG", df$Chromosome) +# Omit rows before the header row (if header is found) +df2 <- df[(header_row + 1):nrow(df), ] +# Group by V1 and calculate the sum of V4 for each group +result_df <- df2 %>% +group_by(Chromosome) %>% +summarize(Sum_V4 = sum(as.numeric(V4))) +# Merge the two data frames based on the "Chromosome" column +merged_df <- merge(result_df, chromosome_data, by = "Chromosome") +# Calculate normalized read coverage +merged_df$norm_read_cov <- merged_df$Sum_V4 / 
merged_df$Length +df = merged_df +# Extract 'norm_read_cov' values for autosomes and sex chromosomes +autosomal_cov <- df$norm_read_cov[!df$Chromosome %in% c("chrX", "chrY")] +chrX_cov <- df$norm_read_cov[df$Chromosome == "chrX"] +chrY_cov <- df$norm_read_cov[df$Chromosome == "chrY"] +# Calculate mean of autosomal counts +autosomal_mean <- mean(autosomal_cov) +cat(chrX_cov,chrY_cov) +# Define values to compare proximity +zero_value <- 0 +one_copy_value <- autosomal_mean / 2 +two_copies_value <- autosomal_mean +three_copies_value <- 1.5 * autosomal_mean +# Find the closest value for chrX_cov +closest_chrX <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrX_cov)) +# Find the closest value for chrY_cov +closest_chrY <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrY_cov)) +# Return the estimates in the specified format +return(paste(closest_chrX - 1, closest_chrY - 1, sep = "-")) +df <- read.table(file_path, header = FALSE, sep = "\t", col.names = c("Chromosome", "V2", "V3", "V4","V5","V6","V7"), fill = TRUE) +# Keep only the first four columns +df <- df[, 1:4] +library(dplyr, quietly = TRUE) +estimate_chromosomes_processing <- function(file_path) { +# Create an R data frame with chromosome numbers and lengths +chromosome_data <- data.frame( +Chromosome = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", +"chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", +"chr21", "chr22", "chrX", "chrY"), +Length = c(248956422, 242193529, 198295559, 190214555, 181538259, 170805979, 159345973, 145138636, +138394717, 133797422, 135086622, 133275309, 114364328, 107043718, 101991189, 90338345, +81282511, 78077248, 59128983, 63025520, 48129895, 50735781, 156040895, 57227415) +) +# Read the TSV file into a data frame +df <- read.table(file_path, header = FALSE, sep = "\t", col.names = c("Chromosome", "V2", "V3", "V4","V5","V6","V7"), fill = TRUE) +# Keep only the first four columns +df <- df[, 1:4] +# Find the row index where the specified header pattern is located +header_row <- grep("CONTIG", df$Chromosome) +# Omit rows before the header row (if header is found) +df2 <- df[(header_row + 1):nrow(df), ] +# Group by V1 and calculate the sum of V4 for each group +result_df <- df2 %>% +group_by(Chromosome) %>% +summarize(Sum_V4 = sum(as.numeric(V4))) +# Merge the two data frames based on the "Chromosome" column +merged_df <- merge(result_df, chromosome_data, by = "Chromosome") +# Calculate normalized read coverage +merged_df$norm_read_cov <- merged_df$Sum_V4 / merged_df$Length +# Return the merged data frame with normalized read coverage +return(merged_df) +} +estimate_chromosomes <- function(df) { +# Extract 'norm_read_cov' values for autosomes and sex chromosomes +autosomal_cov <- df$norm_read_cov[!df$Chromosome %in% c("chrX", "chrY")] +chrX_cov <- df$norm_read_cov[df$Chromosome == "chrX"] +chrY_cov <- df$norm_read_cov[df$Chromosome == "chrY"] +# Calculate mean of autosomal counts +autosomal_mean <- mean(autosomal_cov) +cat(chrX_cov,chrY_cov) +# Define values to compare proximity +zero_value <- 0 +one_copy_value <- autosomal_mean / 2 +two_copies_value <- autosomal_mean +three_copies_value <- 1.5 * autosomal_mean +# Find the closest value for chrX_cov +closest_chrX <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrX_cov)) +# Find the closest value for chrY_cov +closest_chrY <- which.min(abs(c(zero_value, one_copy_value, 
two_copies_value, three_copies_value) - chrY_cov)) +# Return the estimates in the specified format +return(paste(closest_chrX - 1, closest_chrY - 1, sep = "-")) +} +file_path <- "/Users/noah/Desktop/0450-B-1.short.counts.tsv" +# Example usage +result_df <- estimate_chromosomes_processing(file_path) +View(result_df) +result <- estimate_chromosomes(result_df) +cat(result) +file_path <- "/Users/noah/Desktop/proband.counts.tsv" +# Example usage +result_df <- estimate_chromosomes_processing(file_path) +result <- estimate_chromosomes(result_df) +cat(result) +file_path <- "/Users/noah/Desktop/HG00096.counts.tsv" +# Example usage +result_df <- estimate_chromosomes_processing(file_path) +result <- estimate_chromosomes(result_df) +View(result_df) +df <- result_df +# Extract 'norm_read_cov' values for autosomes and sex chromosomes +autosomal_cov <- df$norm_read_cov[!df$Chromosome %in% c("chrX", "chrY")] +chrX_cov <- df$norm_read_cov[df$Chromosome == "chrX"] +chrY_cov <- df$norm_read_cov[df$Chromosome == "chrY"] +# Calculate mean of autosomal counts +autosomal_mean <- mean(autosomal_cov) +cat(chrX_cov,chrY_cov) +# Define values to compare proximity +zero_value <- 0 +one_copy_value <- autosomal_mean / 2 +two_copies_value <- autosomal_mean +three_copies_value <- 1.5 * autosomal_mean +# Find the closest value for chrX_cov +closest_chrX <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrX_cov)) +# Find the closest value for chrY_cov +closest_chrY <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrY_cov)) +result <- estimate_chromosomes(result_df) +estimate_chromosomes <- function(df) { +# Extract 'norm_read_cov' values for autosomes and sex chromosomes +autosomal_cov <- df$norm_read_cov[!df$Chromosome %in% c("chrX", "chrY")] +chrX_cov <- df$norm_read_cov[df$Chromosome == "chrX"] +chrY_cov <- df$norm_read_cov[df$Chromosome == "chrY"] +# Calculate mean of autosomal counts +autosomal_mean <- mean(autosomal_cov) +#cat(chrX_cov,chrY_cov) +# Define values to compare proximity +zero_value <- 0 +one_copy_value <- autosomal_mean / 2 +two_copies_value <- autosomal_mean +three_copies_value <- 1.5 * autosomal_mean +# Find the closest value for chrX_cov +closest_chrX <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrX_cov)) +# Find the closest value for chrY_cov +closest_chrY <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrY_cov)) +# Return the estimates in the specified format +return(paste(closest_chrX - 1, closest_chrY - 1, sep = "-")) +} +result <- estimate_chromosomes(result_df) +cat(result) diff --git a/scripts/ufc_helpers/organize_results.py b/scripts/ufc_helpers/organize_results.py new file mode 100755 index 0000000..e50808b --- /dev/null +++ b/scripts/ufc_helpers/organize_results.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# The Germline Genomics of Cancer (G2C) +# Copyright (c) 2023-Present, Noah Fields and the Dana-Farber Cancer Institute +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 + + +import argparse + +def process_data(input_file, ancestry, ancestry_ID, chrX_count, chrY_count): + # Read the file line by line + with open(input_file, 'r') as file: + lines = file.readlines() + + # Extract coordinates from the comments + coordinates = [line.strip().split()[1:] for line in lines[3:6]] + coordinate_dict = {item[0].rstrip(':'): tuple(map(float, item[1:])) for item in coordinates} + + + # Extract 
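The script above mirrors scripts/ufc_helpers/sex_infer.R, whose cleaned-up rename appears later in this diff. A minimal sketch of the intended invocation, matching how the get_chr_count task in Infer_Sex_and_Ancestry.92.wdl calls it (the counts filename is a hypothetical placeholder):

    Rscript scripts/ufc_helpers/sex_infer.R sample.counts.tsv
    # prints the inferred copy numbers as "chrX-chrY", e.g. "2-0" for an XX sample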
diff --git a/scripts/ufc_helpers/organize_results.py b/scripts/ufc_helpers/organize_results.py
new file mode 100755
index 0000000..e50808b
--- /dev/null
+++ b/scripts/ufc_helpers/organize_results.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# The Germline Genomics of Cancer (G2C)
+# Copyright (c) 2023-Present, Noah Fields and the Dana-Farber Cancer Institute
+# Contact: Noah Fields
+# Distributed under the terms of the GNU GPL v2.0
+
+
+import argparse
+
+def process_data(input_file, ancestry, ancestry_ID, chrX_count, chrY_count):
+    # Read the file line by line
+    with open(input_file, 'r') as file:
+        lines = file.readlines()
+
+    # Extract the F/A/E reference coordinates from the comment lines (lines 4-6)
+    coordinates = [line.strip().split()[1:] for line in lines[3:6]]
+    coordinate_dict = {item[0].rstrip(':'): tuple(map(float, item[1:])) for item in coordinates}
+
+    # Extract the data row from the main part of the file (line 9)
+    data_line = lines[8].strip().split('\t')
+
+    # Prepare output data
+    output_data = {
+        'Sample': data_line[0].strip(),
+        'SNPs': int(data_line[1].strip()),
+        'GD1(x)': float(data_line[2].strip()),
+        'GD2(y)': float(data_line[3].strip()),
+        'GD3(z)': float(data_line[4].strip()),
+        'GD4': float(data_line[5].strip()),
+        'E(%)': float(data_line[6].strip()),
+        'F(%)': float(data_line[7].strip()),
+        'A(%)': float(data_line[8].strip()),
+        'F(xyz)': coordinate_dict['F'],
+        'A(xyz)': coordinate_dict['A'],
+        'E(xyz)': coordinate_dict['E'],
+        'chrX_count': chrX_count,
+        'chrY_count': chrY_count,
+        'ancestry': ancestry,
+        'ancestry_ID': ancestry_ID
+    }
+
+    # Assemble the output as a tab-separated header line plus one data row
+    output_tsv = (
+        f"Sample\tSNPs\tGD1(x)\tGD2(y)\tGD3(z)\tGD4\tE(%)\tF(%)\tA(%)\tF(xyz)\tA(xyz)\tE(xyz)\tchrX_count\tchrY_count\tancestry\tancestry_ID\n"
+        f"{output_data['Sample']}\t{output_data['SNPs']}\t{output_data['GD1(x)']}\t{output_data['GD2(y)']}\t"
+        f"{output_data['GD3(z)']}\t{output_data['GD4']}\t{output_data['E(%)']}\t{output_data['F(%)']}\t{output_data['A(%)']}\t"
+        f"{output_data['F(xyz)']}\t{output_data['A(xyz)']}\t{output_data['E(xyz)']}\t{output_data['chrX_count']}\t"
+        f"{output_data['chrY_count']}\t{output_data['ancestry']}\t{output_data['ancestry_ID']}\n"
+    )
+
+    print(output_tsv)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Organize raw ancestry output and inferred sex into one output TSV.')
+    parser.add_argument('--file', type=str, help='Path to the raw GrafPop output file', required=True)
+    parser.add_argument('--ancestry', type=str, help='Inferred ancestry label', required=True)
+    parser.add_argument('--ancestryID', type=int, help='Numeric ID of the inferred ancestry', required=True)
+    parser.add_argument('--chrX_count', type=int, help='Estimated chrX copy number', required=True)
+    parser.add_argument('--chrY_count', type=int, help='Estimated chrY copy number', required=True)
+
+    args = parser.parse_args()
+
+    process_data(args.file, args.ancestry, args.ancestryID, args.chrX_count, args.chrY_count)
diff --git a/scripts/ufc_helpers/sex_ancestry_accuracy_calc.py b/scripts/ufc_helpers/sex_ancestry_accuracy_calc.py
new file mode 100644
index 0000000..25fa387
--- /dev/null
+++ b/scripts/ufc_helpers/sex_ancestry_accuracy_calc.py
@@ -0,0 +1,47 @@
+import pandas as pd
+
+def process_data(df):
+    # Task 1: Calculate how often (chrX_count, chrY_count) agrees with the reported sex
+    condition1 = (df['chrX_count'] == 1) & (df['chrY_count'] == 1) & (df['Sex'] == 'MALE')
+    condition2 = (df['chrX_count'] == 2) & (df['chrY_count'] == 0) & (df['Sex'] == 'FEMALE')
+
+    # Rows meeting neither condition count as incorrect predictions
+    filtered_df = df[condition1 | condition2]
+
+    correct_rows = filtered_df.shape[0]
+    incorrect_rows = df.shape[0] - correct_rows
+
+    accuracy = correct_rows / (correct_rows + incorrect_rows)
+
+    print(f"\nAccuracy Prediction for chrX_count, chrY_count, and SEX: {accuracy:.2%}")
+    print(f"Instances omitted: {incorrect_rows}")
+
+    # Task 2: Count all Population / inferred-ancestry pairings
+    pairings_count = df.groupby(['ancestry', 'Population']).size().reset_index(name='Count')
+
+    print("\nCount of Ancestry Population pairings:")
+    print(pairings_count)
+
+    return accuracy
+
+def main():
+    # Hardcoded path to the merged per-sample table; adjust as needed
+    file_path = '/Users/noah/Desktop/test_sample.tsv'
+
+    # Read the TSV file into a DataFrame
+    df = pd.read_csv(file_path, sep='\t')
+
+    # Task for each
individual cohort + cohorts = df['Cohort'].unique() + for cohort in cohorts: + print(f"\nProcessing cohort: {cohort}") + cohort_df = df[df['Cohort'] == cohort] + process_data(cohort_df) + + # Task for the combined cohort + print("\nProcessing combined cohort:") + process_data(df) + +if __name__ == "__main__": + main() diff --git a/sex_infer.R b/scripts/ufc_helpers/sex_infer.R similarity index 63% rename from sex_infer.R rename to scripts/ufc_helpers/sex_infer.R index 70ed09f..16d1564 100644 --- a/sex_infer.R +++ b/scripts/ufc_helpers/sex_infer.R @@ -1,4 +1,4 @@ -library(dplyr) +library(dplyr, quietly = TRUE) estimate_chromosomes_processing <- function(file_path) { # Create an R data frame with chromosome numbers and lengths @@ -12,10 +12,10 @@ estimate_chromosomes_processing <- function(file_path) { ) # Read the TSV file into a data frame - df <- read.table(file_path, header = FALSE, sep = "\t", col.names = c("Chromosome", "V2", "V3", "V4", "V5", "V6"), fill = TRUE) + df <- read.table(file_path, header = FALSE, sep = "\t", col.names = c("Chromosome", "V2", "V3", "V4","V5","V6","V7"), fill = TRUE) - # Remove the 5th and 6th columns - df <- df[, -c(5, 6)] + # Keep only the first four columns + df <- df[, 1:4] # Find the row index where the specified header pattern is located header_row <- grep("CONTIG", df$Chromosome) @@ -43,50 +43,28 @@ estimate_chromosomes <- function(df) { autosomal_cov <- df$norm_read_cov[!df$Chromosome %in% c("chrX", "chrY")] chrX_cov <- df$norm_read_cov[df$Chromosome == "chrX"] chrY_cov <- df$norm_read_cov[df$Chromosome == "chrY"] - - # Calculate mean and standard deviation of autosomal counts + + # Calculate mean of autosomal counts autosomal_mean <- mean(autosomal_cov) - autosomal_sd <- sd(autosomal_cov) - - # Set thresholds based on mean and standard deviation - threshold_one_copy <- autosomal_mean - 3 * autosomal_sd - threshold_two_copies <- autosomal_mean - 2 * autosomal_sd - threshold_three_copies <- autosomal_mean + 2 * autosomal_sd - - # Initialize estimates - estimated_chrX <- 0 - estimated_chrY <- 0 - - # Iterate through each threshold and add 1 copy for every threshold it passes - if (max(chrX_cov) > threshold_one_copy) { - estimated_chrX <- estimated_chrX + 1 - } + #cat(chrX_cov,chrY_cov) + # Define values to compare proximity + zero_value <- 0 + one_copy_value <- autosomal_mean / 2 + two_copies_value <- autosomal_mean + three_copies_value <- 1.5 * autosomal_mean - if (max(chrX_cov) > threshold_two_copies) { - estimated_chrX <- estimated_chrX + 1 - } + # Find the closest value for chrX_cov + closest_chrX <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrX_cov)) - if (max(chrX_cov) > threshold_three_copies) { - estimated_chrX <- estimated_chrX + 1 - } - - if (max(chrY_cov) > threshold_one_copy) { - estimated_chrY <- estimated_chrY + 1 - } - - if (max(chrY_cov) > threshold_two_copies) { - estimated_chrY <- estimated_chrY + 1 - } - - if (max(chrY_cov) > threshold_three_copies) { - estimated_chrY <- estimated_chrY + 1 - } + # Find the closest value for chrY_cov + closest_chrY <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrY_cov)) # Return the estimates in the specified format - return(paste(estimated_chrX, estimated_chrY, sep = "-")) + return(paste(closest_chrX - 1, closest_chrY - 1, sep = "-")) } + # Extract the file path from the command-line arguments file_path <- commandArgs(trailingOnly = TRUE)[1] @@ -94,3 +72,4 @@ file_path <- commandArgs(trailingOnly = TRUE)[1] result_df 
<- estimate_chromosomes_processing(file_path) result <- estimate_chromosomes(result_df) cat(result) + diff --git a/wdl/.DS_Store b/wdl/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/wdl/.DS_Store differ diff --git a/wdl/Coding_Convergence.gsc.wdl b/wdl/Coding_Convergence.gsc.wdl new file mode 100644 index 0000000..4c96987 --- /dev/null +++ b/wdl/Coding_Convergence.gsc.wdl @@ -0,0 +1,215 @@ +# The Germline Genomics of Cancer (G2C) +# Copyright (c) 2023-Present, Noah Fields and the Dana-Farber Cancer Institute +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 + +version 1.0 + +# Analysis of germline variants and somatic drivers in coding regions +# This workflow identifies pathogenic and high-impact variants in germline data, as well as somatic driver mutations in coding regions. + +# Task to extract coding regions from a somatic table +task Extract_Coding_Regions { + input { + File germline_somatic_table + String cancer_type + } + command <<< + #Only interested in coding variants + cut -f1,2,3 < ~{germline_somatic_table} | tail -n +2 | grep -v 'noncoding' | grep -i '~{cancer_type}' | cut -f2 | sort | uniq > germline_coding.list + cut -f1,7,8 < ~{germline_somatic_table} | tail -n +2 | grep -v 'noncoding' | grep -i '~{cancer_type}' | cut -f2 | sort | uniq > somatic_coding.list + >>> + runtime { + docker: "ubuntu:latest" + } + output { + File germline_coding_list = "germline_coding.list" + File somatic_coding_list = "somatic_coding.list" + } +} + +# Task to extract pathogenic and high-impact germline variants +task Extract_Germline_Variants { + input { + File germline_merged_vep_vcf + File germline_somatic_table + File germline_genes + String patient_id + } + + command <<< + # Extract variants present only in the germline data + bcftools view -s ~{patient_id} ~{germline_merged_vep_vcf} -o sample.vcf + bcftools view -i 'AC>0 & GT="alt"' sample.vcf -o germline_only.vcf + + # Split VEP annotations and filter for relevant information + bcftools +split-vep -f '%CHROM|%POS|%CSQ/SYMBOL|%CSQ/IMPACT|%CSQ/gnomAD_AF_nfe|%CSQ/gnomAD_AF_popmax|%CSQ/gnomAD_controls_AF_popmax|%CSQ/ClinVar_external_CLNSIG' germline_only.vcf | cut -d'|' -f1,2,5,6,39-43 | awk '{gsub(/\|/, "\t");print}' | cut -d',' -f1 | awk -F'\t' '($5 == "Pathogenic" || $5 == "Likely_pathogenic" || $9 < 0.02 || $9 == "." || $9 == "") &&($5 == "Pathogenic" || $5 == "Likely_pathogenic" || $7 < 0.02 || $7 == "." || $7 == "") && ($5 == "Pathogenic" || $5 == "Likely_pathogenic" || $8 < 0.02 || $8 == "." 
|| $8 == "")' | grep -Ev 'Benign|Likely_benign|LOW' | grep -Fwf ~{germline_genes} > query.tsv + + # Extract potential deleterious germline genes + cut -f4 query.tsv | sort | uniq > potential_deleterious_germline_genes.list + touch c1.list + touch c2.list + touch c3.list + touch pathogenic_germline_genes.list + + # Condition 1: Pathogenic or Likely pathogenic by ClinVar + grep -E 'Pathogenic|Likely_pathogenic' query.tsv | cut -f4 | sort | uniq > c1.list + + # Condition 2: Tumor suppressor gene (TSG) or Mutation-Type includes D,N,F + while IFS= read -r gene; do + TSG=$(cut -f2-5 ~{germline_somatic_table} | grep "$gene" | grep 'TSG' | awk 'END {print NR}') + MT=$(cut -f2-5 ~{germline_somatic_table} | grep "$gene" | cut -f5 | grep -E 'D|N|F' | awk 'END {print NR}') + if ((TSG + MT > 0)); then + echo "$gene" >> c2.list + fi + done < potential_deleterious_germline_genes.list + + # Condition 3: Oncogene (ONC) or Mutation-Type includes Mis + while IFS= read -r gene; do + ONC=$(cut -f2-5 ~{germline_somatic_table} | grep "$gene" | grep 'ONC' | awk 'END {print NR}') + MIS=$(cut -f2-5 ~{germline_somatic_table} | grep "$gene" | cut -f5 | grep 'Mis' | awk 'END {print NR}') + if ((ONC + MIS > 0)); then + echo "$gene" >> c3.list + fi + done < potential_deleterious_germline_genes.list + + # Determine pathogenic germline genes based on conditions + while IFS= read -r gene; do + # If condition 1 is met + if [ $(grep "$gene" c1.list | awk 'END {print NR}') -gt 0 ];then + echo "$gene" >> pathogenic_germline_genes.list + # If condition 2 but not 3 is met + elif [ $(grep "$gene" c2.list | awk 'END {print NR}') -gt 0 ] && [ $(grep "$gene" c3.list | awk 'END {print NR}') -eq 0 ]; then + if [ $(grep "$gene" query.tsv | grep 'HIGH' | awk 'END {print NR}') -gt 0 ]; then + echo "$gene" >> pathogenic_germline_genes.list + fi + # If condition 3 but not 2 is met + elif [ $(grep "$gene" c2.list | awk 'END {print NR}') -eq 0 ] && [ $(grep "$gene" c3.list | awk 'END {print NR}') -gt 0 ]; then + if [ $(grep "$gene" query.tsv | grep 'MODERATE' | awk 'END {print NR}') -gt 0 ]; then + echo "$gene" >> pathogenic_germline_genes.list + fi + # If condition 2 and 3 is met or none of the conditions are met + else + if [ $(grep "$gene" query.tsv | grep -E 'HIGH|MODERATE' | awk 'END {print NR}') -gt 0 ]; then + echo "$gene" >> pathogenic_germline_genes.list + fi + fi + done < potential_deleterious_germline_genes.list + + sort pathogenic_germline_genes.list | uniq > ~{patient_id}.txt + + + >>> + + runtime { + docker: "vanallenlab/bcftools" + } + output{ + File out = "~{patient_id}.txt" + } +} + +# Extract somatic driver mutations based on somatic genes +task Extract_Somatic_Drivers { + input { + File somatic_tsv + File somatic_genes + String patient_id + } + command <<< + # Extract somatic driver mutations based on somatic genes + cat ~{somatic_tsv} | cut -f3 | grep -Fwf ~{somatic_genes} | sort | uniq >> ~{patient_id}.somatic_drivers.txt + >>> + + runtime { + docker: "ubuntu:latest" + } + output{ + File out = "~{patient_id}.somatic_drivers.txt" + } + +} + +task Output_Mutations { + input { + File germline_variants + File somatic_drivers + String patient_id + File germline_genes + File somatic_genes + } + + command <<< + # Initialize output string with patient ID + output="~{patient_id}" + + # Check each germline gene for pathogenic variants + while IFS=$'\t' read -r germline_gene; do + if [ $(grep -F "$germline_gene" ~{germline_variants} | awk 'END {print NR}' ) -gt 0 ]; then + output="${output} 1" + else + output="${output} 0" + fi + 
done < ~{germline_genes} + + # Check each somatic gene for driver mutations + while IFS=$'\t' read -r somatic_gene; do + if [ $(grep -F "$somatic_gene" ~{somatic_drivers} | awk 'END {print NR}' ) -gt 0 ]; then + output="${output} 1" + else + output="${output} 0" + fi + done < ~{somatic_genes} + + # Write output string to file + echo "$output" > ~{patient_id}_info.txt + >>> + + runtime { + docker: "ubuntu:latest" + } + output{ + File out = "~{patient_id}_info.txt" + } +} + + +# Define workflow +workflow convergence { + input{ + File germline_merged_vep_vcf # Merged VCF file annotated with VEP + File germline_somatic_table # Germline-Somatic table file containing convergence pairs + File somatic_tsv # Somatic TSV file containing mutation information + String patient_id # Unique identifier for the patient + String cancer_type # Type of cancer being analyzed + } + call Extract_Coding_Regions { + input: + germline_somatic_table = germline_somatic_table, + cancer_type = cancer_type + } + call Extract_Germline_Variants { + input: + germline_merged_vep_vcf = germline_merged_vep_vcf, + germline_genes = Extract_Coding_Regions.germline_coding_list, + germline_somatic_table = germline_somatic_table, + patient_id = patient_id + } + call Extract_Somatic_Drivers { + input: + somatic_genes = Extract_Coding_Regions.somatic_coding_list, + somatic_tsv = somatic_tsv, + patient_id = patient_id + } + call Output_Mutations { + input: + patient_id = patient_id, + somatic_drivers = Extract_Somatic_Drivers.out, + germline_variants = Extract_Germline_Variants.out, + germline_genes = Extract_Coding_Regions.germline_coding_list, + somatic_genes = Extract_Coding_Regions.somatic_coding_list + } + output{ + File convergence = Output_Mutations.out + } +} \ No newline at end of file diff --git a/wdl/Consolidate_Cohort_Data.wdl b/wdl/Consolidate_Cohort_Data.wdl new file mode 100644 index 0000000..cf82b22 --- /dev/null +++ b/wdl/Consolidate_Cohort_Data.wdl @@ -0,0 +1,173 @@ +# The Germline Genomics of Cancer (G2C) +# Copyright (c) 2023-Present, Noah Fields and the Dana-Farber Cancer Institute +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 + +# The purpose of this code is to consolidate data for each G2C cohort. 
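+#
+# For orientation, a rough sketch of the per-sample queries issued below (bucket
+# layout taken from the tasks in this file; the cohort "ufc" and sample
+# "SAMPLE-001" are hypothetical placeholders):
+#
+#   # CHARR contamination metrics for one sample
+#   gsutil cat gs://dfci-g2c-inputs/ufc/charr/SAMPLE-001.txt | cut -f2-9 | tail -n 1
+#
+#   # Manta deletion count for the same sample
+#   gsutil cat gs://dfci-g2c-inputs/ufc/gatk-sv/metrics/manta_SAMPLE-001.vcf.tsv | grep 'vcf_DEL_count' | cut -f2
+#
+# Any metric that cannot be retrieved is recorded as "NA" so the consolidated
+# TSV stays rectangular.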
+
+version 1.0
+
+task get_individuals {
+  input{
+    String cohort
+  }
+
+  command <<<
+    gsutil cat "gs://dfci-g2c-inputs/sample-lists/~{cohort}.samples.list" | awk 'NF > 0' > samples.list
+  >>>
+
+  runtime {
+    docker: "us.gcr.io/google.com/cloudsdktool/google-cloud-cli:latest"
+  }
+  output{
+    Array[String] out = read_lines("samples.list")
+  }
+}
+
+task unzip_ploidy_tsv {
+  input{
+    String ploidy_location
+  }
+
+  command <<<
+    gsutil -m cp ~{ploidy_location} ploidy.tsv.gz
+    gunzip ploidy.tsv.gz
+  >>>
+
+  runtime {
+    docker: "us.gcr.io/google.com/cloudsdktool/google-cloud-cli:latest"
+  }
+  output{
+    File out = "ploidy.tsv"
+  }
+}
+
+
+task get_sample_data{
+  input{
+    String sample
+    String cohort
+    File ploidy_tsv
+  }
+  command <<<
+    # Fetch one metric by key from a two-column key/value TSV;
+    # print NA when the key is absent or the value is empty or zero
+    get_metric() {
+      local val
+      val=$(grep "$1" "$2" | cut -f2)
+      if [ -z "$val" ] || [ "$val" == "0" ]; then
+        echo "NA"
+      else
+        echo "$val"
+      fi
+    }
+
+    charr_output=$(gsutil cat gs://dfci-g2c-inputs/~{cohort}/charr/~{sample}.txt | cut -f2-9 | tail -n 1)
+    if [ -z "$charr_output" ]; then
+      charr_output="NA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"
+    fi
+
+    demographics_output=$(gsutil cat gs://dfci-g2c-inputs/~{cohort}/demographics/~{sample}.txt | cut -f2-5,7-9,13-15 | sed -n '2p')
+    if [ -z "$demographics_output" ]; then
+      demographics_output="NA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"
+    fi
+
+    melt_INS_count=$(gsutil cat gs://dfci-g2c-inputs/~{cohort}/gatk-sv/metrics/melt_~{sample}.vcf.tsv | grep 'vcf_INS_count' | cut -f2)
+    if [ -z "$melt_INS_count" ] || [ "$melt_INS_count" == "0" ]; then
+      melt_INS_count="NA"
+    fi
+
+    gsutil -m cp gs://dfci-g2c-inputs/~{cohort}/gatk-sv/metrics/manta_~{sample}.vcf.tsv manta_~{cohort}.tmp
+    gsutil -m cp gs://dfci-g2c-inputs/~{cohort}/gatk-sv/metrics/wham_~{sample}.vcf.tsv wham_~{cohort}.tmp
+    gsutil -m cp gs://dfci-g2c-inputs/~{cohort}/gatk-sv/metrics/~{sample}.raw-counts.tsv metrics_~{cohort}.tmp
+
+    rd_median=$(get_metric 'rd_q50' metrics_~{cohort}.tmp)
+    rd_mean=$(get_metric 'rd_mean' metrics_~{cohort}.tmp)
+    manta_DEL_count=$(get_metric 'vcf_DEL_count' manta_~{cohort}.tmp)
+    manta_DUP_count=$(get_metric 'vcf_DUP_count' manta_~{cohort}.tmp)
+    manta_INS_count=$(get_metric 'vcf_INS_count' manta_~{cohort}.tmp)
+    manta_INV_count=$(get_metric 'vcf_INV_count' manta_~{cohort}.tmp)
+    manta_BND_count=$(get_metric 'vcf_BND_count' manta_~{cohort}.tmp)
+    wham_DEL_count=$(get_metric 'vcf_DEL_count' wham_~{cohort}.tmp)
+    wham_DUP_count=$(get_metric 'vcf_DUP_count' wham_~{cohort}.tmp)
+
+    ploidy_estimate=$(awk -F'\t' '$1 == "~{cohort}" {print}' ~{ploidy_tsv} | awk -v sample="~{sample}" '{ if ($2==sample) print }' | cut -f3-26,28-30)
+    if [ -z "$ploidy_estimate" ]; then
+      ploidy_estimate="NA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"
+    fi
+
+    echo -e
"~{sample}\t~{cohort}\t${demographics_output}\t${charr_output}\t${rd_median}\t${rd_mean}\t${manta_DEL_count}\t${manta_DUP_count}\t${manta_INS_count}\t${manta_INV_count}\t${manta_BND_count}\t${melt_INS_count}\t${wham_DEL_count}\t${wham_DUP_count}\t${ploidy_estimate}" > ~{sample}.txt + >>> + runtime { + docker: "us.gcr.io/google.com/cloudsdktool/google-cloud-cli:latest" + } + output{ + String out = read_string("~{sample}.txt") + } +} + +task write_output { + input { + String cohort + Array[String] data + } + command <<< + echo -e "Sample\tCohort\tgrafpop_SNPs\tgrafpop_GD1\tgrafpop_GD2\tgrafpop_GD3\tpct_EUR\tpct_AFR\tpct_ASN\tchrX_count\tchrY_count\tancestry\tHQ_HOM\tHQ_HOM_RATE\tHQ_HET\tHQ_HET_RATE\tCHARR\tMEAN_REF_AB_HOM_ALT\tHETEROZYGOSITY_RATE\tINCONSISTENT_AB_HET_RATE\trd_median\trd_mean\tmanta_DEL_count\tmanta_DUP_count\tmanta_INS_count\tmanta_INV_count\tmanta_BND_count\tmelt_INS_count\twham_DEL_count\twham_DUP_count\tchr1_CopyNumber\tchr2_CopyNumber\tchr3_CopyNumber\tchr4_CopyNumber\tchr5_CopyNumber\tchr6_CopyNumber\tchr7_CopyNumber\tchr8_CopyNumber\tchr9_CopyNumber\tchr10_CopyNumber\tchr11_CopyNumber\tchr12_CopyNumber\tchr13_CopyNumber\tchr14_CopyNumber\tchr15_CopyNumber\tchr16_CopyNumber\tchr17_CopyNumber\tchr18_CopyNumber\tchr19_CopyNumber\tchr20_CopyNumber\tchr21_CopyNumber\tchr22_CopyNumber\tchrX_CopyNumber\tchrY_CopyNumber\tmedian_coverage\twgd_score\tnondiploid_bins" > ~{cohort}.consolidated_data.tsv + cat ~{write_lines(data)} >> ~{cohort}.consolidated_data.tsv + gsutil -m cp ~{cohort}.consolidated_data.tsv gs://fc-secure-826914ff-6f0b-48bd-b6b1-1002dbffd5a3/ + >>> + + runtime { + docker: "us.gcr.io/google.com/cloudsdktool/google-cloud-cli:latest" + } + output{ + } +} + +workflow consolidate { + input { + String cohort + String ploidy_location + } + call get_individuals{ + input: + cohort = cohort + } + call unzip_ploidy_tsv{ + input: + ploidy_location = ploidy_location + } + scatter(i in get_individuals.out){ + call get_sample_data{ + input: + sample = i, + cohort = cohort, + ploidy_tsv = unzip_ploidy_tsv.out + } + } + call write_output { + input: + cohort = cohort, + data = get_sample_data.out + } +} diff --git a/wdl/G2C_Auto_File_Transfer.wdl b/wdl/G2C_Auto_File_Transfer.wdl new file mode 100644 index 0000000..6960d21 --- /dev/null +++ b/wdl/G2C_Auto_File_Transfer.wdl @@ -0,0 +1,224 @@ +# The Germline Genomics of Cancer (G2C) +# Copyright (c) 2023-Present, Noah Fields and the Dana-Farber Cancer Institute +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 + +#The purpose of this code is to transfer files from Terra to Google Cloud Storage. 
+ + +version 1.0 + + +task gatk_hc_reblocked{ + input { + #Go into /cohort/gatk-hc/reblocked + File reblocked_gvcf + File reblocked_gvcf_idx + String sample_id + String cohort + } + Int disk_gb = ceil(1.5 * size(reblocked_gvcf, "GB")) + 10 + String reblocked_gvcf_pathway = "gs://dfci-g2c-inputs/~{cohort}/gatk-hc/reblocked/~{sample_id}.reblocked.g.vcf.gz" + String reblocked_gvcf_idx_pathway = "gs://dfci-g2c-inputs/~{cohort}/gatk-hc/reblocked/~{sample_id}.reblocked.g.vcf.gz.tbi" + command <<< + gsutil -m cp ~{reblocked_gvcf} ~{reblocked_gvcf_pathway} + gsutil -m cp ~{reblocked_gvcf_idx} ~{reblocked_gvcf_idx_pathway} + >>> + runtime{ + docker: "us.gcr.io/google.com/cloudsdktool/google-cloud-cli:latest" + disks: "local-disk " + disk_gb + " HDD" + } + +} + +task gatk_sv_coverage{ + input { + #Go into /cohort/gatk-sv/coverage + File coverage_counts + String sample_id + String cohort + } + String coverage_counts_pathway = "gs://dfci-g2c-inputs/~{cohort}/gatk-sv/coverage/~{sample_id}.counts.tsv.gz" + + command <<< + gsutil -m cp ~{coverage_counts} ~{coverage_counts_pathway} + >>> + runtime{ + docker: "us.gcr.io/google.com/cloudsdktool/google-cloud-cli:latest" + } +} + +task gatk_sv_metrics{ + input { + #Go into /cohort/gatk-sv/metrics + Array[File] sample_metric_files + String sample_id + String cohort + } + String pe_file_pathway = "gs://dfci-g2c-inputs/~{cohort}/gatk-sv/metrics/~{sample_id}.pe-file.tsv" + String raw_counts_pathway = "gs://dfci-g2c-inputs/~{cohort}/gatk-sv/metrics/~{sample_id}.raw-counts.tsv" + String sr_file_pathway = "gs://dfci-g2c-inputs/~{cohort}/gatk-sv/metrics/~{sample_id}.sr-file.tsv" + String manta_vcf_pathway = "gs://dfci-g2c-inputs/~{cohort}/gatk-sv/metrics/manta_~{sample_id}.vcf.tsv" + String melt_vcf_pathway = "gs://dfci-g2c-inputs/~{cohort}/gatk-sv/metrics/melt_~{sample_id}.vcf.tsv" + String wham_vcf_pathway = "gs://dfci-g2c-inputs/~{cohort}/gatk-sv/metrics/wham_~{sample_id}.vcf.tsv" + + command <<< + gsutil -m cp ~{sep=" " sample_metric_files} "gs://dfci-g2c-inputs/~{cohort}/gatk-sv/metrics/" + + >>> + runtime{ + docker: "us.gcr.io/google.com/cloudsdktool/google-cloud-cli:latest" + } +} + +task gatk_sv_pesr{ + input { + #Go into /cohort/gatk-sv/pesr/ + File pesr_disc + File pesr_disc_index + File pesr_sd + File pesr_sd_index + File pesr_split + File pesr_split_index + String sample_id + String cohort + } + String pesr_disc_pathway = "gs://dfci-g2c-inputs/~{cohort}/gatk-sv/pesr/~{sample_id}.pe.txt.gz" + String pesr_disc_index_pathway = "gs://dfci-g2c-inputs/~{cohort}/gatk-sv/pesr/~{sample_id}.pe.txt.gz.tbi" + + String pesr_sd_pathway = "gs://dfci-g2c-inputs/~{cohort}/gatk-sv/pesr/~{sample_id}.sd.txt.gz" + String pesr_sd_index_pathway = "gs://dfci-g2c-inputs/~{cohort}/gatk-sv/pesr/~{sample_id}.sd.txt.gz.tbi" + + String pesr_split_pathway = "gs://dfci-g2c-inputs/~{cohort}/gatk-sv/pesr/~{sample_id}.sr.txt.gz" + String pesr_split_index_pathway = "gs://dfci-g2c-inputs/~{cohort}/gatk-sv/pesr/~{sample_id}.sr.txt.gz.tbi" + + command <<< + gsutil -m cp ~{pesr_disc} ~{pesr_disc_pathway} + gsutil -m cp ~{pesr_disc_index} ~{pesr_disc_index_pathway} + + gsutil -m cp ~{pesr_sd} ~{pesr_sd_pathway} + gsutil -m cp ~{pesr_sd_index} ~{pesr_sd_index_pathway} + + gsutil -m cp ~{pesr_split} ~{pesr_split_pathway} + gsutil -m cp ~{pesr_split_index} ~{pesr_split_index_pathway} + >>> + runtime{ + docker: "us.gcr.io/google.com/cloudsdktool/google-cloud-cli:latest" + } + +} +task manta{ + input { + #Go into /cohort/manta + File manta_vcf + File manta_vcf_tbi + String sample_id + String cohort + 
} + Int disk_gb = ceil(1.5 * size(manta_vcf, "GB")) + 10 + String manta_vcf_pathway = "gs://dfci-g2c-inputs/~{cohort}/manta/~{sample_id}.manta.vcf.gz" + String manta_vcf_tbi_pathway = "gs://dfci-g2c-inputs/~{cohort}/manta/~{sample_id}.manta.vcf.gz.tbi" + + command <<< + gsutil -m cp ~{manta_vcf} ~{manta_vcf_pathway} + gsutil -m cp ~{manta_vcf_tbi} ~{manta_vcf_tbi_pathway} + >>> + runtime{ + docker: "us.gcr.io/google.com/cloudsdktool/google-cloud-cli:latest" + disks: "local-disk " + disk_gb + " HDD" + } + +} + +task melt{ + input { + #Go into /cohort/melt + File melt_vcf + File melt_vcf_tbi + String sample_id + String cohort + + } + Int disk_gb = ceil(1.5 * size(melt_vcf, "GB")) + 10 + String melt_vcf_pathway = "gs://dfci-g2c-inputs/~{cohort}/melt/~{sample_id}.melt.vcf.gz" + String melt_vcf_tbi_pathway = "gs://dfci-g2c-inputs/~{cohort}/melt/~{sample_id}.melt.vcf.gz.tbi" + + command <<< + gsutil -m cp ~{melt_vcf} ~{melt_vcf_pathway} + gsutil -m cp ~{melt_vcf_tbi} ~{melt_vcf_tbi_pathway} + >>> + runtime{ + docker: "us.gcr.io/google.com/cloudsdktool/google-cloud-cli:latest" + disks: "local-disk " + disk_gb + " HDD" + } +} + +task wham{ + input { + #Go into /cohort/wham + File wham_vcf + File wham_index + String sample_id + String cohort + } + Int disk_gb = ceil(1.5 * size(wham_vcf, "GB")) + 10 + #Should these be Strings or File variables + String wham_vcf_pathway = "gs://dfci-g2c-inputs/~{cohort}/wham/~{sample_id}.wham.vcf.gz" + String wham_vcf_index_pathway = "gs://dfci-g2c-inputs/~{cohort}/wham/~{sample_id}.wham.vcf.gz.tbi" + + command <<< + gsutil -m cp ~{wham_vcf} ~{wham_vcf_pathway} + gsutil -m cp ~{wham_index} ~{wham_vcf_index_pathway} + >>> + + runtime{ + docker: "us.gcr.io/google.com/cloudsdktool/google-cloud-cli:latest" + disks: "local-disk " + disk_gb + " HDD" + } +} + + +# Define workflow +workflow store_in_gcs { + + input { + String cohort + String sample_id + } + + call gatk_hc_reblocked{ + input: + sample_id = sample_id, + cohort = cohort + } + call gatk_sv_coverage{ + input: + sample_id = sample_id, + cohort = cohort + } + call gatk_sv_metrics{ + input: + sample_id = sample_id, + cohort = cohort + } + call gatk_sv_pesr{ + input: + sample_id = sample_id, + cohort = cohort + } + call manta{ + input: + sample_id = sample_id, + cohort = cohort + } + call melt{ + input: + sample_id = sample_id, + cohort = cohort + } + call wham{ + input: + sample_id = sample_id, + cohort = cohort + } +} \ No newline at end of file diff --git a/wdl/GSC.wdl b/wdl/GSC.wdl new file mode 100644 index 0000000..0deb2f0 --- /dev/null +++ b/wdl/GSC.wdl @@ -0,0 +1,244 @@ +# The Germline Genomics of Cancer (G2C) +# Copyright (c) 2023-Present, Noah Fields and the Dana-Farber Cancer Institute +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 + +# This code will gather germline and somatic convergence. + + +version 1.0 + +import "https://raw.githubusercontent.com/vanallenlab/pancan_germline_wgs/infer_sex_ancestry/wdl/Coding_Convergence.wdl" as GSC_CC + +# The workflow gathers germline-somatic convergence for all combinations. 
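+#
+# A sketch of the matrix Write_tsv assembles below (gene, SNP, and patient names
+# are hypothetical): one row per patient; coding germline ("-g") and somatic
+# ("-s") genes are scored 0/1, noncoding germline SNPs 0/1/2 by risk-allele
+# dosage, and noncoding somatic hotspots ("-ncs") by mutation count:
+#
+#   patient_id ATM-g TP53-s rs1234 TERT-ncs
+#   PT-0001    1     0      2      3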
+workflow Convergence {
+  input{
+    Array[File] germline_noncoding_vcfs
+    Array[File] somatic_noncoding_vcfs
+    Array[File] somatic_noncoding_vcf_idxs
+    Array[File] somatic_tsv_arr
+    Array[String] patient_ids
+    File germline_merged_vep_vcf
+    File germline_somatic_table
+    File noncoding_germline_somatic_table
+    File somatic_noncoding_hotspots_bed
+    String cancer_type
+  }
+
+  scatter(patient_index in range(length(patient_ids))){
+    call GSC_CC.convergence {
+      input:
+        id = patient_ids[patient_index],
+        cancer_type = cancer_type,
+        somatic_tsv = somatic_tsv_arr[patient_index],
+        germline_merged_vep_vcf = germline_merged_vep_vcf,
+        germline_somatic_table = germline_somatic_table
+    }
+
+    call Extract_Noncoding_Germline_SNPs {
+      input:
+        patient_id = patient_ids[patient_index],
+        cancer_type = cancer_type,
+        germline_noncoding_vcf = germline_noncoding_vcfs[patient_index],
+        noncoding_germline_somatic_table = noncoding_germline_somatic_table
+    }
+
+    call Extract_Somatic_Noncoding_Mutations {
+      input:
+        somatic_noncoding_vcf = somatic_noncoding_vcfs[patient_index],
+        somatic_noncoding_vcf_idx = somatic_noncoding_vcf_idxs[patient_index],
+        somatic_noncoding_hotspots_bed = somatic_noncoding_hotspots_bed,
+        patient_id = patient_ids[patient_index],
+        cancer_type = cancer_type
+    }
+
+    call Concatenate_Strings {
+      input:
+        coding_str = convergence.convergence,
+        germline_noncoding_str = Extract_Noncoding_Germline_SNPs.out,
+        somatic_noncoding_str = Extract_Somatic_Noncoding_Mutations.out
+    }
+  }
+
+  call Write_tsv {
+    input:
+      cancer_type = cancer_type,
+      patients_info = Concatenate_Strings.out,
+      germline_somatic_table = germline_somatic_table,
+      noncoding_germline_somatic_table = noncoding_germline_somatic_table,
+      somatic_noncoding_hotspots_bed = somatic_noncoding_hotspots_bed
+  }
+
+  output{
+    File out = Write_tsv.out
+  }
+}
+
+
+# Task to Extract Noncoding Germline SNPs
+task Extract_Noncoding_Germline_SNPs {
+  input {
+    File germline_noncoding_vcf              # Germline VCF file
+    File noncoding_germline_somatic_table    # Table defining noncoding germline somatic regions
+    String cancer_type                       # Type of cancer being analyzed
+    String patient_id                        # Unique identifier for the sample
+  }
+
+  command <<<
+    # Create an empty file to store extracted SNPs
+    touch ~{patient_id}.vars.txt
+
+    # Extract SNP IDs from the noncoding germline somatic table for the specified cancer type
+    cut -f1,2 < ~{noncoding_germline_somatic_table} | tail -n +2 | grep -i '~{cancer_type}' | cut -f2 | sort | uniq > germline_noncoding.list
+
+    # Initialize an empty string to store SNP information
+    out=""
+
+    # Extract CHROM, POS, REF, ALT, and GT fields from the VCF file.
+    # NOTE: records are written space-separated so the "$snp HOM"/"$snp HET"
+    # greps below match; the split handles both unphased (/) and phased (|) GTs.
+    bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\t[%GT]\n' ~{germline_noncoding_vcf} | awk 'BEGIN {FS="\t"} {
+      split($5, a, /[\/|]/)
+      if (a[1] == "1" && a[2] == "1") {
+        print $1":"$2"-"$4, "HOM"
+      } else if (a[1] == "0" && a[2] == "1") {
+        print $1":"$2"-"$3, "HET"
+        print $1":"$2"-"$4, "HET"
+      } else if (a[1] == "1" && a[2] == "0") {
+        print $1":"$2"-"$3, "HET"
+        print $1":"$2"-"$4, "HET"
+      } else if (a[1] == "0" && a[2] == "0") {
+        print $1":"$2"-"$3, "HOM"
+      }
+    }' > query.txt
+
+    # Keep only the query.txt records that match a SNP of interest
+    grep -Fwf germline_noncoding.list query.txt > ~{patient_id}.vars.txt
+
+    # Loop through each SNP in the SNP list
+    while IFS= read -r snp; do
+      # Check if SNP is homozygous for the risk allele
+      if [ $(grep "$snp HOM" ~{patient_id}.vars.txt | awk 'END {print NR}') -gt 0 ]; then
+        out="${out} 2" # Append '2' to
the output string
+      # Check if SNP is heterozygous
+      elif [ $(grep "$snp HET" ~{patient_id}.vars.txt | awk 'END {print NR}') -gt 0 ]; then
+        out="${out} 1" # Append '1' to the output string
+      else
+        out="${out} 0" # Append '0' to the output string
+      fi
+    done < germline_noncoding.list
+
+    # Write the output string to a file
+    echo -e "$out" > ~{patient_id}.txt
+  >>>
+
+  output {
+    File out = "~{patient_id}.txt"
+  }
+  runtime {
+    docker: "vanallenlab/bcftools"
+  }
+}
+
+# Task to Extract Somatic Mutations in Noncoding Regions
+task Extract_Somatic_Noncoding_Mutations {
+  input {
+    File somatic_noncoding_vcf              # Somatic VCF file
+    File somatic_noncoding_vcf_idx          # Index file for the somatic VCF
+    File somatic_noncoding_hotspots_bed     # BED file defining noncoding regions
+    String patient_id                       # Unique identifier for the patient (tumor(s) for each patient are previously merged into one VCF)
+    String cancer_type                      # Type of cancer the patient has
+  }
+  command <<<
+    # Filter the hotspots BED file to regions relevant to the specified cancer type
+    grep -i '~{cancer_type}' ~{somatic_noncoding_hotspots_bed} | sort > somatic_noncoding_hotspots.bed
+
+    # Initialize an empty string to store somatic mutation counts
+    somatic_noncoding_str=""
+
+    # Iterate through each noncoding region
+    while IFS=$'\t' read -r CHROM START END TISSUE GENE; do
+
+      # Count somatic mutations within the region using bcftools
+      somatic_mutation_count=$(bcftools view -H -r $CHROM:$START-$END ~{somatic_noncoding_vcf} | wc -l)
+
+      # Append the mutation count to the somatic mutations string
+      somatic_noncoding_str="$somatic_noncoding_str $somatic_mutation_count"
+    done < somatic_noncoding_hotspots.bed
+
+    # Write somatic mutation counts to a text file
+    echo "$somatic_noncoding_str" > noncoding_somatic_mutation_counts.txt
+  >>>
+
+  output {
+    File out = "noncoding_somatic_mutation_counts.txt"
+  }
+  runtime {
+    docker: "vanallenlab/bcftools"
+  }
+}
+
+# Task to assemble the final convergence matrix across all patients
+task Write_tsv{
+  input{
+    Array[File] patients_info
+    File germline_somatic_table
+    File noncoding_germline_somatic_table
+    File somatic_noncoding_hotspots_bed
+    String cancer_type
+  }
+
+  command <<<
+    # Build the column lists: coding germline/somatic genes, noncoding SNPs, and hotspot regions
+    cut -f1,2,3 < ~{germline_somatic_table} | tail -n +2 | grep -v 'noncoding' | grep -i '~{cancer_type}' | cut -f2 | sort | uniq > germline_coding.list
+    cut -f1,7,8 < ~{germline_somatic_table} | tail -n +2 | grep -v 'noncoding' | grep -i '~{cancer_type}' | cut -f2 | sort | uniq > somatic_coding.list
+    cut -f1,2 < ~{noncoding_germline_somatic_table} | tail -n +2 | grep -i '~{cancer_type}' | cut -f2 | sort | uniq > germline_noncoding.list
+    grep -i '~{cancer_type}' ~{somatic_noncoding_hotspots_bed} | sort | cut -f5 > noncoding_somatic_regions.list
+
+    header="patient_id"
+    while IFS= read -r gene; do
+      header="${header} $gene-g"
+    done < germline_coding.list
+    while IFS= read -r gene; do
+      header="${header} $gene-s"
+    done < somatic_coding.list
+    while IFS= read -r snp; do
+      header="${header} $snp"
+    done < germline_noncoding.list
+    while IFS= read -r gene_region; do
+      header="${header} $gene_region-ncs"
+    done < noncoding_somatic_regions.list
+
+    echo "$header" > ~{cancer_type}.tsv
+    cat ~{write_lines(patients_info)} >> patient_files.list
+    while IFS= read -r file; do
+      cat "$file" >> ~{cancer_type}.tsv
+    done < patient_files.list
+  >>>
+  runtime {
+    docker: "ubuntu:latest"
+  }
+  output {
+    File out = "~{cancer_type}.tsv"
+  }
+}
+
+task Concatenate_Strings {
+  input{
+    File
coding_str + File germline_noncoding_str + File somatic_noncoding_str + } + command <<< + str1=$(cat ~{coding_str}) + str2=$(cat ~{germline_noncoding_str}) + str3=$(cat ~{somatic_noncoding_str}) + echo "$str1$str2$str3" > out.txt + >>> + runtime{ + docker: "ubuntu:latest" + } + output { + File out = "out.txt" + } +} + diff --git a/wdl/Infer_Sex_and_Ancestry.92.wdl b/wdl/Infer_Sex_and_Ancestry.92.wdl new file mode 100644 index 0000000..e357b54 --- /dev/null +++ b/wdl/Infer_Sex_and_Ancestry.92.wdl @@ -0,0 +1,127 @@ +# The Germline Genomics of Cancer (G2C) +# Copyright (c) 2023-Present, Noah Fields and the Dana-Farber Cancer Institute +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 + +#The purpose of this code is to infer sex and ancestry +#from variant call data. + + +version 1.0 + +task get_chr_count { + input { + File counts_coverage_file + } + + command <<< + Rscript /opt/pancan_germline_wgs/scripts/ufc_helpers/sex_infer.R ~{counts_coverage_file} > log.txt + + head -n1 log.txt | cut -f1 -d'-' > chrX.txt + head -n1 log.txt | cut -f2 -d'-' > chrY.txt + >>> + + runtime { + docker: "vanallenlab/g2c_pipeline:nf_sex_ancestry_alpha" + memory: "8 GB" # Adjust the amount based on your requirements + } + output { + Int chrX = read_int("chrX.txt") + Int chrY = read_int("chrY.txt") + File output_log = "log.txt" + } +} + +task organize { + input { + File raw_ancestry_file + String sample_ID + Int ancestryID + String ancestry + Int chrX_count + Int chrY_count + } + + command <<< + /opt/pancan_germline_wgs/scripts/ufc_helpers/organize_results.py --file ~{raw_ancestry_file} --ancestry ~{ancestry} --ancestryID ~{ancestryID} --chrX_count ~{chrX_count} --chrY_count ~{chrY_count} > ~{sample_ID}_demo.txt + >>> + + runtime { + docker: "vanallenlab/g2c_pipeline:nf_sex_ancestry_alpha" + memory: "8 GB" # Adjust the amount based on your requirements + } + output { + File output_file = "~{sample_ID}_demo.txt" + } +} + +task infer_ancestry{ + input { + File gvcf_file + #Int memory_var = 60 + #Int? bootDiskSizeGb_var + String sample_ID + } + Int memory_var = ceil(25 * size(gvcf_file, "GB")) + Int bootDiskSizeGb_var = ceil(6 * size(gvcf_file, "GB")+10) + + command <<< + cp /AncInferSNPs.txt . 
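+    # grafpop appears to expect its SNP panel (AncInferSNPs.txt, copied above)
+    # in the working directory; judging from the call below, its usage is
+    #   grafpop <input (g)VCF> <raw ancestry table>   (an inference, not documented here)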
+ grafpop ~{gvcf_file} ~{sample_ID}_ancestry_raw.txt + /opt/infer_ancestry.py ~{sample_ID}_ancestry_raw.txt > tmp.txt #outputs to ancestryID.txt and ancestry.txt + + # Extract the first line and save it to 'ancestry.txt' + head -n 1 tmp.txt > ancestry.txt + + # Extract the second line and save it to 'ancestryID.txt' + tail -n 1 tmp.txt > ancestryID.txt + + >>> + runtime{ + docker: "vanallenlab/g2c_pipeline:nf_graf_plink_alpha" + memory: "~{memory_var} GB" # Adjust the amount based on your requirements + bootDiskSizeGb: bootDiskSizeGb_var # Adjust the amount based on your requirements + } + output{ + Int Ancestry_ID = read_int("ancestryID.txt") + String Ancestry = read_string("ancestry.txt") + File grafpop_output_file = "~{sample_ID}_ancestry_raw.txt" + + + } +} + +# Define workflow +workflow infer_sex_ancestry { + + input { + File counts_coverage_file + File gvcf_file + String sample_ID + + } + + call get_chr_count{ + input: + counts_coverage_file = counts_coverage_file, + } + call infer_ancestry{ + input: + gvcf_file = gvcf_file, + sample_ID = sample_ID + } + call organize{ + input: + raw_ancestry_file = infer_ancestry.grafpop_output_file, + sample_ID = sample_ID, + ancestryID = infer_ancestry.Ancestry_ID, + ancestry = infer_ancestry.Ancestry, + chrX_count = get_chr_count.chrX, + chrY_count = get_chr_count.chrY, + } + + output{ + File demographics_file = organize.output_file + } +} + diff --git a/wdl/Vep.wdl b/wdl/Vep.wdl index cc67dcb..6278d20 100644 --- a/wdl/Vep.wdl +++ b/wdl/Vep.wdl @@ -27,7 +27,7 @@ workflow Vep { Array[File?] other_vep_files # All other files needed for VEP. These will be localized in full to each VM and moved to execution directory. Array[String] vep_options = [""] - String vep_assembly = "GRCh38" + String vep_assembly = "GRCh37" Int vep_version = 110 Boolean shard_vcfs = true diff --git a/wdl/Vep_hg19.wdl b/wdl/Vep_hg19.wdl new file mode 100644 index 0000000..a0a2277 --- /dev/null +++ b/wdl/Vep_hg19.wdl @@ -0,0 +1,149 @@ +# The Germline Genomics of Cancer (G2C) +# Copyright (c) 2023-Present, Ryan L. Collins and the Dana-Farber Cancer Institute +# Contact: Ryan Collins +# Distributed under the terms of the GNU GPL v2.0 + +# WDL to run VEP on one or more input VCFs + + +version 1.0 + + +workflow Vep { + input { + Array[File] vcfs + Array[File] vcf_idxs + + File reference_fasta + File vep_cache_tarball # VEP cache tarball downloaded from Ensembl + Array[String?] vep_remote_files # URIs for files stored in Google buckets that can be remotely sliced using tabix for each shard + Array[File?] vep_remote_file_indexes # Indexes corresponding to vep_remote_files + Array[File?] other_vep_files # All other files needed for VEP. These will be localized in full to each VM and moved to execution directory. + + Array[String] vep_options = [""] + String vep_assembly = "GRCh37" + Int vep_version = 110 + + Boolean shard_vcfs = false + Int remote_query_buffer = 2 + String? cohort_prefix + + String bcftools_docker + String vep_docker = "vanallenlab/g2c-vep:latest" + } + + Int n_remote_files = length(select_all(vep_remote_files)) + Boolean any_remote = n_remote_files > 0 + + + + Array[File?] 
all_other_vep_files = flatten(select_all([vep_remote_files, vep_remote_file_indexes])) + + call RunVep { + input: + vcf = vcfs[0], + vcf_idx = vcf_idxs[0], + reference_fasta = reference_fasta, + vep_cache_tarball = vep_cache_tarball, + other_vep_files = all_other_vep_files, + vep_options = vep_options, + vep_assembly = vep_assembly, + vep_version = vep_version, + docker = vep_docker + } + + + output { + File annotated_vcfs = RunVep.annotated_vcf + File annotated_vcf_idxs = RunVep.annotated_vcf_idx + } +} + + + +task RunVep { + input { + File vcf + File vcf_idx + + File reference_fasta + File vep_cache_tarball + Array[File?] other_vep_files + String vep_assembly + + Array[String] vep_options + Int vep_max_sv_size = 50 + Int vep_version = 110 + + Float mem_gb = 7.5 + Int n_cpu = 4 + Int? disk_gb + + String docker + } + + String out_filename = basename(vcf, ".vcf.gz") + ".vep.vcf.gz" + Int default_disk_gb = ceil(10 * size([vcf, vep_cache_tarball, reference_fasta], "GB")) + 50 + + command <<< + set -eu -o pipefail + + # Unpack contents of cache into $VEP_CACHE/ + # Note that $VEP_CACHE is a default ENV variable set in VEP docker + tar -xzvf ~{vep_cache_tarball} -C $VEP_CACHE/ + + # Relocate other_vep_files to execution directory + if [ ~{defined(other_vep_files)} == "true" ]; then + while read file; do + mv $file ./ + done < ~{write_lines(select_all(other_vep_files))} + fi + + #Removed '--nearest gene \' + #Removed '' + vep \ + --input_file ~{vcf} \ + --format vcf \ + --output_file ~{out_filename} \ + --vcf \ + --compress_output bgzip \ + --verbose \ + --force_overwrite \ + --species homo_sapiens \ + --assembly ~{vep_assembly} \ + --max_sv_size ~{vep_max_sv_size} \ + --offline \ + --cache \ + --dir_cache $VEP_CACHE/ \ + --cache_version ~{vep_version} \ + --dir_plugins $VEP_PLUGINS/ \ + --fasta ~{reference_fasta} \ + --minimal \ + --distance 10000 \ + --numbers \ + --hgvs \ + --no_escape \ + --symbol \ + --canonical \ + --domains \ + --merged \ + ~{sep=" " vep_options} + + tabix -f ~{out_filename} + + >>> + + output { + File annotated_vcf = "~{out_filename}" + File annotated_vcf_idx = "~{out_filename}.tbi" + } + + runtime { + docker: docker + memory: mem_gb + " GB" + cpu: n_cpu + disks: "local-disk " + select_first([disk_gb, default_disk_gb]) + " HDD" + bootDiskSizeGb: 25 + preemptible: 3 + } +} \ No newline at end of file diff --git a/wdl/WgsFastqToCram.wdl b/wdl/WgsFastqToCram.wdl index e3751d4..78fe90d 100644 --- a/wdl/WgsFastqToCram.wdl +++ b/wdl/WgsFastqToCram.wdl @@ -58,6 +58,7 @@ workflow WgsFastqToCram { Float? bwa_mem_gb Int? bwa_num_cpu Int? bwa_disk_size + Int? 
gather_bam_files_disk_size } # Clean up FASTQs with fastq-pair @@ -248,7 +249,7 @@ workflow WgsFastqToCram { output_bam_basename = sample_name, docker_image = gatk_docker, gatk_path = gatk_path, - disk_size = ceil(3 * size(AddRg.bam_wRG, "GB")) + 10, + disk_size = select_first([gather_bam_files_disk_size, ceil(3 * size(AddRg.bam_wRG, "GB")) + 10]), preemptible_tries = 5, compression_level = 2 } diff --git a/wdl/aou_charr.wdl b/wdl/aou_charr.wdl new file mode 100644 index 0000000..acf83af --- /dev/null +++ b/wdl/aou_charr.wdl @@ -0,0 +1,41 @@ +version 1.0 + +import "charr.wdl" as charr + +workflow aou_charr { + input { + File samples_file + String vcf_directory + String cohort + } + + scatter (sample_id in read_lines(samples_file)) { + call charr.charr { + input: + sample_id = sample_id, + vcf_directory = vcf_directory + } + + call cp_file { + input: + input_file = charr.output_file, + cohort = cohort, + sample_id = sample_id + } + } +} + +task cp_file { + input { + File input_file + String cohort + String sample_id + } + + command { + gsutil -m cp ~{input_file} gs://fc-secure-d21aa6b0-1d19-42dc-93e3-42de3578da45/dfci-g2c-inputs/aou/~{cohort}/charr/~{sample_id}.txt + } + runtime{ + docker: "us.gcr.io/google.com/cloudsdktool/google-cloud-cli:latest" + } +} diff --git a/wdl/charr.wdl b/wdl/charr.wdl new file mode 100644 index 0000000..408a1ad --- /dev/null +++ b/wdl/charr.wdl @@ -0,0 +1,163 @@ +# The Germline Genomics of Cancer (G2C) +# Copyright (c) 2023-Present, Noah Fields and the Dana-Farber Cancer Institute +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 + +# The purpose of this code is to use CHARR, a +# contamination estimator which leverages the +# infiltration of reference reads within homozygous +# alternate variant calls. +# https://github.com/atgu/CHARR +# We also use echtvar for annotation of gVCF +# https://github.com/brentp/echtvar + + +version 1.0 + +task decompose { + input { + File reblocked_gvcf #g.vcf.gz file + Int? disk_gb + Float memory_var = 3.5 + } + Int default_disk_size = ceil((2 * size(reblocked_gvcf, "GB")) + 10) + + command <<< + bcftools norm -m - ~{reblocked_gvcf} -o clean_bcf.bcf + >>> + + runtime { + docker: "vanallenlab/charr" + memory: "~{memory_var} GB" # Adjust the amount based on your requirements + disks: "local-disk " + select_first([disk_gb, default_disk_size]) + " HDD" + } + output { + File clean_bcf = "clean_bcf.bcf" + } +} + +#We use echtvar to annotate +task annotate { + input { + File clean_bcf + File gnomad_ref + Int? disk_gb + Float memory_var = 3.5 + } + Int default_disk_size = ceil((2 * size(clean_bcf, "GB")) + 10) + + command <<< + echtvar anno -e ~{gnomad_ref} ~{clean_bcf} output.annotated.bcf + >>> + + runtime { + docker: "vanallenlab/charr" + memory: "~{memory_var} GB" # Adjust the amount based on your requirements + disks: "local-disk " + select_first([disk_gb, default_disk_size]) + " HDD" + } + output { + File annotated_bcf = "output.annotated.bcf" + } +} +task step1 { + input { + File gvcf #g.vcf.gz file + Int? 
disk_gb + Float memory_var = 3.5 + } + Int default_disk_size = ceil((2 * size(gvcf, "GB")) + 10) + + command <<< + bcftools view -i 'ALT!="" & SUM(FORMAT/AD) > 0' ~{gvcf} > step1.annotated.bcf + >>> + + runtime { + docker: "vanallenlab/charr" + memory: "~{memory_var} GB" # Adjust the amount based on your requirements + disks: "local-disk " + select_first([disk_gb, default_disk_size]) + " HDD" + } + output { + File output_bcf = "step1.annotated.bcf" + } +} + +task step2 { + input { + File gvcf #g.vcf.gz file + Int? disk_gb + Float memory_var = 3.5 + } + Int default_disk_size = ceil((2 * size(gvcf, "GB")) + 10) + + + command <<< + bcftools annotate -c INFO/AF:=INFO/gnomad_popmax_af ~{gvcf} > step2.annotated.bcf + >>> + + runtime { + docker: "vanallenlab/charr" + memory: "~{memory_var} GB" # Adjust the amount based on your requirements + disks: "local-disk " + select_first([disk_gb, default_disk_size]) + " HDD" + } + output { + File output_bcf = "step2.annotated.bcf" + } +} + +task contamination { + input { + File gvcf #g.vcf.gz file + Int? disk_gb + Float memory_var = 3.5 + } + + Int default_disk_size = ceil((2 * size(gvcf, "GB")) + 10) + + command <<< + sceVCF -o contamination.txt ~{gvcf} + >>> + + runtime { + docker: "vanallenlab/charr" + memory: "~{memory_var} GB" # Adjust the amount based on your requirements + disks: "local-disk " + select_first([disk_gb, default_disk_size]) + " HDD" + } + output { + File charr_output = "contamination.txt" + } +} + +# Define workflow +workflow charr { + input { + File gnomad_ref = "gs://dfci-g2c-refs/echtvar/gnomad.v3.1.2.echtvar.popmax.v2.zip" + File reblocked_gvcf + } + + + call decompose{ + input: + reblocked_gvcf = reblocked_gvcf + } + call annotate{ + input: + clean_bcf = decompose.clean_bcf, + gnomad_ref = gnomad_ref + } + call step1{ + input: + gvcf = annotate.annotated_bcf + } + call step2{ + input: + gvcf = step1.output_bcf + } + call contamination{ + input: + gvcf = step2.output_bcf + } + + output{ + File charr_output = contamination.charr_output + } +} \ No newline at end of file diff --git a/wdl/convergence.wdl b/wdl/convergence.wdl new file mode 100644 index 0000000..5614efa --- /dev/null +++ b/wdl/convergence.wdl @@ -0,0 +1,224 @@ +# The Germline Genomics of Cancer (G2C) +# Copyright (c) 2023-Present, Noah Fields and the Dana-Farber Cancer Institute +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 + +version 1.0 + +# Analysis of germline variants and somatic drivers in coding regions +# This workflow identifies pathogenic and high-impact variants in germline data, as well as somatic driver mutations in coding regions. 
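+#
+# Gene-selection rule, summarized from the bash implementation of this workflow
+# in Coding_Convergence.gsc.wdl (restated here for readability, not new logic):
+#   1. Any ClinVar Pathogenic/Likely_pathogenic variant          -> report gene
+#   2. TSG or mutation type D/N/F only: needs a HIGH-impact hit  -> report gene
+#   3. Oncogene or mutation type Mis only: needs a MODERATE hit  -> report gene
+#   Both or neither of 2/3: any HIGH or MODERATE hit             -> report gene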
+ +# Task to extract coding regions from a somatic table +task Extract_Coding_Regions { + input { + File germline_somatic_table + String cancer_type + } + command <<< + #Only interested in coding variants + cut -f1,2,3,4 < ~{germline_somatic_table} | tail -n +2 | grep -v 'noncoding' | grep -v 'UNK' | grep -i '~{cancer_type}' | cut -f2 | sort | uniq > germline_coding.list + cut -f1,7,8 < ~{germline_somatic_table} | tail -n +2 | grep -v 'noncoding' | grep -i '~{cancer_type}' | cut -f2 | sort | uniq > somatic_coding.list + >>> + runtime { + docker: "ubuntu:latest" + preemptible: 3 + } + output { + File germline_coding_list = "germline_coding.list" + File somatic_coding_list = "somatic_coding.list" + } +} + +# Task to extract pathogenic and high-impact germline variants +task Extract_Germline_Variants { + input { + File germline_merged_vep_vcf + File germline_somatic_table + File germline_genes + String patient_id + } + + command <<< + set -eux -o pipefail + + # Extract variants present only in the germline data + bcftools view -s ~{patient_id} ~{germline_merged_vep_vcf} -Oz -o sample.vcf.gz + bcftools view -i 'AC>0 & GT="alt"' sample.vcf.gz -Oz -o germline_only.vcf.gz + rm sample.vcf.gz + echo "Checkpoint 1" + + # Split VEP annotations and filter for relevant information + bcftools +split-vep -f '%CHROM|%POS|%CSQ/SYMBOL|%CSQ/IMPACT|%CSQ/gnomAD_AF_nfe|%CSQ/gnomAD_AF_popmax|%CSQ/gnomAD_controls_AF_popmax|%CSQ/ClinVar_external_CLNSIG|%CSQ/Consequence' germline_only.vcf.gz | cut -d'|' -f1,2,5,6,39-44 | awk '{gsub(/\|/, "\t");print}' | cut -d',' -f1 | awk -F'\t' '($5 == "Pathogenic" || $5 == "Likely_pathogenic" || $9 < 0.02 || $9 == "." || $9 == "") &&($5 == "Pathogenic" || $5 == "Likely_pathogenic" || $7 < 0.02 || $7 == "." || $7 == "") && ($5 == "Pathogenic" || $5 == "Likely_pathogenic" || $8 < 0.02 || $8 == "." 
|| $8 == "")' | grep -Ev 'Benign|Likely_benign|LOW' | grep -Fwf ~{germline_genes} > query.tsv + rm germline_only.vcf.gz + + echo "Checkpoint 2" + # Extract potential deleterious germline genes + cut -f4 query.tsv | sort | uniq > potential_deleterious_germline_genes.list + touch c1.list + touch c2.list + touch c3.list + touch pathogenic_germline_genes.list + + python3 <>> + + runtime { + docker: "vanallenlab/g2c_pipeline" + preemptible: 3 + } + output{ + File out = "~{patient_id}.txt" + } +} + +# Extract somatic driver mutations based on somatic genes +task Extract_Somatic_Drivers { + input { + File somatic_tsv + File somatic_genes + String patient_id + } + command <<< + # Extract somatic driver mutations based on somatic genes + cat ~{somatic_tsv} | cut -f3 | grep -Fwf ~{somatic_genes} | sort | uniq >> ~{patient_id}.somatic_drivers.txt + >>> + + runtime { + docker: "ubuntu:latest" + preemptible: 3 + } + output{ + File out = "~{patient_id}.somatic_drivers.txt" + } + +} + +task Output_Mutations { + input { + File germline_variants + File somatic_drivers + String patient_id + File germline_genes + File somatic_genes + } + + command <<< + # Initialize output string with patient ID + output="~{patient_id}" + + # Check each germline gene for pathogenic variants + while IFS=$'\t' read -r germline_gene; do + if [ $(grep -F "$germline_gene" ~{germline_variants} | awk 'END {print NR}' ) -gt 0 ]; then + output="${output} 1" + else + output="${output} 0" + fi + done < ~{germline_genes} + + # Check each somatic gene for driver mutations + while IFS=$'\t' read -r somatic_gene; do + if [ $(grep -F "$somatic_gene" ~{somatic_drivers} | awk 'END {print NR}' ) -gt 0 ]; then + output="${output} 1" + else + output="${output} 0" + fi + done < ~{somatic_genes} + + # Write output string to file + echo "$output" > ~{patient_id}_info.txt + >>> + + runtime { + docker: "ubuntu:latest" + preemptible: 3 + } + output{ + File out = "~{patient_id}_info.txt" + } +} + + +# Define workflow +workflow convergence { + input{ + File germline_merged_vep_vcf # Merged VCF file annotated with VEP + File germline_somatic_table # Germline-Somatic table file containing convergence pairs + File somatic_tsv # Somatic TSV file containing mutation information + String patient_id # Unique identifier for the patient + String cancer_type # Type of cancer being analyzed + } + call Extract_Coding_Regions { + input: + germline_somatic_table = germline_somatic_table, + cancer_type = cancer_type + } + call Extract_Germline_Variants { + input: + germline_merged_vep_vcf = germline_merged_vep_vcf, + germline_genes = Extract_Coding_Regions.germline_coding_list, + germline_somatic_table = germline_somatic_table, + patient_id = patient_id + } + call Extract_Somatic_Drivers { + input: + somatic_genes = Extract_Coding_Regions.somatic_coding_list, + somatic_tsv = somatic_tsv, + patient_id = patient_id + } + call Output_Mutations { + input: + patient_id = patient_id, + somatic_drivers = Extract_Somatic_Drivers.out, + germline_variants = Extract_Germline_Variants.out, + germline_genes = Extract_Coding_Regions.germline_coding_list, + somatic_genes = Extract_Coding_Regions.somatic_coding_list + } + output{ + File convergence = Output_Mutations.out + } +} diff --git a/wdl/generate-sample-map.3.wdl b/wdl/generate-sample-map.3.wdl new file mode 100644 index 0000000..c345108 --- /dev/null +++ b/wdl/generate-sample-map.3.wdl @@ -0,0 +1,95 @@ +version 1.0 +## Copyright Broad Institute, 2019 +## Purpose: +## Generate a sample_map file, which can be used for 
diff --git a/wdl/generate-sample-map.3.wdl b/wdl/generate-sample-map.3.wdl
new file mode 100644
index 0000000..c345108
--- /dev/null
+++ b/wdl/generate-sample-map.3.wdl
@@ -0,0 +1,95 @@
+version 1.0
+## Copyright Broad Institute, 2019
+##
+## Purpose:
+## Generate a sample_map file, which can be used for the JointGenotyping workflow
+##
+## Requirements/expectations:
+##   - An array of file paths
+##   - An array of sample names
+##   - Name of output sample_map
+##
+## Outputs:
+##   - Sample map file
+##
+## Cromwell version support
+## - Successfully tested on v47
+## - Does not work on versions < v23 due to output syntax
+##
+## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
+##
+## LICENSING:
+## This script is released under the WDL source code license (BSD-3) (see LICENSE in
+## https://github.com/broadinstitute/wdl). Note however that the programs it calls may
+## be subject to different licenses. Users are responsible for checking that they are
+## authorized to run all programs before running this script. Please see the dockers
+## for detailed licensing information pertaining to the included programs.
+
+# WORKFLOW DEFINITION
+workflow GenerateSampleMap {
+  input {
+    Array[String] sample_names
+    Array[String] file_paths
+    String sample_map_name
+  }
+
+  call GenerateSampleMapFile {
+    input:
+      sample_names = sample_names,
+      file_paths = file_paths,
+      outfile = sample_map_name + ".sample_map"
+  }
+
+  output {
+    File sample_map = GenerateSampleMapFile.sample_map
+  }
+}
+
+# TASK DEFINITIONS
+task GenerateSampleMapFile {
+  input {
+    # Command parameters
+    Array[String] sample_names
+    Array[String] file_paths
+    String outfile
+
+    # Runtime parameters
+    String docker = "python:latest"
+    Int machine_mem_gb = 7
+    Int disk_space_gb = 100
+    Int preemptible_attempts = 3
+  }
+  command <<<
+    set -eo pipefail
+
+    python << CODE
+    file_paths = ['~{sep="','" file_paths}']
+    sample_names = ['~{sep="','" sample_names}']
+
+    if len(file_paths) != len(sample_names):
+        print("Number of file paths does not equal number of sample names")
+        exit(1)
+
+    with open("sample_map_file.txt", "w") as fi:
+        for i in range(len(file_paths)):
+            fi.write(sample_names[i] + "\t" + file_paths[i] + "\n")
+    CODE
+
+    mv sample_map_file.txt ~{outfile}
+  >>>
+
+  runtime {
+    docker: docker
+    memory: machine_mem_gb + " GB"
+    disks: "local-disk " + disk_space_gb + " HDD"
+    preemptible: preemptible_attempts
+  }
+
+  output {
+    File sample_map = outfile
+  }
+}
\ No newline at end of file
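The task's heredoc reduces to the standalone sketch below (sample names and GCS paths are hypothetical). The resulting sample_map is the two-column, tab-separated name-to-path file that joint-genotyping tools generally expect:

    python3 - <<'CODE'
    # Mirrors the task body above with hypothetical inputs
    file_paths = ['gs://my-bucket/sampleA.g.vcf.gz', 'gs://my-bucket/sampleB.g.vcf.gz']
    sample_names = ['sampleA', 'sampleB']

    if len(file_paths) != len(sample_names):
        raise SystemExit("Number of file paths does not equal number of sample names")

    with open('cohort.sample_map', 'w') as fi:
        for name, path in zip(sample_names, file_paths):
            fi.write(name + '\t' + path + '\n')
    CODE

    cat cohort.sample_map
    # sampleA	gs://my-bucket/sampleA.g.vcf.gz
    # sampleB	gs://my-bucket/sampleB.g.vcf.gz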