diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..1fd11da Binary files /dev/null and b/.DS_Store differ diff --git a/docker/.DS_Store b/docker/.DS_Store new file mode 100644 index 0000000..6c0f9e6 Binary files /dev/null and b/docker/.DS_Store differ diff --git a/docker/bcftools/Dockerfile b/docker/bcftools/Dockerfile new file mode 100644 index 0000000..678311d --- /dev/null +++ b/docker/bcftools/Dockerfile @@ -0,0 +1,34 @@ +# The Germline Genomics of Cancer (G2C) +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 + +# Simple Dockerfile for vanallenlab/bcftools + +#FROM ubuntu:22.04 +#FROM ubuntu:focal-20220531 +FROM continuumio/miniconda3 +MAINTAINER "Noah Fields " + + + + +# Install bcftools dependencies +RUN apt-get update && \ + apt-get install -y gcc make zlib1g-dev libbz2-dev liblzma-dev \ + libcurl4-openssl-dev libssl-dev libncurses5-dev && \ + apt-get clean + +# Install bcftools +#ARG BCFTOOLS_VERISON=1.19 +RUN wget https://github.com/samtools/bcftools/releases/download/1.19/bcftools-1.19.tar.bz2 && \ + tar -xvf bcftools-1.19.tar.bz2 && \ + cd bcftools-1.19 && \ + ./configure --prefix=/ && \ + make && \ + make install + +RUN rm -r bcftools-1.19 +RUN rm bcftools-1.19.tar.bz2 + +# Launch bash at runtime +CMD ["/bin/bash"] diff --git a/docker/charr/.DS_Store b/docker/charr/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/docker/charr/.DS_Store differ diff --git a/docker/charr/Dockerfile b/docker/charr/Dockerfile new file mode 100644 index 0000000..cbc62fe --- /dev/null +++ b/docker/charr/Dockerfile @@ -0,0 +1,58 @@ +# The Germline Genomics of Cancer (G2C) +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 + +# Simple Dockerfile for vanallenlab/g2c_pipeline:nf_graf_plink_alpha + +#FROM ubuntu:22.04 +#FROM ubuntu:focal-20220531 +FROM continuumio/miniconda3 +MAINTAINER "Noah Fields " + +# Create a conda environment and activate it +#RUN conda create --name myenv python=3.11 && \ +# conda activate myenv + + +# Install make, zlib, file, and git +RUN apt-get clean && \ + apt-get -qqy update && \ + apt-get -qqy install --fix-missing \ + apt-utils \ + default-libmysqlclient-dev \ + curl \ + vim \ + wget + +#RUN apt install -y python3 + + +# Clone NCBI Grafpop +RUN wget https://github.com/HTGenomeAnalysisUnit/SCE-VCF/releases/download/v0.1.2/sceVCF + +RUN chmod +x sceVCF +RUN mv sceVCF /bin/sceVCF + +RUN wget -O /bin/echtvar https://github.com/brentp/echtvar/releases/download/v0.1.9/echtvar \ + && chmod +x /bin/echtvar + +# Install bcftools dependencies +RUN apt-get update && \ + apt-get install -y gcc make zlib1g-dev libbz2-dev liblzma-dev \ + libcurl4-openssl-dev libssl-dev libncurses5-dev && \ + apt-get clean + +# Install bcftools +#ARG BCFTOOLS_VERISON=1.19 +RUN wget https://github.com/samtools/bcftools/releases/download/1.19/bcftools-1.19.tar.bz2 && \ + tar -xvf bcftools-1.19.tar.bz2 && \ + cd bcftools-1.19 && \ + ./configure --prefix=/ && \ + make && \ + make install + +RUN rm -r bcftools-1.19 +RUN rm bcftools-1.19.tar.bz2 + +# Launch bash at runtime +CMD ["/bin/bash"] diff --git a/docker/g2c-vep/.DS_Store b/docker/g2c-vep/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/docker/g2c-vep/.DS_Store differ diff --git a/docker/g2c_pipeline/Dockerfile b/docker/g2c_pipeline/Dockerfile index aee66cb..654152f 100644 --- a/docker/g2c_pipeline/Dockerfile +++ b/docker/g2c_pipeline/Dockerfile @@ -9,9 +9,7 @@ # It is based on the primary sv-pipeline image from 
GATK-SV, which is available here: # us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline -# And can be reviewed in more detail here: -# https://github.com/broadinstitute/gatk-sv/blob/main/dockerfiles/sv-pipeline/Dockerfile -ARG BASE_IMAGE=sv-pipeline:latest +ARG BASE_IMAGE=us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-09-13-v0.28.3-beta-af8362e3 FROM $BASE_IMAGE as base # Aliases for convenience @@ -27,5 +25,9 @@ RUN cd /opt && \ cd pancan_germline_wgs && \ git checkout $REPO_HASH +# Install missing R libraries as necessary +RUN Rscript -e 'install.packages("argparse", repos="https://cran.rstudio.com")' + + # Launch bash as entrypoint CMD ["/bin/bash"] diff --git a/docker/g2c_ufc/Dockerfile b/docker/g2c_ufc/Dockerfile new file mode 100644 index 0000000..956fecc --- /dev/null +++ b/docker/g2c_ufc/Dockerfile @@ -0,0 +1,6 @@ +FROM condaforge/mambaforge:22.11.1-4 AS build +ENV CONDA_SUBDIR=linux-64 +RUN conda config --env --set subdir linux-64 && \ + mamba update --all && \ + mamba install pandas seaborn matplotlib +RUN pip install qqman \ No newline at end of file diff --git a/docker/g2c_ufc/STEP_1_Make_Batches.plot_qc.py b/docker/g2c_ufc/STEP_1_Make_Batches.plot_qc.py new file mode 100644 index 0000000..d314443 --- /dev/null +++ b/docker/g2c_ufc/STEP_1_Make_Batches.plot_qc.py @@ -0,0 +1,81 @@ +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt +import argparse + +def plot_qc_metrics(pre_filtered_df, post_filtered_df): + # Check if all columns of interest are present in both DataFrames + columns_of_interest = ['n_grafpop_snps', 'hq_het_rate', 'charr', 'inconsistent_ab_het_rate', + 'median_coverage', 'chrX_ploidy', 'chrY_ploidy'] + + missing_columns = [] + for col in columns_of_interest: + if col not in pre_filtered_df.columns: + missing_columns.append((args.pre_filtered, col)) + if col not in post_filtered_df.columns: + missing_columns.append((args.post_filtered, col)) + + if missing_columns: + for filename, column in missing_columns: + print(f"Error: {filename} doesn't contain the '{column}' column.") + return + + # Filter 'Post-filtered' DataFrame for rows where global_qc_pass is TRUE + print(post_filtered_df.head()) + post_filtered_df = post_filtered_df[post_filtered_df['global_qc_pass'] == True] + print(post_filtered_df[post_filtered_df['global_qc_pass'] == True].head()) + + # Define columns of interest + for metric in columns_of_interest: + # Create a violin plot for pre-filtered and post-filtered datasets + plt.figure(figsize=(10, 6)) + + # Violin plot + plt.subplot(1, 2, 1) # Subplot for violin plot + sns.violinplot(data=[pre_filtered_df[metric], post_filtered_df[metric]], palette="Set3") + plt.title(f'{metric} - Quality Control') + plt.xlabel('Dataset') + plt.ylabel(metric) + plt.xticks([0, 1], ['Pre-filtered', 'Post-filtered']) + + # Box plot + plt.subplot(1, 2, 2) # Subplot for box plot + sns.boxplot(data=[pre_filtered_df[metric], post_filtered_df[metric]], palette="Set3") + plt.title(f'{metric} - Quality Control') + plt.xlabel('Dataset') + plt.ylabel(metric) + plt.xticks([0, 1], ['Pre-filtered', 'Post-filtered']) + + # Adjust layout and spacing + plt.tight_layout() + + # Save the plot + plt.savefig(f'{metric}_qc.png') + plt.close() + + +def parse_arguments(): + # Create argument parser + parser = argparse.ArgumentParser(description="Plot quality control metrics") + # Add arguments + parser.add_argument("--pre-filtered", required=True, help="Path to pre-filtered TSV file") + parser.add_argument("--post-filtered", required=True, help="Path to 
post-filtered TSV file") + # Parse arguments + args = parser.parse_args() + + return args + +if __name__ == "__main__": + # Parse command-line arguments + args = parse_arguments() + + # Read pre-filtered and post-filtered TSV files into DataFrames + try: + pre_filtered_df = pd.read_csv(args.pre_filtered, sep='\t') + post_filtered_df = pd.read_csv(args.post_filtered, sep='\t') + except FileNotFoundError as e: + print(f"Error: {e.filename} not found.") + exit(1) + + # Call plot function with DataFrame arguments + plot_qc_metrics(pre_filtered_df, post_filtered_df) \ No newline at end of file diff --git a/docker/g2c_ufc/qqplot.py b/docker/g2c_ufc/qqplot.py new file mode 100644 index 0000000..cef6c59 --- /dev/null +++ b/docker/g2c_ufc/qqplot.py @@ -0,0 +1,42 @@ +from qqman import qqman +import pandas as pd +import matplotlib.pyplot as plt + +if __name__ == "__main__": + df_pancancer_assoc = pd.read_csv("/Users/noah/Desktop/UFC/saige_output/saige_gene_output.pancancer.txt",sep='\t') + df_pancancer_assoc = df_pancancer_assoc[(df_pancancer_assoc['Group'] == 'Pathogenic;PLP;Likely_pathogenic;HIGH') & (df_pancancer_assoc['max_MAF'] == 0.01)] + df_breast_assoc = pd.read_csv("/Users/noah/Desktop/UFC/saige_output/saige_gene_output.breast.txt",sep='\t') + df_breast_assoc = df_breast_assoc[(df_breast_assoc['Group'] == 'Pathogenic;PLP;Likely_pathogenic;HIGH') & (df_breast_assoc['max_MAF'] == 0.01)] + df_colorectal_assoc = pd.read_csv("/Users/noah/Desktop/UFC/saige_output/saige_gene_output.colorectal.txt",sep='\t') + df_colorectal_assoc = df_colorectal_assoc[(df_colorectal_assoc['Group'] == 'Pathogenic;PLP;Likely_pathogenic;HIGH') & (df_colorectal_assoc['max_MAF'] == 0.01)] + + df_lung_assoc = pd.read_csv("/Users/noah/Desktop/UFC/saige_output/saige_gene_output.lung.txt",sep='\t') + df_lung_assoc = df_lung_assoc[(df_lung_assoc['Group'] == 'Pathogenic;PLP;Likely_pathogenic;HIGH') & (df_lung_assoc['max_MAF'] == 0.01)] + df_melanoma_assoc = pd.read_csv("/Users/noah/Desktop/UFC/saige_output/saige_gene_output.melanoma.txt",sep='\t') + df_melanoma_assoc = df_melanoma_assoc[(df_melanoma_assoc['Group'] == 'Pathogenic;PLP;Likely_pathogenic;HIGH') & (df_melanoma_assoc['max_MAF'] == 0.01)] + df_prostate_assoc = pd.read_csv("/Users/noah/Desktop/UFC/saige_output/saige_gene_output.prostate.txt",sep='\t') + df_prostate_assoc = df_prostate_assoc[(df_prostate_assoc['Group'] == 'Pathogenic;PLP;Likely_pathogenic;HIGH') & (df_prostate_assoc['max_MAF'] == 0.01)] + + pancancer_p_vals = list(df_pancancer_assoc['Pvalue']) + breast_p_vals = list(df_breast_assoc['Pvalue']) + colorectal_p_vals = list(df_colorectal_assoc['Pvalue']) + lung_p_vals = list(df_lung_assoc['Pvalue']) + melanoma_p_vals = list(df_melanoma_assoc['Pvalue']) + prostate_p_vals = list(df_prostate_assoc['Pvalue']) + + figure, axes = plt.subplots(nrows=2, ncols=3, figsize = (8,6)) + + # qqman.qqplot("/Users/noah/Desktop/UFC/saige_output/saige_gene_output.breast.txt", ax=axes[0,0],title="From file") + qqman.qqplot(pancancer_p_vals, ax=axes[0,0],title="Pancancer Cancer") + qqman.qqplot(breast_p_vals, ax=axes[0,1],title="Breast Cancer") + qqman.qqplot(colorectal_p_vals, ax=axes[0,2],title="Colorectal Cancer") + qqman.qqplot(lung_p_vals, ax=axes[1,0],title="Lung Cancer") + qqman.qqplot(melanoma_p_vals, ax=axes[1,1],title="Melanoma Cancer") + qqman.qqplot(prostate_p_vals, ax=axes[1,2],title="Prostate Cancer") + # qqman.qqplot(df_assoc.P, ax=axes[1,0],title="From Series") + + figure.tight_layout() + plt.show() + #plt.savefig("./SubQQplot.png",format="png") + #plt.clf() + 
#plt.close() \ No newline at end of file diff --git a/docker/g2c_ufc/qqplot_broad_retreat.py b/docker/g2c_ufc/qqplot_broad_retreat.py new file mode 100644 index 0000000..3afa25f --- /dev/null +++ b/docker/g2c_ufc/qqplot_broad_retreat.py @@ -0,0 +1,44 @@ +from qqman import qqman +import pandas as pd +import matplotlib.pyplot as plt + +if __name__ == "__main__": + df_pancancer_assoc = pd.read_csv("/Users/noah/Desktop/UFC/saige_output/saige_gene_output.pancancer.txt", sep='\t') + df_pancancer_assoc1 = df_pancancer_assoc[ + (df_pancancer_assoc['Group'] == 'Pathogenic;PLP;Likely_pathogenic;HIGH') & + (df_pancancer_assoc['max_MAF'] == 0.01) + ] + df_pancancer_assoc2 = df_pancancer_assoc[ + (df_pancancer_assoc['Group'] == 'Pathogenic;PLP;Likely_pathogenic;HIGH;MODERATE') & + (df_pancancer_assoc['max_MAF'] == 0.01) + ] + + pancancer_p_vals = list(df_pancancer_assoc['Pvalue']) + + figure, axes = plt.subplots(nrows=1, ncols=2, figsize=(30, 15)) + + # First plot: High Impact Variants + qqman.qqplot(df_pancancer_assoc1.Pvalue, ax=axes[0]) + axes[0].set_title("PLP and High Impact Variants", fontsize=44, pad=30) + axes[0].tick_params(axis='both', labelsize=40) # Adjust tick size + axes[0].set_xlabel("Expected -log10(p)", fontsize=40, weight='bold') + axes[0].set_ylabel("Observed -log10(p)", fontsize=40, weight='bold') + + # Second plot: High/Moderate Impact Variants + qqman.qqplot(df_pancancer_assoc2.Pvalue, ax=axes[1]) + axes[1].set_title("PLP and High/Moderate Impact Variants", fontsize=44, pad=30) + axes[1].tick_params(axis='both', labelsize=40) # Adjust tick size + axes[1].set_xlabel("Expected -log10(p)", fontsize=40, weight='bold') + axes[1].set_ylabel("Observed -log10(p)", fontsize=40, weight='bold') + + # Add overarching title + figure.suptitle("Preliminary Gene-Based Rare Variant Association Testing on Chr22", fontsize=50, weight='bold') # Adjust y to bring title down + + # Add footnote with more space (lower the y position) + figure.text(0.5, 0.01, "*PLP = Pathogenic and/or Likely Pathogenic", ha='center', fontsize=30, style='italic') + + # Adjust spacing between plots + plt.subplots_adjust(wspace=0.8, top=0.83, bottom=0.2) # Adjust horizontal space between subplots + + # Save the figure + plt.savefig("/Users/noah/Desktop/UFC/plots/QQplot_BroadRetreat_2024.png", format="png", dpi=800) diff --git a/docker/graf/.DS_Store b/docker/graf/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/docker/graf/.DS_Store differ diff --git a/docker/graf/Dockerfile b/docker/graf/Dockerfile new file mode 100644 index 0000000..5cc4a90 --- /dev/null +++ b/docker/graf/Dockerfile @@ -0,0 +1,58 @@ +# The Germline Genomics of Cancer (G2C) +# Contact: Ryan Collins +# Distributed under the terms of the GNU GPL v2.0 + +# Simple Dockerfile for vanallenlab/g2c_pipeline:nf_graf_plink_alpha + +#FROM ubuntu:22.04 +#FROM ubuntu:focal-20220531 +FROM continuumio/miniconda3 +MAINTAINER "Noah Fields " + +# Create a conda environment and activate it +#RUN conda create --name myenv python=3.11 && \ +# conda activate myenv + + +# Install make, zlib, file, and git +RUN apt-get clean && \ + apt-get -qqy update && \ + apt-get -qqy install --fix-missing \ + apt-utils \ + default-libmysqlclient-dev \ + curl \ + vim \ + wget + +#RUN apt install -y python3 + +# Install Plink on Mac +#RUN wget \ +# https://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20231018.zip && \ +# unzip plink_linux_x86_64_20231018.zip + +# Print the current working directory +#RUN echo "Current working directory: $(pwd)" > log.txt + 
+# Clone NCBI Grafpop +RUN wget -O GrafPop1.0.tar.gz \ + https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/GetZip.cgi?zip_name=GrafPop1.0.tar.gz && \ + tar -xzvf GrafPop1.0.tar.gz + +# Move the grafpop executable +RUN mv grafpop /bin + +# Copy the AncInferSNPs.txt to the home directory +RUN cp data/AncInferSNPs.txt . + +# Remove all traces of hg19 +RUN rm data/TG_10_1000_g37* + +# Remove the tar.gz file for lighter Docker +RUN rm GrafPop1.0.tar.gz + +COPY infer_ancestry.py /opt/ +RUN chmod a+x /opt/infer_ancestry.py + +# Launch bash at runtime +CMD ["/bin/bash"] diff --git a/docker/graf/infer_ancestry.py b/docker/graf/infer_ancestry.py new file mode 100644 index 0000000..5b1ae1e --- /dev/null +++ b/docker/graf/infer_ancestry.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# The Germline Genomics of Cancer (G2C) +# Copyright (c) 2023-Present, Noah Fields and the Dana-Farber Cancer Institute +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 + +import sys +import csv + +def read_sample_data(filename): + # Dictionary to store the data + sample_data = {} + + # Read the text file and populate the dictionary + with open(filename, 'r') as txt_file: + lines = txt_file.readlines() + + # Skip lines until the header line + header_index = lines.index('Sample\t#SNPs\tGD1 (x)\tGD2 (y)\tGD3 (z)\tGD4\tE(%)\tF(%)\tA(%)\n') + + for line in lines[header_index + 1:]: + # Split the line into columns + columns = line.strip().split('\t') + + # Extract relevant data + sample = columns[0] + gd1 = float(columns[2]) + gd2 = float(columns[3]) + gd3 = float(columns[4]) + gd4 = float(columns[5]) + e_percent = float(columns[6]) + f_percent = float(columns[7]) + a_percent = float(columns[8]) + + # Store the values in the dictionary + sample_data[sample] = { + 'GD1': gd1, + 'GD2': gd2, + 'GD3': gd3, + 'GD4': gd4, + 'E(%)': e_percent, + 'F(%)': f_percent, + 'A(%)': a_percent + } + + return sample_data + +def classify_population(sample_data): + for sample, data in sample_data.items(): + e_percent = data['E(%)'] + f_percent = data['F(%)'] + a_percent = data['A(%)'] + gd1 = data['GD1'] + + if e_percent >= 90: + print(f"European\n1") + elif f_percent >= 95: + print(f"African\n2") + elif a_percent >= 95: + print(f"East-Asian\n3") + elif 40 <= f_percent < 95 and a_percent < 14: + print(f"African-American\n4") + elif f_percent < 50 and e_percent < 90 and a_percent < 14 and gd1 < 1.48: + print(f"Latin-American-1\n5") + elif a_percent >= 14 and f_percent >= 14: + print(f"Other\n9") + elif gd1 > 30 * data['GD4']**2 + 1.58 and f_percent < 14: + print(f"Asian-Pacific-Islander\n7") + elif gd1 + data['GD4'] < 1.525 and f_percent < 14: + print(f"Latin-American-2\n6") + elif data['GD4'] > 5 * (gd1 - 1.524)**2 + 0.0575: + print(f"South-Asian\n8") + else: + print(f"Unknown\n10") + +if __name__ == "__main__": + # Check if the filename is provided as a command-line argument + if len(sys.argv) != 2: + print("Usage: python script.py ") + sys.exit(1) + + filename = sys.argv[1] + data = read_sample_data(filename) + + # Classify the population based on the provided conditions + classify_population(data) diff --git a/docker/gsc_pytools/.DS_Store b/docker/gsc_pytools/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/docker/gsc_pytools/.DS_Store differ diff --git a/docker/gsc_pytools/Dockerfile b/docker/gsc_pytools/Dockerfile new file mode 100644 index 0000000..5dd7187 --- /dev/null +++ b/docker/gsc_pytools/Dockerfile @@ -0,0 +1,25 @@ +# The Germline Somatic Convergence 
(G2C) +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 +# Germline_Somatic Docker +# Use an Ubuntu base image +FROM ubuntu:latest +MAINTAINER "Noah Fields " + +# Install Python 3.9 and pip +RUN apt-get update && \ + apt-get install -y software-properties-common && \ + add-apt-repository ppa:deadsnakes/ppa && \ + apt-get update && \ + apt-get install -y python3.9 python3-pip python3.9-distutils && \ + apt-get clean + +# Install Python packages using pip +RUN python3.9 -m pip install numpy pandas firthlogist scipy seaborn statsmodels scikit-learn + +# Copy your utility script to the image +COPY gsc_util.py /opt/ +COPY *.py /opt/ + +# Set the default command to run at runtime +CMD ["/bin/bash"] diff --git a/docker/gsc_pytools/gsc_graph.py b/docker/gsc_pytools/gsc_graph.py new file mode 100644 index 0000000..561ecee --- /dev/null +++ b/docker/gsc_pytools/gsc_graph.py @@ -0,0 +1,349 @@ +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np +from sklearn.linear_model import LinearRegression +from sklearn.metrics import r2_score +from statsmodels.stats.multitest import fdrcorrection + +def plot_germline_frequencies(data, + hmf_col='germline_plp_frequency_HMF', + profile_col='germline_plp_frequency_PROFILE', + save_path="germline_snp_frequencies.png"): + + # Define the cancer types of interest + cancer_types = ['Breast', 'Prostate', 'Colorectal', 'Lung', 'Kidney', 'Pancancer'] + + # Drop rows where either the HMF or PROFILE frequency is missing (NaN) + data_clean = data.dropna(subset=[hmf_col, profile_col]) + data = data_clean + + # Filtering to keep only the specified columns + data = data[['germline_risk_allele', 'cancer_type', hmf_col, profile_col]] + + # Removing duplicates + data = data.drop_duplicates() + + # Create a figure with subplots + fig, axes = plt.subplots(2, 3, figsize=(18, 10)) + axes = axes.flatten() # Flatten the array for easy indexing + + # Iterate through each cancer type and create a subplot + for idx, cancer_type in enumerate(cancer_types): + + # Filter the DataFrame for the current cancer type + filtered_data = data[data['cancer_type'] == cancer_type] + print(f"{cancer_type}: has {len(data)} rows") + + + # Extract HMF and PROFILE frequencies + x = filtered_data[hmf_col].values.reshape(-1, 1) + y = filtered_data[profile_col].values + + # Fit a linear regression model + model = LinearRegression() + model.fit(x, y) + y_pred = model.predict(x) + r_squared = r2_score(y, y_pred) + + # Plot the data + axes[idx].scatter(x, y, alpha=0.5) + axes[idx].plot(x, y_pred, color='blue', label='Best Fit', linestyle='--') + + # Add y=x line + axes[idx].plot(x, x, color='black', linestyle='--', label='y = x') + + # Set titles and labels + axes[idx].set_title(f'{cancer_type} (R² = {r_squared:.2f})') + axes[idx].set_xlabel('Germline Allele Frequency HMF') + axes[idx].set_ylabel('Germline Allele Frequency PROFILE') + axes[idx].legend() + axes[idx].set_xlim(0, max(x.max(), y.max())) + axes[idx].set_ylim(0, max(x.max(), y.max())) + + plt.tight_layout() + plt.savefig(save_path) + plt.close() + +def plot_somatic_mutation_frequencies(df, + x_col="somatic_plp_frequency_HMF", + y_col="somatic_plp_frequency_PROFILE", + save_path="somatic_mutation_frequencies.png"): + # Specify the cancer types to include + cancer_types = ["Breast", "Prostate", "Colorectal", "Lung", "Kidney", "Pancancer"] + + # Drop rows where either the HMF or PROFILE frequency is missing (NaN) + df_clean = df.dropna(subset=[x_col, y_col]) + df = df_clean + + # Filtering to keep only the 
specified columns + df = df[['somatic_gene', 'cancer_type', x_col, y_col]] + + # Removing duplicates + df = df.drop_duplicates() + + # Set up the figure and axes + fig, axes = plt.subplots(2, 3, figsize=(18, 12)) + axes = axes.flatten() + + for i, cancer_type in enumerate(cancer_types): + # Filter data for the current cancer type + subset = df[df['cancer_type'] == cancer_type] + + # Prepare data for regression + X = subset[x_col].values.reshape(-1, 1) + y = subset[y_col].values + + # Fit linear regression + model = LinearRegression() + model.fit(X, y) + + # Get predictions and R² value + y_pred = model.predict(X) + r2 = r2_score(y, y_pred) + + # Plot the data + axes[i].scatter(X, y, alpha=0.6, label=cancer_type) + axes[i].plot(X, y_pred, linestyle='--', color='red', label='Best Fit') + axes[i].plot([X.min(), X.max()], [X.min(), X.max()], 'k--', label='y=x') # dashed line for y=x + axes[i].set_title(f"{cancer_type}: R² = {r2:.2f}") + axes[i].set_xlabel(x_col) + axes[i].set_ylabel(y_col) + axes[i].legend() + + # Adjust layout and save the figure + plt.tight_layout() + plt.savefig(save_path) + plt.close() + +def plot_volcano(df, + p_col="p_val_combined", + or_col="OR_combined", + criteria_col="criteria", + cancer_type_col="cancer_type", + germline_event="germline_risk_allele", + figure_title="Volcano Plot of Germline Somatic Convergences", + save_path="insert_path_here.png"): + + # Drop rows where either p-value or OR is NaN + df_clean = df.dropna(subset=[p_col, or_col]) + df = df_clean + + # Calculate -log10(p-value) and log2(OR) + df['log10_p'] = -np.log10(df[p_col]) + df['log2_OR'] = np.log2(df[or_col]) + + # Define shapes and colors + shapes = {'known_ppi': 'o', 'same_gene': 's', 'protein_complex': 'h', + 'ligand_receptor': '^', 'known_ppi;ligand_receptor': '*', + 'known_ppi;protein_complex': 'd'} + colors = {'Breast': 'red', 'Prostate': 'blue', 'Colorectal': 'green', + 'Lung': 'purple', 'Kidney': 'orange', 'Pancancer': 'black'} + + # Create figure and axis + fig, ax = plt.subplots(figsize=(10, 8)) + + # Plot points + for criteria_val, shape in shapes.items(): + for cancer_type, color in colors.items(): + subset = df[(df[criteria_col] == criteria_val) & + (df[cancer_type_col] == cancer_type) & + (df['log2_OR'] > -20)] + ax.scatter(subset['log2_OR'], subset['log10_p'], + marker=shape, color=color, alpha=0.7) + + # Add significance thresholds + bonferroni_threshold = -np.log10(0.05 / len(df)) + nominal_threshold = -np.log10(0.05) + rejected, pvals_corrected = fdrcorrection(df[p_col], alpha=0.05) + if rejected.any(): + fdr_threshold = -np.log10(pvals_corrected[rejected].max()) + ax.axhline(y=fdr_threshold, color='green', linestyle='--', label=f'FDR Threshold') + ax.axhline(y=bonferroni_threshold, color='red', linestyle='--', label=f'Bonferroni') + ax.axhline(y=nominal_threshold, color='blue', linestyle='--', label='Nominal') + + # Annotate points above Bonferroni + significant_points = df[df['log10_p'] > nominal_threshold] + for _, row in significant_points.iterrows(): + # Get the value from germline_event, fallback to 'germline_gene' if germline_event is NA or NaN + germline_value = row[germline_event] if pd.notna(row[germline_event]) else row['germline_gene'] + label_text = f"({germline_value}, {row['somatic_gene']})" + ax.text(row['log2_OR'], row['log10_p'], label_text, fontsize=8, ha='right') + + # # Add legends + # color_handles = [plt.Line2D([0], [0], marker='o', color='w', + # markerfacecolor=color, label=cancer) + # for cancer, color in colors.items()] + # color_legend = 
ax.legend(handles=color_handles, title="Cancer Type", + # bbox_to_anchor=(1.05, 1), loc='upper left') + # ax.add_artist(color_legend) + + # shape_handles = [plt.Line2D([0], [0], marker=shape, color='k', label=criteria) + # for criteria, shape in shapes.items()] + # shape_legend = ax.legend(handles=shape_handles, title="Criteria", + # bbox_to_anchor=(1.05, 0.75), loc='upper left') + # ax.add_artist(shape_legend) + + # Add vertical dashed line at x=0 + ax.axvline(x=0, color='gray', linestyle='--', alpha=0.7, label='x=0') + + # # # Dashed lines legend + # # line_handles = [ + # # plt.Line2D([0], [0], color='red', linestyle='--', label='Bonferroni'), + # # plt.Line2D([0], [0], color='blue', linestyle='--', label='Nominal'), + # # plt.Line2D([0], [0], color='green', linestyle='--', label='Benjamini-Hochberg'), + # # plt.Line2D([0], [0], color='gray', linestyle='--', label='x=0') + # # ] + + # # # Use a separate legend for lines + # # line_legend = ax.legend(handles=line_handles, title="Significance Lines", + # # bbox_to_anchor=(1.05, 0.5), loc='upper left') + # # ax.add_artist(line_legend) + + # Add point legends (color and shape combined) + color_handles = [ + plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=color, label=cancer) + for cancer, color in colors.items() + ] + shape_handles = [ + plt.Line2D([0], [0], marker=shape, color='k', label=criteria) + for criteria, shape in shapes.items() + ] + + # Add line legend for thresholds + line_handles = [ + plt.Line2D([0], [0], color='red', linestyle='--', label='Bonferroni'), + plt.Line2D([0], [0], color='blue', linestyle='--', label='Nominal'), + plt.Line2D([0], [0], color='green', linestyle='--', label='FDR Threshold'), + plt.Line2D([0], [0], color='gray', linestyle='--', label='x=0'), + ] + + # Create a combined legend outside the plot + fig.legend(handles=color_handles + shape_handles + line_handles, + title="Legend", loc='center right', bbox_to_anchor=(1.3, 0.5)) + + # Adjust layout to give room for the legend + plt.subplots_adjust(right=0.75) + + # Labels, title, and save + ax.set_xlabel('log2(OR)') + ax.set_ylabel('-log10(p)') + ax.set_title(figure_title) + plt.tight_layout() + fig.savefig(save_path) + plt.close(fig) + + + +def plot_p_values(data, + hmf_col='p_val_HMF', + profile_col='p_val_PROFILE', + save_path="p_val_comparison.png"): + """ + Function to plot -log10 transformed p-values for HMF and PROFILE across different cancer types. 
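Review note on the three dashed significance lines drawn in plot_volcano above: they come from different error-control models, and the FDR line is data-dependent. A self-contained sketch of how the cutoffs relate, using the same statsmodels `fdrcorrection` helper the script imports (the p-values here are illustrative only):

```python
import numpy as np
from statsmodels.stats.multitest import fdrcorrection

pvals = np.array([1e-6, 3e-3, 0.04, 0.2, 0.7])     # illustrative values only
nominal = 0.05                                      # per-test threshold
bonferroni = 0.05 / len(pvals)                      # family-wise threshold
rejected, qvals = fdrcorrection(pvals, alpha=0.05)  # Benjamini-Hochberg procedure
# plot_volcano draws its FDR line at the largest BH-adjusted p among the rejections:
fdr_line = qvals[rejected].max() if rejected.any() else None
print(nominal, bonferroni, fdr_line)
```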
+ """ + # Define the cancer types of interest + cancer_types = ['Breast', 'Prostate', 'Colorectal', 'Lung', 'Kidney', 'Pancancer'] + + # Drop rows where either the HMF or PROFILE p-value is missing (NaN) + data_clean = data.dropna(subset=[hmf_col, profile_col]) + data_clean = data_clean[(data_clean[hmf_col] > 0) & (data_clean[profile_col] > 0)] + data = data_clean + + # Transform p-values to -log10(p) + data['log_p_val_HMF'] = -np.log10(data[hmf_col]) + data['log_p_val_PROFILE'] = -np.log10(data[profile_col]) + + # Drop rows with non-finite values after transformation + data = data_clean[np.isfinite(data['log_p_val_HMF']) & np.isfinite(data['log_p_val_PROFILE'])] + + # Create a figure with subplots + fig, axes = plt.subplots(2, 3, figsize=(18, 10)) + axes = axes.flatten() # Flatten the array for easy indexing + + # Iterate through each cancer type and create a subplot + for idx, cancer_type in enumerate(cancer_types): + + # Filter the DataFrame for the current cancer type + filtered_data = data[data['cancer_type'] == cancer_type] + + # Extract transformed HMF and PROFILE p-values + x = filtered_data['log_p_val_HMF'].values.reshape(-1, 1) + y = filtered_data['log_p_val_PROFILE'].values + + # Fit a linear regression model + model = LinearRegression() + model.fit(x, y) + y_pred = model.predict(x) + r_squared = r2_score(y, y_pred) + + # Plot the data + axes[idx].scatter(x, y, alpha=0.5) + axes[idx].plot(x, y_pred, color='blue', label='Best Fit', linestyle='--') + + # Add y=x line + axes[idx].plot(x, x, color='black', linestyle='--', label='y = x') + + # Set titles and labels + axes[idx].set_title(f'{cancer_type} (R² = {r_squared:.2f})') + axes[idx].set_xlabel('-log10(p_val_HMF)') + axes[idx].set_ylabel('-log10(p_val_PROFILE)') + axes[idx].legend() + axes[idx].set_xlim(0, max(x.max(), y.max())) + axes[idx].set_ylim(0, max(x.max(), y.max())) + + plt.tight_layout() + plt.savefig(save_path) + plt.close() + +def plot_gwas_subplots(df, save_path='gwas_subplots.png'): + df = df[(df['relevant_cancer'] == 1) | (df['relevant_cancer'] == 2)] + # Convert p-values to -log10 scale for plotting + df['-log10_p_val_HMF'] = -np.log10(df['p_val_HMF']) + df['log2OR'] = np.log2(df['OR_HMF']) + # Extract chromosome position as integer from `germline_risk_allele` + df['chrom_position'] = df['germline_risk_allele'].apply(lambda x: int(x.split(':')[1].split('-')[0])) + + # Group by cancer_type, germline_gene, and somatic_gene + grouped = df.groupby(['cancer_type', 'germline_gene', 'somatic_gene']) + + # Calculate nominal and Bonferroni thresholds + nominal_threshold = -np.log10(0.05) + + # Set up the plot grid + num_plots = len(grouped) + fig, axes = plt.subplots(nrows=(num_plots // 3) + 1, ncols=3, figsize=(15, 5 * (num_plots // 3 + 1))) + axes = axes.flatten() + + for i, (name, group) in enumerate(grouped): + cancer_type, germline_gene, somatic_gene = name + print(cancer_type) + # Calculate Bonferroni threshold for the current group + bonferroni_threshold = -np.log10(0.05 / len(group)) + + # Plot on the i-th subplot + ax = axes[i] + ax.scatter(group['chrom_position'], group['-log10_p_val_HMF'], color='b', s=10, alpha=0.6) + #ax.scatter(group['chrom_position'], group['log2OR'], color='b', s=10, alpha=0.6) + ax.axhline(nominal_threshold, color='green', linestyle='--', label='Nominal p=0.05') + ax.axhline(bonferroni_threshold, color='red', linestyle='-', label='Bonferroni Corrected') + + # Set plot titles and labels + title = "" + if cancer_type == "Pancancer": + title = f"{germline_gene} Noncoding and {somatic_gene} 
Coding Convergence: Pancancer" + else: + title = f"{germline_gene} Noncoding and {somatic_gene} Coding Convergence: {cancer_type}" + ax.set_title(title, fontsize=10) + ax.set_xlabel("Chromosome Position") + ax.set_ylabel("-log10(p)") + ax.legend() + + # Remove any extra subplots in the grid + for j in range(i + 1, len(axes)): + fig.delaxes(axes[j]) + + # Adjust layout and save the figure + plt.tight_layout() + plt.savefig(save_path, dpi=300) + plt.show() + print(f"Figure saved to {save_path}") \ No newline at end of file diff --git a/docker/gsc_pytools/gsc_util.py b/docker/gsc_pytools/gsc_util.py new file mode 100644 index 0000000..7f777db --- /dev/null +++ b/docker/gsc_pytools/gsc_util.py @@ -0,0 +1,567 @@ +import pandas as pd +import statsmodels.api as sm +from scipy.stats import fisher_exact +import numpy as np +from firthlogist import FirthLogisticRegression +import scipy.stats as stats + + +# Assuming FirthLogisticRegression is already imported or defined elsewhere +# You may need to implement this if it's not available. + +def logistic_regression_with_fallback(df, cancer_type, germline_event, somatic_gene, covariates=['male', 'pca_1', 'pca_2', 'pca_3', 'pca_4']): + print(f"Cancer Type: {cancer_type}\n Germline Event: {germline_event} \n Somatic Gene: {somatic_gene}") + + # Filter based on cancer_type if it's not Pancancer + if cancer_type != "Pancancer": + df = df[df['cancer_type'] == cancer_type] + + # Check if the germline_event and somatic_gene are in the dataframe columns + if germline_event not in df.columns or somatic_gene not in df.columns: + print(f"Combination {germline_event} - {somatic_gene} not found in DataFrame") + return + + # If either column has no positive cases, skip + if df[germline_event].sum() == 0 or df[somatic_gene].sum() == 0: + return + + # Drop rows with NaN values in the columns of interest + df = df.dropna(subset=[germline_event, somatic_gene] + covariates) + + + + + # Try regular logistic regression + try: + # Prepare the predictor (X) and response (y) variables + X = df[[germline_event] + covariates] + y = df[somatic_gene] + + # Normalize somatic_gene values: treat values > 1 as 1 + y = y.apply(lambda x: 1 if x > 1 else x) + + # Add a constant to the predictors for logistic regression + X = sm.add_constant(X) + + model = sm.Logit(y, X) + result = model.fit(disp=False) + + # If the model converged, extract p-value, odds ratio, and confidence intervals + if result.mle_retvals['converged']: + print("Regular logistic regression converged successfully!") + + # Extract p-value and odds ratio for the germline_gene predictor + p_value = result.pvalues[germline_event] # Assuming germline_event is the first predictor after the constant + odds_ratio = np.exp(result.params[germline_event]) + + # Calculate the 95% confidence interval for the odds ratio + coef = result.params[germline_event] + std_err = result.bse[germline_event] + + # The confidence interval for the coefficient + conf_int_coef = [coef - 1.96 * std_err, coef + 1.96 * std_err] + + # Convert the confidence interval from log odds to odds ratio + conf_int_or = np.exp(conf_int_coef) + + # Return the results: odds ratio, p-value, and confidence intervals + return odds_ratio, p_value, conf_int_or[0], conf_int_or[1] + + else: + raise ValueError("Regular logistic regression failed to converge.") + + # Fallback to Firth logistic regression if regular logistic regression fails + except Exception as e: + print(f"Regular logistic regression failed: {e}") + print("Falling back to Firth logistic regression...") + + 
try: + # Prepare the predictor (X) and response (y) variables + X = df[[germline_event] + covariates] + y = df[somatic_gene] + + # Normalize somatic_gene values: treat values > 1 as 1 + y = y.apply(lambda x: 1 if x > 1 else x) + + # Fit Firth logistic regression (Assuming FirthLogisticRegression is defined) + model = FirthLogisticRegression() + model.fit(X, y) + + # Extract p-value and odds ratio for the germline_gene predictor + p_value = model.pvals_[0] # Assuming the predictor is the second column after the constant + odds_ratio = np.exp(model.coef_[0]) + + # Calculate confidence intervals + coef = model.coef_[0] + std_err = model.bse_[0] + conf_int_coef = [coef - 1.96 * std_err, coef + 1.96 * std_err] + conf_int_or = np.exp(conf_int_coef) + + return odds_ratio, p_value, conf_int_or[0], conf_int_or[1] + + except Exception as e: + print(f"Firth logistic regression failed: {e}") + return + + +def firth_logistic_regression(df, cancer_type, germline_event, somatic_gene,covariates=['male','pca_1','pca_2','pca_3','pca_4']): + print(f"Cancer Type: {cancer_type}\n Germline Event: {germline_event} \n Somatic Gene: {somatic_gene}") + if cancer_type != "Pancancer": + df = df[df['cancer_type'] == cancer_type] + + if germline_event not in df.columns or somatic_gene not in df.columns: + print(f"Combination {germline_event} - {somatic_gene} not found in DataFrame") + return + if df[germline_event].sum() == 0 or df[somatic_gene].sum() == 0: + return + + df= df.dropna(subset=[germline_event,somatic_gene] + covariates) + + try: + # Prepare the predictor (X) and response (y) variables + X = df[[germline_event] + covariates] + y = df[somatic_gene] + + # Normalize somatic_gene values: treat values > 1 as 1 + y = y.apply(lambda x: 1 if x > 1 else x) + + # Fit Logistic Regression Model + model = FirthLogisticRegression() + model.fit(X,y) + + # Extract p-value and odds ratio for the germline_gene predictor + p_value = model.pvals_[0] + odds_ratio = np.exp(model.coef_[0]) + + # Calculate the 95% confidence interval for the odds ratio + coef = model.coef_[0] + std_err = model.bse_[0] + + # The confidence interval for the coefficient + conf_int_coef = [coef - 1.96 * std_err, coef + 1.96 * std_err] + + # Convert the confidence interval from log odds to odds ratio + conf_int_or = np.exp(conf_int_coef) + + # Append results to the results array, including confidence intervals + return odds_ratio, p_value, conf_int_or[0], conf_int_or[1] + + except Exception as e: + print(f"Error processing combination {germline_event} - {somatic_gene}: {e}") + + +def find_filtered_allele_frequency(df, cancer_type, germline_event, somatic_gene,covariates=['male','pca_1','pca_2','pca_3','pca_4']): + """ + Calculate the allele frequency for a specified column in a DataFrame. + + Args: + df (pd.DataFrame): The input DataFrame. + column_name (str): The column name for which to calculate allele frequency. + + Returns: + float: The calculated allele frequency. 
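Review note: the Wald confidence-interval arithmetic above is duplicated across the regular-logistic and Firth branches. A minimal sketch of the shared step, assuming a coefficient on the log-odds scale and its standard error (`odds_ratio_ci` is a hypothetical helper, not in this PR):

```python
import numpy as np

def odds_ratio_ci(coef, std_err, z=1.96):
    """Exponentiate a log-odds coefficient and its 95% Wald interval to the OR scale."""
    return np.exp(coef), np.exp(coef - z * std_err), np.exp(coef + z * std_err)

# e.g. coef=0.40, SE=0.15 -> OR ~1.49 with 95% CI ~(1.11, 2.00)
```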
+ """ + if cancer_type != "Pancancer": + df = df[df['cancer_type'] == cancer_type] + + if germline_event not in df.columns or somatic_gene not in df.columns: + print(f"Combination {germline_event} - {somatic_gene} not found in DataFrame") + return + + df= df.dropna(subset=[germline_event,somatic_gene] + covariates) + + # Get the column values, excluding NaNs + column_values = df[germline_event].dropna() + + # Calculate the allele frequency + allele_frequency = column_values.sum() / (len(column_values) * 2) + + return allele_frequency,column_values.sum(),(len(column_values) * 2) + +# Get a allele frequency for the cancer_type at large +def find_allele_frequency(df, cancer_type, germline_event): + # Filter to Cancer type of interest + if cancer_type != "Pancancer": + df = df[df['cancer_type'] == cancer_type] + + # Make sure the snp is present in the df + if germline_event not in df.columns: + print(f"Combination {germline_event} not found in DataFrame") + return + + df= df.dropna(subset=[germline_event]) + + # Get the column values, excluding NaNs + column_values = df[germline_event].dropna() + + # Calculate the allele frequency + allele_frequency = column_values.sum() / (len(column_values) * 2) + + return allele_frequency,column_values.sum(),(len(column_values) * 2) + +# Get a mutation frequency for specific logistic regression (verify that all variables are there first) +def find_filtered_mutation_frequency(df, cancer_type, germline_event, somatic_gene, covariates=['male','pca_1','pca_2','pca_3','pca_4']): + # Filter to Cancer Type of Interest + if cancer_type != "Pancancer": + df = df[df['cancer_type'] == cancer_type] + + # Verify that we have the data we want + if germline_event not in df.columns or somatic_gene not in df.columns: + print(f"Combination {germline_event} - {somatic_gene} not found in DataFrame") + return + + df= df.dropna(subset=[germline_event,somatic_gene] + covariates) + + # Get the column values, excluding NaNs + column_values = df[somatic_gene].dropna() + + # Transform the column values: non-zero values become 1, zero values stay 0 + column_values = column_values.apply(lambda x: 1 if x != 0 else 0) + + # Calculate the allele frequency + mutation_frequency = column_values.sum() / len(column_values) + + return mutation_frequency,column_values.sum(),len(column_values) + +# Get a allele frequency for the cancer_type at large +def find_mutation_frequency(df, cancer_type, somatic_gene): + # Filter to Cancer Type of Interest + if cancer_type != "Pancancer": + df = df[df['cancer_type'] == cancer_type] + + # Verify that we have the data we want + if somatic_gene not in df.columns: + print(f"Combination {somatic_gene} not found in DataFrame") + return + + df= df.dropna(subset=[somatic_gene]) + + # Get the column values, excluding NaNs + column_values = df[somatic_gene].dropna() + + # Transform the column values: non-zero values become 1, zero values stay 0 + column_values = column_values.apply(lambda x: 1 if x != 0 else 0) + + # Calculate the allele frequency + mutation_frequency = column_values.sum() / len(column_values) + + return mutation_frequency,column_values.sum(),len(column_values) + +def merge_and_analyze_noncoding_coding(tsv1_path, tsv2_path, output_path='merged_with_combined_OR_and_p_values.tsv'): + """ + This function merges two TSV files, performs inverse variance analysis on odds ratios (ORs), + and combines p-values using Stouffer's Z-score method. + + Args: + tsv1_path (str): File path to the HMF TSV file. + tsv2_path (str): File path to the PROFILE TSV file. 
+        output_path (str): Path to save the output TSV file (default: 'merged_with_combined_OR_and_p_values.tsv').
+
+    Returns:
+        merged_df (DataFrame): The resulting DataFrame with combined OR and p-values.
+    """
+
+    # Step 1: Read the data into pandas DataFrames
+    df1 = pd.read_csv(tsv1_path, sep='\t')
+    df2 = pd.read_csv(tsv2_path, sep='\t')
+
+    # Step 2: Replace 'Renal' with 'Kidney' in the 'cancer_type' column in both DataFrames
+    df1['cancer_type'] = df1['cancer_type'].replace('Renal', 'Kidney')
+    df2['cancer_type'] = df2['cancer_type'].replace('Renal', 'Kidney')
+
+    # Step 3: Merge the DataFrames on the relevant columns, including 'criteria'
+    merged_df = pd.merge(df1, df2, how='outer',
+                         on=['criteria', 'cancer_type', 'germline_risk_allele', 'germline_gene', 'germline_context', 'somatic_gene', 'somatic_context', 'relevant_cancer'],
+                         suffixes=('_HMF', '_PROFILE'))
+
+    # Step 4: Calculate the variance of each log-OR from its 95% confidence interval (CI)
+    merged_df['variance_HMF'] = ((np.log(merged_df['ci_OR_high_HMF']) - np.log(merged_df['ci_OR_low_HMF'])) / (2 * 1.96)) ** 2
+    merged_df['variance_PROFILE'] = ((np.log(merged_df['ci_OR_high_PROFILE']) - np.log(merged_df['ci_OR_low_PROFILE'])) / (2 * 1.96)) ** 2
+
+    # Step 5: Perform inverse-variance weighting of the log-ORs
+    merged_df['log_OR_HMF'] = np.log(merged_df['OR_HMF'])
+    merged_df['log_OR_PROFILE'] = np.log(merged_df['OR_PROFILE'])
+
+    # Calculate combined log OR using inverse variance weighting
+    merged_df['log_OR_combined'] = (
+        (merged_df['log_OR_HMF'] / merged_df['variance_HMF'] + merged_df['log_OR_PROFILE'] / merged_df['variance_PROFILE']) /
+        (1 / merged_df['variance_HMF'] + 1 / merged_df['variance_PROFILE'])
+    )
+
+    # Convert combined log OR back to OR
+    merged_df['OR_final'] = np.exp(merged_df['log_OR_combined'])
+
+    # Step 6: Calculate the variance of the combined log-OR
+    merged_df['variance_OR_combined'] = 1 / (1 / merged_df['variance_HMF'] + 1 / merged_df['variance_PROFILE'])
+
+    # Step 7: Calculate the Z-score as log(OR_combined) divided by its standard error
+    merged_df['z_combined'] = np.log(merged_df['OR_final']) / np.sqrt(merged_df['variance_OR_combined'])
+
+    # Step 8: Calculate the two-sided p-value from the Z-score
+    merged_df['p_val_final'] = 2 * stats.norm.sf(np.abs(merged_df['z_combined']))
+
+    # Step 9: Save the merged DataFrame with combined OR and p-values
+    merged_df.to_csv(output_path, sep='\t', index=False)
+
+    # Return the merged DataFrame
+    return merged_df
+
+def filter_by_pvalues(input_tsv, output_tsv="filtered.tsv"):
+    """
+    Filter to rows where (p_val_HMF < 0.05 or p_val_PROFILE < 0.05)
+    and p_val_combined < 0.05, and write the filtered DataFrame to a new file.
+
+    Args:
+        input_tsv (str): Path to the input TSV file.
+        output_tsv (str): Path to save the filtered output TSV file.
+
+    Returns:
+        filtered_df (DataFrame): The filtered DataFrame.
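Review note: the fixed-effect inverse-variance weighting above can be sanity-checked on a single pair of cohort estimates; note the combined p-value is derived from the IVW z-score (a Wald test) rather than a separate Stouffer combination. A standalone sketch with invented ORs and CIs:

```python
import numpy as np
from scipy import stats

def ivw_combine(or1, lo1, hi1, or2, lo2, hi2):
    """Fixed-effect inverse-variance meta-analysis of two odds ratios."""
    var1 = ((np.log(hi1) - np.log(lo1)) / (2 * 1.96)) ** 2  # var of log-OR from its 95% CI
    var2 = ((np.log(hi2) - np.log(lo2)) / (2 * 1.96)) ** 2
    w1, w2 = 1 / var1, 1 / var2
    log_or = (w1 * np.log(or1) + w2 * np.log(or2)) / (w1 + w2)
    se = np.sqrt(1 / (w1 + w2))
    p = 2 * stats.norm.sf(abs(log_or / se))
    return np.exp(log_or), p

print(ivw_combine(1.8, 1.1, 2.9, 1.4, 0.9, 2.2))  # -> combined OR and two-sided p
```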
+ """ + + # Read the merged data + df = pd.read_csv(input_tsv, sep='\t') + + # Filter based on the conditions + filtered_df = df[ + ((df['p_val_HMF'] < 0.05) | (df['p_val_PROFILE'] < 0.05)) & + (df['p_val_combined'] < 0.05) + ] + + # Save the filtered DataFrame to a new TSV file + filtered_df.to_csv(output_tsv, sep='\t', index=False) + + # Return the filtered DataFrame + return filtered_df + +def analyze_data(convergence_table_path,genotype_table_path,germline_context,somatic_context,cohort,output_file): + # Step 1: Read in Dataframe containing convergences and Dataframe containing patient level data + convergences_df = pd.read_csv(convergence_table_path,sep='\t') + convergences_df = convergences_df[(convergences_df['germline_context'] == germline_context) & + (convergences_df['somatic_context'] == somatic_context)] + patient_df = pd.read_csv(genotype_table_path, sep='\t') + + # Initialize an empty list to store results + results = [] + + # Step 2: Initialize covariates + covariates = None + if cohort == "HMF": + covariates=['male','pca_1','pca_2','pca_3','age','TMB','tumorPurity','primary'] + elif cohort == "PROFILE": + covariates=['male','pca_1','pca_2','pca_3','pca_4','age','TMB','tumorPurity','primary','late_stage'] + + # Alternate slate of covariates for hormone sensitive cancers and pancancer + covariates_hsc = [cov for cov in covariates if cov != 'male'] + covariates_pancancer = covariates + ['Breast_Diagnosis','Colorectal_Diagnosis','Kidney_Diagnosis','Lung_Diagnosis','Prostate_Diagnosis'] + + # Step 3: Prepare Suffixes + germline_suffix = None + if germline_context == "coding": + germline_suffix = "-g" + else: + germline_suffix = "" + + somatic_suffix = None + if somatic_context == "coding": + somatic_suffix = "-s" + else: + somatic_suffix = "-ncs" + + # Step 4: Loop through each row of the DataFrame and apply logistic regression with firth fallback + for index, row in convergences_df.iterrows(): + germline_event = None + + # Extract necessary columns + if germline_context == "coding": + gwas_cancer_type = row[0] + germline_gene = row[1] + germline_context = row[2] + somatic_gene = row[3] + somatic_context = row[4] + criteria = row[5] + germline_event = germline_gene + else: + gwas_cancer_type = row[0] + germline_snp = row[1] + germline_gene = row[2] + germline_context = row[3] + somatic_gene = row[4] + somatic_context = row[5] + criteria = row[6] + germline_event = germline_snp + + for cancer_type in ["Breast","Colorectal","Prostate","Lung","Kidney","Pancancer"]: + relevant_cancer = 0 + if gwas_cancer_type.lower() == cancer_type.lower(): + relevant_cancer = 1 + elif cancer_type == "Pancancer": + relevant_cancer = 2 + + # Apply the functions to get metrics + if cancer_type == "Breast" or cancer_type == "Prostate": + regression_output = logistic_regression_with_fallback(patient_df, cancer_type, germline_event + germline_suffix, somatic_gene + somatic_suffix,covariates=covariates_hsc) + filtered_germline_output = find_filtered_allele_frequency(patient_df, cancer_type, germline_event + germline_suffix, somatic_gene + somatic_suffix, covariates=covariates_hsc) + filtered_somatic_output = find_filtered_mutation_frequency(patient_df, cancer_type, germline_event + germline_suffix, somatic_gene + somatic_suffix, covariates=covariates_hsc) + elif cancer_type == "Pancancer": + regression_output = logistic_regression_with_fallback(patient_df, cancer_type, germline_event + germline_suffix, somatic_gene + somatic_suffix,covariates=covariates_pancancer) + filtered_germline_output = 
find_filtered_allele_frequency(patient_df, cancer_type, germline_event + germline_suffix, somatic_gene + somatic_suffix, covariates=covariates_pancancer) + filtered_somatic_output = find_filtered_mutation_frequency(patient_df, cancer_type, germline_event + germline_suffix, somatic_gene + somatic_suffix, covariates=covariates_pancancer) + else: + regression_output = logistic_regression_with_fallback(patient_df, cancer_type, germline_event + germline_suffix, somatic_gene + somatic_suffix,covariates=covariates) + filtered_germline_output = find_filtered_allele_frequency(patient_df, cancer_type, germline_event + germline_suffix, somatic_gene + somatic_suffix, covariates=covariates) + filtered_somatic_output = find_filtered_mutation_frequency(patient_df, cancer_type, germline_event + germline_suffix, somatic_gene + somatic_suffix, covariates=covariates) + + germline_output = find_mutation_frequency(patient_df, cancer_type, germline_event + germline_suffix) + somatic_output = find_mutation_frequency(patient_df, cancer_type, somatic_gene + somatic_suffix) + + # Assign values using ternary operators for cleaner code + OR, p_val, ci_OR_low, ci_OR_high = regression_output if regression_output else (pd.NA, pd.NA, pd.NA, pd.NA) + + filtered_germline_plp_frequency, filtered_germline_plp_count, filtered_germline_sample_size = ( + filtered_germline_output if filtered_germline_output else (pd.NA, pd.NA, pd.NA) + ) + + germline_plp_frequency, germline_plp_count, germline_sample_size = ( + germline_output if germline_output else (pd.NA, pd.NA, pd.NA) + ) + + somatic_plp_frequency, somatic_plp_count, somatic_sample_size = ( + somatic_output if somatic_output else (pd.NA, pd.NA, pd.NA) + ) + + filtered_somatic_plp_frequency, filtered_somatic_plp_count, filtered_somatic_sample_size = ( + filtered_somatic_output if filtered_somatic_output else (pd.NA, pd.NA, pd.NA) + ) + + if germline_context == "coding": + # Append the results to the list + results.append([ + cancer_type, germline_gene, germline_context, + somatic_gene, somatic_context, criteria, + germline_plp_frequency, germline_plp_count, germline_sample_size, + filtered_germline_plp_frequency, filtered_germline_plp_count, filtered_germline_sample_size, + somatic_plp_frequency, somatic_plp_count, somatic_sample_size, + filtered_somatic_plp_frequency, filtered_somatic_plp_count, filtered_somatic_sample_size, + OR, p_val, ci_OR_low, ci_OR_high, relevant_cancer + ]) + else: + # Append the results to the list + results.append([ + cancer_type, germline_snp, germline_gene, germline_context, + somatic_gene, somatic_context, criteria, + germline_plp_frequency, germline_plp_count, germline_sample_size, + filtered_germline_plp_frequency, filtered_germline_plp_count, filtered_germline_sample_size, + somatic_plp_frequency, somatic_plp_count, somatic_sample_size, + filtered_somatic_plp_frequency, filtered_somatic_plp_count, filtered_somatic_sample_size, + OR, p_val, ci_OR_low, ci_OR_high, relevant_cancer + ]) + + if germline_context == "noncoding" and somatic_context == "coding": + cohort_suffix = cohort + else: + cohort_suffix = "final" + + if germline_context == "coding": + # Convert the results to a DataFrame + output_df = pd.DataFrame(results, columns=[ + 'cancer_type', 'germline_gene', 'germline_context', + 'somatic_gene', 'somatic_context', 'criteria', + f'germline_plp_frequency_{cohort}', f'germline_plp_count_{cohort}', f'germline_sample_size_{cohort}', + f'filtered_germline_plp_frequency_{cohort}', f'filtered_germline_plp_count_{cohort}', 
f'filtered_germline_sample_size_{cohort}', + f'somatic_plp_frequency_{cohort}', f'somatic_plp_count_{cohort}', f'somatic_sample_size_{cohort}', + f'filtered_somatic_plp_frequency_{cohort}', f'filtered_somatic_plp_count_{cohort}', f'filtered_somatic_sample_size_{cohort}', + f'OR_{cohort_suffix}', f'p_val_{cohort_suffix}', f'ci_OR_low_{cohort_suffix}', f'ci_OR_high_{cohort_suffix}', 'relevant_cancer' + ]) + else: + # Convert the results to a DataFrame + output_df = pd.DataFrame(results, columns=[ + 'cancer_type', 'germline_risk_allele','germline_gene', 'germline_context', + 'somatic_gene', 'somatic_context', 'criteria', + f'germline_plp_frequency_{cohort}', f'germline_plp_count_{cohort}', f'germline_sample_size_{cohort}', + f'filtered_germline_plp_frequency_{cohort}', f'filtered_germline_plp_count_{cohort}', f'filtered_germline_sample_size_{cohort}', + f'somatic_plp_frequency_{cohort}', f'somatic_plp_count_{cohort}', f'somatic_sample_size_{cohort}', + f'filtered_somatic_plp_frequency_{cohort}', f'filtered_somatic_plp_count_{cohort}', f'filtered_somatic_sample_size_{cohort}', + f'OR_{cohort_suffix}', f'p_val_{cohort_suffix}', f'ci_OR_low_{cohort_suffix}', f'ci_OR_high_{cohort_suffix}', 'relevant_cancer' + ]) + + # Eliminate duplicate rows based on all columns + output_df = output_df.drop_duplicates() + + # Write the DataFrame to a TSV file + output_df.to_csv(output_file, sep='\t', index=False) + +def process_and_merge_profile_tables(variant_table_pathway, gene_table_pathway): + """ + Process variant and gene tables by filtering columns based on suffixes, merging them, + and adjusting '-s' columns based on '-g' column values. + + Parameters: + - variant_table: DataFrame containing variant data. + - gene_table: DataFrame containing gene data. + + Returns: + - DataFrame: The processed and merged table. 
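Review note: the '-s' adjustment this docstring describes — zeroing a somatic hit when the same patient already carries the germline event — is easiest to see on a toy frame. A minimal sketch (the gene name and values are invented; column suffixes follow the repo's '-g'/'-s' convention):

```python
import pandas as pd

df = pd.DataFrame({
    'patient_id': [1, 2, 3],
    'BRCA2-g':    [1, 0, 1],  # germline carrier flag
    'BRCA2-s':    [2, 1, 0],  # somatic hit count
})
# Zero somatic counts in germline carriers, as the merge step below does:
df.loc[(df['BRCA2-g'] == 1) & (df['BRCA2-s'] > 0), 'BRCA2-s'] = 0
print(df)  # patient 1's BRCA2-s becomes 0; patients 2 and 3 are unchanged
```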
+ """ + + variant_table = pd.read_csv(variant_table_pathway,sep='\t') + gene_table = pd.read_csv(gene_table_pathway,sep='\t') + + # Step 1: Process column names + # Remove '-s' and '-g' suffixes for comparison + variant_columns = [col for col in variant_table.columns if col != 'patient_id' and (col.endswith('-s') or col.endswith('-g'))] + gene_columns = [col for col in gene_table.columns if col != 'patient_id' and (col.endswith('-s') or col.endswith('-g'))] + + variant_base_columns = [col.rsplit('-', 1)[0] for col in variant_columns] + gene_base_columns = [col.rsplit('-', 1)[0] for col in gene_columns] + + # Step 2: Find common base columns + common_columns = list(set(variant_base_columns).intersection(set(gene_base_columns))) + + # Step 3: Create filtered_gene_columns and filter gene_table + filtered_gene_columns = [col + '-g' for col in common_columns] + ['patient_id'] + filtered_gene_table = gene_table[filtered_gene_columns] + + # Step 4: Merge filtered_gene_table with variant_table + merged_table = variant_table.merge(filtered_gene_table, on='patient_id', how='inner') + + # Step 5: Adjust '-s' columns based on '-g' columns + conversion_stats = [] # To store stats for each column + for col in common_columns: + s_col = col + '-s' + g_col = col + '-g' + if s_col in merged_table.columns and g_col in merged_table.columns: + # Get initial count of values > 0 in the s_col + initial_s_col_count = (merged_table[s_col] > 0).sum() + + # Perform the conversion + merged_table.loc[(merged_table[g_col] == 1) & (merged_table[s_col] > 0), s_col] = 0 + + # Get the count of values converted to 0 + converted_count = initial_s_col_count - (merged_table[s_col] > 0).sum() + + # Calculate percentage of converted values + if initial_s_col_count > 0: + conversion_percentage = (converted_count / initial_s_col_count) * 100 + else: + conversion_percentage = 0.0 + + # Append stats to the list + conversion_stats.append({ + "s_col": s_col, + "g_col": g_col, + "converted_count": converted_count, + "initial_s_col_count": initial_s_col_count, + "conversion_percentage": conversion_percentage + }) + + # Print stats for each column + for stats in conversion_stats: + print(f"Column: {stats['s_col']} | " + f"Converted: {stats['converted_count']} out of {stats['initial_s_col_count']} " + f"({stats['conversion_percentage']:.2f}%)") + + + merged_table.to_csv("profile_table.tsv",sep='\t',index=False) + return + diff --git a/docker/hail_rf/Dockerfile b/docker/hail_rf/Dockerfile new file mode 100644 index 0000000..9c3b7cf --- /dev/null +++ b/docker/hail_rf/Dockerfile @@ -0,0 +1,15 @@ +# The Germline Genomics of Cancer (G2C) +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 + +# Simple Dockerfile for vanallenlab/hail_rf + + +FROM hailgenetics/hail:0.2.127.post1-py3.11 +MAINTAINER "Noah Fields " + + +COPY apply_rf_model.py /opt/ + +# Launch bash at runtime +CMD ["/bin/bash"] diff --git a/docker/hail_rf/apply_rf_model.py b/docker/hail_rf/apply_rf_model.py new file mode 100644 index 0000000..8f41cf3 --- /dev/null +++ b/docker/hail_rf/apply_rf_model.py @@ -0,0 +1,640 @@ +# noqa: D100 +#Adopted From Broad Institute: https://broadinstitute.github.io/gnomad_methods/_modules/index.html + +import json +import os +import logging +import pprint +from pprint import pformat +from typing import Dict, List, Optional, Tuple + +import hail as hl +import pandas as pd +import pyspark.sql +from pyspark.ml import Pipeline +from pyspark.ml.classification import RandomForestClassifier +from pyspark.ml.feature import 
IndexToString, StringIndexer, VectorAssembler +from pyspark.ml.functions import vector_to_array +from pyspark.sql import SparkSession +from pyspark.sql.functions import col # pylint: disable=no-name-in-module + +#from gnomad.utils.file_utils import file_exists + +logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +def run_rf_test( + mt: hl.MatrixTable, output: str = "/tmp" +) -> Tuple[pyspark.ml.PipelineModel, hl.Table]: + """ + Run a dummy test RF on a given MT. + + 1. Creates row annotations and labels to run model on + 2. Trains a RF pipeline model (including median imputation of missing values in created annotations) + 3. Saves the RF pipeline model + 4. Applies the model to the MT and prints features importance + + :param mt: Input MT + :param output: Output files prefix to save the RF model + :return: RF model and MatrixTable after applying RF model + """ + mt = mt.annotate_rows( + feature1=hl.rand_bool(0.1), + feature2=hl.rand_norm(0.0, 1.0), + feature3=hl.or_missing(hl.rand_bool(0.5), hl.rand_norm(0.0, 1.0)), + ) + + mt = mt.annotate_rows( + label=hl.if_else(mt["feature1"] & (mt["feature2"] > 0), "TP", "FP") + ) + ht = mt.rows() + + def f3stats(ht): + return ht.aggregate( + hl.struct( + n=hl.agg.count_where(hl.is_defined(ht["feature3"])), + med=hl.median(hl.agg.collect(ht["feature3"])), + ) + ) + + f3_before_imputation = f3stats(ht) + logger.info("Feature3 defined values before imputation: %d", f3_before_imputation.n) + logger.info("Feature3 median: %f", f3_before_imputation.med) + + features_to_impute = ["feature3"] + quantiles = get_columns_quantiles(ht, features_to_impute, [0.5]) + quantiles = {k: v[0] for k, v in quantiles.items()} + + logger.info("Features median:\n%s", [f"{k}: {v}\n" for k, v in quantiles.items()]) + ht = ht.annotate(**{f: hl.or_else(ht[f], quantiles[f]) for f in features_to_impute}) + ht = ht.annotate_globals(medians=quantiles) + + f3_after_imputation = f3stats(ht) + logger.info("Feature3 defined values after imputation: %d", f3_after_imputation.n) + logger.info("Feature3 median: %f", f3_after_imputation.med) + + ht = ht.select("label", "feature1", "feature2", "feature3") + + label = "label" + features = ["feature1", "feature2", "feature3"] + + rf_model = train_rf(ht, features, label) + save_model(rf_model, out_path=output + "/rf.model", overwrite=True) + rf_model = load_model(output + "/rf.model") + + return rf_model, apply_rf_model(ht, rf_model, features, label) + + + +def check_ht_fields_for_spark(ht: hl.Table, fields: List[str]) -> None: + """ + Check specified fields of a hail table for Spark DataFrame conversion (type and name). + + :param ht: input Table + :param fields: Fields to test + :return: None + """ + allowed_types = [ + hl.tfloat, + hl.tfloat32, + hl.tfloat64, + hl.tint, + hl.tint32, + hl.tint64, + hl.tstr, + hl.tbool, + ] + + bad_field_names = [c for c in fields if "." in c] + + bad_types = [ + c[0] + for c in ht.key_by().select(*fields).row.items() + if c[1].dtype not in allowed_types + ] + + if bad_field_names or bad_types: + raise ValueError( + "Only basic type fields can be converted from Hail to Spark. 
In addition," + " `.` are not allowed in field names in Spark.\n" + f"Offending fields (non basic type): {bad_types}" + f"Offending fields (bad field name): {', '.join(bad_field_names)}\n" + ) + + return + + + +def get_columns_quantiles( + ht: hl.Table, fields: List[str], quantiles: List[float], relative_error: int = 0.001 +) -> Dict[str, List[float]]: + """ + Compute approximate quantiles of specified numeric fields from non-missing values. Non-numeric fields are ignored. + + This function returns a Dict of column name -> list of quantiles in the same order specified. + If a column only has NAs, None is returned. + + :param ht: input HT + :param fields: list of features to impute. If none given, all numerical features with missing data are imputed + :param quantiles: list of quantiles to return (e.g. [0.5] would return the median) + :param relative_error: The relative error on the quantile approximation + :return: Dict of column -> quantiles + """ + check_ht_fields_for_spark(ht, fields) + + df = ht.key_by().select(*fields).to_spark() + + res = {} + for f in fields: + logger.info("Computing median for column: %s", f) + col_no_na = df.select(f).dropna() + if col_no_na.first() is not None: + res[f] = col_no_na.approxQuantile(str(f), quantiles, relative_error) + else: + res[f] = None + + return res + + + +def median_impute_features( + ht: hl.Table, strata: Optional[Dict[str, hl.expr.Expression]] = None +) -> hl.Table: + """ + Numerical features in the Table are median-imputed by Hail's `approx_median`. + + If a `strata` dict is given, imputation is done based on the median of of each stratification. + + The annotations that are added to the Table are + - feature_imputed - A row annotation indicating if each numerical feature was imputed or not. + - features_median - A global annotation containing the median of the numerical features. If `strata` is given, + this struct will also be broken down by the given strata. + - variants_by_strata - An additional global annotation with the variant counts by strata that will only be + added if imputing by a given `strata`. + + :param ht: Table containing all samples and features for median imputation. + :param strata: Whether to impute features median by specific strata (default False). + :return: Feature Table imputed using approximate median values. 
+ """ + logger.info("Computing feature medians for imputation of missing numeric values") + numerical_features = [ + k for k, v in ht.row.dtype.items() if v == hl.tint or v == hl.tfloat + ] + + median_agg_expr = hl.struct( + **{feature: hl.agg.approx_median(ht[feature]) for feature in numerical_features} + ) + + if strata: + ht = ht.annotate_globals( + feature_medians=ht.aggregate( + hl.agg.group_by(hl.tuple([ht[x] for x in strata]), median_agg_expr), + _localize=False, + ), + variants_by_strata=ht.aggregate( + hl.agg.counter(hl.tuple([ht[x] for x in strata])), _localize=False + ), + ) + feature_median_expr = ht.feature_medians[hl.tuple([ht[x] for x in strata])] + logger.info( + "Variant count by strata:\n%s", + "\n".join( + [ + "{}: {}".format(k, v) + for k, v in hl.eval(ht.variants_by_strata).items() + ] + ), + ) + + else: + ht = ht.annotate_globals( + feature_medians=ht.aggregate(median_agg_expr, _localize=False) + ) + feature_median_expr = ht.feature_medians + + ht = ht.annotate( + **{f: hl.or_else(ht[f], feature_median_expr[f]) for f in numerical_features}, + feature_imputed=hl.struct( + **{f: hl.is_missing(ht[f]) for f in numerical_features} + ), + ) + + return ht + + + +def ht_to_rf_df( + ht: hl.Table, features: List[str], label: str = None, index: str = None +) -> pyspark.sql.DataFrame: + """ + Create a Spark dataframe ready for RF from a HT. + + Rows with any missing features are dropped. + Missing labels are replaced with 'NA' + + .. note:: + + Only basic types are supported! + + :param ht: Input HT + :param features: Features that will be used for RF + :param label: Optional label column that will be predicted by RF + :param index: Optional index column to keep (E.g. for joining results back at later stage) + :return: Spark Dataframe + """ + cols_to_keep = features[:] + + if label: + cols_to_keep.append(label) + if index: + cols_to_keep.append(index) + + df = ht.key_by().select(*cols_to_keep).to_spark() + df = df.dropna(subset=features) + + if label: + df = df.fillna("NA", subset=label) + + return df + + + +def get_features_importance( + rf_pipeline: pyspark.ml.PipelineModel, rf_index: int = -2, assembler_index: int = -3 +) -> Dict[str, float]: + """ + Extract the features importance from a Pipeline model containing a RandomForestClassifier stage. + + :param rf_pipeline: Input pipeline + :param rf_index: index of the RandomForestClassifier stage + :param assembler_index: index of the VectorAssembler stage + :return: feature importance for each feature in the RF model + """ + feature_names = [ + x[: -len("_indexed")] if x.endswith("_indexed") else x + for x in rf_pipeline.stages[assembler_index].getInputCols() + ] + + return dict(zip(feature_names, rf_pipeline.stages[rf_index].featureImportances)) + + + +def get_labels(rf_pipeline: pyspark.ml.PipelineModel) -> List[str]: + """ + Return the labels from the StringIndexer stage at index 0 from an RF pipeline model. + + :param rf_pipeline: Input pipeline + :return: labels + """ + return rf_pipeline.stages[0].labels + + + +def test_model( + ht: hl.Table, + rf_model: pyspark.ml.PipelineModel, + features: List[str], + label: str, + prediction_col_name: str = "rf_prediction", +) -> List[hl.tstruct]: + """ + A wrapper to test a model on a set of examples with known labels. 
+ + 1) Runs the model on the data + 2) Prints confusion matrix and accuracy + 3) Returns confusion matrix as a list of struct + + :param ht: Input table + :param rf_model: RF Model + :param features: Columns containing features that were used in the model + :param label: Column containing label to be predicted + :param prediction_col_name: Where to store the prediction + :return: A list containing structs with {label, prediction, n} + """ + ht = apply_rf_model( + ht.filter(hl.is_defined(ht[label])), + rf_model, + features, + label, + prediction_col_name=prediction_col_name, + ) + + test_results = ( + ht.group_by(ht[prediction_col_name], ht[label]) + .aggregate(n=hl.agg.count()) + .collect() + ) + + # Print results + df = pd.DataFrame(test_results) + df = df.pivot(index=label, columns=prediction_col_name, values="n") + logger.info("Testing results:\n%s", pprint.pformat(df)) + logger.info( + "Accuracy: %f", + sum([x.n for x in test_results if x[label] == x[prediction_col_name]]) + / sum([x.n for x in test_results]), + ) + + return test_results + + + +def apply_rf_model( + ht: hl.Table, + rf_model: pyspark.ml.PipelineModel, + features: List[str], + label: str = None, + probability_col_name: str = "rf_probability", + prediction_col_name: str = "rf_prediction", +) -> hl.Table: + """ + Apply a Random Forest (RF) pipeline model to a Table and annotate the RF probabilities and predictions. + + :param ht: Input HT + :param rf_model: Random Forest pipeline model + :param features: List of feature columns in the pipeline. !Should match the model list of features! + :param label: Optional column containing labels. !Should match the model labels! + :param probability_col_name: Name of the column that will store the RF probabilities + :param prediction_col_name: Name of the column that will store the RF predictions + :return: Table with RF columns + """ + logger.info("Applying RF model.") + + check_fields = features[:] + if label: + check_fields.append(label) + check_ht_fields_for_spark(ht, check_fields) + + index_name = "rf_idx" + while index_name in ht.row: + index_name += "_tmp" + ht = ht.add_index(name=index_name) + + ht_keys = ht.key + ht = ht.key_by(index_name) + + df = ht_to_rf_df(ht, features, label, index_name) + + rf_df = rf_model.transform(df) + prob_cols = get_labels(rf_model) + + rf_df = rf_df.withColumn("probability", vector_to_array("probability")).select( + [index_name, "predictedLabel"] + + [col("probability")[i] for i in range(len(prob_cols))] + ) + new_colnames = [index_name, "predictedLabel"] + prob_cols + rf_df = rf_df.toDF(*new_colnames) + + # Note: SparkSession is needed to write DF to disk before converting to HT; + # the resulting HT sometimes has missing and/or duplicate rows without + # the intermediate write. + spark = SparkSession.builder.getOrCreate() + rf_df.write.mode("overwrite").save("rf_probs.parquet") + rf_df = spark.read.parquet("rf_probs.parquet") + + rf_ht = hl.Table.from_spark(rf_df) + + rf_ht = rf_ht.key_by(index_name) + ht = ht.annotate( + **{ + probability_col_name: {c: rf_ht[ht[index_name]][c] for c in prob_cols}, + prediction_col_name: rf_ht[ht[index_name]]["predictedLabel"], + } + ) + + ht = ht.key_by(*ht_keys) + ht = ht.drop(index_name) + + return ht + + + +def save_model( + rf_pipeline: pyspark.ml.PipelineModel, out_path: str, overwrite: bool = False +) -> None: + """ + Save a Random Forest pipeline model. 
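+
+    Example (illustrative; the bucket path is hypothetical):
+
+        save_model(rf_model, out_path="gs://my-bucket/rf.model", overwrite=True)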
+ + :param rf_pipeline: Pipeline to save + :param out_path: Output path + :param overwrite: If set, will overwrite existing file(s) at output location + :return: Nothing + """ + logger.info("Saving model to %s", out_path) + if overwrite: + rf_pipeline.write().overwrite().save(out_path) + else: + rf_pipeline.save(out_path) + + + +def load_model(input_path: str) -> pyspark.ml.PipelineModel: + """ + Load a Random Forest pipeline model. + + :param input_path: Location of model to load + :return: Random Forest pipeline model + """ + logger.info("Loading model from %s", input_path) + return pyspark.ml.PipelineModel.load(input_path) + + + +def train_rf( + ht: hl.Table, + features: List[str], + label: str, + num_trees: int = 500, + max_depth: int = 5, +) -> pyspark.ml.PipelineModel: + """ + Train a Random Forest (RF) pipeline model. + + :param ht: Input HT + :param features: List of columns to be used as features + :param label: Column containing the label to predict + :param num_trees: Number of trees to use + :param max_depth: Maximum tree depth + :return: Random Forest pipeline model + """ + logger.info( + "Training RF model using:\nfeatures: %s\nlabels: %s\nnum_trees:" + " %d\nmax_depth: %d", + ",".join(features), + label, + num_trees, + max_depth, + ) + + check_ht_fields_for_spark(ht, features + [label]) + + df = ht_to_rf_df(ht, features, label) + + label_indexer = ( + StringIndexer(inputCol=label, outputCol=label + "_indexed") + .setHandleInvalid("keep") + .fit(df) + ) + labels = label_indexer.labels + logger.info("Found labels: %s", labels) + + string_features = [x[0] for x in df.dtypes if x[0] != label and x[1] == "string"] + if string_features: + logger.info("Indexing string features: %s", ",".join(string_features)) + string_features_indexers = [ + StringIndexer(inputCol=x, outputCol=x + "_indexed") + .setHandleInvalid("keep") + .fit(df) + for x in string_features + ] + + assembler = VectorAssembler( + inputCols=[ + x[0] + "_indexed" if x[1] == "string" else x[0] + for x in df.dtypes + if x[0] != label + ], + outputCol="features", + ) + + rf = RandomForestClassifier( + labelCol=label + "_indexed", + featuresCol="features", + maxDepth=max_depth, + numTrees=num_trees, + ) + + label_converter = IndexToString( + inputCol="prediction", outputCol="predictedLabel", labels=labels + ) + + pipeline = Pipeline( + stages=[label_indexer] + + string_features_indexers + + [assembler, rf, label_converter] + ) + + # Train model + logger.info("Training RF model") + rf_model = pipeline.fit(df) + + feature_importance = get_features_importance(rf_model) + + logger.info( + "RF features importance:\n%s", + "\n".join([f"{f}: {i}" for f, i in feature_importance.items()]), + ) + + return rf_model + + + +def get_rf_runs(rf_json_fp: str) -> Dict: + """ + Load RF run data from JSON file. + + :param rf_json_fp: File path to rf json file. + :return: Dictionary containing the content of the JSON file, or an empty dictionary if the file wasn't found. + """ + if file_exists(rf_json_fp): + with hl.hadoop_open(rf_json_fp) as f: + return json.load(f) + else: + logger.warning( + "File %s could not be found. Returning empty RF run hash dict.", rf_json_fp + ) + return {} + + + +def get_run_data( + input_args: Dict[str, bool], + test_intervals: List[str], + features_importance: Dict[str, float], + test_results: List[hl.tstruct],) -> Dict: + """ + Create a Dict containing information about the RF input arguments and feature importance. 
+ + :param Dict of bool keyed by str input_args: Dictionary of model input arguments + :param List of str test_intervals: Intervals withheld from training to be used in testing + :param Dict of float keyed by str features_importance: Feature importance returned by the RF + :param List of struct test_results: Accuracy results from applying RF model to the test intervals + :return: Dict of RF information + """ + run_data = { + "input_args": input_args, + "features_importance": features_importance, + "test_intervals": test_intervals, + } + + if test_results is not None: + tps = 0 + total = 0 + for row in test_results: + values = list(row.values()) + # Note: values[0] is the TP/FP label and values[1] is the prediction + if values[0] == values[1]: + tps += values[2] + total += values[2] + run_data["test_results"] = [dict(x) for x in test_results] + run_data["test_accuracy"] = tps / total + + return run_data + + + +def pretty_print_runs( + runs: Dict, label_col: str = "rf_label", prediction_col_name: str = "rf_prediction") -> None: + """ + Print the information for the RF runs loaded from the json file storing the RF run hashes -> info. + + :param runs: Dictionary containing JSON input loaded from RF run file + :param label_col: Name of the RF label column + :param prediction_col_name: Name of the RF prediction column + :return: Nothing -- only prints information + """ + for run_hash, run_data in runs.items(): + print(f"\n=== {run_hash} ===") + testing_results = ( + run_data.pop("test_results") if "test_results" in run_data else None + ) + # print(testing_results) + print(json.dumps(run_data, sort_keys=True, indent=4, separators=(",", ": "))) + if testing_results is not None: + # Print results + res_pd = pd.DataFrame(testing_results) + res_pd = res_pd.pivot( + index=label_col, columns=prediction_col_name, values="n" + ) + logger.info("Testing results:\n%s", pformat(res_pd)) + + +def file_exists(fname: str) -> bool: + """ + Check whether a file exists. + + Supports either local or Google cloud (gs://) paths. + If the file is a Hail file (.ht, .mt, .bm, .parquet, .he, and .vds extensions), it + checks that _SUCCESS is present. + + :param fname: File name. + :return: Whether the file exists. 
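+
+    Example (illustrative; the bucket path is hypothetical): file_exists("gs://my-bucket/table.ht")
+    returns True only if gs://my-bucket/table.ht/_SUCCESS exists.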
+ """ + fext = os.path.splitext(fname)[1] + if fext in {".ht", ".mt", ".bm", ".parquet", ".he"}: + paths = [f"{fname}/_SUCCESS"] + elif fext == ".vds": + paths = [f"{fname}/reference_data/_SUCCESS", f"{fname}/variant_data/_SUCCESS"] + else: + paths = [fname] + + if fname.startswith("gs://"): + exists_func = hl.hadoop_exists + else: + exists_func = os.path.isfile + + exists = all([exists_func(p) for p in paths]) + + return exist \ No newline at end of file diff --git a/docker/pydata_stack/.DS_Store b/docker/pydata_stack/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/docker/pydata_stack/.DS_Store differ diff --git a/docker/pydata_stack/Dockerfile b/docker/pydata_stack/Dockerfile new file mode 100644 index 0000000..a680921 --- /dev/null +++ b/docker/pydata_stack/Dockerfile @@ -0,0 +1,18 @@ +# The Germline Genomics of Cancer (G2C) +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 + +# Simple Dockerfile for python3 w/ pandas + +FROM ubuntu:latest +MAINTAINER "Noah Fields " + + +# Install python packages dependencies +RUN apt-get update && \ + apt-get install -y python3-numpy python3-pandas \ + python3-scipy python3-seaborn && \ + apt-get clean + +# Launch bash at runtime +CMD ["/bin/bash"] diff --git a/figures/.DS_Store b/figures/.DS_Store new file mode 100644 index 0000000..f57dff7 Binary files /dev/null and b/figures/.DS_Store differ diff --git a/figures/charr_figures/HQ_HET_RATE.png b/figures/charr_figures/HQ_HET_RATE.png new file mode 100644 index 0000000..d2aed28 Binary files /dev/null and b/figures/charr_figures/HQ_HET_RATE.png differ diff --git a/figures/charr_figures/INCONSISTENT_AB_HET_RATE.png b/figures/charr_figures/INCONSISTENT_AB_HET_RATE.png new file mode 100644 index 0000000..c262953 Binary files /dev/null and b/figures/charr_figures/INCONSISTENT_AB_HET_RATE.png differ diff --git a/figures/charr_figures/charr.png b/figures/charr_figures/charr.png new file mode 100644 index 0000000..30e2c67 Binary files /dev/null and b/figures/charr_figures/charr.png differ diff --git a/figures/g2c_figures/Everyone_demographics.png b/figures/g2c_figures/Everyone_demographics.png new file mode 100644 index 0000000..745c361 Binary files /dev/null and b/figures/g2c_figures/Everyone_demographics.png differ diff --git a/figures/g2c_figures/biome_demographics.png b/figures/g2c_figures/biome_demographics.png new file mode 100644 index 0000000..72aeb19 Binary files /dev/null and b/figures/g2c_figures/biome_demographics.png differ diff --git a/figures/g2c_figures/cptac_demographics.png b/figures/g2c_figures/cptac_demographics.png new file mode 100644 index 0000000..450fc73 Binary files /dev/null and b/figures/g2c_figures/cptac_demographics.png differ diff --git a/figures/g2c_figures/gtex_demographics.png b/figures/g2c_figures/gtex_demographics.png new file mode 100644 index 0000000..fa04ab2 Binary files /dev/null and b/figures/g2c_figures/gtex_demographics.png differ diff --git a/figures/g2c_figures/hgsvc_demographics.png b/figures/g2c_figures/hgsvc_demographics.png new file mode 100644 index 0000000..771da1d Binary files /dev/null and b/figures/g2c_figures/hgsvc_demographics.png differ diff --git a/figures/g2c_figures/hmf_demographics.png b/figures/g2c_figures/hmf_demographics.png new file mode 100644 index 0000000..83feea1 Binary files /dev/null and b/figures/g2c_figures/hmf_demographics.png differ diff --git a/figures/g2c_figures/icgc_demographics.png b/figures/g2c_figures/icgc_demographics.png new file mode 100644 index 0000000..a3dba38 
Binary files /dev/null and b/figures/g2c_figures/icgc_demographics.png differ diff --git a/figures/g2c_figures/lcins_demographics.png b/figures/g2c_figures/lcins_demographics.png new file mode 100644 index 0000000..1cff8b1 Binary files /dev/null and b/figures/g2c_figures/lcins_demographics.png differ diff --git a/figures/g2c_figures/mesa_demographics.png b/figures/g2c_figures/mesa_demographics.png new file mode 100644 index 0000000..098e3be Binary files /dev/null and b/figures/g2c_figures/mesa_demographics.png differ diff --git a/figures/g2c_figures/proactive-core_demographics.png b/figures/g2c_figures/proactive-core_demographics.png new file mode 100644 index 0000000..c75ead4 Binary files /dev/null and b/figures/g2c_figures/proactive-core_demographics.png differ diff --git a/figures/g2c_figures/proactive-other_demographics.png b/figures/g2c_figures/proactive-other_demographics.png new file mode 100644 index 0000000..c7bc613 Binary files /dev/null and b/figures/g2c_figures/proactive-other_demographics.png differ diff --git a/figures/g2c_figures/ufc_demographics.png b/figures/g2c_figures/ufc_demographics.png new file mode 100644 index 0000000..8f4b8aa Binary files /dev/null and b/figures/g2c_figures/ufc_demographics.png differ diff --git a/scripts/.DS_Store b/scripts/.DS_Store new file mode 100644 index 0000000..facc943 Binary files /dev/null and b/scripts/.DS_Store differ diff --git a/scripts/aou_ufc_helpers/.DS_Store b/scripts/aou_ufc_helpers/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/scripts/aou_ufc_helpers/.DS_Store differ diff --git a/scripts/aou_ufc_helpers/VALab_germline_somatic.tsv b/scripts/aou_ufc_helpers/VALab_germline_somatic.tsv new file mode 100644 index 0000000..7227b8f --- /dev/null +++ b/scripts/aou_ufc_helpers/VALab_germline_somatic.tsv @@ -0,0 +1,265 @@ +#cancer germline_gene germline_context somatic_gene somatic_context criteria +breast AKAP9 coding HRAS coding known_ppi +breast AKAP9 coding IKZF3 coding known_ppi +breast ARRDC3 noncoding SMARCD1 coding known_ppi +breast BARD1 coding BARD1 coding same_gene +breast BRCA1 coding BRCA1 coding same_gene +breast BRCA1 coding ESR1 coding known_ppi +breast BRCA1 coding JAK2 coding known_ppi +breast BRCA1 coding UBR5 coding known_ppi +breast BRCA2 coding BRCA2 coding same_gene +breast C2CD6 noncoding IKZF3 coding known_ppi +breast CBX8 noncoding BAP1 coding known_ppi;protein_complex +breast CBX8 noncoding H3C2 coding known_ppi +breast CCNE1 noncoding CCNE1 coding same_gene +breast CCNE1 noncoding FBXW7 coding known_ppi +breast CHEK2 noncoding RB1 coding known_ppi +breast COL1A2 noncoding SMARCD1 coding known_ppi +breast COL8A1 noncoding FBLN2 coding known_ppi +breast CUX1 noncoding CUX1 coding same_gene +breast ERBB4 noncoding ABL2 coding known_ppi +breast ERBB4 noncoding ERBB2 coding known_ppi +breast ERBB4 noncoding ERBB3 coding known_ppi +breast ERBB4 noncoding ERBB4 coding same_gene +breast ESRRG noncoding ATM coding known_ppi +breast FGF10 noncoding FGFR1 coding ligand_receptor +breast FGF10 noncoding FGFR2 coding ligand_receptor +breast GRHL2 noncoding ESR1 coding known_ppi +breast GRHL2 noncoding GATA3 coding known_ppi +breast GRHL2 noncoding TRPS1 noncoding known_ppi +breast H2BC6 noncoding BAP1 coding known_ppi +breast H2BC6 noncoding RAD51B noncoding known_ppi +breast H4C13 noncoding CREBBP coding known_ppi +breast H4C13 noncoding H3C2 coding known_ppi;protein_complex +breast H4C13 noncoding RAD51B noncoding known_ppi +breast ITPR1 noncoding AKT1 coding known_ppi +breast KLF4 
noncoding KLF4 coding same_gene +breast KLHL38 noncoding IKZF3 coding known_ppi +breast MAGI3 noncoding PTEN coding known_ppi +breast MAPT noncoding AKT1 coding known_ppi +breast NOTCH2 noncoding NOTCH2 coding same_gene +breast OTUD7B noncoding CASP8 coding known_ppi +breast PALB2 coding KEAP1 coding known_ppi +breast PDE4D noncoding UBR5 coding known_ppi +breast PHLDA3 coding NTRK1 coding known_ppi +breast PIDD1 noncoding KEAP1 coding known_ppi +breast PKLR noncoding TP53 coding known_ppi +breast PLCL2 noncoding HRAS coding known_ppi +breast RB1 coding MDM4 coding known_ppi +breast RB1 coding RB1 coding same_gene +breast RBBP8 noncoding BARD1 coding protein_complex +breast RBBP8 noncoding BRCA1 coding known_ppi;protein_complex +breast RBBP8 noncoding BRCA2 coding known_ppi +breast RBBP8 noncoding MAP2K4 coding known_ppi +breast RBBP8 noncoding RB1 coding known_ppi +breast RNF115 noncoding MDM4 coding known_ppi +breast RTKN2 noncoding CLTC coding known_ppi +breast TACC2 noncoding GNAS coding known_ppi +breast TBX3 noncoding TBX3 coding same_gene +breast TGFBR2 noncoding CDH1 coding known_ppi +breast TP53 coding CREBBP coding known_ppi +breast TP53 coding EP300 coding known_ppi +breast TP53 coding MDM4 coding known_ppi;protein_complex +breast TP53 coding TP53 coding same_gene +breast TP53 coding UBR5 coding known_ppi +breast XBP1 noncoding XBP1 noncoding same_gene +breast XPNPEP3 noncoding UBR5 coding known_ppi +breast XRCC6 noncoding NOTCH1 coding known_ppi +colorectal APC coding APC coding same_gene +colorectal APC coding AXIN1 coding protein_complex +colorectal APC coding CTNNB1 coding known_ppi +colorectal BMP2 noncoding ACVR2A coding ligand_receptor +colorectal BMP2 noncoding BMPR1A coding known_ppi;ligand_receptor +colorectal BMP4 noncoding ACVR2A coding ligand_receptor +colorectal BMP4 noncoding BMPR1A coding known_ppi;ligand_receptor +colorectal BMP5 noncoding ACVR2A coding ligand_receptor +colorectal BMP5 noncoding BMPR1A coding ligand_receptor +colorectal BMP7 noncoding ACVR2A coding ligand_receptor +colorectal BMP7 noncoding BMPR1A coding ligand_receptor +colorectal COL4A2 noncoding ITGAV coding ligand_receptor +colorectal EIF3H noncoding ARHGEF10 coding known_ppi +colorectal EIF3H noncoding CTNND1 coding known_ppi +colorectal EIF3H noncoding EIF3E coding known_ppi;protein_complex +colorectal EIF3H noncoding TP53 coding known_ppi +colorectal EIF3H noncoding UBR5 coding known_ppi +colorectal FHL3 noncoding RAD21 coding known_ppi +colorectal GNA12 noncoding PPP2R1A coding known_ppi +colorectal HLA-G noncoding B2M coding known_ppi +colorectal HLA-G noncoding HLA-A coding known_ppi +colorectal KLF5 noncoding SMAD2 coding known_ppi +colorectal KLF5 noncoding SMAD4 coding known_ppi +colorectal LAMA5 noncoding ITGAV coding ligand_receptor +colorectal LAMC1 noncoding ITGAV coding ligand_receptor +colorectal LPAR1 noncoding KRAS coding known_ppi +colorectal LRP1 noncoding GRIN2A coding known_ppi +colorectal MLH1 coding MLH1 coding same_gene +colorectal MLH1 noncoding MLH1 coding same_gene +colorectal MSH2 coding MSH2 coding same_gene +colorectal MSH6 coding MSH6 coding same_gene +colorectal MYNN coding CDH1 coding known_ppi +colorectal MYNN coding CHD4 coding known_ppi +colorectal PKM noncoding GNAS coding known_ppi +colorectal PKM noncoding PCBP1 coding known_ppi +colorectal POLD1 coding POLD1 coding same_gene +colorectal POLD3 noncoding POLD1 coding known_ppi;protein_complex +colorectal RFC3 noncoding RAD17 coding protein_complex +colorectal RPL3 noncoding SMAD2 coding known_ppi 
+colorectal RPL3 noncoding SMAD4 coding known_ppi +colorectal RUNX2 noncoding SOX9 coding known_ppi +colorectal SH2B3 coding EGFR coding known_ppi +colorectal SH2B3 coding ERBB2 coding known_ppi +colorectal SH2B3 coding ERBB3 coding known_ppi +colorectal SMAD3 noncoding PARP4 coding known_ppi +colorectal SMAD3 noncoding PIK3R1 coding known_ppi +colorectal SMAD3 noncoding SMAD2 coding known_ppi;protein_complex +colorectal SMAD3 noncoding SMAD3 coding same_gene +colorectal SMAD3 noncoding SMAD4 coding known_ppi;protein_complex +colorectal SMAD3 noncoding TGIF1 coding known_ppi +colorectal SMAD9 noncoding ARID1B coding known_ppi +colorectal SMAD9 noncoding BAZ1A coding known_ppi +colorectal SMAD9 noncoding EIF3E coding known_ppi +colorectal SMAD9 noncoding MGAT1 noncoding known_ppi +colorectal SMAD9 noncoding SMAD2 coding known_ppi +colorectal SMAD9 noncoding SMAD4 coding known_ppi +colorectal SMC1B noncoding RAD21 coding protein_complex +colorectal TBX3 noncoding TBX3 coding same_gene +colorectal TCF7L2 noncoding CTNNB1 coding known_ppi +colorectal TCF7L2 noncoding TCF7L2 coding same_gene +colorectal TERT noncoding AKT1 coding known_ppi +colorectal TERT noncoding CTNNB1 coding known_ppi +colorectal TERT noncoding SMARCA4 coding known_ppi +colorectal TGFBR2 coding CDH1 coding known_ppi +colorectal TGFBR2 coding ETS2 noncoding known_ppi +colorectal TGFBR2 coding ITGAV coding known_ppi +colorectal TGFBR2 coding TGFBR2 coding same_gene +colorectal TNS3 noncoding ERBB2 coding known_ppi +colorectal TNS3 noncoding ERBB3 coding known_ppi +colorectal VTI1A noncoding VTI1A coding same_gene +colorectal ZNRF3 noncoding ZNRF3 coding same_gene +lung APOM noncoding KLF6 noncoding known_ppi +lung ATM coding ATM coding same_gene +lung BAZ1A noncoding BAZ1A coding same_gene +lung CHEK2 coding PER3 coding known_ppi +lung COPS2 noncoding CREBBP coding known_ppi +lung COPS2 noncoding CUL3 coding known_ppi +lung EGFR coding EGFR coding same_gene +lung EGFR coding ERBB2 coding known_ppi +lung EGFR coding ESR1 coding known_ppi +lung EGFR coding JAK2 coding known_ppi +lung EGFR coding PIK3R1 coding known_ppi +lung EGFR coding RASA1 coding known_ppi +lung FGF7 noncoding FGFR2 coding ligand_receptor +lung FGF7 noncoding FGFR2 noncoding ligand_receptor +lung H4C8 noncoding CREBBP coding known_ppi +lung HLA-C noncoding B2M coding known_ppi +lung HLA-C noncoding B2M noncoding known_ppi +lung HLA-C noncoding HLA-A coding known_ppi +lung HLA-C noncoding RNF213 coding known_ppi +lung HLA-DQA1 noncoding CD74 coding known_ppi +lung HLA-DRA noncoding CD74 coding known_ppi +lung HLA-G noncoding B2M coding known_ppi +lung HLA-G noncoding B2M noncoding known_ppi +lung HLA-G noncoding HLA-A coding known_ppi +lung MORF4L1 noncoding TRRAP coding protein_complex +lung NR1H4 noncoding ESR1 coding known_ppi +lung NRG1 noncoding ERBB2 coding known_ppi;ligand_receptor +lung NRG1 noncoding ERBB4 coding ligand_receptor +lung NRG1 noncoding NRG1 coding same_gene +lung PDS5B noncoding BRCA2 coding known_ppi +lung PDS5B noncoding RAD21 coding known_ppi +lung PPP1R18 coding TSC1 coding known_ppi +lung RB1 coding RB1 coding same_gene +lung SLK noncoding KEAP1 coding known_ppi +lung TNXB coding SDC4 coding ligand_receptor +lung VARS2 noncoding TP53 coding known_ppi +prostate ACVR2A noncoding ACVR2A coding same_gene +prostate ACVR2A noncoding BMP6 coding ligand_receptor +prostate ADAMTSL4 noncoding AKT1 coding known_ppi +prostate ADAMTSL4 noncoding RUNX1T1 coding known_ppi +prostate ATM coding ATM coding same_gene +prostate BMP7 noncoding ACVR2A 
coding ligand_receptor +prostate CASP8 noncoding CUL3 coding known_ppi +prostate CASP8 noncoding RAF1 coding known_ppi +prostate CDKN1B coding CDKN1B coding same_gene +prostate CDKN1B coding RHOA coding known_ppi +prostate CHEK2 coding RB1 coding known_ppi +prostate DPM1 noncoding ZBTB16 coding known_ppi +prostate ESR2 noncoding CASZ1 coding known_ppi +prostate FERMT2 noncoding CTNNB1 coding known_ppi +prostate FOXP1 noncoding FOXP1 coding same_gene +prostate GDF5 noncoding ACVR2A coding ligand_receptor +prostate GDF7 noncoding ACVR2A coding ligand_receptor +prostate GRB10 noncoding NRAS coding known_ppi +prostate HAUS6 noncoding GNAS coding known_ppi +prostate HIBADH noncoding TFRC coding known_ppi +prostate HLA-C noncoding RNF213 coding known_ppi +prostate IGF1R noncoding CTNNB1 coding known_ppi +prostate INHBB coding ACVR2A coding ligand_receptor +prostate INHBB noncoding ACVR2A coding ligand_receptor +prostate KLK2 noncoding KLK2 coding same_gene +prostate KLK3 noncoding KLK3 noncoding same_gene +prostate KNL1 noncoding EPS15 coding known_ppi +prostate LMNA noncoding KLF6 coding known_ppi +prostate LMNA noncoding RBFOX1 coding known_ppi +prostate MAP2K1 noncoding RAF1 coding known_ppi +prostate MAP3K1 noncoding BRAF coding known_ppi +prostate MDM4 noncoding RB1 coding known_ppi +prostate MYEOV noncoding MYEOV coding same_gene +prostate MYO6 noncoding CYLD coding known_ppi +prostate MYO16 noncoding PIK3R1 coding known_ppi +prostate NEDD8 noncoding CUL3 coding known_ppi +prostate NEDD8 noncoding CYLD coding known_ppi +prostate NRIP1 noncoding NRIP1 noncoding same_gene +prostate NUF2 noncoding DDX5 coding known_ppi +prostate NUF2 noncoding EPS15 coding known_ppi +prostate OTX1 noncoding FBLN2 coding known_ppi +prostate PPP1R7 noncoding HRAS coding known_ppi +prostate PPP2R2A noncoding LRP1B coding known_ppi +prostate PTEN coding PTEN coding same_gene +prostate RASSF3 noncoding TFRC coding known_ppi +prostate RNF43 noncoding RNF43 coding same_gene +prostate SKIL noncoding SMAD4 coding known_ppi +prostate SLC22A2 noncoding ETV5 coding known_ppi +prostate SMAD2 noncoding SMAD4 coding known_ppi;protein_complex +prostate SPEN noncoding SPEN coding same_gene +prostate SPOP coding CUL3 coding known_ppi +prostate SPOP coding SPOP coding same_gene +prostate STXBP1 noncoding NCOR1 coding known_ppi +prostate TCF7L2 noncoding CTNNB1 coding known_ppi +prostate TERT noncoding AKT1 coding known_ppi +prostate TERT noncoding CTNNB1 coding known_ppi +prostate TMPRSS2 coding TMPRSS2 coding same_gene +prostate TMPRSS2 coding TMPRSS2 noncoding same_gene +prostate TMPRSS2 noncoding TMPRSS2 coding same_gene +prostate TMPRSS2 noncoding TMPRSS2 noncoding same_gene +prostate TNFSF10 noncoding CUL3 coding known_ppi +prostate TNS3 noncoding ERBB3 coding known_ppi +prostate TOR1A noncoding CTNNB1 coding known_ppi +prostate TOR1A noncoding PIK3R1 coding known_ppi +prostate TOR1A noncoding PRDM1 coding known_ppi +prostate TP53 noncoding BRCA2 coding known_ppi +prostate TP53 noncoding TP53 coding same_gene +prostate TP53 noncoding XPO1 coding known_ppi +prostate UBE3A noncoding IDH1 coding known_ppi +prostate VEGFA noncoding U2AF1 coding known_ppi +prostate WBP11 noncoding DDX5 coding known_ppi +prostate WBP11 noncoding RUNX1T1 coding known_ppi +prostate ZBTB42 coding ZBTB16 coding known_ppi +prostate ZFHX3 noncoding ZFHX3 coding same_gene +renal ATM coding ATM coding same_gene +renal BAP1 coding ASXL2 coding known_ppi +renal BAP1 coding BAP1 coding same_gene +renal BAP1 coding NF1 coding known_ppi +renal DPF3 noncoding 
ARID1A coding protein_complex +renal DPF3 noncoding ARID1B coding protein_complex +renal DPF3 noncoding SMARCA4 coding protein_complex +renal FAF1 noncoding NUP214 coding known_ppi +renal MET coding MET coding same_gene +renal SCARB1 noncoding PLCG1 coding known_ppi +renal TSC1 coding TSC1 coding same_gene +renal TSC2 coding TSC2 coding same_gene +renal VHL coding CDKN2A coding known_ppi +renal VHL coding HIF1A coding known_ppi +renal VHL coding NF2 coding known_ppi +renal VHL coding PCBP1 coding known_ppi +renal VHL coding VHL coding same_gene diff --git a/scripts/aou_ufc_helpers/Write_JointGenotyping_Part2_Json.py b/scripts/aou_ufc_helpers/Write_JointGenotyping_Part2_Json.py new file mode 100644 index 0000000..90f5f03 --- /dev/null +++ b/scripts/aou_ufc_helpers/Write_JointGenotyping_Part2_Json.py @@ -0,0 +1,86 @@ +import os +import subprocess +import json + +sample_name_map="gs://fc-secure-d531c052-7b41-4dea-9e1d-22e648f6e228/ufc_jg/jg_part1/dfci-ufc.sample_map" + +# Construct the base path for the Google Cloud Storage bucket +bucket_path = os.getenv('WORKSPACE_BUCKET') + +# Function to list paths using gsutil and construct arrays +def process_directory(dir_path,suffix): + command = f"gsutil ls {bucket_path}/GnarlyJointGenotypingPart1-Output/{callset_name}/{dir_path}/*{suffix}" + paths = subprocess.check_output(command, shell=True).decode('utf-8').strip().split('\n') + return paths + +# List all directories under ${bucket_path}/GnarlyJointGenotypingPart1-Output/ +command = f"gsutil ls -d {bucket_path}/GnarlyJointGenotypingPart1-Output/*/" +callset_names = subprocess.check_output(command, shell=True).decode('utf-8').strip().split('\n') +callset_names = [name.split('/')[-2] for name in callset_names] + +# Instantiate Arrays +output_sites_only_vcf = [] +output_variant_filtered_vcf = [] +output_sites_only_vcf_idx = [] +output_variant_filtered_vcf_idx = [] + +# Loop through each callset_name +for callset_name in callset_names: + # Process each directory and populate respective arrays + sites_only_vcf = process_directory("sites_only_vcf",".vcf.gz") + variant_filtered_vcf = process_directory("variant_filtered_vcf",".vcf.gz") + + # Process each directory and populate respective arrays + sites_only_vcf_idx = process_directory("sites_only_vcf",".tbi") + variant_filtered_vcf_idx = process_directory("variant_filtered_vcf",".tbi") + + # Append paths to respective output lists + output_sites_only_vcf.append(sites_only_vcf) + output_variant_filtered_vcf.append(variant_filtered_vcf) + output_sites_only_vcf_idx.append(sites_only_vcf_idx) + output_variant_filtered_vcf_idx.append(variant_filtered_vcf_idx) +# JSON file path +output_json_file = "GnarlyJointGenotypingPart2.json" + +# Construct JSON content +json_content = { + "GnarlyJointGenotypingPart2.ApplyRecalibration.gatk_docker": "broadinstitute/gatk:4.4.0.0", + "GnarlyJointGenotypingPart2.CollectMetricsOnFullVcf.gatk_docker": "broadinstitute/gatk:4.4.0.0", + "GnarlyJointGenotypingPart2.CollectMetricsSharded.gatk-publ_docker": "broadinstitute/gatk:4.4.0.0", + "GnarlyJointGenotypingPart2.FinalGatherVcf.gatk_docker": "broadinstitute/gatk:4.4.0.0", + "GnarlyJointGenotypingPart2.GatherVariantCallingMetrics.gatk_docker": "broadinstitute/gatk:4.4.0.0", + "GnarlyJointGenotypingPart2.IndelsVariantRecalibrator.gatk_docker": "broadinstitute/gatk:4.4.0.0", + "GnarlyJointGenotypingPart2.SNPGatherTranches.gatk_docker": "broadinstitute/gatk:4.4.0.0", + "GnarlyJointGenotypingPart2.SNPsVariantRecalibratorCreateModel.gatk_docker": "broadinstitute/gatk:4.4.0.0", + 
"GnarlyJointGenotypingPart2.SNPsVariantRecalibratorScattered.gatk_docker": "broadinstitute/gatk:4.4.0.0", + "GnarlyJointGenotypingPart2.SitesOnlyGatherVcf.gatk_docker": "broadinstitute/gatk:4.4.0.0", + "GnarlyJointGenotypingPart2.axiomPoly_resource_vcf": "gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz", + "GnarlyJointGenotypingPart2.axiomPoly_resource_vcf_index": "gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz.tbi", + "GnarlyJointGenotypingPart2.callset_name": "dfci-ufc", + "GnarlyJointGenotypingPart2.dbsnp_resource_vcf": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf", + "GnarlyJointGenotypingPart2.dbsnp_resource_vcf_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx", + "GnarlyJointGenotypingPart2.eval_interval_list": "gs://gcp-public-data--broad-references/hg38/v0/wgs_evaluation_regions.hg38.interval_list", + "GnarlyJointGenotypingPart2.hapmap_resource_vcf": "gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz", + "GnarlyJointGenotypingPart2.hapmap_resource_vcf_index": "gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz.tbi", + "GnarlyJointGenotypingPart2.mills_resource_vcf": "gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", + "GnarlyJointGenotypingPart2.mills_resource_vcf_index": "gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi", + "GnarlyJointGenotypingPart2.omni_resource_vcf": "gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz", + "GnarlyJointGenotypingPart2.omni_resource_vcf_index": "gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz.tbi", + "GnarlyJointGenotypingPart2.one_thousand_genomes_resource_vcf": "gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz", + "GnarlyJointGenotypingPart2.one_thousand_genomes_resource_vcf_index": "gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi", + "GnarlyJointGenotypingPart2.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "GnarlyJointGenotypingPart2.sites_only_vcfs_index_nested": output_sites_only_vcf_idx, + "GnarlyJointGenotypingPart2.sites_only_vcfs_nested": output_sites_only_vcf, + "GnarlyJointGenotypingPart2.variant_filtered_vcfs_index_nested": output_variant_filtered_vcf_idx, + "GnarlyJointGenotypingPart2.variant_filtered_vcfs_nested": output_variant_filtered_vcf, + "GnarlyJointGenotypingPart2.sample_name_map": sample_name_map +} + +# Write JSON content to file without prettifying +with open(output_json_file, 'w') as f: + json.dump(json_content, f, separators=(',', ':')) + + # Add a newline character at the end of the file + f.write('\n') + +print(f"Output written to {output_json_file}") \ No newline at end of file diff --git a/scripts/aou_ufc_helpers/creating_gene_regions_of_interest_hg38 copy.sh b/scripts/aou_ufc_helpers/creating_gene_regions_of_interest_hg38 copy.sh new file mode 100644 index 0000000..c69ab51 --- /dev/null +++ b/scripts/aou_ufc_helpers/creating_gene_regions_of_interest_hg38 copy.sh @@ -0,0 +1,41 @@ +# Example code for creating gene regions of interest and subsetting AoU VCF + +# Given the following input files: +# 1. list (flat .txt file) of gene symbols of relevance: riaz_genes.list +# 2. 
list (flat .txt file) of sample IDs to retain: keep_samples.list +# 3. VCF of all variants from AoU: aou.vcf.gz (must also have .tbi index present locally) + +# The below can all be run on your local machine from the terminal + +# Download MANE GTF: +wget https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.3/MANE.GRCh38.v1.3.ensembl_genomic.gtf.gz + +# Subset MANE to genes of interest +zcat MANE.GRCh38.v1.3.ensembl_genomic.gtf.gz \ +| fgrep -wf riaz_genes.list \ +| bgzip -c \ +> MANE.GRCh38.v1.3.ensembl_genomic.subsetted.gtf.gz + +# Extract coordinates from subsetted GTF, pad by ±2kb to be conservative, and use BEDTools to merge these coordinates into unique regions +zcat MANE.GRCh38.v1.3.ensembl_genomic.subsetted.gtf.gz \ +| awk -v FS="\t" -v OFS="\t" -v buffer=2000 '{ print $1, $4-buffer, $5+buffer }' \ +| sort -Vk1,1 -k2,2n -k3,3n \ +| bedtools merge -i - \ +| bgzip -c \ +> riaz_genes.coordinates.bed.gz + +# Copy gene coordinates to your AoU workspace bucket +# You can find this by running "echo $WORKSPACE_BUCKET" within the AoU workbench terminal +#gsutil cp riaz_genes.coordinates.bed.gz gs://my_workspace_bucket/ + +# Subset variants to minimal set of interest for our purposes +# Note that this must be run in the AoU workspace terminal, not on your local machine +bcftools view \ + -O z -o aou.subsetted.vcf.gz \ + --regions-file riaz_genes.coordinates.bed.gz \ + --samples-file keep_samples.list \ + --min-ac 1 \ + aou.vcf.gz + +# Index output VCF for future queries +tabix -p vcf -f aou.subsetted.vcf.gz \ No newline at end of file diff --git a/scripts/aou_ufc_helpers/germline_CPGs_7_31_23.txt b/scripts/aou_ufc_helpers/germline_CPGs_7_31_23.txt new file mode 100644 index 0000000..f633707 --- /dev/null +++ b/scripts/aou_ufc_helpers/germline_CPGs_7_31_23.txt @@ -0,0 +1,149 @@ +value +ABCB11 +ACD +AIP +ALK +APC +ATM +ATR +AXIN2 +BAP1 +BARD1 +BLM +BMPR1A +BRAF +BRCA1 +BRCA2 +BRIP1 +BUB1B +CBL +CDC73 +CDH1 +CDK4 +CDKN1B +CDKN1C +CDKN2A +CDKN2B +CEBPA +CHEK2 +CYLD +DDB2 +DDX41 +DICER1 +DIS3L2 +DKC1 +EGFR +EPCAM +ERCC1 +ERCC2 +ERCC3 +ERCC4 +ERCC5 +ETV6 +EXT1 +EXT2 +FAH +FANCA +FANCC +FANCD2 +FANCE +FANCF +FANCG +FANCI +FANCL +FANCM +FH +FLCN +GATA2 +GPC3 +HFE +HMBS +HNF1A +HRAS +KIT +KRAS +LZTR1 +MAP2K1 +MAP2K2 +MAX +MEN1 +MET +MITF +MLH1 +MPL +MSH2 +MSH6 +MTAP +MUTYH +NBN +NF1 +NF2 +NHP2 +NRAS +NTHL1 +PALB2 +PDGFRA +PHOX2B +PMS2 +POLD1 +POLE +POLH +POT1 +PRF1 +PRKAR1A +PTCH1 +PTCH2 +PTEN +PTPN11 +RAD51C +RAD51D +RAF1 +RB1 +RECQL +RECQL4 +RET +RFWD3 +RHBDF2 +RTEL1 +RUNX1 +SBDS +SDHA +SDHAF2 +SDHB +SDHC +SDHD +SETBP1 +SH2D1A +SLC25A13 +SLX4 +SMAD4 +SMARCA4 +SMARCB1 +SMARCE1 +SOS1 +SPRTN +SRP72 +STAT3 +STK11 +SUFU +TERT +TGFBR1 +TINF2 +TMEM127 +TP53 +TRIM37 +TSC1 +TSC2 +TSHR +UROD +VHL +WRN +WT1 +XPA +XPC +XRCC3 +GEN1 +RAD51 +RAD54L +RNF43 +UBE2T diff --git a/scripts/aou_ufc_helpers/matching.py b/scripts/aou_ufc_helpers/matching.py new file mode 100644 index 0000000..884d6a3 --- /dev/null +++ b/scripts/aou_ufc_helpers/matching.py @@ -0,0 +1,110 @@ +import pandas as pd + +# Define file paths +control_file = 'superset_control_demographics.tsv' +case_file = 'case_demographics.tsv' +output_file = 'control_demographics_interim.tsv' + +# Read case and control files +control_df = pd.read_csv(control_file, sep='\t', index_col=0) +case_df = pd.read_csv(case_file, sep='\t', index_col=0) + +# Sort control and case data +sorted_control = control_df.sort_values(by=['race', 'ethnicity', 'year_of_birth', 'sex_at_birth']) +sorted_case = case_df.sort_values(by=['race', 'ethnicity', 'year_of_birth', 
'sex_at_birth'])
+
+# Initialize matched controls dictionary
+matched_controls = {}
+print(sorted_control.shape, sorted_case.shape)
+
+# Exact Matching
+for idx, case_row in sorted_case.iterrows():
+    match = sorted_control[
+        (sorted_control['race'] == case_row['race']) &
+        (sorted_control['ethnicity'] == case_row['ethnicity']) &
+        (sorted_control['year_of_birth'] == case_row['year_of_birth']) &
+        (sorted_control['sex_at_birth'] == case_row['sex_at_birth'])
+    ].head(1)
+    if not match.empty:
+        control_idx = match.index[0]
+        matched_controls[idx] = control_idx  # Case is key and control is value
+        sorted_control = sorted_control.drop(control_idx)  # Drops control value from controls df
+        sorted_case = sorted_case.drop(idx)
+
+#This is all four variables of interest.
+for bandwidth in range(1, 20):
+    for idx, case_row in sorted_case.iterrows():
+        #print(f"Bandwidth: {bandwidth}")
+        match = sorted_control[
+            (sorted_control['race'] == case_row['race']) &
+            (sorted_control['ethnicity'] == case_row['ethnicity']) &
+            (sorted_control['year_of_birth'].between(case_row['year_of_birth'] - bandwidth, case_row['year_of_birth'] + bandwidth)) &
+            (sorted_control['sex_at_birth'] == case_row['sex_at_birth'])
+        ].head(1)
+        if not match.empty:
+            control_idx = match.index[0]
+            matched_controls[idx] = control_idx  # Case is key and control is value
+            sorted_control = sorted_control.drop(control_idx)  # Drops control value from controls df
+            sorted_case = sorted_case.drop(idx)
+
+#This is just for race, age, and sex. We exclude ethnicity because that is often tied to race.
+for bandwidth in range(0, 20):
+    for idx, case_row in sorted_case.iterrows():
+        #print(f"Bandwidth: {bandwidth}")
+        match = sorted_control[
+            (sorted_control['race'] == case_row['race']) &
+            (sorted_control['year_of_birth'].between(case_row['year_of_birth'] - bandwidth, case_row['year_of_birth'] + bandwidth)) &
+            (sorted_control['sex_at_birth'] == case_row['sex_at_birth'])
+        ].head(1)
+        if not match.empty:
+            control_idx = match.index[0]
+            matched_controls[idx] = control_idx  # Case is key and control is value
+            sorted_control = sorted_control.drop(control_idx)  # Drops control value from controls df
+            sorted_case = sorted_case.drop(idx)
+
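+# Each remaining pass repeats the same select/drop pattern with a looser match.
+# A minimal refactor sketch (illustrative only; `greedy_match` is not used
+# anywhere in this script) of how one pass could be expressed:
+def greedy_match(mask_fn, bandwidths=range(0, 20)):
+    global sorted_control, sorted_case
+    for bandwidth in bandwidths:
+        for idx, case_row in sorted_case.iterrows():
+            match = sorted_control[mask_fn(case_row, bandwidth)].head(1)
+            if not match.empty:
+                control_idx = match.index[0]
+                matched_controls[idx] = control_idx  # Case is key and control is value
+                sorted_control = sorted_control.drop(control_idx)
+                sorted_case = sorted_case.drop(idx)
+
+#This is just for race and age. We are putting race ahead of sex for matching.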
+for bandwidth in range(0,20): + for idx, case_row in sorted_case.iterrows(): + match = sorted_control[ + (sorted_control['race'] == case_row['race']) & + (sorted_control['year_of_birth'].between(case_row['year_of_birth'] - bandwidth, case_row['year_of_birth'] + bandwidth)) + ].head(1) + if not match.empty: + control_idx = match.index[0] + matched_controls[idx] = control_idx #Case is key and control is value + sorted_control = sorted_control.drop(control_idx) #Drops control value from controls df + sorted_case = sorted_case.drop(idx) + +#This is just for sex and age once race and ethnicity have been used up +for bandwidth in range(0,20): + for idx, case_row in sorted_case.iterrows(): + match = sorted_control[ + (sorted_control['sex_at_birth'] == case_row['sex_at_birth']) & + (sorted_control['year_of_birth'].between(case_row['year_of_birth'] - bandwidth, case_row['year_of_birth'] + bandwidth)) + ].head(1) + if not match.empty: + control_idx = match.index[0] + matched_controls[idx] = control_idx #Case is key and control is value + sorted_control = sorted_control.drop(control_idx) #Drops control value from controls df + sorted_case = sorted_case.drop(idx) + +#We are only looking for year of birth as a last resort +#This is useful for when all of the other fields are missing +for bandwidth in range(0,20): + for idx, case_row in sorted_case.iterrows(): + match = sorted_control[(sorted_control['year_of_birth'].between(case_row['year_of_birth'] - bandwidth, case_row['year_of_birth'] +bandwidth)) + ].head(1) + if not match.empty: + control_idx = match.index[0] + matched_controls[idx] = control_idx #Case is key and control is value + sorted_control = sorted_control.drop(control_idx) #Drops control value from controls df + sorted_case = sorted_case.drop(idx) + + + + + +sorted_control.to_csv('unmatched.tsv',sep='\t') +sorted_case.to_csv('unmatchables.tsv',sep='\t') +print(len(sorted_case)) + +# Convert dictionary to DataFrame +df = pd.DataFrame(list(matched_controls.items()), columns=['person_id_case', 'person_id_control']) + +# Save DataFrame to TSV file +output_file = 'matched_controls.tsv' +df.to_csv(output_file, sep='\t', index=False) + +print("Exact matching done. 
Interim results saved to:", output_file) diff --git a/scripts/g2c_helpers/.DS_Store b/scripts/g2c_helpers/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/scripts/g2c_helpers/.DS_Store differ diff --git a/scripts/g2c_helpers/g2c_charr.py b/scripts/g2c_helpers/g2c_charr.py new file mode 100644 index 0000000..a81e105 --- /dev/null +++ b/scripts/g2c_helpers/g2c_charr.py @@ -0,0 +1,164 @@ +# The Germline Genomics of Cancer (G2C) +# Copyright (c) 2023-Present, Noah Fields and the Dana-Farber Cancer Institute +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 + +import pandas as pd +import gcsfs +from tqdm import tqdm +import matplotlib.pyplot as plt +import numpy as np +import random + + +cohorts = ['aou','biome','mesa','ufc','gtex','cptac','hgsvc','icgc','hmf','lcins','proactive-core','proactive-other'] +ancestries = ['European', 'Latin-American-1','Latin-American-2','South-Asian','Asian-Pacific-Islander','East-Asian','African', 'African-American','Unknown','Other'] +bar_colors = ['blue', 'red', 'pink', 'purple', 'darkgreen', 'lightgreen', 'yellow', 'orange', 'turquoise', 'brown'] + + +# Dictionary to keep track of statistics +charr_arr = [] +charr_reblocked_arr = [] +charr_dict = dict() +variable_of_interest = 'CHARR' #'INCONSISTENT_AB_HET_RATE' # + +def grab_charr_files(): + def list_charr_files(base_path, cohort): + # Create a GCS file system instance + gcs_filesystem = gcsfs.GCSFileSystem(project='terra-446ad1f7') # Replace with your actual project ID + + # Construct the path pattern for the specified cohort + cohort_path = f"{base_path}/{cohort}/charr/*" + + # List all files matching the pattern + charr_files = gcs_filesystem.glob(cohort_path) + + return charr_files + + # Specify the base path in the "gs://dfci-g2c-inputs" file system and the cohort + base_path = 'gs://dfci-g2c-inputs' + + charr_files = [] + for cohort in cohorts: + # Get a list of file pathways that have 'demographics' in them for the specified cohort + charr_files.extend([f"gs://{file}" for file in list_charr_files(base_path, cohort)]) + + return charr_files + +def grab_charr_reblocked_files(): + path = '/Users/noah/Desktop/ssi.tsv' + df = pd.read_csv(path,sep='\t') + charr_reblocked_files = list(df['contamination_reblocked']) + + return charr_reblocked_files + +# Function to process each row +def process_file(charr_file): + df = pd.read_csv(charr_file,sep='\t') + return float(df[variable_of_interest].iloc[0]) + +def process_cohorts(): + charr_files = grab_charr_files() + cohort_dict = dict() + cohort_dict['mesa'] = [file for file in charr_files if 'mesa' in file] + cohort_dict['ceph'] = [file for file in charr_files if 'ceph' in file] + cohort_dict['gtex'] = [file for file in charr_files if 'gtex' in file] + cohort_dict['icgc'] = [file for file in charr_files if 'icgc' in file] + cohort_dict['lcins'] = [file for file in charr_files if 'lcins' in file] + cohort_dict['hmf'] = [file for file in charr_files if 'hmf' in file] + cohort_dict['ufc'] = [file for file in charr_files if 'ufc' in file] + cohort_dict['cptac'] = [file for file in charr_files if 'cptac' in file] + cohort_dict['hgsvc'] = [file for file in charr_files if 'hgsvc' in file] + cohort_dict['biome'] = [file for file in charr_files if 'biome' in file] + cohort_dict['proactive-core'] = [file for file in charr_files if 'proactive-core' in file] + cohort_dict['proactive-other'] = [file for file in charr_files if 'proactive-other' in file] + + + + for key in cohort_dict: + for file in cohort_dict[key]: + 
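+            # Lazily create this cohort's list on the first file seen. Cohorts
+            # that match no files (e.g. 'ceph', which is absent from the
+            # `cohorts` list used by grab_charr_files) never get a key in
+            # charr_dict, so make_figure() assumes every plotted cohort had
+            # at least one CHARR file.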
if key not in charr_dict: + charr_dict[key]=[] + charr_dict[key].append(process_file(file)) + +def make_figure(): + def box_plot(data,position, label): + plt.boxplot(data,positions=[position], labels=[label]) + + + box_plot(charr_dict['mesa'],1,"MESA") + box_plot(charr_dict['ceph'],2,"CEPH") + box_plot(charr_dict['gtex'],3,"GTEX") + box_plot(charr_dict['icgc'],4,"ICGC") + box_plot(charr_dict['lcins'],5,"LCINS") + box_plot(charr_dict['hmf'],6,"HMF") + box_plot(charr_dict['ufc'],7,"UFC") + box_plot(charr_dict['cptac'],8,"CPTAC") + box_plot(charr_dict['hgsvc'],9,"HGSVC") + box_plot(charr_dict['biome'],10,"BIOME") + box_plot(charr_dict['proactive-core'],11,"P-CORE") + box_plot(charr_dict['proactive-other'],12,"P-OTHER") + + # Show the plot + # Add labels and title + plt.xlabel('Cohort') + plt.ylabel('CHARR') + plt.title(f'Box Plot of {variable_of_interest} values across Cohorts') + + # Add dotted lines and labels + if variable_of_interest == "CHARR": + plt.axhline(y=0.02, linestyle='--', color='gray', label='Subtle Contamination') + plt.axhline(y=0.03, linestyle='--', color='red', label='Suggestive of Contamination') + plt.legend() + if variable_of_interest == "INCONSISTENT_AB_HET_RATE": + plt.axhline(y=0.1, linestyle='--', color='gray', label='Warning') + plt.axhline(y=0.15, linestyle='--', color='red', label='Fail') + plt.legend() + # Create a wider figure + #plt.figure(figsize=(10, 6)) # Adjust width as needed + plt.savefig(f"../../figures/charr_figures/{variable_of_interest}.png") + +def compare_reblocked_raw(): + # Function to process each row + def process_file_compare(charr_file): + df = pd.read_csv(charr_file,sep='\t') + return float(df['CHARR'].iloc[0]) + def list_charr_files(base_path): + # Create a GCS file system instance + gcs_filesystem = gcsfs.GCSFileSystem(project='terra-446ad1f7') # Replace with your actual project ID + + # Construct the path pattern for the specified cohort + original_path = f"{base_path}/charr_original/*" + reblocked_path = f"{base_path}/charr/*" + + # List all files matching the pattern + original_files = sorted(gcs_filesystem.glob(original_path)) + reblocked_files = [file.replace('charr_original','charr') for file in original_files] + + return original_files,reblocked_files + + # Specify the base path in the "gs://dfci-g2c-inputs" file system and the cohort + base_path = 'gs://dfci-g2c-inputs/mesa' + original_files,reblocked_files = list_charr_files(base_path) + + coordinates = [] + for i in range(len(original_files)): + coordinates.append((process_file_compare("gs://" + original_files[i]),process_file_compare("gs://" + reblocked_files[i]))) + + # Extract the first and second integers from each tuple into separate arrays + x = np.array([t[0] for t in coordinates]) + y = np.array([t[1] for t in coordinates]) + + correlation_coefficient = np.corrcoef(x, y)[0,1] + r_squared = correlation_coefficient ** 2 + print(r_squared) + + + +if __name__ == '__main__': + #compare_reblocked_raw() + process_cohorts() + make_figure() + # main() + + diff --git a/scripts/g2c_helpers/g2c_demographics.py b/scripts/g2c_helpers/g2c_demographics.py new file mode 100644 index 0000000..3f41bae --- /dev/null +++ b/scripts/g2c_helpers/g2c_demographics.py @@ -0,0 +1,184 @@ +# The Germline Genomics of Cancer (G2C) +# Copyright (c) 2023-Present, Noah Fields and the Dana-Farber Cancer Institute +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 + +import pandas as pd +import gcsfs +from tqdm import tqdm +import matplotlib.pyplot as plt +import numpy as np +import 
random + + +cohorts = ['aou','biome','mesa','ufc','gtex','cptac','hgsvc','icgc','hmf','lcins','proactive-core','proactive-other'] +ancestries = ['European', 'Latin-American-1','Latin-American-2','South-Asian','Asian-Pacific-Islander','East-Asian','African', 'African-American','Unknown','Other'] +bar_colors = ['blue', 'red', 'pink', 'purple', 'darkgreen', 'lightgreen', 'yellow', 'orange', 'turquoise', 'brown'] + + +# Dictionary to keep track of statistics +cohort_stats = {} +cohort_sex_stats = {} +overall_stats = {} +overall_sex_stats = {} +cohort_count = {} +total_count = 0 + +def grab_demographics_files(): + def list_demographics_files(base_path, cohort): + # Create a GCS file system instance + gcs_filesystem = gcsfs.GCSFileSystem(project='terra-446ad1f7') # Replace with your actual project ID + + # Construct the path pattern for the specified cohort + cohort_path = f"{base_path}/{cohort}/demographics/*" + + # List all files matching the pattern + demographics_files = gcs_filesystem.glob(cohort_path) + + return demographics_files + + # Specify the base path in the "gs://dfci-g2c-inputs" file system and the cohort + base_path = 'gs://dfci-g2c-inputs' + + demographics_files = [] + for cohort in cohorts: + # Get a list of file pathways that have 'demographics' in them for the specified cohort + demographics_files.extend([f"gs://{file}" for file in list_demographics_files(base_path, cohort)]) + + return demographics_files + +# Function to process each row +def process_file(demo_file): + global total_count + temp_df = pd.read_csv(demo_file,sep='\t') + # Extract required columns + temp_df = temp_df[['ancestry', 'chrX_count', 'chrY_count']] + cohort = demo_file.split('/')[-3] + cohort_count[cohort] = cohort_count.get(cohort, 0) + 1 + total_count += 1 + # Update cohort-specific statistics + if cohort not in cohort_stats: + cohort_stats[cohort] = {} + cohort_sex_stats[cohort] = {} + + for _, temp_row in temp_df.iterrows(): + ancestry = temp_row['ancestry'] + chrX_count = int(temp_row['chrX_count']) + chrY_count = int(temp_row['chrY_count']) + sex = None + if chrX_count == 1 and chrY_count == 1: + sex = "Male" + elif chrX_count == 2 and chrY_count == 0: + sex = "Female" + else: + sex = "Aneuploidy" + + # Update cohort-specific statistics + if ancestry not in cohort_stats[cohort]: + cohort_stats[cohort][ancestry] = 0 + if sex not in cohort_sex_stats[cohort]: + cohort_sex_stats[cohort][sex] = 0 + + if ancestry not in overall_stats: + overall_stats[ancestry] = 0 + if sex not in overall_sex_stats: + overall_sex_stats[sex] = 0 + + + cohort_stats[cohort][ancestry] += 1 + cohort_sex_stats[cohort][sex] += 1 + overall_stats[ancestry] += 1 + overall_sex_stats[sex] += 1 + + + + +# Loop through the rows of the DataFrame +for demo_file in tqdm(grab_demographics_files()): + process_file(demo_file) + + +with open('g2c_demographics.txt', 'w') as file: + # Write cohort-specific statistics + file.write("=" * 30 + "\n") + for cohort, stats in cohort_stats.items(): + file.write(f"Cohort: {cohort}\n") + + for ancestry in cohort_stats[cohort]: + file.write(f"{ancestry}: {cohort_stats[cohort][ancestry]}\n") + file.write("- " * 30 + "\n") + for sex in cohort_sex_stats[cohort]: + file.write(f"{sex}: {cohort_sex_stats[cohort][sex]}\n") + file.write(f"Total: {cohort_count[cohort]}\n") + file.write("=" * 30 + "\n") + + # Write overall statistics + file.write("Overall Statistics\n") + for ancestry in overall_stats: + file.write(f"{ancestry}: {overall_stats[ancestry]}\n") + file.write("- " * 30 + "\n") + + # Write overall sex 
statistics + for sex in overall_sex_stats: + file.write(f"{sex}: {overall_sex_stats[sex]}\n") + + file.write(f"Total: {total_count}\n") + + +def create_pie_chart(cohort,cohort_stats): + values = list(cohort_stats[cohort].values()) + keys = list(cohort_stats[cohort].keys()) + + #labels = [f"{key}: {value}" for key, value in zip(keys, values)] + + # Calculate the total count for percentage calculation + total_count = sum(values) + + plt.pie(values,labels=keys,colors = bar_colors, autopct=lambda p: f'{p:.1f}%\n') + plt.title(cohort, fontdict={'fontsize': 16, 'fontweight': 'bold'}) + plt.savefig(f"../../figures/g2c_figures/{cohort}_demographics.png") + plt.clf() + +def create_total_pie_chart(overall_stats): + values = list(overall_stats.values()) + keys = list(overall_stats.keys()) + + + # Calculate the total count for percentage calculation + total_count = sum(values) + + plt.pie(values,labels=keys,colors = bar_colors, autopct=lambda p: f'{p:.1f}%\n') + plt.title("Everyone", fontdict={'fontsize': 16, 'fontweight': 'bold'}) + plt.savefig(f"../../figures/g2c_figures/Everyone_demographics.png") + plt.clf() + + +# # Create subplots +# fig, ax = plt.subplots(figsize=(14, 8)) + +# Create bar graphs for each cohort and total +for cohort in cohort_stats: + #print(cohort) + if cohort not in cohort_stats.keys(): + continue + #create_bar_graph(ax, cohort, cohort_stats, bar_width=0.2) + create_pie_chart(cohort,cohort_stats) +create_total_pie_chart(overall_stats) + +# # Add labels and title +# ax.set_xlabel('Ancestry Groups') +# ax.set_ylabel('Percentage') +# ax.set_title('Ancestry Distribution Across Cohorts') +# ax.set_xticks(np.arange(len(ancestries)) + 0.2) +# ax.set_xticklabels(ancestries) +# ax.legend() + +# # Save the plot to 'g2c_demographics.png' +# plt.savefig('g2c_demographics.png') + +# # Show the plot +# plt.show() + + + + diff --git a/scripts/g2c_helpers/g2c_demographics.txt b/scripts/g2c_helpers/g2c_demographics.txt new file mode 100644 index 0000000..0021491 --- /dev/null +++ b/scripts/g2c_helpers/g2c_demographics.txt @@ -0,0 +1,187 @@ +============================== +Cohort: biome +East-Asian: 107 +Latin-American-1: 539 +European: 1875 +African-American: 146 +Latin-American-2: 41 +Other: 58 +Asian-Pacific-Islander: 22 +South-Asian: 38 +Unknown: 13 +African: 3 +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Male: 1331 +Female: 1501 +Aneuploidy: 10 +Total: 2842 +============================== +Cohort: mesa +European: 1583 +Latin-American-1: 209 +Latin-American-2: 299 +East-Asian: 111 +African-American: 1004 +Asian-Pacific-Islander: 5 +Other: 44 +Unknown: 4 +African: 2 +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Male: 1519 +Female: 1738 +Aneuploidy: 4 +Total: 3261 +============================== +Cohort: ufc +European: 42 +Latin-American-1: 5 +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Male: 30 +Female: 17 +Total: 47 +============================== +Cohort: gtex +African-American: 113 +European: 733 +East-Asian: 7 +Latin-American-1: 13 +Latin-American-2: 13 +African: 1 +South-Asian: 3 +Other: 6 +Asian-Pacific-Islander: 3 +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Female: 302 +Male: 583 +Aneuploidy: 7 +Total: 892 +============================== +Cohort: cptac +East-Asian: 202 +European: 851 +Latin-American-2: 32 +South-Asian: 7 +Other: 2 +Unknown: 13 +Asian-Pacific-Islander: 7 +Latin-American-1: 12 +African-American: 19 +African: 1 +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Male: 602 +Female: 533 
+Aneuploidy: 11 +Total: 1146 +============================== +Cohort: hgsvc +European: 133 +Unknown: 7 +Latin-American-2: 46 +East-Asian: 60 +Latin-American-1: 11 +Other: 3 +African-American: 93 +African: 32 +South-Asian: 68 +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Male: 226 +Female: 224 +Aneuploidy: 3 +Total: 453 +============================== +Cohort: icgc +Latin-American-1: 17 +European: 1066 +African-American: 14 +South-Asian: 25 +Latin-American-2: 7 +Asian-Pacific-Islander: 5 +East-Asian: 221 +Unknown: 12 +Other: 1 +African: 6 +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Female: 446 +Male: 908 +Aneuploidy: 20 +Total: 1374 +============================== +Cohort: hmf +European: 4114 +Unknown: 86 +African-American: 36 +Latin-American-1: 91 +Asian-Pacific-Islander: 65 +East-Asian: 33 +Other: 22 +Latin-American-2: 14 +South-Asian: 19 +African: 3 +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Male: 2122 +Female: 2321 +Aneuploidy: 40 +Total: 4483 +============================== +Cohort: lcins +European: 213 +Latin-American-1: 9 +African-American: 4 +Latin-American-2: 1 +East-Asian: 3 +Asian-Pacific-Islander: 2 +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Female: 175 +Male: 56 +Aneuploidy: 1 +Total: 232 +============================== +Cohort: proactive-core +Latin-American-1: 78 +European: 1004 +Asian-Pacific-Islander: 8 +Other: 6 +African-American: 35 +East-Asian: 28 +Latin-American-2: 18 +South-Asian: 14 +Unknown: 6 +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Male: 488 +Female: 708 +Aneuploidy: 1 +Total: 1197 +============================== +Cohort: proactive-other +Latin-American-1: 32 +European: 252 +East-Asian: 29 +African: 1 +African-American: 20 +Latin-American-2: 17 +South-Asian: 5 +Asian-Pacific-Islander: 5 +Unknown: 3 +Other: 4 +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Female: 227 +Male: 140 +Aneuploidy: 1 +Total: 368 +============================== +Overall Statistics +East-Asian: 801 +Latin-American-1: 1016 +European: 11866 +African-American: 1484 +Latin-American-2: 488 +Other: 146 +Asian-Pacific-Islander: 122 +South-Asian: 179 +Unknown: 144 +African: 49 +- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Male: 8005 +Female: 8192 +Aneuploidy: 98 +Total: 16295 diff --git a/scripts/g2c_helpers/json_converter.py b/scripts/g2c_helpers/json_converter.py new file mode 100644 index 0000000..53f57a3 --- /dev/null +++ b/scripts/g2c_helpers/json_converter.py @@ -0,0 +1,96 @@ +# The Germline Genomics of Cancer (G2C) +# Copyright (c) 2023-Present, Noah Fields and the Dana-Farber Cancer Institute +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 + +# The purpose of this script is to convert between json +# code from terra, put it into human readable form (that can be edited if necessary), +# and then convert to a json file that is ideal for AllofUs workbench. +# If you are starting from the original terra json, use the flag '-t', +# otherwise run w/ no flags. +import json +import argparse +import os +import re + +# The goal of this function is to turn terra json +# files into a human readable json w/ the suffix .hread.json +# that can be easily edited. 
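+# For reference, a minimal sketch of the same round trip using only the
+# json stdlib (assumes the Terra export is already valid JSON with no
+# ${...} placeholders or stringified booleans/arrays -- the string
+# handling below exists precisely because Terra configs break those
+# assumptions; pretty_print_json is an illustrative name, not used here):
+#
+#   import json
+#   def pretty_print_json(in_path, out_path):
+#       with open(in_path) as fh:
+#           cfg = json.load(fh)           # parse the raw export
+#       with open(out_path, 'w') as fh:
+#           json.dump(cfg, fh, indent=4)  # one key per line, hand-editable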
+def terra_to_readable_json(input_file, output_file):
+    line = open(input_file).readline().strip()[1:-1]
+    line = line.replace('\":\"','\" : \"')
+    line = re.sub(r'\${(.*?)}', r'\1', line)
+    line = line.replace('\"true\"','true')
+    line = line.replace('\"false\"','false')
+    line = line.replace('\"[','[').replace(']\"',']')
+
+    arr = re.split(r',(?![^\[]*\])', line)
+    arr = [s for s in arr if '\"\"' not in s]
+    arr = [s for s in arr if '[]' not in s]
+
+    with open(output_file, 'w') as f:
+        f.write('{\n')
+        for i, val in enumerate(arr):
+            val = re.sub(r'"(\d+(\.\d+)?)",?', r'\1', val) # turns quoted numbers into floats or ints if applicable
+            val = val.replace('\\',"")
+            # compare by index, not value, so a duplicated value cannot
+            # drop the comma before the true last entry
+            if i != len(arr) - 1:
+                f.write(f"\t{val},\n")
+            else:
+                f.write(f"\t{val}\n")
+
+        f.write('}')
+
+    print(f"Output written to: {output_file}")
+
+# This function takes in an input file and
+# writes to the directory of the output_file.
+# This takes a human readable json file (normal json)
+# and converts to a json file used by AllofUs workbench.
+def readable_to_aou_json(input_file, output_file):
+    try:
+        # Open the input file for reading
+        with open(input_file, 'r') as input_file_handle:
+            # Read the content from the input file
+            content = input_file_handle.read()
+
+        # Remove '\n' and '\t' characters from the content
+        content_without_newline_tabs = content.replace('\n', '').replace('\t', '')
+
+        # Open the output file for writing
+        with open(output_file, 'w') as output_file_handle:
+            # Write the modified content to the output file
+            output_file_handle.write(content_without_newline_tabs)
+
+        print(f"Conversion successful. Output written to {output_file}")
+
+    except Exception as e:
+        print(f"Error: {e}")
+
+# Example usage:
+# readable_to_aou_json("input.txt", "output.txt")
+
+def main():
+    parser = argparse.ArgumentParser(description='Convert JSON file keys to separate lines.')
+    parser.add_argument('input_file', help='Input JSON file')
+    parser.add_argument('-t', '--terra', action='store_true', help='If present, run both functions')
+
+    args = parser.parse_args()
+
+    if args.terra:
+        terra_to_readable_json(args.input_file, args.input_file.replace('.json','.hread.json'))
+        readable_to_aou_json(args.input_file.replace('.json','.hread.json'), args.input_file.replace('.json','.out.json'))
+    else:
+        if '.hread.json' in args.input_file:
+            readable_to_aou_json(args.input_file, args.input_file.replace('.hread.json','.out.json'))
+        else:
+            readable_to_aou_json(args.input_file, args.input_file.replace('.json','.out.json'))
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/g2c_helpers/process_cohort.sh b/scripts/g2c_helpers/process_cohort.sh
new file mode 100644
index 0000000..6a1156b
--- /dev/null
+++ b/scripts/g2c_helpers/process_cohort.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+while getopts "c:" flag; do
+  case $flag in
+    c) cohort=$OPTARG ;;
+    *) echo "Usage: $0 -c <cohort>"; exit 1 ;;
+  esac
+done
+
+if [ -z "$cohort" ]; then
+  echo "Cohort name is required. Use -c flag."
+ exit 1 +fi + +# Download the sample list file and store it as an array +samples=() +while IFS= read -r sample; do + samples+=("$sample") +done < <(gsutil cat "gs://dfci-g2c-inputs/sample-lists/${cohort}.samples.list") +output_file="${cohort}_consolidated_data.tsv" +echo -e "Sample\tCohort\tgrafpop_SNPs\tgrafpop_GD1\tgrafpop_GD2\tgrafpop_GD3\tpct_EUR\tpct_AFR\tpct_ASN\tchrX_count\tchrY_count\tancestry\tHQ_HOM\tHQ_HOM_RATE\tHQ_HET\tHQ_HET_RATE\tCHARR\tMEAN_REF_AB_HOM_ALT\tHETEROZYGOSITY_RATE\tINCONSISTENT_AB_HET_RATE\trd_median\trd_mean\tmanta_DEL_count\tmanta_DUP_count\tmanta_INS_count\tmanta_INV_count\tmanta_BND_count\tmelt_INS_count\twham_DEL_count\twham_DUP_count" > $output_file +# Process each sample +for sample in "${samples[@]}"; do + # Get the string output from the specified file + charr_output=$(gsutil cat "gs://dfci-g2c-inputs/${cohort}/charr/${sample}.txt" | cut -f2-9 | tail -n 1) + + demographics_output=$(gsutil cat "gs://dfci-g2c-inputs/${cohort}/demographics/${sample}.txt" | cut -f2-5,7-9,13-15 | sed -n '2p') + rd_median=$(gsutil cat "gs://dfci-g2c-inputs/${cohort}/gatk-sv/metrics/${sample}.raw-counts.tsv" | grep 'rd_q50' | cut -f2) + rd_mean=$(gsutil cat "gs://dfci-g2c-inputs/${cohort}/gatk-sv/metrics/${sample}.raw-counts.tsv" | grep 'rd_mean' | cut -f2) + + manta_DEL_count=$(gsutil cat "gs://dfci-g2c-inputs/${cohort}/gatk-sv/metrics/manta_${sample}.vcf.tsv" | grep 'vcf_DEL_count' | cut -f2) + manta_DUP_count=$(gsutil cat "gs://dfci-g2c-inputs/${cohort}/gatk-sv/metrics/manta_${sample}.vcf.tsv" | grep 'vcf_DUP_count' | cut -f2) + manta_INS_count=$(gsutil cat "gs://dfci-g2c-inputs/${cohort}/gatk-sv/metrics/manta_${sample}.vcf.tsv" | grep 'vcf_INS_count' | cut -f2) + manta_INV_count=$(gsutil cat "gs://dfci-g2c-inputs/${cohort}/gatk-sv/metrics/manta_${sample}.vcf.tsv" | grep 'vcf_INV_count' | cut -f2) + manta_BND_count=$(gsutil cat "gs://dfci-g2c-inputs/${cohort}/gatk-sv/metrics/manta_${sample}.vcf.tsv" | grep 'vcf_BND_count' | cut -f2) + + melt_INS_count=$(gsutil cat "gs://dfci-g2c-inputs/${cohort}/gatk-sv/metrics/melt_${sample}.vcf.tsv" | grep 'vcf_INS_count' | cut -f2) + wham_DEL_count=$(gsutil cat "gs://dfci-g2c-inputs/${cohort}/gatk-sv/metrics/wham_${sample}.vcf.tsv" | grep 'vcf_DEL_count' |cut -f2) + wham_DUP_count=$(gsutil cat "gs://dfci-g2c-inputs/${cohort}/gatk-sv/metrics/wham_${sample}.vcf.tsv" | grep 'vcf_DUP_count' |cut -f2) + # Append the string output to the sample name and print to test_output.txt + echo -e "${sample}\t${cohort}\t${demographics_output}\t${charr_output}\t${rd_median}\t${rd_mean}\t${manta_DEL_count}\t${manta_DUP_count}\t${manta_INS_count}\t${manta_INV_count}\t${manta_BND_count}\t${melt_INS_count}\t${wham_DEL_count}\t${wham_DUP_count}" >> $output_file +done + +echo "Processing complete. Results saved in ${output_file}." + diff --git a/scripts/gsc_helpers/.DS_Store b/scripts/gsc_helpers/.DS_Store new file mode 100644 index 0000000..87d18f3 Binary files /dev/null and b/scripts/gsc_helpers/.DS_Store differ diff --git a/scripts/gsc_helpers/Standardize_Hmf_Germline_Vcfs.wdl b/scripts/gsc_helpers/Standardize_Hmf_Germline_Vcfs.wdl new file mode 100644 index 0000000..85d5e5d --- /dev/null +++ b/scripts/gsc_helpers/Standardize_Hmf_Germline_Vcfs.wdl @@ -0,0 +1,67 @@ +# The Germline Genomics of Cancer (G2C) +# Copyright (c) 2023-Present, Noah Fields and the Dana-Farber Cancer Institute +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 + +# This code will standardize a HMF vcf. 
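+# For local spot checks, the same standardization can be approximated with
+# a plain bcftools pipeline (a sketch, not part of the workflow; assumes
+# bcftools >= 1.13 so that +fill-tags supports FORMAT/VAF, and FORMAT/AD
+# is kept here because +fill-tags derives VAF from it):
+#
+#   bcftools annotate -x INFO in.vcf.gz -Ou \
+#     | bcftools norm -m - -Ou \
+#     | bcftools filter -i 'AC > 0 & FILTER == "PASS"' -Oz -o tmp.vcf.gz
+#   bcftools +fill-tags tmp.vcf.gz -Oz -o out.vcf.gz -- -t FORMAT/VAF
+#   bcftools index -t out.vcf.gz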
+ + +version 1.0 + +task standardize{ + input { + File unstandardized_vcf + String id + } + Int default_disk_size = ceil((2 * size(unstandardized_vcf, "GB")) + 10) + + command <<< + # Step 1: Remove annotations (INFO fields) + bcftools view -h ~{unstandardized_vcf} | tail -n 1 | cut -f10 > sample_name.txt + bcftools annotate -x FORMAT/AD,INFO ~{unstandardized_vcf} -o temp.vcf.gz + + # Step 2: Keep only samples listed in sample_name.txt + bcftools view -S sample_name.txt temp.vcf.gz -o filtered_samples.vcf.gz + + # Step 3: Normalize the VCF file + bcftools norm -m - filtered_samples.vcf.gz -o normalized.vcf.gz + + # Step 4: Filter variants where 'AC > 0' and the filter passes QC + bcftools filter -i 'AC > 0 & FILTER == "PASS"' normalized.vcf.gz -o allele_count.vcf.gz + + # Step 5: + bcftools +fill-tags allele_count.vcf.gz -o ~{id}.vcf.gz -- -t FORMAT/VAF + + # Step 6: Index the vcf + bcftools index -t -o ~{id}.vcf.gz.tbi ~{id}.vcf.gz + >>> + + runtime { + docker: "vanallenlab/bcftools" + disks: "local-disk " + default_disk_size + " HDD" + } + output{ + File vcf = "~{id}.vcf.gz" + File vcf_idx = "~{id}.vcf.gz.tbi" + } +} + + + + +# Define workflow +workflow standardize_hmf { + input{ + File unstandardized_vcf + String id + } + call standardize { + input: + id = id, + unstandardized_vcf = unstandardized_vcf + } + output{ + File standardized_vcf = standardize.vcf + File standardized_vcf_idx = standardize.vcf_idx + } +} \ No newline at end of file diff --git a/scripts/gsc_helpers/annotate_noncoding_snps.VALab_germline_somatic_table.py b/scripts/gsc_helpers/annotate_noncoding_snps.VALab_germline_somatic_table.py new file mode 100644 index 0000000..42d5208 --- /dev/null +++ b/scripts/gsc_helpers/annotate_noncoding_snps.VALab_germline_somatic_table.py @@ -0,0 +1,71 @@ +# Read nc_c.map and create a mapping of germline genes to list of (chr:pos, risk_allele) tuples +gene_map = {} +with open("nc_c.map", "r") as map_file: + for line in map_file: + parts = line.strip().split("\t") + if len(parts) == 3: + chrom_pos, risk_allele, gene = parts + if gene in gene_map: + gene_map[gene].append((chrom_pos, risk_allele)) + else: + gene_map[gene] = [(chrom_pos, risk_allele)] + +# Read VALab_germline_somatic.tsv, filter rows, and create output +output = [] +with open("VALab_germline_somatic.tsv", "r") as input_file: + # Skip header + next(input_file) + for line in input_file: + parts = line.strip().split("\t") + if len(parts) >= 6 and parts[2] == "noncoding" and parts[4] == "coding": + cancer_type, germline_gene, _, somatic_gene, _, criteria = parts + if germline_gene in gene_map: + chrom_pos_risk_alleles = gene_map[germline_gene] + for chrom_pos, risk_allele in chrom_pos_risk_alleles: + output.append(f"{cancer_type}\t{chrom_pos}-{risk_allele}\t{germline_gene}\tnoncoding\t{somatic_gene}\tcoding\t{criteria}") + + +# Read VALab_germline_somatic.tsv, filter rows +with open("VALab_germline_somatic.tsv", "r") as input_file: + # Skip header + next(input_file) + for line in input_file: + parts = line.strip().split("\t") + if len(parts) >= 6 and parts[2] == "noncoding" and parts[4] == "noncoding": + cancer_type, germline_gene, _, somatic_gene, _, criteria = parts + if germline_gene in gene_map: + chrom_pos_risk_alleles = gene_map[germline_gene] + for chrom_pos, risk_allele in chrom_pos_risk_alleles: + output.append(f"{cancer_type}\t{chrom_pos}-{risk_allele}\t{germline_gene}\tnoncoding\t{somatic_gene}\tnoncoding\t{criteria}") + +# Read VALab_germline_somatic.tsv, filter rows +with open("VALab_germline_somatic.tsv", "r") 
as input_file: + # Skip header + next(input_file) + for line in input_file: + parts = line.strip().split("\t") + if len(parts) >= 6 and parts[2] == "coding" and parts[4] == "coding": + cancer_type, germline_gene, _, somatic_gene, _, criteria = parts + if germline_gene in gene_map: + chrom_pos_risk_alleles = gene_map[germline_gene] + for chrom_pos, risk_allele in chrom_pos_risk_alleles: + output.append(f"{cancer_type}\t{chrom_pos}-{risk_allele}\t{germline_gene}\tcoding\t{somatic_gene}\tcoding\t{criteria}") + +# Read VALab_germline_somatic.tsv, filter rows +with open("VALab_germline_somatic.tsv", "r") as input_file: + # Skip header + next(input_file) + for line in input_file: + parts = line.strip().split("\t") + if len(parts) >= 6 and parts[2] == "coding" and parts[4] == "noncoding": + cancer_type, germline_gene, _, somatic_gene, _, criteria = parts + if germline_gene in gene_map: + chrom_pos_risk_alleles = gene_map[germline_gene] + for chrom_pos, risk_allele in chrom_pos_risk_alleles: + output.append(f"{cancer_type}\t{chrom_pos}-{risk_allele}\t{germline_gene}\tcoding\t{somatic_gene}\tnoncoding\t{criteria}") + +# Write output to a new file +with open("VALab_germline_somatic.noncoding.tsv", "w") as output_file: + output_file.write("cancer\tchr:pos-risk_allele\tgermline_gene\tgermline_context\tsomatic_gene\tsomatic_context\tcriteria\n") + for line in output: + output_file.write(line + "\n") diff --git a/scripts/gsc_helpers/annotate_table.sh b/scripts/gsc_helpers/annotate_table.sh new file mode 100644 index 0000000..75ed51c --- /dev/null +++ b/scripts/gsc_helpers/annotate_table.sh @@ -0,0 +1,72 @@ +#!/bin/bash +COSMIC_TABLE="/Users/noah/Desktop/DFCI_Data/gsi/data/COSMIC.CGC.10_26_23.tsv" +CELLCHAT_TABLE="/Users/noah/Desktop/DFCI_Data/gsi/data/cellchat_db_formatted.csv" +VALab_germline_somatic_table="/Users/noah/Desktop/DFCI_Data/gsi/data/VALab_germline_somatic.tsv" +output_file="/Users/noah/Desktop/DFCI_Data/gsi/data/VALab_germline_somatic.annotated.tsv" + +# Function to check ONC status from COSMIC.CGC.10_26_23.tsv +get_onc_status() { + local gene="$1" + local oncogene="$(grep -m 1 "^$gene" "$COSMIC_TABLE" | cut -d $'\t' -f 15)" + + if grep -q "oncogene" <<< "$oncogene" && grep -q "TSG" <<< "$oncogene"; then + echo "ONC-TSG" + elif grep -q "oncogene" <<< "$oncogene"; then + echo "ONC" + elif grep -q "TSG" <<< "$oncogene"; then + echo "TSG" + else + echo "UNK" + fi +} + +# Function to check Signaling Scenario from cellchat_db_formatted.csv +get_signaling_scenario() { + local gene="$1" + + local signaling_scenario="" + + if grep -q -F "$gene" <(cut -d ',' -f4 "$CELLCHAT_TABLE");then + signaling_scenario="Ligand" + elif grep -q -F "$gene" <(cut -d ',' -f5 "$CELLCHAT_TABLE");then + signaling_scenario="Receptor" + else + signaling_scenario="UNK" + fi + + echo "$signaling_scenario" +} + +get_mutation_type() { + local gene="$1" + + local mutation_type="$(grep -m 1 "^$gene" "$COSMIC_TABLE" | cut -d $'\t' -f16)" + + if test -z "$mutation_type";then + echo "UNK" + else + echo "$mutation_type" + fi +} + +echo -e "cancer\tgermline_gene\tgermline_context\tgermline_onc_status\tgermline_mutations\tgermline_signaling\tsomatic_gene\tsomatic_context\tsomatic_onc_status\tsomatic_mutations\tsomatic_signaling\tcriteria" > "$output_file" +# Main script +while IFS=$'\t' read -r cancer germline_gene germline_context somatic_gene somatic_context criteria; do + # Get ONC status for germline gene + germline_onc_status=$(get_onc_status "$germline_gene") + + # Get ONC status for somatic gene + 
somatic_onc_status=$(get_onc_status "$somatic_gene")
+
+    # Get Signaling Scenario
+    germline_signaling_scenario=$(get_signaling_scenario "$germline_gene")
+    somatic_signaling_scenario=$(get_signaling_scenario "$somatic_gene")
+
+    #Get Mutation Type
+    germline_mutation_type=$(get_mutation_type "$germline_gene")
+    somatic_mutation_type=$(get_mutation_type "$somatic_gene")
+
+    # Print the updated line
+    echo -e "$cancer\t$germline_gene\t$germline_context\t$germline_onc_status\t$germline_mutation_type\t$germline_signaling_scenario\t$somatic_gene\t$somatic_context\t$somatic_onc_status\t$somatic_mutation_type\t$somatic_signaling_scenario\t$criteria" >> "$output_file"
+done < <(tail -n +2 "$VALab_germline_somatic_table")
+
diff --git a/scripts/gsc_helpers/check_sorted.sh b/scripts/gsc_helpers/check_sorted.sh
new file mode 100644
index 0000000..7c5f1de
--- /dev/null
+++ b/scripts/gsc_helpers/check_sorted.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+
+#Check if a vcf file is sorted
+
+# Define function to check if variants are properly sorted
+check_sorted() {
+    local vcf_file="$1"
+
+    # Extract chromosome and position from VCF file
+    bcftools query -f '%CHROM\t%POS\n' "${vcf_file}" > chrom_pos.txt
+
+    # Check if variants are properly sorted: chromosome in natural (version)
+    # order, then position numerically; a plain 'sort -c' compares lexically
+    # and would misorder e.g. chr10 before chr2
+    if sort -c -k1,1V -k2,2n chrom_pos.txt; then
+        echo "VCF file is properly sorted."
+    else
+        echo "VCF file is not properly sorted."
+    fi
+
+    # Clean up temporary file
+    #rm chrom_pos.txt
+}
+
+# Main script
+if [[ $# -eq 0 ]]; then
+    echo "Usage: $0 <vcf_file>"
+    exit 1
+fi
+
+vcf_file="$1"
+check_sorted "${vcf_file}"
\ No newline at end of file
diff --git a/scripts/gsc_helpers/coding_genes.flipped.coordinates.bed.gz b/scripts/gsc_helpers/coding_genes.flipped.coordinates.bed.gz
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/gsc_helpers/coding_genes.list b/scripts/gsc_helpers/coding_genes.list
new file mode 100644
index 0000000..7f0e8e5
--- /dev/null
+++ b/scripts/gsc_helpers/coding_genes.list
@@ -0,0 +1,139 @@
+ABL2
+ACVR2A
+AKAP9
+AKT1
+APC
+ARHGEF10
+ARID1A
+ARID1B
+ASXL2
+ATM
+AXIN1
+B2M
+BAP1
+BARD1
+BAZ1A
+BMP6
+BMPR1A
+BRAF
+BRCA1
+BRCA2
+CASP8
+CASZ1
+CCNE1
+CD74
+CDH1
+CDKN1B
+CDKN2A
+CHD4
+CHEK2
+CLTC
+CREBBP
+CTNNB1
+CTNND1
+CUL3
+CUX1
+CYLD
+DDX5
+EGFR
+EIF3E
+EP300
+EPS15
+ERBB2
+ERBB3
+ERBB4
+ESR1
+ETV5
+FBLN2
+FBXW7
+FGFR1
+FGFR2
+FOXP1
+GATA3
+GNAS
+GRIN2A
+H3C2
+HIF1A
+HLA-A
+HRAS
+IDH1
+IKZF3
+INHBB
+ITGAV
+JAK2
+KEAP1
+KLF4
+KLF6
+KLK2
+KRAS
+LRP1B
+MAP2K4
+MDM4
+MET
+MLH1
+MSH2
+MSH6
+MYEOV
+MYNN
+NCOR1
+NF1
+NF2
+NOTCH1
+NOTCH2
+NRAS
+NRG1
+NTRK1
+NUP214
+PALB2
+PARP4
+PCBP1
+PER3
+PHLDA3
+PIK3R1
+PLCG1
+POLD1
+PPP1R18
+PPP2R1A
+PRDM1
+PTEN
+RAD17
+RAD21
+RAF1
+RASA1
+RB1
+RBFOX1
+RHOA
+RNF213
+RNF43
+RUNX1T1
+SDC4
+SH2B3
+SMAD2
+SMAD3
+SMAD4
+SMARCA4
+SMARCD1
+SOX9
+SPEN
+SPOP
+TBX3
+TCF7L2
+TFRC
+TGFBR2
+TGIF1
+TMPRSS2
+TNXB
+TP53
+TRRAP
+TSC1
+TSC2
+U2AF1
+UBR5
+VHL
+VTI1A
+XPO1
+ZBTB16
+ZBTB42
+ZFHX3
+ZNRF3
+germline_gene
diff --git a/scripts/gsc_helpers/creating_gene_regions_of_interest_hg19.sh b/scripts/gsc_helpers/creating_gene_regions_of_interest_hg19.sh
new file mode 100644
index 0000000..74775c4
--- /dev/null
+++ b/scripts/gsc_helpers/creating_gene_regions_of_interest_hg19.sh
@@ -0,0 +1,27 @@
+# Example code for creating gene regions of interest and subsetting AoU VCF
+
+
+# The below can all be run on your local machine from the terminal
+
+# Download GENCODE GTF (GRCh37 mapping):
+wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_45/GRCh37_mapping/gencode.v45lift37.annotation.gtf.gz
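+# Quick sanity checks for the merged BED built below (a sketch; file names
+# match the outputs this script generates):
+#
+#   zcat coding_genes.flipped.coordinates.bed.gz | wc -l
+#   zcat coding_genes.flipped.coordinates.bed.gz \
+#     | awk -F'\t' '{ bp += $3 - $2 } END { print bp, "bp total" }'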
+gsc_table="/Users/noah/Desktop/DFCI_Data/gsi/data/VALab_germline_somatic.flipped.tsv"
+#cut -f2,3 "$gsc_table" | grep -v 'noncoding' | cut -f1 | sort | uniq > coding_genes.list
+# strip the header row before collecting gene symbols, as in the hg38 script
+cut -f2 "$gsc_table" | tail -n +2 | sort | uniq > coding_genes.list
+
+# Subset GENCODE to genes of interest
+zcat gencode.v45lift37.annotation.gtf.gz \
+| fgrep -wf coding_genes.list \
+| bgzip -c \
+> gencode.v45lift37.annotation.subsetted.gtf.gz
+
+# Extract coordinates from subsetted GTF, pad by ±2kb to be conservative, and use BEDTools to merge these coordinates into unique regions
+zcat gencode.v45lift37.annotation.subsetted.gtf.gz \
+| awk -v FS="\t" -v OFS="\t" -v buffer=2000 '{ print $1, $4-buffer, $5+buffer }' \
+| sort -Vk1,1 -k2,2n -k3,3n \
+| bedtools merge -i - \
+| bgzip -c \
+> coding_genes.flipped.coordinates.bed.gz
+
+rm coding_genes.list
+
diff --git a/scripts/gsc_helpers/creating_gene_regions_of_interest_hg38.sh b/scripts/gsc_helpers/creating_gene_regions_of_interest_hg38.sh
new file mode 100644
index 0000000..2d03e9c
--- /dev/null
+++ b/scripts/gsc_helpers/creating_gene_regions_of_interest_hg38.sh
@@ -0,0 +1,40 @@
+# Example code for creating gene regions of interest and subsetting AoU VCF
+
+# Given the following input files:
+# 1. list (flat .txt file) of gene symbols of relevance: riaz_genes.list
+# 2. list (flat .txt file) of sample IDs to retain: keep_samples.list
+# 3. VCF of all variants from AoU: aou.vcf.gz (must also have .tbi index present locally)
+
+# The below can all be run on your local machine from the terminal
+
+# Download MANE GTF:
+#wget https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.3/MANE.GRCh38.v1.3.ensembl_genomic.gtf.gz
+
+input_table="VALab_germline_somatic.tsv"
+prefix="gsc"
+cut -f2 $input_table | tail -n +2 | sort | uniq > "${prefix}_genes.list"
+
+# Subset MANE to genes of interest
+zcat MANE.GRCh38.v1.3.ensembl_genomic.gtf.gz \
+| fgrep -wf "${prefix}_genes.list" \
+> MANE.GRCh38.v1.3.ensembl_genomic.subsetted.gtf
+
+# Extract coordinates from subsetted GTF, pad by ±2kb to be conservative, and use BEDTools to merge these coordinates into unique regions
+cat MANE.GRCh38.v1.3.ensembl_genomic.subsetted.gtf \
+| awk -v FS="\t" -v OFS="\t" -v buffer=2000 '{ print $1, $4-buffer, $5+buffer }' \
+| sort -Vk1,1 -k2,2n -k3,3n \
+| bedtools merge -i - \
+> "${prefix}_genes.coordinates.bed"
+
+
+# Subset variants to minimal set of interest for our purposes
+# Note that this must be run in the AoU workspace terminal, not on your local machine
+# bcftools view \
+#   -O z -o aou.subsetted.vcf.gz \
+#   --regions-file riaz_genes.coordinates.bed.gz \
+#   --samples-file keep_samples.list \
+#   --min-ac 1 \
+#   aou.vcf.gz
+
+# Index output VCF for future queries
+# tabix -p vcf -f aou.subsetted.vcf.gz
\ No newline at end of file
diff --git a/scripts/gsc_helpers/get_felix_somatic_snps.sh b/scripts/gsc_helpers/get_felix_somatic_snps.sh
new file mode 100644
index 0000000..94fccd1
--- /dev/null
+++ b/scripts/gsc_helpers/get_felix_somatic_snps.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+# Download the zip file
+#wget https://www.science.org/doi/suppl/10.1126/science.abg5601/suppl_file/science.abg5601_tables_s2_to_s23.zip
+
+# Unzip the file
+#unzip science.abg5601_tables_s2_to_s23.zip
+#rm combined_data.tsv combined_data.tmp
+
+#get somatic noncoding portions
+gsc_table="/Users/noah/Desktop/DFCI_Data/gsi/data/VALab_germline_somatic.tsv"
+cut -f1,4,5 "$gsc_table" | grep 'noncoding' | cut -f2 > somatic_noncoding.list
+
+# Define the Python script to extract sheet names and process each sheet
+read -r -d 
'' PYTHON_SCRIPT <<'EOF' +import pandas as pd + +# Load the Excel file +excel_file = pd.ExcelFile('science.abg5601_tables_s2_to_s23/science.abg5601_tables_s2_to_s23.xlsx') + +# Loop through each sheet in the Excel file +for sheet_name in excel_file.sheet_names: + cancer_type = sheet_name.split()[-1] + cancer_type_set = set(['Breast','Colorectal','Prostate','Lung','Kidney']) + if cancer_type not in cancer_type_set: + continue + # Read the data from the sheet into a DataFrame + df = pd.read_excel(excel_file, sheet_name=sheet_name) + # Export the data to a TSV file + df.to_csv(f'{sheet_name.split()[-1]}.tsv', sep='\t', index=False) + #print(sheet_name.split()[-1]) +EOF + +# Execute the Python script +python3 -c "$PYTHON_SCRIPT" + +# Combine all TSV files into one +cat *.tsv | grep -E 'Breast|Colorectal|Prostate|Kidney|Lung' \ +| grep -v 'Table' \ +| awk -F'\t' '$3 != "coding"' \ +| cut -f1,2,7 | awk -F'\t' -v OFS='\t' '{ split($3, chr, "-"); print chr[1],chr[2], $1, $2 }' \ +| awk -F'\t' -v OFS='\t' '{ split($1, a, ":"); print a[1],a[2], $2, $3, $4 }' \ +| grep -Fwf somatic_noncoding.list | sort -n > combined_data.tmp + +#rm somatic_noncoding.list + +# Remove individual TSV files +rm Breast.tsv Prostate.tsv Lung.tsv Kidney.tsv Colorectal.tsv + +mv combined_data.tmp gsc_noncoding_snps_hg38.bed + diff --git a/scripts/gsc_helpers/noncoding_coding/combine_snps.sh b/scripts/gsc_helpers/noncoding_coding/combine_snps.sh new file mode 100644 index 0000000..9f159b9 --- /dev/null +++ b/scripts/gsc_helpers/noncoding_coding/combine_snps.sh @@ -0,0 +1,37 @@ +ref_table="../data/VALab_germline_somatic.tsv" + + +#Breast Section +grep 'breast' "$ref_table" | cut -f2,3 | grep 'noncoding' | cut -f1 > breast_noncoding_germline.tmp.list +cut -f3,10,15,18 breast_gwas_catalog_sig_filtered.annotated.tsv | tail -n +2 | grep -v 'coding_exon' | grep -Fwf breast_noncoding_germline.tmp.list | awk '{print $0 "\tbreast"}' > breast_noncoding_snps.tmp.tsv + +#Colorectal Section +grep 'colorectal' "$ref_table" | cut -f2,3 | grep 'noncoding' | cut -f1 > colorectal_noncoding_germline.tmp.list +cut -f2,9,14,17 colorectal_gwas_catalog_sig_filtered.annotated.tsv | tail -n +2 | grep -v 'coding_exon' | grep -Fwf colorectal_noncoding_germline.tmp.list | awk '{print $0 "\tcolorectal"}' > colorectal_noncoding_snps.tmp.tsv + +#Lung Section +grep 'lung' "$ref_table" | cut -f2,3 | grep 'noncoding' | cut -f1 > lung_noncoding_germline.tmp.list +cut -f2,9,14,17 lung_gwas_catalog_sig_filtered.annotated.tsv | tail -n +2 | grep -v 'coding_exon' | grep -Fwf lung_noncoding_germline.tmp.list | awk '{print $0 "\tlung"}' > lung_noncoding_snps.tmp.tsv + +#Kidney Section +grep 'renal' "$ref_table" | cut -f2,3 | grep 'noncoding' | cut -f1 > kidney_noncoding_germline.tmp.list +cut -f2,9,14,17 renal_gwas_catalog_sig_filtered.annotated.tsv | tail -n +2 | grep -v 'coding_exon' | grep -Fwf kidney_noncoding_germline.tmp.list | awk '{print $0 "\trenal"}' > kidney_noncoding_snps.tmp.tsv + +#Prostate Section +grep 'prostate' "$ref_table" | cut -f2,3 | grep 'noncoding' | cut -f1 > prostate_noncoding_germline.tmp.list +cut -f2,9,14,17 prostate_gwas_catalog_sig_filtered.annotated.tsv | tail -n +2 | grep -v 'coding_exon' | grep -Fwf prostate_noncoding_germline.tmp.list | awk '{print $0 "\tprostate"}' > prostate_noncoding_snps.tmp.tsv + +#New Prostate Section +cut -f5,11,26,27 new_pca_only_gwas_catalog_sig_filtered.annotated.tsv | tail -n +2 | grep -v 'coding_exon' | grep -Fwf prostate_noncoding_germline.tmp.list | awk '{print $2 "\t" $1 "\t" $3 "\t" $4 
"\tprostate"}' > new_prostate_noncoding_snps.tmp.tsv + +#combine all files +cat breast_noncoding_snps.tmp.tsv colorectal_noncoding_snps.tmp.tsv lung_noncoding_snps.tmp.tsv kidney_noncoding_snps.tmp.tsv prostate_noncoding_snps.tmp.tsv new_prostate_noncoding_snps.tmp.tsv | grep -v ',' | awk 'BEGIN {FS="\t"; OFS="\t"} {split($1, a, "-"); print $3 "\t" $2 "\t" a[2] "\t" $5}' | awk 'BEGIN {FS="\t"; OFS="\t"} {split($1, a, ":"); print "chr"a[1] "\t" a[2]-1 "\t" a[2] "\t" $2 "\t" $3 "\t" $4 "\t" $5}' | sort | uniq > pancan_gwas_catalog_sig_filtered.tmp.tsv + +grep 'HLA-' pancan_gwas_catalog_sig_filtered.tmp.tsv > HLA.tmp.tsv +grep -v '-' pancan_gwas_catalog_sig_filtered.tmp.tsv > pancan_gwas_catalog_sig_filtered.tmp2.tsv +cat HLA.tmp.tsv >> pancan_gwas_catalog_sig_filtered.tmp2.tsv + +sort -V pancan_gwas_catalog_sig_filtered.tmp2.tsv | uniq > pancan_gwas_catalog_sig_filtered.tsv + +#remove all tmp files +rm *tmp* diff --git a/scripts/gsc_helpers/noncoding_coding/create_tables.sh b/scripts/gsc_helpers/noncoding_coding/create_tables.sh new file mode 100644 index 0000000..30e64b0 --- /dev/null +++ b/scripts/gsc_helpers/noncoding_coding/create_tables.sh @@ -0,0 +1,74 @@ +# This file is meant to organize for logistic regression for noncoding-coding convergences +germline_somatic_file=$1 +non_coding_chr=$2 +non_coding_pos=$3 +somatic_gene=$4 +final_output_file="${non_coding_chr}.${non_coding_pos}.${somatic_gene}.tsv" +# Here is where we compile lists +grep "$somatic_gene" "$germline_somatic_file" | grep "$non_coding_chr $non_coding_pos" | grep 'HET' | cut -f1 | sort | uniq> apc_patients_het.list + +grep "$somatic_gene" "$germline_somatic_file" | grep "$non_coding_chr $non_coding_pos" | grep 'HOM' | cut -f1 | sort | uniq> apc_patients_risk_hom.list +cut -f1 "$germline_somatic_file" | sort | uniq > patients.list + +grep "$somatic_gene" "$germline_somatic_file" | cut -f1 | sort | uniq > apc_patients.list +grep -vFwf apc_patients_het.list "$germline_somatic_file" | grep -vFwf apc_patients_risk_hom.list | cut -f1 | sort | uniq > apc_patients_safe_hom.list +# Define input files +patients_file="patients.list" +apc_patients_file="apc_patients_risk_hom.list" +output_file="risk_hom_status.list" +tmp_file=$(mktemp) +# Loop through patients.list and check if each patient ID is in apc_patients_file + +# Create an array of patient IDs from apc_patients_file +while read -r patient_id; do + if grep -w "^$patient_id$" "$apc_patients_file" > "$tmp_file"; then + echo -e "1" >> "$output_file" # Patient is in apc_patients_file, write "1" + else + echo -e "0" >> "$output_file" # Patient is not in apc_patients_file, write "0" + fi +done < "$patients_file" + +# Define input files +apc_patients_file="apc_patients_het.list" +output_file="het_status.list" + +# Create an array of patient IDs from apc_patients_file +while read -r patient_id; do + if grep -w "^$patient_id$" "$apc_patients_file" > "$tmp_file"; then + echo -e "1" >> "$output_file" # Patient is in apc_patients_file, write "1" + else + echo -e "0" >> "$output_file" # Patient is not in apc_patients_file, write "0" + fi +done < "$patients_file" + + +# Define input files +apc_patients_file="apc_patients_safe_hom.list" +output_file="safe_hom_status.list" + +# Create an array of patient IDs from apc_patients_file +while read -r patient_id; do + if grep -w "^$patient_id$" "$apc_patients_file" > "$tmp_file"; then + echo -e "1" >> "$output_file" # Patient is in apc_patients_file, write "1" + else + echo -e "0" >> "$output_file" # Patient is not in apc_patients_file, write "0" 
+    fi
+done < "$patients_file"
+
+# Define input files
+apc_patients_file="apc_patients.list"
+output_file="driver.list"
+
+# Create an array of patient IDs from apc_patients_file
+while read -r patient_id; do
+    if grep -w "^$patient_id$" "$apc_patients_file" > "$tmp_file"; then
+        echo -e "1" >> "$output_file" # Patient is in apc_patients_file, write "1"
+    else
+        echo -e "0" >> "$output_file" # Patient is not in apc_patients_file, write "0"
+    fi
+done < "$patients_file"
+
+# Header is tab-separated to match the tab-delimited columns paste emits below
+echo -e "patients\trisk_hom\thet\tsafe_hom\tdriver_status" > "$final_output_file"
+paste patients.list risk_hom_status.list het_status.list safe_hom_status.list driver.list >> "$final_output_file"
+rm patients.list risk_hom_status.list het_status.list safe_hom_status.list driver.list
+rm apc_patients.list apc_patients_safe_hom.list apc_patients_het.list apc_patients_risk_hom.list
+rm "$tmp_file"
diff --git a/scripts/gsc_helpers/noncoding_coding/logistic_regression.py b/scripts/gsc_helpers/noncoding_coding/logistic_regression.py
new file mode 100644
index 0000000..e3c3e29
--- /dev/null
+++ b/scripts/gsc_helpers/noncoding_coding/logistic_regression.py
@@ -0,0 +1,44 @@
+import sys
+import numpy as np
+import pandas as pd
+from sklearn.linear_model import LogisticRegression
+
+def main(filename):
+    # Read the TSV file into a pandas DataFrame
+    df = pd.read_csv(filename, sep='\t')
+
+    # Define the features (X) and target (y)
+    X = df[['risk_hom', 'het', 'safe_hom']]
+    y = df['driver_status']
+
+    # Create the logistic regression model
+    model = LogisticRegression()
+
+    # Fit the model to the data
+    model.fit(X, y)
+
+    # Get the coefficients of the model
+    coefficients = model.coef_[0]
+
+    # Calculate the odds ratios (OR)
+    odds_ratios = np.exp(coefficients)
+
+    # Create a DataFrame to display results
+    results = pd.DataFrame({
+        'Feature': X.columns,
+        'Coefficient': coefficients,
+        'Odds Ratio': odds_ratios
+    })
+
+    # Report the fitted coefficients and odds ratios
+    print(results.to_string(index=False))
+
+if __name__ == "__main__":
+    # Check if the filename argument is provided
+    if len(sys.argv) < 2:
+        print("Usage: python logistic_regression.py <filename>")
+        sys.exit(1)
+
+    # Get the filename from command line argument
+    filename = sys.argv[1]
+
+    # Call the main function with the filename argument
+    main(filename)
diff --git a/scripts/gsc_helpers/vcf_to_tsv.py b/scripts/gsc_helpers/vcf_to_tsv.py
new file mode 100644
index 0000000..5a8b0f4
--- /dev/null
+++ b/scripts/gsc_helpers/vcf_to_tsv.py
@@ -0,0 +1,91 @@
+import pandas as pd
+import numpy as np
+
+# Function to parse VCF and create DataFrame
+def vcf_to_tsv(vcf_file, risk_snp_dict):
+    # Initialize lists and dictionaries
+    data = {}
+    sample_ids = []
+    variants = []
+
+    # Open VCF file and read line by line
+    with open(vcf_file, 'r') as f:
+        for line in f:
+            line = line.strip()
+
+            # Skip header lines starting with ##
+            if line.startswith('##'):
+                continue
+
+            # Process header line starting with #CHROM
+            elif line.startswith('#CHROM'):
+                fields = line.split('\t')
+                sample_ids = fields[9:]  # Extract sample IDs from header
+                continue
+
+            # Process variant data lines
+            fields = line.split('\t')
+            chrom = fields[0]
+            pos = fields[1]
+            ref = fields[3]
+            alt = fields[4]
+            allele = risk_snp_dict[f"{chrom}:{pos}"]
+            key = f"{chrom}:{pos}-{allele}"
+            variants.append(key)
+
+            # Process genotype information for each sample
+            for i, sample_id in enumerate(sample_ids):
+                genotype_info = fields[i + 9]  # Start index for sample genotype info
+                alleles = genotype_info.split(':')[0].split('/')
+                # Check for missing calls before converting allele codes to
+                # ints, so './.', './1', and '1/.' all become NaN
+                if '.' in alleles:
+                    value = np.nan
+                elif allele == ref:
+                    value = 2 - (int(alleles[0]) + int(alleles[1]))
+                elif allele == alt:
+                    value = int(alleles[0]) + int(alleles[1])
+                else:
+                    value = 0
+
+                if sample_id not in data:
+                    data[sample_id] = {}
+
+                data[sample_id][key] = value
+
+    # Create DataFrame
+    df = pd.DataFrame.from_dict(data, orient='index')
+    df.index.name = 'sample_id'
+    df.columns = variants
+
+    return df
+
+def tsv_to_dict(tsv_file):
+    data_dict = {}
+    with open(tsv_file, 'r') as f:
+        for line in f:
+            if line.startswith('#'):  # Skip header lines if any
+                continue
+            fields = line.strip().split('\t')
+            key = f"{fields[0]}:{fields[2]}"
+            value = fields[3]
+            data_dict[key] = value
+    return data_dict
+
+# Example usage
+vcf_file = 'PROFILE.noncoding.vcf'
+bed_file = 'noncoding_vars.hg19.bed'
+output_tsv = 'output.tsv'
+
+risk_snp_dict = tsv_to_dict(bed_file)
+df = vcf_to_tsv(vcf_file, risk_snp_dict)
+df.to_csv(output_tsv, sep='\t', na_rep='NaN')
diff --git a/scripts/ufc_helpers/.DS_Store b/scripts/ufc_helpers/.DS_Store
new file mode 100644
index 0000000..5008ddf
Binary files /dev/null and b/scripts/ufc_helpers/.DS_Store differ
diff --git a/scripts/ufc_helpers/.Rhistory b/scripts/ufc_helpers/.Rhistory
new file mode 100644
index 0000000..95edd18
--- /dev/null
+++ b/scripts/ufc_helpers/.Rhistory
@@ -0,0 +1,512 @@
+proband = read.delim("/Users/noah/Desktop/proband.counts.tsv")
+proband = read.delim("/Users/noah/Desktop/0450-B-1.counts.tsv")
+df <- read.table("/Users/noah/Desktop/0450-B-1.counts.tsv", header = FALSE, sep = "\t", col.names = c("Chromosome", "V2", "V3", "V4", "V5", "V6"), fill = TRUE)
+df2 <- read.table("/Users/noah/Desktop/proband.counts.tsv", header = FALSE, sep = "\t", col.names = c("Chromosome", "V2", "V3", "V4", "V5", "V6"), fill = TRUE)
+df2 <- read.table("/Users/noah/Desktop/proband.counts.tsv", header = FALSE, sep = "\t", col.names = c("Chromosome", "V2", "V3", "V4", "V5", "V6","V7"), fill = TRUE)
+View(df)
+View(df2)
+df <- read.table("/Users/noah/Desktop/0450-B-1.counts.tsv", header = FALSE, sep = "\t", col.names = c("Chromosome", "V2", "V3", "V4", "V5", "V6","V7"), fill = TRUE)
+View(df)
+View(df)
+pwd
+estimate_chromosomes()
+library(dplyr)
+estimate_chromosomes_processing <- function(file_path) {
+# Create an R data frame with chromosome numbers and lengths
+chromosome_data <- data.frame(
+Chromosome = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10",
+"chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20",
+"chr21", "chr22", "chrX", "chrY"),
+Length = c(248956422, 242193529, 198295559, 190214555, 181538259, 170805979, 159345973, 145138636,
+138394717, 133797422, 135086622, 133275309, 114364328, 107043718, 101991189, 90338345,
+81282511, 78077248, 59128983, 63025520, 48129895, 50735781, 156040895, 57227415)
+)
+# Read the TSV file into a data frame
+df <- read.table(file_path, header = FALSE, sep = "\t", col.names = c("Chromosome", "V2", "V3", "V4", "V5", "V6","V7"), fill = TRUE)
+# Keep only the first four columns
+df <- df[, 1:4]
+# Find the row index where the specified header pattern is located
+header_row <- grep("CONTIG", df$Chromosome)
+# Omit rows before the header row (if header is found)
+df2 <- df[(header_row +
1):nrow(df), ] +# Group by V1 and calculate the sum of V4 for each group +result_df <- df2 %>% +group_by(Chromosome) %>% +summarize(Sum_V4 = sum(as.numeric(V4))) +# Merge the two data frames based on the "Chromosome" column +merged_df <- merge(result_df, chromosome_data, by = "Chromosome") +# Calculate normalized read coverage +merged_df$norm_read_cov <- merged_df$Sum_V4 / merged_df$Length +# Return the merged data frame with normalized read coverage +return(merged_df) +} +estimate_chromosomes <- function(df) { +# Extract 'norm_read_cov' values for autosomes and sex chromosomes +autosomal_cov <- df$norm_read_cov[!df$Chromosome %in% c("chrX", "chrY")] +chrX_cov <- df$norm_read_cov[df$Chromosome == "chrX"] +chrY_cov <- df$norm_read_cov[df$Chromosome == "chrY"] +# Calculate mean of autosomal counts +autosomal_mean <- mean(autosomal_cov) +# Define values to compare proximity +zero_value <- 0 +one_copy_value <- autosomal_mean / 2 +two_copies_value <- autosomal_mean +three_copies_value <- 1.5 * autosomal_mean +# Find the closest value for chrX_cov +closest_chrX <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrX_cov)) +# Find the closest value for chrY_cov +closest_chrY <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrY_cov)) +# Return the estimates in the specified format +return(paste(closest_chrX - 1, closest_chrY - 1, sep = "-")) +} +# Extract the file path from the command-line arguments +file_path <- commandArgs(trailingOnly = TRUE)[1] +# Example usage +result_df <- estimate_chromosomes_processing(file_path) +estimate_chromosomes_processing("/Users/noah/Desktop/proband.counts.tsv") +result_df = estimate_chromosomes_processing("/Users/noah/Desktop/proband.counts.tsv") +result = estimate_chromosomes(result_df) +result_df = estimate_chromosomes_processing("/Users/noah/Desktop/0450-B-1.counts.tsv") +result = estimate_chromosomes(result_df) +df <- read.table("/Users/noah/Desktop/proband.short.counts.tsv", header = FALSE, sep = "\t", col.names = c("Chromosome", "V2", "V3", "V4", "V5", "V6","V7"), fill = TRUE) +View(df) +df <- df[, 1:4] +header_row <- grep("CONTIG", df$Chromosome) +df2 <- df[(header_row + 1):nrow(df), ] +result_df <- df2 %>% +group_by(Chromosome) %>% +summarize(Sum_V4 = sum(as.numeric(V4))) +library(dplyr) +result_df <- df2 %>% +group_by(Chromosome) %>% +summarize(Sum_V4 = sum(as.numeric(V4))) +merged_df <- merge(result_df, chromosome_data, by = "Chromosome") +chromosome_data <- data.frame( +Chromosome = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", +"chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", +"chr21", "chr22", "chrX", "chrY"), +Length = c(248956422, 242193529, 198295559, 190214555, 181538259, 170805979, 159345973, 145138636, +138394717, 133797422, 135086622, 133275309, 114364328, 107043718, 101991189, 90338345, +81282511, 78077248, 59128983, 63025520, 48129895, 50735781, 156040895, 57227415) +) +merged_df <- merge(result_df, chromosome_data, by = "Chromosome") +merged_df$norm_read_cov <- merged_df$Sum_V4 / merged_df$Length +View(merged_df) +View(df) +rm(list=ls()) +cat("hello") +library(dplyr, quietly = TRUE) +estimate_chromosomes_processing <- function(file_path) { +# Create an R data frame with chromosome numbers and lengths +chromosome_data <- data.frame( +Chromosome = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", +"chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", 
"chr18", "chr19", "chr20", +"chr21", "chr22", "chrX", "chrY"), +Length = c(248956422, 242193529, 198295559, 190214555, 181538259, 170805979, 159345973, 145138636, +138394717, 133797422, 135086622, 133275309, 114364328, 107043718, 101991189, 90338345, +81282511, 78077248, 59128983, 63025520, 48129895, 50735781, 156040895, 57227415) +) +# Read the TSV file into a data frame +df <- read.table(file_path, header = FALSE, sep = "\t", col.names = c("Chromosome", "V2", "V3", "V4","V5","V6","V7"), fill = TRUE) +# Keep only the first four columns +df <- df[, 1:4] +# Find the row index where the specified header pattern is located +header_row <- grep("CONTIG", df$Chromosome) +# Omit rows before the header row (if header is found) +df2 <- df[(header_row + 1):nrow(df), ] +# Group by V1 and calculate the sum of V4 for each group +result_df <- df2 %>% +group_by(Chromosome) %>% +summarize(Sum_V4 = sum(as.numeric(V4))) +# Merge the two data frames based on the "Chromosome" column +merged_df <- merge(result_df, chromosome_data, by = "Chromosome") +# Calculate normalized read coverage +merged_df$norm_read_cov <- merged_df$Sum_V4 / merged_df$Length +# Return the merged data frame with normalized read coverage +return(merged_df) +} +estimate_chromosomes <- function(df) { +# Extract 'norm_read_cov' values for autosomes and sex chromosomes +autosomal_cov <- df$norm_read_cov[!df$Chromosome %in% c("chrX", "chrY")] +chrX_cov <- df$norm_read_cov[df$Chromosome == "chrX"] +chrY_cov <- df$norm_read_cov[df$Chromosome == "chrY"] +cat("hi") +# Calculate mean of autosomal counts +autosomal_mean <- mean(autosomal_cov) +# Define values to compare proximity +zero_value <- 0 +one_copy_value <- autosomal_mean / 2 +two_copies_value <- autosomal_mean +three_copies_value <- 1.5 * autosomal_mean +# Find the closest value for chrX_cov +closest_chrX <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrX_cov)) +# Find the closest value for chrY_cov +closest_chrY <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrY_cov)) +# Return the estimates in the specified format +return(paste(closest_chrX - 1, closest_chrY - 1, sep = "-")) +} +# Extract the file path from the command-line arguments +#file_path <- commandArgs(trailingOnly = TRUE)[1] +file_path = "/Users/noah/Desktop/proband.short.counts.tsv" +# Example usage +result_df <- estimate_chromosomes_processing(file_path) +result <- estimate_chromosomes(result_df) +cat(result) +cat("Hi") +library(dplyr, quietly = TRUE) +estimate_chromosomes_processing <- function(file_path) { +# Create an R data frame with chromosome numbers and lengths +chromosome_data <- data.frame( +Chromosome = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", +"chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", +"chr21", "chr22", "chrX", "chrY"), +Length = c(248956422, 242193529, 198295559, 190214555, 181538259, 170805979, 159345973, 145138636, +138394717, 133797422, 135086622, 133275309, 114364328, 107043718, 101991189, 90338345, +81282511, 78077248, 59128983, 63025520, 48129895, 50735781, 156040895, 57227415) +) +# Read the TSV file into a data frame +df <- read.table(file_path, header = FALSE, sep = "\t", col.names = c("Chromosome", "V2", "V3", "V4","V5","V6","V7"), fill = TRUE) +# Keep only the first four columns +df <- df[, 1:4] +# Find the row index where the specified header pattern is located +header_row <- grep("CONTIG", df$Chromosome) +# Omit rows before the 
header row (if header is found) +df2 <- df[(header_row + 1):nrow(df), ] +# Group by V1 and calculate the sum of V4 for each group +result_df <- df2 %>% +group_by(Chromosome) %>% +summarize(Sum_V4 = sum(as.numeric(V4))) +# Merge the two data frames based on the "Chromosome" column +merged_df <- merge(result_df, chromosome_data, by = "Chromosome") +# Calculate normalized read coverage +merged_df$norm_read_cov <- merged_df$Sum_V4 / merged_df$Length +# Return the merged data frame with normalized read coverage +return(merged_df) +} +estimate_chromosomes <- function(df) { +# Extract 'norm_read_cov' values for autosomes and sex chromosomes +autosomal_cov <- df$norm_read_cov[!df$Chromosome %in% c("chrX", "chrY")] +chrX_cov <- df$norm_read_cov[df$Chromosome == "chrX"] +chrY_cov <- df$norm_read_cov[df$Chromosome == "chrY"] +# Calculate mean of autosomal counts +autosomal_mean <- mean(autosomal_cov) +# Define values to compare proximity +zero_value <- 0 +one_copy_value <- autosomal_mean / 2 +two_copies_value <- autosomal_mean +three_copies_value <- 1.5 * autosomal_mean +# Find the closest value for chrX_cov +closest_chrX <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrX_cov)) +# Find the closest value for chrY_cov +closest_chrY <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrY_cov)) +# Return the estimates in the specified format +return(paste(closest_chrX - 1, closest_chrY - 1, sep = "-")) +} +# Extract the file path from the command-line arguments +#file_path <- commandArgs(trailingOnly = TRUE)[1] +file_path = "/Users/noah/Desktop/0450-B-1.short.counts.tsv" +# Example usage +result_df <- estimate_chromosomes_processing(file_path) +result <- estimate_chromosomes(result_df) +cat(result) +cat("Hi") +library(dplyr, quietly = TRUE) +estimate_chromosomes_processing <- function(file_path) { +# Create an R data frame with chromosome numbers and lengths +chromosome_data <- data.frame( +Chromosome = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", +"chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", +"chr21", "chr22", "chrX", "chrY"), +Length = c(248956422, 242193529, 198295559, 190214555, 181538259, 170805979, 159345973, 145138636, +138394717, 133797422, 135086622, 133275309, 114364328, 107043718, 101991189, 90338345, +81282511, 78077248, 59128983, 63025520, 48129895, 50735781, 156040895, 57227415) +) +# Read the TSV file into a data frame +df <- read.table(file_path, header = FALSE, sep = "\t", col.names = c("Chromosome", "V2", "V3", "V4","V5","V6","V7"), fill = TRUE) +# Keep only the first four columns +df <- df[, 1:4] +# Find the row index where the specified header pattern is located +header_row <- grep("CONTIG", df$Chromosome) +# Omit rows before the header row (if header is found) +df2 <- df[(header_row + 1):nrow(df), ] +# Group by V1 and calculate the sum of V4 for each group +result_df <- df2 %>% +group_by(Chromosome) %>% +summarize(Sum_V4 = sum(as.numeric(V4))) +# Merge the two data frames based on the "Chromosome" column +merged_df <- merge(result_df, chromosome_data, by = "Chromosome") +# Calculate normalized read coverage +merged_df$norm_read_cov <- merged_df$Sum_V4 / merged_df$Length +# Return the merged data frame with normalized read coverage +return(merged_df) +} +estimate_chromosomes <- function(df) { +# Extract 'norm_read_cov' values for autosomes and sex chromosomes +autosomal_cov <- df$norm_read_cov[!df$Chromosome %in% c("chrX", 
"chrY")] +chrX_cov <- df$norm_read_cov[df$Chromosome == "chrX"] +chrY_cov <- df$norm_read_cov[df$Chromosome == "chrY"] +# Calculate mean of autosomal counts +autosomal_mean <- mean(autosomal_cov) +cat(chrX_cov,chrY_cov) +# Define values to compare proximity +zero_value <- 0 +one_copy_value <- autosomal_mean / 2 +two_copies_value <- autosomal_mean +three_copies_value <- 1.5 * autosomal_mean +# Find the closest value for chrX_cov +closest_chrX <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrX_cov)) +# Find the closest value for chrY_cov +closest_chrY <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrY_cov)) +# Return the estimates in the specified format +return(paste(closest_chrX - 1, closest_chrY - 1, sep = "-")) +} +# Extract the file path from the command-line arguments +#file_path <- commandArgs(trailingOnly = TRUE)[1] +file_path = "/Users/noah/Desktop/0450-B-1.short.counts.tsv" +# Example usage +result_df <- estimate_chromosomes_processing(file_path) +result <- estimate_chromosomes(result_df) +cat(result) +# Extract the file path from the command-line arguments +#file_path <- commandArgs(trailingOnly = TRUE)[1] +file_path = "/Users/noah/Desktop/0450-B-1.short.counts.tsv" +# Example usage +result_df <- estimate_chromosomes_processing(file_path) +View(result_df) +rm(list=ls()) +# Extract the file path from the command-line arguments +#file_path <- commandArgs(trailingOnly = TRUE)[1] +file_path = "/Users/noah/Desktop/0450-B-1.short.counts.tsv" +# Create an R data frame with chromosome numbers and lengths +chromosome_data <- data.frame( +Chromosome = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", +"chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", +"chr21", "chr22", "chrX", "chrY"), +Length = c(248956422, 242193529, 198295559, 190214555, 181538259, 170805979, 159345973, 145138636, +138394717, 133797422, 135086622, 133275309, 114364328, 107043718, 101991189, 90338345, +81282511, 78077248, 59128983, 63025520, 48129895, 50735781, 156040895, 57227415) +) +# Read the TSV file into a data frame +df <- read.table(file_path, header = FALSE, sep = "\t", col.names = c("Chromosome", "V2", "V3", "V4","V5","V6","V7"), fill = TRUE) +# Keep only the first four columns +df <- df[, 1:4] +# Find the row index where the specified header pattern is located +header_row <- grep("CONTIG", df$Chromosome) +# Omit rows before the header row (if header is found) +df2 <- df[(header_row + 1):nrow(df), ] +# Group by V1 and calculate the sum of V4 for each group +result_df <- df2 %>% +group_by(Chromosome) %>% +summarize(Sum_V4 = sum(as.numeric(V4))) +# Merge the two data frames based on the "Chromosome" column +merged_df <- merge(result_df, chromosome_data, by = "Chromosome") +# Calculate normalized read coverage +merged_df$norm_read_cov <- merged_df$Sum_V4 / merged_df$Length +View(merged_df) +rm(list=ls()) +# Extract the file path from the command-line arguments +#file_path <- commandArgs(trailingOnly = TRUE)[1] +file_path = "/Users/noah/Desktop/0450-B-1.counts.tsv" +# Create an R data frame with chromosome numbers and lengths +chromosome_data <- data.frame( +Chromosome = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", +"chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", +"chr21", "chr22", "chrX", "chrY"), +Length = c(248956422, 242193529, 198295559, 190214555, 181538259, 170805979, 159345973, 
145138636, +138394717, 133797422, 135086622, 133275309, 114364328, 107043718, 101991189, 90338345, +81282511, 78077248, 59128983, 63025520, 48129895, 50735781, 156040895, 57227415) +) +# Read the TSV file into a data frame +df <- read.table(file_path, header = FALSE, sep = "\t", col.names = c("Chromosome", "V2", "V3", "V4","V5","V6","V7"), fill = TRUE) +# Keep only the first four columns +df <- df[, 1:4] +# Find the row index where the specified header pattern is located +header_row <- grep("CONTIG", df$Chromosome) +# Omit rows before the header row (if header is found) +df2 <- df[(header_row + 1):nrow(df), ] +# Group by V1 and calculate the sum of V4 for each group +result_df <- df2 %>% +group_by(Chromosome) %>% +summarize(Sum_V4 = sum(as.numeric(V4))) +View(result_df) +# Merge the two data frames based on the "Chromosome" column +merged_df <- merge(result_df, chromosome_data, by = "Chromosome") +# Calculate normalized read coverage +merged_df$norm_read_cov <- merged_df$Sum_V4 / merged_df$Length +View(merged_df) +df = merged_df +# Extract 'norm_read_cov' values for autosomes and sex chromosomes +autosomal_cov <- df$norm_read_cov[!df$Chromosome %in% c("chrX", "chrY")] +chrX_cov <- df$norm_read_cov[df$Chromosome == "chrX"] +chrY_cov <- df$norm_read_cov[df$Chromosome == "chrY"] +# Calculate mean of autosomal counts +autosomal_mean <- mean(autosomal_cov) +cat(chrX_cov,chrY_cov) +# Define values to compare proximity +zero_value <- 0 +one_copy_value <- autosomal_mean / 2 +two_copies_value <- autosomal_mean +three_copies_value <- 1.5 * autosomal_mean +# Find the closest value for chrX_cov +closest_chrX <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrX_cov)) +# Find the closest value for chrY_cov +closest_chrY <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrY_cov)) +# Return the estimates in the specified format +return(paste(closest_chrX - 1, closest_chrY - 1, sep = "-")) +rm(list = ls()) +# Extract the file path from the command-line arguments +#file_path <- commandArgs(trailingOnly = TRUE)[1] +file_path = "/Users/noah/Desktop/proband.counts.tsv" +# Create an R data frame with chromosome numbers and lengths +chromosome_data <- data.frame( +Chromosome = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", +"chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", +"chr21", "chr22", "chrX", "chrY"), +Length = c(248956422, 242193529, 198295559, 190214555, 181538259, 170805979, 159345973, 145138636, +138394717, 133797422, 135086622, 133275309, 114364328, 107043718, 101991189, 90338345, +81282511, 78077248, 59128983, 63025520, 48129895, 50735781, 156040895, 57227415) +) +# Read the TSV file into a data frame +df <- read.table(file_path, header = FALSE, sep = "\t", col.names = c("Chromosome", "V2", "V3", "V4","V5","V6","V7"), fill = TRUE) +# Keep only the first four columns +df <- df[, 1:4] +# Find the row index where the specified header pattern is located +header_row <- grep("CONTIG", df$Chromosome) +# Omit rows before the header row (if header is found) +df2 <- df[(header_row + 1):nrow(df), ] +# Group by V1 and calculate the sum of V4 for each group +result_df <- df2 %>% +group_by(Chromosome) %>% +summarize(Sum_V4 = sum(as.numeric(V4))) +# Merge the two data frames based on the "Chromosome" column +merged_df <- merge(result_df, chromosome_data, by = "Chromosome") +# Calculate normalized read coverage +merged_df$norm_read_cov <- merged_df$Sum_V4 / 
merged_df$Length +df = merged_df +# Extract 'norm_read_cov' values for autosomes and sex chromosomes +autosomal_cov <- df$norm_read_cov[!df$Chromosome %in% c("chrX", "chrY")] +chrX_cov <- df$norm_read_cov[df$Chromosome == "chrX"] +chrY_cov <- df$norm_read_cov[df$Chromosome == "chrY"] +# Calculate mean of autosomal counts +autosomal_mean <- mean(autosomal_cov) +cat(chrX_cov,chrY_cov) +# Define values to compare proximity +zero_value <- 0 +one_copy_value <- autosomal_mean / 2 +two_copies_value <- autosomal_mean +three_copies_value <- 1.5 * autosomal_mean +# Find the closest value for chrX_cov +closest_chrX <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrX_cov)) +# Find the closest value for chrY_cov +closest_chrY <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrY_cov)) +# Return the estimates in the specified format +return(paste(closest_chrX - 1, closest_chrY - 1, sep = "-")) +df <- read.table(file_path, header = FALSE, sep = "\t", col.names = c("Chromosome", "V2", "V3", "V4","V5","V6","V7"), fill = TRUE) +# Keep only the first four columns +df <- df[, 1:4] +library(dplyr, quietly = TRUE) +estimate_chromosomes_processing <- function(file_path) { +# Create an R data frame with chromosome numbers and lengths +chromosome_data <- data.frame( +Chromosome = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", +"chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", +"chr21", "chr22", "chrX", "chrY"), +Length = c(248956422, 242193529, 198295559, 190214555, 181538259, 170805979, 159345973, 145138636, +138394717, 133797422, 135086622, 133275309, 114364328, 107043718, 101991189, 90338345, +81282511, 78077248, 59128983, 63025520, 48129895, 50735781, 156040895, 57227415) +) +# Read the TSV file into a data frame +df <- read.table(file_path, header = FALSE, sep = "\t", col.names = c("Chromosome", "V2", "V3", "V4","V5","V6","V7"), fill = TRUE) +# Keep only the first four columns +df <- df[, 1:4] +# Find the row index where the specified header pattern is located +header_row <- grep("CONTIG", df$Chromosome) +# Omit rows before the header row (if header is found) +df2 <- df[(header_row + 1):nrow(df), ] +# Group by V1 and calculate the sum of V4 for each group +result_df <- df2 %>% +group_by(Chromosome) %>% +summarize(Sum_V4 = sum(as.numeric(V4))) +# Merge the two data frames based on the "Chromosome" column +merged_df <- merge(result_df, chromosome_data, by = "Chromosome") +# Calculate normalized read coverage +merged_df$norm_read_cov <- merged_df$Sum_V4 / merged_df$Length +# Return the merged data frame with normalized read coverage +return(merged_df) +} +estimate_chromosomes <- function(df) { +# Extract 'norm_read_cov' values for autosomes and sex chromosomes +autosomal_cov <- df$norm_read_cov[!df$Chromosome %in% c("chrX", "chrY")] +chrX_cov <- df$norm_read_cov[df$Chromosome == "chrX"] +chrY_cov <- df$norm_read_cov[df$Chromosome == "chrY"] +# Calculate mean of autosomal counts +autosomal_mean <- mean(autosomal_cov) +cat(chrX_cov,chrY_cov) +# Define values to compare proximity +zero_value <- 0 +one_copy_value <- autosomal_mean / 2 +two_copies_value <- autosomal_mean +three_copies_value <- 1.5 * autosomal_mean +# Find the closest value for chrX_cov +closest_chrX <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrX_cov)) +# Find the closest value for chrY_cov +closest_chrY <- which.min(abs(c(zero_value, one_copy_value, 
two_copies_value, three_copies_value) - chrY_cov)) +# Return the estimates in the specified format +return(paste(closest_chrX - 1, closest_chrY - 1, sep = "-")) +} +file_path <- "/Users/noah/Desktop/0450-B-1.short.counts.tsv" +# Example usage +result_df <- estimate_chromosomes_processing(file_path) +View(result_df) +result <- estimate_chromosomes(result_df) +cat(result) +file_path <- "/Users/noah/Desktop/proband.counts.tsv" +# Example usage +result_df <- estimate_chromosomes_processing(file_path) +result <- estimate_chromosomes(result_df) +cat(result) +file_path <- "/Users/noah/Desktop/HG00096.counts.tsv" +# Example usage +result_df <- estimate_chromosomes_processing(file_path) +result <- estimate_chromosomes(result_df) +View(result_df) +df <- result_df +# Extract 'norm_read_cov' values for autosomes and sex chromosomes +autosomal_cov <- df$norm_read_cov[!df$Chromosome %in% c("chrX", "chrY")] +chrX_cov <- df$norm_read_cov[df$Chromosome == "chrX"] +chrY_cov <- df$norm_read_cov[df$Chromosome == "chrY"] +# Calculate mean of autosomal counts +autosomal_mean <- mean(autosomal_cov) +cat(chrX_cov,chrY_cov) +# Define values to compare proximity +zero_value <- 0 +one_copy_value <- autosomal_mean / 2 +two_copies_value <- autosomal_mean +three_copies_value <- 1.5 * autosomal_mean +# Find the closest value for chrX_cov +closest_chrX <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrX_cov)) +# Find the closest value for chrY_cov +closest_chrY <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrY_cov)) +result <- estimate_chromosomes(result_df) +estimate_chromosomes <- function(df) { +# Extract 'norm_read_cov' values for autosomes and sex chromosomes +autosomal_cov <- df$norm_read_cov[!df$Chromosome %in% c("chrX", "chrY")] +chrX_cov <- df$norm_read_cov[df$Chromosome == "chrX"] +chrY_cov <- df$norm_read_cov[df$Chromosome == "chrY"] +# Calculate mean of autosomal counts +autosomal_mean <- mean(autosomal_cov) +#cat(chrX_cov,chrY_cov) +# Define values to compare proximity +zero_value <- 0 +one_copy_value <- autosomal_mean / 2 +two_copies_value <- autosomal_mean +three_copies_value <- 1.5 * autosomal_mean +# Find the closest value for chrX_cov +closest_chrX <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrX_cov)) +# Find the closest value for chrY_cov +closest_chrY <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrY_cov)) +# Return the estimates in the specified format +return(paste(closest_chrX - 1, closest_chrY - 1, sep = "-")) +} +result <- estimate_chromosomes(result_df) +cat(result) diff --git a/scripts/ufc_helpers/organize_results.py b/scripts/ufc_helpers/organize_results.py new file mode 100755 index 0000000..e50808b --- /dev/null +++ b/scripts/ufc_helpers/organize_results.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# The Germline Genomics of Cancer (G2C) +# Copyright (c) 2023-Present, Noah Fields and the Dana-Farber Cancer Institute +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 + + +import argparse + +def process_data(input_file, ancestry, ancestry_ID, chrX_count, chrY_count): + # Read the file line by line + with open(input_file, 'r') as file: + lines = file.readlines() + + # Extract coordinates from the comments + coordinates = [line.strip().split()[1:] for line in lines[3:6]] + coordinate_dict = {item[0].rstrip(':'): tuple(map(float, item[1:])) for item in coordinates} + + + # Extract 
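The script above mirrors scripts/ufc_helpers/sex_infer.R, whose cleaned-up rename appears later in this diff. A minimal sketch of the intended invocation, matching how the get_chr_count task in Infer_Sex_and_Ancestry.92.wdl calls it (the counts filename is a hypothetical placeholder):

    Rscript scripts/ufc_helpers/sex_infer.R sample.counts.tsv
    # prints the inferred copy numbers as "chrX-chrY", e.g. "2-0" for an XX sample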
diff --git a/scripts/ufc_helpers/organize_results.py b/scripts/ufc_helpers/organize_results.py
new file mode 100755
index 0000000..e50808b
--- /dev/null
+++ b/scripts/ufc_helpers/organize_results.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# The Germline Genomics of Cancer (G2C)
+# Copyright (c) 2023-Present, Noah Fields and the Dana-Farber Cancer Institute
+# Contact: Noah Fields
+# Distributed under the terms of the GNU GPL v2.0
+
+
+import argparse
+
+def process_data(input_file, ancestry, ancestry_ID, chrX_count, chrY_count):
+    # Read the file line by line
+    with open(input_file, 'r') as file:
+        lines = file.readlines()
+
+    # Extract the F/A/E reference coordinates from the comment lines (lines 4-6)
+    coordinates = [line.strip().split()[1:] for line in lines[3:6]]
+    coordinate_dict = {item[0].rstrip(':'): tuple(map(float, item[1:])) for item in coordinates}
+
+    # Extract the data row from the main part of the file (line 9)
+    data_line = lines[8].strip().split('\t')
+
+    # Prepare output data
+    output_data = {
+        'Sample': data_line[0].strip(),
+        'SNPs': int(data_line[1].strip()),
+        'GD1(x)': float(data_line[2].strip()),
+        'GD2(y)': float(data_line[3].strip()),
+        'GD3(z)': float(data_line[4].strip()),
+        'GD4': float(data_line[5].strip()),
+        'E(%)': float(data_line[6].strip()),
+        'F(%)': float(data_line[7].strip()),
+        'A(%)': float(data_line[8].strip()),
+        'F(xyz)': coordinate_dict['F'],
+        'A(xyz)': coordinate_dict['A'],
+        'E(xyz)': coordinate_dict['E'],
+        'chrX_count': chrX_count,
+        'chrY_count': chrY_count,
+        'ancestry': ancestry,
+        'ancestry_ID': ancestry_ID
+    }
+
+    # Assemble the output as a tab-separated header line plus one data row
+    output_tsv = (
+        f"Sample\tSNPs\tGD1(x)\tGD2(y)\tGD3(z)\tGD4\tE(%)\tF(%)\tA(%)\tF(xyz)\tA(xyz)\tE(xyz)\tchrX_count\tchrY_count\tancestry\tancestry_ID\n"
+        f"{output_data['Sample']}\t{output_data['SNPs']}\t{output_data['GD1(x)']}\t{output_data['GD2(y)']}\t"
+        f"{output_data['GD3(z)']}\t{output_data['GD4']}\t{output_data['E(%)']}\t{output_data['F(%)']}\t{output_data['A(%)']}\t"
+        f"{output_data['F(xyz)']}\t{output_data['A(xyz)']}\t{output_data['E(xyz)']}\t{output_data['chrX_count']}\t"
+        f"{output_data['chrY_count']}\t{output_data['ancestry']}\t{output_data['ancestry_ID']}\n"
+    )
+
+    print(output_tsv)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Organize raw ancestry output and inferred sex into one output TSV.')
+    parser.add_argument('--file', type=str, help='Path to the raw GrafPop output file', required=True)
+    parser.add_argument('--ancestry', type=str, help='Inferred ancestry label', required=True)
+    parser.add_argument('--ancestryID', type=int, help='Numeric ID of the inferred ancestry', required=True)
+    parser.add_argument('--chrX_count', type=int, help='Estimated chrX copy number', required=True)
+    parser.add_argument('--chrY_count', type=int, help='Estimated chrY copy number', required=True)
+
+    args = parser.parse_args()
+
+    process_data(args.file, args.ancestry, args.ancestryID, args.chrX_count, args.chrY_count)
diff --git a/scripts/ufc_helpers/sex_ancestry_accuracy_calc.py b/scripts/ufc_helpers/sex_ancestry_accuracy_calc.py
new file mode 100644
index 0000000..25fa387
--- /dev/null
+++ b/scripts/ufc_helpers/sex_ancestry_accuracy_calc.py
@@ -0,0 +1,47 @@
+import pandas as pd
+
+def process_data(df):
+    # Task 1: Calculate how often (chrX_count, chrY_count) agrees with the reported sex
+    condition1 = (df['chrX_count'] == 1) & (df['chrY_count'] == 1) & (df['Sex'] == 'MALE')
+    condition2 = (df['chrX_count'] == 2) & (df['chrY_count'] == 0) & (df['Sex'] == 'FEMALE')
+
+    # Rows meeting neither condition count as incorrect predictions
+    filtered_df = df[condition1 | condition2]
+
+    correct_rows = filtered_df.shape[0]
+    incorrect_rows = df.shape[0] - correct_rows
+
+    accuracy = correct_rows / (correct_rows + incorrect_rows)
+
+    print(f"\nAccuracy Prediction for chrX_count, chrY_count, and SEX: {accuracy:.2%}")
+    print(f"Instances omitted: {incorrect_rows}")
+
+    # Task 2: Count all Population / inferred-ancestry pairings
+    pairings_count = df.groupby(['ancestry', 'Population']).size().reset_index(name='Count')
+
+    print("\nCount of Ancestry Population pairings:")
+    print(pairings_count)
+
+    return accuracy
+
+def main():
+    # Hardcoded path to the merged per-sample table; adjust as needed
+    file_path = '/Users/noah/Desktop/test_sample.tsv'
+
+    # Read the TSV file into a DataFrame
+    df = pd.read_csv(file_path, sep='\t')
+
+    # Task for each
individual cohort + cohorts = df['Cohort'].unique() + for cohort in cohorts: + print(f"\nProcessing cohort: {cohort}") + cohort_df = df[df['Cohort'] == cohort] + process_data(cohort_df) + + # Task for the combined cohort + print("\nProcessing combined cohort:") + process_data(df) + +if __name__ == "__main__": + main() diff --git a/sex_infer.R b/scripts/ufc_helpers/sex_infer.R similarity index 63% rename from sex_infer.R rename to scripts/ufc_helpers/sex_infer.R index 70ed09f..16d1564 100644 --- a/sex_infer.R +++ b/scripts/ufc_helpers/sex_infer.R @@ -1,4 +1,4 @@ -library(dplyr) +library(dplyr, quietly = TRUE) estimate_chromosomes_processing <- function(file_path) { # Create an R data frame with chromosome numbers and lengths @@ -12,10 +12,10 @@ estimate_chromosomes_processing <- function(file_path) { ) # Read the TSV file into a data frame - df <- read.table(file_path, header = FALSE, sep = "\t", col.names = c("Chromosome", "V2", "V3", "V4", "V5", "V6"), fill = TRUE) + df <- read.table(file_path, header = FALSE, sep = "\t", col.names = c("Chromosome", "V2", "V3", "V4","V5","V6","V7"), fill = TRUE) - # Remove the 5th and 6th columns - df <- df[, -c(5, 6)] + # Keep only the first four columns + df <- df[, 1:4] # Find the row index where the specified header pattern is located header_row <- grep("CONTIG", df$Chromosome) @@ -43,50 +43,28 @@ estimate_chromosomes <- function(df) { autosomal_cov <- df$norm_read_cov[!df$Chromosome %in% c("chrX", "chrY")] chrX_cov <- df$norm_read_cov[df$Chromosome == "chrX"] chrY_cov <- df$norm_read_cov[df$Chromosome == "chrY"] - - # Calculate mean and standard deviation of autosomal counts + + # Calculate mean of autosomal counts autosomal_mean <- mean(autosomal_cov) - autosomal_sd <- sd(autosomal_cov) - - # Set thresholds based on mean and standard deviation - threshold_one_copy <- autosomal_mean - 3 * autosomal_sd - threshold_two_copies <- autosomal_mean - 2 * autosomal_sd - threshold_three_copies <- autosomal_mean + 2 * autosomal_sd - - # Initialize estimates - estimated_chrX <- 0 - estimated_chrY <- 0 - - # Iterate through each threshold and add 1 copy for every threshold it passes - if (max(chrX_cov) > threshold_one_copy) { - estimated_chrX <- estimated_chrX + 1 - } + #cat(chrX_cov,chrY_cov) + # Define values to compare proximity + zero_value <- 0 + one_copy_value <- autosomal_mean / 2 + two_copies_value <- autosomal_mean + three_copies_value <- 1.5 * autosomal_mean - if (max(chrX_cov) > threshold_two_copies) { - estimated_chrX <- estimated_chrX + 1 - } + # Find the closest value for chrX_cov + closest_chrX <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrX_cov)) - if (max(chrX_cov) > threshold_three_copies) { - estimated_chrX <- estimated_chrX + 1 - } - - if (max(chrY_cov) > threshold_one_copy) { - estimated_chrY <- estimated_chrY + 1 - } - - if (max(chrY_cov) > threshold_two_copies) { - estimated_chrY <- estimated_chrY + 1 - } - - if (max(chrY_cov) > threshold_three_copies) { - estimated_chrY <- estimated_chrY + 1 - } + # Find the closest value for chrY_cov + closest_chrY <- which.min(abs(c(zero_value, one_copy_value, two_copies_value, three_copies_value) - chrY_cov)) # Return the estimates in the specified format - return(paste(estimated_chrX, estimated_chrY, sep = "-")) + return(paste(closest_chrX - 1, closest_chrY - 1, sep = "-")) } + # Extract the file path from the command-line arguments file_path <- commandArgs(trailingOnly = TRUE)[1] @@ -94,3 +72,4 @@ file_path <- commandArgs(trailingOnly = TRUE)[1] result_df 
<- estimate_chromosomes_processing(file_path) result <- estimate_chromosomes(result_df) cat(result) + diff --git a/wdl/.DS_Store b/wdl/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/wdl/.DS_Store differ diff --git a/wdl/Coding_Convergence.gsc.wdl b/wdl/Coding_Convergence.gsc.wdl new file mode 100644 index 0000000..4c96987 --- /dev/null +++ b/wdl/Coding_Convergence.gsc.wdl @@ -0,0 +1,215 @@ +# The Germline Genomics of Cancer (G2C) +# Copyright (c) 2023-Present, Noah Fields and the Dana-Farber Cancer Institute +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 + +version 1.0 + +# Analysis of germline variants and somatic drivers in coding regions +# This workflow identifies pathogenic and high-impact variants in germline data, as well as somatic driver mutations in coding regions. + +# Task to extract coding regions from a somatic table +task Extract_Coding_Regions { + input { + File germline_somatic_table + String cancer_type + } + command <<< + #Only interested in coding variants + cut -f1,2,3 < ~{germline_somatic_table} | tail -n +2 | grep -v 'noncoding' | grep -i '~{cancer_type}' | cut -f2 | sort | uniq > germline_coding.list + cut -f1,7,8 < ~{germline_somatic_table} | tail -n +2 | grep -v 'noncoding' | grep -i '~{cancer_type}' | cut -f2 | sort | uniq > somatic_coding.list + >>> + runtime { + docker: "ubuntu:latest" + } + output { + File germline_coding_list = "germline_coding.list" + File somatic_coding_list = "somatic_coding.list" + } +} + +# Task to extract pathogenic and high-impact germline variants +task Extract_Germline_Variants { + input { + File germline_merged_vep_vcf + File germline_somatic_table + File germline_genes + String patient_id + } + + command <<< + # Extract variants present only in the germline data + bcftools view -s ~{patient_id} ~{germline_merged_vep_vcf} -o sample.vcf + bcftools view -i 'AC>0 & GT="alt"' sample.vcf -o germline_only.vcf + + # Split VEP annotations and filter for relevant information + bcftools +split-vep -f '%CHROM|%POS|%CSQ/SYMBOL|%CSQ/IMPACT|%CSQ/gnomAD_AF_nfe|%CSQ/gnomAD_AF_popmax|%CSQ/gnomAD_controls_AF_popmax|%CSQ/ClinVar_external_CLNSIG' germline_only.vcf | cut -d'|' -f1,2,5,6,39-43 | awk '{gsub(/\|/, "\t");print}' | cut -d',' -f1 | awk -F'\t' '($5 == "Pathogenic" || $5 == "Likely_pathogenic" || $9 < 0.02 || $9 == "." || $9 == "") &&($5 == "Pathogenic" || $5 == "Likely_pathogenic" || $7 < 0.02 || $7 == "." || $7 == "") && ($5 == "Pathogenic" || $5 == "Likely_pathogenic" || $8 < 0.02 || $8 == "." 
|| $8 == "")' | grep -Ev 'Benign|Likely_benign|LOW' | grep -Fwf ~{germline_genes} > query.tsv + + # Extract potential deleterious germline genes + cut -f4 query.tsv | sort | uniq > potential_deleterious_germline_genes.list + touch c1.list + touch c2.list + touch c3.list + touch pathogenic_germline_genes.list + + # Condition 1: Pathogenic or Likely pathogenic by ClinVar + grep -E 'Pathogenic|Likely_pathogenic' query.tsv | cut -f4 | sort | uniq > c1.list + + # Condition 2: Tumor suppressor gene (TSG) or Mutation-Type includes D,N,F + while IFS= read -r gene; do + TSG=$(cut -f2-5 ~{germline_somatic_table} | grep "$gene" | grep 'TSG' | awk 'END {print NR}') + MT=$(cut -f2-5 ~{germline_somatic_table} | grep "$gene" | cut -f5 | grep -E 'D|N|F' | awk 'END {print NR}') + if ((TSG + MT > 0)); then + echo "$gene" >> c2.list + fi + done < potential_deleterious_germline_genes.list + + # Condition 3: Oncogene (ONC) or Mutation-Type includes Mis + while IFS= read -r gene; do + ONC=$(cut -f2-5 ~{germline_somatic_table} | grep "$gene" | grep 'ONC' | awk 'END {print NR}') + MIS=$(cut -f2-5 ~{germline_somatic_table} | grep "$gene" | cut -f5 | grep 'Mis' | awk 'END {print NR}') + if ((ONC + MIS > 0)); then + echo "$gene" >> c3.list + fi + done < potential_deleterious_germline_genes.list + + # Determine pathogenic germline genes based on conditions + while IFS= read -r gene; do + # If condition 1 is met + if [ $(grep "$gene" c1.list | awk 'END {print NR}') -gt 0 ];then + echo "$gene" >> pathogenic_germline_genes.list + # If condition 2 but not 3 is met + elif [ $(grep "$gene" c2.list | awk 'END {print NR}') -gt 0 ] && [ $(grep "$gene" c3.list | awk 'END {print NR}') -eq 0 ]; then + if [ $(grep "$gene" query.tsv | grep 'HIGH' | awk 'END {print NR}') -gt 0 ]; then + echo "$gene" >> pathogenic_germline_genes.list + fi + # If condition 3 but not 2 is met + elif [ $(grep "$gene" c2.list | awk 'END {print NR}') -eq 0 ] && [ $(grep "$gene" c3.list | awk 'END {print NR}') -gt 0 ]; then + if [ $(grep "$gene" query.tsv | grep 'MODERATE' | awk 'END {print NR}') -gt 0 ]; then + echo "$gene" >> pathogenic_germline_genes.list + fi + # If condition 2 and 3 is met or none of the conditions are met + else + if [ $(grep "$gene" query.tsv | grep -E 'HIGH|MODERATE' | awk 'END {print NR}') -gt 0 ]; then + echo "$gene" >> pathogenic_germline_genes.list + fi + fi + done < potential_deleterious_germline_genes.list + + sort pathogenic_germline_genes.list | uniq > ~{patient_id}.txt + + + >>> + + runtime { + docker: "vanallenlab/bcftools" + } + output{ + File out = "~{patient_id}.txt" + } +} + +# Extract somatic driver mutations based on somatic genes +task Extract_Somatic_Drivers { + input { + File somatic_tsv + File somatic_genes + String patient_id + } + command <<< + # Extract somatic driver mutations based on somatic genes + cat ~{somatic_tsv} | cut -f3 | grep -Fwf ~{somatic_genes} | sort | uniq >> ~{patient_id}.somatic_drivers.txt + >>> + + runtime { + docker: "ubuntu:latest" + } + output{ + File out = "~{patient_id}.somatic_drivers.txt" + } + +} + +task Output_Mutations { + input { + File germline_variants + File somatic_drivers + String patient_id + File germline_genes + File somatic_genes + } + + command <<< + # Initialize output string with patient ID + output="~{patient_id}" + + # Check each germline gene for pathogenic variants + while IFS=$'\t' read -r germline_gene; do + if [ $(grep -F "$germline_gene" ~{germline_variants} | awk 'END {print NR}' ) -gt 0 ]; then + output="${output} 1" + else + output="${output} 0" + fi + 
done < ~{germline_genes} + + # Check each somatic gene for driver mutations + while IFS=$'\t' read -r somatic_gene; do + if [ $(grep -F "$somatic_gene" ~{somatic_drivers} | awk 'END {print NR}' ) -gt 0 ]; then + output="${output} 1" + else + output="${output} 0" + fi + done < ~{somatic_genes} + + # Write output string to file + echo "$output" > ~{patient_id}_info.txt + >>> + + runtime { + docker: "ubuntu:latest" + } + output{ + File out = "~{patient_id}_info.txt" + } +} + + +# Define workflow +workflow convergence { + input{ + File germline_merged_vep_vcf # Merged VCF file annotated with VEP + File germline_somatic_table # Germline-Somatic table file containing convergence pairs + File somatic_tsv # Somatic TSV file containing mutation information + String patient_id # Unique identifier for the patient + String cancer_type # Type of cancer being analyzed + } + call Extract_Coding_Regions { + input: + germline_somatic_table = germline_somatic_table, + cancer_type = cancer_type + } + call Extract_Germline_Variants { + input: + germline_merged_vep_vcf = germline_merged_vep_vcf, + germline_genes = Extract_Coding_Regions.germline_coding_list, + germline_somatic_table = germline_somatic_table, + patient_id = patient_id + } + call Extract_Somatic_Drivers { + input: + somatic_genes = Extract_Coding_Regions.somatic_coding_list, + somatic_tsv = somatic_tsv, + patient_id = patient_id + } + call Output_Mutations { + input: + patient_id = patient_id, + somatic_drivers = Extract_Somatic_Drivers.out, + germline_variants = Extract_Germline_Variants.out, + germline_genes = Extract_Coding_Regions.germline_coding_list, + somatic_genes = Extract_Coding_Regions.somatic_coding_list + } + output{ + File convergence = Output_Mutations.out + } +} \ No newline at end of file diff --git a/wdl/Consolidate_Cohort_Data.wdl b/wdl/Consolidate_Cohort_Data.wdl new file mode 100644 index 0000000..cf82b22 --- /dev/null +++ b/wdl/Consolidate_Cohort_Data.wdl @@ -0,0 +1,173 @@ +# The Germline Genomics of Cancer (G2C) +# Copyright (c) 2023-Present, Noah Fields and the Dana-Farber Cancer Institute +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 + +# The purpose of this code is to consolidate data for each G2C cohort. 
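+#
+# For orientation, a rough sketch of the per-sample queries issued below (bucket
+# layout taken from the tasks in this file; the cohort "ufc" and sample
+# "SAMPLE-001" are hypothetical placeholders):
+#
+#   # CHARR contamination metrics for one sample
+#   gsutil cat gs://dfci-g2c-inputs/ufc/charr/SAMPLE-001.txt | cut -f2-9 | tail -n 1
+#
+#   # Manta deletion count for the same sample
+#   gsutil cat gs://dfci-g2c-inputs/ufc/gatk-sv/metrics/manta_SAMPLE-001.vcf.tsv | grep 'vcf_DEL_count' | cut -f2
+#
+# Any metric that cannot be retrieved is recorded as "NA" so the consolidated
+# TSV stays rectangular.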
+
+version 1.0
+
+task get_individuals {
+  input{
+    String cohort
+  }
+
+  command <<<
+    gsutil cat "gs://dfci-g2c-inputs/sample-lists/~{cohort}.samples.list" | awk 'NF > 0' > samples.list
+  >>>
+
+  runtime {
+    docker: "us.gcr.io/google.com/cloudsdktool/google-cloud-cli:latest"
+  }
+  output{
+    Array[String] out = read_lines("samples.list")
+  }
+}
+
+task unzip_ploidy_tsv {
+  input{
+    String ploidy_location
+  }
+
+  command <<<
+    gsutil -m cp ~{ploidy_location} ploidy.tsv.gz
+    gunzip ploidy.tsv.gz
+  >>>
+
+  runtime {
+    docker: "us.gcr.io/google.com/cloudsdktool/google-cloud-cli:latest"
+  }
+  output{
+    File out = "ploidy.tsv"
+  }
+}
+
+
+task get_sample_data{
+  input{
+    String sample
+    String cohort
+    File ploidy_tsv
+  }
+  command <<<
+    # Fetch one metric by key from a two-column key/value TSV;
+    # print NA when the key is absent or the value is empty or zero
+    get_metric() {
+      local val
+      val=$(grep "$1" "$2" | cut -f2)
+      if [ -z "$val" ] || [ "$val" == "0" ]; then
+        echo "NA"
+      else
+        echo "$val"
+      fi
+    }
+
+    charr_output=$(gsutil cat gs://dfci-g2c-inputs/~{cohort}/charr/~{sample}.txt | cut -f2-9 | tail -n 1)
+    if [ -z "$charr_output" ]; then
+      charr_output="NA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"
+    fi
+
+    demographics_output=$(gsutil cat gs://dfci-g2c-inputs/~{cohort}/demographics/~{sample}.txt | cut -f2-5,7-9,13-15 | sed -n '2p')
+    if [ -z "$demographics_output" ]; then
+      demographics_output="NA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"
+    fi
+
+    melt_INS_count=$(gsutil cat gs://dfci-g2c-inputs/~{cohort}/gatk-sv/metrics/melt_~{sample}.vcf.tsv | grep 'vcf_INS_count' | cut -f2)
+    if [ -z "$melt_INS_count" ] || [ "$melt_INS_count" == "0" ]; then
+      melt_INS_count="NA"
+    fi
+
+    gsutil -m cp gs://dfci-g2c-inputs/~{cohort}/gatk-sv/metrics/manta_~{sample}.vcf.tsv manta_~{cohort}.tmp
+    gsutil -m cp gs://dfci-g2c-inputs/~{cohort}/gatk-sv/metrics/wham_~{sample}.vcf.tsv wham_~{cohort}.tmp
+    gsutil -m cp gs://dfci-g2c-inputs/~{cohort}/gatk-sv/metrics/~{sample}.raw-counts.tsv metrics_~{cohort}.tmp
+
+    rd_median=$(get_metric 'rd_q50' metrics_~{cohort}.tmp)
+    rd_mean=$(get_metric 'rd_mean' metrics_~{cohort}.tmp)
+    manta_DEL_count=$(get_metric 'vcf_DEL_count' manta_~{cohort}.tmp)
+    manta_DUP_count=$(get_metric 'vcf_DUP_count' manta_~{cohort}.tmp)
+    manta_INS_count=$(get_metric 'vcf_INS_count' manta_~{cohort}.tmp)
+    manta_INV_count=$(get_metric 'vcf_INV_count' manta_~{cohort}.tmp)
+    manta_BND_count=$(get_metric 'vcf_BND_count' manta_~{cohort}.tmp)
+    wham_DEL_count=$(get_metric 'vcf_DEL_count' wham_~{cohort}.tmp)
+    wham_DUP_count=$(get_metric 'vcf_DUP_count' wham_~{cohort}.tmp)
+
+    ploidy_estimate=$(awk -F'\t' '$1 == "~{cohort}" {print}' ~{ploidy_tsv} | awk -v sample="~{sample}" '{ if ($2==sample) print }' | cut -f3-26,28-30)
+    if [ -z "$ploidy_estimate" ]; then
+      ploidy_estimate="NA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA"
+    fi
+
+    echo -e
"~{sample}\t~{cohort}\t${demographics_output}\t${charr_output}\t${rd_median}\t${rd_mean}\t${manta_DEL_count}\t${manta_DUP_count}\t${manta_INS_count}\t${manta_INV_count}\t${manta_BND_count}\t${melt_INS_count}\t${wham_DEL_count}\t${wham_DUP_count}\t${ploidy_estimate}" > ~{sample}.txt + >>> + runtime { + docker: "us.gcr.io/google.com/cloudsdktool/google-cloud-cli:latest" + } + output{ + String out = read_string("~{sample}.txt") + } +} + +task write_output { + input { + String cohort + Array[String] data + } + command <<< + echo -e "Sample\tCohort\tgrafpop_SNPs\tgrafpop_GD1\tgrafpop_GD2\tgrafpop_GD3\tpct_EUR\tpct_AFR\tpct_ASN\tchrX_count\tchrY_count\tancestry\tHQ_HOM\tHQ_HOM_RATE\tHQ_HET\tHQ_HET_RATE\tCHARR\tMEAN_REF_AB_HOM_ALT\tHETEROZYGOSITY_RATE\tINCONSISTENT_AB_HET_RATE\trd_median\trd_mean\tmanta_DEL_count\tmanta_DUP_count\tmanta_INS_count\tmanta_INV_count\tmanta_BND_count\tmelt_INS_count\twham_DEL_count\twham_DUP_count\tchr1_CopyNumber\tchr2_CopyNumber\tchr3_CopyNumber\tchr4_CopyNumber\tchr5_CopyNumber\tchr6_CopyNumber\tchr7_CopyNumber\tchr8_CopyNumber\tchr9_CopyNumber\tchr10_CopyNumber\tchr11_CopyNumber\tchr12_CopyNumber\tchr13_CopyNumber\tchr14_CopyNumber\tchr15_CopyNumber\tchr16_CopyNumber\tchr17_CopyNumber\tchr18_CopyNumber\tchr19_CopyNumber\tchr20_CopyNumber\tchr21_CopyNumber\tchr22_CopyNumber\tchrX_CopyNumber\tchrY_CopyNumber\tmedian_coverage\twgd_score\tnondiploid_bins" > ~{cohort}.consolidated_data.tsv + cat ~{write_lines(data)} >> ~{cohort}.consolidated_data.tsv + gsutil -m cp ~{cohort}.consolidated_data.tsv gs://fc-secure-826914ff-6f0b-48bd-b6b1-1002dbffd5a3/ + >>> + + runtime { + docker: "us.gcr.io/google.com/cloudsdktool/google-cloud-cli:latest" + } + output{ + } +} + +workflow consolidate { + input { + String cohort + String ploidy_location + } + call get_individuals{ + input: + cohort = cohort + } + call unzip_ploidy_tsv{ + input: + ploidy_location = ploidy_location + } + scatter(i in get_individuals.out){ + call get_sample_data{ + input: + sample = i, + cohort = cohort, + ploidy_tsv = unzip_ploidy_tsv.out + } + } + call write_output { + input: + cohort = cohort, + data = get_sample_data.out + } +} diff --git a/wdl/G2C_Auto_File_Transfer.wdl b/wdl/G2C_Auto_File_Transfer.wdl new file mode 100644 index 0000000..6960d21 --- /dev/null +++ b/wdl/G2C_Auto_File_Transfer.wdl @@ -0,0 +1,224 @@ +# The Germline Genomics of Cancer (G2C) +# Copyright (c) 2023-Present, Noah Fields and the Dana-Farber Cancer Institute +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 + +#The purpose of this code is to transfer files from Terra to Google Cloud Storage. 
+ + +version 1.0 + + +task gatk_hc_reblocked{ + input { + #Go into /cohort/gatk-hc/reblocked + File reblocked_gvcf + File reblocked_gvcf_idx + String sample_id + String cohort + } + Int disk_gb = ceil(1.5 * size(reblocked_gvcf, "GB")) + 10 + String reblocked_gvcf_pathway = "gs://dfci-g2c-inputs/~{cohort}/gatk-hc/reblocked/~{sample_id}.reblocked.g.vcf.gz" + String reblocked_gvcf_idx_pathway = "gs://dfci-g2c-inputs/~{cohort}/gatk-hc/reblocked/~{sample_id}.reblocked.g.vcf.gz.tbi" + command <<< + gsutil -m cp ~{reblocked_gvcf} ~{reblocked_gvcf_pathway} + gsutil -m cp ~{reblocked_gvcf_idx} ~{reblocked_gvcf_idx_pathway} + >>> + runtime{ + docker: "us.gcr.io/google.com/cloudsdktool/google-cloud-cli:latest" + disks: "local-disk " + disk_gb + " HDD" + } + +} + +task gatk_sv_coverage{ + input { + #Go into /cohort/gatk-sv/coverage + File coverage_counts + String sample_id + String cohort + } + String coverage_counts_pathway = "gs://dfci-g2c-inputs/~{cohort}/gatk-sv/coverage/~{sample_id}.counts.tsv.gz" + + command <<< + gsutil -m cp ~{coverage_counts} ~{coverage_counts_pathway} + >>> + runtime{ + docker: "us.gcr.io/google.com/cloudsdktool/google-cloud-cli:latest" + } +} + +task gatk_sv_metrics{ + input { + #Go into /cohort/gatk-sv/metrics + Array[File] sample_metric_files + String sample_id + String cohort + } + String pe_file_pathway = "gs://dfci-g2c-inputs/~{cohort}/gatk-sv/metrics/~{sample_id}.pe-file.tsv" + String raw_counts_pathway = "gs://dfci-g2c-inputs/~{cohort}/gatk-sv/metrics/~{sample_id}.raw-counts.tsv" + String sr_file_pathway = "gs://dfci-g2c-inputs/~{cohort}/gatk-sv/metrics/~{sample_id}.sr-file.tsv" + String manta_vcf_pathway = "gs://dfci-g2c-inputs/~{cohort}/gatk-sv/metrics/manta_~{sample_id}.vcf.tsv" + String melt_vcf_pathway = "gs://dfci-g2c-inputs/~{cohort}/gatk-sv/metrics/melt_~{sample_id}.vcf.tsv" + String wham_vcf_pathway = "gs://dfci-g2c-inputs/~{cohort}/gatk-sv/metrics/wham_~{sample_id}.vcf.tsv" + + command <<< + gsutil -m cp ~{sep=" " sample_metric_files} "gs://dfci-g2c-inputs/~{cohort}/gatk-sv/metrics/" + + >>> + runtime{ + docker: "us.gcr.io/google.com/cloudsdktool/google-cloud-cli:latest" + } +} + +task gatk_sv_pesr{ + input { + #Go into /cohort/gatk-sv/pesr/ + File pesr_disc + File pesr_disc_index + File pesr_sd + File pesr_sd_index + File pesr_split + File pesr_split_index + String sample_id + String cohort + } + String pesr_disc_pathway = "gs://dfci-g2c-inputs/~{cohort}/gatk-sv/pesr/~{sample_id}.pe.txt.gz" + String pesr_disc_index_pathway = "gs://dfci-g2c-inputs/~{cohort}/gatk-sv/pesr/~{sample_id}.pe.txt.gz.tbi" + + String pesr_sd_pathway = "gs://dfci-g2c-inputs/~{cohort}/gatk-sv/pesr/~{sample_id}.sd.txt.gz" + String pesr_sd_index_pathway = "gs://dfci-g2c-inputs/~{cohort}/gatk-sv/pesr/~{sample_id}.sd.txt.gz.tbi" + + String pesr_split_pathway = "gs://dfci-g2c-inputs/~{cohort}/gatk-sv/pesr/~{sample_id}.sr.txt.gz" + String pesr_split_index_pathway = "gs://dfci-g2c-inputs/~{cohort}/gatk-sv/pesr/~{sample_id}.sr.txt.gz.tbi" + + command <<< + gsutil -m cp ~{pesr_disc} ~{pesr_disc_pathway} + gsutil -m cp ~{pesr_disc_index} ~{pesr_disc_index_pathway} + + gsutil -m cp ~{pesr_sd} ~{pesr_sd_pathway} + gsutil -m cp ~{pesr_sd_index} ~{pesr_sd_index_pathway} + + gsutil -m cp ~{pesr_split} ~{pesr_split_pathway} + gsutil -m cp ~{pesr_split_index} ~{pesr_split_index_pathway} + >>> + runtime{ + docker: "us.gcr.io/google.com/cloudsdktool/google-cloud-cli:latest" + } + +} +task manta{ + input { + #Go into /cohort/manta + File manta_vcf + File manta_vcf_tbi + String sample_id + String cohort + 
} + Int disk_gb = ceil(1.5 * size(manta_vcf, "GB")) + 10 + String manta_vcf_pathway = "gs://dfci-g2c-inputs/~{cohort}/manta/~{sample_id}.manta.vcf.gz" + String manta_vcf_tbi_pathway = "gs://dfci-g2c-inputs/~{cohort}/manta/~{sample_id}.manta.vcf.gz.tbi" + + command <<< + gsutil -m cp ~{manta_vcf} ~{manta_vcf_pathway} + gsutil -m cp ~{manta_vcf_tbi} ~{manta_vcf_tbi_pathway} + >>> + runtime{ + docker: "us.gcr.io/google.com/cloudsdktool/google-cloud-cli:latest" + disks: "local-disk " + disk_gb + " HDD" + } + +} + +task melt{ + input { + #Go into /cohort/melt + File melt_vcf + File melt_vcf_tbi + String sample_id + String cohort + + } + Int disk_gb = ceil(1.5 * size(melt_vcf, "GB")) + 10 + String melt_vcf_pathway = "gs://dfci-g2c-inputs/~{cohort}/melt/~{sample_id}.melt.vcf.gz" + String melt_vcf_tbi_pathway = "gs://dfci-g2c-inputs/~{cohort}/melt/~{sample_id}.melt.vcf.gz.tbi" + + command <<< + gsutil -m cp ~{melt_vcf} ~{melt_vcf_pathway} + gsutil -m cp ~{melt_vcf_tbi} ~{melt_vcf_tbi_pathway} + >>> + runtime{ + docker: "us.gcr.io/google.com/cloudsdktool/google-cloud-cli:latest" + disks: "local-disk " + disk_gb + " HDD" + } +} + +task wham{ + input { + #Go into /cohort/wham + File wham_vcf + File wham_index + String sample_id + String cohort + } + Int disk_gb = ceil(1.5 * size(wham_vcf, "GB")) + 10 + #Should these be Strings or File variables + String wham_vcf_pathway = "gs://dfci-g2c-inputs/~{cohort}/wham/~{sample_id}.wham.vcf.gz" + String wham_vcf_index_pathway = "gs://dfci-g2c-inputs/~{cohort}/wham/~{sample_id}.wham.vcf.gz.tbi" + + command <<< + gsutil -m cp ~{wham_vcf} ~{wham_vcf_pathway} + gsutil -m cp ~{wham_index} ~{wham_vcf_index_pathway} + >>> + + runtime{ + docker: "us.gcr.io/google.com/cloudsdktool/google-cloud-cli:latest" + disks: "local-disk " + disk_gb + " HDD" + } +} + + +# Define workflow +workflow store_in_gcs { + + input { + String cohort + String sample_id + } + + call gatk_hc_reblocked{ + input: + sample_id = sample_id, + cohort = cohort + } + call gatk_sv_coverage{ + input: + sample_id = sample_id, + cohort = cohort + } + call gatk_sv_metrics{ + input: + sample_id = sample_id, + cohort = cohort + } + call gatk_sv_pesr{ + input: + sample_id = sample_id, + cohort = cohort + } + call manta{ + input: + sample_id = sample_id, + cohort = cohort + } + call melt{ + input: + sample_id = sample_id, + cohort = cohort + } + call wham{ + input: + sample_id = sample_id, + cohort = cohort + } +} \ No newline at end of file diff --git a/wdl/GSC.wdl b/wdl/GSC.wdl new file mode 100644 index 0000000..0deb2f0 --- /dev/null +++ b/wdl/GSC.wdl @@ -0,0 +1,244 @@ +# The Germline Genomics of Cancer (G2C) +# Copyright (c) 2023-Present, Noah Fields and the Dana-Farber Cancer Institute +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 + +# This code will gather germline and somatic convergence. + + +version 1.0 + +import "https://raw.githubusercontent.com/vanallenlab/pancan_germline_wgs/infer_sex_ancestry/wdl/Coding_Convergence.wdl" as GSC_CC + +# The workflow gathers germline-somatic convergence for all combinations. 
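+#
+# A sketch of the matrix Write_tsv assembles below (gene, SNP, and patient names
+# are hypothetical): one row per patient; coding germline ("-g") and somatic
+# ("-s") genes are scored 0/1, noncoding germline SNPs 0/1/2 by risk-allele
+# dosage, and noncoding somatic hotspots ("-ncs") by mutation count:
+#
+#   patient_id ATM-g TP53-s rs1234 TERT-ncs
+#   PT-0001    1     0      2      3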
+workflow Convergence {
+  input{
+    Array[File] germline_noncoding_vcfs
+    Array[File] somatic_noncoding_vcfs
+    Array[File] somatic_noncoding_vcf_idxs
+    Array[File] somatic_tsv_arr
+    Array[String] patient_ids
+    File germline_merged_vep_vcf
+    File germline_somatic_table
+    File noncoding_germline_somatic_table
+    File somatic_noncoding_hotspots_bed
+    String cancer_type
+  }
+
+  scatter(patient_index in range(length(patient_ids))){
+    call GSC_CC.convergence {
+      input:
+        id = patient_ids[patient_index],
+        cancer_type = cancer_type,
+        somatic_tsv = somatic_tsv_arr[patient_index],
+        germline_merged_vep_vcf = germline_merged_vep_vcf,
+        germline_somatic_table = germline_somatic_table
+    }
+
+    call Extract_Noncoding_Germline_SNPs {
+      input:
+        patient_id = patient_ids[patient_index],
+        cancer_type = cancer_type,
+        germline_noncoding_vcf = germline_noncoding_vcfs[patient_index],
+        noncoding_germline_somatic_table = noncoding_germline_somatic_table
+    }
+
+    call Extract_Somatic_Noncoding_Mutations {
+      input:
+        somatic_noncoding_vcf = somatic_noncoding_vcfs[patient_index],
+        somatic_noncoding_vcf_idx = somatic_noncoding_vcf_idxs[patient_index],
+        somatic_noncoding_hotspots_bed = somatic_noncoding_hotspots_bed,
+        patient_id = patient_ids[patient_index],
+        cancer_type = cancer_type
+    }
+
+    call Concatenate_Strings {
+      input:
+        coding_str = convergence.convergence,
+        germline_noncoding_str = Extract_Noncoding_Germline_SNPs.out,
+        somatic_noncoding_str = Extract_Somatic_Noncoding_Mutations.out
+    }
+  }
+
+  call Write_tsv {
+    input:
+      cancer_type = cancer_type,
+      patients_info = Concatenate_Strings.out,
+      germline_somatic_table = germline_somatic_table,
+      noncoding_germline_somatic_table = noncoding_germline_somatic_table,
+      somatic_noncoding_hotspots_bed = somatic_noncoding_hotspots_bed
+  }
+
+  output{
+    File out = Write_tsv.out
+  }
+}
+
+
+# Task to Extract Noncoding Germline SNPs
+task Extract_Noncoding_Germline_SNPs {
+  input {
+    File germline_noncoding_vcf              # Germline VCF file
+    File noncoding_germline_somatic_table    # Table defining noncoding germline somatic regions
+    String cancer_type                       # Type of cancer being analyzed
+    String patient_id                        # Unique identifier for the sample
+  }
+
+  command <<<
+    # Create an empty file to store extracted SNPs
+    touch ~{patient_id}.vars.txt
+
+    # Extract SNP IDs from the noncoding germline somatic table for the specified cancer type
+    cut -f1,2 < ~{noncoding_germline_somatic_table} | tail -n +2 | grep -i '~{cancer_type}' | cut -f2 | sort | uniq > germline_noncoding.list
+
+    # Initialize an empty string to store SNP information
+    out=""
+
+    # Extract CHROM, POS, REF, ALT, and GT fields from the VCF file.
+    # NOTE: records are written space-separated so the "$snp HOM"/"$snp HET"
+    # greps below match; the split handles both unphased (/) and phased (|) GTs.
+    bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\t[%GT]\n' ~{germline_noncoding_vcf} | awk 'BEGIN {FS="\t"} {
+      split($5, a, /[\/|]/)
+      if (a[1] == "1" && a[2] == "1") {
+        print $1":"$2"-"$4, "HOM"
+      } else if (a[1] == "0" && a[2] == "1") {
+        print $1":"$2"-"$3, "HET"
+        print $1":"$2"-"$4, "HET"
+      } else if (a[1] == "1" && a[2] == "0") {
+        print $1":"$2"-"$3, "HET"
+        print $1":"$2"-"$4, "HET"
+      } else if (a[1] == "0" && a[2] == "0") {
+        print $1":"$2"-"$3, "HOM"
+      }
+    }' > query.txt
+
+    # Keep only the query.txt records that match a SNP of interest
+    grep -Fwf germline_noncoding.list query.txt > ~{patient_id}.vars.txt
+
+    # Loop through each SNP in the SNP list
+    while IFS= read -r snp; do
+      # Check if SNP is homozygous for the risk allele
+      if [ $(grep "$snp HOM" ~{patient_id}.vars.txt | awk 'END {print NR}') -gt 0 ]; then
+        out="${out} 2" # Append '2' to
the output string
+      # Check if SNP is heterozygous
+      elif [ $(grep "$snp HET" ~{patient_id}.vars.txt | awk 'END {print NR}') -gt 0 ]; then
+        out="${out} 1" # Append '1' to the output string
+      else
+        out="${out} 0" # Append '0' to the output string
+      fi
+    done < germline_noncoding.list
+
+    # Write the output string to a file
+    echo -e "$out" > ~{patient_id}.txt
+  >>>
+
+  output {
+    File out = "~{patient_id}.txt"
+  }
+  runtime {
+    docker: "vanallenlab/bcftools"
+  }
+}
+
+# Task to Extract Somatic Mutations in Noncoding Regions
+task Extract_Somatic_Noncoding_Mutations {
+  input {
+    File somatic_noncoding_vcf              # Somatic VCF file
+    File somatic_noncoding_vcf_idx          # Index file for the somatic VCF
+    File somatic_noncoding_hotspots_bed     # BED file defining noncoding regions
+    String patient_id                       # Unique identifier for the patient (tumor(s) for each patient are previously merged into one VCF)
+    String cancer_type                      # Type of cancer the patient has
+  }
+  command <<<
+    # Filter the hotspots BED file to regions relevant to the specified cancer type
+    grep -i '~{cancer_type}' ~{somatic_noncoding_hotspots_bed} | sort > somatic_noncoding_hotspots.bed
+
+    # Initialize an empty string to store somatic mutation counts
+    somatic_noncoding_str=""
+
+    # Iterate through each noncoding region
+    while IFS=$'\t' read -r CHROM START END TISSUE GENE; do
+
+      # Count somatic mutations within the region using bcftools
+      somatic_mutation_count=$(bcftools view -H -r $CHROM:$START-$END ~{somatic_noncoding_vcf} | wc -l)
+
+      # Append the mutation count to the somatic mutations string
+      somatic_noncoding_str="$somatic_noncoding_str $somatic_mutation_count"
+    done < somatic_noncoding_hotspots.bed
+
+    # Write somatic mutation counts to a text file
+    echo "$somatic_noncoding_str" > noncoding_somatic_mutation_counts.txt
+  >>>
+
+  output {
+    File out = "noncoding_somatic_mutation_counts.txt"
+  }
+  runtime {
+    docker: "vanallenlab/bcftools"
+  }
+}
+
+# Task to assemble the final convergence matrix across all patients
+task Write_tsv{
+  input{
+    Array[File] patients_info
+    File germline_somatic_table
+    File noncoding_germline_somatic_table
+    File somatic_noncoding_hotspots_bed
+    String cancer_type
+  }
+
+  command <<<
+    # Build the column lists: coding germline/somatic genes, noncoding SNPs, and hotspot regions
+    cut -f1,2,3 < ~{germline_somatic_table} | tail -n +2 | grep -v 'noncoding' | grep -i '~{cancer_type}' | cut -f2 | sort | uniq > germline_coding.list
+    cut -f1,7,8 < ~{germline_somatic_table} | tail -n +2 | grep -v 'noncoding' | grep -i '~{cancer_type}' | cut -f2 | sort | uniq > somatic_coding.list
+    cut -f1,2 < ~{noncoding_germline_somatic_table} | tail -n +2 | grep -i '~{cancer_type}' | cut -f2 | sort | uniq > germline_noncoding.list
+    grep -i '~{cancer_type}' ~{somatic_noncoding_hotspots_bed} | sort | cut -f5 > noncoding_somatic_regions.list
+
+    header="patient_id"
+    while IFS= read -r gene; do
+      header="${header} $gene-g"
+    done < germline_coding.list
+    while IFS= read -r gene; do
+      header="${header} $gene-s"
+    done < somatic_coding.list
+    while IFS= read -r snp; do
+      header="${header} $snp"
+    done < germline_noncoding.list
+    while IFS= read -r gene_region; do
+      header="${header} $gene_region-ncs"
+    done < noncoding_somatic_regions.list
+
+    echo "$header" > ~{cancer_type}.tsv
+    cat ~{write_lines(patients_info)} >> patient_files.list
+    while IFS= read -r file; do
+      cat "$file" >> ~{cancer_type}.tsv
+    done < patient_files.list
+  >>>
+  runtime {
+    docker: "ubuntu:latest"
+  }
+  output {
+    File out = "~{cancer_type}.tsv"
+  }
+}
+
+task Concatenate_Strings {
+  input{
+    File
coding_str + File germline_noncoding_str + File somatic_noncoding_str + } + command <<< + str1=$(cat ~{coding_str}) + str2=$(cat ~{germline_noncoding_str}) + str3=$(cat ~{somatic_noncoding_str}) + echo "$str1$str2$str3" > out.txt + >>> + runtime{ + docker: "ubuntu:latest" + } + output { + File out = "out.txt" + } +} + diff --git a/wdl/Infer_Sex_and_Ancestry.92.wdl b/wdl/Infer_Sex_and_Ancestry.92.wdl new file mode 100644 index 0000000..e357b54 --- /dev/null +++ b/wdl/Infer_Sex_and_Ancestry.92.wdl @@ -0,0 +1,127 @@ +# The Germline Genomics of Cancer (G2C) +# Copyright (c) 2023-Present, Noah Fields and the Dana-Farber Cancer Institute +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 + +#The purpose of this code is to infer sex and ancestry +#from variant call data. + + +version 1.0 + +task get_chr_count { + input { + File counts_coverage_file + } + + command <<< + Rscript /opt/pancan_germline_wgs/scripts/ufc_helpers/sex_infer.R ~{counts_coverage_file} > log.txt + + head -n1 log.txt | cut -f1 -d'-' > chrX.txt + head -n1 log.txt | cut -f2 -d'-' > chrY.txt + >>> + + runtime { + docker: "vanallenlab/g2c_pipeline:nf_sex_ancestry_alpha" + memory: "8 GB" # Adjust the amount based on your requirements + } + output { + Int chrX = read_int("chrX.txt") + Int chrY = read_int("chrY.txt") + File output_log = "log.txt" + } +} + +task organize { + input { + File raw_ancestry_file + String sample_ID + Int ancestryID + String ancestry + Int chrX_count + Int chrY_count + } + + command <<< + /opt/pancan_germline_wgs/scripts/ufc_helpers/organize_results.py --file ~{raw_ancestry_file} --ancestry ~{ancestry} --ancestryID ~{ancestryID} --chrX_count ~{chrX_count} --chrY_count ~{chrY_count} > ~{sample_ID}_demo.txt + >>> + + runtime { + docker: "vanallenlab/g2c_pipeline:nf_sex_ancestry_alpha" + memory: "8 GB" # Adjust the amount based on your requirements + } + output { + File output_file = "~{sample_ID}_demo.txt" + } +} + +task infer_ancestry{ + input { + File gvcf_file + #Int memory_var = 60 + #Int? bootDiskSizeGb_var + String sample_ID + } + Int memory_var = ceil(25 * size(gvcf_file, "GB")) + Int bootDiskSizeGb_var = ceil(6 * size(gvcf_file, "GB")+10) + + command <<< + cp /AncInferSNPs.txt . 
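+    # grafpop appears to expect its SNP panel (AncInferSNPs.txt, copied above)
+    # in the working directory; judging from the call below, its usage is
+    #   grafpop <input (g)VCF> <raw ancestry table>   (an inference, not documented here)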
+ grafpop ~{gvcf_file} ~{sample_ID}_ancestry_raw.txt + /opt/infer_ancestry.py ~{sample_ID}_ancestry_raw.txt > tmp.txt #outputs to ancestryID.txt and ancestry.txt + + # Extract the first line and save it to 'ancestry.txt' + head -n 1 tmp.txt > ancestry.txt + + # Extract the second line and save it to 'ancestryID.txt' + tail -n 1 tmp.txt > ancestryID.txt + + >>> + runtime{ + docker: "vanallenlab/g2c_pipeline:nf_graf_plink_alpha" + memory: "~{memory_var} GB" # Adjust the amount based on your requirements + bootDiskSizeGb: bootDiskSizeGb_var # Adjust the amount based on your requirements + } + output{ + Int Ancestry_ID = read_int("ancestryID.txt") + String Ancestry = read_string("ancestry.txt") + File grafpop_output_file = "~{sample_ID}_ancestry_raw.txt" + + + } +} + +# Define workflow +workflow infer_sex_ancestry { + + input { + File counts_coverage_file + File gvcf_file + String sample_ID + + } + + call get_chr_count{ + input: + counts_coverage_file = counts_coverage_file, + } + call infer_ancestry{ + input: + gvcf_file = gvcf_file, + sample_ID = sample_ID + } + call organize{ + input: + raw_ancestry_file = infer_ancestry.grafpop_output_file, + sample_ID = sample_ID, + ancestryID = infer_ancestry.Ancestry_ID, + ancestry = infer_ancestry.Ancestry, + chrX_count = get_chr_count.chrX, + chrY_count = get_chr_count.chrY, + } + + output{ + File demographics_file = organize.output_file + } +} + diff --git a/wdl/Vep.wdl b/wdl/Vep.wdl index cc67dcb..6278d20 100644 --- a/wdl/Vep.wdl +++ b/wdl/Vep.wdl @@ -27,7 +27,7 @@ workflow Vep { Array[File?] other_vep_files # All other files needed for VEP. These will be localized in full to each VM and moved to execution directory. Array[String] vep_options = [""] - String vep_assembly = "GRCh38" + String vep_assembly = "GRCh37" Int vep_version = 110 Boolean shard_vcfs = true diff --git a/wdl/Vep_hg19.wdl b/wdl/Vep_hg19.wdl new file mode 100644 index 0000000..a0a2277 --- /dev/null +++ b/wdl/Vep_hg19.wdl @@ -0,0 +1,149 @@ +# The Germline Genomics of Cancer (G2C) +# Copyright (c) 2023-Present, Ryan L. Collins and the Dana-Farber Cancer Institute +# Contact: Ryan Collins +# Distributed under the terms of the GNU GPL v2.0 + +# WDL to run VEP on one or more input VCFs + + +version 1.0 + + +workflow Vep { + input { + Array[File] vcfs + Array[File] vcf_idxs + + File reference_fasta + File vep_cache_tarball # VEP cache tarball downloaded from Ensembl + Array[String?] vep_remote_files # URIs for files stored in Google buckets that can be remotely sliced using tabix for each shard + Array[File?] vep_remote_file_indexes # Indexes corresponding to vep_remote_files + Array[File?] other_vep_files # All other files needed for VEP. These will be localized in full to each VM and moved to execution directory. + + Array[String] vep_options = [""] + String vep_assembly = "GRCh37" + Int vep_version = 110 + + Boolean shard_vcfs = false + Int remote_query_buffer = 2 + String? cohort_prefix + + String bcftools_docker + String vep_docker = "vanallenlab/g2c-vep:latest" + } + + Int n_remote_files = length(select_all(vep_remote_files)) + Boolean any_remote = n_remote_files > 0 + + + + Array[File?] 
all_other_vep_files = flatten(select_all([vep_remote_files, vep_remote_file_indexes])) + + call RunVep { + input: + vcf = vcfs[0], + vcf_idx = vcf_idxs[0], + reference_fasta = reference_fasta, + vep_cache_tarball = vep_cache_tarball, + other_vep_files = all_other_vep_files, + vep_options = vep_options, + vep_assembly = vep_assembly, + vep_version = vep_version, + docker = vep_docker + } + + + output { + File annotated_vcfs = RunVep.annotated_vcf + File annotated_vcf_idxs = RunVep.annotated_vcf_idx + } +} + + + +task RunVep { + input { + File vcf + File vcf_idx + + File reference_fasta + File vep_cache_tarball + Array[File?] other_vep_files + String vep_assembly + + Array[String] vep_options + Int vep_max_sv_size = 50 + Int vep_version = 110 + + Float mem_gb = 7.5 + Int n_cpu = 4 + Int? disk_gb + + String docker + } + + String out_filename = basename(vcf, ".vcf.gz") + ".vep.vcf.gz" + Int default_disk_gb = ceil(10 * size([vcf, vep_cache_tarball, reference_fasta], "GB")) + 50 + + command <<< + set -eu -o pipefail + + # Unpack contents of cache into $VEP_CACHE/ + # Note that $VEP_CACHE is a default ENV variable set in VEP docker + tar -xzvf ~{vep_cache_tarball} -C $VEP_CACHE/ + + # Relocate other_vep_files to execution directory + if [ ~{defined(other_vep_files)} == "true" ]; then + while read file; do + mv $file ./ + done < ~{write_lines(select_all(other_vep_files))} + fi + + #Removed '--nearest gene \' + #Removed '' + vep \ + --input_file ~{vcf} \ + --format vcf \ + --output_file ~{out_filename} \ + --vcf \ + --compress_output bgzip \ + --verbose \ + --force_overwrite \ + --species homo_sapiens \ + --assembly ~{vep_assembly} \ + --max_sv_size ~{vep_max_sv_size} \ + --offline \ + --cache \ + --dir_cache $VEP_CACHE/ \ + --cache_version ~{vep_version} \ + --dir_plugins $VEP_PLUGINS/ \ + --fasta ~{reference_fasta} \ + --minimal \ + --distance 10000 \ + --numbers \ + --hgvs \ + --no_escape \ + --symbol \ + --canonical \ + --domains \ + --merged \ + ~{sep=" " vep_options} + + tabix -f ~{out_filename} + + >>> + + output { + File annotated_vcf = "~{out_filename}" + File annotated_vcf_idx = "~{out_filename}.tbi" + } + + runtime { + docker: docker + memory: mem_gb + " GB" + cpu: n_cpu + disks: "local-disk " + select_first([disk_gb, default_disk_gb]) + " HDD" + bootDiskSizeGb: 25 + preemptible: 3 + } +} \ No newline at end of file diff --git a/wdl/WgsFastqToCram.wdl b/wdl/WgsFastqToCram.wdl index e3751d4..78fe90d 100644 --- a/wdl/WgsFastqToCram.wdl +++ b/wdl/WgsFastqToCram.wdl @@ -58,6 +58,7 @@ workflow WgsFastqToCram { Float? bwa_mem_gb Int? bwa_num_cpu Int? bwa_disk_size + Int? 
gather_bam_files_disk_size } # Clean up FASTQs with fastq-pair @@ -248,7 +249,7 @@ workflow WgsFastqToCram { output_bam_basename = sample_name, docker_image = gatk_docker, gatk_path = gatk_path, - disk_size = ceil(3 * size(AddRg.bam_wRG, "GB")) + 10, + disk_size = select_first([gather_bam_files_disk_size, ceil(3 * size(AddRg.bam_wRG, "GB")) + 10]), preemptible_tries = 5, compression_level = 2 } diff --git a/wdl/aou_charr.wdl b/wdl/aou_charr.wdl new file mode 100644 index 0000000..acf83af --- /dev/null +++ b/wdl/aou_charr.wdl @@ -0,0 +1,41 @@ +version 1.0 + +import "charr.wdl" as charr + +workflow aou_charr { + input { + File samples_file + String vcf_directory + String cohort + } + + scatter (sample_id in read_lines(samples_file)) { + call charr.charr { + input: + sample_id = sample_id, + vcf_directory = vcf_directory + } + + call cp_file { + input: + input_file = charr.output_file, + cohort = cohort, + sample_id = sample_id + } + } +} + +task cp_file { + input { + File input_file + String cohort + String sample_id + } + + command { + gsutil -m cp ~{input_file} gs://fc-secure-d21aa6b0-1d19-42dc-93e3-42de3578da45/dfci-g2c-inputs/aou/~{cohort}/charr/~{sample_id}.txt + } + runtime{ + docker: "us.gcr.io/google.com/cloudsdktool/google-cloud-cli:latest" + } +} diff --git a/wdl/charr.wdl b/wdl/charr.wdl new file mode 100644 index 0000000..408a1ad --- /dev/null +++ b/wdl/charr.wdl @@ -0,0 +1,163 @@ +# The Germline Genomics of Cancer (G2C) +# Copyright (c) 2023-Present, Noah Fields and the Dana-Farber Cancer Institute +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 + +# The purpose of this code is to use CHARR, a +# contamination estimator which leverages the +# infiltration of reference reads within homozygous +# alternate variant calls. +# https://github.com/atgu/CHARR +# We also use echtvar for annotation of gVCF +# https://github.com/brentp/echtvar + + +version 1.0 + +task decompose { + input { + File reblocked_gvcf #g.vcf.gz file + Int? disk_gb + Float memory_var = 3.5 + } + Int default_disk_size = ceil((2 * size(reblocked_gvcf, "GB")) + 10) + + command <<< + bcftools norm -m - ~{reblocked_gvcf} -o clean_bcf.bcf + >>> + + runtime { + docker: "vanallenlab/charr" + memory: "~{memory_var} GB" # Adjust the amount based on your requirements + disks: "local-disk " + select_first([disk_gb, default_disk_size]) + " HDD" + } + output { + File clean_bcf = "clean_bcf.bcf" + } +} + +#We use echtvar to annotate +task annotate { + input { + File clean_bcf + File gnomad_ref + Int? disk_gb + Float memory_var = 3.5 + } + Int default_disk_size = ceil((2 * size(clean_bcf, "GB")) + 10) + + command <<< + echtvar anno -e ~{gnomad_ref} ~{clean_bcf} output.annotated.bcf + >>> + + runtime { + docker: "vanallenlab/charr" + memory: "~{memory_var} GB" # Adjust the amount based on your requirements + disks: "local-disk " + select_first([disk_gb, default_disk_size]) + " HDD" + } + output { + File annotated_bcf = "output.annotated.bcf" + } +} +task step1 { + input { + File gvcf #g.vcf.gz file + Int? 
disk_gb + Float memory_var = 3.5 + } + Int default_disk_size = ceil((2 * size(gvcf, "GB")) + 10) + + command <<< + bcftools view -i 'ALT!="" & SUM(FORMAT/AD) > 0' ~{gvcf} > step1.annotated.bcf + >>> + + runtime { + docker: "vanallenlab/charr" + memory: "~{memory_var} GB" # Adjust the amount based on your requirements + disks: "local-disk " + select_first([disk_gb, default_disk_size]) + " HDD" + } + output { + File output_bcf = "step1.annotated.bcf" + } +} + +task step2 { + input { + File gvcf #g.vcf.gz file + Int? disk_gb + Float memory_var = 3.5 + } + Int default_disk_size = ceil((2 * size(gvcf, "GB")) + 10) + + + command <<< + bcftools annotate -c INFO/AF:=INFO/gnomad_popmax_af ~{gvcf} > step2.annotated.bcf + >>> + + runtime { + docker: "vanallenlab/charr" + memory: "~{memory_var} GB" # Adjust the amount based on your requirements + disks: "local-disk " + select_first([disk_gb, default_disk_size]) + " HDD" + } + output { + File output_bcf = "step2.annotated.bcf" + } +} + +task contamination { + input { + File gvcf #g.vcf.gz file + Int? disk_gb + Float memory_var = 3.5 + } + + Int default_disk_size = ceil((2 * size(gvcf, "GB")) + 10) + + command <<< + sceVCF -o contamination.txt ~{gvcf} + >>> + + runtime { + docker: "vanallenlab/charr" + memory: "~{memory_var} GB" # Adjust the amount based on your requirements + disks: "local-disk " + select_first([disk_gb, default_disk_size]) + " HDD" + } + output { + File charr_output = "contamination.txt" + } +} + +# Define workflow +workflow charr { + input { + File gnomad_ref = "gs://dfci-g2c-refs/echtvar/gnomad.v3.1.2.echtvar.popmax.v2.zip" + File reblocked_gvcf + } + + + call decompose{ + input: + reblocked_gvcf = reblocked_gvcf + } + call annotate{ + input: + clean_bcf = decompose.clean_bcf, + gnomad_ref = gnomad_ref + } + call step1{ + input: + gvcf = annotate.annotated_bcf + } + call step2{ + input: + gvcf = step1.output_bcf + } + call contamination{ + input: + gvcf = step2.output_bcf + } + + output{ + File charr_output = contamination.charr_output + } +} \ No newline at end of file diff --git a/wdl/convergence.wdl b/wdl/convergence.wdl new file mode 100644 index 0000000..5614efa --- /dev/null +++ b/wdl/convergence.wdl @@ -0,0 +1,224 @@ +# The Germline Genomics of Cancer (G2C) +# Copyright (c) 2023-Present, Noah Fields and the Dana-Farber Cancer Institute +# Contact: Noah Fields +# Distributed under the terms of the GNU GPL v2.0 + +version 1.0 + +# Analysis of germline variants and somatic drivers in coding regions +# This workflow identifies pathogenic and high-impact variants in germline data, as well as somatic driver mutations in coding regions. 
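+#
+# Gene-selection rule, summarized from the bash implementation of this workflow
+# in Coding_Convergence.gsc.wdl (restated here for readability, not new logic):
+#   1. Any ClinVar Pathogenic/Likely_pathogenic variant          -> report gene
+#   2. TSG or mutation type D/N/F only: needs a HIGH-impact hit  -> report gene
+#   3. Oncogene or mutation type Mis only: needs a MODERATE hit  -> report gene
+#   Both or neither of 2/3: any HIGH or MODERATE hit             -> report gene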
+ +# Task to extract coding regions from a somatic table +task Extract_Coding_Regions { + input { + File germline_somatic_table + String cancer_type + } + command <<< + #Only interested in coding variants + cut -f1,2,3,4 < ~{germline_somatic_table} | tail -n +2 | grep -v 'noncoding' | grep -v 'UNK' | grep -i '~{cancer_type}' | cut -f2 | sort | uniq > germline_coding.list + cut -f1,7,8 < ~{germline_somatic_table} | tail -n +2 | grep -v 'noncoding' | grep -i '~{cancer_type}' | cut -f2 | sort | uniq > somatic_coding.list + >>> + runtime { + docker: "ubuntu:latest" + preemptible: 3 + } + output { + File germline_coding_list = "germline_coding.list" + File somatic_coding_list = "somatic_coding.list" + } +} + +# Task to extract pathogenic and high-impact germline variants +task Extract_Germline_Variants { + input { + File germline_merged_vep_vcf + File germline_somatic_table + File germline_genes + String patient_id + } + + command <<< + set -eux -o pipefail + + # Extract variants present only in the germline data + bcftools view -s ~{patient_id} ~{germline_merged_vep_vcf} -Oz -o sample.vcf.gz + bcftools view -i 'AC>0 & GT="alt"' sample.vcf.gz -Oz -o germline_only.vcf.gz + rm sample.vcf.gz + echo "Checkpoint 1" + + # Split VEP annotations and filter for relevant information + bcftools +split-vep -f '%CHROM|%POS|%CSQ/SYMBOL|%CSQ/IMPACT|%CSQ/gnomAD_AF_nfe|%CSQ/gnomAD_AF_popmax|%CSQ/gnomAD_controls_AF_popmax|%CSQ/ClinVar_external_CLNSIG|%CSQ/Consequence' germline_only.vcf.gz | cut -d'|' -f1,2,5,6,39-44 | awk '{gsub(/\|/, "\t");print}' | cut -d',' -f1 | awk -F'\t' '($5 == "Pathogenic" || $5 == "Likely_pathogenic" || $9 < 0.02 || $9 == "." || $9 == "") &&($5 == "Pathogenic" || $5 == "Likely_pathogenic" || $7 < 0.02 || $7 == "." || $7 == "") && ($5 == "Pathogenic" || $5 == "Likely_pathogenic" || $8 < 0.02 || $8 == "." 
|| $8 == "")' | grep -Ev 'Benign|Likely_benign|LOW' | grep -Fwf ~{germline_genes} > query.tsv + rm germline_only.vcf.gz + + echo "Checkpoint 2" + # Extract potential deleterious germline genes + cut -f4 query.tsv | sort | uniq > potential_deleterious_germline_genes.list + touch c1.list + touch c2.list + touch c3.list + touch pathogenic_germline_genes.list + + python3 <>> + + runtime { + docker: "vanallenlab/g2c_pipeline" + preemptible: 3 + } + output{ + File out = "~{patient_id}.txt" + } +} + +# Extract somatic driver mutations based on somatic genes +task Extract_Somatic_Drivers { + input { + File somatic_tsv + File somatic_genes + String patient_id + } + command <<< + # Extract somatic driver mutations based on somatic genes + cat ~{somatic_tsv} | cut -f3 | grep -Fwf ~{somatic_genes} | sort | uniq >> ~{patient_id}.somatic_drivers.txt + >>> + + runtime { + docker: "ubuntu:latest" + preemptible: 3 + } + output{ + File out = "~{patient_id}.somatic_drivers.txt" + } + +} + +task Output_Mutations { + input { + File germline_variants + File somatic_drivers + String patient_id + File germline_genes + File somatic_genes + } + + command <<< + # Initialize output string with patient ID + output="~{patient_id}" + + # Check each germline gene for pathogenic variants + while IFS=$'\t' read -r germline_gene; do + if [ $(grep -F "$germline_gene" ~{germline_variants} | awk 'END {print NR}' ) -gt 0 ]; then + output="${output} 1" + else + output="${output} 0" + fi + done < ~{germline_genes} + + # Check each somatic gene for driver mutations + while IFS=$'\t' read -r somatic_gene; do + if [ $(grep -F "$somatic_gene" ~{somatic_drivers} | awk 'END {print NR}' ) -gt 0 ]; then + output="${output} 1" + else + output="${output} 0" + fi + done < ~{somatic_genes} + + # Write output string to file + echo "$output" > ~{patient_id}_info.txt + >>> + + runtime { + docker: "ubuntu:latest" + preemptible: 3 + } + output{ + File out = "~{patient_id}_info.txt" + } +} + + +# Define workflow +workflow convergence { + input{ + File germline_merged_vep_vcf # Merged VCF file annotated with VEP + File germline_somatic_table # Germline-Somatic table file containing convergence pairs + File somatic_tsv # Somatic TSV file containing mutation information + String patient_id # Unique identifier for the patient + String cancer_type # Type of cancer being analyzed + } + call Extract_Coding_Regions { + input: + germline_somatic_table = germline_somatic_table, + cancer_type = cancer_type + } + call Extract_Germline_Variants { + input: + germline_merged_vep_vcf = germline_merged_vep_vcf, + germline_genes = Extract_Coding_Regions.germline_coding_list, + germline_somatic_table = germline_somatic_table, + patient_id = patient_id + } + call Extract_Somatic_Drivers { + input: + somatic_genes = Extract_Coding_Regions.somatic_coding_list, + somatic_tsv = somatic_tsv, + patient_id = patient_id + } + call Output_Mutations { + input: + patient_id = patient_id, + somatic_drivers = Extract_Somatic_Drivers.out, + germline_variants = Extract_Germline_Variants.out, + germline_genes = Extract_Coding_Regions.germline_coding_list, + somatic_genes = Extract_Coding_Regions.somatic_coding_list + } + output{ + File convergence = Output_Mutations.out + } +} diff --git a/wdl/generate-sample-map.3.wdl b/wdl/generate-sample-map.3.wdl new file mode 100644 index 0000000..c345108 --- /dev/null +++ b/wdl/generate-sample-map.3.wdl @@ -0,0 +1,95 @@ +version 1.0 +## Copyright Broad Institute, 2019 +## Purpose: +## Generate a sample_map file, which can be used for 
diff --git a/wdl/generate-sample-map.3.wdl b/wdl/generate-sample-map.3.wdl
new file mode 100644
index 0000000..c345108
--- /dev/null
+++ b/wdl/generate-sample-map.3.wdl
@@ -0,0 +1,95 @@
+version 1.0
+## Copyright Broad Institute, 2019
+##
+## Purpose:
+## Generate a sample_map file, which can be used for the JointGenotyping workflow
+##
+## Requirements/expectations:
+##   - An array of file paths
+##   - An array of sample names
+##   - Name of output sample_map
+##
+## Outputs:
+##   - Sample map file
+##
+## Cromwell version support
+## - Successfully tested on v47
+## - Does not work on versions < v23 due to output syntax
+##
+## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
+##
+## LICENSING:
+## This script is released under the WDL source code license (BSD-3) (see LICENSE in
+## https://github.com/broadinstitute/wdl). Note however that the programs it calls may
+## be subject to different licenses. Users are responsible for checking that they are
+## authorized to run all programs before running this script. Please see the dockers
+## for detailed licensing information pertaining to the included programs.
+
+# WORKFLOW DEFINITION
+workflow GenerateSampleMap {
+  input {
+    Array[String] sample_names
+    Array[String] file_paths
+    String sample_map_name
+  }
+
+  call GenerateSampleMapFile {
+    input:
+      sample_names = sample_names,
+      file_paths = file_paths,
+      outfile = sample_map_name + ".sample_map"
+  }
+
+  output {
+    File sample_map = GenerateSampleMapFile.sample_map
+  }
+}
+
+# TASK DEFINITIONS
+task GenerateSampleMapFile {
+  input {
+    # Command parameters
+    Array[String] sample_names
+    Array[String] file_paths
+    String outfile
+
+    # Runtime parameters
+    String docker = "python:latest"
+    Int machine_mem_gb = 7
+    Int disk_space_gb = 100
+    Int preemptible_attempts = 3
+  }
+  command <<<
+    set -eo pipefail
+
+    python << CODE
+    file_paths = ['~{sep="','" file_paths}']
+    sample_names = ['~{sep="','" sample_names}']
+
+    if len(file_paths) != len(sample_names):
+        print("Number of file paths does not equal number of sample names")
+        exit(1)
+
+    with open("sample_map_file.txt", "w") as fi:
+        for i in range(len(file_paths)):
+            fi.write(sample_names[i] + "\t" + file_paths[i] + "\n")
+    CODE
+
+    mv sample_map_file.txt ~{outfile}
+  >>>
+
+  runtime {
+    docker: docker
+    memory: machine_mem_gb + " GB"
+    disks: "local-disk " + disk_space_gb + " HDD"
+    preemptible: preemptible_attempts
+  }
+
+  output {
+    File sample_map = outfile
+  }
+}
\ No newline at end of file
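The task's heredoc reduces to the standalone sketch below (sample names and GCS paths are hypothetical). The resulting sample_map is the two-column, tab-separated name-to-path file that joint-genotyping tools generally expect:

    python3 - <<'CODE'
    # Mirrors the task body above with hypothetical inputs
    file_paths = ['gs://my-bucket/sampleA.g.vcf.gz', 'gs://my-bucket/sampleB.g.vcf.gz']
    sample_names = ['sampleA', 'sampleB']

    if len(file_paths) != len(sample_names):
        raise SystemExit("Number of file paths does not equal number of sample names")

    with open('cohort.sample_map', 'w') as fi:
        for name, path in zip(sample_names, file_paths):
            fi.write(name + '\t' + path + '\n')
    CODE

    cat cohort.sample_map
    # sampleA	gs://my-bucket/sampleA.g.vcf.gz
    # sampleB	gs://my-bucket/sampleB.g.vcf.gz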