David/fig5 #42
base: main
Changes from all commits: 5ad1ef6, af09694, 785e530, 8eca000, 1af3ad6, 87a028f
@@ -0,0 +1,42 @@
# AUC-PR vs Dataset Size Plot

## Description
The `auc_pr_vs_dataset_size.py` script generates a plot that visualizes the relationship between the Area Under the Precision-Recall Curve (AUC-PR) and the dataset size for various prediction methods.

## Usage
Run the script from the command line with the following syntax:

    python auc_pr_vs_dataset_size.py --dataset-dir <dataset_directory> --dataset-name <dataset_name>

### Arguments:
- `--dataset-dir`: (Optional) The directory containing the dataset file. Default is `'predictions/'`.
- `--dataset-name`: (Optional) The name of the dataset file. Default is `'AGO2_eCLIP_Manakov2022_100_CNN_predictions'`.
- `--manakov-rename-map`: (Optional) A dictionary that maps the original Manakov dataset column names to the desired names for the plot. Default is:
  `{'CNN_Manakov_full': 'Manakov_2,524,246', 'CNN_Manakov_subset_200': 'Manakov_subset_200', 'CNN_Manakov_subset_2k': 'Manakov_subset_2k', 'CNN_Manakov_subset_7720': 'Manakov_subset_7720', 'CNN_Manakov_subset_20k': 'Manakov_subset_20k', 'CNN_Manakov_subset_200k': 'Manakov_subset_200k'}`
- `--method-names`: (Optional) A list of method names to include in the plot. Default is:
  `["random", "Manakov_subset_200", "Manakov_subset_2k", "Manakov_subset_7720", "Manakov_subset_20k", "Manakov_subset_200k", "Manakov_2,524,246"]`

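For example, spelling out the documented defaults explicitly:

    python auc_pr_vs_dataset_size.py --dataset-dir predictions/ --dataset-name AGO2_eCLIP_Manakov2022_100_CNN_predictions
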
## Input File Format
The script expects a TSV file (tab-separated values) with a header row. The file should contain the following columns:
- `label`: The ground truth label column
- Additional columns for each prediction method, where the column name matches the values in the `--method-names` argument.

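A minimal illustrative example of the expected layout (tab-separated; the column names follow the rename map above and the score values are made up):

    label   CNN_Manakov_full   CNN_Manakov_subset_200   CNN_Manakov_subset_2k
    1       0.93               0.71                     0.65
    0       0.12               0.33                     0.41
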
## Dependencies
The script requires the following Python libraries:
- argparse (Python standard library)
- random (Python standard library)
- numpy
- pandas
- matplotlib
- scikit-learn (for `precision_recall_curve` and `auc`)

Make sure you have these libraries installed before running the script.

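The third-party packages are available from PyPI and can be installed, for example, with:

    pip install numpy pandas matplotlib scikit-learn
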
## Output
The script will generate two output files in the `'output/'` directory:
- `<dataset_name>.auc_pr_vs_train_data_size.svg`: A Scalable Vector Graphics (SVG) file containing the AUC-PR vs dataset size plot.
- `<dataset_name>.auc_pr_vs_train_data_size.png`: A Portable Network Graphics (PNG) file containing the AUC-PR vs dataset size plot.
@@ -0,0 +1,97 @@
import argparse
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, auc
from avrg_fp_per_sensitivity import COLOR_PALETTE


def generate_random_predictions(length):
    return [random.uniform(0, 1) for _ in range(length)]

def calculate_auc_pr(predictions, method_names, label_column='label'):
    """Calculates AUC-PR for specified method columns in the predictions DataFrame."""
    auc_values = {}
    for method in method_names:
        if method in predictions.columns and pd.api.types.is_numeric_dtype(predictions[method]):
            precision, recall, _ = precision_recall_curve(predictions[label_column], predictions[method])
            auc_values[method] = auc(recall, precision)
        else:
            print(f"Warning: Method {method} not found or is not numeric.")
    return auc_values

Comment on the `is_numeric_dtype` check above: I wouldn't exclude columns that do not pass the check.
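One way to act on the comment above, shown only as a sketch (not part of the PR): coerce each method column to numeric and drop unparseable rows, rather than skipping the column entirely.

def calculate_auc_pr_coercing(predictions, method_names, label_column='label'):
    """Sketch: like calculate_auc_pr, but coerces columns to numeric instead of skipping them."""
    auc_values = {}
    for method in method_names:
        if method not in predictions.columns:
            print(f"Warning: Method {method} not found.")
            continue
        scores = pd.to_numeric(predictions[method], errors='coerce')  # non-numeric entries become NaN
        mask = scores.notna()
        if not mask.any():
            print(f"Warning: Method {method} has no numeric values.")
            continue
        precision, recall, _ = precision_recall_curve(predictions.loc[mask, label_column], scores[mask])
        auc_values[method] = auc(recall, precision)
    return auc_values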

def plot_auc_vs_dataset_size(auc_values, dataset_sizes, color_palette):
    """Plots AUC-PR against dataset size using a specified color palette."""
    x = [dataset_sizes[method] for method in auc_values.keys()]
    y = [auc_values[method] for method in auc_values.keys()]

    plt.figure(figsize=(5.5, 6))
    for i, method in enumerate(auc_values.keys()):
        plt.scatter(x[i], y[i], color=color_palette[i % len(color_palette)], label=method)
        plt.annotate(method, (x[i], y[i]))

    plt.xlabel("Dataset Size (log10)")
    plt.ylabel("AUC-PR")
    plt.ylim(-0.05, 1)
    plt.grid(False)
    ax = plt.gca()
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

def main(args):
    dataset_path = args.dataset_dir + args.dataset_name

You can avoid this and just pass the path to file as argument and have one argument instead of two, no?
This applies to all 3 scripts.

    predictions = pd.read_csv(f'{dataset_path}.tsv', sep='\t', header=0)
    predictions = predictions.rename(args.manakov_rename_map, axis=1)
    predictions["random"] = generate_random_predictions(predictions.shape[0])

    # Verify label column exists
    if 'label' not in predictions.columns:
        raise ValueError("The dataset must contain a 'label' column for ground truth.")

    # Calculate AUC-PR for each specified method in method_names
    auc_values = calculate_auc_pr(predictions, args.method_names, label_column='label')

    # Dataset sizes (log10 scale for plotting)
    dataset_sizes = {
        'random': np.log10(predictions.shape[0]),
        'Manakov_subset_200': np.log10(200),
        'Manakov_subset_2k': np.log10(2000),
        'Manakov_subset_7720': np.log10(7720),
        'Manakov_subset_20k': np.log10(20000),
        'Manakov_subset_200k': np.log10(200000),
        'Manakov_2,524,246': np.log10(2524246)
    }

Comment on lines +62 to +67: Ideally subset sizes were not hard-coded in case we wanted to add more sizes.

    plot_auc_vs_dataset_size(auc_values, dataset_sizes, COLOR_PALETTE)

    title_suffix = ".auc_pr_vs_train_data_size"
    plt.savefig(f"output/{args.dataset_name}{title_suffix}.svg", format='svg')
    plt.savefig(f"output/{args.dataset_name}{title_suffix}.png", format='png')

Comment on lines +73 to +74: Output dir is hardcoded here, you could use your output dir argument here instead. Same applies to all 3 scripts.

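A sketch of what the comment above suggests, assuming this script also gains an `--output-dir` argument (at present only `avrg_fp_per_sensitivity_threshold.py` defines one):

import os

def save_figures(output_dir, dataset_name, title_suffix=".auc_pr_vs_train_data_size"):
    """Sketch: save the current figure under a configurable directory instead of a hard-coded 'output/'."""
    out_base = os.path.join(output_dir, f"{dataset_name}{title_suffix}")
    plt.savefig(f"{out_base}.svg", format='svg')
    plt.savefig(f"{out_base}.png", format='png')
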
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='AUC-PR vs Dataset Size Plot')
    parser.add_argument('--dataset-dir', type=str, default='predictions/',
                        help='Directory containing the dataset')
    parser.add_argument('--dataset-name', type=str,
                        default='AGO2_eCLIP_Manakov2022_100_CNN_predictions',
                        help='Name of the dataset file')
    parser.add_argument('--manakov-rename-map', type=lambda x: eval(x), default={
        'CNN_Manakov_full': 'Manakov_2,524,246',
        "CNN_Manakov_subset_200": 'Manakov_subset_200',
        "CNN_Manakov_subset_2k": 'Manakov_subset_2k',
        "CNN_Manakov_subset_7720": 'Manakov_subset_7720',
        "CNN_Manakov_subset_20k": 'Manakov_subset_20k',
        "CNN_Manakov_subset_200k": 'Manakov_subset_200k',
    }, help='Mapping for renaming Manakov dataset columns')
    parser.add_argument('--method-names', type=lambda x: eval(x), default=[
        "random", "Manakov_subset_200", "Manakov_subset_2k", "Manakov_subset_7720",
        "Manakov_subset_20k", "Manakov_subset_200k", "Manakov_2,524,246"
    ], help='List of method names for plotting')

Comment on lines +84 to +95: I think you could use the values in the `--manakov-rename-map` here instead of duplicating them as defaults.
This applies to all 3 scripts.

Comment on lines +79 to +95: Decide if you want your arguments to be hyphenated or underscored, because you define them with hyphens (e.g. `--dataset-dir`) but argparse exposes them with underscores (e.g. `args.dataset_dir`).
This applies to all 3 scripts.

    args = parser.parse_args()
    main(args)
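Several of the comments above could be addressed together; the following is an illustrative sketch only (the `--dataset-path` flag and the `parse_dataset_size` helper are not part of the PR): a single path argument replaces `--dataset-dir`/`--dataset-name`, method names are derived from the rename map instead of being duplicated, and subset sizes are parsed from the method names instead of being hard-coded.

import argparse

def parse_dataset_size(method_name, default_size):
    """Parse a trailing size token such as '200', '2k' or '2,524,246' from a method name."""
    token = method_name.rsplit('_', 1)[-1].replace(',', '')
    if token.endswith('k'):
        token = token[:-1] + '000'
    return int(token) if token.isdigit() else default_size

def build_parser(manakov_rename_map):
    parser = argparse.ArgumentParser(description='AUC-PR vs Dataset Size Plot')
    parser.add_argument('--dataset-path', type=str,
                        default='predictions/AGO2_eCLIP_Manakov2022_100_CNN_predictions.tsv',
                        help='Path to the predictions TSV file')
    parser.add_argument('--output-dir', type=str, default='output/',
                        help='Directory for the output figures')
    parser.add_argument('--method-names', nargs='+',
                        default=['random'] + list(manakov_rename_map.values()),
                        help='Method names for plotting (note: accessed as args.method_names)')
    return parser

# Dataset sizes could then be derived in main() instead of being hard-coded, e.g.:
# dataset_sizes = {m: np.log10(parse_dataset_size(m, predictions.shape[0])) for m in args.method_names}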
@@ -0,0 +1,48 @@
# Average Scanning Length per Sensitivity Threshold Plot

## Description
The `avrg_fp_per_sensitivity_threshold.py` script generates a plot that shows the average scanning length (1/FPR) for various prediction models at a specified sensitivity threshold.

## Usage
Run the script from the command line with the following syntax:

    python avrg_fp_per_sensitivity_threshold.py --dataset-dir <dataset_directory> --dataset-name <dataset_name>

### Arguments:
- `--dataset-dir`: (Optional) The directory containing the dataset file. Default is `'predictions/'`.
- `--output-dir`: (Optional) The directory where the output figures and intermediate results will be saved. Default is `'output/'`.
- `--dataset-name`: (Optional) The name of the dataset file. Default is `'AGO2_eCLIP_Manakov2022_100_CNN_predictions'`.
- `--thresholds`: (Optional) A list of desired sensitivity (recall) thresholds, one per method, used for the plot. Default is `[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]`.
- `--manakov-rename-map`: (Optional) A dictionary that maps the original Manakov dataset column names to the desired names for the plot. Default is:
  `{'CNN_Manakov_full': 'Manakov_2,524,246', 'CNN_Manakov_subset_200': 'Manakov_subset_200', 'CNN_Manakov_subset_2k': 'Manakov_subset_2k', 'CNN_Manakov_subset_7720': 'Manakov_subset_7720', 'CNN_Manakov_subset_20k': 'Manakov_subset_20k', 'CNN_Manakov_subset_200k': 'Manakov_subset_200k'}`
- `--method-names`: (Optional) A list of method names to include in the plot. Default is:
  `["random", "Manakov_subset_200", "Manakov_subset_2k", "Manakov_subset_7720", "Manakov_subset_20k", "Manakov_subset_200k", "Manakov_2,524,246"]`

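For example, to use a recall threshold of 0.9 for all seven default methods:

    python avrg_fp_per_sensitivity_threshold.py --thresholds 0.9 0.9 0.9 0.9 0.9 0.9 0.9
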
## Input File Format
The script expects a TSV file (tab-separated values) with a header row. The file should contain the following columns:
- `label`: The ground truth label column
- Additional columns for each prediction method, where the column name matches the values in the `--method-names` argument.

## Dependencies
The script requires the following Python libraries:
- json (Python standard library)
- argparse (Python standard library)
- random (Python standard library)
- pandas
- numpy
- matplotlib
- scikit-learn (for `roc_curve`)

Make sure you have these libraries installed before running the script.

## Output
The script will generate two output files in the `'output/'` directory:
- `<dataset_name>.avrg_fp_per_sensitivity_threshold.svg`: A Scalable Vector Graphics (SVG) file containing the average scanning length per sensitivity threshold plot.
- `<dataset_name>.avrg_fp_per_sensitivity_threshold.png`: A Portable Network Graphics (PNG) file containing the average scanning length per sensitivity threshold plot.

Additionally, the script will save the scanning length results in a JSON file named `'scanning_length_results.json'` in the `'output/'` directory.
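The JSON file maps `<method>_<sensitivity>` keys to the computed scanning lengths in base pairs; an illustrative (made-up) example:

    {
        "random_0.5": 2.0,
        "Manakov_subset_200_0.5": 87.0,
        "Manakov_2,524,246_0.5": 1450.0
    }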
@katarinagresova wanted to see how this was being computed.
@@ -0,0 +1,155 @@
import json
import argparse
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve


COLOR_PALETTE = ['#E69F00', '#56B4E9', '#009E73', '#F0E442', '#CC79A7', '#0072B2', '#D55E00', '#000000']


def generate_random_predictions(length):
    return [random.uniform(0, 1) for _ in range(length)]

def calculate_scanning_length_at_sensitivity(predictions, labels, desired_sensitivity):
    """
    Calculate the average scanning length (1/FPR) at a given sensitivity threshold.
    It finds the FPR corresponding to the desired sensitivity and returns 1/FPR.
    """
    fpr, tpr, thresholds = roc_curve(labels, predictions)

    # Find the threshold corresponding to the desired sensitivity.
    # TPR is sorted in increasing order, so take the first point where TPR >= desired_sensitivity.
    indices = np.where(tpr >= desired_sensitivity)[0]
    if len(indices) == 0:
        # Desired sensitivity not achievable
        print(f"Desired sensitivity {desired_sensitivity} not achievable.")
        return np.nan  # Or 0 or some other value

    index = indices[0]  # First index where TPR >= desired_sensitivity
    threshold = thresholds[index]
    fpr_at_threshold = fpr[index]

    if fpr_at_threshold == 0:
        scanning_length = np.inf
    else:
        scanning_length = 1 / fpr_at_threshold  # Scanning length is the inverse of FPR

    return scanning_length

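To make the computation concrete with a made-up number: if the first ROC point at which recall reaches 0.5 has a false positive rate of 0.002, the reported scanning length is 1 / 0.002 = 500 bp, i.e. roughly one false positive per 500 negative positions, interpreting negatives as scanned base pairs as the plot labels do.
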
def plot_scanning_length_based_on_sensitivity(df, labels_column, methods, sensitivities, max_value=2000, dpi=300, font_size=9, title='', show_y_axis_text=True, output_json_path='output/scanning_length_results.json'):
    """
    Compute the scanning length (1/FPR) for each method at the desired sensitivities and plot it.

    Parameters:
    - df: DataFrame containing the predictions and labels
    - labels_column: The name of the column containing the true labels
    - methods: A list of column names for the methods to be plotted
    - sensitivities: A list of desired sensitivities, one per method
    - max_value: Maximum value used to cap infinite values, default is 2000 bp
    """
    labels = df[labels_column]

    # Calculate scanning lengths for each method
    lengths = []
    result = {}
    for method, sensitivity in zip(methods, sensitivities):
        predictions = df[method]
        length = calculate_scanning_length_at_sensitivity(predictions, labels, sensitivity)
        if np.isinf(length) or np.isnan(length):
            length = max_value  # Replace infinite or NaN values with a large finite number
        lengths.append(length)
        result[f"{method}_{sensitivity}"] = length

    with open(output_json_path, 'w') as f:
        json.dump(result, f, indent=4)

    # Create the figure and axis for the plot
    if show_y_axis_text:
        fig, ax = plt.subplots(figsize=(8, 4), dpi=dpi)
    else:
        fig, ax = plt.subplots(figsize=(5, 3), dpi=dpi)

    # Bar positions and labels
    y_pos = np.arange(len(methods))

    # Add a small space between the y-axis and the bars
    small_offset = lengths[-1] * 0.01  # 1% of the last bar length as offset

    # Plot the bars with a left offset
    ax.barh(
        y_pos, lengths, align='center', left=small_offset,
        color=COLOR_PALETTE
    )

    # Adjust x-axis limits to accommodate the offset
    ax.set_xlim(0, max(lengths) + small_offset * 2)

    # Add text labels
    for i, v in enumerate(lengths):
        ax.text(v + small_offset, i, f"{v:.0f} bp", va='center', fontsize=font_size)

    ax.set_xticks([])
    if show_y_axis_text:
        # Tick labels are only set when ticks are shown, to avoid a tick/label count mismatch
        ax.set_yticks(y_pos)
        ax.set_yticklabels([f"{meth} (Sens={round(sens, 3)})" for meth, sens in zip(methods, sensitivities)])
    else:
        ax.set_yticks([])
    ax.set_xlabel('Average Scanning Length (1/FPR) [bp] at recall 0.5')
    ax.set_title(title)

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['bottom'].set_visible(False)

    plt.tight_layout()

def main(args):
    dataset_path = args.dataset_dir + args.dataset_name

    predictions = pd.read_csv(f'{dataset_path}.tsv', sep='\t', header=0)
    predictions = predictions.rename(args.manakov_rename_map, axis=1)
    predictions["random"] = generate_random_predictions(predictions.shape[0])

    plot_scanning_length_based_on_sensitivity(
        predictions, labels_column='label', methods=args.method_names,
        sensitivities=args.thresholds,
        show_y_axis_text=False,
        output_json_path='output/scanning_length_results.json',
    )

    title_suffix = ".avrg_fp_per_kbp_sensitivity_threshold"
    plt.savefig(f"output/{args.dataset_name}{title_suffix}.svg", format='svg')
    plt.savefig(f"output/{args.dataset_name}{title_suffix}.png", format='png')

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Average Scanning Length per Sensitivity Threshold Plot')
    parser.add_argument('--dataset-dir', type=str, default='predictions/',
                        help='Directory containing the dataset')
    parser.add_argument('--output-dir', type=str, default='output/',
                        help='Directory containing the figures and intermediate results')
    parser.add_argument('--dataset-name', type=str,
                        default='AGO2_eCLIP_Manakov2022_100_CNN_predictions',
                        help='Name of the dataset file')
    parser.add_argument('--thresholds', type=float, nargs='+',
                        default=[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5],
                        help='Thresholds for scanning length plots')

I think we should enforce the same sensitivity for each method. Also, the name of the argument is confusing.

    parser.add_argument('--manakov-rename-map', type=lambda x: eval(x), default={
        'CNN_Manakov_full': 'Manakov_2,524,246',
        "CNN_Manakov_subset_200": 'Manakov_subset_200',
        "CNN_Manakov_subset_2k": 'Manakov_subset_2k',
        "CNN_Manakov_subset_7720": 'Manakov_subset_7720',
        "CNN_Manakov_subset_20k": 'Manakov_subset_20k',
        "CNN_Manakov_subset_200k": 'Manakov_subset_200k',
    }, help='Mapping for renaming Manakov dataset columns')
    parser.add_argument('--method-names', type=lambda x: eval(x), default=[
        "random", "Manakov_subset_200", "Manakov_subset_2k", "Manakov_subset_7720",
        "Manakov_subset_20k", "Manakov_subset_200k", "Manakov_2,524,246"
    ], help='List of method names for plotting')
    args = parser.parse_args()
    main(args)
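A sketch of the single-sensitivity variant the comment above asks for (the `--sensitivity` flag name is illustrative, not part of the PR):

import argparse

parser = argparse.ArgumentParser(description='Average Scanning Length per Sensitivity Threshold Plot')
parser.add_argument('--sensitivity', type=float, default=0.5,
                    help='Recall (sensitivity) threshold applied to every method')
args = parser.parse_args()

# The single value is then reused for every method:
# sensitivities = [args.sensitivity] * len(method_names)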
Ideally dirs, fileID and name given to file are passed as arguments not hardcoded.
@@ -0,0 +1,20 @@
#!/bin/bash

Could you incorporate this script into https://github.com/BioGeMT/miRBench_paper/blob/main/code/plots/process_datasets_for_plots.py for consistency?

# Install gdown if not already installed
if ! command -v gdown &> /dev/null
then
    echo "gdown not found. Installing gdown..."
    pip install gdown
fi

download_dir="predictions"
plot_output_dir="output"

I think this

FILE_ID_1="15gQ4WShxUZjEjSwppc9cukfusanpbl6n"

mkdir -p $download_dir
mkdir -p $plot_output_dir

echo "Downloading AGO2_eCLIP_Manakov2022_100_CNN_predictions.tsv"
gdown "https://drive.google.com/uc?id=$FILE_ID_1" -O $download_dir/AGO2_eCLIP_Manakov2022_100_CNN_predictions.tsv

echo "Download completed! .tsv files are saved in the $download_dir directory."
We do not need random predictions here so I guess this function and wherever else it is called is extra, right? Including mentions in the md file for this script.