Merge pull request #4 from papaemmelab/refactor-main

✅ Refactor each process as module, and fix matched/unmatched mode
papaemmelab · Dec 24, 2024 · 937606c · 937606c
2 parents 1f9517c + de5b15e
commit 937606c
Show file tree

Hide file tree

Showing 60 changed files with 1,064 additions and 651 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -20,13 +20,7 @@ jobs:
       - name: Pull Docker image and cache
         run: |
           docker pull papaemmelab/purple:v0.1.1
-      - name: Run unit tests of each process for Amber, Cobalt, Purple
+          docker pull quay.io/biocontainers/hmftools-sage:3.4.4--hdfd78af_0
+      - name: Run unit tests for process module and main workflow
         run: |
-          nf-test test tests/main.runamber.nf.test
-          nf-test test tests/main.runcobalt.nf.test
-          nf-test test tests/main.bincobalt.nf.test
-          nf-test test tests/main.runpurple.nf.test
-      - name: Run pipeline end-to-end test
-        run: |
-          nf-test test tests/main.nf.test
-      
+          nf-test test --ci --coverage
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,6 @@
 # Nextflow run files
 nextflow
+nf-test
 work
 capsule
 framework
@@ -10,5 +11,7 @@ tmp
 
 # Tests
 tests/outdir/*
+tests/data/ref/ensembl_data_original
 outdir
 plugins
+slurm*.out
diff --git a/Dockerfile b/Dockerfile
@@ -1,3 +1,4 @@
+# Dockerfile used for papaemmelab/purple:v0.1.1
 FROM papaemmelab/docker-hmftools:v1.0.0
 
 # Clean up to free space

diff --git a/README.md b/README.md
@@ -3,22 +3,45 @@
 [![nf-purple CI](https://github.com/papaemmelab/nf-purple/actions/workflows/ci.yml/badge.svg)](https://github.com/papaemmelab/nf-purple/actions/workflows/ci.yml)
 [![nf-test](https://img.shields.io/badge/tested_with-nf--test-337ab7.svg)](https://github.com/askimed/nf-test)
 
-Nextflow Pipeline to run [Purple](https://github.com/hartwigmedical/hmftools/blob/master/purple/README.md#tumor-only-mode) in *Tumor-Only* mode, uses [Amber](https://github.com/hartwigmedical/hmftools/tree/master/amber#tumor-only-mode) and [Cobalt](https://github.com/hartwigmedical/hmftools/tree/master/cobalt#tumor-only-mode) from HMFTools suite, of the Hartwig Foundation.
+Nextflow pipeline to run [Purple](https://github.com/hartwigmedical/hmftools/blob/master/purple/README.md) in *Tumor-Normal matched* or *Tumor-Only* mode, using [Amber](https://github.com/hartwigmedical/hmftools/tree/master/amber) and [Cobalt](https://github.com/hartwigmedical/hmftools/tree/master/cobalt) from the HMFTools suite of the Hartwig Foundation.
 
 ## 🚀 Run Pipeline
 
 You need Nextflow installed.
 
+### Tumor-Normal matched:
+
 ```bash
 module load java/jdk-11.0.11
 
+# To run matched pipeline
 nextflow run papaemmelab/nf-purple \
     --tumor $tumor \
     --tumor_bam $TUMOR_BAM \
+    --normal $normal \
+    --normal_bam $NORMAL_BAM \
     --outdir $OUTDIR \
     ...refargs
 ```
 
+- See more info: [Purple](https://github.com/hartwigmedical/hmftools/blob/master/purple/README.md#arguments), [Amber](https://github.com/hartwigmedical/hmftools/tree/master/amber#paired-normaltumor-mode), [Cobalt](https://github.com/hartwigmedical/hmftools/tree/master/cobalt#mandatory-arguments)
+
+### Tumor only mode:
+
+```bash
+module load java/jdk-11.0.11
+
+# To run unmatched tumor-only
+nextflow run papaemmelab/nf-purple \
+    --tumor $tumor \
+    --tumor_bam $TUMOR_BAM \
+    --outdir $OUTDIR \
+    ...refargs
+```
+
+- See more info: [Purple](https://github.com/hartwigmedical/hmftools/blob/master/purple/README.md#tumor-only-mode), [Amber](https://github.com/hartwigmedical/hmftools/tree/master/amber#tumor-only-mode), [Cobalt](https://github.com/hartwigmedical/hmftools/tree/master/cobalt#tumor-only-mode)
+
+
 ## 🧬  Get Reference Data
 
 Downloaded from [Purple Ref Data](https://console.cloud.google.com/storage/browser/hmf-public/HMFtools-Resources/dna_pipeline) for genome version 37.

diff --git a/assets/NO_FILE b/assets/NO_FILE
diff --git a/bin/bin_cobalt.py b/bin/bin_cobalt.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+
+import argparse
+import shutil
+import pandas as pd
+import numpy as np
+
+parser = argparse.ArgumentParser(
+    description=(
+        "Bin cobalt probes with similar LogR values "
+        "together to decrease oversegmentation."
+    )
+)
+parser.add_argument(
+    "--in_pcf",
+    type=str,
+    required=True,
+    help="Path to the input cobalt ratio .pcf file.",
+)
+parser.add_argument(
+    "--bin_probes", 
+    type=int, 
+    required=True,
+    help="Max probe bin size."
+)
+parser.add_argument(
+    "--bin_log_r",
+    type=float,
+    required=True,
+    help="Max probe logR difference to bin."
+)
+args = parser.parse_args()
+
+cobalt_ratio_pcf = pd.read_csv(args.in_pcf, sep="\t")
+cobalt_ratio_pcf_probes = pd.DataFrame(columns=cobalt_ratio_pcf.columns)
+
+# First bin by probes
+chrom_arm = None
+last_idx = None
+for idx, seg in cobalt_ratio_pcf.iterrows():
+    if chrom_arm != "_".join(seg[["chrom", "arm"]].astype(str)):
+        chrom_arm = "_".join(seg[["chrom", "arm"]].astype(str))
+        cobalt_ratio_pcf_probes = pd.concat(
+            [cobalt_ratio_pcf_probes, seg.to_frame().T], ignore_index=True
+        )
+        last_idx = cobalt_ratio_pcf_probes.index[-1]
+        continue
+    if (
+        cobalt_ratio_pcf_probes.loc[last_idx, "n.probes"] <= args.bin_probes
+        or seg["n.probes"] <= args.bin_probes
+    ):
+        means = [
+            cobalt_ratio_pcf_probes.loc[last_idx, "mean"]
+        ] * cobalt_ratio_pcf_probes.loc[last_idx, "n.probes"]
+        means.extend([seg["mean"]] * seg["n.probes"])
+        cobalt_ratio_pcf_probes.loc[last_idx, "mean"] = np.mean(means)
+        cobalt_ratio_pcf_probes.loc[last_idx, "n.probes"] += seg["n.probes"]
+        cobalt_ratio_pcf_probes.loc[last_idx, "end.pos"] = seg["end.pos"]
+    else:
+        cobalt_ratio_pcf_probes = pd.concat(
+            [cobalt_ratio_pcf_probes, seg.to_frame().T], ignore_index=True
+        )
+        last_idx = cobalt_ratio_pcf_probes.index[-1]
+
+# Then bin by logR mean
+cobalt_ratio_pcf_probes = cobalt_ratio_pcf_probes.reset_index().drop(columns="index")
+cobalt_ratio_pcf_probes_logR = pd.DataFrame(columns=cobalt_ratio_pcf_probes.columns)
+chrom_arm = None
+for idx, seg in cobalt_ratio_pcf_probes.iterrows():
+    if chrom_arm != "_".join(seg[["chrom", "arm"]].astype(str)):
+        chrom_arm = "_".join(seg[["chrom", "arm"]].astype(str))
+        cobalt_ratio_pcf_probes_logR = pd.concat(
+            [cobalt_ratio_pcf_probes_logR, seg.to_frame().T], ignore_index=True
+        )
+        last_idx = cobalt_ratio_pcf_probes_logR.index[-1]
+        continue
+    if (
+        abs(cobalt_ratio_pcf_probes.loc[last_idx, "mean"] - seg["mean"])
+        <= args.bin_log_r
+    ):
+        means = [
+            cobalt_ratio_pcf_probes_logR.loc[last_idx, "mean"]
+        ] * cobalt_ratio_pcf_probes_logR.loc[last_idx, "n.probes"]
+        means.extend([seg["mean"]] * seg["n.probes"])
+        cobalt_ratio_pcf_probes_logR.loc[last_idx, "mean"] = np.mean(means)
+        cobalt_ratio_pcf_probes_logR.loc[last_idx, "n.probes"] += seg["n.probes"]
+        cobalt_ratio_pcf_probes_logR.loc[last_idx, "end.pos"] = seg["end.pos"]
+    else:
+        cobalt_ratio_pcf_probes_logR = pd.concat(
+            [cobalt_ratio_pcf_probes_logR, seg.to_frame().T], ignore_index=True
+        )
+        last_idx = cobalt_ratio_pcf_probes_logR.index[-1]
+
+# store input with another name to replace original
+shutil.move(args.in_pcf, args.in_pcf.replace(".pcf", ".original.pcf"))
+cobalt_ratio_pcf_probes_logR.to_csv(args.in_pcf, sep="\t", index=False)