Skip to content

Commit

Permalink
Merge pull request #3 from papaemmelab/binning-mod
Browse files Browse the repository at this point in the history
🛠️ Modification to bin cobalt outputs and add purity forcing
  • Loading branch information
ddomenico authored Aug 13, 2024
2 parents 73822d5 + 4d5f1ef commit 1f9517c
Show file tree
Hide file tree
Showing 12 changed files with 235 additions and 29 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,12 @@ jobs:
sudo mv nf-test /usr/local/bin/
- name: Pull Docker image and cache
run: |
docker pull papaemmelab/purple:v0.1.0
docker pull papaemmelab/purple:v0.1.1
- name: Run unit tests of each process for Amber, Cobalt, Purple
run: |
nf-test test tests/main.runamber.nf.test
nf-test test tests/main.runcobalt.nf.test
nf-test test tests/main.bincobalt.nf.test
nf-test test tests/main.runpurple.nf.test
- name: Run pipeline end-to-end test
run: |
Expand Down
14 changes: 13 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,15 @@
FROM papaemmelab/docker-hmftools:v1.0.0

# Clean up to free space
RUN apt-get clean

# Install dependencies
RUN apt-get update && apt-get install -y tar curl python3
RUN apt-get update && apt-get install -y \
tar \
curl \
python3 \
python3-pip \
python3-venv

# Install Google Cloud SDK
RUN curl -sSL https://sdk.cloud.google.com | bash
Expand All @@ -20,6 +28,10 @@ RUN \
tar -xzf /data/${REF_DIR}.tar.gz -C /data && \
rm /data/${REF_DIR}.tar.gz

# Link python3 to python and install packages
RUN ln -s /usr/bin/python3 /usr/bin/python && \
pip install --no-cache-dir numpy pandas

COPY . /app
WORKDIR /app

Expand Down
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@ More information on their [docs](https://github.com/hartwigmedical/hmftools/blob
sh tests/run_test.sh
```

To check tests and update snapshots run:
```bash
nf-test test --update-snapshot
```

### Docker

[Purple Docker image](https://hub.docker.com/r/papaemmelab/purple) was built for several platforms using [docker buildx](https://docs.docker.com/buildx/working-with-buildx/).
Expand Down
160 changes: 137 additions & 23 deletions main.nf
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
params.cores = 4
params.cores = 1
params.memory = '4 GB'

// Params Defaults in juno
params.refGenome = "/work/isabl/ref/homo_sapiens/GRCh37d5/gr37.fasta"
Expand All @@ -8,17 +9,26 @@ params.loci = "/data/copy_number/GermlineHetPon.37.vcf.gz"
params.gcProfile = "/data/copy_number/GC_profile.1000bp.37.cnp"
params.ensemblDataDir = "/data/common/ensembl_data"
params.diploidRegions = "/data/copy_number/DiploidRegions.37.bed.gz"
params.binProbes = 0
params.binLogR = 0
params.minPurity = 0.08
params.maxPurity = 1.0


log.info """\
HMFTOOLS - PURPLE
========================================
Params:
----------------------------------------
tumor : ${params.tumor}
tumorBam : ${params.tumorBam}
outdir : ${params.outdir}
cores : ${params.cores}
tumor : ${params.tumor}
tumorBam : ${params.tumorBam}
outdir : ${params.outdir}
cores : ${params.cores}
memory : ${params.memory}
binProbes : ${params.binProbes}
binLogR : ${params.binLogR}
minPurity : ${params.minPurity}
maxPurity : ${params.maxPurity}
========================================
Workflow:
----------------------------------------
Expand All @@ -32,7 +42,7 @@ process runAmber {
tag "AMBER on ${params.tumor}"
publishDir "${params.outdir}/amber", mode: 'copy'
cpus params.cores
memory '32 GB'
memory params.memory
time '1h'

input:
Expand All @@ -46,21 +56,30 @@ process runAmber {

script:
"""
amber \
-tumor ${tumor} \
-tumor_bam ${tumorBam} \
-output_dir \$PWD \
-threads ${params.cores} \
-loci ${params.loci} \
-ref_genome_version ${params.genomeVersion}
if [ -f "${params.outdir}/amber/${tumor}.amber.baf.tsv.gz" ] && \
[ -f "${params.outdir}/amber/${tumor}.amber.baf.pcf" ] && \
[ -f "${params.outdir}/amber/${tumor}.amber.qc" ]; then
echo "Output files already exist. Skipping amber execution."
ln -s ${params.outdir}/amber/${tumor}.amber.baf.tsv.gz ${tumor}.amber.baf.tsv.gz
ln -s ${params.outdir}/amber/${tumor}.amber.baf.pcf ${tumor}.amber.baf.pcf
ln -s ${params.outdir}/amber/${tumor}.amber.qc ${tumor}.amber.qc
else
amber \
-tumor ${tumor} \
-tumor_bam ${tumorBam} \
-output_dir \$PWD \
-threads ${params.cores} \
-loci ${params.loci} \
-ref_genome_version ${params.genomeVersion}
fi
""".stripIndent()
}

process runCobalt {
tag "COBALT on ${params.tumor}"
publishDir "${params.outdir}/cobalt", mode: 'copy'
cpus params.cores
memory '32 GB'
memory params.memory
time '1h'

input:
Expand All @@ -73,21 +92,102 @@ process runCobalt {

script:
"""
cobalt \
if [ -f "${params.outdir}/cobalt/${tumor}.cobalt.ratio.tsv.gz" ] && \
[ -f "${params.outdir}/cobalt/${tumor}.cobalt.ratio.pcf" ]; then
echo "Output files already exist. Skipping cobalt execution."
ln -s ${params.outdir}/cobalt/${tumor}.cobalt.ratio.tsv.gz ${tumor}.cobalt.ratio.tsv.gz
ln -s ${params.outdir}/cobalt/${tumor}.cobalt.ratio.pcf ${tumor}.cobalt.ratio.pcf
else
cobalt \
-tumor ${tumor} \
-tumor_bam ${tumorBam} \
-output_dir \$PWD \
-threads ${params.cores} \
-gc_profile ${params.gcProfile} \
-tumor_only_diploid_bed ${params.diploidRegions}
fi
""".stripIndent()
}

process binCobalt {
tag "COBALT BIN on ${params.tumor}"
publishDir "${params.outdir}/cobalt/binned_${params.binProbes}_probes_${params.binLogR}_LogR", mode: 'copy'
cpus params.cores
memory params.memory
time '1h'

input:
val tumor
val binProbes
val binLogR
path cobalt_ratio_pcf
path cobalt_ratio_tsv

output:
path "${tumor}.cobalt.ratio.tsv.gz", emit: cobalt_ratio_tsv
path "${tumor}.cobalt.ratio.pcf", emit: cobalt_ratio_pcf

script:
"""
#!/usr/bin/env python
import pandas as pd
import numpy as np
cobalt_ratio_pcf = pd.read_csv('${cobalt_ratio_pcf}', sep='\\t')
cobalt_ratio_pcf_probes = pd.DataFrame(columns=cobalt_ratio_pcf.columns)
# First bin by probes
chrom_arm = None
last_idx = None
for idx, seg in cobalt_ratio_pcf.iterrows():
if chrom_arm != '_'.join(seg[['chrom','arm']].astype(str)):
chrom_arm = '_'.join(seg[['chrom','arm']].astype(str))
cobalt_ratio_pcf_probes = pd.concat([cobalt_ratio_pcf_probes, seg.to_frame().T], ignore_index=True)
last_idx = cobalt_ratio_pcf_probes.index[-1]
continue
if (
cobalt_ratio_pcf_probes.loc[last_idx, 'n.probes'] <= ${binProbes}
) or (
seg['n.probes'] <= ${binProbes}
):
means = [cobalt_ratio_pcf_probes.loc[last_idx, 'mean']] * cobalt_ratio_pcf_probes.loc[last_idx, 'n.probes']
means.extend([seg['mean']] * seg['n.probes'])
cobalt_ratio_pcf_probes.loc[last_idx, 'mean'] = np.mean(means)
cobalt_ratio_pcf_probes.loc[last_idx, 'n.probes'] += seg['n.probes']
cobalt_ratio_pcf_probes.loc[last_idx, 'end.pos'] = seg['end.pos']
else:
cobalt_ratio_pcf_probes = pd.concat([cobalt_ratio_pcf_probes, seg.to_frame().T], ignore_index=True)
last_idx = cobalt_ratio_pcf_probes.index[-1]
# Then bin by logR mean
cobalt_ratio_pcf_probes = cobalt_ratio_pcf_probes.reset_index().drop(columns="index")
cobalt_ratio_pcf_probes_logR = pd.DataFrame(columns=cobalt_ratio_pcf_probes.columns)
chrom_arm = None
for idx, seg in cobalt_ratio_pcf_probes.iterrows():
if chrom_arm != '_'.join(seg[['chrom','arm']].astype(str)):
chrom_arm = '_'.join(seg[['chrom','arm']].astype(str))
cobalt_ratio_pcf_probes_logR = pd.concat([cobalt_ratio_pcf_probes_logR, seg.to_frame().T], ignore_index=True)
last_idx = cobalt_ratio_pcf_probes_logR.index[-1]
continue
if abs(cobalt_ratio_pcf_probes.loc[last_idx, 'mean'] - seg['mean']) <= ${binLogR}:
means = [cobalt_ratio_pcf_probes_logR.loc[last_idx, 'mean']] * cobalt_ratio_pcf_probes_logR.loc[last_idx, 'n.probes']
means.extend([seg['mean']] * seg['n.probes'])
cobalt_ratio_pcf_probes_logR.loc[last_idx, 'mean'] = np.mean(means)
cobalt_ratio_pcf_probes_logR.loc[last_idx, 'n.probes'] += seg['n.probes']
cobalt_ratio_pcf_probes_logR.loc[last_idx, 'end.pos'] = seg['end.pos']
else:
cobalt_ratio_pcf_probes_logR = pd.concat([cobalt_ratio_pcf_probes_logR, seg.to_frame().T], ignore_index=True)
last_idx = cobalt_ratio_pcf_probes_logR.index[-1]
cobalt_ratio_pcf_probes_logR.to_csv("${tumor}.cobalt.ratio.pcf", sep='\\t', index=False)
""".stripIndent()
}

process runPurple {
tag "PURPLE on ${params.tumor}"
publishDir "${params.outdir}/purple", mode: 'copy'
publishDir "${params.outdir}/purple/purple_${params.minPurity}_${params.maxPurity}", mode: 'copy'
cpus params.cores
memory '32 GB'
memory params.memory
time '1h'

input:
Expand All @@ -97,6 +197,7 @@ process runPurple {
path amber_qc
path cobalt_ratio_tsv
path cobalt_ratio_pcf
path cobalt_path

output:
path "${tumor}.purple.purity.tsv", emit: purple_purity_tsv
Expand All @@ -118,21 +219,34 @@ process runPurple {
purple \
-tumor ${tumor} \
-amber ${params.outdir}/amber \
-cobalt ${params.outdir}/cobalt \
-cobalt ${cobalt_path} \
-output_dir \$PWD \
-gc_profile ${params.gcProfile} \
-ref_genome ${params.refGenome} \
-ref_genome_version ${params.genomeVersion} \
-ensembl_data_dir ${params.ensemblDataDir} \
-circos ${params.circos}
-circos ${params.circos} \
-min_purity ${params.minPurity} \
-max_purity ${params.maxPurity}
rsync -a --no-links \$PWD/ ${params.outdir}/purple/
""".stripIndent()
}

workflow {
tumor = Channel.value(params.tumor)
tumorBam = Channel.fromPath(params.tumorBam)

runAmber(tumor, tumorBam)
runCobalt(tumor, tumorBam)
runPurple(tumor, runAmber.out, runCobalt.out)
binProbes = Channel.value(params.binProbes)
binLogR = Channel.value(params.binLogR)

amberOutput = runAmber(tumor, tumorBam)
cobaltOutput = runCobalt(tumor, tumorBam)

if (binProbes != 0 || binLogR != 0) {
binCobaltOutput = binCobalt(tumor, binProbes, binLogR, cobaltOutput.cobalt_ratio_pcf, cobaltOutput.cobalt_ratio_tsv)
runPurple(tumor, amberOutput, binCobaltOutput, "${params.outdir}/cobalt/binned_${params.binProbes}_probes_${params.binLogR}_LogR")
} else {
runPurple(tumor, amberOutput, cobaltOutput, "${params.outdir}/cobalt")
}
}

2 changes: 1 addition & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ profiles {
}
process {
executor = 'local'
container = 'papaemmelab/purple:v0.1.0'
container = 'papaemmelab/purple:v0.1.1'
}
}
}
Expand Down
3 changes: 3 additions & 0 deletions tests/data/TEST.cobalt.ratio.pcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
sampleID chrom arm start.pos end.pos n.probes mean
S1 2 p 14001 397001 340 -7.24
S1 17 p 1 397001 341 -8.813389396729216
Binary file added tests/data/TEST.cobalt.ratio.tsv.gz
Binary file not shown.
41 changes: 41 additions & 0 deletions tests/main.bincobalt.nf.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
nextflow_process {

name "Test Process binCobalt"
script "main.nf"
process "binCobalt"

test("Should bin Cobalt without failures") {

when {
params {
tumor = "TEST"
binProbes = 100
binLogR = 0.5
cobalt_ratio_pcf = "${projectDir}/tests/data/TEST.cobalt.ratio.pcf"
cobalt_ratio_tsv = "${projectDir}/tests/data/TEST.cobalt.ratio.tsv.gz"
}
process {
"""
input[0] = Channel.value(params.tumor)
input[1] = Channel.value(params.binProbes)
input[2] = Channel.value(params.binLogR)
input[3] = Channel.fromPath(params.cobalt_ratio_pcf)
input[4] = Channel.fromPath(params.cobalt_ratio_tsv)
"""
}
}

then {
assert process.success
assert snapshot(process.out).match()

// check expected files
with(process.out) {
assert cobalt_ratio_tsv.size() == 1
assert cobalt_ratio_pcf.size() == 1
}
}

}

}
25 changes: 25 additions & 0 deletions tests/main.bincobalt.nf.test.snap
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"Should bin Cobalt without failures": {
"content": [
{
"0": [
"TEST.cobalt.ratio.tsv.gz:md5,87cf8451b04e4373fc4dd7ccda3e6afe"
],
"1": [
"TEST.cobalt.ratio.pcf:md5,cd95f6b56cecfd80461bf1bd13c7a9fd"
],
"cobalt_ratio_pcf": [
"TEST.cobalt.ratio.pcf:md5,cd95f6b56cecfd80461bf1bd13c7a9fd"
],
"cobalt_ratio_tsv": [
"TEST.cobalt.ratio.tsv.gz:md5,87cf8451b04e4373fc4dd7ccda3e6afe"
]
}
],
"meta": {
"nf-test": "0.9.0",
"nextflow": "24.04.4"
},
"timestamp": "2024-08-07T11:15:21.745138"
}
}
4 changes: 2 additions & 2 deletions tests/main.nf.test
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ nextflow_pipeline {
with(workflow) {
assert failed
assert exitStatus == 1
assert trace.tasks().size() == 3
assert workflow.trace.succeeded().size() == 2
assert trace.tasks().size() == 4
assert workflow.trace.succeeded().size() == 3
assert workflow.trace.failed().size() == 1
}
}
Expand Down
Loading

0 comments on commit 1f9517c

Please sign in to comment.