diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 713c250..2f5dcce 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -19,11 +19,12 @@ jobs: sudo mv nf-test /usr/local/bin/ - name: Pull Docker image and cache run: | - docker pull papaemmelab/purple:v0.1.0 + docker pull papaemmelab/purple:v0.1.1 - name: Run unit tests of each process for Amber, Cobalt, Purple run: | nf-test test tests/main.runamber.nf.test nf-test test tests/main.runcobalt.nf.test + nf-test test tests/main.bincobalt.nf.test nf-test test tests/main.runpurple.nf.test - name: Run pipeline end-to-end test run: | diff --git a/Dockerfile b/Dockerfile index 30e77be..fa277a3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,15 @@ FROM papaemmelab/docker-hmftools:v1.0.0 +# Clean up to free space +RUN apt-get clean + # Install dependencies -RUN apt-get update && apt-get install -y tar curl python3 +RUN apt-get update && apt-get install -y \ + tar \ + curl \ + python3 \ + python3-pip \ + python3-venv # Install Google Cloud SDK RUN curl -sSL https://sdk.cloud.google.com | bash @@ -20,6 +28,10 @@ RUN \ tar -xzf /data/${REF_DIR}.tar.gz -C /data && \ rm /data/${REF_DIR}.tar.gz +# Link python3 to python and install packages +RUN ln -s /usr/bin/python3 /usr/bin/python && \ + pip install --no-cache-dir numpy pandas + COPY . /app WORKDIR /app diff --git a/README.md b/README.md index c8489bc..e871442 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,11 @@ More information on their [docs](https://github.com/hartwigmedical/hmftools/blob sh tests/run_test.sh ``` +To check tests and update snapshots run: +```bash +nf-test test --update-snapshot +``` + ### Docker [Purple Docker image](https://hub.docker.com/r/papaemmelab/purple) was built for several platforms using [docker buildx](https://docs.docker.com/buildx/working-with-buildx/). diff --git a/main.nf b/main.nf index 598e9a9..81ae32f 100644 --- a/main.nf +++ b/main.nf @@ -1,4 +1,5 @@ -params.cores = 4 +params.cores = 1 +params.memory = '4 GB' // Params Defaults in juno params.refGenome = "/work/isabl/ref/homo_sapiens/GRCh37d5/gr37.fasta" @@ -8,6 +9,10 @@ params.loci = "/data/copy_number/GermlineHetPon.37.vcf.gz" params.gcProfile = "/data/copy_number/GC_profile.1000bp.37.cnp" params.ensemblDataDir = "/data/common/ensembl_data" params.diploidRegions = "/data/copy_number/DiploidRegions.37.bed.gz" +params.binProbes = 0 +params.binLogR = 0 +params.minPurity = 0.08 +params.maxPurity = 1.0 log.info """\ @@ -15,10 +20,15 @@ log.info """\ ======================================== Params: ---------------------------------------- - tumor : ${params.tumor} - tumorBam : ${params.tumorBam} - outdir : ${params.outdir} - cores : ${params.cores} + tumor : ${params.tumor} + tumorBam : ${params.tumorBam} + outdir : ${params.outdir} + cores : ${params.cores} + memory : ${params.memory} + binProbes : ${params.binProbes} + binLogR : ${params.binLogR} + minPurity : ${params.minPurity} + maxPurity : ${params.maxPurity} ======================================== Workflow: ---------------------------------------- @@ -32,7 +42,7 @@ process runAmber { tag "AMBER on ${params.tumor}" publishDir "${params.outdir}/amber", mode: 'copy' cpus params.cores - memory '32 GB' + memory params.memory time '1h' input: @@ -46,13 +56,22 @@ process runAmber { script: """ - amber \ - -tumor ${tumor} \ - -tumor_bam ${tumorBam} \ - -output_dir \$PWD \ - -threads ${params.cores} \ - -loci ${params.loci} \ - -ref_genome_version ${params.genomeVersion} + if [ -f "${params.outdir}/amber/${tumor}.amber.baf.tsv.gz" ] && \ + [ -f "${params.outdir}/amber/${tumor}.amber.baf.pcf" ] && \ + [ -f "${params.outdir}/amber/${tumor}.amber.qc" ]; then + echo "Output files already exist. Skipping amber execution." + ln -s ${params.outdir}/amber/${tumor}.amber.baf.tsv.gz ${tumor}.amber.baf.tsv.gz + ln -s ${params.outdir}/amber/${tumor}.amber.baf.pcf ${tumor}.amber.baf.pcf + ln -s ${params.outdir}/amber/${tumor}.amber.qc ${tumor}.amber.qc + else + amber \ + -tumor ${tumor} \ + -tumor_bam ${tumorBam} \ + -output_dir \$PWD \ + -threads ${params.cores} \ + -loci ${params.loci} \ + -ref_genome_version ${params.genomeVersion} + fi """.stripIndent() } @@ -60,7 +79,7 @@ process runCobalt { tag "COBALT on ${params.tumor}" publishDir "${params.outdir}/cobalt", mode: 'copy' cpus params.cores - memory '32 GB' + memory params.memory time '1h' input: @@ -73,21 +92,102 @@ process runCobalt { script: """ - cobalt \ + if [ -f "${params.outdir}/cobalt/${tumor}.cobalt.ratio.tsv.gz" ] && \ + [ -f "${params.outdir}/cobalt/${tumor}.cobalt.ratio.pcf" ]; then + echo "Output files already exist. Skipping cobalt execution." + ln -s ${params.outdir}/cobalt/${tumor}.cobalt.ratio.tsv.gz ${tumor}.cobalt.ratio.tsv.gz + ln -s ${params.outdir}/cobalt/${tumor}.cobalt.ratio.pcf ${tumor}.cobalt.ratio.pcf + else + cobalt \ -tumor ${tumor} \ -tumor_bam ${tumorBam} \ -output_dir \$PWD \ -threads ${params.cores} \ -gc_profile ${params.gcProfile} \ -tumor_only_diploid_bed ${params.diploidRegions} + fi + """.stripIndent() +} + +process binCobalt { + tag "COBALT BIN on ${params.tumor}" + publishDir "${params.outdir}/cobalt/binned_${params.binProbes}_probes_${params.binLogR}_LogR", mode: 'copy' + cpus params.cores + memory params.memory + time '1h' + + input: + val tumor + val binProbes + val binLogR + path cobalt_ratio_pcf + path cobalt_ratio_tsv + + output: + path "${tumor}.cobalt.ratio.tsv.gz", emit: cobalt_ratio_tsv + path "${tumor}.cobalt.ratio.pcf", emit: cobalt_ratio_pcf + + script: + """ + #!/usr/bin/env python + import pandas as pd + import numpy as np + + cobalt_ratio_pcf = pd.read_csv('${cobalt_ratio_pcf}', sep='\\t') + cobalt_ratio_pcf_probes = pd.DataFrame(columns=cobalt_ratio_pcf.columns) + + # First bin by probes + chrom_arm = None + last_idx = None + for idx, seg in cobalt_ratio_pcf.iterrows(): + if chrom_arm != '_'.join(seg[['chrom','arm']].astype(str)): + chrom_arm = '_'.join(seg[['chrom','arm']].astype(str)) + cobalt_ratio_pcf_probes = pd.concat([cobalt_ratio_pcf_probes, seg.to_frame().T], ignore_index=True) + last_idx = cobalt_ratio_pcf_probes.index[-1] + continue + if ( + cobalt_ratio_pcf_probes.loc[last_idx, 'n.probes'] <= ${binProbes} + ) or ( + seg['n.probes'] <= ${binProbes} + ): + means = [cobalt_ratio_pcf_probes.loc[last_idx, 'mean']] * cobalt_ratio_pcf_probes.loc[last_idx, 'n.probes'] + means.extend([seg['mean']] * seg['n.probes']) + cobalt_ratio_pcf_probes.loc[last_idx, 'mean'] = np.mean(means) + cobalt_ratio_pcf_probes.loc[last_idx, 'n.probes'] += seg['n.probes'] + cobalt_ratio_pcf_probes.loc[last_idx, 'end.pos'] = seg['end.pos'] + else: + cobalt_ratio_pcf_probes = pd.concat([cobalt_ratio_pcf_probes, seg.to_frame().T], ignore_index=True) + last_idx = cobalt_ratio_pcf_probes.index[-1] + + # Then bin by logR mean + cobalt_ratio_pcf_probes = cobalt_ratio_pcf_probes.reset_index().drop(columns="index") + cobalt_ratio_pcf_probes_logR = pd.DataFrame(columns=cobalt_ratio_pcf_probes.columns) + chrom_arm = None + for idx, seg in cobalt_ratio_pcf_probes.iterrows(): + if chrom_arm != '_'.join(seg[['chrom','arm']].astype(str)): + chrom_arm = '_'.join(seg[['chrom','arm']].astype(str)) + cobalt_ratio_pcf_probes_logR = pd.concat([cobalt_ratio_pcf_probes_logR, seg.to_frame().T], ignore_index=True) + last_idx = cobalt_ratio_pcf_probes_logR.index[-1] + continue + if abs(cobalt_ratio_pcf_probes.loc[last_idx, 'mean'] - seg['mean']) <= ${binLogR}: + means = [cobalt_ratio_pcf_probes_logR.loc[last_idx, 'mean']] * cobalt_ratio_pcf_probes_logR.loc[last_idx, 'n.probes'] + means.extend([seg['mean']] * seg['n.probes']) + cobalt_ratio_pcf_probes_logR.loc[last_idx, 'mean'] = np.mean(means) + cobalt_ratio_pcf_probes_logR.loc[last_idx, 'n.probes'] += seg['n.probes'] + cobalt_ratio_pcf_probes_logR.loc[last_idx, 'end.pos'] = seg['end.pos'] + else: + cobalt_ratio_pcf_probes_logR = pd.concat([cobalt_ratio_pcf_probes_logR, seg.to_frame().T], ignore_index=True) + last_idx = cobalt_ratio_pcf_probes_logR.index[-1] + + cobalt_ratio_pcf_probes_logR.to_csv("${tumor}.cobalt.ratio.pcf", sep='\\t', index=False) """.stripIndent() } process runPurple { tag "PURPLE on ${params.tumor}" - publishDir "${params.outdir}/purple", mode: 'copy' + publishDir "${params.outdir}/purple/purple_${params.minPurity}_${params.maxPurity}", mode: 'copy' cpus params.cores - memory '32 GB' + memory params.memory time '1h' input: @@ -97,6 +197,7 @@ process runPurple { path amber_qc path cobalt_ratio_tsv path cobalt_ratio_pcf + path cobalt_path output: path "${tumor}.purple.purity.tsv", emit: purple_purity_tsv @@ -118,21 +219,34 @@ process runPurple { purple \ -tumor ${tumor} \ -amber ${params.outdir}/amber \ - -cobalt ${params.outdir}/cobalt \ + -cobalt ${cobalt_path} \ -output_dir \$PWD \ -gc_profile ${params.gcProfile} \ -ref_genome ${params.refGenome} \ -ref_genome_version ${params.genomeVersion} \ -ensembl_data_dir ${params.ensemblDataDir} \ - -circos ${params.circos} + -circos ${params.circos} \ + -min_purity ${params.minPurity} \ + -max_purity ${params.maxPurity} + + rsync -a --no-links \$PWD/ ${params.outdir}/purple/ """.stripIndent() } workflow { tumor = Channel.value(params.tumor) tumorBam = Channel.fromPath(params.tumorBam) - - runAmber(tumor, tumorBam) - runCobalt(tumor, tumorBam) - runPurple(tumor, runAmber.out, runCobalt.out) + binProbes = Channel.value(params.binProbes) + binLogR = Channel.value(params.binLogR) + + amberOutput = runAmber(tumor, tumorBam) + cobaltOutput = runCobalt(tumor, tumorBam) + + if (binProbes != 0 || binLogR != 0) { + binCobaltOutput = binCobalt(tumor, binProbes, binLogR, cobaltOutput.cobalt_ratio_pcf, cobaltOutput.cobalt_ratio_tsv) + runPurple(tumor, amberOutput, binCobaltOutput, "${params.outdir}/cobalt/binned_${params.binProbes}_probes_${params.binLogR}_LogR") + } else { + runPurple(tumor, amberOutput, cobaltOutput, "${params.outdir}/cobalt") + } } + diff --git a/nextflow.config b/nextflow.config index 93b8bf5..c50c302 100644 --- a/nextflow.config +++ b/nextflow.config @@ -48,7 +48,7 @@ profiles { } process { executor = 'local' - container = 'papaemmelab/purple:v0.1.0' + container = 'papaemmelab/purple:v0.1.1' } } } diff --git a/tests/data/TEST.cobalt.ratio.pcf b/tests/data/TEST.cobalt.ratio.pcf new file mode 100644 index 0000000..f07a0b7 --- /dev/null +++ b/tests/data/TEST.cobalt.ratio.pcf @@ -0,0 +1,3 @@ +sampleID chrom arm start.pos end.pos n.probes mean +S1 2 p 14001 397001 340 -7.24 +S1 17 p 1 397001 341 -8.813389396729216 diff --git a/tests/data/TEST.cobalt.ratio.tsv.gz b/tests/data/TEST.cobalt.ratio.tsv.gz new file mode 100644 index 0000000..9e867b9 Binary files /dev/null and b/tests/data/TEST.cobalt.ratio.tsv.gz differ diff --git a/tests/main.bincobalt.nf.test b/tests/main.bincobalt.nf.test new file mode 100644 index 0000000..8bc09b7 --- /dev/null +++ b/tests/main.bincobalt.nf.test @@ -0,0 +1,41 @@ +nextflow_process { + + name "Test Process binCobalt" + script "main.nf" + process "binCobalt" + + test("Should bin Cobalt without failures") { + + when { + params { + tumor = "TEST" + binProbes = 100 + binLogR = 0.5 + cobalt_ratio_pcf = "${projectDir}/tests/data/TEST.cobalt.ratio.pcf" + cobalt_ratio_tsv = "${projectDir}/tests/data/TEST.cobalt.ratio.tsv.gz" + } + process { + """ + input[0] = Channel.value(params.tumor) + input[1] = Channel.value(params.binProbes) + input[2] = Channel.value(params.binLogR) + input[3] = Channel.fromPath(params.cobalt_ratio_pcf) + input[4] = Channel.fromPath(params.cobalt_ratio_tsv) + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + + // check expected files + with(process.out) { + assert cobalt_ratio_tsv.size() == 1 + assert cobalt_ratio_pcf.size() == 1 + } + } + + } + +} diff --git a/tests/main.bincobalt.nf.test.snap b/tests/main.bincobalt.nf.test.snap new file mode 100644 index 0000000..a6aec7d --- /dev/null +++ b/tests/main.bincobalt.nf.test.snap @@ -0,0 +1,25 @@ +{ + "Should bin Cobalt without failures": { + "content": [ + { + "0": [ + "TEST.cobalt.ratio.tsv.gz:md5,87cf8451b04e4373fc4dd7ccda3e6afe" + ], + "1": [ + "TEST.cobalt.ratio.pcf:md5,cd95f6b56cecfd80461bf1bd13c7a9fd" + ], + "cobalt_ratio_pcf": [ + "TEST.cobalt.ratio.pcf:md5,cd95f6b56cecfd80461bf1bd13c7a9fd" + ], + "cobalt_ratio_tsv": [ + "TEST.cobalt.ratio.tsv.gz:md5,87cf8451b04e4373fc4dd7ccda3e6afe" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-08-07T11:15:21.745138" + } +} \ No newline at end of file diff --git a/tests/main.nf.test b/tests/main.nf.test index 27f968c..3974b5b 100644 --- a/tests/main.nf.test +++ b/tests/main.nf.test @@ -16,8 +16,8 @@ nextflow_pipeline { with(workflow) { assert failed assert exitStatus == 1 - assert trace.tasks().size() == 3 - assert workflow.trace.succeeded().size() == 2 + assert trace.tasks().size() == 4 + assert workflow.trace.succeeded().size() == 3 assert workflow.trace.failed().size() == 1 } } diff --git a/tests/main.runcobalt.nf.test.snap b/tests/main.runcobalt.nf.test.snap index 9542102..ebd916f 100644 --- a/tests/main.runcobalt.nf.test.snap +++ b/tests/main.runcobalt.nf.test.snap @@ -16,6 +16,10 @@ ] } ], - "timestamp": "2024-01-23T10:50:58.292541" + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-08-07T14:43:04.879169" } } \ No newline at end of file diff --git a/tests/main.runpurple.nf.test b/tests/main.runpurple.nf.test index ed0271f..63d11f1 100644 --- a/tests/main.runpurple.nf.test +++ b/tests/main.runpurple.nf.test @@ -40,6 +40,7 @@ nextflow_process { input[0] = Channel.value(params.tumor) input[1] = runAmber.out input[2] = runCobalt.out + input[3] = "${params.outdir}/cobalt" """ } }