diff --git a/.github/workflows/build-hic-breakfinder-dockerfile.yml b/.github/workflows/build-hic-breakfinder-dockerfile.yml new file mode 100644 index 0000000..d384565 --- /dev/null +++ b/.github/workflows/build-hic-breakfinder-dockerfile.yml @@ -0,0 +1,29 @@ +# Copyright (c) 2023 Roberto Rossini (roberros@uio.no) +# SPDX-License-Identifier: MIT + +name: Build hic_breakfinder Dockerfile + +on: + push: + branches: [ main ] + paths: + - ".github/workflows/build-hic-breakfinder*dockerfile.yml" + - ".github/workflows/build-dockerfile.yml" + - "containers/hic-breakfinder*.Dockerfile" + pull_request: + branches: [ main ] + paths: + - ".github/workflows/build-hic-breakfinder*dockerfile.yml" + - ".github/workflows/build-dockerfile.yml" + - "containers/hic-breakfinder*.Dockerfile" + +jobs: + build-hic-breakfinder-dockerfile: + name: Build hic_breakfinder Dockerfile + uses: paulsengroup/2022-mcf10a-cancer-progression/.github/workflows/build-dockerfile.yml@main + with: + dockerfile-glob: "containers/hic-breakfinder*.Dockerfile" + + permissions: + contents: read + packages: write diff --git a/.github/workflows/build-hictrans-dockerfile.yml b/.github/workflows/build-hictrans-dockerfile.yml new file mode 100644 index 0000000..c524a9f --- /dev/null +++ b/.github/workflows/build-hictrans-dockerfile.yml @@ -0,0 +1,29 @@ +# Copyright (c) 2023 Roberto Rossini (roberros@uio.no) +# SPDX-License-Identifier: MIT + +name: Build HiCTrans Dockerfile + +on: + push: + branches: [ main ] + paths: + - ".github/workflows/build-hictrans*dockerfile.yml" + - ".github/workflows/build-dockerfile.yml" + - "containers/hictrans*.Dockerfile" + pull_request: + branches: [ main ] + paths: + - ".github/workflows/build-hictrans*dockerfile.yml" + - ".github/workflows/build-dockerfile.yml" + - "containers/hictrans*.Dockerfile" + +jobs: + build-hictrans-dockerfile: + name: Build HiCTrans Dockerfile + uses: paulsengroup/2022-mcf10a-cancer-progression/.github/workflows/build-dockerfile.yml@main + with: + dockerfile-glob: "containers/hictrans*.Dockerfile" + + permissions: + contents: read + packages: write diff --git a/.github/workflows/build-hint-dockerfile.yml b/.github/workflows/build-hint-dockerfile.yml new file mode 100644 index 0000000..7ff2124 --- /dev/null +++ b/.github/workflows/build-hint-dockerfile.yml @@ -0,0 +1,31 @@ +# Copyright (c) 2023 Roberto Rossini (roberros@uio.no) +# SPDX-License-Identifier: MIT + +name: Build HiNT Dockerfile + +on: + push: + branches: [ main ] + paths: + - ".github/workflows/build-hint*dockerfile.yml" + - ".github/workflows/build-dockerfile.yml" + - "containers/hint*.Dockerfile" + - "containers/patches/hint.patch" + pull_request: + branches: [ main ] + paths: + - ".github/workflows/build-hint*dockerfile.yml" + - ".github/workflows/build-dockerfile.yml" + - "containers/hint*.Dockerfile" + - "containers/patches/hint.patch" + +jobs: + build-hint-dockerfile: + name: Build HiNT Dockerfile + uses: paulsengroup/2022-mcf10a-cancer-progression/.github/workflows/build-dockerfile.yml@main + with: + dockerfile-glob: "containers/hint*.Dockerfile" + + permissions: + contents: read + packages: write diff --git a/bin/compute_restriction_sites_for_hint.py b/bin/compute_restriction_sites_for_hint.py new file mode 100755 index 0000000..3d11068 --- /dev/null +++ b/bin/compute_restriction_sites_for_hint.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2023 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +import argparse + +import bioframe as bf +import pandas as pd + + +def make_cli(): + cli = argparse.ArgumentParser() + + cli.add_argument("fna", type=str) + cli.add_argument("enzymes", nargs="+", type=str) + + return cli + + +if __name__ == "__main__": + args = vars(make_cli().parse_args()) + + dfs = [] + + fna = bf.load_fasta(args["fna"]) + for enz in args["enzymes"]: + dfs.append(bf.digest(fna, enz)) + + df = pd.concat(dfs)[["chrom", "start"]].drop_duplicates(keep="first") + + groups = df.groupby("chrom")["start"].aggregate(list) + for chrom, sites in groups.apply(lambda l: " ".join(str(x) for x in sorted(l))).items(): + print(chrom, sites) diff --git a/configs/detect_structural_variants.config b/configs/detect_structural_variants.config new file mode 100644 index 0000000..454ba83 --- /dev/null +++ b/configs/detect_structural_variants.config @@ -0,0 +1,57 @@ +// Copyright (C) 2022 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +params { + data_dir = 'data' + input_dir = "${data_dir}/input" + output_dir = "${data_dir}/output/structural_variants" + + mcools = "${data_dir}/output/nfcore_hic/mcools/hg38*_merged.mcool" + nfcore_hic_samplesheet = "${data_dir}/input/nfcore_hic_samplesheet.csv" + alignment_dir = "${data_dir}/output/nfcore_hic/alignments" + + ref_genome_name = 'hg38' + reference_genome = "${data_dir}/input/${ref_genome_name}/${ref_genome_name}.filtered.fa" + blacklist = "${data_dir}/input/${ref_genome_name}/${ref_genome_name}_blacklist.bed.gz" + + hictrans_resolution = 10000 + + hint_resolution = 50 // kb + hint_refzip = "${data_dir}/input/hint/refData_${ref_genome_name}.zip" + hint_backdirzip = "${data_dir}/input/hint/backgroundMatrices_${ref_genome_name}.zip" + restriction_enzymes = 'DpnII HindIII' + restriction_enzymes_alias = 'arima' + + hic_breakfinder_quality_score = 30 + hic_breakfinder_expected_intra = "${data_dir}/input/hic_breakfinder/intra_expect_100kb.${ref_genome_name}.txt" + hic_breakfinder_expected_inter = "${data_dir}/input/hic_breakfinder/inter_expect_1Mb.${ref_genome_name}.txt" +} + +process { + container = 'ghcr.io/paulsengroup/2022-mcf10a-cancer-progression/hictrans:e26ad6a' + withName:run_hictrans { + memory = 40.GB + } + withName:digest_genome_for_hint { + container = 'ghcr.io/paulsengroup/2022-mcf10a-cancer-progression/hint:2.2.8' + } + withName:run_hint_cnv { + memory = 6.GB + container = 'ghcr.io/paulsengroup/2022-mcf10a-cancer-progression/hint:2.2.8' + } + withName:run_hint_tl { + memory = 40.GB + container = 'ghcr.io/paulsengroup/2022-mcf10a-cancer-progression/hint:2.2.8' + } + withName:filter_mappings { + container = 'ghcr.io/paulsengroup/2022-mcf10a-cancer-progression/samtools:1.17' + } + withName:merge_bams { + container = 'ghcr.io/paulsengroup/2022-mcf10a-cancer-progression/samtools:1.17' + } + withName:run_hic_breakfinder { + memory = 165.GB + container = 'ghcr.io/paulsengroup/2022-mcf10a-cancer-progression/hic-breakfinder:30a0dcc' + } +} diff --git a/containers/hic-breakfinder__v30a0dcc.Dockerfile b/containers/hic-breakfinder__v30a0dcc.Dockerfile new file mode 100644 index 0000000..35de9db --- /dev/null +++ b/containers/hic-breakfinder__v30a0dcc.Dockerfile @@ -0,0 +1,71 @@ +# Copyright (C) 2023 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +FROM mambaorg/micromamba:1.4.3 AS builder + +ARG CONTAINER_VERSION + +RUN if [ -z "$CONTAINER_VERSION" ]; then echo "Missing CONTAINER_VERSION --build-arg" && exit 1; fi + +ARG MAMBA_DOCKERFILE_ACTIVATE=1 + +ARG HIC_BREAKFINDER_GIT='https://github.com/dixonlab/hic_breakfinder.git' +ARG HIC_BREAKFINDER_TAG="${CONTAINER_VERSION}" + +RUN micromamba install -y \ + -c conda-forge \ + -c bioconda \ + bamtools \ + eigen \ + git \ + gxx \ + make + +RUN cd /tmp \ +&& git clone "$HIC_BREAKFINDER_GIT" \ +&& cd hic_breakfinder \ +&& git checkout "$HIC_BREAKFINDER_TAG" + +RUN cd hic_breakfinder \ +&& CXXFLAGS='-isystem /opt/conda/include/bamtools -isystem /opt/conda/include/eigen3 -fpermissive' \ + ./configure \ +&& make -j $(nproc) + + +FROM mambaorg/micromamba:1.4.3 AS base + +ARG CONTAINER_VERSION + +RUN if [ -z "$CONTAINER_VERSION" ]; then echo "Missing CONTAINER_VERSION --build-arg" && exit 1; fi + +ARG MAMBA_DOCKERFILE_ACTIVATE=1 +ARG HINT_VERSION="${CONTAINER_VERSION}" + + +RUN micromamba install -y \ + -c conda-forge \ + -c bioconda \ + bamtools \ + procps-ng \ +&& micromamba clean --all -y + +COPY --from=builder --chown=nobody:nogroup /tmp/hic_breakfinder/src/hic_breakfinder /usr/local/bin/ +COPY --from=builder --chown=nobody:nogroup /tmp/hic_breakfinder/LICENSE /usr/local/share/licenses/hic_breakfinder/ + +WORKDIR /data + +ENV PATH="/opt/conda/bin:$PATH" +ENTRYPOINT ["/usr/local/bin/_entrypoint.sh"] +CMD ["/usr/local/bin/hic_breakfinder"] +WORKDIR /data + +RUN whereis hic_breakfinder + +LABEL org.opencontainers.image.authors='Roberto Rossini ' +LABEL org.opencontainers.image.url='https://github.com/paulsengroup/2022-mcf10a-cancer-progression' +LABEL org.opencontainers.image.documentation='https://github.com/paulsengroup/2022-mcf10a-cancer-progression' +LABEL org.opencontainers.image.source='https://github.com/paulsengroup/2022-mcf10a-cancer-progression' +LABEL org.opencontainers.image.licenses='MIT' +LABEL org.opencontainers.image.title="${CONTAINER_TITLE:-hic_breakfinder}" +LABEL org.opencontainers.image.version="${CONTAINER_VERSION:-latest}" diff --git a/containers/hictrans__ve26ad6a.Dockerfile b/containers/hictrans__ve26ad6a.Dockerfile new file mode 100644 index 0000000..de930ed --- /dev/null +++ b/containers/hictrans__ve26ad6a.Dockerfile @@ -0,0 +1,83 @@ +# Copyright (C) 2023 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +FROM ubuntu:22.04 AS download + +ARG CONTAINER_VERSION +ARG HICTRANS_VERSION="${CONTAINER_VERSION}" + +RUN apt-get update \ +&& apt-get install -y git \ +&& cd /tmp \ +&& git clone https://github.com/ay-lab/HiCtrans.git \ +&& cd HiCtrans \ +&& git checkout "$HICTRANS_VERSION" + + +FROM mambaorg/micromamba:1.4.3 AS base + +ARG CONTAINER_VERSION + +RUN if [ -z "$CONTAINER_VERSION" ]; then echo "Missing CONTAINER_VERSION --build-arg" && exit 1; fi + +ARG MAMBA_DOCKERFILE_ACTIVATE=1 +ARG HICTRANS_VERSION="${CONTAINER_VERSION}" + + +RUN micromamba install -y \ + -c conda-forge \ + -c bioconda \ + cooler \ + pigz \ + procps-ng \ + r-catools \ + r-changepoint \ + r-data.table \ + r-depmixs4 \ + r-deoptimr \ + r-hashmap \ + r-optparse \ + r-r.utils \ + r-rcpp \ +&& micromamba clean --all -y + +COPY --from=download --chown=nobody:nogroup /tmp/HiCtrans/hictrans.v3.R /opt/hictrans/ + +USER root +RUN mkdir /opt/hictrans/bin \ +&& printf '%s\n%s "$@"' \ + '#!/usr/bin/env sh' \ + 'Rscript /opt/hictrans/hictrans.v3.R' > /opt/hictrans/bin/hictrans \ +&& chown -R nobody:nogroup /opt/hictrans \ +&& chmod 755 /opt/hictrans/bin/hictrans +USER mambauser + +WORKDIR /data + +ENV PATH="/opt/conda/bin:/opt/hictrans/bin:$PATH" +ENTRYPOINT ["/usr/local/bin/_entrypoint.sh"] +CMD ["/opt/hictrans/bin/hictrans"] +WORKDIR /data + +# We have to explicitly set these R_* env variables in order for the +# container to work correctly when running using Apptainer +ENV R_HOME=/opt/conda/lib/R +ENV R_LIBS=/opt/conda/lib/R/lib +ENV R_ENVIRON=/opt/conda/lib/R/etc/Renviron +ENV R_HISTFILE=/tmp/.Rhistory + +ENV R_HOME_USER='$R_HOME' +ENV R_LIBS_USER='$R_LIBS' +ENV R_ENVIRON_USER='$R_ENVIRON' +ENV R_PROFILE_USER=/opt/conda/lib/R/etc/.Rprofile + +RUN hictrans --help + +LABEL org.opencontainers.image.authors='Roberto Rossini ' +LABEL org.opencontainers.image.url='https://github.com/paulsengroup/2022-mcf10a-cancer-progression' +LABEL org.opencontainers.image.documentation='https://github.com/paulsengroup/2022-mcf10a-cancer-progression' +LABEL org.opencontainers.image.source='https://github.com/paulsengroup/2022-mcf10a-cancer-progression' +LABEL org.opencontainers.image.licenses='MIT' +LABEL org.opencontainers.image.title="${CONTAINER_TITLE:-hictrans}" +LABEL org.opencontainers.image.version="${CONTAINER_VERSION:-latest}" diff --git a/containers/hint__v2.2.8.Dockerfile b/containers/hint__v2.2.8.Dockerfile new file mode 100644 index 0000000..c1dd057 --- /dev/null +++ b/containers/hint__v2.2.8.Dockerfile @@ -0,0 +1,78 @@ +# Copyright (C) 2023 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +FROM ubuntu:22.04 AS download + +ARG BICSEQ2_SEG_URL='http://www.math.pku.edu.cn/teachers/xirb/downloads/software/BICseq2/BICseq2/BICseq2-seg_v0.7.3.tar.gz' + +RUN apt-get update \ +&& apt-get install -y curl \ +&& cd /tmp \ +&& curl -L "$BICSEQ2_SEG_URL" | tar -xzf - + + +FROM mambaorg/micromamba:1.4.3 AS base + +ARG CONTAINER_VERSION + +RUN if [ -z "$CONTAINER_VERSION" ]; then echo "Missing CONTAINER_VERSION --build-arg" && exit 1; fi + +ARG MAMBA_DOCKERFILE_ACTIVATE=1 +ARG HINT_VERSION="${CONTAINER_VERSION}" + + +RUN micromamba install -y \ + -c conda-forge \ + -c bioconda \ + "hint=$HINT_VERSION" \ + bioframe \ + biopython \ + 'pandas<1' \ + pysam \ + procps-ng \ + unzip \ +&& micromamba clean --all -y + +COPY --from=download --chown=nobody:nogroup /tmp/BICseq2-seg_v0.7.3 /opt/BICseq2-seg/ +COPY --chown=nobody:nogroup containers/patches/hint.patch /tmp/ + +USER root +RUN apt-get update \ +&& apt-get install -y patch \ +&& patch /opt/conda/bin/hint -i /tmp/hint.patch \ +&& rm /tmp/hint.patch \ +&& apt-get remove -y patch \ +&& apt-get autoremove -y \ +&& rm -rf /var/lib/apt/lists/* +USER mambauser + +WORKDIR /data + +ENV PATH="/opt/conda/bin:$PATH" +ENTRYPOINT ["/usr/local/bin/_entrypoint.sh"] +CMD ["/opt/hictrans/bin/hint"] +WORKDIR /data + + +# We have to explicitly set these R_* env variables in order for the +# container to work correctly when running using Apptainer +ENV R_HOME=/opt/conda/lib/R +ENV R_LIBS=/opt/conda/lib/R/lib +ENV R_ENVIRON=/opt/conda/lib/R/etc/Renviron +ENV R_HISTFILE=/tmp/.Rhistory + +ENV R_HOME_USER='$R_HOME' +ENV R_LIBS_USER='$R_LIBS' +ENV R_ENVIRON_USER='$R_ENVIRON' +ENV R_PROFILE_USER=/opt/conda/lib/R/etc/.Rprofile + +RUN hint --help + +LABEL org.opencontainers.image.authors='Roberto Rossini ' +LABEL org.opencontainers.image.url='https://github.com/paulsengroup/2022-mcf10a-cancer-progression' +LABEL org.opencontainers.image.documentation='https://github.com/paulsengroup/2022-mcf10a-cancer-progression' +LABEL org.opencontainers.image.source='https://github.com/paulsengroup/2022-mcf10a-cancer-progression' +LABEL org.opencontainers.image.licenses='MIT' +LABEL org.opencontainers.image.title="${CONTAINER_TITLE:-hint}" +LABEL org.opencontainers.image.version="${CONTAINER_VERSION:-latest}" diff --git a/containers/patches/hint.patch b/containers/patches/hint.patch new file mode 100644 index 0000000..7493182 --- /dev/null +++ b/containers/patches/hint.patch @@ -0,0 +1,22 @@ +diff --git a/bin/hint b/bin/hint +index bc6db7f..3d7fcec 100644 +--- a/bin/hint ++++ b/bin/hint +@@ -127,7 +127,7 @@ def add_cnv_parser(subparsers): + cnvparser.add_argument("--doiter",dest="doIterative",action='store_true',help = "If this switch is on, HiNT will do the iterative regression model by removing copy numer variated regions, DEFAULT=False",required=False,default=False) + cnvparser.add_argument("-f","--format",dest="format",type=str,required = False, choices = ("cooler","juicer"), default="cooler", + help="Format for the output contact matrix, DEFAULT: cooler") +- cnvparser.add_argument("-e","--enzyme",dest="enzyme",choices = ("MboI","HindIII","DpnII"), default="MboI",required = False, help="enzyme used for the Hi-C experiments, will be used to calculate enzyme sites") ++ cnvparser.add_argument("-e","--enzyme",dest="enzyme",choices = ("arima","MboI","HindIII","DpnII"), default="MboI",required = True, help="enzyme used for the Hi-C experiments, will be used to calculate enzyme sites") + cnvparser.add_argument("-r","--resolution",dest="resolution",type=int,required = False, default=50, + help="Resolution for the Hi-C contact matrix used for the CNV detection, unit: kb, DEFAULT: 50kb") + cnvparser.add_argument("-g","--genome",dest="genome", choices = ("hg38","hg19","mm10"), default="hg19",required=False, +@@ -152,7 +152,7 @@ def add_translocation_parser(subparsers): + or the directory that contain Hi-C interaction matrix in sparse or dense matrix format, interchromosomal interaction matrices only. Absolute path is required") + tranlparser.add_argument("--refdir",dest="referencedir",type=str,required = True, + help="the reference directory that downloaded from dropbox dropbox. (https://www.dropbox.com/sh/2ufsyu4wvrboxxp/AABk5-_Fwy7jdM_t0vIsgYf4a?dl=0.)") +- tranlparser.add_argument("-e","--enzyme",dest="enzyme",type=str,required = False, choices = ("DpnII","MboI","HindIII"), default="MboI", ++ tranlparser.add_argument("-e","--enzyme",dest="enzyme",type=str,required = True, choices = ("arima", "DpnII","MboI","HindIII"), default="MboI", + help="Enzyme used in Hi-C experiment, DEFAULT: MboI") + tranlparser.add_argument("-f","--format",dest="format",type=str,required = False, choices = ("cooler","juicer"), default="cooler", + help="Format for the output contact matrix, DEFAULT: cooler") diff --git a/data/downloads.tsv b/data/downloads.tsv index 3311f68..1986a05 100644 --- a/data/downloads.tsv +++ b/data/downloads.tsv @@ -1,4 +1,8 @@ url sha256 dest action +https://www.dropbox.com/scl/fi/7lekl4jeeduym9gav2ajr/backgroundMatrices_hg38.zip?rlkey=bh0pd0pbpedtxf45u6nb2e3m0&dl=1 d0964aa7cb005bd0a1d855175840ccbd26a18f3ec18ec16ebb848a93c441b2de input/hint/backgroundMatrices_hg38.zip none +https://www.dropbox.com/scl/fi/qtac9ptpy5j8066rtr9z6/refData_hg38.zip?rlkey=col12qdnu3nmpdzb461b29sof&dl=1 1d3e34d8a53a6d0fe90db4e49803d6d0452121da53dc9ac69ef9203ce6bdd878 input/hint/refData_hg38.zip none +https://www.dropbox.com/scl/fi/gxr3f5wtolcdbifwxix61/inter_expect_1Mb.hg38.txt?rlkey=4qya4yst8qry1n0l9luqjkisq&dl=1 98cbf89605e03b21ef1ab438b434c800d51661a63c379034dc154a0ed1bfcbea input/hic_breakfinder/inter_expect_1Mb.hg38.txt none +https://www.dropbox.com/scl/fi/t6mw9sxnq33qlyyfn5rf3/intra_expect_100kb.hg38.txt?rlkey=gf62gc8dpvi7h069bninagkfv&dl=1 e9910862481aeef61b0be7571c0f85705c4151701041707ddb85141b3e0c4472 input/hic_breakfinder/intra_expected_100kb.hg38.txt https://hgdownload.soe.ucsc.edu/goldenPath/hg17/liftOver/hg17ToHg38.over.chain.gz 0e68992e2bf3fabc8f549be61a4c0ee69e4234dfa0b4b116cc5cfca14f8eb806 input/NCBI35/liftover/hg17_to_hg38.over.chain.gz none ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE19nnn/GSE19920/suppl/GSE19920_mcf10a_series_cnat_copynumber.txt.gz d1605f99440b541056308b5ea819286be5f511e1793e1b105639f080882eeb5e input/NCBI35/microarrays/GSE19920_MCF10A_series_cnvs.txt.gz none https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?mode=raw&is_datatable=true&acc=GPL3718&id=44346&db=GeoDb_blob10 73909b66e63eb287655aa258dbebadafb57baffc34a7941054c7530239d16c9f input/NCBI35/microarrays/affymetrix_probes_mapping250K_Nsp.tsv.gz compress diff --git a/run_detect_structural_variants.sh b/run_detect_structural_variants.sh new file mode 100755 index 0000000..0466050 --- /dev/null +++ b/run_detect_structural_variants.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +# Copyright (C) 2022 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +set -e +set -u +set -o pipefail +set -x + + +step='detect_structural_variants' +wd=".nextflow-$step-wd" +mkdir -p "$wd" + +./setup_workflow_workdir.sh "$PWD" "$wd" +./run_workflow.sh "$wd" "$step" diff --git a/workflows/detect_structural_variants.nf b/workflows/detect_structural_variants.nf new file mode 100644 index 0000000..318dc85 --- /dev/null +++ b/workflows/detect_structural_variants.nf @@ -0,0 +1,426 @@ +#!/usr/bin/env nextflow +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +nextflow.enable.dsl=2 + +def collect_files(prefix, sample_id, suffix, type = "file") { + def files = file("${prefix}/${sample_id}*${suffix}", + type: type, + checkIfExists: true) + + // Example: hg38_001_MCF10A_WT_REP1 -> hg38_MCF10A_WT + def condition_id = sample_id.replaceAll(/(.*)_\d{3}_(.*)_REP\d/, '$1_$2') + + return tuple(condition_id, files) +} + + +workflow { + Channel.fromPath(params.mcools, checkIfExists: true) + .set { mcools } + + Channel.fromPath(params.nfcore_hic_samplesheet) + .splitCsv(sep: ",", header: true) + .map { it.sample } + .unique() + .set { sample_ids } + + aln_dir = file(params.alignment_dir, type: 'dir', checkIfExists: true) + + sample_ids + .map { collect_files(aln_dir, it, '.bwt2pairs.cram') } + .set { alignments } + + extract_chrom_sizes_from_cooler( + mcools.first(), + params.hictrans_resolution + ) + + extract_chrom_sizes_from_cooler.out.chrom_sizes + .set { chrom_sizes } + + generate_chromosome_pairs( + chrom_sizes + ) + + generate_chromosome_pairs.out.pairs + .splitCsv(sep: '\t', header: true) + .set { chrom_pairs } + + preproc_coolers_for_hictrans( + mcools.combine([params.hictrans_resolution]) + .combine(chrom_pairs) + .map { tuple(it[0], it[1], it[2].chrom1, it[2].chrom2) } + ) + + run_hictrans( + preproc_coolers_for_hictrans.out.data, + chrom_sizes + ) + + digest_genome_for_hint( + file(params.reference_genome, checkIfExists: true), + params.ref_genome_name, + params.restriction_enzymes, + params.restriction_enzymes_alias + ) + + run_hint_cnv( + mcools, + file(params.hint_refzip, checkIfExists: true), + digest_genome_for_hint.out.txt, + params.hint_resolution, + params.ref_genome_name + ) + + run_hint_tl( + mcools, + file(params.hint_refzip, checkIfExists: true), + file(params.hint_backdirzip, checkIfExists: true), + digest_genome_for_hint.out.txt, + params.ref_genome_name + ) + + filter_mappings( + alignments, + params.hic_breakfinder_quality_score + ) + + merge_bams( + filter_mappings.out.bam.groupTuple() + ) + + run_hic_breakfinder( + merge_bams.out.bam, + file(params.hic_breakfinder_expected_intra), + file(params.hic_breakfinder_expected_inter) + ) +} + + +process generate_chromosome_pairs { + label 'process_very_short' + + input: + path chrom_sizes + + output: + path "*.tsv", emit: pairs + + shell: + ''' + #!/usr/bin/env python3 + + import pandas as pd + + df = pd.read_table("!{chrom_sizes}", names=["chrom", "length"]) + + chroms = df["chrom"].tolist() + + with open("chromosome_pairs.tsv", "w") as f: + print("chrom1\\tchrom2", file=f) + for i, chrom1 in enumerate(chroms): + for chrom2 in chroms[i:]: + print(f"{chrom1}\\t{chrom2}", file=f) + ''' +} + + +process extract_chrom_sizes_from_cooler { + label 'process_very_short' + + input: + path cooler + val resolution + + output: + path "*.chrom.sizes", emit: chrom_sizes + + shell: + outname="${cooler.baseName}_${resolution}.chrom.sizes" + ''' + set -o pipefail + + if [ '!{resolution}' -ne 0 ]; then + cooler='!{cooler}::/resolutions/!{resolution}' + else + cooler='!{cooler}' + fi + + # Dump .chrom.sizes + cooler dump -t chroms "$cooler" | grep 'chr[0-9X]\\+' > '!{outname}' + ''' +} + +process preproc_coolers_for_hictrans { + label 'process_medium' + tag "${cooler.simpleName}_${chrom1}_${chrom2}" + + input: + tuple path(cooler), + val(resolution), + val(chrom1), + val(chrom2) + + output: + tuple val(chrom1), + val(chrom2), + path("*_abs.bed.gz"), + path("*.matrix.gz"), emit: data + + shell: + outprefix="${cooler.simpleName}" + // For some weird reason, adding comments inside the shell block for this process + // breaks -resume!?!? + // Comment #1: + // Dump the bin table. + // 4th column contains the absolute bin id (starting from 1) + // The 5th column contains 1 if the bin overlaps with one or + // more blacklisted regions and 0 otherwise + // Comment #2: + // cooler dump --one-based-ids does not work as intended, so instead we use AWK to offset bin IDs + ''' + set -o pipefail + + if [ '!{resolution}' -ne 0 ]; then + cooler='!{cooler}::/resolutions/!{resolution}' + else + cooler='!{cooler}' + fi + + cooler dump -t bins "$cooler" | + grep '^chr[0-9X]\\+' | + cut -f 1-3 | + awk -F '\\t' 'BEGIN{OFS=FS} {print $1,$2,$3,NR}' | + pigz -9 -p '!{task.cpus}' > '!{outprefix}_abs.bed.gz' + + cooler dump -t pixels "$cooler" -r '!{chrom1}' -r2 '!{chrom2}' | + awk -F '\\t' 'BEGIN{OFS=FS} {print $1+1,$2+1,$3}' | + pigz -9 -p '!{task.cpus}' > '!{outprefix}.matrix.gz' + ''' +} + + +process run_hictrans { + publishDir "${params.output_dir}/hictrans", mode: 'copy' + tag "${matrix.simpleName}_${chrom1}_${chrom2}" + + label 'process_long' + + input: + tuple val(chrom1), + val(chrom2), + path(bins), + path(matrix) + + path chrom_sizes + + output: + path "*.tar.gz", emit: tar + + shell: + prefix="${matrix.simpleName}" + outdir="${prefix}_${chrom1}_${chrom2}" + ''' + hictrans \\ + --mat '!{matrix}' \\ + --bed '!{bins}' \\ + --chrA '!{chrom1}' \\ + --chrB '!{chrom2}' \\ + --resolutions 2,3,4,5,6,8,10 \\ + --covq 0.1 \\ + --chromsize '!{chrom_sizes}' \\ + --prefix '!{prefix}' + + mkdir -p '!{outdir}' + + if compgen -G "./*/*/MultiResolution_supported_Translocations/*.txt" > /dev/null; then + cp ./*/*/MultiResolution_supported_Translocations/*.txt '!{outdir}/' + fi + + if compgen -G "./*/*/Translocations/Details/*.txt" > /dev/null; then + cp ./*/*/Translocations/Details/*.txt '!{outdir}/' + fi + + tar -czf '!{outdir}.tar.gz' '!{outdir}' + ''' +} + +process digest_genome_for_hint { + + input: + path fna + val assembly + val enzymes + val enzyme_alias + + output: + path "*.txt", emit: txt + + shell: + outputname="${assembly}_${enzyme_alias}_enzymeSites.txt" + ''' + compute_restriction_sites_for_hint.py \\ + '!{fna}' \\ + !{enzymes} > '!{outputname}' + ''' +} + +process run_hint_cnv { + publishDir "${params.output_dir}/hint_cnv", mode: 'copy' + + tag "${cooler.simpleName}" + + input: + path cooler + path refzip + path restriction_sites + val resolution + val ref_genome_name + + output: + path "*.tar.gz", emit: tar + + shell: + outprefix="${cooler.simpleName}" + ''' + if [ '!{resolution}' -ne 0 ]; then + cooler='!{cooler}::/resolutions/!{resolution}000' + else + cooler='!{cooler}' + fi + + unzip '!{refzip}' + cp '!{restriction_sites}' hg38/ + + hint cnv \\ + --matrixfile "$cooler" \\ + --format cooler \\ + --resolution '!{resolution}' \\ + --refdir '!{ref_genome_name}' \\ + --genome '!{ref_genome_name}' \\ + --name '!{outprefix}' \\ + --bicseq /opt/BICseq2-seg \\ + --enzyme 'arima' + + mv HiNTcnv_OUTPUT '!{outprefix}' + tar -czf '!{outprefix}.tar.gz' '!{outprefix}' + ''' +} + + +process run_hint_tl { + publishDir "${params.output_dir}/hint_tl", mode: 'copy' + + tag "${mcool.simpleName}" + + label 'process_high' + + input: + path mcool + path refzip + path backdirzip + path restriction_sites + val ref_genome_name + + output: + path "*.tar.gz", emit: tar + + shell: + outprefix="${mcool.simpleName}" + ''' + unzip '!{refzip}' + unzip '!{backdirzip}' + cp '!{restriction_sites}' hg38/ + + # Setting --name breaks things + hint tl \\ + --matrixfile '!{mcool}::/resolutions/1000000,!{mcool}::/resolutions/100000' \\ + --format cooler \\ + --refdir '!{ref_genome_name}' \\ + --backdir '!{ref_genome_name}' \\ + --genome '!{ref_genome_name}' \\ + --cutoff 0.05 \\ + --enzyme 'arima' \\ + --ppath "$(which pairix)" \\ + --threads '!{task.cpus}' + + mv HiNTtransl_OUTPUT '!{outprefix}' + tar -czf '!{outprefix}.tar.gz' '!{outprefix}' + ''' +} + +process filter_mappings { + label 'process_medium' + + tag "${aln.simpleName}" + + input: + tuple val(condition), + path(aln) + + val score + + output: + tuple val(condition), + path("*.bam"), + emit: bam + + shell: + outname="${aln.baseName}.filtered.bam" + ''' + mkdir -p tmp + samtools view -bSq '!{score}' -@'!{task.cpus}' '!{aln}' | + samtools sort -T tmp/ -O bam -@'!{task.cpus}' -o '!{outname}' + ''' +} + +process merge_bams { + label 'process_medium' + + tag "${condition}" + + input: + tuple val(condition), + path(bams) + + output: + tuple val(condition), + path("*.bam"), + emit: bam + + shell: + outname="${condition}.filtered.bam" + ''' + samtools merge -@'!{task.cpus}' -O bam -o '!{outname}' !{bams} + ''' +} + + +process run_hic_breakfinder { + publishDir "${params.output_dir}/hic_breakfinder", mode: 'copy' + + tag "${condition}" + + input: + tuple val(condition), + path(bam) + + path expected_intra + path expected_inter + + output: + path "*breaks*.txt", emit: txt + + shell: + outprefix=condition + ''' + hic_breakfinder \\ + --bam-file '!{bam}' \\ + --exp-file-inter '!{expected_inter}' \\ + --exp-file-intra '!{expected_intra}' \\ + --name '!{outprefix}' + ''' +}