diff --git a/.github/workflows/build-and-publish.yml b/.github/workflows/build-and-publish.yml old mode 100644 new mode 100755 diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 diff --git a/Dockerfile b/Dockerfile old mode 100644 new mode 100755 index dc22e26..18e65df --- a/Dockerfile +++ b/Dockerfile @@ -1,8 +1,13 @@ FROM mambaorg/micromamba:0.25.1 +USER root + COPY --chown=$MAMBA_USER:$MAMBA_USER environment.yml /tmp/environment.yml -RUN micromamba install -y -n base -f /tmp/environment.yml && \ - micromamba clean --all --yes && rm /tmp/environment.yml +RUN apt-get update && \ + apt-get install --no-install-recommends -y procps && \ + micromamba install -y -n base -f /tmp/environment.yml && \ + micromamba clean --all --yes && \ + rm /tmp/environment.yml ENV PATH "$MAMBA_ROOT_PREFIX/bin:$PATH" diff --git a/README.md b/README.md old mode 100644 new mode 100755 diff --git a/assets/metro_map.png b/assets/metro_map.png old mode 100644 new mode 100755 diff --git a/bin/filter_gtf_ndr.py b/bin/filter_gtf_ndr.py index 230a9db..e6bba39 100755 --- a/bin/filter_gtf_ndr.py +++ b/bin/filter_gtf_ndr.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python3 +#! /usr/bin/env python3 from typing import Set from GTF import GTF @@ -9,7 +9,7 @@ def parse_bambu(line): def parse_tfkmers(line): ids = line[0].split("::") - return ids[0], ids[1], line[1] + return ids[1], ids[0], line[1] def parse_ndr(csv, origin, th) -> Set[str]: @@ -26,7 +26,7 @@ def parse_ndr(csv, origin, th) -> Set[str]: elif origin == "tfkmers": line = parse_tfkmers(line) - _, tx_id, ndr = line + tx_id, _, ndr = line ndr = float(ndr) if ndr < th: @@ -39,7 +39,7 @@ def filter_count_matrix(file, transcripts, wr): print(next(file), file=wr) for line in file: line_splitted = line.split("\t") - if line_splitted[0].startswith("tx.") and line_splitted[0] not in transcripts: + if line_splitted[0].startswith("BambuTx") and line_splitted[0].lower() not in transcripts: continue print(line.rstrip(), file=wr) diff --git a/bin/qc.R b/bin/qc.R index ee8c3f8..4adbb12 100755 --- a/bin/qc.R +++ b/bin/qc.R @@ -233,7 +233,8 @@ transcript = read.csv(paste0(prefix,".transcript.stats"), header = T) lncRNA_biotypes = c("lncRNA", "antisense", "non-coding", - "lnc_RNA") + "lnc_RNA", + "ncRNA") mRNA_biotypes = c("protein_coding", "mRNA") transcript = transcript %>% diff --git a/bin/qc_gtf.py b/bin/qc_gtf.py index 46d07c4..1980b9d 100755 --- a/bin/qc_gtf.py +++ b/bin/qc_gtf.py @@ -1,4 +1,4 @@ -#! /usr/bin/env python3 +#! /usr/bin/env python3 from GTF import GTF @@ -48,14 +48,14 @@ def qc_gtf(gtf, gene_counts, ref): ) exon_str = "exon_biotype,length,discovery\n" - biotypes = set(("protein_coding", "lncRNA", "lnc_RNA")) + biotypes = set(("protein_coding", "lncRNA", "lnc_RNA", "ncRNA")) for gene in GTF.parse(gtf).values(): if gene["gene_biotype"] not in biotypes: continue g_id = gene["gene_id"] g_biotype = gene["gene_biotype"] - g_status = "novel" if g_id.startswith("gene.") else "known" + g_status = "novel" if g_id.startswith(('BambuGene','unstranded.Gene')) else "known" g_count = gene_counts[g_id]["counts"] # Counts in all samples g_samples = gene_counts[g_id]["validates"] # Found in x samples g_nb_tx = len(gene.transcripts) # Number of isoforms @@ -63,7 +63,7 @@ def qc_gtf(gtf, gene_counts, ref): # Compute genomic extension with start/end in ref ext_5 = 0 ext_3 = 0 - if not g_id.startswith("gene."): + if not g_id.startswith(('BambuGene','unstranded.Gene')): if gene.strand == "+": ext_5 = ref_start_end[gene["gene_id"]]["start"] - gene.start ext_3 = gene.end - ref_start_end[gene["gene_id"]]["end"] @@ -82,7 +82,7 @@ def qc_gtf(gtf, gene_counts, ref): for transcript in gene.transcripts: tx_id = transcript["transcript_id"] tx_biotype = transcript["transcript_biotype"] - tx_status = "novel" if tx_id.startswith("tx.") else "known" + tx_status = "novel" if tx_id.startswith("BambuTx") else "known" tx_nb_exons = len(transcript.exons) tx_length = sum([len(exon) for exon in transcript.exons]) diff --git a/bin/run_bambu.R b/bin/run_bambu.R index 81d2c4f..e68d601 100755 --- a/bin/run_bambu.R +++ b/bin/run_bambu.R @@ -8,16 +8,12 @@ suppressPackageStartupMessages(library("BSgenome")) ################################################ args = commandArgs(trailingOnly = TRUE) -output_tag <- - strsplit(grep('--tag*', args, value = TRUE), split = '=')[[1]][[2]] -ncore <- - strsplit(grep('--ncore*', args, value = TRUE), split = '=')[[1]][[2]] -genomeseq <- - strsplit(grep('--fasta*', args, value = TRUE), split = '=')[[1]][[2]] +output_tag <- strsplit(grep('--tag*', args, value = TRUE), split = '=')[[1]][[2]] +ncore <- strsplit(grep('--ncore*', args, value = TRUE), split = '=')[[1]][[2]] +genomeseq <- strsplit(grep('--fasta*', args, value = TRUE), split = '=')[[1]][[2]] genomeSequence <- Rsamtools::FaFile(genomeseq) Rsamtools::indexFa(genomeseq) -annot_gtf <- - strsplit(grep('--annotation*', args, value = TRUE), split = '=')[[1]][[2]] +annot_gtf <- strsplit(grep('--annotation*', args, value = TRUE), split = '=')[[1]][[2]] readlist <- args[5:length(args)] print("BAMs:") @@ -32,16 +28,16 @@ se <- bambu( annotations = grlist, genome = genomeSequence, ncore = ncore, - opt.discovery = list(min.readCount = 5, - max.txNDR = 1, - min.txScore.singleExon = 0) + verbose = TRUE, + NDR = 1, + opt.discovery = list(min.txScore.singleExon = 0) ) # Extract NDR -tx_infos <- se@rowRanges@elementMetadata@listData -new_tx_idx <- tx_infos[["newTxClass"]] != "annotation" +tx_infos <- se@rowRanges@elementMetadata +new_tx_idx <- tx_infos[tx_infos$novelTranscript == "TRUE" & tx_infos$txClassDescription != "annotation", c(1,2,3) ] write.csv( - data.frame(tx_infos[c("GENEID", "TXNAME", "txNDR")])[new_tx_idx,], + new_tx_idx, "bambu_ndr.csv", quote = FALSE, row.names = FALSE diff --git a/bin/validate_gtf.py b/bin/validate_gtf.py index 7050984..33ef1aa 100755 --- a/bin/validate_gtf.py +++ b/bin/validate_gtf.py @@ -46,7 +46,7 @@ # Check for RefSeq gene_biotype format if g_biotype == "mRNA": g_biotype = "protein_coding" - if g_biotype == "lnc_RNA": + if g_biotype == "lnc_RNA" or g_biotype == "ncRNA": g_biotype = "lncRNA" record["gene_biotype"] = g_biotype @@ -58,7 +58,7 @@ # Check for RefSeq transcript_biotype format if t_biotype == "mRNA": t_biotype = "protein_coding" - elif t_biotype == "lnc_RNA": + elif t_biotype == "lnc_RNA" or t_biotype == "ncRNA": t_biotype = "lncRNA" record["transcript_biotype"] = t_biotype diff --git a/environment.yml b/environment.yml old mode 100644 new mode 100755 index 8ae12fd..8ad9d51 --- a/environment.yml +++ b/environment.yml @@ -15,6 +15,8 @@ dependencies: - conda-forge::r-ggridges - conda-forge::r-viridis + - conda-forge::procps-ng + - pip - pip: - git+https://github.com/igdrion/transforkmers.git diff --git a/examples/CML10_19.bam b/examples/CML10_19.bam old mode 100644 new mode 100755 diff --git a/examples/annotation_19.gtf b/examples/annotation_19.gtf old mode 100644 new mode 100755 diff --git a/examples/launch.sh b/examples/launch.sh old mode 100644 new mode 100755 diff --git a/examples/popsi_19.bam b/examples/popsi_19.bam old mode 100644 new mode 100755 diff --git a/examples/results/known_lncRNA.geneBodyCoverage.curves.pdf b/examples/results/known_lncRNA.geneBodyCoverage.curves.pdf old mode 100644 new mode 100755 diff --git a/examples/results/novel_lncRNA.geneBodyCoverage.curves.pdf b/examples/results/novel_lncRNA.geneBodyCoverage.curves.pdf old mode 100644 new mode 100755 diff --git a/examples/results/qc_gtf.pdf b/examples/results/qc_gtf.pdf old mode 100644 new mode 100755 diff --git a/examples/samples.txt b/examples/samples.txt old mode 100644 new mode 100755 diff --git a/examples/sequence_19.fa b/examples/sequence_19.fa old mode 100644 new mode 100755 diff --git a/examples/twiny_19.bam b/examples/twiny_19.bam old mode 100644 new mode 100755 diff --git a/main.nf b/main.nf old mode 100644 new mode 100755 diff --git a/modules/bambu/bambu.nf b/modules/bambu/bambu.nf old mode 100644 new mode 100755 index fd33e61..4195a77 --- a/modules/bambu/bambu.nf +++ b/modules/bambu/bambu.nf @@ -1,8 +1,8 @@ process BAMBU { - conda (params.enable_conda ? "bioconda::bioconductor-bambu=2.0.6" : null) + conda (params.enable_conda ? "bioconda::bioconductor-bambu=3.0.8" : null) container "${ workflow.containerEngine == 'singularity' ? - 'https://depot.galaxyproject.org/singularity/bioconductor-bambu:2.0.6--r41h619a076_0' : - 'quay.io/biocontainers/bioconductor-bambu:2.0.6--r41h619a076_0' }" + 'https://depot.galaxyproject.org/singularity/bioconductor-bambu:3.0.8--r42hc247a5b_0' : + 'quay.io/biocontainers/bioconductor-bambu:3.0.8--r42hc247a5b_0' }" publishDir "$params.outdir/bambu", mode: 'copy' cpus params.maxCpu memory params.maxMemory diff --git a/modules/bambu/split.nf b/modules/bambu/split.nf old mode 100644 new mode 100755 index 7e1f32a..bacd911 --- a/modules/bambu/split.nf +++ b/modules/bambu/split.nf @@ -8,8 +8,8 @@ process BAMBU_SPLIT_RESULTS { shell: ''' - grep "tx\\." !{extended_annotation} | awk '$3=="exon"' > novel.gtf - grep "gene\\." novel.gtf > novel_genes.gtf - grep -v "gene\\." novel.gtf > novel_isoforms.gtf + grep "BambuTx" !{extended_annotation} | awk '$3=="exon"' > novel.gtf + grep -e "BambuGene" -e "unstranded.Gene" novel.gtf > novel_genes.gtf + grep -v -e "BambuGene" -e "unstranded.Gene" novel.gtf > novel_isoforms.gtf ''' } diff --git a/modules/feelnc/codpot.nf b/modules/feelnc/codpot.nf old mode 100644 new mode 100755 diff --git a/modules/feelnc/format.nf b/modules/feelnc/format.nf old mode 100644 new mode 100755 diff --git a/modules/header.nf b/modules/header.nf old mode 100644 new mode 100755 diff --git a/modules/index_bam.nf b/modules/index_bam.nf old mode 100644 new mode 100755 diff --git a/modules/input/validate.nf b/modules/input/validate.nf old mode 100644 new mode 100755 diff --git a/modules/merge_novel.nf b/modules/merge_novel.nf old mode 100644 new mode 100755 diff --git a/modules/qc/merge_known_novel.nf b/modules/qc/merge_known_novel.nf old mode 100644 new mode 100755 diff --git a/modules/qc/report.nf b/modules/qc/report.nf old mode 100644 new mode 100755 diff --git a/modules/qc/workflow.nf b/modules/qc/workflow.nf old mode 100644 new mode 100755 diff --git a/modules/restore_biotypes.nf b/modules/restore_biotypes.nf old mode 100644 new mode 100755 diff --git a/modules/rseqc/gene_body_coverage.nf b/modules/rseqc/gene_body_coverage.nf old mode 100644 new mode 100755 diff --git a/modules/rseqc/genepredtobed.nf b/modules/rseqc/genepredtobed.nf old mode 100644 new mode 100755 diff --git a/modules/rseqc/gtftogenepred.nf b/modules/rseqc/gtftogenepred.nf old mode 100644 new mode 100755 diff --git a/modules/rseqc/prepare.nf b/modules/rseqc/prepare.nf old mode 100644 new mode 100755 diff --git a/modules/rseqc/workflow.nf b/modules/rseqc/workflow.nf old mode 100644 new mode 100755 diff --git a/modules/transforkmers/extract_regions.nf b/modules/transforkmers/extract_regions.nf old mode 100644 new mode 100755 diff --git a/modules/transforkmers/extract_sequences.nf b/modules/transforkmers/extract_sequences.nf old mode 100644 new mode 100755 diff --git a/modules/transforkmers/filter.nf b/modules/transforkmers/filter.nf old mode 100644 new mode 100755 diff --git a/modules/transforkmers/predict.nf b/modules/transforkmers/predict.nf old mode 100644 new mode 100755 diff --git a/modules/transforkmers/workflow.nf b/modules/transforkmers/workflow.nf old mode 100644 new mode 100755 diff --git a/nextflow.config b/nextflow.config old mode 100644 new mode 100755 index f64be72..fa22833 --- a/nextflow.config +++ b/nextflow.config @@ -8,7 +8,7 @@ params { filter = false tfkmers_threshold = 0.2 bambu_threshold = 0.2 - operation = "union" + operation = "intersection" tfkmers_model = null tfkmers_tokenizer = null }