Merge pull request #5 from IGDRion/dev

Dev
IGDRion · Dec 26, 2023 · ace38f2 · ace38f2
2 parents 91f5900 + 9f67c99
commit ace38f2
Show file tree

Hide file tree

Showing 45 changed files with 39 additions and 35 deletions.
diff --git a/.github/workflows/build-and-publish.yml b/.github/workflows/build-and-publish.yml
diff --git a/.gitignore b/.gitignore
diff --git a/Dockerfile b/Dockerfile
@@ -1,8 +1,13 @@
 FROM mambaorg/micromamba:0.25.1
 
+USER root
+
 COPY --chown=$MAMBA_USER:$MAMBA_USER environment.yml /tmp/environment.yml
 
-RUN micromamba install -y -n base -f /tmp/environment.yml && \
-    micromamba clean --all --yes && rm /tmp/environment.yml
+RUN apt-get update && \
+    apt-get install --no-install-recommends -y procps && \
+    micromamba install -y -n base -f /tmp/environment.yml && \
+    micromamba clean --all --yes && \
+    rm /tmp/environment.yml
 
 ENV PATH "$MAMBA_ROOT_PREFIX/bin:$PATH"
diff --git a/README.md b/README.md
diff --git a/assets/metro_map.png b/assets/metro_map.png
diff --git a/bin/filter_gtf_ndr.py b/bin/filter_gtf_ndr.py
@@ -1,4 +1,4 @@
-#! /usr/bin/env python3
+#! /usr/bin/env python3 
 from typing import Set
 from GTF import GTF
 
@@ -9,7 +9,7 @@ def parse_bambu(line):
 
 def parse_tfkmers(line):
     ids = line[0].split("::")
-    return ids[0], ids[1], line[1]
+    return ids[1], ids[0], line[1]
 
 
 def parse_ndr(csv, origin, th) -> Set[str]:
@@ -26,7 +26,7 @@ def parse_ndr(csv, origin, th) -> Set[str]:
         elif origin == "tfkmers":
             line = parse_tfkmers(line)
 
-        _, tx_id, ndr = line
+        tx_id, _, ndr = line
         ndr = float(ndr)
 
         if ndr < th:
@@ -39,7 +39,7 @@ def filter_count_matrix(file, transcripts, wr):
     print(next(file), file=wr)
     for line in file:
         line_splitted = line.split("\t")
-        if line_splitted[0].startswith("tx.") and line_splitted[0] not in transcripts:
+        if line_splitted[0].startswith("BambuTx") and line_splitted[0].lower() not in transcripts:
             continue
         print(line.rstrip(), file=wr)
 

diff --git a/bin/qc.R b/bin/qc.R
@@ -233,7 +233,8 @@ transcript = read.csv(paste0(prefix,".transcript.stats"), header = T)
 lncRNA_biotypes = c("lncRNA",
                     "antisense",
                     "non-coding",
-                    "lnc_RNA")
+                    "lnc_RNA",
+                    "ncRNA")
 mRNA_biotypes = c("protein_coding", "mRNA")
 
 transcript = transcript %>%

diff --git a/bin/qc_gtf.py b/bin/qc_gtf.py
@@ -1,4 +1,4 @@
-#! /usr/bin/env python3
+#! /usr/bin/env python3 
 from GTF import GTF
 
 
@@ -48,22 +48,22 @@ def qc_gtf(gtf, gene_counts, ref):
     )
     exon_str = "exon_biotype,length,discovery\n"
 
-    biotypes = set(("protein_coding", "lncRNA", "lnc_RNA"))
+    biotypes = set(("protein_coding", "lncRNA", "lnc_RNA", "ncRNA"))
     for gene in GTF.parse(gtf).values():
         if gene["gene_biotype"] not in biotypes:
             continue
 
         g_id = gene["gene_id"]
         g_biotype = gene["gene_biotype"]
-        g_status = "novel" if g_id.startswith("gene.") else "known"
+        g_status = "novel" if g_id.startswith(('BambuGene','unstranded.Gene')) else "known"
         g_count = gene_counts[g_id]["counts"]  # Counts in all samples
         g_samples = gene_counts[g_id]["validates"]  # Found in x samples
         g_nb_tx = len(gene.transcripts)  # Number of isoforms
 
         # Compute genomic extension with start/end in ref
         ext_5 = 0
         ext_3 = 0
-        if not g_id.startswith("gene."):
+        if not g_id.startswith(('BambuGene','unstranded.Gene')):
             if gene.strand == "+":
                 ext_5 = ref_start_end[gene["gene_id"]]["start"] - gene.start
                 ext_3 = gene.end - ref_start_end[gene["gene_id"]]["end"]
@@ -82,7 +82,7 @@ def qc_gtf(gtf, gene_counts, ref):
         for transcript in gene.transcripts:
             tx_id = transcript["transcript_id"]
             tx_biotype = transcript["transcript_biotype"]
-            tx_status = "novel" if tx_id.startswith("tx.") else "known"
+            tx_status = "novel" if tx_id.startswith("BambuTx") else "known"
             tx_nb_exons = len(transcript.exons)
             tx_length = sum([len(exon) for exon in transcript.exons])
 

diff --git a/bin/run_bambu.R b/bin/run_bambu.R
@@ -8,16 +8,12 @@ suppressPackageStartupMessages(library("BSgenome"))
 ################################################
 args = commandArgs(trailingOnly = TRUE)
 
-output_tag     <-
-  strsplit(grep('--tag*', args, value = TRUE), split = '=')[[1]][[2]]
-ncore          <-
-  strsplit(grep('--ncore*', args, value = TRUE), split = '=')[[1]][[2]]
-genomeseq      <-
-  strsplit(grep('--fasta*', args, value = TRUE), split = '=')[[1]][[2]]
+output_tag     <- strsplit(grep('--tag*', args, value = TRUE), split = '=')[[1]][[2]]
+ncore          <- strsplit(grep('--ncore*', args, value = TRUE), split = '=')[[1]][[2]]
+genomeseq      <- strsplit(grep('--fasta*', args, value = TRUE), split = '=')[[1]][[2]]
 genomeSequence <- Rsamtools::FaFile(genomeseq)
 Rsamtools::indexFa(genomeseq)
-annot_gtf      <-
-  strsplit(grep('--annotation*', args, value = TRUE), split = '=')[[1]][[2]]
+annot_gtf      <- strsplit(grep('--annotation*', args, value = TRUE), split = '=')[[1]][[2]]
 readlist       <- args[5:length(args)]
 
 print("BAMs:")
@@ -32,16 +28,16 @@ se     <- bambu(
   annotations = grlist,
   genome = genomeSequence,
   ncore = ncore,
-  opt.discovery = list(min.readCount = 5,
-                       max.txNDR = 1,
-                       min.txScore.singleExon = 0)
+  verbose = TRUE,
+  NDR = 1,
+  opt.discovery = list(min.txScore.singleExon = 0)
 )
 
 # Extract NDR
-tx_infos   <- se@rowRanges@elementMetadata@listData
-new_tx_idx <- tx_infos[["newTxClass"]] != "annotation"
+tx_infos   <- se@rowRanges@elementMetadata
+new_tx_idx <- tx_infos[tx_infos$novelTranscript == "TRUE" & tx_infos$txClassDescription != "annotation", c(1,2,3) ]
 write.csv(
-  data.frame(tx_infos[c("GENEID", "TXNAME", "txNDR")])[new_tx_idx,],
+  new_tx_idx,
   "bambu_ndr.csv",
   quote = FALSE,
   row.names = FALSE

diff --git a/bin/validate_gtf.py b/bin/validate_gtf.py
@@ -46,7 +46,7 @@
         # Check for RefSeq gene_biotype format
         if g_biotype == "mRNA":
             g_biotype = "protein_coding"
-        if g_biotype == "lnc_RNA":
+        if g_biotype == "lnc_RNA" or g_biotype == "ncRNA":
             g_biotype = "lncRNA"
         record["gene_biotype"] = g_biotype
 
@@ -58,7 +58,7 @@
         # Check for RefSeq transcript_biotype format
         if t_biotype == "mRNA":
             t_biotype = "protein_coding"
-        elif t_biotype == "lnc_RNA":
+        elif t_biotype == "lnc_RNA" or t_biotype == "ncRNA":
             t_biotype = "lncRNA"
         record["transcript_biotype"] = t_biotype
 

diff --git a/environment.yml b/environment.yml
@@ -15,6 +15,8 @@ dependencies:
   - conda-forge::r-ggridges
   - conda-forge::r-viridis
 
+  - conda-forge::procps-ng
+
   - pip
   - pip:
       - git+https://github.com/igdrion/transforkmers.git