Skip to content

Commit

Permalink
Merge pull request #5 from IGDRion/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
tderrien authored Dec 26, 2023
2 parents 91f5900 + 9f67c99 commit ace38f2
Show file tree
Hide file tree
Showing 45 changed files with 39 additions and 35 deletions.
Empty file modified .github/workflows/build-and-publish.yml
100644 → 100755
Empty file.
Empty file modified .gitignore
100644 → 100755
Empty file.
9 changes: 7 additions & 2 deletions Dockerfile
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
FROM mambaorg/micromamba:0.25.1

USER root

COPY --chown=$MAMBA_USER:$MAMBA_USER environment.yml /tmp/environment.yml

RUN micromamba install -y -n base -f /tmp/environment.yml && \
micromamba clean --all --yes && rm /tmp/environment.yml
RUN apt-get update && \
apt-get install --no-install-recommends -y procps && \
micromamba install -y -n base -f /tmp/environment.yml && \
micromamba clean --all --yes && \
rm /tmp/environment.yml

ENV PATH "$MAMBA_ROOT_PREFIX/bin:$PATH"
Empty file modified README.md
100644 → 100755
Empty file.
Empty file modified assets/metro_map.png
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
8 changes: 4 additions & 4 deletions bin/filter_gtf_ndr.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#! /usr/bin/env python3
#! /usr/bin/env python3
from typing import Set
from GTF import GTF

Expand All @@ -9,7 +9,7 @@ def parse_bambu(line):

def parse_tfkmers(line):
ids = line[0].split("::")
return ids[0], ids[1], line[1]
return ids[1], ids[0], line[1]


def parse_ndr(csv, origin, th) -> Set[str]:
Expand All @@ -26,7 +26,7 @@ def parse_ndr(csv, origin, th) -> Set[str]:
elif origin == "tfkmers":
line = parse_tfkmers(line)

_, tx_id, ndr = line
tx_id, _, ndr = line
ndr = float(ndr)

if ndr < th:
Expand All @@ -39,7 +39,7 @@ def filter_count_matrix(file, transcripts, wr):
print(next(file), file=wr)
for line in file:
line_splitted = line.split("\t")
if line_splitted[0].startswith("tx.") and line_splitted[0] not in transcripts:
if line_splitted[0].startswith("BambuTx") and line_splitted[0].lower() not in transcripts:
continue
print(line.rstrip(), file=wr)

Expand Down
3 changes: 2 additions & 1 deletion bin/qc.R
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,8 @@ transcript = read.csv(paste0(prefix,".transcript.stats"), header = T)
lncRNA_biotypes = c("lncRNA",
"antisense",
"non-coding",
"lnc_RNA")
"lnc_RNA",
"ncRNA")
mRNA_biotypes = c("protein_coding", "mRNA")

transcript = transcript %>%
Expand Down
10 changes: 5 additions & 5 deletions bin/qc_gtf.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#! /usr/bin/env python3
#! /usr/bin/env python3
from GTF import GTF


Expand Down Expand Up @@ -48,22 +48,22 @@ def qc_gtf(gtf, gene_counts, ref):
)
exon_str = "exon_biotype,length,discovery\n"

biotypes = set(("protein_coding", "lncRNA", "lnc_RNA"))
biotypes = set(("protein_coding", "lncRNA", "lnc_RNA", "ncRNA"))
for gene in GTF.parse(gtf).values():
if gene["gene_biotype"] not in biotypes:
continue

g_id = gene["gene_id"]
g_biotype = gene["gene_biotype"]
g_status = "novel" if g_id.startswith("gene.") else "known"
g_status = "novel" if g_id.startswith(('BambuGene','unstranded.Gene')) else "known"
g_count = gene_counts[g_id]["counts"] # Counts in all samples
g_samples = gene_counts[g_id]["validates"] # Found in x samples
g_nb_tx = len(gene.transcripts) # Number of isoforms

# Compute genomic extension with start/end in ref
ext_5 = 0
ext_3 = 0
if not g_id.startswith("gene."):
if not g_id.startswith(('BambuGene','unstranded.Gene')):
if gene.strand == "+":
ext_5 = ref_start_end[gene["gene_id"]]["start"] - gene.start
ext_3 = gene.end - ref_start_end[gene["gene_id"]]["end"]
Expand All @@ -82,7 +82,7 @@ def qc_gtf(gtf, gene_counts, ref):
for transcript in gene.transcripts:
tx_id = transcript["transcript_id"]
tx_biotype = transcript["transcript_biotype"]
tx_status = "novel" if tx_id.startswith("tx.") else "known"
tx_status = "novel" if tx_id.startswith("BambuTx") else "known"
tx_nb_exons = len(transcript.exons)
tx_length = sum([len(exon) for exon in transcript.exons])

Expand Down
24 changes: 10 additions & 14 deletions bin/run_bambu.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,12 @@ suppressPackageStartupMessages(library("BSgenome"))
################################################
args = commandArgs(trailingOnly = TRUE)

output_tag <-
strsplit(grep('--tag*', args, value = TRUE), split = '=')[[1]][[2]]
ncore <-
strsplit(grep('--ncore*', args, value = TRUE), split = '=')[[1]][[2]]
genomeseq <-
strsplit(grep('--fasta*', args, value = TRUE), split = '=')[[1]][[2]]
output_tag <- strsplit(grep('--tag*', args, value = TRUE), split = '=')[[1]][[2]]
ncore <- strsplit(grep('--ncore*', args, value = TRUE), split = '=')[[1]][[2]]
genomeseq <- strsplit(grep('--fasta*', args, value = TRUE), split = '=')[[1]][[2]]
genomeSequence <- Rsamtools::FaFile(genomeseq)
Rsamtools::indexFa(genomeseq)
annot_gtf <-
strsplit(grep('--annotation*', args, value = TRUE), split = '=')[[1]][[2]]
annot_gtf <- strsplit(grep('--annotation*', args, value = TRUE), split = '=')[[1]][[2]]
readlist <- args[5:length(args)]

print("BAMs:")
Expand All @@ -32,16 +28,16 @@ se <- bambu(
annotations = grlist,
genome = genomeSequence,
ncore = ncore,
opt.discovery = list(min.readCount = 5,
max.txNDR = 1,
min.txScore.singleExon = 0)
verbose = TRUE,
NDR = 1,
opt.discovery = list(min.txScore.singleExon = 0)
)

# Extract NDR
tx_infos <- se@rowRanges@elementMetadata@listData
new_tx_idx <- tx_infos[["newTxClass"]] != "annotation"
tx_infos <- se@rowRanges@elementMetadata
new_tx_idx <- tx_infos[tx_infos$novelTranscript == "TRUE" & tx_infos$txClassDescription != "annotation", c(1,2,3) ]
write.csv(
data.frame(tx_infos[c("GENEID", "TXNAME", "txNDR")])[new_tx_idx,],
new_tx_idx,
"bambu_ndr.csv",
quote = FALSE,
row.names = FALSE
Expand Down
4 changes: 2 additions & 2 deletions bin/validate_gtf.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
# Check for RefSeq gene_biotype format
if g_biotype == "mRNA":
g_biotype = "protein_coding"
if g_biotype == "lnc_RNA":
if g_biotype == "lnc_RNA" or g_biotype == "ncRNA":
g_biotype = "lncRNA"
record["gene_biotype"] = g_biotype

Expand All @@ -58,7 +58,7 @@
# Check for RefSeq transcript_biotype format
if t_biotype == "mRNA":
t_biotype = "protein_coding"
elif t_biotype == "lnc_RNA":
elif t_biotype == "lnc_RNA" or t_biotype == "ncRNA":
t_biotype = "lncRNA"
record["transcript_biotype"] = t_biotype

Expand Down
2 changes: 2 additions & 0 deletions environment.yml
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ dependencies:
- conda-forge::r-ggridges
- conda-forge::r-viridis

- conda-forge::procps-ng

- pip
- pip:
- git+https://github.com/igdrion/transforkmers.git
Expand Down
Loading

0 comments on commit ace38f2

Please sign in to comment.