nf-core · lgrochowalski · Sep 13, 2023 · Sep 13, 2023 · Sep 13, 2023 · Sep 13, 2023
diff --git a/bin/calculate_TPM_HTSeq.R b/bin/calculate_TPM_HTSeq.R
@@ -0,0 +1,103 @@
+#!/usr/bin/env Rscript
+
+#-------------------------
+#
+# Description: 
+# Script to count TPM values for HTSeq quantification results
+#
+# Created by B. Mika-Gospodorz
+# Date: 29th May 2020
+#
+# Input args:   [1] = quantification_results_uniquely_mapped.tsv - HTSeq quantification results
+#			    [2] = host gene attribute used for quantification, e.g. 'gene_id'
+#				[3] =  pathogen gff file with gene features (from 3rd col) replaced with 'quant'
+#				[4] =  host gff file with gene features (from 3rd col) replaced with 'quant'
+# Output file: quantification_results_uniquely_mapped_NumReads_TPM.tsv
+#
+#-------------------------
+
+library(plyr)
+library(rtracklayer)
+
+
+args = commandArgs(trailingOnly=TRUE)
+# read quantification table
+gene_attribute <- args[2]
+table_htseq <- read.table(args[1], sep="\t", stringsAsFactors=F, header = T,row.names = gene_attribute) 
+pathogen_gff <- import(args[3])
+host_gff <- import(args[4])
+
+
+# function to extract gene length from gff file
+extract_transcript_length <- function(x,host_gff,gene_attribute){
+  #find annotations that contain host transcripts
+  h_tr <- host_gff[mcols(host_gff)[gene_attribute][[1]]==x]
+  # Filter quant features
+  quants <- h_tr[h_tr$type == "quant",]
+  # merge overlapping features, so that the same bases are covered by reduced collection of features
+  t_length <- sum(width(reduce(quants)))
+  return(t_length)
+}
+
+######################### extract gene length #######################################
+
+### pathogen
+names_gff_pathogen <- mcols(pathogen_gff)[gene_attribute][[1]]  # extract gene names from gff file
+common_pathogen = intersect(rownames(table_htseq), names_gff_pathogen) # find positions of pathogen genes in quantification table
+pathogen_table_htseq <- table_htseq[common_pathogen,] # extract pathogen quantification results
+pathogen_gff_match <- match(common_pathogen,names_gff_pathogen) # find positions of corresponding genes in gff file
+pathogen_table_htseq <- cbind(length = pathogen_gff@ranges@width[pathogen_gff_match],pathogen_table_htseq) # extract gene length from gff file and combine it with quantification table
+
+
+### host
+names_gff_host <- mcols(host_gff)[gene_attribute][[1]] # extract gene names from gff file
+lack_of_attribute <- which(is.na(names_gff_host)) #find positions without gene_attribute (eg.genes that don't have transcript_id attribute)
+if (length(lack_of_attribute)> 0) {
+  host_gff <- host_gff[-lack_of_attribute] #remove positions without gene_attribute 
+  names_gff_host <- names_gff_host[-lack_of_attribute] # extract gene names that contain gene_attribute
+}
+common_host = intersect(rownames(table_htseq),names_gff_host) # find positions of host genes in quantification table
+host_table_htseq <- table_htseq[common_host,] #extract quantification results of host genes 
+transcript_length <- sapply(common_host, extract_transcript_length, host_gff=host_gff,gene_attribute = gene_attribute,simplify = TRUE) #extract gene length
+host_table_htseq <- cbind(length = transcript_length,host_table_htseq) # add gene length into quantification table
+
+
+# combine host and pathogen tables
+htseq_table_with_gene_length = rbind(pathogen_table_htseq,host_table_htseq)
+
+
+######################### calculate TPM values ####################################### 
+
+# function to calculate TPMs
+tpm <- function(counts, lengths) {
+  rate <- counts / lengths
+  return(rate / sum(rate) * 1e6)
+}
+
+# function to add _TPM suffix
+rename_add_TPM <- function(x) {
+  paste(x,"_TPM",sep='')
+}
+
+# function to add _NumReads suffix
+rename_add_NumReads <- function(x) {
+  paste(x,"_NumReads",sep='')
+}
+
+
+# calculate TPMs for each gene in each sample
+TPMs <- apply(htseq_table_with_gene_length[2:dim(htseq_table_with_gene_length)[2]], 2, function(x) tpm(x, htseq_table_with_gene_length$length))
+colnames(TPMs) <- colnames(htseq_table_with_gene_length[,-1])
+# add TPM suffix to column names with TPM values
+colnames(TPMs) <-sapply(colnames(TPMs),function(x) rename_add_TPM(x))
+# add NumReads suffix to column names with NumReads values
+colnames_NR <- sapply(colnames(htseq_table_with_gene_length[,-1]),function(x) rename_add_NumReads(x))
+# add 'length' name for column which stores gene length
+colnames(htseq_table_with_gene_length) <- (c('length',colnames_NR))
+
+# combine results
+quant_results <- cbind(name = rownames(TPMs), htseq_table_with_gene_length,TPMs)
+names(quant_results)[1] <- gene_attribute
+
+# save results
+write.table(quant_results,file = "quantification_results_uniquely_mapped_NumReads_TPM.tsv",sep = "\t", row.names = F, quote = F)
diff --git a/bin/combine_quant_annotations.py b/bin/combine_quant_annotations.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Apr  9 12:07:56 2020
+
+@author: B.Mika-Gospdoorz
+
+Input files: .tsv quantification table with combined results from all samples
+            .tsv file with annotations extracted from gff using extract_annotations_from_gff.py
+Output file: *combined_quant_gene_level_annotations.tsv with gene annotations and quantification results
+Description: Used to combine annotations with quantification results
+"""
+
+import argparse
+import pandas as pd
+
+
+# function to combine annotations with quantification results
+def combine_annotations_quant(quantification_table, annotations_table, gene_attribute, organism):
+    # read quantification results
+    col_names = pd.read_csv(quantification_table, sep = '\t', nrows=0).columns
+    types_dict = {gene_attribute: str}
+    types_dict.update({col: float for col in col_names if col not in types_dict})
+    quantification = pd.read_csv(quantification_table,sep="\t",index_col=0, dtype = types_dict)
+    # read annotations 
+    annotations = pd.read_csv(annotations_table,sep="\t",index_col=0, dtype='str')
+    # combine annotations and quantification results
+    quant_merged_table = pd.concat([annotations, quantification], axis=1, join = 'inner').sort_index()
+    quant_merged_table.index.names = [gene_attribute] 
+    # save results
+    if organism == 'pathogen':
+        quant_merged_table.to_csv("pathogen_combined_quant_annotations.tsv",sep='\t')
+    elif organism == 'host':
+        quant_merged_table.to_csv("host_combined_quant_annotations.tsv",sep='\t')
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="""Combine annotations with quantification results""")
+    parser.add_argument("-q", "--quantification_table", metavar='<quantification_table>', help="Path to quantification results ")
+    parser.add_argument("-annotations", "--annotations", metavar='<annotations>', help="Path to annotations extracted from gff file")
+    parser.add_argument("-a", "--gene_attribute",metavar='<gene_attribute>', help="gene attribute")
+    parser.add_argument("-org", "--organism",metavar='<organism>', help="host, pathogen or both")
+
+
+    args = parser.parse_args()
+
+    # combine annotations with quantification results
+    combine_annotations_quant(args.quantification_table,args.annotations,args.gene_attribute,args.organism)
diff --git a/bin/split_quant_tables.sh b/bin/split_quant_tables.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+#-------------------------
+#
+# Description: 
+# Script to split HTSeq quantification tables into host and pathogen results
+#
+# Created by B. Mika-Gospodorz
+# Date: 3rd June 2020
+#
+# Input files:  $1 HTSeq quantification table 
+# 		$2 tsv file that contains host annotations extracted with extract_annotations_from_gff.py
+# 		$3 tsv file that contains pathogen annotations extracted with extract_annotations_from_gff.py
+# 		$4 output file name
+# Output: host_* and pathogen_* tsv files defined in the 4th argument
+#
+#-------------------------
+
+quant_table=$1
+host_annotations=$2
+pathogen_annotations=$3
+out_name=$4
+
+# extract host quantification results based on gene annotation from tsv file (extract first column from host annotation table with gene names and extract quantification results for them)
+awk -F"\t" '!(NR<=1) {print $1}' $host_annotations | awk 'NR==FNR{a[$0]=$0}NR>FNR{if($1==a[$1])print $0}' - $quant_table > host_quant
+# copy header from quantification table to host quantification table
+awk 'NR==1 {print; exit}' $quant_table | cat - host_quant > host_$out_name
+
+
+# extract pathogen quantification results based on gene annotation from tsv file
+awk -F"\t" '!(NR<=1) {print $1}' $pathogen_annotations | awk 'NR==FNR{a[$0]=$0}NR>FNR{if($1==a[$1])print $0}' - $quant_table > pathogen_quant
+# copy header from quantification table to pathogen quantification table
+awk 'NR==1 {print; exit}' $quant_table | cat - pathogen_quant > pathogen_$out_name
diff --git a/conf/modules.config b/conf/modules.config
@@ -157,6 +157,14 @@ process {
         ]
         ext.args = '--sjdbGTFfeatureExon quant --sjdbGTFtagExonParentTranscript parent'
     }
+
+    withName: 'NFCORE_DUALRNASEQ:DUALRNASEQ:STAR_HTSEQ:STAR_GENOMEGENERATE' {
+        publishDir = [
+            path: { "${params.outdir}/STAR_HTSEQ/star/" },
+            mode: params.publish_dir_mode
+        ]
+        ext.args = '--sjdbGTFfeatureExon exon --sjdbGTFtagExonParentTranscript Parent --sjdbOverhang 100'
+    }
 
     withName: 'NFCORE_DUALRNASEQ:DUALRNASEQ:SALMON_ALIGNMENT_BASED:STAR_ALIGN' {
         ext.args = ['--readFilesCommand gunzip -c',
@@ -183,6 +191,65 @@ process {
         ]
     }
 
+    withName: 'NFCORE_DUALRNASEQ:DUALRNASEQ:STAR_HTSEQ:STAR_ALIGN' {
+        ext.args = ['--readFilesCommand gunzip -c',
+        "--outSAMunmapped ${params.outSAMunmapped}",
+        "--outSAMattributes ${params.outSAMattributes}",
+        "--outFilterMultimapNmax ${params.outFilterMultimapNmax}",
+        "--outFilterType ${params.outFilterType}",
+        "--alignSJoverhangMin ${params.alignSJoverhangMin}",
+        "--alignSJDBoverhangMin ${params.alignSJDBoverhangMin}",
+        "--outFilterMismatchNmax ${params.outFilterMismatchNmax}",
+        "--outFilterMismatchNoverReadLmax ${params.outFilterMismatchNoverReadLmax}",
+        "--alignIntronMin ${params.alignIntronMin}",
+        "--alignIntronMax ${params.alignIntronMax}",
+        "--alignMatesGapMax ${params.alignMatesGapMax}",
+        "--outWigType ${params.outWigType}",
+        "--outWigStrand ${params.outWigStrand}",
+        "--limitBAMsortRAM ${params.limitBAMsortRAM}",
+        "--sjdbGTFfeatureExon exon",
+        "--sjdbGTFtagExonParentTranscript Parent",
+        "--winAnchorMultimapNmax ${params.winAnchorMultimapNmax}"].join(' ').trim()
+        publishDir = [
+            path: { "${params.outdir}/STAR_HTSEQ/star/" },
+            mode: params.publish_dir_mode
+        ]
+    }
+
+    withName: 'NFCORE_DUALRNASEQ:DUALRNASEQ:STAR_HTSEQ:COMBINE_QUANTIFICATION_RESULTS_HTS' {
+        publishDir = [
+            path: { "${params.outdir}/STAR_HTSEQ/comb_quants_htseq_uniq_mapped/" },
+            mode: params.publish_dir_mode
+        ]
+    }
+
+    withName: 'NFCORE_DUALRNASEQ:DUALRNASEQ:STAR_HTSEQ:HTSEQ_UNIQUELY_MAPPED_TPM' {
+        publishDir = [
+            path: { "${params.outdir}/STAR_HTSEQ/htseq_uniquely_mapped_TPM/" },
+            mode: params.publish_dir_mode
+        ]
+    }
+
+    withName: 'NFCORE_DUALRNASEQ:DUALRNASEQ:STAR_HTSEQ:SPLIT_QUATIFICATION_TABLES_HTSEQ' {
+        publishDir = [
+            path: { "${params.outdir}/STAR_HTSEQ/split_quants_uniq_mapped_host/" },
+            mode: params.publish_dir_mode
+        ]
+    }
+
+    withName: 'NFCORE_DUALRNASEQ:DUALRNASEQ:STAR_HTSEQ:COMBINE_ANNOTATIONS_QUANT_PATHOGEN' {
+        publishDir = [
+            path: { "${params.outdir}/STAR_HTSEQ/comb_annots_quant_host/" },
+            mode: params.publish_dir_mode
+        ]
+    }
+
+    withName: 'NFCORE_DUALRNASEQ:DUALRNASEQ:STAR_HTSEQ:COMBINE_ANNOTATIONS_QUANT_HOST' {
+        publishDir = [
+            path: { "${params.outdir}/STAR_HTSEQ/comb_annots_quant_host/" },
+            mode: params.publish_dir_mode
+        ]
+    }
 
     withName: 'NFCORE_DUALRNASEQ:DUALRNASEQ:SALMON_ALIGNMENT_BASED:SALMON_QUANT' {
         publishDir = [

diff --git a/modules/local/combine_annotations_quant.nf b/modules/local/combine_annotations_quant.nf
@@ -0,0 +1,19 @@
+process COMBINE_ANNOTATIONS_QUANT {
+	label 'process_high'
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'nfcore/dualrnaseq:dev' :
+        'nfcore/dualrnaseq:dev' }"
+
+	input: 
+		path(quantification_table)
+		path(annotation_table)
+		val(attribute)
+
+	output:
+		path "pathogen_combined_quant_annotations.tsv"
+
+	script:
+	"""
+	python3 $workflow.projectDir/bin/combine_quant_annotations.py -q $quantification_table -annotations $annotation_table -a $attribute -org pathogen
+	"""
+	}
diff --git a/modules/local/combine_quantification_results_hts.nf b/modules/local/combine_quantification_results_hts.nf
@@ -5,16 +5,15 @@ process COMBINE_QUANTIFICATION_RESULTS_HTS {
         'nfcore/dualrnaseq:dev' }"
 
     input: 
-	    path input_quantification
-	    val gene_attribute
-        val organism
+	    path(input_quantification)
+        val(host_attribute)
     output:
-	    path "combined_$organism.tsv", emit: combined_quant_data
+	    path "quantification_results_htseq.tsv", emit: combined_quant_data
     script:
     """
     python $workflow.projectDir/bin/collect_quantification_data_hts.py \
         -i $input_quantification \
-        -a $gene_attribute \
-        -org $organism
+        -a $host_attribute        
     """
-}
+}
+
diff --git a/modules/local/htseq.nf b/modules/local/htseq.nf
@@ -8,41 +8,39 @@ process HTSEQ {
         'quay.io/biocontainers/htseq:2.0.2--py38h7a2e8c7_0' }"
 
     input:
-    tuple val(meta), path(gff)
-	val(sample_name), path(st)
+	tuple val(meta), path(st)
+    path(gff)
 	val(host_attribute)
-	val(stranded)
-    val(quantifier)
-
+    val(stranded)
+
     output:
-	tuple val(meta), path("*_count.txt"), emit: counts
+	tuple val(meta), path("*_count.txt"), emit: results
     path("versions.yml"), emit: versions
 
     when:
     task.ext.when == null || task.ext.when
 
     script:
     def args = task.ext.args ?: ''
-	def output_file = sample_name + "_count.txt"
+	def output_file = meta.id + "_count.txt"
     """
 	htseq-count  \\
         -n ${task.cpus}  \\
-        -t $quantifier  \\
+        -t quant  \\
         -f bam  \\
         -r pos $st $gff  \\
         -i $host_attribute  \\
         -s $stranded  \\
         --max-reads-in-buffer=${params.max_reads_in_buffer}  \\
         -a ${params.minaqual}  \\
-        ${params.htseq_params}  \\
         $args  \\
         > $output_file
 
-	sed -i '1{h;s/.*/'"$sample_name"'/;G}' "$output_file"
+	sed -i '1{h;s/.*/'"$meta.id"'/;G}' "$output_file"
 
-    cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        htseq-count: \$( htseq-count --help | grep -i version | awk '{ print \$10 }' )
-    END_VERSIONS
+cat <<-END_VERSIONS > versions.yml
+"${task.process}":
+    htseq-count: \$( htseq-count --help | grep -i version | tail -n 1 | cut -d' ' -f2 )
+END_VERSIONS
     """
 }
diff --git a/modules/local/htseq_uniquely_mapped_tpm.nf b/modules/local/htseq_uniquely_mapped_tpm.nf
@@ -0,0 +1,20 @@
+process HTSEQ_UNIQUELY_MAPPED_TPM {
+	label 'process_high'
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'nfcore/dualrnaseq:dev' :
+        'nfcore/dualrnaseq:dev' }"
+
+	input: 
+		tuple val(meta), path(input_quantification)
+		val(host_attribute)
+		path(gff_host)
+		path(gff_pathogen)
+
+	output:
+		path "quantification_results_uniquely_mapped_NumReads_TPM.tsv"
+
+	script:
+	"""
+	Rscript $workflow.projectDir/bin/calculate_TPM_HTSeq.R $input_quantification $host_attribute $gff_pathogen $gff_host
+	"""
+	}
diff --git a/modules/local/replace_attribute.nf b/modules/local/replace_attribute.nf
@@ -1,6 +1,6 @@
 // ONE FOR HOST_GENOME
 // ONE FOR HOST_TRNA
-process REPLACE_ATTRIBUTE_GFF_STAR_SALMON {
+process REPLACE_ATTRIBUTE_GFF {
 	tag "repl_attribute_host_tRNA_gff"
 
 	label 'process_high'