Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HTSEQ subworkflow #95

Open
wants to merge 12 commits into
base: dev
Choose a base branch
from
103 changes: 103 additions & 0 deletions bin/calculate_TPM_HTSeq.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
#!/usr/bin/env Rscript

#-------------------------
#
# Description:
# Script to count TPM values for HTSeq quantification results
#
# Created by B. Mika-Gospodorz
# Date: 29th May 2020
#
# Input args: [1] = quantification_results_uniquely_mapped.tsv - HTSeq quantification results
# [2] = host gene attribute used for quantification, e.g. 'gene_id'
# [3] = pathogen gff file with gene features (from 3rd col) replaced with 'quant'
# [4] = host gff file with gene features (from 3rd col) replaced with 'quant'
# Output file: quantification_results_uniquely_mapped_NumReads_TPM.tsv
#
#-------------------------

library(plyr)
library(rtracklayer)


args = commandArgs(trailingOnly=TRUE)
# read quantification table
gene_attribute <- args[2]
table_htseq <- read.table(args[1], sep="\t", stringsAsFactors=F, header = T,row.names = gene_attribute)
pathogen_gff <- import(args[3])
host_gff <- import(args[4])


# function to extract gene length from gff file
extract_transcript_length <- function(x,host_gff,gene_attribute){
#find annotations that contain host transcripts
h_tr <- host_gff[mcols(host_gff)[gene_attribute][[1]]==x]
# Filter quant features
quants <- h_tr[h_tr$type == "quant",]
# merge overlapping features, so that the same bases are covered by reduced collection of features
t_length <- sum(width(reduce(quants)))
return(t_length)
}

######################### extract gene length #######################################

### pathogen
names_gff_pathogen <- mcols(pathogen_gff)[gene_attribute][[1]] # extract gene names from gff file
common_pathogen = intersect(rownames(table_htseq), names_gff_pathogen) # find positions of pathogen genes in quantification table
pathogen_table_htseq <- table_htseq[common_pathogen,] # extract pathogen quantification results
pathogen_gff_match <- match(common_pathogen,names_gff_pathogen) # find positions of corresponding genes in gff file
pathogen_table_htseq <- cbind(length = pathogen_gff@ranges@width[pathogen_gff_match],pathogen_table_htseq) # extract gene length from gff file and combine it with quantification table


### host
names_gff_host <- mcols(host_gff)[gene_attribute][[1]] # extract gene names from gff file
lack_of_attribute <- which(is.na(names_gff_host)) #find positions without gene_attribute (eg.genes that don't have transcript_id attribute)
if (length(lack_of_attribute)> 0) {
host_gff <- host_gff[-lack_of_attribute] #remove positions without gene_attribute
names_gff_host <- names_gff_host[-lack_of_attribute] # extract gene names that contain gene_attribute
}
common_host = intersect(rownames(table_htseq),names_gff_host) # find positions of host genes in quantification table
host_table_htseq <- table_htseq[common_host,] #extract quantification results of host genes
transcript_length <- sapply(common_host, extract_transcript_length, host_gff=host_gff,gene_attribute = gene_attribute,simplify = TRUE) #extract gene length
host_table_htseq <- cbind(length = transcript_length,host_table_htseq) # add gene length into quantification table


# combine host and pathogen tables
htseq_table_with_gene_length = rbind(pathogen_table_htseq,host_table_htseq)


######################### calculate TPM values #######################################

# function to calculate TPMs
tpm <- function(counts, lengths) {
rate <- counts / lengths
return(rate / sum(rate) * 1e6)
}

# function to add _TPM suffix
rename_add_TPM <- function(x) {
paste(x,"_TPM",sep='')
}

# function to add _NumReads suffix
rename_add_NumReads <- function(x) {
paste(x,"_NumReads",sep='')
}


# calculate TPMs for each gene in each sample
TPMs <- apply(htseq_table_with_gene_length[2:dim(htseq_table_with_gene_length)[2]], 2, function(x) tpm(x, htseq_table_with_gene_length$length))
colnames(TPMs) <- colnames(htseq_table_with_gene_length[,-1])
# add TPM suffix to column names with TPM values
colnames(TPMs) <-sapply(colnames(TPMs),function(x) rename_add_TPM(x))
# add NumReads suffix to column names with NumReads values
colnames_NR <- sapply(colnames(htseq_table_with_gene_length[,-1]),function(x) rename_add_NumReads(x))
# add 'length' name for column which stores gene length
colnames(htseq_table_with_gene_length) <- (c('length',colnames_NR))

# combine results
quant_results <- cbind(name = rownames(TPMs), htseq_table_with_gene_length,TPMs)
names(quant_results)[1] <- gene_attribute

# save results
write.table(quant_results,file = "quantification_results_uniquely_mapped_NumReads_TPM.tsv",sep = "\t", row.names = F, quote = F)
47 changes: 47 additions & 0 deletions bin/combine_quant_annotations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 9 12:07:56 2020

@author: B.Mika-Gospdoorz

Input files: .tsv quantification table with combined results from all samples
.tsv file with annotations extracted from gff using extract_annotations_from_gff.py
Output file: *combined_quant_gene_level_annotations.tsv with gene annotations and quantification results
Description: Used to combine annotations with quantification results
"""

import argparse
import pandas as pd


# function to combine annotations with quantification results
def combine_annotations_quant(quantification_table, annotations_table, gene_attribute, organism):
# read quantification results
col_names = pd.read_csv(quantification_table, sep = '\t', nrows=0).columns
types_dict = {gene_attribute: str}
types_dict.update({col: float for col in col_names if col not in types_dict})
quantification = pd.read_csv(quantification_table,sep="\t",index_col=0, dtype = types_dict)
# read annotations
annotations = pd.read_csv(annotations_table,sep="\t",index_col=0, dtype='str')
# combine annotations and quantification results
quant_merged_table = pd.concat([annotations, quantification], axis=1, join = 'inner').sort_index()
quant_merged_table.index.names = [gene_attribute]
# save results
if organism == 'pathogen':
quant_merged_table.to_csv("pathogen_combined_quant_annotations.tsv",sep='\t')
elif organism == 'host':
quant_merged_table.to_csv("host_combined_quant_annotations.tsv",sep='\t')

if __name__ == "__main__":
parser = argparse.ArgumentParser(description="""Combine annotations with quantification results""")
parser.add_argument("-q", "--quantification_table", metavar='<quantification_table>', help="Path to quantification results ")
parser.add_argument("-annotations", "--annotations", metavar='<annotations>', help="Path to annotations extracted from gff file")
parser.add_argument("-a", "--gene_attribute",metavar='<gene_attribute>', help="gene attribute")
parser.add_argument("-org", "--organism",metavar='<organism>', help="host, pathogen or both")


args = parser.parse_args()

# combine annotations with quantification results
combine_annotations_quant(args.quantification_table,args.annotations,args.gene_attribute,args.organism)
33 changes: 33 additions & 0 deletions bin/split_quant_tables.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/bin/bash

#-------------------------
#
# Description:
# Script to split HTSeq quantification tables into host and pathogen results
#
# Created by B. Mika-Gospodorz
# Date: 3rd June 2020
#
# Input files: $1 HTSeq quantification table
# $2 tsv file that contains host annotations extracted with extract_annotations_from_gff.py
# $3 tsv file that contains pathogen annotations extracted with extract_annotations_from_gff.py
# $4 output file name
# Output: host_* and pathogen_* tsv files defined in the 4th argument
#
#-------------------------

quant_table=$1
host_annotations=$2
pathogen_annotations=$3
out_name=$4

# extract host quantification results based on gene annotation from tsv file (extract first column from host annotation table with gene names and extract quantification results for them)
awk -F"\t" '!(NR<=1) {print $1}' $host_annotations | awk 'NR==FNR{a[$0]=$0}NR>FNR{if($1==a[$1])print $0}' - $quant_table > host_quant
# copy header from quantification table to host quantification table
awk 'NR==1 {print; exit}' $quant_table | cat - host_quant > host_$out_name


# extract pathogen quantification results based on gene annotation from tsv file
awk -F"\t" '!(NR<=1) {print $1}' $pathogen_annotations | awk 'NR==FNR{a[$0]=$0}NR>FNR{if($1==a[$1])print $0}' - $quant_table > pathogen_quant
# copy header from quantification table to pathogen quantification table
awk 'NR==1 {print; exit}' $quant_table | cat - pathogen_quant > pathogen_$out_name
67 changes: 67 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,14 @@ process {
]
ext.args = '--sjdbGTFfeatureExon quant --sjdbGTFtagExonParentTranscript parent'
}

withName: 'NFCORE_DUALRNASEQ:DUALRNASEQ:STAR_HTSEQ:STAR_GENOMEGENERATE' {
publishDir = [
path: { "${params.outdir}/STAR_HTSEQ/star/" },
mode: params.publish_dir_mode
]
ext.args = '--sjdbGTFfeatureExon exon --sjdbGTFtagExonParentTranscript Parent --sjdbOverhang 100'
}

withName: 'NFCORE_DUALRNASEQ:DUALRNASEQ:SALMON_ALIGNMENT_BASED:STAR_ALIGN' {
ext.args = ['--readFilesCommand gunzip -c',
Expand All @@ -183,6 +191,65 @@ process {
]
}

withName: 'NFCORE_DUALRNASEQ:DUALRNASEQ:STAR_HTSEQ:STAR_ALIGN' {
ext.args = ['--readFilesCommand gunzip -c',
"--outSAMunmapped ${params.outSAMunmapped}",
"--outSAMattributes ${params.outSAMattributes}",
"--outFilterMultimapNmax ${params.outFilterMultimapNmax}",
"--outFilterType ${params.outFilterType}",
"--alignSJoverhangMin ${params.alignSJoverhangMin}",
"--alignSJDBoverhangMin ${params.alignSJDBoverhangMin}",
"--outFilterMismatchNmax ${params.outFilterMismatchNmax}",
"--outFilterMismatchNoverReadLmax ${params.outFilterMismatchNoverReadLmax}",
"--alignIntronMin ${params.alignIntronMin}",
"--alignIntronMax ${params.alignIntronMax}",
"--alignMatesGapMax ${params.alignMatesGapMax}",
"--outWigType ${params.outWigType}",
"--outWigStrand ${params.outWigStrand}",
"--limitBAMsortRAM ${params.limitBAMsortRAM}",
"--sjdbGTFfeatureExon exon",
"--sjdbGTFtagExonParentTranscript Parent",
"--winAnchorMultimapNmax ${params.winAnchorMultimapNmax}"].join(' ').trim()
publishDir = [
path: { "${params.outdir}/STAR_HTSEQ/star/" },
mode: params.publish_dir_mode
]
}

withName: 'NFCORE_DUALRNASEQ:DUALRNASEQ:STAR_HTSEQ:COMBINE_QUANTIFICATION_RESULTS_HTS' {
publishDir = [
path: { "${params.outdir}/STAR_HTSEQ/comb_quants_htseq_uniq_mapped/" },
mode: params.publish_dir_mode
]
}

withName: 'NFCORE_DUALRNASEQ:DUALRNASEQ:STAR_HTSEQ:HTSEQ_UNIQUELY_MAPPED_TPM' {
publishDir = [
path: { "${params.outdir}/STAR_HTSEQ/htseq_uniquely_mapped_TPM/" },
mode: params.publish_dir_mode
]
}

withName: 'NFCORE_DUALRNASEQ:DUALRNASEQ:STAR_HTSEQ:SPLIT_QUATIFICATION_TABLES_HTSEQ' {
publishDir = [
path: { "${params.outdir}/STAR_HTSEQ/split_quants_uniq_mapped_host/" },
mode: params.publish_dir_mode
]
}

withName: 'NFCORE_DUALRNASEQ:DUALRNASEQ:STAR_HTSEQ:COMBINE_ANNOTATIONS_QUANT_PATHOGEN' {
publishDir = [
path: { "${params.outdir}/STAR_HTSEQ/comb_annots_quant_host/" },
mode: params.publish_dir_mode
]
}

withName: 'NFCORE_DUALRNASEQ:DUALRNASEQ:STAR_HTSEQ:COMBINE_ANNOTATIONS_QUANT_HOST' {
publishDir = [
path: { "${params.outdir}/STAR_HTSEQ/comb_annots_quant_host/" },
mode: params.publish_dir_mode
]
}

withName: 'NFCORE_DUALRNASEQ:DUALRNASEQ:SALMON_ALIGNMENT_BASED:SALMON_QUANT' {
publishDir = [
Expand Down
19 changes: 19 additions & 0 deletions modules/local/combine_annotations_quant.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
process COMBINE_ANNOTATIONS_QUANT {
label 'process_high'
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'nfcore/dualrnaseq:dev' :
'nfcore/dualrnaseq:dev' }"

input:
path(quantification_table)
path(annotation_table)
val(attribute)

output:
path "pathogen_combined_quant_annotations.tsv"

script:
"""
python3 $workflow.projectDir/bin/combine_quant_annotations.py -q $quantification_table -annotations $annotation_table -a $attribute -org pathogen
"""
}
13 changes: 6 additions & 7 deletions modules/local/combine_quantification_results_hts.nf
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,15 @@ process COMBINE_QUANTIFICATION_RESULTS_HTS {
'nfcore/dualrnaseq:dev' }"

input:
path input_quantification
val gene_attribute
val organism
path(input_quantification)
val(host_attribute)
output:
path "combined_$organism.tsv", emit: combined_quant_data
path "quantification_results_htseq.tsv", emit: combined_quant_data
script:
"""
python $workflow.projectDir/bin/collect_quantification_data_hts.py \
-i $input_quantification \
-a $gene_attribute \
-org $organism
-a $host_attribute
"""
}
}

26 changes: 12 additions & 14 deletions modules/local/htseq.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,41 +8,39 @@ process HTSEQ {
'quay.io/biocontainers/htseq:2.0.2--py38h7a2e8c7_0' }"

input:
tuple val(meta), path(gff)
val(sample_name), path(st)
tuple val(meta), path(st)
path(gff)
val(host_attribute)
val(stranded)
val(quantifier)

val(stranded)

output:
tuple val(meta), path("*_count.txt"), emit: counts
tuple val(meta), path("*_count.txt"), emit: results
path("versions.yml"), emit: versions

when:
task.ext.when == null || task.ext.when

script:
def args = task.ext.args ?: ''
def output_file = sample_name + "_count.txt"
def output_file = meta.id + "_count.txt"
"""
htseq-count \\
-n ${task.cpus} \\
-t $quantifier \\
-t quant \\
-f bam \\
-r pos $st $gff \\
-i $host_attribute \\
-s $stranded \\
--max-reads-in-buffer=${params.max_reads_in_buffer} \\
-a ${params.minaqual} \\
${params.htseq_params} \\
$args \\
> $output_file

sed -i '1{h;s/.*/'"$sample_name"'/;G}' "$output_file"
sed -i '1{h;s/.*/'"$meta.id"'/;G}' "$output_file"

cat <<-END_VERSIONS > versions.yml
"${task.process}":
htseq-count: \$( htseq-count --help | grep -i version | awk '{ print \$10 }' )
END_VERSIONS
cat <<-END_VERSIONS > versions.yml
"${task.process}":
htseq-count: \$( htseq-count --help | grep -i version | tail -n 1 | cut -d' ' -f2 )
END_VERSIONS
"""
}
20 changes: 20 additions & 0 deletions modules/local/htseq_uniquely_mapped_tpm.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
process HTSEQ_UNIQUELY_MAPPED_TPM {
label 'process_high'
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'nfcore/dualrnaseq:dev' :
'nfcore/dualrnaseq:dev' }"

input:
tuple val(meta), path(input_quantification)
val(host_attribute)
path(gff_host)
path(gff_pathogen)

output:
path "quantification_results_uniquely_mapped_NumReads_TPM.tsv"

script:
"""
Rscript $workflow.projectDir/bin/calculate_TPM_HTSeq.R $input_quantification $host_attribute $gff_pathogen $gff_host
"""
}
2 changes: 1 addition & 1 deletion modules/local/replace_attribute.nf
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// ONE FOR HOST_GENOME
// ONE FOR HOST_TRNA
process REPLACE_ATTRIBUTE_GFF_STAR_SALMON {
process REPLACE_ATTRIBUTE_GFF {
tag "repl_attribute_host_tRNA_gff"

label 'process_high'
Expand Down
Loading