From d7bfcd7831cb45dbe8802e2fd3cdb6a120854a71 Mon Sep 17 00:00:00 2001 From: Adrian Seyboldt Date: Fri, 9 Sep 2016 15:19:09 +0200 Subject: [PATCH 01/11] Load bowtie2 in jobscript --- jobscript.sh | 1 + 1 file changed, 1 insertion(+) mode change 100755 => 100644 jobscript.sh diff --git a/jobscript.sh b/jobscript.sh old mode 100755 new mode 100644 index 7e18136..28bfb4f --- a/jobscript.sh +++ b/jobscript.sh @@ -12,6 +12,7 @@ set -e module load bio/fastqc/0.10 module load qbic/anaconda module load qbic/htseq/0.6.1p2 +module load qbic/bowtie2/2.2.3 module load qbic/tophat module load bio/samtools/1.2 From dfe0aa714fad8393a36e1a2b0d97603f106a2516 Mon Sep 17 00:00:00 2001 From: qbicStefanC Date: Tue, 8 Nov 2016 12:51:22 +0100 Subject: [PATCH 02/11] Update Snakefile --- Snakefile | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 231b274..9b9b13a 100644 --- a/Snakefile +++ b/Snakefile @@ -208,8 +208,13 @@ rule MergeAdapters: output: "MergeAdapters/merged.fasta" shell: "cat {input} > {output}" +rule subset_Adapters: + input: "MergeAdapters/merged.fasta", + output: "MergeAdapters/merged.subset.fasta" + shell: "awk '/^>/ {{P=index($0,""No Hit"")==0}} {{if(P) print}} ' {input} > {output}" + rule CutAdapt: - input: "MergeAdapters/merged.fasta", "PreFilterReads/{name}.fastq" + input: "MergeAdapters/merged.subset.fasta", "PreFilterReads/{name}.fastq" output: "CutAdaptMerge/{name}.fastq" run: with open(str(input[0])) as f: From fe25802e2570058817a8d4ec539339de1f557720 Mon Sep 17 00:00:00 2001 From: qbicStefanC Date: Tue, 8 Nov 2016 15:54:46 +0100 Subject: [PATCH 03/11] Update Snakefile --- Snakefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 9b9b13a..8a6b065 100644 --- a/Snakefile +++ b/Snakefile @@ -211,7 +211,10 @@ rule MergeAdapters: rule subset_Adapters: input: "MergeAdapters/merged.fasta", output: "MergeAdapters/merged.subset.fasta" - shell: "awk '/^>/ {{P=index($0,""No 
Hit"")==0}} {{if(P) print}} ' {input} > {output}" + shell: + """ + awk '/^>/ {{P=index($0,"No Hit")==0}} {{if(P) print}} ' {input} > {output} + """ rule CutAdapt: input: "MergeAdapters/merged.subset.fasta", "PreFilterReads/{name}.fastq" From cd28335ae96903b8d8173ee52df443970c0f02b0 Mon Sep 17 00:00:00 2001 From: Timo Lucas Date: Fri, 4 May 2018 12:20:49 +0200 Subject: [PATCH 04/11] Update Snakefile --- Snakefile | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 8a6b065..180de5f 100644 --- a/Snakefile +++ b/Snakefile @@ -101,6 +101,9 @@ OUTPUT_FILES.extend(expand("TopHat2/{name}/accepted_hits.bai", name=INPUT_FILES, OUTPUT_FILES.extend(expand("Summary/MappingStats/{name}.txt", name=INPUT_FILES, result=RESULT)) #OUTPUT_FILES.append("checksums.ok") OUTPUT_FILES.append(result('all_counts.csv')) +OUTPUT_FILES.append("Summary/software_versions.txt") + + rule all: input: OUTPUT_FILES @@ -213,7 +216,7 @@ rule subset_Adapters: output: "MergeAdapters/merged.subset.fasta" shell: """ - awk '/^>/ {{P=index($0,"No Hit")==0}} {{if(P) print}} ' {input} > {output} + awk '/^>/ {{P=index($0,"No Hit")==0}} {{if(P) print}} ' {input} > {output} """ rule CutAdapt: @@ -293,3 +296,19 @@ rule NumreadsOrig: input: "fastq/{name}.fastq" output: "Summary/NumReads/Original/{name}.txt" shell: '''dc -e "$(wc -l {input} | cut -f1 -d' ') 4 / p" > {output}''' +""" +Rule to get software versions of used programs in workflow. Rule either calls program with --version flag if possible, or runs +it without parameters displaying output row containing version information with unix tail command. 
+It then redirects it to Summary/software_versions.txt +""" +rule SoftwareVersions: + input: result("all_counts.csv") + output: "Summary/software_versions.txt" + run: + shell("anaconda --version > Summary/software_versions.txt") + shell("conda --version >> Summary/software_versions.txt") + shell("fastqc --version >> Summary/software_versions.txt") + shell("htseq-count -h | tail -1 >> Summary/software_versions.txt") + shell("bowtie2 --version | head -1 >> Summary/software_versions.txt") + shell("tophat2 --version >> Summary/software_versions.txt") + shell("samtools --version | head -2 >> Summary/software_versions.txt") From 61cb543a3aa4b3c29598d504b80c94595470e059 Mon Sep 17 00:00:00 2001 From: Timo Lucas Date: Fri, 4 May 2018 12:25:47 +0200 Subject: [PATCH 05/11] Software Versions Rule Added a rule to display the versions of the software used by the workflow. It simply runs the used tools with --version flag (if possible) and stores the info in a text file in the Summary folder. As htseq-count does not provide the --version flag I just took the last line when calling it without parameters. --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 180de5f..d5d3a5b 100644 --- a/Snakefile +++ b/Snakefile @@ -299,7 +299,7 @@ rule NumreadsOrig: """ Rule to get software versions of used programs in workflow. Rule either calls program with --version flag if possible, or runs it without parameters displaying output row containing version information with unix tail command. 
-It then redirects it to Summary/software_versions.txt +It then redirects it to Summary/software_versions.txt """ rule SoftwareVersions: input: result("all_counts.csv") From cffe75ec8cedba6059b4b1f653714228e29a129d Mon Sep 17 00:00:00 2001 From: Timo Lucas Date: Fri, 4 May 2018 12:30:31 +0200 Subject: [PATCH 06/11] indentation Fixed small indentation error --- Snakefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index d5d3a5b..1f14489 100644 --- a/Snakefile +++ b/Snakefile @@ -216,7 +216,7 @@ rule subset_Adapters: output: "MergeAdapters/merged.subset.fasta" shell: """ - awk '/^>/ {{P=index($0,"No Hit")==0}} {{if(P) print}} ' {input} > {output} + awk '/^>/ {{P=index($0,"No Hit")==0}} {{if(P) print}} ' {input} > {output} """ rule CutAdapt: @@ -296,11 +296,13 @@ rule NumreadsOrig: input: "fastq/{name}.fastq" output: "Summary/NumReads/Original/{name}.txt" shell: '''dc -e "$(wc -l {input} | cut -f1 -d' ') 4 / p" > {output}''' + """ Rule to get software versions of used programs in workflow. Rule either calls program with --version flag if possible, or runs it without parameters displaying output row containing version information with unix tail command. 
It then redirects it to Summary/software_versions.txt """ + rule SoftwareVersions: input: result("all_counts.csv") output: "Summary/software_versions.txt" From 111dd7f6ac95a9d812896540982064374cc88333 Mon Sep 17 00:00:00 2001 From: Timo Lucas Date: Fri, 4 May 2018 12:31:44 +0200 Subject: [PATCH 07/11] Update Snakefile --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 1f14489..0b2c728 100644 --- a/Snakefile +++ b/Snakefile @@ -216,7 +216,7 @@ rule subset_Adapters: output: "MergeAdapters/merged.subset.fasta" shell: """ - awk '/^>/ {{P=index($0,"No Hit")==0}} {{if(P) print}} ' {input} > {output} + awk '/^>/ {{P=index($0,"No Hit")==0}} {{if(P) print}} ' {input} > {output} """ rule CutAdapt: From b2fba368df72887730238bce846e76cdff05751b Mon Sep 17 00:00:00 2001 From: Timo Lucas Date: Fri, 4 May 2018 12:33:50 +0200 Subject: [PATCH 08/11] Update Snakefile --- Snakefile | 2 -- 1 file changed, 2 deletions(-) diff --git a/Snakefile b/Snakefile index 0b2c728..623a3ff 100644 --- a/Snakefile +++ b/Snakefile @@ -103,8 +103,6 @@ OUTPUT_FILES.extend(expand("Summary/MappingStats/{name}.txt", name=INPUT_FILES, OUTPUT_FILES.append(result('all_counts.csv')) OUTPUT_FILES.append("Summary/software_versions.txt") - - rule all: input: OUTPUT_FILES From 4fb1afaae012760b5219a5a53a040d6e398214d6 Mon Sep 17 00:00:00 2001 From: Timo Lucas Date: Fri, 18 May 2018 13:49:30 +0200 Subject: [PATCH 09/11] Update README.md --- README.md | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index dce21d0..6acc3d5 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,14 @@ -rna seq workflow for use with qproject. +RNA-Seq workflow for single-end data. 
The workflow can be downloaded and run on a cluster environment using the tool qproject provided on github: https://github.com/qbicsoftware/qproject -Add a config file "params.json" in `etc`: +The workflow uses a module system to load the required software. Be sure to check the `jobscript.sh` file to see which software modules are required. The modules are loaded automatically when using `qproject run` to start the workflow, otherwise they have to be loaded manually. + +One should use qproject to download the files. This also creates all folders necessary for the workflow. + +``` +qproject create -t . -w github:qbicsoftware/rnaseq +``` + +Be sure to add a config file "params.json" in `etc` which should look like this: ```json { @@ -17,6 +25,55 @@ Add a config file "params.json" in `etc`: where `indexed_genome` and `gtf` are paths relative to `ref`. `indexed_genome` is the basename of a bowtie2 index. +`gtf` is the .gtf file of the reference genome. The parameters `stranded`, `overlap_mode`, `feature_type` and `gff_attribute` are explained in the htseq documentation. + +Members of QBiC can download the data for analysis using `qpostman`: https://github.com/qbicsoftware/postman-cli + +It should be installed on the computing stations and can be loaded with: + +``` +module load qbic/qpostman/0.1.2.3 +``` + +To download the data navigate to the data folder and either provide a QBiC ID: +``` +java -jar qpostman.jar -i <QBiC-ID> -u <user-name> +``` + +or a file containing the project QBiC IDs: + +``` +postman-0.1.2.3 -f sample_IDs.csv -u user-name +``` + +If you're not using `qpostman` just put the relevant files in the data folder (formats supported: `.fastq`, `.fastq.gz`). + +To run the workflow navigate to the `src` folder. +Using `snakemake -n` one can display the operations the workflow will perform. +Using the `--dag` parameter and piping it to `dot` one can create a .pdf version of the directed acyclic graph used by snakemake to inspect the behaviour of the workflow on a local machine. 
+ +``` +cd src/ +snakemake -n +snakemake --dag | dot -Tpdf > dag.pdf +``` + +To run the workflow: + +``` +qproject run -t .. +``` + +While running one can inspect the log files (e.g. in a different screen session) for the progress and errors generated by the workflow: + +``` +cd logs/ +tail snake.err -f +``` + +And to check the jobs on the computing cluster one can use `qstat`. + +Alternatively to using `qproject run` one could use `snakemake -j` to run the workflow, but then be sure to check the `jobscript.sh` to load the required modules manually and also note that this would also not use `qsub` to submit the jobs. From f56e69b65c0a08d782788bc29e062c34a25feb50 Mon Sep 17 00:00:00 2001 From: qbicStefanC Date: Mon, 13 Aug 2018 15:54:11 +0200 Subject: [PATCH 10/11] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 6acc3d5..ad2da05 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,7 @@ Using `snakemake -n` one can display the operations the workflow will perform. Using the `--dag` parameter and piping it to `dot` one can create a .pdf version of the directed acyclic graph used by snakemake to inspect the behaviour of the workflow on a local machine. ``` +module load qbic/anaconda cd src/ snakemake -n snakemake --dag | dot -Tpdf > dag.pdf From 8cc7f75f93a5a7e10fa3a72271fafe3dbf78c8fe Mon Sep 17 00:00:00 2001 From: qbicStefanC Date: Tue, 23 Oct 2018 13:03:23 +0200 Subject: [PATCH 11/11] Update jobscript.sh updated samtools load of module --- jobscript.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jobscript.sh b/jobscript.sh index c79a60d..8a4b49c 100644 --- a/jobscript.sh +++ b/jobscript.sh @@ -14,7 +14,7 @@ module load qbic/anaconda module load qbic/htseq/0.6.1p2 module load qbic/bowtie2/2.2.3 module load qbic/tophat -module load bio/samtools/1.2 +module load qbic/samtools {exec_job} exit 0