qbicsoftware-archive · qbicStefanC · Jun 27, 2016 · Sep 9, 2016 · Sep 9, 2016 · Nov 8, 2016
diff --git a/README.md b/README.md
@@ -1,6 +1,14 @@
-rna seq workflow for use with qproject.
+RNA-Seq workflow for single-end data. The workflow can be downloaded and run on a cluster environment using the tool qproject provided on github: https://github.com/qbicsoftware/qproject
 
-Add a config file "params.json" in `etc`:
+The workflow uses a module system to load the required software. Be sure to check the `jobscript.sh` file to see which software modules are required. The modules are loaded automatically when using `qproject run` to start the workflow, otherwise they have to be loaded manually.
+
+One should use qproject to download the files. This also creates all folders necessary for the workflow.
+
+```
+qproject create -t . -w github:qbicsoftware/rnaseq
+```
+
+Be sure to add a config file "params.json" in `etc` which should look like this:
 
 ```json
 {
@@ -17,6 +25,56 @@ Add a config file "params.json" in `etc`:
 where `indexed_genome` and `gtf` are paths relative to `ref`.
 
 `indexed_genome` is the basename of a bowtie2 index.
+`gtf` is the .gtf file of the reference genome
 
 The parameters `stranded`, `overlap_mode`, `feature_type` and `gff_attribute`
 are explained in the htseq documentation.
+
+Members of QBiC can download the data for analysis using `qpostman`: https://github.com/qbicsoftware/postman-cli
+
+It should be installed on the computing stations and can be loaded with:
+
+```
+module load qbic/qpostman/0.1.2.3
+```
+
+To download the data navigate to the data folder and either provide a QBiC ID
+```
+java -jar qpostman.jar -i <QBiC ID> -u <your_qbic_username>
+```
+
+or a file containing the project QBiC IDs:
+
+```
+postman-0.1.2.3 -f sample_IDs.csv -u user-name
+```
+
+If you're not using `qpostman` just put the relevant files in the data folder (formats supported: `.fastq`, `.fastq.gz`).
+
+To run the workflow navigate to the `src` folder.
+Using `snakemake -n` one can display the operations the workflow will perform.
+Using the `--dag` parameter and piping it to `dot` one can create a .pdf version of the directed acyclic graph used by snakemake to inspect the behavious of the workflow on a local machine.
+
+```
+module load qbic/anaconda
+cd src/
+snakemake -n
+snakemake --dag | dot -Tpdf > dag.pdf
+```
+
+To run the workflow:
+
+```
+qproject run -t ..
+```
+
+While running one can inspect the log files (e.g. in a different screen session) for the progress and errors generated by the workflow:
+
+```
+cd logs/
+tail snake.err -f
+```
+
+And to check the jobs on the computing cluster one can use `qstat`.
+
+Alternatively to using `qproject run` one could use `snakemake -j` to run the workflow, but then be sure to check the `jobscript.sh` to load the required modules manually and also note that this would also not use `qsub` to submit the jobs.
diff --git a/Snakefile b/Snakefile
@@ -101,6 +101,7 @@ OUTPUT_FILES.extend(expand("TopHat2/{name}/accepted_hits.bai", name=INPUT_FILES,
 OUTPUT_FILES.extend(expand("Summary/MappingStats/{name}.txt", name=INPUT_FILES, result=RESULT))
 #OUTPUT_FILES.append("checksums.ok")
 OUTPUT_FILES.append(result('all_counts.csv'))
+OUTPUT_FILES.append("Summary/software_versions.txt")
 
 rule all:
     input: OUTPUT_FILES
@@ -208,8 +209,16 @@ rule MergeAdapters:
     output: "MergeAdapters/merged.fasta"
     shell: "cat {input} > {output}"
 
+rule subset_Adapters:
+    input: "MergeAdapters/merged.fasta",
+    output: "MergeAdapters/merged.subset.fasta"
+    shell:
+        """
+       awk '/^>/ {{P=index($0,"No Hit")==0}} {{if(P) print}} ' {input} > {output}
+        """
+
 rule CutAdapt:
-    input: "MergeAdapters/merged.fasta", "PreFilterReads/{name}.fastq"
+    input: "MergeAdapters/merged.subset.fasta", "PreFilterReads/{name}.fastq"
     output: "CutAdaptMerge/{name}.fastq"
     run:
         with open(str(input[0])) as f:
@@ -285,3 +294,21 @@ rule NumreadsOrig:
     input: "fastq/{name}.fastq"
     output: "Summary/NumReads/Original/{name}.txt"
     shell: '''dc -e "$(wc -l {input} | cut -f1 -d' ') 4 / p" > {output}'''
+
+"""
+Rule to get software versions of used programs in workflow. Rule either calls program with --version flag if possible, or runs
+it without parameters displaying output row containing version information with unix tail command.
+It then redirects it to Summary/software_versions.txt 
+"""
+
+rule SoftwareVersions:
+    input: result("all_counts.csv")
+    output: "Summary/software_versions.txt"
+    run:
+        shell("anaconda --version > Summary/software_versions.txt")
+        shell("conda --version >> Summary/software_versions.txt")
+        shell("fastqc --version >> Summary/software_versions.txt")
+        shell("htseq-count -h | tail -1 >> Summary/software_versions.txt")
+        shell("bowtie2 --version | head -1 >> Summary/software_versions.txt")
+        shell("tophat2 --version >> Summary/software_versions.txt")
+        shell("samtools --version | head -2 >> Summary/software_versions.txt")
diff --git a/jobscript.sh b/jobscript.sh
@@ -12,8 +12,9 @@ set -e
 module load qbic/fastqc/0.11.4
 module load qbic/anaconda
 module load qbic/htseq/0.6.1p2
+module load qbic/bowtie2/2.2.3
 module load qbic/tophat
-module load bio/samtools/1.2
+module load qbic/samtools
 
 {exec_job}
 exit 0