diff --git a/README.md b/README.md index dce21d0..ad2da05 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,14 @@ -rna seq workflow for use with qproject. +RNA-Seq workflow for single-end data. The workflow can be downloaded and run on a cluster environment using the tool qproject provided on github: https://github.com/qbicsoftware/qproject -Add a config file "params.json" in `etc`: +The workflow uses a module system to load the required software. Be sure to check the `jobscript.sh` file to see which software modules are required. The modules are loaded automatically when using `qproject run` to start the workflow, otherwise they have to be loaded manually. + +One should use qproject to download the files. This also creates all folders necessary for the workflow. + +``` +qproject create -t . -w github:qbicsoftware/rnaseq +``` + +Be sure to add a config file "params.json" in `etc` which should look like this: ```json { @@ -17,6 +25,56 @@ Add a config file "params.json" in `etc`: where `indexed_genome` and `gtf` are paths relative to `ref`. `indexed_genome` is the basename of a bowtie2 index. +`gtf` is the .gtf file of the reference genome The parameters `stranded`, `overlap_mode`, `feature_type` and `gff_attribute` are explained in the htseq documentation. + +Members of QBiC can download the data for analysis using `qpostman`: https://github.com/qbicsoftware/postman-cli + +It should be installed on the computing stations and can be loaded with: + +``` +module load qbic/qpostman/0.1.2.3 +``` + +To download the data navigate to the data folder and either provide a QBiC ID +``` +java -jar qpostman.jar -i -u +``` + +or a file containing the project QBiC IDs: + +``` +postman-0.1.2.3 -f sample_IDs.csv -u user-name +``` + +If you're not using `qpostman` just put the relevant files in the data folder (formats supported: `.fastq`, `.fastq.gz`). + +To run the workflow navigate to the `src` folder. +Using `snakemake -n` one can display the operations the workflow will perform. +Using the `--dag` parameter and piping it to `dot` one can create a .pdf version of the directed acyclic graph used by snakemake to inspect the behavious of the workflow on a local machine. + +``` +module load qbic/anaconda +cd src/ +snakemake -n +snakemake --dag | dot -Tpdf > dag.pdf +``` + +To run the workflow: + +``` +qproject run -t .. +``` + +While running one can inspect the log files (e.g. in a different screen session) for the progress and errors generated by the workflow: + +``` +cd logs/ +tail snake.err -f +``` + +And to check the jobs on the computing cluster one can use `qstat`. + +Alternatively to using `qproject run` one could use `snakemake -j` to run the workflow, but then be sure to check the `jobscript.sh` to load the required modules manually and also note that this would also not use `qsub` to submit the jobs. diff --git a/Snakefile b/Snakefile index 231b274..623a3ff 100644 --- a/Snakefile +++ b/Snakefile @@ -101,6 +101,7 @@ OUTPUT_FILES.extend(expand("TopHat2/{name}/accepted_hits.bai", name=INPUT_FILES, OUTPUT_FILES.extend(expand("Summary/MappingStats/{name}.txt", name=INPUT_FILES, result=RESULT)) #OUTPUT_FILES.append("checksums.ok") OUTPUT_FILES.append(result('all_counts.csv')) +OUTPUT_FILES.append("Summary/software_versions.txt") rule all: input: OUTPUT_FILES @@ -208,8 +209,16 @@ rule MergeAdapters: output: "MergeAdapters/merged.fasta" shell: "cat {input} > {output}" +rule subset_Adapters: + input: "MergeAdapters/merged.fasta", + output: "MergeAdapters/merged.subset.fasta" + shell: + """ + awk '/^>/ {{P=index($0,"No Hit")==0}} {{if(P) print}} ' {input} > {output} + """ + rule CutAdapt: - input: "MergeAdapters/merged.fasta", "PreFilterReads/{name}.fastq" + input: "MergeAdapters/merged.subset.fasta", "PreFilterReads/{name}.fastq" output: "CutAdaptMerge/{name}.fastq" run: with open(str(input[0])) as f: @@ -285,3 +294,21 @@ rule NumreadsOrig: input: "fastq/{name}.fastq" output: "Summary/NumReads/Original/{name}.txt" shell: '''dc -e "$(wc -l {input} | cut -f1 -d' ') 4 / p" > {output}''' + +""" +Rule to get software versions of used programs in workflow. Rule either calls program with --version flag if possible, or runs +it without parameters displaying output row containing version information with unix tail command. +It then redirects it to Summary/software_versions.txt +""" + +rule SoftwareVersions: + input: result("all_counts.csv") + output: "Summary/software_versions.txt" + run: + shell("anaconda --version > Summary/software_versions.txt") + shell("conda --version >> Summary/software_versions.txt") + shell("fastqc --version >> Summary/software_versions.txt") + shell("htseq-count -h | tail -1 >> Summary/software_versions.txt") + shell("bowtie2 --version | head -1 >> Summary/software_versions.txt") + shell("tophat2 --version >> Summary/software_versions.txt") + shell("samtools --version | head -2 >> Summary/software_versions.txt") diff --git a/jobscript.sh b/jobscript.sh old mode 100755 new mode 100644 index 4615b6f..8a4b49c --- a/jobscript.sh +++ b/jobscript.sh @@ -12,8 +12,9 @@ set -e module load qbic/fastqc/0.11.4 module load qbic/anaconda module load qbic/htseq/0.6.1p2 +module load qbic/bowtie2/2.2.3 module load qbic/tophat -module load bio/samtools/1.2 +module load qbic/samtools {exec_job} exit 0