diff --git a/README.rst b/README.rst index bb09a6a..4f4f9c0 100644 --- a/README.rst +++ b/README.rst @@ -158,6 +158,7 @@ Changelog ========= ==================================================================== Version Description ========= ==================================================================== +0.18.0 * New plots in the HTML reports. Includes version of executables. 0.17.2 * CHANGES: in star section, added --limitBAMsortRAM and set to 30G * BUG: Fix missing params (options) in star_mapping rule not taken into account diff --git a/environment.yml b/environment.yml index 4095a46..36a4dfa 100644 --- a/environment.yml +++ b/environment.yml @@ -7,6 +7,8 @@ channels: - r dependencies: +- cutadapt +- atropos - bowtie - samtools>1.7 - bamtools diff --git a/requirements.txt b/requirements.txt index 1ab2e2b..2317f78 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,4 @@ sequana>=0.14.2 sequana_pipetools>=0.9.2 +cutadapt +atropos diff --git a/sequana_pipelines/rnaseq/config.yaml b/sequana_pipelines/rnaseq/config.yaml index 16bc9b1..b092cef 100644 --- a/sequana_pipelines/rnaseq/config.yaml +++ b/sequana_pipelines/rnaseq/config.yaml @@ -7,6 +7,8 @@ # If input_directory provided, use it otherwise if input_pattern provided, # use it, otherwise use input_samples. # ============================================================================ +sequana_wrappers: "v0.15.1" + input_directory: input_readtag: _R[12]_ input_pattern: '*fastq.gz' @@ -45,14 +47,6 @@ general: custom_gff: '' -###################### -# if files are required for a pipeline and are within sequana or should -# be downloaded before the pipeline provide them in this section -# Note that sequana and url fields are followed by itemised files or links -# using the front dashes -requirements: '' - - ################################################################# # FastQC section # diff --git a/sequana_pipelines/rnaseq/main.py b/sequana_pipelines/rnaseq/main.py index 6653d68..e8d8b11 100755 --- a/sequana_pipelines/rnaseq/main.py +++ b/sequana_pipelines/rnaseq/main.py @@ -69,7 +69,13 @@ def __init__(self, prog=NAME, epilog=None): "--rRNA-feature", default="rRNA", help="""Feature name corresponding to the rRNA to be identified in -the input GFF/GTF files""", +the input GFF/GTF files. Must exist and be valid. If you do not have any, +you may skip this step using --skip-rRNA or provide a fasta file using --contaminant-file""", + ) + pipeline_group.add_argument( + "--skip-rRNA", + action="store_true", + help="""skip the mapping on rRNA feature. ignored if --contaminant-file is provided""", ) pipeline_group.add_argument( "--contaminant-file", @@ -208,6 +214,8 @@ def main(args=None): "You are using a custom FASTA --contaminant_file so --rRNA-feature will be ignored" ) cfg.general.rRNA_feature = None + elif options.skip_rRNA: + cfg.general.rRNA_feature = None else: cfg.general.rRNA_feature = options.rRNA_feature diff --git a/sequana_pipelines/rnaseq/rnaseq.rules b/sequana_pipelines/rnaseq/rnaseq.rules index a3584a1..25c9042 100644 --- a/sequana_pipelines/rnaseq/rnaseq.rules +++ b/sequana_pipelines/rnaseq/rnaseq.rules @@ -17,8 +17,6 @@ import sequana from sequana_pipetools import snaketools as sm import sequana.featurecounts as fc -sequana_wrapper_branch="main" - # ========================================================= The main config file # configfile: "config.yaml" @@ -39,7 +37,6 @@ if manager.config['general']['aligner'] == 'salmon': input: "multiqc/multiqc_report.html", ".sequana/rulegraph.svg", - #"post_analysis/all_features.out", "post_analysis/rnadiff.sh" else: manager.globals['strand_summary'] = "outputs/strand_summary.csv" @@ -113,7 +110,6 @@ if manager.config.general.contaminant_file: samtools faidx {input[0]} """ - elif manager.config.general.rRNA_feature: # extract the rRNA feature from the GFF file. Build the corresponding FastA # file. if not found, a dummy FastA file with AAAAAAAAAAAAAA is built @@ -148,7 +144,6 @@ elif manager.config.general.rRNA_feature: samtools faidx {output.fasta} """ - # ========================================================= Indexing for rRNA and contmination # # redo the indexing whatsover since it is pretty fast @@ -169,7 +164,7 @@ if manager.config.general.contaminant_file or manager.config.general.rRNA_featur container: config['apptainers']['sequana_tools'] wrapper: - f"{sequana_wrapper_branch}/wrappers/bowtie1/build" + f"{manager.wrappers}/wrappers/bowtie1/build" # ============================================================================ bowtie2 index @@ -222,7 +217,7 @@ if manager.config.general.aligner == "bowtie2": resources: **config['bowtie2_index']['resources'] wrapper: - f"{sequana_wrapper_branch}/wrappers/bowtie2/build" + f"{manager.wrappers}/wrappers/bowtie2/build" # ============================================================================ star index @@ -260,7 +255,7 @@ elif manager.config.general.aligner == "star": resources: **config['star_index']['resources'] wrapper: - f"{sequana_wrapper_branch}/wrappers/star/index" + f"{manager.wrappers}/wrappers/star/index" # ========================================================================== salmon @@ -275,9 +270,7 @@ elif manager.config.general.aligner == "salmon": logger.warning(f"Could not determine salmon version. Index will be stored in {genome_directory}/salmon/") salmon_version = "" - __salmon_index__output_done = genome_directory + f"/salmon{salmon_version}/salmon.done" - - if os.path.exists(__salmon_index__output_done): + if os.path.exists(genome_directory + f"/salmon{salmon_version}/salmon.done"): pass # index exists, no need to do it, everything should be fine else: rule salmon_index: @@ -285,7 +278,7 @@ elif manager.config.general.aligner == "salmon": fasta=__fasta_file__, gff=__gff_file__ output: - done= __salmon_index__output_done + done=genome_directory + f"/salmon{salmon_version}/salmon.done" threads: config['salmon_index']['threads'] resources: @@ -297,7 +290,7 @@ elif manager.config.general.aligner == "salmon": log: "logs/salmon_indexing.log" wrapper: - f"{sequana_wrapper_branch}/wrappers/salmon/index" + f"{manager.wrappers}/wrappers/salmon/index" @@ -318,7 +311,7 @@ if not manager.config['fastqc']['skip_fastqc_raw']: log: "{sample}/fastqc_raw/fastqc.log" wrapper: - f"{sequana_wrapper_branch}/wrappers/fastqc" + f"{manager.wrappers}/wrappers/fastqc" expected_output.extend(expand("{sample}/fastqc_raw/fastqc.done", sample=manager.samples)) @@ -340,10 +333,10 @@ elif manager.config.trimming.software_choice in ["cutadapt", "atropos"]: if adapter_tool in ["cutadapt", "atropos"]: adapter_tool = "cutadapt" __cutadapt__input_fastq = manager.getrawdata() - __cutadapt__wkdir = manager.getwkdir("cutadapt") - __cutadapt__output = [manager.getname("cutadapt", "_R1_.clean.fastq.gz")] + __cutadapt__wkdir = "{sample}/cutadapt" + __cutadapt__output = ["{sample}/cutadapt/{sample}_R1_.clean.fastq.gz"] if manager.paired: - __cutadapt__output += [manager.getname("cutadapt", "_R2_.clean.fastq.gz")] + __cutadapt__output += ["{sample}/cutadapt/{sample}_R2_.clean.fastq.gz"] # Set the fwd and rev adapters __cutadapt__fwd = manager.config.cutadapt.fwd @@ -389,7 +382,7 @@ elif manager.config.trimming.software_choice == "fastp": container: config['apptainers']['fastp'] wrapper: - f"{sequana_wrapper_branch}/wrappers/fastp" + f"{manager.wrappers}/wrappers/fastp" # ===================================================== FASTQC fastp results @@ -410,21 +403,21 @@ rule fastqc_clean: container: config['apptainers']['fastqc'] wrapper: - f"{sequana_wrapper_branch}/wrappers/fastqc" + f"{manager.wrappers}/wrappers/fastqc" expected_output.extend(expand("{sample}/fastqc_clean/fastqc.done", sample=manager.samples)) # ================================= Decompress fastq.gz file before running bowtie1 # -if manager.config.trimming.software_choice == 'cutadapt': - __unpigz_R1__input = manager.getname("cutadapt", "_R1_.clean.fastq.gz") - __unpigz_R1__output = manager.getname("cutadapt", "_R1_.clean.fastq") -elif manager.config.trimming.software_choice == 'fastp': +if manager.config.trimming.software_choice == 'cutadapt' and manager.config.trimming.do: + #__unpigz_R1__input = manager.getname("cutadapt", "_R1_.clean.fastq.gz") + __unpigz_R1__input = "{sample}/cutadapt/{sample}_R1_.clean.fastq.gz" +elif manager.config.trimming.software_choice == 'fastp' and manager.config.trimming.do: __unpigz_R1__input = "{sample}/fastp/{sample}_R1_.fastp.fastq.gz" - __unpigz_R1__output = "{sample}/fastp/{sample}_R1_.fastp.fastq" +elif manager.config.trimming.software_choice == 'atropos' and manager.config.trimming.do: + __unpigz_R1__input = "{sample}/atropos/{sample}_R1_.clean.fastq.gz" else: __unpigz_R1__input = manager.getrawdata() - __unpigz_R1__output = manager.getname("cutadapt", "_R1_.fastq") # ========== decompress and sanity check @@ -438,7 +431,7 @@ rule sample_rRNA: input: __unpigz_R1__input output: - temp(__unpigz_R1__output) + fastq=temp("{sample}/data_for_bowtie1/{sample}_R1_.fastq") threads: 4 params: nreads = int(config['bowtie1_mapping_rna']['nreads']) @@ -469,8 +462,7 @@ if manager.config.general.rRNA_feature or manager.config.general.contaminant_fil rule bowtie1_mapping_rna: input: - fastq= __unpigz_R1__output, - #index = f"{__prefix_name__}_{config.general.rRNA_feature}.1.ebwt" + fastq= rules.sample_rRNA.output.fastq, index=bowtie1_index_conta__output output: bam = "{sample}/bowtie1_mapping_rna/{sample}_rRNA.bam", @@ -479,15 +471,12 @@ if manager.config.general.rRNA_feature or manager.config.general.contaminant_fil "{sample}/bowtie1_mapping_rna/{sample}_bowtie1.log" params: options="" - threads: + threads: config['bowtie1_mapping_rna']['threads'] container: config['apptainers']['sequana_tools'] wrapper: - f"{sequana_wrapper_branch}/wrappers/bowtie1/align" - - #expected_output.extend(expand("{sample}/bowtie1_mapping_rna/{sample}_rRNA.sorted.bam", - # sample=manager.samples)) + f"{manager.wrappers}/wrappers/bowtie1/align" rule fix_bowtie1_log: input: @@ -542,7 +531,7 @@ if manager.config.general.aligner == "bowtie2": resources: **config['bowtie2_mapping']['resources'] wrapper: - f"{sequana_wrapper_branch}/wrappers/bowtie2/align" + f"{manager.wrappers}/wrappers/bowtie2/align" __mapping_output = "{sample}/bowtie2/{sample}.sorted.bam" @@ -555,7 +544,7 @@ elif manager.config.general.aligner == "star": input: fastq= __clean_fastq__output, reference=__fasta_file__, - index= __star_index__done, + index= __star_index__done output: bam = "{sample}/star_mapping/{sample}_Aligned.sortedByCoord.out.bam" params: @@ -574,7 +563,7 @@ elif manager.config.general.aligner == "star": resources: **config['star_mapping']['resources'] wrapper: - f"{sequana_wrapper_branch}/wrappers/star/align" + f"{manager.wrappers}/wrappers/star/align" expected_output.extend( @@ -595,7 +584,7 @@ elif manager.config.general.aligner == "salmon": rule salmon_mapping: input: fastq=__clean_fastq__output, - index=f"{genome_directory}/salmon{salmon_version}/salmon.done" + index=genome_directory + f"/salmon{salmon_version}/salmon.done" output: quant="{sample}/salmon_mapping/{sample}_quant.sf" params: @@ -634,7 +623,7 @@ if manager.config.general.aligner not in ['salmon']: container: config['apptainers']['sequana_tools'] wrapper: - f"{sequana_wrapper_branch}/wrappers/add_read_group" + f"{manager.wrappers}/wrappers/add_read_group" @@ -661,7 +650,7 @@ if config["mark_duplicates"]["do"]: resources: **config['mark_duplicates']['resources'] wrapper: - f"{sequana_wrapper_branch}/wrappers/mark_duplicates" + f"{manager.wrappers}/wrappers/mark_duplicates" __final_bam__ = "{sample}/mark_duplicates/{sample}.sorted.markdup.bam" elif manager.config.general.aligner not in ['salmon']: __final_bam__ = "{sample}/add_read_group/{sample}.sorted.bam" @@ -687,7 +676,7 @@ if manager.config.bam_coverage.do is True and config['general']['aligner'] not i resources: **config['bam_coverage']['resources'] wrapper: - f"{sequana_wrapper_branch}/wrappers/deeptools/bam_coverage" + f"{manager.wrappers}/wrappers/deeptools/bam_coverage" expected_output.extend( expand( @@ -775,7 +764,7 @@ if manager.config.feature_counts.do and manager.config.general.aligner not in [' log: "{sample}/feature_counts/{strand}/feature_counts.log" wrapper: - f"{sequana_wrapper_branch}/wrappers/feature_counts" + f"{manager.wrappers}/wrappers/feature_counts" # ===================== guessing the strand @@ -994,7 +983,7 @@ rule multiqc: log: "multiqc/multiqc.log" wrapper: - f"{sequana_wrapper_branch}/wrappers/multiqc" + f"{manager.wrappers}/wrappers/multiqc" # ========================================================== rulegraph @@ -1007,7 +996,7 @@ rule rulegraph: configname="config.yaml", mapper = {"multiqc": "../multiqc/multiqc_report.html"}, wrapper: - f"{sequana_wrapper_branch}/wrappers/rulegraph" + f"{manager.wrappers}/wrappers/rulegraph" rule dot2svg: @@ -1109,17 +1098,21 @@ onsuccess: The RNA-seq pipeline maps the reads on the provided reference (called {__fasta_file__.split("/")[-1]}). Features counts were extracted and are available in the feature counts directory; those files are entry points for differential gene expression analysis. The differential analysis, if performed, should be available in the DGE analysis directory. In addition, if enrichment was performed (GO or Kegg pathways), it should be available in the directory as well.

-

A multiqc report is available, where various QC and mapping quality plots can be visualised. +

A multiqc report is available, where various QC and mapping quality plots can be visualised. Some important plots are also in the HTML page here below.

""" + rRNA_done = True try: - intro += """

ribosomal / contaminant content

""" if os.path.exists("multiqc/multiqc_report_data/multiqc_bowtie1.txt"): df = pd.read_csv("multiqc/multiqc_report_data/multiqc_bowtie1.txt", sep='\t') - else: + elif os.path.exists("multiqc/multiqc_data/multiqc_bowtie1.txt"): df = pd.read_csv("multiqc/multiqc_data/multiqc_bowtie1.txt", sep='\t') + else: + rRNA_done = False + raise Exception + intro += """

ribosomal / contaminant content

""" if "reads_aligned_percentage" in df.columns: rRNA = int(df.reads_aligned_percentage.mean()*100)/100 else: @@ -1141,18 +1134,53 @@ onsuccess: try: from sequana.multiqc.plots import Bowtie1Reader - filename = "multiqc/multiqc_data/multiqc_bowtie1.txt" - if not os.path.exists(filename): - filename = "multiqc/multiqc_report_data/multiqc_bowtie1.txt" - br = Bowtie1Reader(filename) - br.df.Sample = [str(x).replace("_bowtie1","") for x in br.df.Sample] - fig = br.plot_bar(html_code=True) - from plotly import offline - intro += offline.plot(fig, output_type="div", include_plotlyjs=True) + if rRNA_done: + filename = "multiqc/multiqc_data/multiqc_bowtie1.txt" + if not os.path.exists(filename): + filename = "multiqc/multiqc_report_data/multiqc_bowtie1.txt" + br = Bowtie1Reader(filename) + br.df.Sample = [str(x).replace("_bowtie1","") for x in br.df.Sample] + fig = br.plot_bar(html_code=True) + from plotly import offline + intro += offline.plot(fig, output_type="div", include_plotlyjs=True) except Exception as err: print(err) + + + # Include the bowtie plot + intro += """

Mapping rate

""" + try: + from sequana.multiqc.plots import Bowtie2 + filename = "multiqc/multiqc_data/mqc_bowtie2_se_plot_1.txt" + if not os.path.exists(filename): + filename = "multiqc/multiqc_report_data/mqc_bowtie2_se_plot_1.txt" + br = Bowtie2(filename) + fig = br.plot(html_code=True) + from plotly import offline + intro += """

The mapping was performed with bowtie2. Here below are the percentage of mapping for each sample. See also the multiqc report.

""" + offline.plot(fig, output_type="div", include_plotlyjs=True) + except Exception: pass + try: + from sequana.multiqc.plots import STAR + filename = "multiqc/multiqc_report_data/multiqc_star.txt" + br = STAR(filename) + fig = br.plot(html_code=True) + from plotly import offline + intro += """

The mapping was performed with STAR. Here below are the percentage of mapping for each sample. See also the multiqc report.

""" + offline.plot(fig, output_type="div", include_plotlyjs=True) + except Exception : + pass + + try: + from sequana.multiqc.plots import FeatureCounts + intro += """

Annotation rate

""" + filename = "multiqc/multiqc_report_data/mqc_featureCounts_assignment_plot_1.txt" + br = FeatureCounts(filename) + fig = br.plot(html_code=True) + from plotly import offline + intro += """

The annotation was performed with subread/feature counts software. Here below is the percentage of reads assigned to the requested feature (usually gene; see the config file here below).

""" + offline.plot(fig, output_type="div", include_plotlyjs=True) + except Exception : + pass if manager.globals['strand_summary']: intro += """

Strandness

""" @@ -1189,9 +1217,6 @@ onsuccess: pass s = SummaryModule2(data, intro) - #s.create_report_content(workflow=True) - #s.sections.append({"name": "Info", "anchor":"command", "content": command}) - #s.create_html("summary.html") shell("chmod -R g+w .") shell("rm -rf rulegraph") diff --git a/sequana_pipelines/rnaseq/schema.yaml b/sequana_pipelines/rnaseq/schema.yaml index 95d4449..28b74ae 100644 --- a/sequana_pipelines/rnaseq/schema.yaml +++ b/sequana_pipelines/rnaseq/schema.yaml @@ -3,6 +3,8 @@ type: map mapping: + "sequana_wrappers": + type: str "input_directory": type: str required: True diff --git a/setup.py b/setup.py index 637f438..7aba164 100644 --- a/setup.py +++ b/setup.py @@ -4,8 +4,8 @@ import subprocess _MAJOR = 0 -_MINOR = 17 -_MICRO = 2 +_MINOR = 18 +_MICRO = 0 version = "%d.%d.%d" % (_MAJOR, _MINOR, _MICRO) release = "%d.%d" % (_MAJOR, _MINOR)