From b0b37ebf19d05cce664113b6e11fab52dc1fb9c9 Mon Sep 17 00:00:00 2001
From: Sofia Stamouli
Date: Thu, 10 Oct 2024 17:29:18 +0200
Subject: [PATCH 01/25] Add chaining samplesheet with mag

---
 conf/test.config                              |  2 +
 conf/test_nothing.config                      |  1 +
 nextflow.config                               |  4 ++
 .../generate_downstream_samplesheets/main.nf  | 45 +++++++++++++++++++
 workflows/taxprofiler.nf                      |  9 ++++
 5 files changed, 61 insertions(+)
 create mode 100644 subworkflows/local/generate_downstream_samplesheets/main.nf

diff --git a/conf/test.config b/conf/test.config
index 4e457585..820073d9 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -48,6 +48,8 @@ params {
     kraken2_save_reads               = true
     centrifuge_save_reads            = true
     run_profile_standardisation     = true
+    generate_downstream_samplesheets = true
+    generate_downstream_samplesheets = "mag"
 }

 process {
diff --git a/conf/test_nothing.config b/conf/test_nothing.config
index e8b87bc7..647b6ae8 100644
--- a/conf/test_nothing.config
+++ b/conf/test_nothing.config
@@ -41,6 +41,7 @@ params {
     run_motus = false
     run_kmcp  = false
     run_ganon = false
+    generate_downstream_samplesheets = false
 }

 process {
diff --git a/nextflow.config b/nextflow.config
index e8dd7723..ecdd6a48 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -196,6 +196,10 @@ params {
     taxpasta_add_ranklineage           = false
     taxpasta_ignore_errors             = false
     standardisation_motus_generatebiom = false
+
+    // Generate downstream samplesheets
+    generate_downstream_samplesheets = false
+    downstream_pipeline              = 'mag'
 }

 // Load base.config by default for all pipelines
diff --git a/subworkflows/local/generate_downstream_samplesheets/main.nf b/subworkflows/local/generate_downstream_samplesheets/main.nf
new file mode 100644
index 00000000..4311e36d
--- /dev/null
+++ b/subworkflows/local/generate_downstream_samplesheets/main.nf
@@ -0,0 +1,45 @@
+//
+// Subworkflow with functionality specific to the nf-core/taxprofiler pipeline
+//
+
+workflow GENERATE_DOWNSTREAM_SAMPLESHEETS {
+    take:
+    ch_processed_reads
+
+    main:
+    format = 'csv' // most common format in nf-core
+    format_sep = ','
+
+    if ( params.downstream_pipeline == 'mag' && params.save_analysis_ready_reads ) {
+        def fastq_rel_path = '/'
+        format = 'csv'
+        format_sep = ','
+        ch_list_for_samplesheet = ch_processed_reads.view()
+            .filter { meta, sample_id, instrument_platform,fastq_1,fastq_2,fasta -> meta.single_end == false && meta.is_fasta == true }
+
+            //Filter out the fasta files and the single-end reads
+            .map {
+                meta, sample_id, instrument_platform,fastq_1,fastq_2,fasta ->
+                def sample = meta.id
+                def run = meta.run_accession //this should be optional
+                def group = ""
+                def short_reads_1 = file(params.outdir).toString() + '/' + meta.id + '/' + fastq_1.getName()
+                def short_reads_2 = file(params.outdir).toString() + '/' + meta.id + '/' + fastq_2.getName()
+                def long_reads = ""
+                [sample: sample, run: run, group: group, short_reads_1: short_reads_1, short_reads_2: short_reads_2, long_reads: long_reads]
+            }
+            .tap{ ch_header }
+    }
+
+
+    ch_header
+        .first()
+        .map{ it.keySet().join(format_sep) }
+        .concat( ch_list_for_samplesheet.map{ it.values().join(format_sep) })
+        .collectFile(
+            name:"${params.outdir}/downstream_samplesheet/${params.downstream_pipeline}.${format}",
+            newLine: true,
+            sort: false
+        )
+
+}
diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf
index 93eb55dd..f8a735f5 100644
--- a/workflows/taxprofiler.nf
+++ b/workflows/taxprofiler.nf
@@ -10,6 +10,7 @@
 include { paramsSummaryMap       } from 'plugin/nf-validation'
 include { paramsSummaryMultiqc   } from '../subworkflows/nf-core/utils_nfcore_pipeline'
 include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline'
 include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_taxprofiler_pipeline'
+include { GENERATE_DOWNSTREAM_SAMPLESHEETS } from '../subworkflows/local/generate_downstream_samplesheets/main.nf'
 include { validateParameters; paramsHelp; paramsSummaryLog; fromSamplesheet } from 'plugin/nf-validation'

 // Check input path parameters to see if they exist
@@ -328,6 +329,14 @@ workflow TAXPROFILER {
         MODULE: MultiQC
     */

+
+    //
+    // Samplesheet generation
+    //
+    if ( params.generate_downstream_samplesheets ) {
+        GENERATE_DOWNSTREAM_SAMPLESHEETS ( samplesheet )
+    }
+
     //
     // Collate and save software versions
     //

From 590f9b10931b85775b106ceec897e0ee52da0ab7 Mon Sep 17 00:00:00 2001
From: Sofia Stamouli
Date: Mon, 14 Oct 2024 13:08:45 +0200
Subject: [PATCH 02/25] Filter out correctly

---
 subworkflows/local/generate_downstream_samplesheets/main.nf | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/subworkflows/local/generate_downstream_samplesheets/main.nf b/subworkflows/local/generate_downstream_samplesheets/main.nf
index 4311e36d..a0ec4810 100644
--- a/subworkflows/local/generate_downstream_samplesheets/main.nf
+++ b/subworkflows/local/generate_downstream_samplesheets/main.nf
@@ -15,9 +15,8 @@ workflow GENERATE_DOWNSTREAM_SAMPLESHEETS {
         format = 'csv'
         format_sep = ','
         ch_list_for_samplesheet = ch_processed_reads.view()
-            .filter { meta, sample_id, instrument_platform,fastq_1,fastq_2,fasta -> meta.single_end == false && meta.is_fasta == true }
-
-            //Filter out the fasta files and the single-end reads
+            //Filter out the fasta files and the single-end reads
+            .filter { meta, sample_id, instrument_platform,fastq_1,fastq_2,fasta -> (fastq_1 && fastq_2) && !fasta }
             .map {
                 meta, sample_id, instrument_platform,fastq_1,fastq_2,fasta ->
                 def sample = meta.id

From 738837e074c459f101ccf79593037f8137167a7f Mon Sep 17 00:00:00 2001
From: Sofia Stamouli
Date: Mon, 14 Oct 2024 13:31:25 +0200
Subject: [PATCH 03/25] Rename parameters

---
 conf/test.config                              |  4 ++-
 conf/test_nothing.config                      |  3 +++
 docs/output.md                                | 25 +++++++++++++++++++
 nextflow.config                               |  6 +++--
 nextflow_schema.json                          | 15 ++++++++++-
 .../generate_downstream_samplesheets/main.nf  |  4 +--
 6 files changed, 51 insertions(+), 6 deletions(-)

diff --git a/conf/test.config b/conf/test.config
index 820073d9..6f93aa41 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -48,8 +48,10 @@ params {
     kraken2_save_reads               = true
     centrifuge_save_reads            = true
     run_profile_standardisation     = true
+
+    // Generate downstream samplesheets
     generate_downstream_samplesheets = true
-    generate_downstream_samplesheets = "mag"
+    generate_pipeline_samplesheets   = "mag"
 }

 process {
diff --git a/conf/test_nothing.config b/conf/test_nothing.config
index 647b6ae8..257f8984 100644
--- a/conf/test_nothing.config
+++ b/conf/test_nothing.config
@@ -41,7 +41,10 @@ params {
     run_motus = false
     run_kmcp  = false
     run_ganon = false
+
+    // Generate downstream samplesheets
     generate_downstream_samplesheets = false
+    generate_pipeline_samplesheets   = "mag"
 }

 process {
diff --git a/docs/output.md b/docs/output.md
index 7cf96395..4b34d18a 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -683,6 +683,10 @@ The following report files are used for the taxpasta step:
 Please aware the outputs of each tool's standardised profile _may not_ be directly comparable between each tool.
 Some may report raw read counts, whereas others may report abundance information. Please always refer to the list above, for which information is used for each tool.
 :::
+
+The pipeline can also generate downstream pipeline input samplesheets.
+These are stored in `<outdir>/downstream_samplesheets`.
+
 ### MultiQC

@@ -744,3 +748,24 @@ For example, DIAMOND output does not have a dedicated section in the MultiQC HTM
 [Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage.
+
+
+### Downstream samplesheets
+
+The pipeline can also generate input files for the following downstream
+pipelines:
+
+- [nf-core/mag](https://nf-co.re/mag)
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `downstream_samplesheets/`
+  - `mag.csv`: Only for paired-end reads
+
+</details>
+
+:::warning
+Any generated downstream samplesheet is provided as 'best effort' and are not guaranteed to work straight out of the box!
+They may not be complete (e.g. some columns may need to be manually filled in).
+:::
diff --git a/nextflow.config b/nextflow.config
index ecdd6a48..67dd6912 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -198,8 +198,10 @@ params {
     standardisation_motus_generatebiom = false

     // Generate downstream samplesheets
-    generate_downstream_samplesheets = false
-    downstream_pipeline              = 'mag'
+
+    // Generate downstream samplesheets
+    generate_downstream_samplesheets = false
+    generate_pipeline_samplesheets   = "mag"
 }

 // Load base.config by default for all pipelines
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 3ada1a56..e14fa5e3 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -984,5 +984,18 @@
         {
             "$ref": "#/definitions/reference_genome_options"
         }
-    ]
+    ],
+    "properties": {
+        "generate_downstream_samplesheets": {
+            "type": "boolean",
+            "description": "Turn on generation of samplesheets for downstream pipelines.",
+            "fa_icon": "fas fa-toggle-on"
+        },
+        "generate_pipeline_samplesheets": {
+            "type": "string",
+            "default": "taxprofiler",
+            "description": "Specify which pipeline to generate a samplesheet for.",
+            "fa_icon": "fas fa-toolbox"
+        }
+    }
 }
diff --git a/subworkflows/local/generate_downstream_samplesheets/main.nf b/subworkflows/local/generate_downstream_samplesheets/main.nf
index a0ec4810..05b73db7 100644
--- a/subworkflows/local/generate_downstream_samplesheets/main.nf
+++ b/subworkflows/local/generate_downstream_samplesheets/main.nf
@@ -10,7 +10,7 @@ workflow GENERATE_DOWNSTREAM_SAMPLESHEETS {
     format = 'csv' // most common format in nf-core
     format_sep = ','

-    if ( params.downstream_pipeline == 'mag' && params.save_analysis_ready_reads ) {
+    if ( params.generate_pipeline_samplesheets == 'mag' && params.save_analysis_ready_reads ) {
         def fastq_rel_path = '/'
         format = 'csv'
         format_sep = ','
@@ -36,7 +36,7 @@ workflow GENERATE_DOWNSTREAM_SAMPLESHEETS {
         .map{ it.keySet().join(format_sep) }
         .concat( ch_list_for_samplesheet.map{ it.values().join(format_sep) })
         .collectFile(
-            name:"${params.outdir}/downstream_samplesheet/${params.downstream_pipeline}.${format}",
+            name:"${params.outdir}/downstream_samplesheet/${params.generate_pipeline_samplesheets}.${format}",
             newLine: true,
             sort: false
         )

From 3d1917634c021be37b927f2800e1e9f5a50270f2 Mon Sep 17 00:00:00 2001
From: Sofia Stamouli
Date: Mon, 14 Oct 2024 13:36:35 +0200
Subject: [PATCH 04/25] Fix linting

---
 docs/output.md                                              | 2 --
 .../local/generate_downstream_samplesheets/main.nf          | 8 ++++----
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/docs/output.md b/docs/output.md
index 4b34d18a..fa4c4bac 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -683,7 +683,6 @@ The following report files are used for the taxpasta step:
 Please aware the outputs of each tool's standardised profile _may not_ be directly comparable between each tool.
 Some may report raw read counts, whereas others may report abundance information. Please always refer to the list above, for which information is used for each tool.
 :::
-
 The pipeline can also generate downstream pipeline input samplesheets.
 These are stored in `<outdir>/downstream_samplesheets`.

@@ -749,7 +748,6 @@ For example, DIAMOND output does not have a dedicated section in the MultiQC HTM

 [Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage.
-

 ### Downstream samplesheets
diff --git a/subworkflows/local/generate_downstream_samplesheets/main.nf b/subworkflows/local/generate_downstream_samplesheets/main.nf
index 05b73db7..8ff84007 100644
--- a/subworkflows/local/generate_downstream_samplesheets/main.nf
+++ b/subworkflows/local/generate_downstream_samplesheets/main.nf
@@ -16,7 +16,7 @@ workflow GENERATE_DOWNSTREAM_SAMPLESHEETS {
         format_sep = ','
         ch_list_for_samplesheet = ch_processed_reads.view()
             //Filter out the fasta files and the single-end reads
-            .filter { meta, sample_id, instrument_platform,fastq_1,fastq_2,fasta -> (fastq_1 && fastq_2) && !fasta }
+            .filter { meta, sample_id, instrument_platform,fastq_1,fastq_2,fasta -> (fastq_1 && fastq_2) && !fasta }
             .map {
                 meta, sample_id, instrument_platform,fastq_1,fastq_2,fasta ->
                 def sample = meta.id
@@ -25,9 +25,9 @@ workflow GENERATE_DOWNSTREAM_SAMPLESHEETS {
             def short_reads_1 = file(params.outdir).toString() + '/' + meta.id + '/' + fastq_1.getName()
             def short_reads_2 = file(params.outdir).toString() + '/' + meta.id + '/' + fastq_2.getName()
             def long_reads = ""
-            [sample: sample, run: run, group: group, short_reads_1: short_reads_1, short_reads_2: short_reads_2, long_reads: long_reads]
-        }
-        .tap{ ch_header }
+            [sample: sample, run: run, group: group, short_reads_1: short_reads_1, short_reads_2: short_reads_2, long_reads: long_reads]
+            }
+            .tap { ch_header }
 }

From 80bed68917e6bf0c612cfaf7cfd474fdee5ba9e5 Mon Sep 17 00:00:00 2001
From: Sofia Stamouli
Date: Mon, 14 Oct 2024 13:39:40 +0200
Subject: [PATCH 05/25] Fix linting

---
 subworkflows/local/generate_downstream_samplesheets/main.nf | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/subworkflows/local/generate_downstream_samplesheets/main.nf b/subworkflows/local/generate_downstream_samplesheets/main.nf
index 8ff84007..a3cbf23b 100644
--- a/subworkflows/local/generate_downstream_samplesheets/main.nf
+++ b/subworkflows/local/generate_downstream_samplesheets/main.nf
@@ -14,8 +14,7 @@ workflow GENERATE_DOWNSTREAM_SAMPLESHEETS {
         def fastq_rel_path = '/'
         format = 'csv'
         format_sep = ','
-        ch_list_for_samplesheet = ch_processed_reads.view()
-            //Filter out the fasta files and the single-end reads
+        ch_list_for_samplesheet = ch_processed_reads
             .filter { meta, sample_id, instrument_platform,fastq_1,fastq_2,fasta -> (fastq_1 && fastq_2) && !fasta }
             .map {
                 meta, sample_id, instrument_platform,fastq_1,fastq_2,fasta ->

From 4a9fa8801027b40cee1b1a84687fe3c0707d6719 Mon Sep 17 00:00:00 2001
From: Sofia Stamouli
Date: Mon, 14 Oct 2024 13:45:21 +0200
Subject: [PATCH 06/25] Use same schema as createtaxdb

---
 nextflow_schema.json | 34 ++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/nextflow_schema.json b/nextflow_schema.json
index e14fa5e3..b0c43a22 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -712,6 +712,25 @@
         },
         "fa_icon": "fas fa-chart-line"
     },
+    "generate_samplesheet_options": {
+        "title": "Downstream pipeline samplesheet generation options",
"type": "object", + "fa_icon": "fas fa-align-justify", + "description": "Options for generating input samplesheets for complementary downstream pipelines.", + "properties": { + "generate_downstream_samplesheets": { + "type": "boolean", + "description": "Turn on generation of samplesheets for downstream pipelines.", + "fa_icon": "fas fa-toggle-on" + }, + "generate_pipeline_samplesheets": { + "type": "string", + "default": "taxprofiler", + "description": "Specify which pipeline to generate a samplesheet for.", + "fa_icon": "fas fa-toolbox" + } + } + }, "institutional_config_options": { "title": "Institutional config options", "type": "object", @@ -984,18 +1003,5 @@ { "$ref": "#/definitions/reference_genome_options" } - ], - "properties": { - "generate_downstream_samplesheets": { - "type": "boolean", - "description": "Turn on generation of samplesheets for downstream pipelines.", - "fa_icon": "fas fa-toggle-on" - }, - "generate_pipeline_samplesheets": { - "type": "string", - "default": "taxprofiler", - "description": "Specify which pipeline to generate a samplesheet for.", - "fa_icon": "fas fa-toolbox" - } - } + ] } From a3dea866798dc18b5e88911b8d20103e05f9dbd1 Mon Sep 17 00:00:00 2001 From: Sofia Stamouli Date: Mon, 14 Oct 2024 13:58:03 +0200 Subject: [PATCH 07/25] Use correct name of argument --- subworkflows/local/generate_downstream_samplesheets/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/generate_downstream_samplesheets/main.nf b/subworkflows/local/generate_downstream_samplesheets/main.nf index a3cbf23b..b1499bd4 100644 --- a/subworkflows/local/generate_downstream_samplesheets/main.nf +++ b/subworkflows/local/generate_downstream_samplesheets/main.nf @@ -10,7 +10,7 @@ workflow GENERATE_DOWNSTREAM_SAMPLESHEETS { format = 'csv' // most common format in nf-core format_sep = ',' - if ( params.generate_pipeline_samplesheets == 'mag' && params.save_analysis_ready_reads ) { + if ( params.generate_pipeline_samplesheets == 'mag' && params.save_analysis_ready_fastqs ) { def fastq_rel_path = '/' format = 'csv' format_sep = ',' From b0ceef70cb66c707d9cf0ad99c3d59be714106a4 Mon Sep 17 00:00:00 2001 From: Sofia Stamouli Date: Mon, 14 Oct 2024 14:49:38 +0200 Subject: [PATCH 08/25] Add function --- docs/output.md | 22 +++---- nextflow_schema.json | 30 ++++------ .../generate_downstream_samplesheets/main.nf | 57 +++++++++++-------- 3 files changed, 55 insertions(+), 54 deletions(-) diff --git a/docs/output.md b/docs/output.md index fa4c4bac..4403898c 100644 --- a/docs/output.md +++ b/docs/output.md @@ -130,7 +130,7 @@ You can change the default value for low complexity filtering by using the argum By default nf-core/taxprofiler will only provide the `.settings` file if AdapterRemoval is selected. -You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. If this is selected, you may receive different combinations of `.fastq` files for each sample depending on the input types - e.g. whether you have merged or not, or if you're supplying both single- and paired-end reads. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`. +You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. 
If this is selected, you may receive different combinations of `.fastq` files for each sample depending on the input types - e.g. whether you have merged or not, or if you're supplying both single- and paired-end reads. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_reads`. :::warning The resulting `.fastq` files may _not_ always be the 'final' reads that go into taxprofiling, if you also run other steps such as complexity filtering, host removal, run merging etc.. @@ -174,7 +174,7 @@ The `.npo` files can be used for re-generating and customising the plots using t The output logs are saved in the output folder and are part of MultiQC report.You do not normally need to check these manually. -You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`. +You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_reads`. :::warning We do **not** recommend using Porechop if you are already trimming the adapters with ONT's basecaller Guppy. @@ -195,7 +195,7 @@ We do **not** recommend using Porechop if you are already trimming the adapters The output logs are saved in the output folder and are part of MultiQC report.You do not normally need to check these manually. -You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`. +You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_reads`. ### BBDuk @@ -212,7 +212,7 @@ It is used in nf-core/taxprofiler for complexity filtering using different algor -By default nf-core/taxprofiler will only provide the `.log` file if BBDuk is selected as the complexity filtering tool. You will only find the complexity filtered reads in your results directory if you provide ` --save_complexityfiltered_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`. 
+By default nf-core/taxprofiler will only provide the `.log` file if BBDuk is selected as the complexity filtering tool. You will only find the complexity filtered reads in your results directory if you provide ` --save_complexityfiltered_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_reads`. :::warning The resulting `.fastq` files may _not_ always be the 'final' reads that go into taxprofiling, if you also run other steps such as host removal, run merging etc.. @@ -233,7 +233,7 @@ It is used in nf-core/taxprofiler for complexity filtering using different algor -By default nf-core/taxprofiler will only provide the `.log` file if PRINSEQ++ is selected as the complexity filtering tool. You will only find the complexity filtered `.fastq` files in your results directory if you supply ` --save_complexityfiltered_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`. +By default nf-core/taxprofiler will only provide the `.log` file if PRINSEQ++ is selected as the complexity filtering tool. You will only find the complexity filtered `.fastq` files in your results directory if you supply ` --save_complexityfiltered_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_reads`. :::warning The resulting `.fastq` files may _not_ always be the 'final' reads that go into taxprofiling, if you also run other steps such as host removal, run merging etc.. @@ -252,7 +252,7 @@ The resulting `.fastq` files may _not_ always be the 'final' reads that go into -You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`. +You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_reads`. :::warning We do _not_ recommend using Filtlong if you are performing filtering of low quality reads with ONT's basecaller Guppy. @@ -271,7 +271,7 @@ We do _not_ recommend using Filtlong if you are performing filtering of low qual -You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. 
Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`. +You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_reads`. ### Bowtie2 @@ -292,7 +292,7 @@ It is used with nf-core/taxprofiler to allow removal of 'host' (e.g. human) and/ -By default nf-core/taxprofiler will only provide the `.log` file if host removal is turned on. You will only have a `.bam` file if you specify `--save_hostremoval_bam`. This will contain _both_ mapped and unmapped reads. You will only get FASTQ files if you specify to save `--save_hostremoval_unmapped` - these contain only unmapped reads. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`. +By default nf-core/taxprofiler will only provide the `.log` file if host removal is turned on. You will only have a `.bam` file if you specify `--save_hostremoval_bam`. This will contain _both_ mapped and unmapped reads. You will only get FASTQ files if you specify to save `--save_hostremoval_unmapped` - these contain only unmapped reads. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_reads`. :::info Unmapped reads in FASTQ are only found in this directory for short-reads, for long-reads see [`samtools/fastq/`](#samtools-fastq). @@ -345,7 +345,7 @@ Unlike Bowtie2, minimap2 does not produce an unmapped FASTQ file by itself. See -This directory will be present and contain the unmapped reads from the `.fastq` format from long-read minimap2 host removal, if `--save_hostremoval_unmapped` is supplied. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`. +This directory will be present and contain the unmapped reads from the `.fastq` format from long-read minimap2 host removal, if `--save_hostremoval_unmapped` is supplied. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_reads`. :::info For short-read unmapped reads, see [bowtie2](#bowtie2). @@ -354,7 +354,7 @@ For short-read unmapped reads, see [bowtie2](#bowtie2). ### Analysis Ready Reads :::info -This optional results directory will only be present in the pipeline results when supplying `--save_analysis_ready_reads`. 
+This optional results directory will only be present in the pipeline results when supplying `--save_analysis_ready_fastqs`. :::
@@ -401,7 +401,7 @@ This is the last possible preprocessing step, so if you have multiple runs or li Note that you will only find samples that went through the run merging step in this directory. For samples that had a single run or library will not go through this step of the pipeline and thus will not be present in this directory. -This directory and its FASTQ files will only be present if you supply `--save_runmerged_reads`.Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`. +This directory and its FASTQ files will only be present if you supply `--save_runmerged_reads`.Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_fastqs`, in which case the reads will be in the folder `analysis_ready_reads`. ### Bracken diff --git a/nextflow_schema.json b/nextflow_schema.json index b0c43a22..798e2004 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -712,25 +712,6 @@ }, "fa_icon": "fas fa-chart-line" }, - "generate_samplesheet_options": { - "title": "Downstream pipeline samplesheet generation options", - "type": "object", - "fa_icon": "fas fa-align-justify", - "description": "Options for generating input samplesheets for complementary downstream pipelines.", - "properties": { - "generate_downstream_samplesheets": { - "type": "boolean", - "description": "Turn on generation of samplesheets for downstream pipelines.", - "fa_icon": "fas fa-toggle-on" - }, - "generate_pipeline_samplesheets": { - "type": "string", - "default": "taxprofiler", - "description": "Specify which pipeline to generate a samplesheet for.", - "fa_icon": "fas fa-toolbox" - } - } - }, "institutional_config_options": { "title": "Institutional config options", "type": "object", @@ -1003,5 +984,14 @@ { "$ref": "#/definitions/reference_genome_options" } - ] + ], + "properties": { + "generate_downstream_samplesheets": { + "type": "boolean" + }, + "generate_pipeline_samplesheets": { + "type": "string", + "default": "mag" + } + } } diff --git a/subworkflows/local/generate_downstream_samplesheets/main.nf b/subworkflows/local/generate_downstream_samplesheets/main.nf index b1499bd4..9d1ccdf1 100644 --- a/subworkflows/local/generate_downstream_samplesheets/main.nf +++ b/subworkflows/local/generate_downstream_samplesheets/main.nf @@ -1,8 +1,8 @@ // -// Subworkflow with functionality specific to the nf-core/taxprofiler pipeline +// Subworkflow with functionality specific to the nf-core/mag pipeline // -workflow GENERATE_DOWNSTREAM_SAMPLESHEETS { +workflow SAMPLESHEET_MAG { take: ch_processed_reads @@ -10,34 +10,45 @@ workflow GENERATE_DOWNSTREAM_SAMPLESHEETS { format = 'csv' // most common format in nf-core format_sep = ',' - if ( params.generate_pipeline_samplesheets == 'mag' && params.save_analysis_ready_fastqs ) { - def fastq_rel_path = '/' - format = 'csv' - format_sep = ',' - ch_list_for_samplesheet = ch_processed_reads - .filter { meta, sample_id, instrument_platform,fastq_1,fastq_2,fasta -> (fastq_1 && fastq_2) && !fasta } - .map { - meta, sample_id, instrument_platform,fastq_1,fastq_2,fasta -> - def sample = meta.id - def run = meta.run_accession //this should be optional - def group = "" - def short_reads_1 = file(params.outdir).toString() 
+ '/' + meta.id + '/' + fastq_1.getName() - def short_reads_2 = file(params.outdir).toString() + '/' + meta.id + '/' + fastq_2.getName() - def long_reads = "" - [sample: sample, run: run, group: group, short_reads_1: short_reads_1, short_reads_2: short_reads_2, long_reads: long_reads] + + ch_list_for_samplesheet = ch_processed_reads + .filter { meta, sample_id, instrument_platform,fastq_1,fastq_2,fasta -> (fastq_1 && fastq_2) && !fasta }.view() + .map { + meta, sample_id, instrument_platform,fastq_1,fastq_2,fasta -> + def sample = meta.id + def run = meta.run_accession //this should be optional + def group = "" + def short_reads_1 = file(params.outdir).toString() + '/' + meta.id + '/' + fastq_1.getName() + def short_reads_2 = file(params.outdir).toString() + '/' + meta.id + '/' + fastq_2.getName() + def long_reads = "" + [sample: sample, run: run, group: group, short_reads_1: short_reads_1, short_reads_2: short_reads_2, long_reads: long_reads] } - .tap{ ch_header } - } + .tap { ch_colnames } + + channelToSamplesheet(ch_colnames, ch_list_for_samplesheet, 'downstream_samplesheets', format, format_sep) +} + +workflow GENERATE_DOWNSTREAM_SAMPLESHEETS { + take: + ch_processed_reads + + main: + if ( params.generate_pipeline_samplesheets == 'mag' && params.save_analysis_ready_fastqs ) { + SAMPLESHEET_MAG(ch_processed_reads) + } +} +def channelToSamplesheet(ch_header, ch_list_for_samplesheet, outdir_subdir, format, format_sep) { + // Constructs the header string and then the strings of each row, and + // finally concatenates for saving. Originally designed by @mahesh-panchal ch_header .first() - .map{ it.keySet().join(format_sep) } - .concat( ch_list_for_samplesheet.map{ it.values().join(format_sep) }) + .map { it.keySet().join(format_sep) } + .concat(ch_list_for_samplesheet.map { it.values().join(format_sep) }) .collectFile( - name:"${params.outdir}/downstream_samplesheet/${params.generate_pipeline_samplesheets}.${format}", + name: "${params.outdir}/${outdir_subdir}/${params.generate_pipeline_samplesheets}.${format}", newLine: true, sort: false ) - } From b5fe3db29b59099184d804ead801c21549a65828 Mon Sep 17 00:00:00 2001 From: Sofia Stamouli Date: Mon, 14 Oct 2024 15:01:15 +0200 Subject: [PATCH 09/25] Update nextflow_schema.json --- nextflow_schema.json | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 798e2004..e14fa5e3 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -987,11 +987,15 @@ ], "properties": { "generate_downstream_samplesheets": { - "type": "boolean" + "type": "boolean", + "description": "Turn on generation of samplesheets for downstream pipelines.", + "fa_icon": "fas fa-toggle-on" }, "generate_pipeline_samplesheets": { "type": "string", - "default": "mag" + "default": "taxprofiler", + "description": "Specify which pipeline to generate a samplesheet for.", + "fa_icon": "fas fa-toolbox" } } } From a1cab25bd65b23f782a456971859a22fa4c997c3 Mon Sep 17 00:00:00 2001 From: Sofia Stamouli Date: Mon, 14 Oct 2024 16:32:13 +0200 Subject: [PATCH 10/25] Apply review suggestions --- docs/output.md | 5 ++- nextflow.config | 2 +- .../generate_downstream_samplesheets/main.nf | 3 +- .../tests/main.nf.test | 42 +++++++++++++++++++ 4 files changed, 48 insertions(+), 4 deletions(-) create mode 100644 subworkflows/local/generate_downstream_samplesheets/tests/main.nf.test diff --git a/docs/output.md b/docs/output.md index 4403898c..9188e398 100644 --- a/docs/output.md +++ b/docs/output.md @@ -42,6 +42,9 @@ The pipeline 
is built using [Nextflow](https://www.nextflow.io/) and processes d - [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution +The pipeline can also generate downstream pipeline input samplesheets. +These are stored in `/downstream_samplesheets`. + ![](images/taxprofiler_tube.png) ### untar @@ -683,8 +686,6 @@ The following report files are used for the taxpasta step: Please aware the outputs of each tool's standardised profile _may not_ be directly comparable between each tool. Some may report raw read counts, whereas others may report abundance information. Please always refer to the list above, for which information is used for each tool. ::: -The pipeline can also generate downstream pipeline input samplesheets. -These are stored in `/downstream_samplesheets`. ### MultiQC diff --git a/nextflow.config b/nextflow.config index 67dd6912..0f96ad18 100644 --- a/nextflow.config +++ b/nextflow.config @@ -201,7 +201,7 @@ params { // Generate downstream samplesheets generate_downstream_samplesheets = false - generate_pipeline_samplesheets = "mag" + generate_pipeline_samplesheets = null } // Load base.config by default for all pipelines diff --git a/subworkflows/local/generate_downstream_samplesheets/main.nf b/subworkflows/local/generate_downstream_samplesheets/main.nf index 9d1ccdf1..2a28900a 100644 --- a/subworkflows/local/generate_downstream_samplesheets/main.nf +++ b/subworkflows/local/generate_downstream_samplesheets/main.nf @@ -12,7 +12,8 @@ workflow SAMPLESHEET_MAG { ch_list_for_samplesheet = ch_processed_reads - .filter { meta, sample_id, instrument_platform,fastq_1,fastq_2,fasta -> (fastq_1 && fastq_2) && !fasta }.view() + .view() + .filter { meta, sample_id, instrument_platform,fastq_1,fastq_2,fasta -> (fastq_1 && fastq_2) && !fasta } .map { meta, sample_id, instrument_platform,fastq_1,fastq_2,fasta -> def sample = meta.id diff --git a/subworkflows/local/generate_downstream_samplesheets/tests/main.nf.test b/subworkflows/local/generate_downstream_samplesheets/tests/main.nf.test new file mode 100644 index 00000000..ef63bcee --- /dev/null +++ b/subworkflows/local/generate_downstream_samplesheets/tests/main.nf.test @@ -0,0 +1,42 @@ +nextflow_workflow { + + name "Test Subworkflow GENERATE_DOWNSTREAM_SAMPLESHEETS" + script "../main.nf" + workflow "GENERATE_DOWNSTREAM_SAMPLESHEETS" + + tag "subworkflows" + tag "subworkflows_local" + tag "subworkflows/generate_downstream_samplesheets" + + test("generate_downstream_samplesheets - mag") { + + when { + params { + modules_testdata_base_path = "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/" + outdir = "." 
+ generate_pipeline_samplesheets = 'mag' + } + workflow { + """ + input[0] = Channel.of( + [ + [id:'database', tool:'kraken2'], + file(params.modules_testdata_base_path + "genomics/sarscov2/genome/db/kraken2.tar.gz", checkIfExists: true) + ], + [ + [id:'database', tool:'krakenuniq'], + file(params.modules_testdata_base_path + "genomics/sarscov2/genome/db/krakenuniq.tar.gz", checkIfExists: true) + ], + ) + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert snapshot("${params.outdir}/downstream_samplesheets/${params.generate_pipeline_samplesheets}.csv").match() } + ) + } + } +} From c315caeb70682474de4dbad472ac58a2d14ed69d Mon Sep 17 00:00:00 2001 From: Sofia Stamouli <91951607+sofstam@users.noreply.github.com> Date: Mon, 14 Oct 2024 16:32:27 +0200 Subject: [PATCH 11/25] Update docs/output.md Co-authored-by: James A. Fellows Yates --- docs/output.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/output.md b/docs/output.md index 4403898c..6e753b98 100644 --- a/docs/output.md +++ b/docs/output.md @@ -759,7 +759,7 @@ pipelines: Output files - `downstream_samplesheets/` - - `mag.csv`: Only for paired-end reads + - `mag.csv`: input sheet for nf-core/mag with paths to nf-core/taxprofiler preprocessed (corresponding to what is saved with `--save_analysis_ready_fastqs`)
From cd136d77f37c6430d7ff3c01256347e32911498d Mon Sep 17 00:00:00 2001 From: Sofia Stamouli Date: Mon, 14 Oct 2024 16:34:17 +0200 Subject: [PATCH 12/25] Review suggestions --- workflows/taxprofiler.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index f8a735f5..8a47b3c6 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -10,7 +10,6 @@ include { paramsSummaryMap } from 'plugin/nf-validation' include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_taxprofiler_pipeline' -include { GENERATE_DOWNSTREAM_SAMPLESHEETS } from '../subworkflows/local/generate_downstream_samplesheets/main.nf' include { validateParameters; paramsHelp; paramsSummaryLog; fromSamplesheet } from 'plugin/nf-validation' // Check input path parameters to see if they exist @@ -73,6 +72,7 @@ include { SHORTREAD_COMPLEXITYFILTERING } from '../subworkflows/local/shortread_ include { PROFILING } from '../subworkflows/local/profiling' include { VISUALIZATION_KRONA } from '../subworkflows/local/visualization_krona' include { STANDARDISATION_PROFILES } from '../subworkflows/local/standardisation_profiles' +include { GENERATE_DOWNSTREAM_SAMPLESHEETS } from '../subworkflows/local/generate_downstream_samplesheets/main.nf' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 263c7d3f6fdea186c6d2e6107d9ebac52480795b Mon Sep 17 00:00:00 2001 From: nf-core-bot Date: Tue, 15 Oct 2024 07:35:20 +0000 Subject: [PATCH 13/25] [automated] Fix code linting --- docs/output.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/output.md b/docs/output.md index d48d54a7..704b9dd3 100644 --- a/docs/output.md +++ b/docs/output.md @@ -686,7 +686,6 @@ The following report files are used for the taxpasta step: Please aware the outputs of each tool's standardised profile _may not_ be directly comparable between each tool. Some may report raw read counts, whereas others may report abundance information. Please always refer to the list above, for which information is used for each tool. ::: - ### MultiQC
From e3fa0eea4c9a07a0ac7d62f94661088e868c2a84 Mon Sep 17 00:00:00 2001 From: Sofia Stamouli Date: Tue, 15 Oct 2024 13:43:06 +0200 Subject: [PATCH 14/25] Add pattern to nextflow_schema.json --- nextflow_schema.json | 149 +++++++++++++++++++++++++++++++------------ 1 file changed, 109 insertions(+), 40 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index e14fa5e3..913f7d8f 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,11 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "databases", "outdir"], + "required": [ + "input", + "databases", + "outdir" + ], "properties": { "input": { "type": "string", @@ -75,7 +79,10 @@ "preprocessing_qc_tool": { "type": "string", "default": "fastqc", - "enum": ["fastqc", "falco"], + "enum": [ + "fastqc", + "falco" + ], "help_text": "Falco is designed as a drop-in replacement for FastQC but written in C++ for faster computation. We particularly recommend using falco when using long reads (due to reduced memory constraints), however is also applicable for short reads.", "description": "Specify the tool used for quality control of raw sequencing reads", "fa_icon": "fas fa-tools" @@ -110,7 +117,10 @@ "shortread_qc_tool": { "type": "string", "default": "fastp", - "enum": ["fastp", "adapterremoval"], + "enum": [ + "fastp", + "adapterremoval" + ], "fa_icon": "fas fa-tools", "description": "Specify which tool to use for short-read QC" }, @@ -172,13 +182,17 @@ "shortread_complexityfilter_tool": { "type": "string", "default": "bbduk", - "enum": ["bbduk", "prinseqplusplus", "fastp"], + "enum": [ + "bbduk", + "prinseqplusplus", + "fastp" + ], "fa_icon": "fas fa-hammer", "description": "Specify which tool to use for complexity filtering" }, "shortread_complexityfilter_entropy": { "type": "number", - "default": 0.3, + "default": 0.299999999999999988897769753748434595763683319091796875, "fa_icon": "fas fa-random", "description": "Specify the minimum sequence entropy level for complexity filtering", "help_text": "Specify the minimum 'entropy' value for complexity filtering for BBDuk or PRINSEQ++.\n\nNote that this value will only be used for PRINSEQ++ if `--shortread_complexityfilter_prinseqplusplus_mode` is set to `entropy`.\n\nEntropy here corresponds to the amount of sequence variation exists within the read. Higher values correspond to more variety, and thus will likely reslut in more specific matching to a taxon's reference genome. The trade off here is fewer reads (or abundance information) available for having a confident identification.\n\n\n> Modifies tool parameter(s):\n> - BBDuk: `entropy=`\n> - PRINSEQ++: `-lc_entropy`\n\n" @@ -206,7 +220,10 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": ["entropy", "dust"], + "enum": [ + "entropy", + "dust" + ], "fa_icon": "fas fa-check-square", "description": "Specify the complexity filter mode for PRINSEQ++" }, @@ -241,7 +258,10 @@ "longread_adapterremoval_tool": { "type": "string", "default": "porechop_abi", - "enum": ["porechop", "porechop_abi"], + "enum": [ + "porechop", + "porechop_abi" + ], "fa_icon": "fas fa-hammer", "description": "Specify which tool to use for adapter trimming.", "help_text": "The performance of Porechop and Porechop_ABI is same in terms of removing adapter reads. However Porechop is no longer updated, Porechop_ABI receives regular updates." 
@@ -255,7 +275,10 @@ "longread_filter_tool": { "type": "string", "default": "nanoq", - "enum": ["filtlong", "nanoq"], + "enum": [ + "filtlong", + "nanoq" + ], "fa_icon": "fas fa-hammer", "description": "Specify which tool to use for long reads filtering", "help_text": "Nanoq is a filtering tool only for Nanopore reads. Nanoq is faster and more memory-efficient than Filtlong. Nanoq also provides a summary of input read statistics; see [benchmarking](https://github.com/esteinig/nanoq?tab=readme-ov-file#benchmarks). \n\nFiltlong is a good option if you want to keep a certain percentage of reads after filtering, and you can also use it for non-Nanopore long reads." @@ -313,7 +336,10 @@ "type": "string", "default": "kmer", "description": "Specify mode for identifying redundant reads", - "enum": ["kmer", "alignment"], + "enum": [ + "kmer", + "alignment" + ], "fa_icon": "fas fa-align-left", "help_text": "Specify which read-comparison mode to use to check for redundancy.\n\nk-mer is faster but less precise but is recommended for FASTQ files. Alignment is more precise but is slower, it is recommended for FASTA files.\n\n> Modifies tool parameter(s):\n> - Nonpareil: `-T`" } @@ -423,7 +449,15 @@ "diamond_output_format": { "type": "string", "default": "tsv", - "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"], + "enum": [ + "blast", + "xml", + "txt", + "daa", + "sam", + "tsv", + "paf" + ], "fa_icon": "fas fa-file", "description": "Specify output format from DIAMOND profiling.", "help_text": "DIAMOND can produce output in a number of different formats, you can specify here which to produce.\n\nNote that DIAMOND can only produce one format at a time, and depending on which you pick, some downstream steps may not be executed. For example, selecting `daa` or `sam` will mean you will not get a tabular taxonomic profile as with the other tools.\n\nWill be overriden by `--diamond_save_reads.`\n\n> Modifies tool parameter(s):\n> - diamond blastx: `--outfmt`" @@ -448,7 +482,14 @@ "kaiju_taxon_rank": { "type": "string", "default": "species", - "enum": ["phylum", "class", "order", "family", "genus", "species"], + "enum": [ + "phylum", + "class", + "order", + "family", + "genus", + "species" + ], "fa_icon": "fas fa-tag", "description": "Specify taxonomic rank to be displayed in Kaiju taxon table", "help_text": "Specify the taxonomic level(s) to be displayed in the resulting Kaiju taxon table, as generated by the kaiju2table helper tool.\n\nThis can be only be a single level (e.g. `species`).\n\n> Modifies tool parameter(s):\n> - kaiju2table: `-l`" @@ -596,7 +637,13 @@ "default": "reads", "description": "Specify the type of ganon report to save.", "help_text": "Specify the type of taxonomic report to produce from ganon report. This mainly refers to which form of 'value' to print: raw read counts, abundance estimates, genome-size normalised etc. 
\n\nSee the [ganon documentation](https://pirovc.github.io/ganon/outputfiles/#ganon-report) for more information of each option.\n\n> Modifies tool parameter(s):\n- ganon report: `--report-type`\n", - "enum": ["abundance", "reads", "matches", "dist", "corr"], + "enum": [ + "abundance", + "reads", + "matches", + "dist", + "corr" + ], "fa_icon": "fas fa-file" }, "ganon_report_rank": { @@ -665,7 +712,13 @@ "default": "tsv", "fa_icon": "fas fa-pastafarianism", "description": "The desired output format.", - "enum": ["tsv", "csv", "arrow", "parquet", "biom"] + "enum": [ + "tsv", + "csv", + "arrow", + "parquet", + "biom" + ] }, "taxpasta_taxonomy_dir": { "type": "string", @@ -712,6 +765,25 @@ }, "fa_icon": "fas fa-chart-line" }, + "generate_samplesheet_options": { + "title": "Downstream pipeline samplesheet generation options", + "type": "object", + "fa_icon": "fas fa-university", + "description": "Options for generating input samplesheets for complementary downstream pipelines.", + "properties": { + "generate_pipeline_samplesheets": { + "type": "string", + "default": "differentialabundance,mag", + "description": "Specify a comma separated string in quotes to specify which pipeline to generate a samplesheet for.", + "pattern": "^(differentialabundance|mag)(?:,(differentialabundance|mag)){0,1}" + }, + "generate_downstream_samplesheets": { + "type": "boolean", + "description": "Turn on generation of samplesheets for downstream pipelines.", + "fa_icon": "fas fa-toggle-on" + } + } + }, "institutional_config_options": { "title": "Institutional config options", "type": "object", @@ -820,7 +892,14 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], "hidden": true }, "email_on_fail": { @@ -946,56 +1025,46 @@ }, "allOf": [ { - "$ref": "#/definitions/input_output_options" + "$ref": "#/$defs/generate_samplesheet_options" }, { - "$ref": "#/definitions/preprocessing_general_qc_options" + "$ref": "#/$defs/input_output_options" }, { - "$ref": "#/definitions/preprocessing_short_read_qc_options" + "$ref": "#/$defs/preprocessing_general_qc_options" }, { - "$ref": "#/definitions/preprocessing_long_read_qc_options" + "$ref": "#/$defs/preprocessing_short_read_qc_options" }, { - "$ref": "#/definitions/redundancy_estimation" + "$ref": "#/$defs/preprocessing_long_read_qc_options" }, { - "$ref": "#/definitions/preprocessing_host_removal_options" + "$ref": "#/$defs/redundancy_estimation" }, { - "$ref": "#/definitions/preprocessing_run_merging_options" + "$ref": "#/$defs/preprocessing_host_removal_options" }, { - "$ref": "#/definitions/profiling_options" + "$ref": "#/$defs/preprocessing_run_merging_options" }, { - "$ref": "#/definitions/postprocessing_and_visualisation_options" + "$ref": "#/$defs/profiling_options" }, { - "$ref": "#/definitions/institutional_config_options" + "$ref": "#/$defs/postprocessing_and_visualisation_options" }, { - "$ref": "#/definitions/max_job_request_options" + "$ref": "#/$defs/institutional_config_options" }, { - "$ref": "#/definitions/generic_options" + "$ref": "#/$defs/max_job_request_options" }, { - "$ref": "#/definitions/reference_genome_options" - } - ], - "properties": { - "generate_downstream_samplesheets": { - "type": "boolean", - "description": "Turn on generation of samplesheets for downstream pipelines.", - "fa_icon": "fas fa-toggle-on" + "$ref": "#/$defs/generic_options" }, - "generate_pipeline_samplesheets": { - "type": "string", - "default": "taxprofiler", - "description": "Specify which pipeline to generate a samplesheet for.", - "fa_icon": "fas fa-toolbox" + { + "$ref": "#/$defs/reference_genome_options" } - } + ] } From c6ac0cb7c745598962d6e06be8620e8219a56aa7 Mon Sep 17 00:00:00 2001 From: Sofia Stamouli Date: Tue, 15 Oct 2024 13:46:28 +0200 Subject: [PATCH 15/25] Prettier --- nextflow_schema.json | 86 +++++++------------------------------------- 1 file changed, 13 insertions(+), 73 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 913f7d8f..ee62e79f 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,11 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "databases", - "outdir" - ], + "required": ["input", "databases", "outdir"], "properties": { "input": { "type": "string", @@ -79,10 +75,7 @@ "preprocessing_qc_tool": { "type": "string", "default": "fastqc", - "enum": [ - "fastqc", - "falco" - ], + "enum": ["fastqc", "falco"], "help_text": "Falco is designed as a drop-in replacement for FastQC but written in C++ for faster computation. 
We particularly recommend using falco when using long reads (due to reduced memory constraints), however is also applicable for short reads.", "description": "Specify the tool used for quality control of raw sequencing reads", "fa_icon": "fas fa-tools" @@ -117,10 +110,7 @@ "shortread_qc_tool": { "type": "string", "default": "fastp", - "enum": [ - "fastp", - "adapterremoval" - ], + "enum": ["fastp", "adapterremoval"], "fa_icon": "fas fa-tools", "description": "Specify which tool to use for short-read QC" }, @@ -182,11 +172,7 @@ "shortread_complexityfilter_tool": { "type": "string", "default": "bbduk", - "enum": [ - "bbduk", - "prinseqplusplus", - "fastp" - ], + "enum": ["bbduk", "prinseqplusplus", "fastp"], "fa_icon": "fas fa-hammer", "description": "Specify which tool to use for complexity filtering" }, @@ -220,10 +206,7 @@ "shortread_complexityfilter_prinseqplusplus_mode": { "type": "string", "default": "entropy", - "enum": [ - "entropy", - "dust" - ], + "enum": ["entropy", "dust"], "fa_icon": "fas fa-check-square", "description": "Specify the complexity filter mode for PRINSEQ++" }, @@ -258,10 +241,7 @@ "longread_adapterremoval_tool": { "type": "string", "default": "porechop_abi", - "enum": [ - "porechop", - "porechop_abi" - ], + "enum": ["porechop", "porechop_abi"], "fa_icon": "fas fa-hammer", "description": "Specify which tool to use for adapter trimming.", "help_text": "The performance of Porechop and Porechop_ABI is same in terms of removing adapter reads. However Porechop is no longer updated, Porechop_ABI receives regular updates." @@ -275,10 +255,7 @@ "longread_filter_tool": { "type": "string", "default": "nanoq", - "enum": [ - "filtlong", - "nanoq" - ], + "enum": ["filtlong", "nanoq"], "fa_icon": "fas fa-hammer", "description": "Specify which tool to use for long reads filtering", "help_text": "Nanoq is a filtering tool only for Nanopore reads. Nanoq is faster and more memory-efficient than Filtlong. Nanoq also provides a summary of input read statistics; see [benchmarking](https://github.com/esteinig/nanoq?tab=readme-ov-file#benchmarks). \n\nFiltlong is a good option if you want to keep a certain percentage of reads after filtering, and you can also use it for non-Nanopore long reads." @@ -336,10 +313,7 @@ "type": "string", "default": "kmer", "description": "Specify mode for identifying redundant reads", - "enum": [ - "kmer", - "alignment" - ], + "enum": ["kmer", "alignment"], "fa_icon": "fas fa-align-left", "help_text": "Specify which read-comparison mode to use to check for redundancy.\n\nk-mer is faster but less precise but is recommended for FASTQ files. Alignment is more precise but is slower, it is recommended for FASTA files.\n\n> Modifies tool parameter(s):\n> - Nonpareil: `-T`" } @@ -449,15 +423,7 @@ "diamond_output_format": { "type": "string", "default": "tsv", - "enum": [ - "blast", - "xml", - "txt", - "daa", - "sam", - "tsv", - "paf" - ], + "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"], "fa_icon": "fas fa-file", "description": "Specify output format from DIAMOND profiling.", "help_text": "DIAMOND can produce output in a number of different formats, you can specify here which to produce.\n\nNote that DIAMOND can only produce one format at a time, and depending on which you pick, some downstream steps may not be executed. 
For example, selecting `daa` or `sam` will mean you will not get a tabular taxonomic profile as with the other tools.\n\nWill be overriden by `--diamond_save_reads.`\n\n> Modifies tool parameter(s):\n> - diamond blastx: `--outfmt`" @@ -482,14 +448,7 @@ "kaiju_taxon_rank": { "type": "string", "default": "species", - "enum": [ - "phylum", - "class", - "order", - "family", - "genus", - "species" - ], + "enum": ["phylum", "class", "order", "family", "genus", "species"], "fa_icon": "fas fa-tag", "description": "Specify taxonomic rank to be displayed in Kaiju taxon table", "help_text": "Specify the taxonomic level(s) to be displayed in the resulting Kaiju taxon table, as generated by the kaiju2table helper tool.\n\nThis can be only be a single level (e.g. `species`).\n\n> Modifies tool parameter(s):\n> - kaiju2table: `-l`" @@ -637,13 +596,7 @@ "default": "reads", "description": "Specify the type of ganon report to save.", "help_text": "Specify the type of taxonomic report to produce from ganon report. This mainly refers to which form of 'value' to print: raw read counts, abundance estimates, genome-size normalised etc. \n\nSee the [ganon documentation](https://pirovc.github.io/ganon/outputfiles/#ganon-report) for more information of each option.\n\n> Modifies tool parameter(s):\n- ganon report: `--report-type`\n", - "enum": [ - "abundance", - "reads", - "matches", - "dist", - "corr" - ], + "enum": ["abundance", "reads", "matches", "dist", "corr"], "fa_icon": "fas fa-file" }, "ganon_report_rank": { @@ -712,13 +665,7 @@ "default": "tsv", "fa_icon": "fas fa-pastafarianism", "description": "The desired output format.", - "enum": [ - "tsv", - "csv", - "arrow", - "parquet", - "biom" - ] + "enum": ["tsv", "csv", "arrow", "parquet", "biom"] }, "taxpasta_taxonomy_dir": { "type": "string", @@ -892,14 +839,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { From aff979e244477a1429766eb4e4403a579a4a7251 Mon Sep 17 00:00:00 2001 From: Sofia Stamouli Date: Tue, 15 Oct 2024 16:07:54 +0200 Subject: [PATCH 16/25] Review suggestions and new function --- conf/test.config | 2 +- conf/test_nothing.config | 2 +- nextflow_schema.json | 33 +++++++++---------- .../generate_downstream_samplesheets/main.nf | 23 ++++++++----- 4 files changed, 32 insertions(+), 28 deletions(-) diff --git a/conf/test.config b/conf/test.config index 6f93aa41..f40fb04b 100644 --- a/conf/test.config +++ b/conf/test.config @@ -51,7 +51,7 @@ params { // Generate downstream samplesheets generate_downstream_samplesheets = true - generate_pipeline_samplesheets = "mag" + generate_pipeline_samplesheets = "differentialabundance,mag" } process { diff --git a/conf/test_nothing.config b/conf/test_nothing.config index 257f8984..7619f73c 100644 --- a/conf/test_nothing.config +++ b/conf/test_nothing.config @@ -44,7 +44,7 @@ params { // Generate downstream samplesheets generate_downstream_samplesheets = false - generate_pipeline_samplesheets = "mag" + generate_pipeline_samplesheets = "differentialabundance,mag" } process { diff --git a/nextflow_schema.json b/nextflow_schema.json index ee62e79f..7012ec1f 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1,5 +1,5 @@ { - "$schema": "http://json-schema.org/draft-07/schema", + "$schema": "https://json-schema.org/draft-07/schema", "$id": "https://raw.githubusercontent.com/nf-core/taxprofiler/master/nextflow_schema.json", "title": "nf-core/taxprofiler pipeline parameters", "description": "Taxonomic classification and profiling of shotgun short- and long-read metagenomic data", @@ -178,7 +178,7 @@ }, "shortread_complexityfilter_entropy": { "type": "number", - "default": 0.299999999999999988897769753748434595763683319091796875, + "default": 0.3, "fa_icon": "fas fa-random", "description": "Specify the minimum sequence entropy level for complexity filtering", "help_text": "Specify the minimum 'entropy' value for complexity filtering for BBDuk or PRINSEQ++.\n\nNote that this value will only be used for PRINSEQ++ if `--shortread_complexityfilter_prinseqplusplus_mode` is set to `entropy`.\n\nEntropy here corresponds to the amount of sequence variation exists within the read. Higher values correspond to more variety, and thus will likely reslut in more specific matching to a taxon's reference genome. 
The trade off here is fewer reads (or abundance information) available for having a confident identification.\n\n\n> Modifies tool parameter(s):\n> - BBDuk: `entropy=`\n> - PRINSEQ++: `-lc_entropy`\n\n" @@ -720,7 +720,6 @@ "properties": { "generate_pipeline_samplesheets": { "type": "string", - "default": "differentialabundance,mag", "description": "Specify a comma separated string in quotes to specify which pipeline to generate a samplesheet for.", "pattern": "^(differentialabundance|mag)(?:,(differentialabundance|mag)){0,1}" }, @@ -965,46 +964,46 @@ }, "allOf": [ { - "$ref": "#/$defs/generate_samplesheet_options" + "$ref": "#/definitions/generate_samplesheet_options" }, { - "$ref": "#/$defs/input_output_options" + "$ref": "#/definitions/input_output_options" }, { - "$ref": "#/$defs/preprocessing_general_qc_options" + "$ref": "#/definitions/preprocessing_general_qc_options" }, { - "$ref": "#/$defs/preprocessing_short_read_qc_options" + "$ref": "#/definitions/preprocessing_short_read_qc_options" }, { - "$ref": "#/$defs/preprocessing_long_read_qc_options" + "$ref": "#/definitions/preprocessing_long_read_qc_options" }, { - "$ref": "#/$defs/redundancy_estimation" + "$ref": "#/definitions/redundancy_estimation" }, { - "$ref": "#/$defs/preprocessing_host_removal_options" + "$ref": "#/definitions/preprocessing_host_removal_options" }, { - "$ref": "#/$defs/preprocessing_run_merging_options" + "$ref": "#/definitions/preprocessing_run_merging_options" }, { - "$ref": "#/$defs/profiling_options" + "$ref": "#/definitions/profiling_options" }, { - "$ref": "#/$defs/postprocessing_and_visualisation_options" + "$ref": "#/definitions/postprocessing_and_visualisation_options" }, { - "$ref": "#/$defs/institutional_config_options" + "$ref": "#/definitions/institutional_config_options" }, { - "$ref": "#/$defs/max_job_request_options" + "$ref": "#/definitions/max_job_request_options" }, { - "$ref": "#/$defs/generic_options" + "$ref": "#/definitions/generic_options" }, { - "$ref": "#/$defs/reference_genome_options" + "$ref": "#/definitions/reference_genome_options" } ] } diff --git a/subworkflows/local/generate_downstream_samplesheets/main.nf b/subworkflows/local/generate_downstream_samplesheets/main.nf index 2a28900a..3f9b73f9 100644 --- a/subworkflows/local/generate_downstream_samplesheets/main.nf +++ b/subworkflows/local/generate_downstream_samplesheets/main.nf @@ -12,7 +12,6 @@ workflow SAMPLESHEET_MAG { ch_list_for_samplesheet = ch_processed_reads - .view() .filter { meta, sample_id, instrument_platform,fastq_1,fastq_2,fasta -> (fastq_1 && fastq_2) && !fasta } .map { meta, sample_id, instrument_platform,fastq_1,fastq_2,fasta -> @@ -26,7 +25,7 @@ workflow SAMPLESHEET_MAG { } .tap { ch_colnames } - channelToSamplesheet(ch_colnames, ch_list_for_samplesheet, 'downstream_samplesheets', format, format_sep) + channelToSamplesheet(ch_list_for_samplesheet,"${params.outdir}/downstream_samplesheets/mag", format) } workflow GENERATE_DOWNSTREAM_SAMPLESHEETS { @@ -35,20 +34,26 @@ workflow GENERATE_DOWNSTREAM_SAMPLESHEETS { ch_processed_reads main: - if ( params.generate_pipeline_samplesheets == 'mag' && params.save_analysis_ready_fastqs ) { + def downstreampipeline_names = params.generate_pipeline_samplesheets.split(",") + + if ( downstreampipeline_names.contains('mag') && params.save_analysis_ready_fastqs) { SAMPLESHEET_MAG(ch_processed_reads) } + } -def channelToSamplesheet(ch_header, ch_list_for_samplesheet, outdir_subdir, format, format_sep) { - // Constructs the header string and then the strings of each row, and - 
// finally concatenates for saving. Originally designed by @mahesh-panchal +// Constructs the header string and then the strings of each row, and +def channelToSamplesheet(ch_list_for_samplesheet, path, format) { + format_sep = ["csv":",", "tsv":"\t", "txt":"\t"][format] + + ch_header = ch_list_for_samplesheet + ch_header .first() - .map { it.keySet().join(format_sep) } - .concat(ch_list_for_samplesheet.map { it.values().join(format_sep) }) + .map{ it.keySet().join(format_sep) } + .concat( ch_list_for_samplesheet.map{ it.values().join(format_sep) }) .collectFile( - name: "${params.outdir}/${outdir_subdir}/${params.generate_pipeline_samplesheets}.${format}", + name:"${path}.${format}", newLine: true, sort: false ) From 67f33e58b8b7e9024143124da523497783664bd0 Mon Sep 17 00:00:00 2001 From: Sofia Stamouli <91951607+sofstam@users.noreply.github.com> Date: Wed, 16 Oct 2024 13:48:29 +0200 Subject: [PATCH 17/25] Update docs/output.md Co-authored-by: James A. Fellows Yates --- docs/output.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/output.md b/docs/output.md index 704b9dd3..98a32a8a 100644 --- a/docs/output.md +++ b/docs/output.md @@ -759,7 +759,7 @@ pipelines: Output files - `downstream_samplesheets/` - - `mag.csv`: input sheet for nf-core/mag with paths to nf-core/taxprofiler preprocessed (corresponding to what is saved with `--save_analysis_ready_fastqs`) + - `mag.csv`: input sheet for that contains paths to preprocessed FASTQs (corresponding to what is saved with `--save_analysis_ready_fastqs`) that can be used to skip read preprocessing steps in nf-core/mag
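
For readers following the `channelToSamplesheet()` refactor in PATCH 16 above: the helper derives the CSV header from the key set of the first row map and then appends one joined line per map via `collectFile`. The standalone sketch below reproduces that idiom outside the pipeline; the row maps and the output name `demo.csv` are invented for illustration and are not taxprofiler outputs.

    // Minimal sketch, assuming a channel of uniform maps (same keys, same order).
    workflow {
        def format     = 'csv'
        def format_sep = ','

        // Groovy map literals are LinkedHashMaps, so key insertion order fixes the column order.
        ch_rows = Channel.of(
            [sample: 'sampleA', run: 'run1', group: '0', short_reads_1: 'a_R1.fastq.gz', short_reads_2: 'a_R2.fastq.gz', long_reads: ''],
            [sample: 'sampleB', run: 'run1', group: '0', short_reads_1: 'b_R1.fastq.gz', short_reads_2: 'b_R2.fastq.gz', long_reads: '']
        )

        ch_rows
            .first()                                                        // one map is enough to build the header
            .map { row -> row.keySet().join(format_sep) }                   // -> 'sample,run,group,short_reads_1,short_reads_2,long_reads'
            .concat( ch_rows.map { row -> row.values().join(format_sep) } ) // header first, then one line per row
            .collectFile( name: "demo.${format}", newLine: true, sort: false )
            .view { file -> "wrote samplesheet: ${file}" }
    }

Because `concat` emits the header channel to completion before the row channel, and `sort: false` preserves arrival order in `collectFile`, the header is guaranteed to be the first line of the written file.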
From 94a28f410e91f6a8314f990d8172accad0717d5b Mon Sep 17 00:00:00 2001 From: Sofia Stamouli <91951607+sofstam@users.noreply.github.com> Date: Wed, 16 Oct 2024 13:48:37 +0200 Subject: [PATCH 18/25] Update docs/output.md Co-authored-by: James A. Fellows Yates --- docs/output.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/output.md b/docs/output.md index 98a32a8a..53b74719 100644 --- a/docs/output.md +++ b/docs/output.md @@ -750,8 +750,7 @@ For example, DIAMOND output does not have a dedicated section in the MultiQC HTM ### Downstream samplesheets -The pipeline can also generate input files for the following downstream -pipelines: +The pipeline can also generate input files for the following downstream pipelines: - [nf-core/mag](https://nf-co.re/mag) From 892b4284828db31bc8af7707dfc70b4c5ce7e452 Mon Sep 17 00:00:00 2001 From: Sofia Stamouli Date: Wed, 16 Oct 2024 15:05:29 +0200 Subject: [PATCH 19/25] Remove tests folder --- .../tests/main.nf.test | 42 ------------------- 1 file changed, 42 deletions(-) delete mode 100644 subworkflows/local/generate_downstream_samplesheets/tests/main.nf.test diff --git a/subworkflows/local/generate_downstream_samplesheets/tests/main.nf.test b/subworkflows/local/generate_downstream_samplesheets/tests/main.nf.test deleted file mode 100644 index ef63bcee..00000000 --- a/subworkflows/local/generate_downstream_samplesheets/tests/main.nf.test +++ /dev/null @@ -1,42 +0,0 @@ -nextflow_workflow { - - name "Test Subworkflow GENERATE_DOWNSTREAM_SAMPLESHEETS" - script "../main.nf" - workflow "GENERATE_DOWNSTREAM_SAMPLESHEETS" - - tag "subworkflows" - tag "subworkflows_local" - tag "subworkflows/generate_downstream_samplesheets" - - test("generate_downstream_samplesheets - mag") { - - when { - params { - modules_testdata_base_path = "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/" - outdir = "." 
- generate_pipeline_samplesheets = 'mag' - } - workflow { - """ - input[0] = Channel.of( - [ - [id:'database', tool:'kraken2'], - file(params.modules_testdata_base_path + "genomics/sarscov2/genome/db/kraken2.tar.gz", checkIfExists: true) - ], - [ - [id:'database', tool:'krakenuniq'], - file(params.modules_testdata_base_path + "genomics/sarscov2/genome/db/krakenuniq.tar.gz", checkIfExists: true) - ], - ) - """ - } - } - - then { - assertAll( - { assert workflow.success}, - { assert snapshot("${params.outdir}/downstream_samplesheets/${params.generate_pipeline_samplesheets}.csv").match() } - ) - } - } -} From 0abfdf77531c5df4f9b61176c66a263bed359a4e Mon Sep 17 00:00:00 2001 From: Sofia Stamouli Date: Thu, 17 Oct 2024 13:33:54 +0200 Subject: [PATCH 20/25] Use the same function as detaxizer --- .../generate_downstream_samplesheets/main.nf | 30 +++++++++++++------ 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/subworkflows/local/generate_downstream_samplesheets/main.nf b/subworkflows/local/generate_downstream_samplesheets/main.nf index 3f9b73f9..635fba4c 100644 --- a/subworkflows/local/generate_downstream_samplesheets/main.nf +++ b/subworkflows/local/generate_downstream_samplesheets/main.nf @@ -10,22 +10,34 @@ workflow SAMPLESHEET_MAG { format = 'csv' // most common format in nf-core format_sep = ',' - - ch_list_for_samplesheet = ch_processed_reads - .filter { meta, sample_id, instrument_platform,fastq_1,fastq_2,fasta -> (fastq_1 && fastq_2) && !fasta } - .map { + ch_list_for_samplesheet = ch_processed_reads.view() + //.filter { meta, sample_id, instrument_platform,fastq_1,fastq_2,fasta -> (fastq_1 && fastq_2) && !fasta } + .map { meta, sample_id, instrument_platform,fastq_1,fastq_2,fasta -> def sample = meta.id def run = meta.run_accession //this should be optional def group = "" - def short_reads_1 = file(params.outdir).toString() + '/' + meta.id + '/' + fastq_1.getName() - def short_reads_2 = file(params.outdir).toString() + '/' + meta.id + '/' + fastq_2.getName() - def long_reads = "" + def short_reads_1 = meta.single_end ? "": file(params.outdir).toString() + '/' + meta.id + '/' + fastq_1.getName() + def short_reads_2 = meta.single_end ? "": file(params.outdir).toString() + '/' + meta.id + '/' + fastq_2.getName() + def long_reads = meta.is_fasta ? 
file(params.outdir).toString() + '/' + meta.id + '/' + fasta.getName() : "" [sample: sample, run: run, group: group, short_reads_1: short_reads_1, short_reads_2: short_reads_2, long_reads: long_reads] } - .tap { ch_colnames } + .view() + .tap{ ch_list_for_samplesheet_all } + .filter{ it.short_reads_1!="" } + .branch{ + se: it.short_reads_2 =="" + pe: true + } + + // Throw a warning that only long reads are not supported yet by MAG + ch_list_for_samplesheet_all + .filter{ it.long_reads !="" && it.short_reads_1=="" } + .collect{ log.warn("Standalone long reads are not yet supported by the nf-core/mag pipeline and ARE REMOVED from the samplesheet 'mag-se.csv' \n sample: ${it.sample}" )} + + channelToSamplesheet(ch_list_for_samplesheet.pe,"${params.outdir}/downstream_samplesheets/mag-pe", format) + channelToSamplesheet(ch_list_for_samplesheet.se, "${params.outdir}/downstream_samplesheets/mag-se", format) - channelToSamplesheet(ch_list_for_samplesheet,"${params.outdir}/downstream_samplesheets/mag", format) } workflow GENERATE_DOWNSTREAM_SAMPLESHEETS { From 1e00c4c1f1a01bbdf5a003d9f793d7cfa770789f Mon Sep 17 00:00:00 2001 From: Sofia Stamouli Date: Thu, 17 Oct 2024 13:37:22 +0200 Subject: [PATCH 21/25] LintinG --- subworkflows/local/generate_downstream_samplesheets/main.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/subworkflows/local/generate_downstream_samplesheets/main.nf b/subworkflows/local/generate_downstream_samplesheets/main.nf index 635fba4c..08ed226e 100644 --- a/subworkflows/local/generate_downstream_samplesheets/main.nf +++ b/subworkflows/local/generate_downstream_samplesheets/main.nf @@ -11,7 +11,6 @@ workflow SAMPLESHEET_MAG { format_sep = ',' ch_list_for_samplesheet = ch_processed_reads.view() - //.filter { meta, sample_id, instrument_platform,fastq_1,fastq_2,fasta -> (fastq_1 && fastq_2) && !fasta } .map { meta, sample_id, instrument_platform,fastq_1,fastq_2,fasta -> def sample = meta.id From f79fc7d1637fe59ce1eb4e38ace565751dfcf8c3 Mon Sep 17 00:00:00 2001 From: nf-core-bot Date: Tue, 22 Oct 2024 09:28:23 +0000 Subject: [PATCH 22/25] [automated] Fix code linting --- docs/output.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/output.md b/docs/output.md index 53b74719..80f61d7c 100644 --- a/docs/output.md +++ b/docs/output.md @@ -758,7 +758,7 @@ The pipeline can also generate input files for the following downstream pipeline Output files - `downstream_samplesheets/` - - `mag.csv`: input sheet for that contains paths to preprocessed FASTQs (corresponding to what is saved with `--save_analysis_ready_fastqs`) that can be used to skip read preprocessing steps in nf-core/mag + - `mag.csv`: input sheet for that contains paths to preprocessed FASTQs (corresponding to what is saved with `--save_analysis_ready_fastqs`) that can be used to skip read preprocessing steps in nf-core/mag From 7a2517896f66398069d80cfe373b444a2d3ef180 Mon Sep 17 00:00:00 2001 From: "James A. 
Fellows Yates" Date: Thu, 24 Oct 2024 10:16:54 +0200 Subject: [PATCH 23/25] Apply suggestions from code review --- .../local/generate_downstream_samplesheets/main.nf | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/subworkflows/local/generate_downstream_samplesheets/main.nf b/subworkflows/local/generate_downstream_samplesheets/main.nf index 08ed226e..4a73e312 100644 --- a/subworkflows/local/generate_downstream_samplesheets/main.nf +++ b/subworkflows/local/generate_downstream_samplesheets/main.nf @@ -15,24 +15,24 @@ workflow SAMPLESHEET_MAG { meta, sample_id, instrument_platform,fastq_1,fastq_2,fasta -> def sample = meta.id def run = meta.run_accession //this should be optional - def group = "" - def short_reads_1 = meta.single_end ? "": file(params.outdir).toString() + '/' + meta.id + '/' + fastq_1.getName() - def short_reads_2 = meta.single_end ? "": file(params.outdir).toString() + '/' + meta.id + '/' + fastq_2.getName() + def group = "" + def short_reads_1 = file(params.outdir).toString() + '/' + meta.id + '/' + fastq_1.getName() + def short_reads_2 = meta.single_end ? "" : file(params.outdir).toString() + '/' + meta.id + '/' + fastq_2.getName() def long_reads = meta.is_fasta ? file(params.outdir).toString() + '/' + meta.id + '/' + fasta.getName() : "" [sample: sample, run: run, group: group, short_reads_1: short_reads_1, short_reads_2: short_reads_2, long_reads: long_reads] } .view() .tap{ ch_list_for_samplesheet_all } - .filter{ it.short_reads_1!="" } + .filter{ it.short_reads_1 != "" } .branch{ - se: it.short_reads_2 =="" + se: it.short_reads_2 == "" pe: true } // Throw a warning that only long reads are not supported yet by MAG ch_list_for_samplesheet_all - .filter{ it.long_reads !="" && it.short_reads_1=="" } - .collect{ log.warn("Standalone long reads are not yet supported by the nf-core/mag pipeline and ARE REMOVED from the samplesheet 'mag-se.csv' \n sample: ${it.sample}" )} + .filter{ it.long_reads != "" && it.short_reads_1 == "" } + .collect{ log.warn("[nf-core/taxprofiler] WARNING: Standalone long reads are not yet supported by the nf-core/mag pipeline and will not be in present in `mag-*.csv`. Sample: ${it.sample}" )} channelToSamplesheet(ch_list_for_samplesheet.pe,"${params.outdir}/downstream_samplesheets/mag-pe", format) channelToSamplesheet(ch_list_for_samplesheet.se, "${params.outdir}/downstream_samplesheets/mag-se", format) From b729483bd4dfc50af8b47d4f238e49435812526c Mon Sep 17 00:00:00 2001 From: "James A. 
Fellows Yates" Date: Thu, 24 Oct 2024 09:40:40 +0000 Subject: [PATCH 24/25] Get the samplesheet generate to generate se reads --- docs/output.md | 4 +- .../generate_downstream_samplesheets/main.nf | 58 +++++++++---------- .../utils_nfcore_taxprofiler_pipeline/main.nf | 12 +++- workflows/taxprofiler.nf | 2 +- 4 files changed, 43 insertions(+), 33 deletions(-) diff --git a/docs/output.md b/docs/output.md index 80f61d7c..a608b85f 100644 --- a/docs/output.md +++ b/docs/output.md @@ -758,7 +758,9 @@ The pipeline can also generate input files for the following downstream pipeline Output files - `downstream_samplesheets/` - - `mag.csv`: input sheet for that contains paths to preprocessed FASTQs (corresponding to what is saved with `--save_analysis_ready_fastqs`) that can be used to skip read preprocessing steps in nf-core/mag + - `mag-{pe,se}.csv`: input sheet for single-end and paired-end reads that contains paths to preprocessed short-read FASTQs (corresponding to what is saved with `--save_analysis_ready_fastqs`) that can be used to skip read preprocessing steps in nf-core/mag. + - Note: if you merge reads, these will be listed in teh `mag-se.csv`. + - Note: the nf-core/mag mandatory `group` column is filled with a dummy ID (`0`), you may wish to change this depending on your nf-core/mag settings! diff --git a/subworkflows/local/generate_downstream_samplesheets/main.nf b/subworkflows/local/generate_downstream_samplesheets/main.nf index 4a73e312..d4a66530 100644 --- a/subworkflows/local/generate_downstream_samplesheets/main.nf +++ b/subworkflows/local/generate_downstream_samplesheets/main.nf @@ -7,64 +7,62 @@ workflow SAMPLESHEET_MAG { ch_processed_reads main: - format = 'csv' // most common format in nf-core - format_sep = ',' + format = 'csv' - ch_list_for_samplesheet = ch_processed_reads.view() - .map { - meta, sample_id, instrument_platform,fastq_1,fastq_2,fasta -> - def sample = meta.id - def run = meta.run_accession //this should be optional - def group = "" - def short_reads_1 = file(params.outdir).toString() + '/' + meta.id + '/' + fastq_1.getName() - def short_reads_2 = meta.single_end ? "" : file(params.outdir).toString() + '/' + meta.id + '/' + fastq_2.getName() - def long_reads = meta.is_fasta ? file(params.outdir).toString() + '/' + meta.id + '/' + fasta.getName() : "" - [sample: sample, run: run, group: group, short_reads_1: short_reads_1, short_reads_2: short_reads_2, long_reads: long_reads] + ch_list_for_samplesheet = ch_processed_reads + .dump() + .map { meta, reads -> + def sample = meta.id + def run = params.perform_runmerging ? '' : meta.run_accession + def group = "0" + //this should be optional + def short_reads_1 = meta.is_fasta ? "" : file(params.outdir).toString() + '/analysis_ready_fastqs/' + reads[0].getName() + def short_reads_2 = meta.is_fasta || meta.single_end ? "" : file(params.outdir).toString() + '/analysis_ready_fastqs/' + reads[1].getName() + def long_reads = meta.is_fasta ? 
file(params.outdir).toString() + '/analysis_ready_fastqs/' + reads[0].getName() : "" + + [sample: sample, run: run, group: group, short_reads_1: short_reads_1, short_reads_2: short_reads_2, long_reads: long_reads] } - .view() - .tap{ ch_list_for_samplesheet_all } - .filter{ it.short_reads_1 != "" } - .branch{ + .tap { ch_list_for_samplesheet_all } + .filter { it.short_reads_1 != "" } + .branch { se: it.short_reads_2 == "" - pe: true - } + pe: it.short_reads_2 != "" + unknown: true + } // Throw a warning that only long reads are not supported yet by MAG ch_list_for_samplesheet_all - .filter{ it.long_reads != "" && it.short_reads_1 == "" } - .collect{ log.warn("[nf-core/taxprofiler] WARNING: Standalone long reads are not yet supported by the nf-core/mag pipeline and will not be in present in `mag-*.csv`. Sample: ${it.sample}" )} + .filter { it.long_reads != "" && it.short_reads_1 == "" } + .collect { log.warn("[nf-core/taxprofiler] WARNING: Standalone long reads are not yet supported by the nf-core/mag pipeline and will not be in present in `mag-*.csv`. Sample: ${it.sample}") } - channelToSamplesheet(ch_list_for_samplesheet.pe,"${params.outdir}/downstream_samplesheets/mag-pe", format) + channelToSamplesheet(ch_list_for_samplesheet.pe, "${params.outdir}/downstream_samplesheets/mag-pe", format) channelToSamplesheet(ch_list_for_samplesheet.se, "${params.outdir}/downstream_samplesheets/mag-se", format) - } workflow GENERATE_DOWNSTREAM_SAMPLESHEETS { - take: ch_processed_reads main: def downstreampipeline_names = params.generate_pipeline_samplesheets.split(",") - if ( downstreampipeline_names.contains('mag') && params.save_analysis_ready_fastqs) { + if (downstreampipeline_names.contains('mag') && params.save_analysis_ready_fastqs) { SAMPLESHEET_MAG(ch_processed_reads) } - } // Constructs the header string and then the strings of each row, and def channelToSamplesheet(ch_list_for_samplesheet, path, format) { - format_sep = ["csv":",", "tsv":"\t", "txt":"\t"][format] + def format_sep = [csv: ",", tsv: "\t", txt: "\t"][format] - ch_header = ch_list_for_samplesheet + def ch_header = ch_list_for_samplesheet ch_header .first() - .map{ it.keySet().join(format_sep) } - .concat( ch_list_for_samplesheet.map{ it.values().join(format_sep) }) + .map { it.keySet().join(format_sep) } + .concat(ch_list_for_samplesheet.map { it.values().join(format_sep) }) .collectFile( - name:"${path}.${format}", + name: "${path}.${format}", newLine: true, sort: false ) diff --git a/subworkflows/local/utils_nfcore_taxprofiler_pipeline/main.nf b/subworkflows/local/utils_nfcore_taxprofiler_pipeline/main.nf index 9b4f6df5..e93e9942 100644 --- a/subworkflows/local/utils_nfcore_taxprofiler_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_taxprofiler_pipeline/main.nf @@ -149,7 +149,17 @@ workflow PIPELINE_COMPLETION { // def validateInputParameters() { genomeExistsError() -}// + + if (params.generate_downstream_samplesheets && !params.generate_pipeline_samplesheets) { + error('[nf-core/taxprofiler] ERROR: If supplying `--generate_downstream_samplesheets`, you must also specify which pipeline to generate for with `--generate_pipeline_samplesheets`! 
Check input.') + } + + if ( params.generate_downstream_samplesheets && params.generate_pipeline_samplesheets.split(",").contains('mag') && !params.save_analysis_ready_fastqs ) { + error("[nf-core/taxprofiler] ERROR: To generate downstream samplesheets for nf-core/mag, you must also specify `--save_analysis_ready_fastqs`") + } +} + +// // Validate channels from input samplesheet // def validateInputSamplesheet(input) { diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 8a47b3c6..619d3916 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -334,7 +334,7 @@ workflow TAXPROFILER { // Samplesheet generation // if ( params.generate_downstream_samplesheets ) { - GENERATE_DOWNSTREAM_SAMPLESHEETS ( samplesheet ) + GENERATE_DOWNSTREAM_SAMPLESHEETS ( ch_reads_runmerged ) } // From 6eeb98265c27938dd3329970dc23ceb36859c1e4 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Thu, 24 Oct 2024 10:42:46 +0000 Subject: [PATCH 25/25] Fix run column --- docs/output.md | 4 ++-- .../local/generate_downstream_samplesheets/main.nf | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/docs/output.md b/docs/output.md index a608b85f..20eddac6 100644 --- a/docs/output.md +++ b/docs/output.md @@ -759,8 +759,8 @@ The pipeline can also generate input files for the following downstream pipeline - `downstream_samplesheets/` - `mag-{pe,se}.csv`: input sheet for single-end and paired-end reads that contains paths to preprocessed short-read FASTQs (corresponding to what is saved with `--save_analysis_ready_fastqs`) that can be used to skip read preprocessing steps in nf-core/mag. - - Note: if you merge reads, these will be listed in teh `mag-se.csv`. - - Note: the nf-core/mag mandatory `group` column is filled with a dummy ID (`0`), you may wish to change this depending on your nf-core/mag settings! + - Note: if you supplied `--shortread_qc_mergepairs`, all files will be listed in `mag-se.csv` as single end and no `mag-pe.csv` will be generated. + - Note: the nf-core/mag mandatory `group` column is filled with a dummy ID (`0`), you may wish to change this depending on your nf-core/mag settings. diff --git a/subworkflows/local/generate_downstream_samplesheets/main.nf b/subworkflows/local/generate_downstream_samplesheets/main.nf index d4a66530..c2f70f8d 100644 --- a/subworkflows/local/generate_downstream_samplesheets/main.nf +++ b/subworkflows/local/generate_downstream_samplesheets/main.nf @@ -10,7 +10,12 @@ workflow SAMPLESHEET_MAG { format = 'csv' ch_list_for_samplesheet = ch_processed_reads - .dump() + .filter { meta, reads -> + if (meta.instrument_platform != 'ILLUMINA') { + log.warn("[nf-core/taxprofiler] WARNING: Only Illumina short-reads are supported by the nf-core/mag pipeline. The following sample will not be in present in `mag-*.csv`: ${meta.id}") + } + meta.instrument_platform == 'ILLUMINA' + } .map { meta, reads -> def sample = meta.id def run = params.perform_runmerging ? '' : meta.run_accession @@ -20,7 +25,12 @@ workflow SAMPLESHEET_MAG { def short_reads_2 = meta.is_fasta || meta.single_end ? "" : file(params.outdir).toString() + '/analysis_ready_fastqs/' + reads[1].getName() def long_reads = meta.is_fasta ? 
file(params.outdir).toString() + '/analysis_ready_fastqs/' + reads[0].getName() : "" - [sample: sample, run: run, group: group, short_reads_1: short_reads_1, short_reads_2: short_reads_2, long_reads: long_reads] + if (params.perform_runmerging) { + [sample: sample, group: group, short_reads_1: short_reads_1, short_reads_2: short_reads_2, long_reads: long_reads] + } + else { + [sample: sample, run: run, group: group, short_reads_1: short_reads_1, short_reads_2: short_reads_2, long_reads: long_reads] + } } .tap { ch_list_for_samplesheet_all } .filter { it.short_reads_1 != "" }
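
The rows that pass the `short_reads_1` filter above are subsequently split into separate paired-end and single-end sheets with `.branch`, as shown in the earlier hunks of this series. A minimal standalone illustration of that routing follows; the row maps are invented for demonstration and do not come from the pipeline.

    // Sketch only: .branch assigns each item to the first label whose predicate is true.
    workflow {
        ch_rows = Channel.of(
            [sample: 'sampleA', group: '0', short_reads_1: 'a_R1.fastq.gz', short_reads_2: 'a_R2.fastq.gz', long_reads: ''],
            [sample: 'sampleB', group: '0', short_reads_1: 'b.fastq.gz',    short_reads_2: '',              long_reads: '']
        )

        ch_split = ch_rows.branch {
            se: it.short_reads_2 == ''  // no second FASTQ -> single-end sheet (mag-se.csv)
            pe: true                    // fallback label: everything else is paired-end (mag-pe.csv)
        }

        ch_split.se.view { row -> "mag-se row: ${row.sample}" }
        ch_split.pe.view { row -> "mag-pe row: ${row.sample}" }
    }

Each label becomes a named output channel, so the two sheets can then be written independently with the same `channelToSamplesheet()` helper.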