Merge pull request #13 from BCCDC-PHL/hostile-dehost
Add optional dehosting with bede/hostile
dfornika authored Sep 16, 2024
2 parents d2d498b + d7acd4a commit f498d8e
Showing 7 changed files with 349 additions and 30 deletions.
109 changes: 109 additions & 0 deletions README.md
@@ -7,6 +7,7 @@ A generic pipeline that can be run on an arbitrary set of Illumina sequence file
## Analyses

* [`fastp`](https://github.com/OpenGene/fastp): Collect sequence QC stats
* [`hostile`](https://github.com/bede/hostile): Removal of host (human) reads (optional)

## Usage

@@ -42,6 +43,90 @@ nextflow run BCCDC-PHL/basic-sequence-qc \
--outdir <output directory>
```

## Dehosting

This pipeline supports an optional dehosting step that can be used to remove human-derived sequence reads.
Removal of reads from non-human host organisms is not currently supported.

### Setup

Dehosting is performed by aligning reads against the human reference genome and removing
reads that align with sufficient quality and specificity. Before using this pipeline to
perform dehosting, the human reference genome(s) must be downloaded.
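The rule just described can be illustrated with a small sketch. This is only an illustration of the idea, not hostile's actual implementation; the `MIN_HOST_MAPQ` cutoff and the record fields below are invented for the example:

```python
# Illustrative sketch of host-read filtering: keep only reads that do NOT
# align to the host genome with sufficient confidence. Threshold and record
# layout are invented for illustration; hostile's internals differ.

MIN_HOST_MAPQ = 30  # hypothetical mapping-quality cutoff for calling a read "host"

def is_host_read(alignment):
    """Return True if the read aligned to the host reference confidently."""
    return alignment["mapped"] and alignment["mapq"] >= MIN_HOST_MAPQ

def dehost(alignments):
    """Yield the IDs of reads that survive host filtering."""
    for aln in alignments:
        if not is_host_read(aln):
            yield aln["read_id"]

alignments = [
    {"read_id": "r1", "mapped": True,  "mapq": 60},  # confident host hit: removed
    {"read_id": "r2", "mapped": False, "mapq": 0},   # unmapped: kept
    {"read_id": "r3", "mapped": True,  "mapq": 5},   # low-confidence hit: kept
]
print(list(dehost(alignments)))  # ['r2', 'r3']
```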

1. Activate the conda environment that is used by this pipeline for dehosting.

```
conda activate basic-sequence-qc-dehosting-6e4260b30b21d1dbd469b1b1e0f20628
```

2. Download (fetch) the default reference genome (`human-t2t-hla`):

```
hostile fetch
```

The reference will be downloaded to:

```
~/.local/share/hostile
```

If an alternative version of the reference genome is needed, download it by name:

```
hostile fetch --name <REF_NAME>
```

Details on available references can be found on [the bede/hostile README](https://github.com/bede/hostile?tab=readme-ov-file#indexes).
The list of available reference names can be found by running:

```
hostile fetch --list
```

...which should return a list like this:

```
human-t2t-hla
human-t2t-hla-argos985
human-t2t-hla-argos985-mycob140
human-t2t-hla.rs-viral-202401_ml-phage-202401
human-t2t-hla.argos-bacteria-985_rs-viral-202401_ml-phage-202401
```

3. Deactivate the conda environment.

```
conda deactivate
```

### Performing Dehosting

To include the dehosting step in the pipeline analysis, add the `--dehost` flag:

```
nextflow run BCCDC-PHL/basic-sequence-qc \
-profile conda \
--cache ~/.conda/envs \
--fastq_input /path/to/fastq_input \
--dehost \
  --outdir /path/to/output-dir
```

By default, the `human-t2t-hla` reference will be used. To use an alternative reference, include the `--dehosting_index` flag along
with the reference name:

```
nextflow run BCCDC-PHL/basic-sequence-qc \
-profile conda \
--cache ~/.conda/envs \
--fastq_input /path/to/fastq_input \
--dehost \
  --dehosting_index human-t2t-hla-argos985-mycob140 \
  --outdir /path/to/output-dir
```

## Output

A single output file in .csv format will be created in the directory specified by `--outdir`. The filename will be `basic_qc_stats.csv`.
@@ -72,3 +157,27 @@ gc_content_after_filtering
adapter_trimmed_reads
adapter_trimmed_bases
```
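The stats file can be consumed with the standard `csv` module. A minimal sketch, using an in-memory stand-in with a subset of the columns listed above (a real `basic_qc_stats.csv` contains the full column set):

```python
import csv
import io

# In-memory stand-in for basic_qc_stats.csv, using a subset of its columns.
example = io.StringIO(
    "sample_id,total_reads_before_filtering,total_reads_after_filtering\n"
    "sample-01,1000000,987654\n"
)

rows = list(csv.DictReader(example))
for row in rows:
    # Fraction of reads surviving filtering, from the before/after read counts.
    retained = int(row["total_reads_after_filtering"]) / int(row["total_reads_before_filtering"])
    print(f"{row['sample_id']}: {retained:.1%} of reads retained")
```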

### Dehosted Reads

If dehosting is performed, dehosted reads will be deposited under the directory supplied for the `--outdir` param, in a sub-directory
for each sample, named using the sample ID. For example:

```
outdir
├── sample-01
│ ├── sample-01_dehosted_R1.fastq.gz
│ ├── sample-01_dehosted_R2.fastq.gz
│ ├── sample-01_dehosted_fastp.csv
│ ├── sample-01_fastp.csv
│ └── sample-01_hostile.log.json
├── sample-02
│ ├── sample-02_dehosted_R1.fastq.gz
│ ├── sample-02_dehosted_R2.fastq.gz
│ ├── sample-02_dehosted_fastp.csv
│ ├── sample-02_fastp.csv
│ └── sample-02_hostile.log.json
├── sample-03
│ ├── ...
...
```
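The per-sample layout above can be walked programmatically. A sketch that recreates the layout in a temporary directory and collects each sample's dehosted read pair (file names follow the pattern shown above):

```python
import tempfile
from pathlib import Path

# Recreate the output layout above in a temporary directory, then collect
# the dehosted R1/R2 pair from each per-sample sub-directory.
with tempfile.TemporaryDirectory() as tmp:
    outdir = Path(tmp)
    for sample_id in ("sample-01", "sample-02"):
        sample_dir = outdir / sample_id
        sample_dir.mkdir()
        for read in ("R1", "R2"):
            (sample_dir / f"{sample_id}_dehosted_{read}.fastq.gz").touch()

    pairs = {}
    for sample_dir in sorted(p for p in outdir.iterdir() if p.is_dir()):
        reads = sorted(sample_dir.glob(f"{sample_dir.name}_dehosted_R[12].fastq.gz"))
        if len(reads) == 2:
            pairs[sample_dir.name] = [r.name for r in reads]

print(pairs["sample-01"])
```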
123 changes: 123 additions & 0 deletions bin/combine_fastp_reports.py
@@ -0,0 +1,123 @@
#!/usr/bin/env python3

import argparse
import csv
import json
import sys

from pathlib import Path

def parse_fastp_csv(fastp_csv_path: Path):
    """
    Parse a fastp CSV report, coercing known numeric fields to int or float.

    :param fastp_csv_path: Path to a fastp CSV report
    :return: Parsed fastp report, keyed by field name
    :rtype: dict
    """
    fastp_report = {}
    int_fields = [
        'total_reads_before_filtering',
        'total_reads_after_filtering',
        'total_bases_before_filtering',
        'total_bases_after_filtering',
        'read1_mean_length_before_filtering',
        'read1_mean_length_after_filtering',
        'read2_mean_length_before_filtering',
        'read2_mean_length_after_filtering',
        'q20_bases_before_filtering',
        'q30_bases_before_filtering',
        'q20_bases_after_filtering',
        'q30_bases_after_filtering',
        'adapter_trimmed_reads',
        'adapter_trimmed_bases',
    ]
    float_fields = [
        'q20_rate_before_filtering',
        'q30_rate_before_filtering',
        'q20_rate_after_filtering',
        'q30_rate_after_filtering',
        'gc_content_before_filtering',
        'gc_content_after_filtering',
    ]
    with open(fastp_csv_path, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            for field, value in row.items():
                if field in int_fields:
                    try:
                        fastp_report[field] = int(value)
                    except ValueError:
                        fastp_report[field] = None
                elif field in float_fields:
                    try:
                        fastp_report[field] = float(value)
                    except ValueError:
                        fastp_report[field] = None
                else:
                    fastp_report[field] = value

    return fastp_report
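The per-field try/except coercion above can be condensed into a small helper. This is a sketch for illustration, not part of the script:

```python
def coerce(value, caster):
    """Cast value with caster, returning None when the input is unparseable."""
    try:
        return caster(value)
    except ValueError:
        return None

# Mirrors how parse_fastp_csv treats numeric fields:
print(coerce("42", int))      # 42
print(coerce("0.51", float))  # 0.51
print(coerce("", int))        # None
```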


def combine_fastp_reports(pre_dehosting_report, post_dehosting_report):
    """
    Combine pre- and post-dehosting fastp reports into a single record.

    'before_filtering' fields are taken from the pre-dehosting report,
    'after_filtering' fields from the post-dehosting report, and the
    adapter-trimming counters are summed across the two reports.

    :param pre_dehosting_report: Parsed fastp report from before dehosting
    :param post_dehosting_report: Parsed fastp report from after dehosting
    :return: Combined fastp report
    :rtype: dict
    """
    combined_report = {}

    combined_report['sample_id'] = pre_dehosting_report['sample_id']

    for key, value in pre_dehosting_report.items():
        if key.endswith('before_filtering'):
            combined_report[key] = value
    for key, value in post_dehosting_report.items():
        if key.endswith('after_filtering'):
            combined_report[key] = value

    pre_dehosting_adapter_trimmed_reads = pre_dehosting_report.get('adapter_trimmed_reads', 0)
    post_dehosting_adapter_trimmed_reads = post_dehosting_report.get('adapter_trimmed_reads', 0)
    combined_report['adapter_trimmed_reads'] = pre_dehosting_adapter_trimmed_reads + post_dehosting_adapter_trimmed_reads

    pre_dehosting_adapter_trimmed_bases = pre_dehosting_report.get('adapter_trimmed_bases', 0)
    post_dehosting_adapter_trimmed_bases = post_dehosting_report.get('adapter_trimmed_bases', 0)
    combined_report['adapter_trimmed_bases'] = pre_dehosting_adapter_trimmed_bases + post_dehosting_adapter_trimmed_bases

    return combined_report
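The merge rule can be exercised with dummy reports. Below is a simplified re-statement of the function above, applied to two hand-made records (field values are invented for the example):

```python
# Demonstrates the merge rule: "before_filtering" fields come from the
# pre-dehosting report, "after_filtering" fields from the post-dehosting
# report, and the adapter-trimming counters are summed across the two runs.
def combine(pre, post):
    combined = {"sample_id": pre["sample_id"]}
    combined.update({k: v for k, v in pre.items() if k.endswith("before_filtering")})
    combined.update({k: v for k, v in post.items() if k.endswith("after_filtering")})
    for key in ("adapter_trimmed_reads", "adapter_trimmed_bases"):
        combined[key] = pre.get(key, 0) + post.get(key, 0)
    return combined

pre = {"sample_id": "s1", "total_reads_before_filtering": 1000,
       "total_reads_after_filtering": 980, "adapter_trimmed_reads": 10}
post = {"sample_id": "s1", "total_reads_before_filtering": 900,
        "total_reads_after_filtering": 850, "adapter_trimmed_reads": 4}

result = combine(pre, post)
print(result["total_reads_before_filtering"],  # 1000, from the pre report
      result["total_reads_after_filtering"],   # 850, from the post report
      result["adapter_trimmed_reads"])         # 14, summed across both
```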


def main(args):
    pre_dehosting_report = parse_fastp_csv(Path(args.pre_dehosting))
    post_dehosting_report = parse_fastp_csv(Path(args.post_dehosting))

    combined_report = combine_fastp_reports(pre_dehosting_report, post_dehosting_report)

    output_fieldnames = [
        'sample_id',
        'total_reads_before_filtering',
        'total_reads_after_filtering',
        'total_bases_before_filtering',
        'total_bases_after_filtering',
        'read1_mean_length_before_filtering',
        'read1_mean_length_after_filtering',
        'read2_mean_length_before_filtering',
        'read2_mean_length_after_filtering',
        'q20_bases_before_filtering',
        'q20_bases_after_filtering',
        'q20_rate_before_filtering',
        'q20_rate_after_filtering',
        'q30_bases_before_filtering',
        'q30_bases_after_filtering',
        'q30_rate_before_filtering',
        'q30_rate_after_filtering',
        'gc_content_before_filtering',
        'gc_content_after_filtering',
        'adapter_trimmed_reads',
        'adapter_trimmed_bases',
    ]
    writer = csv.DictWriter(sys.stdout, fieldnames=output_fieldnames, dialect='unix', quoting=csv.QUOTE_MINIMAL, extrasaction='ignore')
    writer.writeheader()
    writer.writerow(combined_report)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--pre-dehosting', required=True, help='Path to pre-dehosting fastp report')
    parser.add_argument('--post-dehosting', required=True, help='Path to post-dehosting fastp report')
    args = parser.parse_args()
    main(args)
8 changes: 8 additions & 0 deletions environments/dehosting.yml
@@ -0,0 +1,8 @@
name: basic-sequence-qc-dehosting
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- python=3
- hostile=1.1.0
21 changes: 18 additions & 3 deletions main.nf
@@ -2,7 +2,11 @@

nextflow.enable.dsl = 2

include { fastp } from './modules/fastp.nf'
include { fastp } from './modules/fastp.nf'
include { fastp as fastp_dehosted } from './modules/fastp.nf'
include { dehost } from './modules/dehosting.nf'
include { combine_fastp_reports } from './modules/dehosting.nf'


workflow {
if (params.samplesheet_input != 'NO_FILE') {
@@ -13,9 +17,20 @@ workflow {

main:

fastp(ch_fastq_input)
fastp(ch_fastq_input.combine(Channel.of("")))

if (params.dehost) {
dehost(ch_fastq_input)
fastp_dehosted(dehost.out.dehosted_reads.combine(Channel.of("_dehosted")))
combine_fastp_reports(fastp.out.metrics.join(fastp_dehosted.out.metrics))
}

output_prefix = params.prefix == '' ? params.prefix : params.prefix + '_'
fastp.out.collectFile(keepHeader: true, sort: { it.text }, name: "${output_prefix}basic_qc_stats.csv", storeDir: "${params.outdir}")

if (params.dehost) {
combine_fastp_reports.out.metrics.map{ it -> it[1] }.collectFile(keepHeader: true, sort: { it.text }, name: "${output_prefix}basic_qc_stats.csv", storeDir: "${params.outdir}")
} else {
fastp.out.metrics.map{ it -> it[1] }.collectFile(keepHeader: true, sort: { it.text }, name: "${output_prefix}basic_qc_stats.csv", storeDir: "${params.outdir}")
}

}
48 changes: 48 additions & 0 deletions modules/dehosting.nf
@@ -0,0 +1,48 @@
process dehost {

    tag { sample_id }

    publishDir "${params.outdir}/${sample_id}", pattern: "${sample_id}_dehosted_R{1,2}.fastq.gz", mode: 'copy'
    publishDir "${params.outdir}/${sample_id}", pattern: "${sample_id}_hostile.log.json", mode: 'copy'

    input:
    tuple val(sample_id), path(reads)

    output:
    tuple val(sample_id), path("${sample_id}_hostile.log.json"), emit: hostile_log
    tuple val(sample_id), path("${sample_id}_dehosted*.fastq.gz"), emit: dehosted_reads

    script:
    """
    hostile clean \
        --threads ${task.cpus} \
        --fastq1 ${reads[0]} \
        --fastq2 ${reads[1]} \
        --index ${params.dehosting_index} \
        --out-dir . \
        > ${sample_id}_hostile.log.json
    mv ${sample_id}*.clean_1.fastq.gz ${sample_id}_dehosted_R1.fastq.gz
    mv ${sample_id}*.clean_2.fastq.gz ${sample_id}_dehosted_R2.fastq.gz
    """
}


process combine_fastp_reports {

    tag { sample_id }

    input:
    tuple val(sample_id), path(fastp_pre_dehosting), path(fastp_post_dehosting)

    output:
    tuple val(sample_id), path("${sample_id}_fastp_combined.csv"), emit: metrics

    script:
    """
    combine_fastp_reports.py \
        --pre-dehosting ${fastp_pre_dehosting} \
        --post-dehosting ${fastp_post_dehosting} \
        > ${sample_id}_fastp_combined.csv
    """
}
10 changes: 6 additions & 4 deletions modules/fastp.nf
@@ -2,11 +2,13 @@ process fastp {

tag { sample_id }

publishDir "${params.outdir}/${sample_id}", pattern: "${sample_id}*_fastp.csv", mode: 'copy', enabled: params.dehost

input:
tuple val(sample_id), path(reads)
tuple val(sample_id), path(reads), val(suffix)

output:
path("${sample_id}_fastp.csv")
tuple val(sample_id), path("${sample_id}${suffix}_fastp.csv"), emit: metrics


script:
@@ -18,6 +20,6 @@ process fastp {
--cut_tail \
-j ${sample_id}_fastp.json
fastp_json_to_csv.py -s ${sample_id} ${sample_id}_fastp.json > ${sample_id}_fastp.csv
fastp_json_to_csv.py -s ${sample_id} ${sample_id}_fastp.json > ${sample_id}${suffix}_fastp.csv
"""
}
}