Merge pull request #61 from genepi/Adapt-nextlow.config-for-singularity
Merge latest version with adapted clustering strategy
AmstlerStephan authored Mar 6, 2024
2 parents bd793cd + 59c04cd commit 1f08091
Showing 27 changed files with 511 additions and 582 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -6,7 +6,7 @@ Umi-pipeline-nf
 
 **Umi-pipeline-nf** creates highly accurate single-molecule consensus sequences for unique molecular identifier (UMI)-tagged amplicons from nanopore sequencing data.
 The pipeline can be run on the whole fastq_pass folder of your nanopore run and, by default, outputs the aligned consensus sequences of each UMI cluster as a BAM file. The optional variant calling step creates a VCF file for all variants found in the consensus sequences.
-umi-pipeline-nf is inspired by a snakemake-based analysis pipeline ([ONT UMI analysis pipeline](https://github.com/nanoporetech/pipeline-umi-amplicon); originally developed by [Karst et al, Nat Biotechnol 18:165–169, 2021](https://www.nature.com/articles/s41592-020-01041-y)). We implemented the pipeline in [Nextflow](https://www.nextflow.io) and included several optimizations and [additional functionalities](#main-adaptations).
+umi-pipeline-nf is inspired by a snakemake-based analysis pipeline ([ONT UMI analysis pipeline](https://github.com/nanoporetech/pipeline-umi-amplicon); originally developed by [Karst et al, Nat Methods 18:165–169, 2021](https://www.nature.com/articles/s41592-020-01041-y)). We migrated the pipeline to [Nextflow](https://www.nextflow.io) and included several optimizations and [additional functionalities](#main-adaptations).
 
 ![Workflow](docs/images/umi-pipeline-nf_metro-map.svg)

@@ -26,7 +26,7 @@ umi-pipeline-nf is inspired by a snakemake-based analysis pipeline
 
 ## Main Adaptations
 
-* It comes with docker containers making **installation simple, portable** and **results highly reproducible**.
+* It comes with a docker/singularity container making **installation simple, easy to use on clusters** and **results highly reproducible**.
 * The pipeline is **optimized for parallelization**.
 * **Additional UMI cluster splitting** step to remove admixed UMI clusters.
 * Read filtering strategy per UMI cluster was adapted to **preserve the highest quality reads**.
70 changes: 49 additions & 21 deletions bin/extract_umis.py
@@ -45,10 +45,10 @@ def parse_args(argv):
         help="Max edit distance for UMI",
     )
     parser.add_argument(
-        "--adapter-length",
+        "--adapter_length",
         dest="ADAPTER_LENGTH",
         type=int,
-        default=250,
+        default=200,
         help="Length of adapter",
     )
     parser.add_argument(
@@ -106,10 +106,27 @@ def parse_args(argv):
     return args
 
 
-def extract_umi(query_seq, query_qual, pattern, max_edit_dist, format):
+def clip_entry(entry, umi_start_fwd, umi_start_rev, adapter_length, format):
+    clip_fwd = umi_start_fwd
+    if adapter_length == umi_start_rev:
+        clip_rev = None
+    else:
+        clip_rev = umi_start_rev - adapter_length
+
+    entry.sequence = entry.sequence[clip_fwd:clip_rev]
+
+    if format == "fastq":
+        entry.quality = entry.quality[clip_fwd:clip_rev]
+
+    return entry
+
+
+def extract_umi(query_seq, query_qual, pattern, max_edit_dist, format, direction):
     umi_qual = None
     equalities = [("M", "A"), ("M", "C"), ("R", "A"), ("R", "G"), ("W", "A"), ("W", "T"), ("S", "C"), ("S", "G"), ("Y", "C"), ("Y", "T"), ("K", "G"), ("K", "T"), ("V", "A"), ("V", "C"),
-                  ("V", "G"), ("H", "A"), ("H", "C"), ("H", "T"), ("D", "A"), ("D", "G"), ("D", "T"), ("B", "C"), ("B", "G"), ("B", "T"), ("N", "A"), ("N", "C"), ("N", "G"), ("N", "T")]
+                  ("V", "G"), ("H", "A"), ("H", "C"), ("H", "T"), ("D", "A"), ("D", "G"), ("D", "T"), ("B", "C"), ("B", "G"), ("B", "T"), ("N", "A"), ("N", "C"), ("N", "G"), ("N", "T"),
+                  ("m", "a"), ("m", "c"), ("r", "a"), ("r", "g"), ("w", "a"), ("w", "t"), ("s", "c"), ("s", "g"), ("y", "c"), ("y", "t"), ("k", "g"), ("k", "t"), ("v", "a"), ("v", "c"),
+                  ("v", "g"), ("h", "a"), ("h", "c"), ("h", "t"), ("d", "a"), ("d", "g"), ("d", "t"), ("b", "c"), ("b", "g"), ("b", "t"), ("n", "a"), ("n", "c"), ("n", "g"), ("n", "t"),
+                  ("a", "A"), ("c", "C"), ("t", "T"), ("g", "G")]
 
     result = edlib.align(
         pattern,
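The new clip_entry helper trims each read down to its UMI-to-UMI span: for the forward hit extract_umi reports the UMI start offset inside the 5' window, for the reverse hit it reports the UMI end offset inside the 3' window, and clip_entry turns that reverse offset into a negative Python index (or None when the reverse UMI ends flush with the read end). A minimal sketch of the slicing, with invented toy values rather than pipeline data:

```python
# Toy illustration of clip_entry's slicing; all values are invented.
adapter_length = 200
umi_start_fwd = 55    # fwd UMI starts 55 bases into the read
umi_start_rev = 160   # rev UMI ends 160 bases into the final 200-base window

clip_fwd = umi_start_fwd
# 160 - 200 == -40 drops the 40 bases trailing the reverse UMI; None keeps
# the tail when the reverse UMI ends exactly at the read end.
clip_rev = None if umi_start_rev == adapter_length else umi_start_rev - adapter_length

sequence = "A" * 1000  # stand-in 1 kb read
clipped = sequence[clip_fwd:clip_rev]
assert len(clipped) == 1000 - 55 - 40
```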
@@ -120,31 +137,38 @@ def extract_umi(query_seq, query_qual, pattern, max_edit_dist, format):
         additionalEqualities=equalities,
     )
     if result["editDistance"] == -1:
-        return None, None, None
+        return None, None, None, None
 
     edit_dist = result["editDistance"]
     locs = result["locations"][0]
-    umi = query_seq[locs[0]:locs[1]+1]
+    umi_start_pos = locs[0]
+    umi_end_pos = locs[1] + 1
+    umi = query_seq[umi_start_pos:umi_end_pos]
+
+    if direction == "fwd":
+        umi_start = umi_start_pos
+    else:
+        umi_start = umi_end_pos
 
     if format == "fastq":
         umi_qual = query_qual[locs[0]:locs[1]+1]
 
-    return edit_dist, umi, umi_qual
+    return edit_dist, umi, umi_qual, umi_start
 
 
-def extract_adapters(entry, max_adapter_length, format):
+def extract_adapters(entry, adapter_length, format):
     read_5p_seq = None
     read_5p_qual = None
     read_3p_seq = None
     read_3p_qual = None
 
-    if len(entry.sequence) > max_adapter_length:
-        read_5p_seq = entry.sequence[:max_adapter_length]
-        read_3p_seq = entry.sequence[-max_adapter_length:]
+    if len(entry.sequence) > adapter_length:
+        read_5p_seq = entry.sequence[:adapter_length]
+        read_3p_seq = entry.sequence[-adapter_length:]
 
         if format == "fastq":
-            read_5p_qual = entry.quality[:max_adapter_length]
-            read_3p_qual = entry.quality[-max_adapter_length:]
+            read_5p_qual = entry.quality[:adapter_length]
+            read_3p_qual = entry.quality[-adapter_length:]
 
     return read_5p_seq, read_3p_seq, read_5p_qual, read_3p_qual
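The equalities table tells edlib which IUPAC ambiguity codes in the UMI pattern are allowed to match which concrete bases, and this commit extends it with lowercase pairs so case differences no longer count as edits. Below is a self-contained sketch of such a call; the pattern and target window are made up, and the mode/task/k arguments (elided from this hunk) are typical values for an infix UMI search rather than quotes from the script:

```python
import edlib  # pip install edlib

# "N" may match any base; the pipeline's full table also covers M/R/W/S/Y/K/
# V/H/D/B and, after this commit, their lowercase forms.
equalities = [("N", "A"), ("N", "C"), ("N", "G"), ("N", "T")]

pattern = "TTTNNNNNTTT"         # hypothetical UMI pattern, not the real layout
window = "ACGATTTGCACGTTTAGGT"  # hypothetical 5' adapter window

result = edlib.align(
    pattern,
    window,
    mode="HW",         # infix alignment: pattern may start anywhere in window
    task="locations",  # also report where the best hit lies
    k=3,               # max edit distance, like max_pattern_dist
    additionalEqualities=equalities,
)
assert result["editDistance"] == 0   # -1 would mean no hit within k edits
start, end = result["locations"][0]  # end is inclusive, hence the +1
assert window[start:end + 1] == "TTTGCACGTTT"
```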

@@ -288,7 +312,7 @@ def write_tsv(
 def extract_umis(
     args
 ):
-    max_adapter_length = args.ADAPTER_LENGTH
+    adapter_length = args.ADAPTER_LENGTH
     max_pattern_dist = args.MAX_ERROR
     output_folder = args.OUT
     tsv = args.TSV
@@ -311,7 +335,7 @@ def extract_umis(
             n_total += 1
 
             read_5p_seq, read_3p_seq, read_5p_qual, read_3p_qual = extract_adapters(
-                entry, max_adapter_length, format
+                entry, adapter_length, format
             )
 
             if read_5p_seq is None or read_3p_seq is None:
@@ -320,22 +344,26 @@ def extract_umis(
             strand_stats[strand] += 1
 
             # Extract fwd UMI
-            result_5p_fwd_umi_dist, result_5p_fwd_umi_seq, result_5p_fwd_umi_qual = extract_umi(
-                read_5p_seq, read_5p_qual, umi_fwd, max_pattern_dist, format
+            result_5p_fwd_umi_dist, result_5p_fwd_umi_seq, result_5p_fwd_umi_qual, umi_start_fwd = extract_umi(
+                read_5p_seq, read_5p_qual, umi_fwd, max_pattern_dist, format, "fwd"
             )
             # Extract rev UMI
-            result_3p_rev_umi_dist, result_3p_rev_umi_seq, result_3p_rev_umi_qual = extract_umi(
-                read_3p_seq, read_3p_qual, umi_rev, max_pattern_dist, format
+            result_3p_rev_umi_dist, result_3p_rev_umi_seq, result_3p_rev_umi_qual, umi_start_rev = extract_umi(
+                read_3p_seq, read_3p_qual, umi_rev, max_pattern_dist, format, "rev"
             )
 
             if not result_5p_fwd_umi_seq or not result_3p_rev_umi_seq:
                 continue
 
+            clipped_entry = clip_entry(
+                entry, umi_start_fwd, umi_start_rev, adapter_length, format
+            )
+
             n_both_umi += 1
 
             if format == "fasta":
                 write_fasta(
-                    entry,
+                    clipped_entry,
                     strand,
                     result_5p_fwd_umi_dist,
                     result_3p_rev_umi_dist,
@@ -346,7 +374,7 @@ def extract_umis(
 
             if format == "fastq":
                 write_fastq(
-                    entry,
+                    clipped_entry,
                     strand,
                     result_5p_fwd_umi_dist,
                     result_3p_rev_umi_dist,
43 changes: 27 additions & 16 deletions bin/filter_reads.py
@@ -48,6 +48,14 @@ def parse_args(argv):
         default=0.9,
         help="Min overlap with target region",
     )
+
+    parser.add_argument(
+        "--adapter_length",
+        dest="ADAPTER_LENGTH",
+        type=int,
+        default=200,
+        help="Length of adapter",
+    )
 
     parser.add_argument(
         "--include_secondary_reads",
@@ -160,7 +168,7 @@ def write_fastq(read, out_f):
 def filter_reads(args):
     bed_regions = args.BED[0]
     bam_file = args.BAM
-    max_clipping = 250
+    adapter_length = args.ADAPTER_LENGTH
     min_overlap = args.MIN_OVERLAP
     incl_sec = args.INCL_SEC
     output = args.OUT
@@ -177,12 +185,7 @@ def filter_reads(args):
     n_supplementary = 0
     n_secondary = 0
     n_total = 0
-
-    filtered_perc = 0
-    unmapped_perc = 0
-    ontarget_perc = 0
-    concatermer_perc = 0
-    short_perc = 0
+    n_long = 0
 
     with pysam.AlignmentFile(bam_file, "rb") as bam:
         region = parse_bed(bed_regions)
@@ -204,8 +207,8 @@ def filter_reads(args):
                 continue
 
             if read.is_secondary:
-                n_secondary += 1
                 if not incl_sec:
+                    n_secondary += 1
                     write_read(read, output, region, "secondary", out_format)
                     continue
 
@@ -215,33 +218,37 @@ def filter_reads(args):
                 continue
 
             n_ontarget += 1
-            if read.query_alignment_length < (
-                read.query_length - 2 * max_clipping
-            ):
+            if read.query_alignment_length < (read.query_length - 2 * adapter_length):
                 n_concatamer += 1
                 write_read(read, output, region, "concatamer", out_format)
                 continue
 
-            if read.reference_length < (region_length * min_overlap):
+            if read.query_alignment_length < (region_length * min_overlap):
                 n_short += 1
                 write_read(read, output, region, "short", out_format)
                 continue
 
+            if read.query_length > (region_length * (2 - min_overlap) + 2 * adapter_length):
+                n_long += 1
+                write_read(read, output, region, "long", out_format)
+                continue
+
             n_reads_region += 1
             write_read(read, output, region, "filtered", out_format)
 
     if tsv:
         stats_out_filename = os.path.join(
             output, "{}.tsv".format(stats_out_filename))
         write_tsv(n_total, n_unmapped, n_secondary, n_supplementary, n_ontarget,
-                  n_concatamer, n_short, n_reads_region, incl_sec, stats_out_filename, region)
+                  n_concatamer, n_short, n_long, n_reads_region, incl_sec, stats_out_filename, region)
 
 
-def write_tsv(n_total, n_unmapped, n_secondary, n_supplementary, n_ontarget, n_concatamer, n_short, n_reads_region, incl_sec, stats_out_filename, region):
+def write_tsv(n_total, n_unmapped, n_secondary, n_supplementary, n_ontarget, n_concatamer, n_short, n_long, n_reads_region, incl_sec, stats_out_filename, region):
     if n_total > 0:
         if incl_sec:
-            filtered_perc = 100 * n_supplementary // n_total
+            filtered_perc = 100 * n_reads_region // n_total
         else:
-            filtered_perc = 100 * (n_secondary + n_supplementary) // n_total
+            filtered_perc = 100 * (n_secondary + n_reads_region) // n_total
 
         unmapped_perc = 100 * n_unmapped // n_total
         secondary_perc = 100 * n_secondary // n_total
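Taken together, filter_reads now sorts each on-target read into one of four bins: concatamer (too little of the read aligns even after allowing adapter-length soft-clipping at both ends), short (the aligned span covers less than min_overlap of the target region), the new long bin (the raw read exceeds the region length plus the same tolerance plus both adapters), or kept. A sketch of the decision order with invented numbers; region_length would come from the BED file, while 0.9 and 200 are the defaults shown above:

```python
# Hedged restatement of the filter cascade; the read lengths are toy values.
region_length = 1000  # hypothetical target region length from the BED file
min_overlap = 0.9
adapter_length = 200

def classify(query_length, query_alignment_length):
    # Concatamer: aligned span falls short of the read length even after
    # allowing ~adapter_length of soft-clipping on each end.
    if query_alignment_length < query_length - 2 * adapter_length:
        return "concatamer"
    # Short: aligned span covers less than min_overlap of the target region.
    if query_alignment_length < region_length * min_overlap:
        return "short"
    # Long (new in this commit): raw read exceeds region length plus the same
    # overlap tolerance plus both adapters.
    if query_length > region_length * (2 - min_overlap) + 2 * adapter_length:
        return "long"
    return "filtered"  # kept and written out for downstream clustering

assert classify(1300, 950) == "filtered"
assert classify(2600, 1000) == "concatamer"  # 1000 < 2600 - 400
assert classify(1200, 850) == "short"        # 850 < 900
assert classify(1600, 1250) == "long"        # 1600 > 1100 + 400
```

The percentages that write_tsv derives from these counters use floor division (100 * count // total), so they are reported as whole percents; 7 of 9 reads, for example, prints as 77.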
@@ -251,6 +258,7 @@ def write_tsv(n_total, n_unmapped, n_secondary, n_supplementary, n_ontarget, n_concatamer, n_short, n_reads_region, incl_sec, stats_out_filename, region):
         if ontarget_perc > 0:
             concatermer_perc = 100 * n_concatamer // n_ontarget
             short_perc = 100 * n_short // n_ontarget
+            long_perc = 100 * n_long // n_ontarget
 
     with open(stats_out_filename, "a") as out_f:
         print(
@@ -263,6 +271,7 @@ def write_tsv(n_total, n_unmapped, n_secondary, n_supplementary, n_ontarget, n_concatamer, n_short, n_reads_region, incl_sec, stats_out_filename, region):
             "reads_on_target",
             "reads_concatamer",
             "reads_short",
+            "reads_long",
             "reads_filtered",
             "include_secondary",
             sep="\t",
@@ -278,6 +287,7 @@ def write_tsv(n_total, n_unmapped, n_secondary, n_supplementary, n_ontarget, n_concatamer, n_short, n_reads_region, incl_sec, stats_out_filename, region):
             n_ontarget,
             n_concatamer,
             n_short,
+            n_long,
             n_reads_region,
             incl_sec,
             sep="\t",
@@ -293,6 +303,7 @@ def write_tsv(n_total, n_unmapped, n_secondary, n_supplementary, n_ontarget, n_concatamer, n_short, n_reads_region, incl_sec, stats_out_filename, region):
             ontarget_perc,
             concatermer_perc,
             short_perc,
+            long_perc,
             filtered_perc,
             incl_sec,
             sep="\t",