Merge pull request #61 from genepi/Adapt-nextlow.config-for-singularity
Merge latest version with adapted clustering strategy
AmstlerStephan authored Mar 6, 2024
2 parents bd793cd + 59c04cd commit 1f08091
Showing 27 changed files with 511 additions and 582 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -6,7 +6,7 @@ Umi-pipeline-nf
 
 **Umi-pipeline-nf** creates highly accurate single-molecule consensus sequences for unique molecular identifier (UMI)-tagged amplicons from nanopore sequencing data.
 The pipeline can be run on the whole fastq_pass folder of your nanopore run and, by default, outputs the aligned consensus sequences of each UMI cluster as a BAM file. The optional variant calling step creates a VCF file for all variants found in the consensus sequences.
-umi-pipeline-nf is inspired by a snakemake-based analysis pipeline ([ONT UMI analysis pipeline](https://github.com/nanoporetech/pipeline-umi-amplicon); originally developed by [Karst et al, Nat Biotechnol 18:165–169, 2021](https://www.nature.com/articles/s41592-020-01041-y)). We implemented the pipeline in [Nextflow](https://www.nextflow.io) and included several optimizations and [additional functionalities](#main-adaptations).
+umi-pipeline-nf is inspired by a snakemake-based analysis pipeline ([ONT UMI analysis pipeline](https://github.com/nanoporetech/pipeline-umi-amplicon); originally developed by [Karst et al, Nat Methods 18:165–169, 2021](https://www.nature.com/articles/s41592-020-01041-y)). We migrated the pipeline to [Nextflow](https://www.nextflow.io) and included several optimizations and [additional functionalities](#main-adaptations).
 
 ![Workflow](docs/images/umi-pipeline-nf_metro-map.svg)

@@ -26,7 +26,7 @@ umi-pipeline-nf is inspired by a snakemake-based analysis pipeline
 
 ## Main Adaptations
 
-* It comes with docker containers making **installation simple, portable** and **results highly reproducible**.
+* It comes with a docker/singularity container making **installation simple, easy to use on clusters** and **results highly reproducible**.
 * The pipeline is **optimized for parallelization**.
 * **Additional UMI cluster splitting** step to remove admixed UMI clusters.
 * Read filtering strategy per UMI cluster was adapted to **preserve the highest quality reads**.
70 changes: 49 additions & 21 deletions bin/extract_umis.py
@@ -45,10 +45,10 @@ def parse_args(argv):
         help="Max edit distance for UMI",
     )
     parser.add_argument(
-        "--adapter-length",
+        "--adapter_length",
         dest="ADAPTER_LENGTH",
         type=int,
-        default=250,
+        default=200,
         help="Length of adapter",
     )
     parser.add_argument(
@@ -106,10 +106,27 @@ def parse_args(argv):
     return args
 
 
-def extract_umi(query_seq, query_qual, pattern, max_edit_dist, format):
+def clip_entry(entry, umi_start_fwd, umi_start_rev, adapter_length, format):
+    clip_fwd = umi_start_fwd
+    if adapter_length == umi_start_rev:
+        clip_rev = None
+    else:
+        clip_rev = umi_start_rev - adapter_length
+
+    entry.sequence = entry.sequence[clip_fwd:clip_rev]
+
+    if format == "fastq":
+        entry.quality = entry.quality[clip_fwd:clip_rev]
+
+    return entry
+
+
+def extract_umi(query_seq, query_qual, pattern, max_edit_dist, format, direction):
     umi_qual = None
     equalities = [("M", "A"), ("M", "C"), ("R", "A"), ("R", "G"), ("W", "A"), ("W", "T"), ("S", "C"), ("S", "G"), ("Y", "C"), ("Y", "T"), ("K", "G"), ("K", "T"), ("V", "A"), ("V", "C"),
-                  ("V", "G"), ("H", "A"), ("H", "C"), ("H", "T"), ("D", "A"), ("D", "G"), ("D", "T"), ("B", "C"), ("B", "G"), ("B", "T"), ("N", "A"), ("N", "C"), ("N", "G"), ("N", "T")]
+                  ("V", "G"), ("H", "A"), ("H", "C"), ("H", "T"), ("D", "A"), ("D", "G"), ("D", "T"), ("B", "C"), ("B", "G"), ("B", "T"), ("N", "A"), ("N", "C"), ("N", "G"), ("N", "T"),
+                  ("m", "a"), ("m", "c"), ("r", "a"), ("r", "g"), ("w", "a"), ("w", "t"), ("s", "c"), ("s", "g"), ("y", "c"), ("y", "t"), ("k", "g"), ("k", "t"), ("v", "a"), ("v", "c"),
+                  ("v", "g"), ("h", "a"), ("h", "c"), ("h", "t"), ("d", "a"), ("d", "g"), ("d", "t"), ("b", "c"), ("b", "g"), ("b", "t"), ("n", "a"), ("n", "c"), ("n", "g"), ("n", "t"),
+                  ("a", "A"), ("c", "C"), ("t", "T"), ("g", "G")]
 
     result = edlib.align(
         pattern,
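The new clip_entry helper trims each read down to its UMI-to-UMI span: for the forward hit extract_umi reports the UMI start offset inside the 5' window, for the reverse hit it reports the UMI end offset inside the 3' window, and clip_entry turns that reverse offset into a negative Python index (or None when the reverse UMI ends flush with the read end). A minimal sketch of the slicing, with invented toy values rather than pipeline data:

```python
# Toy illustration of clip_entry's slicing; all values are invented.
adapter_length = 200
umi_start_fwd = 55    # fwd UMI starts 55 bases into the read
umi_start_rev = 160   # rev UMI ends 160 bases into the final 200-base window

clip_fwd = umi_start_fwd
# 160 - 200 == -40 drops the 40 bases trailing the reverse UMI; None keeps
# the tail when the reverse UMI ends exactly at the read end.
clip_rev = None if umi_start_rev == adapter_length else umi_start_rev - adapter_length

sequence = "A" * 1000  # stand-in 1 kb read
clipped = sequence[clip_fwd:clip_rev]
assert len(clipped) == 1000 - 55 - 40
```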
@@ -120,31 +137,38 @@ def extract_umi(query_seq, query_qual, pattern, max_edit_dist, format):
         additionalEqualities=equalities,
     )
     if result["editDistance"] == -1:
-        return None, None, None
+        return None, None, None, None
 
     edit_dist = result["editDistance"]
     locs = result["locations"][0]
-    umi = query_seq[locs[0]:locs[1]+1]
+    umi_start_pos = locs[0]
+    umi_end_pos = locs[1] + 1
+    umi = query_seq[umi_start_pos:umi_end_pos]
+
+    if direction == "fwd":
+        umi_start = umi_start_pos
+    else:
+        umi_start = umi_end_pos
 
     if format == "fastq":
         umi_qual = query_qual[locs[0]:locs[1]+1]
 
-    return edit_dist, umi, umi_qual
+    return edit_dist, umi, umi_qual, umi_start
 
 
-def extract_adapters(entry, max_adapter_length, format):
+def extract_adapters(entry, adapter_length, format):
     read_5p_seq = None
     read_5p_qual = None
     read_3p_seq = None
     read_3p_qual = None
 
-    if len(entry.sequence) > max_adapter_length:
-        read_5p_seq = entry.sequence[:max_adapter_length]
-        read_3p_seq = entry.sequence[-max_adapter_length:]
+    if len(entry.sequence) > adapter_length:
+        read_5p_seq = entry.sequence[:adapter_length]
+        read_3p_seq = entry.sequence[-adapter_length:]
 
         if format == "fastq":
-            read_5p_qual = entry.quality[:max_adapter_length]
-            read_3p_qual = entry.quality[-max_adapter_length:]
+            read_5p_qual = entry.quality[:adapter_length]
+            read_3p_qual = entry.quality[-adapter_length:]
 
     return read_5p_seq, read_3p_seq, read_5p_qual, read_3p_qual
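The equalities table tells edlib which IUPAC ambiguity codes in the UMI pattern are allowed to match which concrete bases, and this commit extends it with lowercase pairs so case differences no longer count as edits. Below is a self-contained sketch of such a call; the pattern and target window are made up, and the mode/task/k arguments (elided from this hunk) are typical values for an infix UMI search rather than quotes from the script:

```python
import edlib  # pip install edlib

# "N" may match any base; the pipeline's full table also covers M/R/W/S/Y/K/
# V/H/D/B and, after this commit, their lowercase forms.
equalities = [("N", "A"), ("N", "C"), ("N", "G"), ("N", "T")]

pattern = "TTTNNNNNTTT"         # hypothetical UMI pattern, not the real layout
window = "ACGATTTGCACGTTTAGGT"  # hypothetical 5' adapter window

result = edlib.align(
    pattern,
    window,
    mode="HW",         # infix alignment: pattern may start anywhere in window
    task="locations",  # also report where the best hit lies
    k=3,               # max edit distance, like max_pattern_dist
    additionalEqualities=equalities,
)
assert result["editDistance"] == 0   # -1 would mean no hit within k edits
start, end = result["locations"][0]  # end is inclusive, hence the +1
assert window[start:end + 1] == "TTTGCACGTTT"
```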

@@ -288,7 +312,7 @@ def write_tsv(
 def extract_umis(
     args
 ):
-    max_adapter_length = args.ADAPTER_LENGTH
+    adapter_length = args.ADAPTER_LENGTH
     max_pattern_dist = args.MAX_ERROR
     output_folder = args.OUT
     tsv = args.TSV
@@ -311,7 +335,7 @@ def extract_umis(
             n_total += 1
 
             read_5p_seq, read_3p_seq, read_5p_qual, read_3p_qual = extract_adapters(
-                entry, max_adapter_length, format
+                entry, adapter_length, format
             )
 
             if read_5p_seq is None or read_3p_seq is None:
@@ -320,22 +344,26 @@ def extract_umis(
             strand_stats[strand] += 1
 
             # Extract fwd UMI
-            result_5p_fwd_umi_dist, result_5p_fwd_umi_seq, result_5p_fwd_umi_qual = extract_umi(
-                read_5p_seq, read_5p_qual, umi_fwd, max_pattern_dist, format
+            result_5p_fwd_umi_dist, result_5p_fwd_umi_seq, result_5p_fwd_umi_qual, umi_start_fwd = extract_umi(
+                read_5p_seq, read_5p_qual, umi_fwd, max_pattern_dist, format, "fwd"
             )
             # Extract rev UMI
-            result_3p_rev_umi_dist, result_3p_rev_umi_seq, result_3p_rev_umi_qual = extract_umi(
-                read_3p_seq, read_3p_qual, umi_rev, max_pattern_dist, format
+            result_3p_rev_umi_dist, result_3p_rev_umi_seq, result_3p_rev_umi_qual, umi_start_rev = extract_umi(
+                read_3p_seq, read_3p_qual, umi_rev, max_pattern_dist, format, "rev"
             )
 
             if not result_5p_fwd_umi_seq or not result_3p_rev_umi_seq:
                 continue
 
+            clipped_entry = clip_entry(
+                entry, umi_start_fwd, umi_start_rev, adapter_length, format
+            )
+
             n_both_umi += 1
 
             if format == "fasta":
                 write_fasta(
-                    entry,
+                    clipped_entry,
                     strand,
                     result_5p_fwd_umi_dist,
                     result_3p_rev_umi_dist,
@@ -346,7 +374,7 @@ def extract_umis(
 
             if format == "fastq":
                 write_fastq(
-                    entry,
+                    clipped_entry,
                     strand,
                     result_5p_fwd_umi_dist,
                     result_3p_rev_umi_dist,
43 changes: 27 additions & 16 deletions bin/filter_reads.py
@@ -48,6 +48,14 @@ def parse_args(argv):
         default=0.9,
         help="Min overlap with target region",
     )
+
+    parser.add_argument(
+        "--adapter_length",
+        dest="ADAPTER_LENGTH",
+        type=int,
+        default=200,
+        help="Length of adapter",
+    )
 
     parser.add_argument(
         "--include_secondary_reads",
@@ -160,7 +168,7 @@ def write_fastq(read, out_f):
 def filter_reads(args):
     bed_regions = args.BED[0]
     bam_file = args.BAM
-    max_clipping = 250
+    adapter_length = args.ADAPTER_LENGTH
     min_overlap = args.MIN_OVERLAP
     incl_sec = args.INCL_SEC
     output = args.OUT
@@ -177,12 +185,7 @@ def filter_reads(args):
     n_supplementary = 0
     n_secondary = 0
     n_total = 0
-
-    filtered_perc = 0
-    unmapped_perc = 0
-    ontarget_perc = 0
-    concatermer_perc = 0
-    short_perc = 0
+    n_long = 0
 
     with pysam.AlignmentFile(bam_file, "rb") as bam:
         region = parse_bed(bed_regions)
@@ -204,8 +207,8 @@ def filter_reads(args):
                 continue
 
             if read.is_secondary:
-                n_secondary += 1
                 if not incl_sec:
+                    n_secondary += 1
                     write_read(read, output, region, "secondary", out_format)
                     continue
 
@@ -215,33 +218,37 @@ def filter_reads(args):
                 continue
 
             n_ontarget += 1
-            if read.query_alignment_length < (
-                read.query_length - 2 * max_clipping
-            ):
+            if read.query_alignment_length < (read.query_length - 2 * adapter_length):
                 n_concatamer += 1
                 write_read(read, output, region, "concatamer", out_format)
                 continue
 
-            if read.reference_length < (region_length * min_overlap):
+            if read.query_alignment_length < (region_length * min_overlap):
                 n_short += 1
                 write_read(read, output, region, "short", out_format)
                 continue
 
+            if read.query_length > (region_length * (2 - min_overlap) + 2 * adapter_length):
+                n_long += 1
+                write_read(read, output, region, "long", out_format)
+                continue
+
             n_reads_region += 1
             write_read(read, output, region, "filtered", out_format)
 
     if tsv:
         stats_out_filename = os.path.join(
             output, "{}.tsv".format(stats_out_filename))
         write_tsv(n_total, n_unmapped, n_secondary, n_supplementary, n_ontarget,
-                  n_concatamer, n_short, n_reads_region, incl_sec, stats_out_filename, region)
+                  n_concatamer, n_short, n_long, n_reads_region, incl_sec, stats_out_filename, region)
 
 
-def write_tsv(n_total, n_unmapped, n_secondary, n_supplementary, n_ontarget, n_concatamer, n_short, n_reads_region, incl_sec, stats_out_filename, region):
+def write_tsv(n_total, n_unmapped, n_secondary, n_supplementary, n_ontarget, n_concatamer, n_short, n_long, n_reads_region, incl_sec, stats_out_filename, region):
     if n_total > 0:
         if incl_sec:
-            filtered_perc = 100 * n_supplementary // n_total
+            filtered_perc = 100 * n_reads_region // n_total
         else:
-            filtered_perc = 100 * (n_secondary + n_supplementary) // n_total
+            filtered_perc = 100 * (n_secondary + n_reads_region) // n_total
 
         unmapped_perc = 100 * n_unmapped // n_total
         secondary_perc = 100 * n_secondary // n_total
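Taken together, filter_reads now sorts each on-target read into one of four bins: concatamer (too little of the read aligns even after allowing adapter-length soft-clipping at both ends), short (the aligned span covers less than min_overlap of the target region), the new long bin (the raw read exceeds the region length plus the same tolerance plus both adapters), or kept. A sketch of the decision order with invented numbers; region_length would come from the BED file, while 0.9 and 200 are the defaults shown above:

```python
# Hedged restatement of the filter cascade; the read lengths are toy values.
region_length = 1000  # hypothetical target region length from the BED file
min_overlap = 0.9
adapter_length = 200

def classify(query_length, query_alignment_length):
    # Concatamer: aligned span falls short of the read length even after
    # allowing ~adapter_length of soft-clipping on each end.
    if query_alignment_length < query_length - 2 * adapter_length:
        return "concatamer"
    # Short: aligned span covers less than min_overlap of the target region.
    if query_alignment_length < region_length * min_overlap:
        return "short"
    # Long (new in this commit): raw read exceeds region length plus the same
    # overlap tolerance plus both adapters.
    if query_length > region_length * (2 - min_overlap) + 2 * adapter_length:
        return "long"
    return "filtered"  # kept and written out for downstream clustering

assert classify(1300, 950) == "filtered"
assert classify(2600, 1000) == "concatamer"  # 1000 < 2600 - 400
assert classify(1200, 850) == "short"        # 850 < 900
assert classify(1600, 1250) == "long"        # 1600 > 1100 + 400
```

The percentages that write_tsv derives from these counters use floor division (100 * count // total), so they are reported as whole percents; 7 of 9 reads, for example, prints as 77.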
@@ -251,6 +258,7 @@ def write_tsv(n_total, n_unmapped, n_secondary, n_supplementary, n_ontarget, n_concatamer, n_short, n_reads_region, incl_sec, stats_out_filename, region):
         if ontarget_perc > 0:
             concatermer_perc = 100 * n_concatamer // n_ontarget
             short_perc = 100 * n_short // n_ontarget
+            long_perc = 100 * n_long // n_ontarget
 
     with open(stats_out_filename, "a") as out_f:
         print(
@@ -263,6 +271,7 @@ def write_tsv(n_total, n_unmapped, n_secondary, n_supplementary, n_ontarget, n_concatamer, n_short, n_reads_region, incl_sec, stats_out_filename, region):
             "reads_on_target",
             "reads_concatamer",
             "reads_short",
+            "reads_long",
             "reads_filtered",
             "include_secondary",
             sep="\t",
@@ -278,6 +287,7 @@ def write_tsv(n_total, n_unmapped, n_secondary, n_supplementary, n_ontarget, n_concatamer, n_short, n_reads_region, incl_sec, stats_out_filename, region):
             n_ontarget,
             n_concatamer,
             n_short,
+            n_long,
             n_reads_region,
             incl_sec,
             sep="\t",
@@ -293,6 +303,7 @@ def write_tsv(n_total, n_unmapped, n_secondary, n_supplementary, n_ontarget, n_concatamer, n_short, n_reads_region, incl_sec, stats_out_filename, region):
             ontarget_perc,
             concatermer_perc,
             short_perc,
+            long_perc,
             filtered_perc,
             incl_sec,
             sep="\t",