diff --git a/cutseq/run.py b/cutseq/run.py index d13bd0f..426c2a1 100644 --- a/cutseq/run.py +++ b/cutseq/run.py @@ -18,6 +18,7 @@ NonInternalFrontAdapter, PrefixAdapter, RightmostFrontAdapter, + SuffixAdapter, ) from cutadapt.files import InputPaths, OutputFiles from cutadapt.modifiers import ( @@ -29,13 +30,11 @@ UnconditionalCutter, ) from cutadapt.pipeline import PairedEndPipeline, SingleEndPipeline -from cutadapt.predicates import IsUntrimmed, TooShort +from cutadapt.predicates import Predicate, TooShort from cutadapt.runners import make_runner from cutadapt.steps import ( - InfoFileWriter, PairedEndFilter, PairedEndSink, - PairedSingleEndStep, SingleEndFilter, SingleEndSink, ) @@ -118,7 +117,7 @@ def _parse_barcode(self, b): class CutadaptConfig: def __init__(self): self.rname_suffix = False - self.discarded_untrimmed = False + self.ensure_inline_barcode = False self.trim_polyA = False self.min_length = 20 self.min_quality = 20 @@ -126,6 +125,25 @@ def __init__(self): self.threads = 1 +class IsUntrimmedRef(Predicate): + """ + Select reads for which no adapter match was found + """ + + def __init__(self, ref_adapters): + self.ref_adapters = ref_adapters + + def __repr__(self): + return "IsUntrimmedRef()" + + def test(self, read, info): + # check not all adapters with ref_adapters are exist in the matches + match_adapters = [match.adapter for match in info.matches] + if any([adapter not in match_adapters for adapter in self.ref_adapters]): + return True + return False + + def run_steps(steps, dry_run=False): if dry_run: print( @@ -138,7 +156,7 @@ def run_steps(steps, dry_run=False): return process.stdout.decode(), process.stderr.decode() -def pipeline_single(input1, output1, short1, untrimed1, barcode, settings): +def pipeline_single(input1, output1, short1, untrimmed1, barcode, settings): modifiers = [] # step 1: remove suffix in the read name modifiers.extend([SuffixRemover(".1"), SuffixRemover("/1")]) @@ -162,14 +180,11 @@ def pipeline_single(input1, output1, short1, untrimed1, barcode, settings): ) # step 4: trim inline barcode if barcode.inline5.len > 0: - modifiers.append( - AdapterCutter( - [PrefixAdapter(sequence=barcode.inline5.fw, max_errors=0.2)], - times=1, - ) - ) + adapter_inline5 = PrefixAdapter(sequence=barcode.inline5.fw, max_errors=0.2) + modifiers.append(AdapterCutter([adapter_inline5], times=1)) if barcode.inline3.len > 0: - modifiers.append(UnconditionalCutter(-barcode.inline3.len)) + adapter_inline3 = SuffixAdapter(sequence=barcode.inline3.fw, max_errors=0.2) + modifiers.append(AdapterCutter([adapter_inline3], times=1)) # step 5: extract UMI if barcode.umi5.len > 0: @@ -215,19 +230,35 @@ def pipeline_single(input1, output1, short1, untrimed1, barcode, settings): qualities=runner.input_file_format().has_qualities(), interleaved=False, ) - steps = [ + steps = [] + steps.append( # --info-file=info.txt # PairedSingleEndStep(InfoFileWriter(outfiles.open_text("info.txt"))), # -m 10 SingleEndFilter( TooShort(settings.min_length), outfiles.open_record_writer(short1) ), - # TODO: --max-n=0 support - # --discard-untrimmed - # SingleEndFilter( IsUntrimmed(), IsUntrimmed(), pair_filter_mode="any"), + ) + if ( + settings.ensure_inline_barcode + and barcode.inline5.len + barcode.inline3.len > 0 + ): + ref_adapters = [] + if barcode.inline5.len > 0: + ref_adapters.append(adapter_inline5) + if barcode.inline3.len > 0: + ref_adapters.append(adapter_inline3) + steps.append( + # TODO: --max-n=0 support + SingleEndFilter( + IsUntrimmedRef(ref_adapters), + outfiles.open_record_writer(untrimmed1, interleaved=False), + ), + ) + steps.append( # -o SingleEndSink(outfiles.open_record_writer(output1, interleaved=False)), - ] + ) pipeline = SingleEndPipeline(modifiers, steps) _stats = runner.run(pipeline, Progress(), outfiles) # _ = stats.as_json() @@ -241,7 +272,7 @@ def pipeline_paired( output2, short1, short2, - untrimed1, + untrimmed1, untrimmed2, barcode, settings, @@ -290,23 +321,19 @@ def pipeline_paired( ) # step 4: trim inline barcode if barcode.inline5.len > 0: + adapter_inline5 = PrefixAdapter(sequence=barcode.inline5.fw, max_errors=0.2) modifiers.append( ( - AdapterCutter( - [PrefixAdapter(sequence=barcode.inline5.fw, max_errors=0.2)], - times=1, - ), + AdapterCutter([adapter_inline5], times=1), UnconditionalCutter(-barcode.inline5.len), ) ) if barcode.inline3.len > 0: + adapter_inline3 = PrefixAdapter(sequence=barcode.inline3.rc, max_errors=0.2) modifiers.append( ( UnconditionalCutter(-barcode.inline3.len), - AdapterCutter( - [BackAdapter(sequence=barcode.inline3.rc, max_errors=0.2)], - times=1, - ), + AdapterCutter([adapter_inline3], times=1), ) ) @@ -387,21 +414,40 @@ def pipeline_paired( qualities=runner.input_file_format().has_qualities(), interleaved=False, ) - steps = [ - # --info-file=info.txt - # PairedSingleEndStep(InfoFileWriter(outfiles.open_text("info.txt"))), - # -m 10:10 + steps = [] + # --info-file=info.txt + # PairedSingleEndStep(InfoFileWriter(outfiles.open_text("info.txt"))), + # -m 10:10 + steps.append( PairedEndFilter( TooShort(settings.min_length), TooShort(settings.min_length), outfiles.open_record_writer(short1, short2, interleaved=False), - ), - # TODO: --max-n=0 support - # --discard-untrimmed - # PairedEndFilter( IsUntrimmed(), IsUntrimmed(), pair_filter_mode="any"), + ) + ) + if ( + settings.ensure_inline_barcode + and barcode.inline5.len + barcode.inline3.len > 0 + ): + steps.append( + # TODO: --max-n=0 support + PairedEndFilter( + IsUntrimmedRef([adapter_inline5]) + if barcode.inline5.len > 0 + else None, + IsUntrimmedRef([adapter_inline3]) + if barcode.inline3.len > 0 + else None, + outfiles.open_record_writer( + untrimmed1, untrimmed2, interleaved=False + ), + pair_filter_mode="any", + ) + ) + steps.append( # -o ... -p ... - PairedEndSink(outfiles.open_record_writer(output1, output2)), - ] + PairedEndSink(outfiles.open_record_writer(output1, output2)) + ) pipeline = PairedEndPipeline(modifiers, steps) _stats = runner.run(pipeline, Progress(), outfiles) # _ = stats.as_json() @@ -411,12 +457,9 @@ def pipeline_paired( def run_cutseq(args): barcode_config = BarcodeConfig(args.adapter_scheme.upper()) settings = CutadaptConfig() - if args.with_rname_suffix: - settings.rname_suffix = True - if args.untrimmed_file is None: - settings.discarded_untrimmed = True - if args.trim_polyA: - settings.trim_polyA = True + settings.rname_suffix = args.with_rname_suffix + settings.ensure_inline_barcode = args.ensure_inline_barcode + settings.trim_polyA = args.trim_polyA settings.threads = args.threads settings.min_length = args.min_length settings.dry_run = args.dry_run @@ -506,6 +549,12 @@ def main(): action="store_true", help="R1 and R2 suffix cotains suffix. MGI platform.", ) + + parser.add_argument( + "--ensure-inline-barcode", + action="store_true", + help="Output discarded reads without inline barcode.", + ) parser.add_argument( "-U", "--untrimmed-file", @@ -584,12 +633,8 @@ def validate_output_file(output_files, input_files, output_prefix, output_suffix args.short_file = validate_output_file( args.short_file, args.input_file, args.output_prefix, "short" ) - args.untrimmed_file = ( - validate_output_file( - args.untrimmed_file, args.input_file, args.output_prefix, "untrimmed" - ) - if args.untrimmed_file is not None - else [None] * len(args.input_file) + args.untrimmed_file = validate_output_file( + args.untrimmed_file, args.input_file, args.output_prefix, "untrimmed" ) run_cutseq(args) diff --git a/pyproject.toml b/pyproject.toml index eb24abd..3849bba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "cutseq" -version = "0.0.4" +version = "0.0.5" description = "Automatically cut adapter / barcode / UMI from NGS data" authors = ["Ye Chang "] license = "MIT"