Skip to content

Commit

Permalink
release 0.0.5
Browse files Browse the repository at this point in the history
  • Loading branch information
y9c committed Apr 22, 2024
1 parent 6ba2c8d commit e0a71ce
Show file tree
Hide file tree
Showing 2 changed files with 94 additions and 49 deletions.
141 changes: 93 additions & 48 deletions cutseq/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
NonInternalFrontAdapter,
PrefixAdapter,
RightmostFrontAdapter,
SuffixAdapter,
)
from cutadapt.files import InputPaths, OutputFiles
from cutadapt.modifiers import (
Expand All @@ -29,13 +30,11 @@
UnconditionalCutter,
)
from cutadapt.pipeline import PairedEndPipeline, SingleEndPipeline
from cutadapt.predicates import IsUntrimmed, TooShort
from cutadapt.predicates import Predicate, TooShort
from cutadapt.runners import make_runner
from cutadapt.steps import (
InfoFileWriter,
PairedEndFilter,
PairedEndSink,
PairedSingleEndStep,
SingleEndFilter,
SingleEndSink,
)
Expand Down Expand Up @@ -118,14 +117,33 @@ def _parse_barcode(self, b):
class CutadaptConfig:
def __init__(self):
self.rname_suffix = False
self.discarded_untrimmed = False
self.ensure_inline_barcode = False
self.trim_polyA = False
self.min_length = 20
self.min_quality = 20
self.dry_run = False
self.threads = 1


class IsUntrimmedRef(Predicate):
"""
Select reads for which no adapter match was found
"""

def __init__(self, ref_adapters):
self.ref_adapters = ref_adapters

def __repr__(self):
return "IsUntrimmedRef()"

def test(self, read, info):
# check not all adapters with ref_adapters are exist in the matches
match_adapters = [match.adapter for match in info.matches]
if any([adapter not in match_adapters for adapter in self.ref_adapters]):
return True
return False


def run_steps(steps, dry_run=False):
if dry_run:
print(
Expand All @@ -138,7 +156,7 @@ def run_steps(steps, dry_run=False):
return process.stdout.decode(), process.stderr.decode()


def pipeline_single(input1, output1, short1, untrimed1, barcode, settings):
def pipeline_single(input1, output1, short1, untrimmed1, barcode, settings):
modifiers = []
# step 1: remove suffix in the read name
modifiers.extend([SuffixRemover(".1"), SuffixRemover("/1")])
Expand All @@ -162,14 +180,11 @@ def pipeline_single(input1, output1, short1, untrimed1, barcode, settings):
)
# step 4: trim inline barcode
if barcode.inline5.len > 0:
modifiers.append(
AdapterCutter(
[PrefixAdapter(sequence=barcode.inline5.fw, max_errors=0.2)],
times=1,
)
)
adapter_inline5 = PrefixAdapter(sequence=barcode.inline5.fw, max_errors=0.2)
modifiers.append(AdapterCutter([adapter_inline5], times=1))
if barcode.inline3.len > 0:
modifiers.append(UnconditionalCutter(-barcode.inline3.len))
adapter_inline3 = SuffixAdapter(sequence=barcode.inline3.fw, max_errors=0.2)
modifiers.append(AdapterCutter([adapter_inline3], times=1))

# step 5: extract UMI
if barcode.umi5.len > 0:
Expand Down Expand Up @@ -215,19 +230,35 @@ def pipeline_single(input1, output1, short1, untrimed1, barcode, settings):
qualities=runner.input_file_format().has_qualities(),
interleaved=False,
)
steps = [
steps = []
steps.append(
# --info-file=info.txt
# PairedSingleEndStep(InfoFileWriter(outfiles.open_text("info.txt"))),
# -m 10
SingleEndFilter(
TooShort(settings.min_length), outfiles.open_record_writer(short1)
),
# TODO: --max-n=0 support
# --discard-untrimmed
# SingleEndFilter( IsUntrimmed(), IsUntrimmed(), pair_filter_mode="any"),
)
if (
settings.ensure_inline_barcode
and barcode.inline5.len + barcode.inline3.len > 0
):
ref_adapters = []
if barcode.inline5.len > 0:
ref_adapters.append(adapter_inline5)
if barcode.inline3.len > 0:
ref_adapters.append(adapter_inline3)
steps.append(
# TODO: --max-n=0 support
SingleEndFilter(
IsUntrimmedRef(ref_adapters),
outfiles.open_record_writer(untrimmed1, interleaved=False),
),
)
steps.append(
# -o
SingleEndSink(outfiles.open_record_writer(output1, interleaved=False)),
]
)
pipeline = SingleEndPipeline(modifiers, steps)
_stats = runner.run(pipeline, Progress(), outfiles)
# _ = stats.as_json()
Expand All @@ -241,7 +272,7 @@ def pipeline_paired(
output2,
short1,
short2,
untrimed1,
untrimmed1,
untrimmed2,
barcode,
settings,
Expand Down Expand Up @@ -290,23 +321,19 @@ def pipeline_paired(
)
# step 4: trim inline barcode
if barcode.inline5.len > 0:
adapter_inline5 = PrefixAdapter(sequence=barcode.inline5.fw, max_errors=0.2)
modifiers.append(
(
AdapterCutter(
[PrefixAdapter(sequence=barcode.inline5.fw, max_errors=0.2)],
times=1,
),
AdapterCutter([adapter_inline5], times=1),
UnconditionalCutter(-barcode.inline5.len),
)
)
if barcode.inline3.len > 0:
adapter_inline3 = PrefixAdapter(sequence=barcode.inline3.rc, max_errors=0.2)
modifiers.append(
(
UnconditionalCutter(-barcode.inline3.len),
AdapterCutter(
[BackAdapter(sequence=barcode.inline3.rc, max_errors=0.2)],
times=1,
),
AdapterCutter([adapter_inline3], times=1),
)
)

Expand Down Expand Up @@ -387,21 +414,40 @@ def pipeline_paired(
qualities=runner.input_file_format().has_qualities(),
interleaved=False,
)
steps = [
# --info-file=info.txt
# PairedSingleEndStep(InfoFileWriter(outfiles.open_text("info.txt"))),
# -m 10:10
steps = []
# --info-file=info.txt
# PairedSingleEndStep(InfoFileWriter(outfiles.open_text("info.txt"))),
# -m 10:10
steps.append(
PairedEndFilter(
TooShort(settings.min_length),
TooShort(settings.min_length),
outfiles.open_record_writer(short1, short2, interleaved=False),
),
# TODO: --max-n=0 support
# --discard-untrimmed
# PairedEndFilter( IsUntrimmed(), IsUntrimmed(), pair_filter_mode="any"),
)
)
if (
settings.ensure_inline_barcode
and barcode.inline5.len + barcode.inline3.len > 0
):
steps.append(
# TODO: --max-n=0 support
PairedEndFilter(
IsUntrimmedRef([adapter_inline5])
if barcode.inline5.len > 0
else None,
IsUntrimmedRef([adapter_inline3])
if barcode.inline3.len > 0
else None,
outfiles.open_record_writer(
untrimmed1, untrimmed2, interleaved=False
),
pair_filter_mode="any",
)
)
steps.append(
# -o ... -p ...
PairedEndSink(outfiles.open_record_writer(output1, output2)),
]
PairedEndSink(outfiles.open_record_writer(output1, output2))
)
pipeline = PairedEndPipeline(modifiers, steps)
_stats = runner.run(pipeline, Progress(), outfiles)
# _ = stats.as_json()
Expand All @@ -411,12 +457,9 @@ def pipeline_paired(
def run_cutseq(args):
barcode_config = BarcodeConfig(args.adapter_scheme.upper())
settings = CutadaptConfig()
if args.with_rname_suffix:
settings.rname_suffix = True
if args.untrimmed_file is None:
settings.discarded_untrimmed = True
if args.trim_polyA:
settings.trim_polyA = True
settings.rname_suffix = args.with_rname_suffix
settings.ensure_inline_barcode = args.ensure_inline_barcode
settings.trim_polyA = args.trim_polyA
settings.threads = args.threads
settings.min_length = args.min_length
settings.dry_run = args.dry_run
Expand Down Expand Up @@ -506,6 +549,12 @@ def main():
action="store_true",
help="R1 and R2 suffix cotains suffix. MGI platform.",
)

parser.add_argument(
"--ensure-inline-barcode",
action="store_true",
help="Output discarded reads without inline barcode.",
)
parser.add_argument(
"-U",
"--untrimmed-file",
Expand Down Expand Up @@ -584,12 +633,8 @@ def validate_output_file(output_files, input_files, output_prefix, output_suffix
args.short_file = validate_output_file(
args.short_file, args.input_file, args.output_prefix, "short"
)
args.untrimmed_file = (
validate_output_file(
args.untrimmed_file, args.input_file, args.output_prefix, "untrimmed"
)
if args.untrimmed_file is not None
else [None] * len(args.input_file)
args.untrimmed_file = validate_output_file(
args.untrimmed_file, args.input_file, args.output_prefix, "untrimmed"
)

run_cutseq(args)
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "cutseq"
version = "0.0.4"
version = "0.0.5"
description = "Automatically cut adapter / barcode / UMI from NGS data"
authors = ["Ye Chang <[email protected]>"]
license = "MIT"
Expand Down

0 comments on commit e0a71ce

Please sign in to comment.