Skip to content

Commit

Permalink
Add FASTA output to pretext-to-tpf script
Browse files Browse the repository at this point in the history
  • Loading branch information
jgrg committed Nov 4, 2024
1 parent 62c8b9c commit 2f68fba
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 25 deletions.
11 changes: 6 additions & 5 deletions src/tola/assembly/scripts/asm_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import sys

import click

from tola.assembly.format import format_agp, format_tpf
from tola.assembly.parser import parse_agp, parse_tpf

Expand Down Expand Up @@ -124,9 +125,8 @@ def process_fh(in_fh, in_fmt, asm_name, out_fh, out_fmt, qc_overlaps):
msg = f"Unknown input format: '{in_fmt}'"
raise ValueError(msg)

if qc_overlaps:
if pairs := asm.find_overlapping_fragments():
report_overlaps(asm_name, pairs)
if qc_overlaps and (pairs := asm.find_overlapping_fragments()):
report_overlaps(asm_name, pairs)

if out_fmt == "AGP":
format_agp(asm, out_fh)
Expand All @@ -153,8 +153,9 @@ def format_from_file_extn(pth, default=None):
"""
Guess the file format from the extension, or return the supplied default
"""
if m := re.match(r"\.(agp|tpf)\w*$", pth.suffix, flags=re.IGNORECASE):
return m.group(1).upper()
if m := re.match(r"\.(agp|tpf|fa(?:sta)?)\w*$", pth.suffix, flags=re.IGNORECASE):
uc_fmt = m.group(1).upper()
return "FASTA" if uc_fmt.startswith("FA") else uc_fmt
else:
return default

Expand Down
49 changes: 29 additions & 20 deletions src/tola/assembly/scripts/pretext_to_tpf.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,14 @@
import click
import yaml

from tola.assembly.assembly_stats import AssemblyStats
from tola.assembly.build_assembly import BuildAssembly
from tola.assembly.format import format_agp, format_tpf
from tola.assembly.gap import Gap
from tola.assembly.indexed_assembly import IndexedAssembly
from tola.assembly.parser import parse_agp, parse_tpf
from tola.assembly.scripts.asm_format import format_from_file_extn
from tola.fasta.index import FastaIndex
from tola.fasta.stream import FastaStream


def bd(txt):
Expand Down Expand Up @@ -114,7 +115,7 @@ def ul(txt):
@click.option(
"--clobber/--no-clobber",
"-f",
default=False,
default=True,
show_default=True,
help="Overwrite an existing output file.",
)
Expand Down Expand Up @@ -147,10 +148,9 @@ def cli(
):
logfile = setup_logging(log_level, output_file, write_log, clobber)

input_asm = IndexedAssembly.new_from_assembly(
parse_assembly_file(assembly_file, "TPF")
)
prtxt_asm = parse_assembly_file(pretext_file, "AGP")
asm, fai = parse_assembly_file(assembly_file, "TPF")
input_asm = IndexedAssembly.new_from_assembly(asm)
prtxt_asm, _ = parse_assembly_file(pretext_file, "AGP")

# Trap "-a" and "-p" arguments being switched
if not prtxt_asm.bp_per_texel:
Expand All @@ -169,7 +169,7 @@ def cli(

out_assemblies = build_asm.assemblies_with_scaffolds_fused()
for out_asm in out_assemblies.values():
write_assembly(out_asm, output_file, clobber)
write_assembly(fai, out_asm, output_file, clobber)
stats = build_asm.assembly_stats
if output_file:
write_chr_csv_files(output_file, stats, out_assemblies, clobber)
Expand Down Expand Up @@ -203,12 +203,13 @@ def setup_logging(log_level, output_file, write_log, clobber):
return logfile


def write_assembly(out_asm, output_file, clobber):
def write_assembly(fai, out_asm, output_file, clobber):
if output_file:
out_fmt = format_from_file_extn(output_file, "TPF")
mode = "b" if out_fmt == "FASTA" else ""
if out_asm.name != output_file.stem:
output_file = output_file.with_stem(out_asm.name)
out_fh = get_output_filehandle(output_file, clobber)
out_fh = get_output_filehandle(output_file, clobber, mode)
else:
out_fmt = "STR"
out_fh = sys.stdout
Expand All @@ -217,14 +218,19 @@ def write_assembly(out_asm, output_file, clobber):
format_tpf(out_asm, out_fh)
elif out_fmt == "AGP":
format_agp(out_asm, out_fh)
elif out_fmt == "FASTA":
stream = FastaStream(out_fh, fai)
stream.write_assembly(out_asm)

# Save a .agp file alongside the .fa / .fasta
output_agp = output_file.with_suffix(".agp")
agp_fh = get_output_filehandle(output_agp, clobber)
format_agp(out_asm, agp_fh)

elif out_fmt == "STR":
out_fh.write("\n")
out_fh.write(str(out_asm))

if out_fmt != "STR":
op = "Overwrote" if clobber else "Created"
click.echo(f"{op} file '{output_file}'", err=True)


def write_chr_csv_files(output_file, stats, out_assemblies, clobber):
for asm_key, asm in out_assemblies.items():
Expand All @@ -235,8 +241,6 @@ def write_chr_csv_files(output_file, stats, out_assemblies, clobber):
with get_output_filehandle(csv_file, clobber) as csv_fh:
for cn_list in chr_names:
csv_fh.write(",".join(cn_list) + "\n")
op = "Overwrote" if clobber else "Created"
click.echo(f"{op} file '{csv_file}'", err=True)


def write_info_yaml(output_file, stats, out_assemblies, clobber):
Expand All @@ -259,22 +263,27 @@ def write_info_yaml(output_file, stats, out_assemblies, clobber):
click.echo(f"{op} file '{yaml_file}'", err=True)


def get_output_filehandle(path, clobber):
def get_output_filehandle(path, clobber, mode=""):
op = "Overwrote" if path.exists() else "Created"
try:
out_fh = path.open("w" if clobber else "x")
out_fh = path.open("w" + mode if clobber else "x" + mode)
except FileExistsError:
click.echo(f"Error: output file '{path}' already exists", err=True)
sys.exit(1)
click.echo(f"{op} file '{path}'", err=True)
return out_fh


def parse_assembly_file(path, default_format=None):
fmt = format_from_file_extn(path, default_format)
path_fh = path.open("r")
if fmt == "AGP":
return parse_agp(path_fh, path.stem)
return parse_agp(path.open(), path.stem), None
elif fmt == "TPF":
return parse_tpf(path_fh, path.stem)
return parse_tpf(path.open(), path.stem), None
elif fmt == "FASTA":
fai = FastaIndex(path)
fai.auto_load()
return fai.assembly, fai
else:
msg = f"Unknown assembly file format '{fmt}'"
raise ValueError(msg)
Expand Down

0 comments on commit 2f68fba

Please sign in to comment.