Skip to content

Commit

Permalink
Merge pull request #13 from jeromekelleher/compare-other-libs
Browse files Browse the repository at this point in the history
Compare to other libs
  • Loading branch information
jeromekelleher authored Aug 27, 2021
2 parents 6a953d7 + c847eba commit ed7ad86
Show file tree
Hide file tree
Showing 3 changed files with 155 additions and 1 deletion.
46 changes: 46 additions & 0 deletions tree_performance/R_implementations.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# install.packages("reticulate")
# install.packages("phangorn")
library(reticulate)
library(phangorn)

# The num_sites argument here should be the same number of sites in the FASTA.
# I can't see now to get the number of actual sites out of phyDat - it only
# seems to care about the unique sites.
benchmark_phangorn <- function(ts_path, num_sites, tree_path, fasta_path) {

tree <- read.tree(tree_path)
# print("read tree")
data <- read.phyDat(fasta_path, format="fasta", type="DNA")
# print("data")

num_distinct_sites <- attr(data, "nr")

# This method is fast, but seems to use a *lot* of memory. Won't
# run for 10^6 samples
before <- proc.time()
score <- sankoff(tree, data)
duration <- proc.time() - before

cat("phangorn", duration[1] / num_distinct_sites, "\n")

# Check the parsimony result against tskit
tskit <- reticulate::import("tskit")
ts <- tskit$load(ts_path)
variants <- ts$variants()
tsk_tree <- ts$first()

py_score <- 0
for (j in 1:num_sites) {
var <- iter_next(variants)
ret <- tsk_tree$map_mutations(var$genotypes, var$alleles)
py_score <- py_score + length(ret[[2]])
}
if (py_score != score) {
stop("mismatch score")
}

}

args = commandArgs(trailingOnly=TRUE)

benchmark_phangorn(args[1], as.integer(args[2]), args[3], args[4])
109 changes: 108 additions & 1 deletion tree_performance/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,16 @@
import itertools
import functools
import subprocess
import io
import textwrap

import pandas as pd
import numpy as np
import click
import msprime
import tskit
import numba
import Bio.Phylo.TreeConstruction

import pythran_implementation

Expand Down Expand Up @@ -120,6 +123,45 @@ def benchmark_python(ts, func, implementation, max_sites=1000):
]



def benchmark_biopython(ts_path, max_sites=1000):
ts = tskit.load(ts_path)
assert ts.num_sites >= max_sites
variants = itertools.islice(ts.variants(), max_sites)
times = np.zeros(max_sites)

tree = ts.first()
bp_tree = Bio.Phylo.read(io.StringIO(tree.newick()), "newick")
ps = Bio.Phylo.TreeConstruction.ParsimonyScorer()

with click.progressbar(
variants, length=max_sites, label=f"BioPython:n={ts.num_samples}"
) as bar:
for j, variant in enumerate(bar):
records = [
Bio.SeqRecord.SeqRecord(
Bio.Seq.Seq(str(variant.genotypes[k])), id=str(k + 1)
)
for k in range(ts.num_samples)
]
alignment = Bio.Align.MultipleSeqAlignment(records)

before = time.perf_counter()
bp_score = ps.get_score(bp_tree, alignment)
duration = time.perf_counter() - before
times[j] = duration
_, mutations = tree.map_mutations(variant.genotypes, variant.alleles)
assert bp_score == len(mutations)
return [
{
"implementation": f"BioPython",
"sample_size": ts.num_samples,
"time_mean": np.mean(times),
"time_var": np.var(times),
}
]


def benchmark_tskit(ts_path, max_sites):
ts = tskit.load(ts_path)
assert ts.num_trees == 1
Expand Down Expand Up @@ -190,6 +232,7 @@ def benchmark_c_library(ts_path, max_sites):
def benchmark_cpp_library(ts_path, max_sites):
return benchmark_external(f"./cpp_implementation", ts_path, max_sites)


def warmup_jit():
ts = msprime.sim_ancestry(100, sequence_length=100000, random_seed=43)
numba_hartigan_parsimony(ts.first(), np.zeros(ts.num_samples, dtype=np.int8), ["0"])
Expand Down Expand Up @@ -265,6 +308,32 @@ def to_preorder(ts, verify=False):
return new_ts


def convert_phylo(ts, num_sites, newick_path, fasta_path):

tree = ts.first()
with open(newick_path, "w") as f:
f.write(tree.newick())

H = np.empty((ts.num_samples, num_sites), dtype=np.int8)
for var in ts.variants():
if var.site.id >= num_sites:
break
alleles = np.full(len(var.alleles), 0, dtype=np.int8)
for i, allele in enumerate(var.alleles):
ascii_allele = allele.encode("ascii")
allele_int8 = ord(ascii_allele)
alleles[i] = allele_int8
H[:, var.site.id] = alleles[var.genotypes]

with open(fasta_path, "w") as f:
# Sample are labelled 1,.., n in the newick
for j, h in enumerate(H, start=1):
print(f">{j}", file=f)
sequence = h.tobytes().decode("ascii")
for line in textwrap.wrap(sequence):
print(line, file=f)


@click.command()
def generate_data():
"""
Expand All @@ -277,6 +346,10 @@ def generate_data():
ts.dump(f"data/n_1e{k}.trees")
ts_preorder = to_preorder(ts, verify=k < 6)
ts_preorder.dump(f"data/n_1e{k}_preorder.trees")
if k < 7:
convert_phylo(ts, 1000, f"data/n_1e{k}.nwk", f"data/n_1e{k}.fasta")
else:
break


@click.group()
Expand Down Expand Up @@ -321,21 +394,55 @@ def quickbench(filename):
ts = tskit.load(filename)
assert ts.num_trees == 1
for impl in [
# benchmark_biopython,
benchmark_pythran,
benchmark_numba,
benchmark_tskit,
benchmark_c_library,
]:
m = 1000 if ts.num_samples < 10 ** 6 else 10
m = 100 if ts.num_samples < 10 ** 6 else 10
for datum in impl(filename, max_sites=m):
print(datum["implementation"], datum["time_mean"], sep="\t")
# _hartigan_postorder.inspect_types()
# _hartigan_preorder.inspect_types()


@click.command()
def benchmark_libs():
"""
Runs benchmarks on available libraries.
"""
# warmup_jit()

benchmark_biopython(tskit.load("data/n_1e3.trees"))

benchmark_R("data/n_1e3.trees")

# ts = tskit.load("data/n_1e1.trees")
# print(ts)

# assert ts.num_trees == 1
# for impl in [
# benchmark_pythran,
# benchmark_numba,
# benchmark_tskit,
# benchmark_c_library,
# ]:
# m = 1000 if ts.num_samples < 10 ** 6 else 10
# for datum in impl(filename, max_sites=m):
# print(datum["implementation"], datum["time_mean"], sep="\t")
# _hartigan_postorder.inspect_types()
# _hartigan_preorder.inspect_types()





cli.add_command(generate_data)
cli.add_command(run_benchmarks)
cli.add_command(verify)
cli.add_command(quickbench)
cli.add_command(benchmark_libs)


if __name__ == "__main__":
Expand Down
1 change: 1 addition & 0 deletions tree_performance/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ msprime
matplotlib
numba
pythran
BioPython

0 comments on commit ed7ad86

Please sign in to comment.