Merge pull request #13 from jeromekelleher/compare-other-libs

Compare to other libs
tskit-dev · Aug 27, 2021 · ed7ad86 · ed7ad86
2 parents 6a953d7 + c847eba
commit ed7ad86
Show file tree

Hide file tree

Showing 3 changed files with 155 additions and 1 deletion.
diff --git a/tree_performance/R_implementations.R b/tree_performance/R_implementations.R
@@ -0,0 +1,46 @@
+# install.packages("reticulate")
+# install.packages("phangorn")
+library(reticulate)
+library(phangorn)
+
+# The num_sites argument here should be the same number of sites in the FASTA.
+# I can't see now to get the number of actual sites out of phyDat - it only
+# seems to care about the unique sites.
+benchmark_phangorn <- function(ts_path, num_sites, tree_path, fasta_path) {
+
+    tree <- read.tree(tree_path)
+    # print("read tree")
+    data <- read.phyDat(fasta_path, format="fasta", type="DNA")
+    # print("data")
+
+    num_distinct_sites <- attr(data, "nr")
+
+    # This method is fast, but seems to use a *lot* of memory. Won't 
+    # run for 10^6 samples
+    before <- proc.time()
+    score <- sankoff(tree, data)
+    duration <- proc.time() - before
+
+    cat("phangorn", duration[1] / num_distinct_sites, "\n")
+
+    # Check the parsimony result against tskit
+    tskit <- reticulate::import("tskit")
+    ts <- tskit$load(ts_path)
+    variants <- ts$variants()
+    tsk_tree <- ts$first()
+
+    py_score <- 0
+    for (j in 1:num_sites) {
+        var <- iter_next(variants)
+        ret <- tsk_tree$map_mutations(var$genotypes, var$alleles)
+        py_score <- py_score + length(ret[[2]])
+    }
+    if (py_score != score) {
+        stop("mismatch score")
+    }
+
+}
+
+args = commandArgs(trailingOnly=TRUE)
+
+benchmark_phangorn(args[1], as.integer(args[2]), args[3], args[4])
diff --git a/tree_performance/benchmark.py b/tree_performance/benchmark.py
@@ -6,13 +6,16 @@
 import itertools
 import functools
 import subprocess
+import io
+import textwrap
 
 import pandas as pd
 import numpy as np
 import click
 import msprime
 import tskit
 import numba
+import Bio.Phylo.TreeConstruction
 
 import pythran_implementation
 
@@ -120,6 +123,45 @@ def benchmark_python(ts, func, implementation, max_sites=1000):
     ]
 
 
+
+def benchmark_biopython(ts_path, max_sites=1000):
+    ts = tskit.load(ts_path)
+    assert ts.num_sites >= max_sites
+    variants = itertools.islice(ts.variants(), max_sites)
+    times = np.zeros(max_sites)
+
+    tree = ts.first()
+    bp_tree = Bio.Phylo.read(io.StringIO(tree.newick()), "newick")
+    ps = Bio.Phylo.TreeConstruction.ParsimonyScorer()
+
+    with click.progressbar(
+        variants, length=max_sites, label=f"BioPython:n={ts.num_samples}"
+    ) as bar:
+        for j, variant in enumerate(bar):
+            records = [
+                Bio.SeqRecord.SeqRecord(
+                    Bio.Seq.Seq(str(variant.genotypes[k])), id=str(k + 1)
+                )
+                for k in range(ts.num_samples)
+            ]
+            alignment = Bio.Align.MultipleSeqAlignment(records)
+
+            before = time.perf_counter()
+            bp_score = ps.get_score(bp_tree, alignment)
+            duration = time.perf_counter() - before
+            times[j] = duration
+            _, mutations = tree.map_mutations(variant.genotypes, variant.alleles)
+            assert bp_score == len(mutations)
+    return [
+        {
+            "implementation": f"BioPython",
+            "sample_size": ts.num_samples,
+            "time_mean": np.mean(times),
+            "time_var": np.var(times),
+        }
+    ]
+
+
 def benchmark_tskit(ts_path, max_sites):
     ts = tskit.load(ts_path)
     assert ts.num_trees == 1
@@ -190,6 +232,7 @@ def benchmark_c_library(ts_path, max_sites):
 def benchmark_cpp_library(ts_path, max_sites):
     return benchmark_external(f"./cpp_implementation", ts_path, max_sites)
 
+
 def warmup_jit():
     ts = msprime.sim_ancestry(100, sequence_length=100000, random_seed=43)
     numba_hartigan_parsimony(ts.first(), np.zeros(ts.num_samples, dtype=np.int8), ["0"])
@@ -265,6 +308,32 @@ def to_preorder(ts, verify=False):
     return new_ts
 
 
+def convert_phylo(ts, num_sites, newick_path, fasta_path):
+
+    tree = ts.first()
+    with open(newick_path, "w") as f:
+        f.write(tree.newick())
+
+    H = np.empty((ts.num_samples, num_sites), dtype=np.int8)
+    for var in ts.variants():
+        if var.site.id >= num_sites:
+            break
+        alleles = np.full(len(var.alleles), 0, dtype=np.int8)
+        for i, allele in enumerate(var.alleles):
+            ascii_allele = allele.encode("ascii")
+            allele_int8 = ord(ascii_allele)
+            alleles[i] = allele_int8
+        H[:, var.site.id] = alleles[var.genotypes]
+
+    with open(fasta_path, "w") as f:
+        # Sample are labelled 1,.., n in the newick
+        for j, h in enumerate(H, start=1):
+            print(f">{j}", file=f)
+            sequence = h.tobytes().decode("ascii")
+            for line in textwrap.wrap(sequence):
+                print(line, file=f)
+
+
 @click.command()
 def generate_data():
     """
@@ -277,6 +346,10 @@ def generate_data():
         ts.dump(f"data/n_1e{k}.trees")
         ts_preorder = to_preorder(ts, verify=k < 6)
         ts_preorder.dump(f"data/n_1e{k}_preorder.trees")
+        if k < 7:
+            convert_phylo(ts, 1000, f"data/n_1e{k}.nwk", f"data/n_1e{k}.fasta")
+        else:
+            break
 
 
 @click.group()
@@ -321,21 +394,55 @@ def quickbench(filename):
     ts = tskit.load(filename)
     assert ts.num_trees == 1
     for impl in [
+        # benchmark_biopython,
         benchmark_pythran,
         benchmark_numba,
         benchmark_tskit,
         benchmark_c_library,
     ]:
-        m = 1000 if ts.num_samples < 10 ** 6 else 10
+        m = 100 if ts.num_samples < 10 ** 6 else 10
         for datum in impl(filename, max_sites=m):
             print(datum["implementation"], datum["time_mean"], sep="\t")
     # _hartigan_postorder.inspect_types()
     # _hartigan_preorder.inspect_types()
 
+
+@click.command()
+def benchmark_libs():
+    """
+    Runs benchmarks on available libraries.
+    """
+    # warmup_jit()
+
+    benchmark_biopython(tskit.load("data/n_1e3.trees"))
+
+    benchmark_R("data/n_1e3.trees")
+
+    # ts = tskit.load("data/n_1e1.trees")
+    # print(ts)
+
+    # assert ts.num_trees == 1
+    # for impl in [
+    #     benchmark_pythran,
+    #     benchmark_numba,
+    #     benchmark_tskit,
+    #     benchmark_c_library,
+    # ]:
+    #     m = 1000 if ts.num_samples < 10 ** 6 else 10
+    #     for datum in impl(filename, max_sites=m):
+    #         print(datum["implementation"], datum["time_mean"], sep="\t")
+    # _hartigan_postorder.inspect_types()
+    # _hartigan_preorder.inspect_types()
+
+
+
+
+
 cli.add_command(generate_data)
 cli.add_command(run_benchmarks)
 cli.add_command(verify)
 cli.add_command(quickbench)
+cli.add_command(benchmark_libs)
 
 
 if __name__ == "__main__":

diff --git a/tree_performance/requirements.txt b/tree_performance/requirements.txt
@@ -5,3 +5,4 @@ msprime
 matplotlib
 numba
 pythran
+BioPython