Skip to content

Commit

Permalink
v0.0.4
Browse files Browse the repository at this point in the history
  • Loading branch information
uym2 committed Apr 17, 2024
1 parent d0d7025 commit 9dc03ec
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 41 deletions.
45 changes: 8 additions & 37 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,37 +1,8 @@
* Version 0.1
* MOSEK license check updated
* Method renamed to LAML
* Trees are ultrametric, --timescale is expected
* Version 0.5b
* Slight change in branch length initialization
* Fixed bug in the Mstep when MOSEK failed
* Allow multiple tree topologies --> handle data with multiple progenitors
* Version 0.4
* Stable version of v0.4b
* Version 0.4b (beta, unstable):
* Increase recursion depth limit
* Fix bug of EM in computing likelihood of trees with polytomies
* Bypass the case where cvxpy cannot solve on prolem instance in Mstep
* Version 0.3:
* Stable version of v0.3p
* Add checkpoints
* Version 0.3p (parallelized, unstable):
* Topology search with simulated annealing
* Paralellize NNI operations
* Version 0.2:
* Topology Search version 1: Simple NNI approach.
- Can take polytomy trees and will restrict topology search to the branches introduced by randomly resolving the polytomies first. v0.2 will note the maximum likelihood resolved tree in this search process, then take this as the start tree and explore topologies on the whole tree.
- Topology Search pipeline:
1. Pick a branch according to some strategy.
- Strategies:
Each of these strategies produces a scoring for all branches, and then each branch attempt is selected according to the strategy. A branch attempt is successful if the likelihood (without recomputing branch lengths) is higher than the current likelihood. If a branch attempt is unsuccessful, then we draw a second branch attempt without replacement. This ends when no more branches are available.
- random: will pick a random branch from the set of allowed branches
- vanilla : scored according to comparing the az-partition tags. the should change strategy takes the max of $(d_{ab}, d_{ac})$
- shouldchange: scored according to comparing the az-partition tags. the should change strategy takes the max of $(d_{ab} - d_{bc}, d_{ac} - d_{bc})$
2. Greedily pick an NNI which improves the tree likelihood around the selected branch.
3. If likelihood improves, accept this branch.
4. Exit condition:
- exit if the improvement between the last round and the current round is smaller than some user-input convergence epsilon parameter AND the current best tree topology has been seen before AND the likelihood has been the same within some threshold K times (where k is defined according to having >95% confidence in picking user-input proportion of bad branches - default 0.2)
- exit if the number of NNI iterations has exceed the max allowed
* Version 0.1:
* Preliminary version: Scipy and EM solvers to optimize branch lengths, nu, and phi given a tree topology.
* LAML version 0.0.4
* handle mismatch between character matrix and prior pickle
* LAML version 0.0.3 (depricated)
* attempt to allow user-specified sampling times
* seems to work for branch length estimation on a fixed topology, but fails on doing topology search
* roll-back to the previous version
* LAML version 0.0.2
* LAML version 0.0.1
3 changes: 1 addition & 2 deletions laml_libs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
# below we define the constants that are shared among all libraries


PROGRAM_NAME = "LAML"
PROGRAM_AUTHOR = ["Uyen Mai","Gillian Chu","Ben Raphael"]
PROGRAM_LICENSE = "GNU General Public License, version 3"
PROGRAM_VERSION = "0.0.2"
PROGRAM_VERSION = "0.0.4"
PROGRAM_YEAR = "2023"
PROGRAM_INSTITUTE = "Computer Science Department, Princeton University"
PROGRAM_DESCRIPTION = "LAML: Lineage Analysis via Maximum Likelihood"
Expand Down
61 changes: 60 additions & 1 deletion laml_libs/sequence_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,22 +151,81 @@ def load_pickle(f):
Q[i] = q
return Q

def read_priors(pfile, site_names=None):
def read_priors(pfile, msa, site_names=None):
file_extension = pfile.strip().split(".")[-1]
if file_extension == "pkl" or file_extension == "pickle": #pickled file
infile = open(pfile, "rb")
priors = pickle.load(infile)
infile.close()
Q = []
priorkeys = sorted(priors.keys())
mapping = dict()

if site_names is not None and priorkeys != sorted([int(x[1:]) for x in site_names]):
print("Prior keys mismatch with site names.")
print("Prior keys:", priorkeys)
print("Site names:", site_names)
print("Attempting to infer mapping between site names and prior keys...")

# check if we have the same number of keys
if len(site_names) == len(priorkeys):
for i, site_name in enumerate(site_names):
mapping[site_name] = priorkeys[i]
print(mapping)
else:
# inferring prior and site_name mapping
# we should have a mapping from every site name to a prior dictionary

# compute offset
site_name_digits = []
for site_name in site_names:
digit_name = ''.join([x for x in site_name if x.isdigit()])
site_name_digits.append(int(digit_name))

# tell the difference between an offset and missing keys in the dictionary
all_site_names_present = True
for i, site_name in enumerate(site_names):
digit_name = site_name_digits[i]
if digit_name not in priors.keys():
all_site_names_present = False

if not all_site_names_present:
print("Not all site names are present in the dictionary. Trying offset...")
offset = min(site_name_digits) - min(priorkeys)
print("Offset between input site names and prior keys is assumed to be", offset)

for i, site_name in enumerate(site_names):
digit_name = site_name_digits[i]
if not all_site_names_present:
prior_name = digit_name - offset
else:
prior_name = digit_name
mapping[site_name] = prior_name

if prior_name in priors.keys():
q = {int(x):priors[prior_name][x] for x in priors[prior_name]}
else:
print(f"Missing priors at site {site_name}, filling in uniform priors...")
# fill in uniform priors at site i
M_i = set(msa[x][i] for x in msa if msa[x][i] not in [0,"?"])
if len(M_i) == 0:
# add pseudo-mutated state
m_i = 1
q={"1":1.0}
else:
m_i = len(M_i)
q = {x:1/m_i for x in M_i}
q[0] = 0
Q.append(q)
print(mapping)
return Q


for i in sorted(priors.keys()):
q = {int(x):priors[i][x] for x in priors[i]}
q[0] = 0
Q.append(q)

elif file_extension == "csv":
#k = len(site_names)
#Q = [{0:0} for i in range(k)]
Expand Down
2 changes: 1 addition & 1 deletion run_laml.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ def main():
q[0] = 0
Q.append(q)
else:
Q = read_priors(args["priors"], site_names)
Q = read_priors(args["priors"],msa,site_names=site_names)

selected_solver = EM_solver
em_selected = True
Expand Down

0 comments on commit 9dc03ec

Please sign in to comment.