Skip to content

Commit

Permalink
formatting fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
Ubuntu committed Feb 26, 2024
1 parent 259f628 commit d3465b6
Show file tree
Hide file tree
Showing 6 changed files with 135 additions and 68 deletions.
2 changes: 1 addition & 1 deletion workflow/rules/tree.smk
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ rule raxmlng:
max_len = int(100*config["LENGTH"]/config["IDENTITY"])
threads: lambda wildcards: int(config['num_threads'])
benchmark:
config["OUT_DIR"]+"/benchmarks/{id}.iqtree.txt"
config["OUT_DIR"]+"/benchmarks/{id}.raxml.txt"
shell:
'''
if [[ `grep -n '>' {input.msa} | wc -l` -gt {params.m} ]] && [[ `awk 'BEGIN{{l=0;n=0;st=0}}{{if (substr($0,1,1) == ">") {{st=1}} else {{st=2}}; if(st==1) {{n+=1}} else if(st==2) {{l+=length($0)}}}} END{{if (n>0) {{print int((l+n-1)/n)}} else {{print 0}} }}' {input.msa}` -lt {params.max_len} ]]
Expand Down
96 changes: 57 additions & 39 deletions workflow/scripts/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,77 +2,95 @@
import matplotlib.pyplot as plt
import glob, os, sys
import pandas as pd
#benchmark file path
path = sys.argv[1]
out_dir = sys.argv[2]
import argparse

# benchmark file path
parser = argparse.ArgumentParser(
description="Process benchmark files and plot results."
)
parser.add_argument("--path", help="Directory containing benchmark files.")
parser.add_argument("--out_dir", help="Output directory for plots and CSV files.")
args = parser.parse_args()

path = args.path
out_dir = args.out_dir

sampling = []
alignment = []
msa = []
tree_build = []
astral = []
step_names = ["sampling","lastz","pasta","iqtree"]
steps = [sampling,alignment,msa,tree_build]
extensions = ["*.sample.txt","*.lastz.txt","*.pasta.txt","*.iqtree.txt"]
step_names = ["sampling", "lastz", "pasta", "raxml"]
steps = [sampling, alignment, msa, tree_build]
extensions = ["*.sample.txt", "*.lastz.txt", "*.pasta.txt", "*.raxml.txt"]

for i in range(len(extensions)):
for filename in glob.glob(os.path.join(path, extensions[i])):
with open(os.path.join(os.getcwd(), filename), "r") as f:
lines = f.readlines()
s = lines[1].strip().split()
seconds = float(s[0])/60
s2 = filename.split('/')
fn = s2[len(s2)-1].replace(extensions[i][1:],'')
cpu_time = float(s[len(s)-1])/60
steps[i].append((fn,seconds,cpu_time))
step_totals= []
cpu_totals= []
seconds = float(s[0]) / 60
s2 = filename.split("/")
fn = s2[len(s2) - 1].replace(extensions[i][1:], "")
cpu_time = float(s[len(s) - 1]) / 60
steps[i].append((fn, seconds, cpu_time))

step_totals = []
cpu_totals = []

for i in range(len(steps)):
sec_total = 0
cpu_total = 0
for j in range(len(steps[i])):
sec_total += steps[i][j][1]
cpu_total += steps[i][j][2]
step_totals.append((step_names[i],sec_total,cpu_total))
step_totals.append((step_names[i], sec_total, cpu_total))

tops = []

for i in range(len(steps)):
if len(steps[i]) > 50:
top = sorted(steps[i],key= lambda x: x[1],reverse=True)[:50]
top = sorted(steps[i], key=lambda x: x[1], reverse=True)[:50]
print(top)
tops.append(top)
else:
tops.append(steps[i])
with open(out_dir+"/step_avg.csv",'w') as w:

with open(out_dir + "/step_avg.csv", "w") as w:
for s in step_totals:
w.write(s[0]+','+str(s[1])+','+str(s[2])+'\n')
#with open("astral.txt",'r') as f:
#lines = f.readlines()
# s = lines[1].strip().split()
#seconds = float(s[0])
# cpu = float(s[len(s)-1])
# step_totals.append(("ASTRAL",seconds,cpu))
print(step_totals)
w.write(s[0] + "," + str(s[1]) + "," + str(s[2]) + "\n")
# with open("astral.txt",'r') as f:
# lines = f.readlines()
# s = lines[1].strip().split()
# seconds = float(s[0])
# cpu = float(s[len(s)-1])
# step_totals.append(("ASTRAL",seconds,cpu))

for i in range(len(tops)):
print(step_names[i])
fig, ax =plt.subplots(figsize=(20,10))
df = pd.DataFrame({
'job' :[x[0] for x in tops[i]],
'minutes' : [x[1] for x in tops[i]],
'cpu-time' : [x[2] for x in tops[i]]
})
fig, ax = plt.subplots(figsize=(20, 10))
df = pd.DataFrame(
{
"job": [x[0] for x in tops[i]],
"minutes": [x[1] for x in tops[i]],
"cpu-time": [x[2] for x in tops[i]],
}
)
print(df.head())
m = df.melt(id_vars='job')
m = df.melt(id_vars="job")
print(m.head())
sns.barplot(data = m, x = 'job',y= 'value', hue='variable',ax=ax)
sns.barplot(data=m, x="job", y="value", hue="variable", ax=ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="right")
ax.set_title(step_names[i])
plt.tight_layout()
fig = ax.get_figure()
fig.savefig(out_dir+'/'+step_names[i]+'.png')
fig, ax =plt.subplots(figsize=(20,10))
print(step_names)
print(step_totals)
ax = sns.barplot(x=[k[0] for k in step_totals],y=[k[1] for k in step_totals])
fig.savefig(out_dir + "/" + step_names[i] + ".png")

fig, ax = plt.subplots(figsize=(20, 10))

ax = sns.barplot(x=[k[0] for k in step_totals], y=[k[1] for k in step_totals])
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="right")
ax.set_title(step_names[i])

plt.tight_layout()
fig = ax.get_figure()
fig.savefig(out_dir+'/step_totals.png')
fig.savefig(out_dir + "/step_totals.png")
67 changes: 49 additions & 18 deletions workflow/scripts/converge.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,7 @@
from reroot import rerootTree
import yaml
from pathlib import Path
import multiprocessing
from multiprocessing import Pool
import time
import subprocess
import math
import csv

Expand All @@ -26,29 +23,34 @@ def comp_tree(t1, t2):
d = t1.compare(t2)
return d["norm_rf"]


# Function to update the configuration file
def update_config(config_path, iteration, base_gene_count):
def update_config(config_path, base_gene_count):
with open(config_path) as file:
config = yaml.load(file, Loader=yaml.FullLoader)

# Update GENE_COUNT based on the iteration number
config['GENE_COUNT'] = base_gene_count * 2
config["GENE_COUNT"] = base_gene_count * 2

# Save the updated configuration
with open(config_path, 'w') as file:
with open(config_path, "w") as file:
yaml.dump(config, file)


# Function to read the initial GENE_COUNT from the config file
def read_initial_gene_count(config_path):
with open(config_path) as file:
config = yaml.load(file, Loader=yaml.FullLoader)
return config['GENE_COUNT']
return config["GENE_COUNT"]


# function to run snakemake with settings and add to run folder
def run_snakemake(cores, mode, out_dir, run, roadies_dir, config_path, fixed_parallel_instances):
def run_snakemake(
cores, mode, out_dir, run, roadies_dir, config_path, fixed_parallel_instances
):

# Set threads per instance dynamically
num_threads = cores/fixed_parallel_instances
num_threads = cores // fixed_parallel_instances

cmd = [
"snakemake",
Expand Down Expand Up @@ -101,7 +103,18 @@ def combine_iter(out_dir, run, cores):


# function for convergence run
def converge_run(iteration, cores, mode, out_dir, ref_exist, roadies_dir, support_thr, config_path, fixed_parallel_instances):
def converge_run(
iteration,
cores,
mode,
out_dir,
ref_exist,
ref,
roadies_dir,
support_thr,
config_path,
fixed_parallel_instances,
):
os.system("rm -r {0}".format(roadies_dir))
os.system("mkdir {0}".format(roadies_dir))
run = "iteration_"
Expand All @@ -112,9 +125,13 @@ def converge_run(iteration, cores, mode, out_dir, ref_exist, roadies_dir, suppor
run += str(iteration)
# run snakemake with specified gene number and length
if iteration >= 2:
base_gene_count = read_initial_gene_count(config_path) # Read initial GENE_COUNT value
update_config(config_path, iteration, base_gene_count)
run_snakemake(cores, mode, out_dir, run, roadies_dir, config_path, fixed_parallel_instances)
base_gene_count = read_initial_gene_count(
config_path
) # Read initial GENE_COUNT value
update_config(config_path, base_gene_count)
run_snakemake(
cores, mode, out_dir, run, roadies_dir, config_path, fixed_parallel_instances
)
# merging gene trees and mapping files
gene_trees = combine_iter(out_dir, run, cores)
t = Tree(out_dir + "/" + run + ".nwk")
Expand Down Expand Up @@ -202,7 +219,16 @@ def converge_run(iteration, cores, mode, out_dir, ref_exist, roadies_dir, suppor
t_out.write("Start time: " + str(start_time_l) + "\n")
while True:
percent_high_support, num_gt, outputtree = converge_run(
iteration, CORES, MODE, out_dir, ref_exist, roadies_dir, support_thr, config_path, fixed_parallel_instances
iteration,
CORES,
MODE,
out_dir,
ref_exist,
ref,
roadies_dir,
support_thr,
config_path,
fixed_parallel_instances,
)
curr_time = time.time()
curr_time_l = time.asctime(time.localtime(time.time()))
Expand Down Expand Up @@ -233,7 +259,12 @@ def converge_run(iteration, cores, mode, out_dir, ref_exist, roadies_dir, suppor
)

iteration += 1
# if iteration == num_itrs:
# break
if ((iteration == 1) and (percent_high_support == 100)) or ((iteration >= 2) and ((percent_high_support - high_support_list[iteration-2] < 1) or (percent_high_support == 100) or (iteration == 9))):
if ((iteration == 1) and (percent_high_support == 100)) or (
(iteration >= 2)
and (
(percent_high_support - high_support_list[iteration - 2] < 1)
or (percent_high_support == 100)
or (iteration == 9)
)
):
break
3 changes: 1 addition & 2 deletions workflow/scripts/lastz2fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
import matplotlib.pyplot as plt
from collections import OrderedDict
from Bio.Seq import Seq
import re

# get arguments
parser = argparse.ArgumentParser(
Expand Down Expand Up @@ -230,7 +229,7 @@
with open(filename, "w") as w:
w.write("")
sorted(gene_dup)
# output duplicity as csv]
# output duplicity as csv
with open(statdir + "/gene_dup.csv", "w") as f:
for i in range(len(gene_dup)):
f.write(str(gene_dup[i][0]) + "," + str(gene_dup[i][1]) + "\n")
Expand Down
31 changes: 23 additions & 8 deletions workflow/scripts/noconverge.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,7 @@
from reroot import rerootTree
import yaml
from pathlib import Path
import multiprocessing
from multiprocessing import Pool
import time
import subprocess
import math


Expand All @@ -29,7 +26,7 @@ def comp_tree(t1, t2):
def run_snakemake(cores, mode, config_path, fixed_parallel_instances):

# Set threads per instance dynamically
num_threads = cores/fixed_parallel_instances
num_threads = cores // fixed_parallel_instances

cmd = [
"snakemake",
Expand All @@ -52,18 +49,27 @@ def run_snakemake(cores, mode, config_path, fixed_parallel_instances):


# function for convergence run
def converge_run(cores, mode, ref_exist, trees, roadies_dir, config_path, fixed_parallel_instances):
def converge_run(
cores,
mode,
ref_exist,
ref,
trees,
roadies_dir,
config_path,
fixed_parallel_instances,
):
# run snakemake with specified gene number and length
run_snakemake(cores, mode, config_path, fixed_parallel_instances)
# merging gene trees and mapping files
os.system(
"ASTER-Linux/bin/astral-pro -t {1} -i {0}/genetrees/gene_tree_merged.nwk -o {0}/roadies.nwk -a {0}/genes/mapping.txt".format(
roadies_dir,cores
roadies_dir, cores
)
)
os.system(
"ASTER-Linux/bin/astral-pro -t {1} -u 3 -i {0}/genetrees/gene_tree_merged.nwk -o {0}/roadies_stats.nwk -a {0}/genes/mapping.txt".format(
roadies_dir,cores
roadies_dir, cores
)
)
gt = open(roadies_dir + "/genetrees/gene_tree_merged.nwk", "r")
Expand Down Expand Up @@ -131,7 +137,16 @@ def converge_run(cores, mode, ref_exist, trees, roadies_dir, config_path, fixed_
time_stamps.append(start_time)
with open(roadies_dir + "/time_stamps.csv", "a") as t_out:
t_out.write("Start time: " + str(start_time_l) + "\n")
num_gt = converge_run(CORES, MODE, ref_exist, trees, roadies_dir, config_path, fixed_parallel_instances)
num_gt = converge_run(
CORES,
MODE,
ref_exist,
ref,
trees,
roadies_dir,
config_path,
fixed_parallel_instances,
)
curr_time = time.time()
curr_time_l = time.asctime(time.localtime(time.time()))
to_previous = curr_time - time_stamps[len(time_stamps) - 1]
Expand Down
4 changes: 4 additions & 0 deletions workflow/scripts/sequence_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
output = sys.argv[2]
plotdir = sys.argv[3]
species_count = {}

with open(output, "wb") as outfile:
for filename in glob.glob(os.path.join(directory, "*.fa")):
if filename == output or "temp" not in filename:
Expand All @@ -25,11 +26,14 @@
species_count[name] = num
with open(filename, "rb") as readfile:
shutil.copyfileobj(readfile, outfile)

x = list(species_count.keys())
y = list(species_count.values())

ax = sns.barplot(x=x, y=y)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="right")
ax.set_title("Number of Reference Genes from Each Genome")

plt.tight_layout()
fig = ax.get_figure()
fig.savefig(plotdir)

0 comments on commit d3465b6

Please sign in to comment.