Merge pull request #16 from TurakhiaLab/converge_latest
Converge latest
ang037 authored Feb 26, 2024
2 parents 96cf180 + d3465b6 commit da62afa
Showing 11 changed files with 189 additions and 89 deletions.
8 changes: 4 additions & 4 deletions config/config.yaml
@@ -3,7 +3,7 @@
 #Path for input genomes
 GENOMES: "/datasets/gzipped_birds_new"
 #Reference tree (optional, default set as null)
-REFERENCE: reference_trees/birds_reftree_363_very_latest.nwk
+REFERENCE: NULL
 #Length of each of the genes
 LENGTH: 500
 #Number of genes per iteration
@@ -30,6 +30,6 @@ STEPS: 1
 FILTERFRAGMENTS: 0.50
 MASKSITES: 0.02
 #Threshold for high support (only used in --converge option)
-SUPPORT_THRESHOLD: 0.7
-#Number of iterations (only used in --converge option)
-ITERATIONS: 3
+SUPPORT_THRESHOLD: 0.95
+#Number of instances to run in parallel
+NUM_INSTANCES: 4
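
For a quick experiment, these converge-related settings can also be overridden at invocation time instead of editing config.yaml. A minimal sketch, assuming Snakemake is called directly (the repository's own driver script may pass configuration differently); the values are illustrative:

# Override individual config keys for one run without touching config.yaml.
snakemake --cores 16 --config SUPPORT_THRESHOLD=0.95 NUM_INSTANCES=4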
7 changes: 4 additions & 3 deletions workflow/rules/align.smk
@@ -10,7 +10,7 @@ rule lastz:
         config["OUT_DIR"]+"/alignments/{sample}.maf"
     benchmark:
         config["OUT_DIR"]+"/benchmarks/{sample}.lastz.txt"
-    threads: 2
+    threads: lambda wildcards: int(config['num_threads'])
     params:
         species = "{sample}",
         identity = config['IDENTITY'],
@@ -64,7 +64,7 @@ rule pasta:
         suffix = "fa.aln"
     benchmark:
         config["OUT_DIR"]+"/benchmarks/{id}.pasta.txt"
-    threads: 4
+    threads: lambda wildcards: int(config['num_threads'])
     conda:
         "../envs/msa.yaml"
     shell:
@@ -77,6 +77,7 @@ pasta:
         all_matched=true
         while IFS= read -r line; do
+            line=$(echo "$line" | tr '[:lower:]' '[:upper:]')
             if [[ "$line" != ">"* ]]; then
                 if [ -z "$reference" ]; then
                     reference="$line"
@@ -90,7 +91,7 @@ pasta:
         if [ "$all_matched" = true ]; then
             cp "$input_file" "$output_file"
         else
-            python pasta/run_pasta.py -i {input} -j {params.prefix} --alignment-suffix={params.suffix} --num-cpus 4
+            python pasta/run_pasta.py -i {input} -j {params.prefix} --alignment-suffix={params.suffix} --num-cpus {threads}
         fi
         fi
         touch {output}
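Both rules now read their thread count from the num_threads config key, so one setting controls per-rule parallelism across the workflow. A minimal sketch of supplying it on the command line, assuming Snakemake is invoked directly (the repository may wrap this in its own run script); the Snakefile path and numbers are illustrative:

# --cores caps the total cores Snakemake may use; each lastz/pasta job requests num_threads of them.
snakemake --snakefile workflow/Snakefile --use-conda --cores 32 --config num_threads=4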
1 change: 0 additions & 1 deletion workflow/rules/fasttree.smk
@@ -7,7 +7,6 @@ rule fasttree:
     params:
         m = config["MIN_ALIGN"],
         max_len = int(100*config["LENGTH"]/config["IDENTITY"])
-    threads: 1
     benchmark:
         config["OUT_DIR"]+"/benchmarks/{id}.fasttree.txt"
     shell:
8 changes: 4 additions & 4 deletions workflow/rules/mashtree.smk
@@ -39,9 +39,9 @@ rule sequence_select:
         THRES=config["UPPER_CASE"]
     benchmark:
         config["OUT_DIR"]+"/benchmarks/{sample}.sample.txt"
+    threads: lambda wildcards: int(config['num_threads'])
     output:
         config["OUT_DIR"]+"/samples/{sample}_temp.fa"
-    threads:workflow.cores
     shell:
         '''
         echo "We are starting to sample {input}"
@@ -72,7 +72,7 @@ rule lastz:
         config["OUT_DIR"]+"/alignments/{sample}.maf"
     benchmark:
         config["OUT_DIR"]+"/benchmarks/{sample}.lastz.txt"
-    threads: 2
+    threads: lambda wildcards: int(config['num_threads'])
     params:
         species = "{sample}",
         identity = config['IDENTITY'],
@@ -127,12 +127,12 @@ rule mashtree:
         gene_dir = config["OUT_DIR"] + "/genes/gene_{id}"
     benchmark:
         config["OUT_DIR"]+"/benchmarks/{id}.mashtree.txt"
-    threads: 8
+    threads: lambda wildcards: int(config['num_threads'])
     shell:
         '''
         if [[ `grep -n '>' {input} | wc -l` -gt {params.m} ]] || [[ `awk 'BEGIN{{l=0;n=0;st=0}}{{if (substr($0,1,1) == ">") {{st=1}} else {{st=2}}; if(st==1) {{n+=1}} else if(st==2) {{l+=length($0)}}}} END{{if (n>0) {{print int((l+n-1)/n)}} else {{print 0}} }}' {input}` -gt {params.max_len} ]]
         then
-            mashtree --mindepth 0 --numcpus 8 --kmerlength 10 {params.gene_dir}/*.fa > {output}
+            mashtree --mindepth 0 --numcpus {threads} --kmerlength 10 {params.gene_dir}/*.fa > {output}
         fi
         touch {output}
         '''
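The guard above runs mashtree only when a gene family has more than MIN_ALIGN sequences or its average sequence length exceeds max_len; the awk one-liner (shown with Snakemake's doubled braces) computes the rounded-up average sequence length of a FASTA. Stand-alone, with single braces and an illustrative input file, the two checks look like this:

# Number of sequences in the FASTA (counts '>' header lines).
grep -n '>' gene_0.fa | wc -l
# Average sequence length, rounded up: sums residue-line lengths and divides by the header count.
awk 'BEGIN{l=0;n=0;st=0}{if (substr($0,1,1) == ">") {st=1} else {st=2}; if(st==1) {n+=1} else if(st==2) {l+=length($0)}} END{if (n>0) {print int((l+n-1)/n)} else {print 0}}' gene_0.fa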
2 changes: 1 addition & 1 deletion workflow/rules/sampling.smk
@@ -37,9 +37,9 @@ rule sequence_select:
         THRES=config["UPPER_CASE"]
     benchmark:
         config["OUT_DIR"]+"/benchmarks/{sample}.sample.txt"
+    threads: lambda wildcards: int(config['num_threads'])
     output:
         config["OUT_DIR"]+"/samples/{sample}_temp.fa"
-    threads:32
     shell:
         '''
         echo "We are starting to sample {input}"
6 changes: 3 additions & 3 deletions workflow/rules/tree.smk
@@ -7,14 +7,14 @@ rule raxmlng:
     params:
         m = config["MIN_ALIGN"],
         max_len = int(100*config["LENGTH"]/config["IDENTITY"])
-    threads: 4
+    threads: lambda wildcards: int(config['num_threads'])
     benchmark:
-        config["OUT_DIR"]+"/benchmarks/{id}.iqtree.txt"
+        config["OUT_DIR"]+"/benchmarks/{id}.raxml.txt"
     shell:
         '''
         if [[ `grep -n '>' {input.msa} | wc -l` -gt {params.m} ]] && [[ `awk 'BEGIN{{l=0;n=0;st=0}}{{if (substr($0,1,1) == ">") {{st=1}} else {{st=2}}; if(st==1) {{n+=1}} else if(st==2) {{l+=length($0)}}}} END{{if (n>0) {{print int((l+n-1)/n)}} else {{print 0}} }}' {input.msa}` -lt {params.max_len} ]]
         then
-            raxml-ng --msa {input.msa} --model GTR+G+F --redo --threads auto{{4}}
+            raxml-ng --msa {input.msa} --model GTR+G+F --redo --threads auto{{threads}} --blopt nr_safe
         else
            touch {output.gene_tree}
         fi
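For reference, raxml-ng's --threads auto{N} auto-detects the number of worker threads but caps it at N, and --blopt nr_safe selects the slower, numerically safer branch-length optimization mode; in a Snakemake shell block, doubled braces are emitted literally, so auto{{4}} reaches the command line as auto{4}. A sketch of the expanded command, with an illustrative alignment path and thread cap:

# Gene-tree inference under GTR+G+F with at most 8 threads; nr_safe trades speed for numerical stability.
raxml-ng --msa gene_0.fa.aln --model GTR+G+F --redo --threads auto{8} --blopt nr_safe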
96 changes: 57 additions & 39 deletions workflow/scripts/benchmark.py
@@ -2,77 +2,95 @@
 import matplotlib.pyplot as plt
 import glob, os, sys
 import pandas as pd
-#benchmark file path
-path = sys.argv[1]
-out_dir = sys.argv[2]
+import argparse
+
+# benchmark file path
+parser = argparse.ArgumentParser(
+    description="Process benchmark files and plot results."
+)
+parser.add_argument("--path", help="Directory containing benchmark files.")
+parser.add_argument("--out_dir", help="Output directory for plots and CSV files.")
+args = parser.parse_args()
+
+path = args.path
+out_dir = args.out_dir
+
 sampling = []
 alignment = []
 msa = []
 tree_build = []
 astral = []
-step_names = ["sampling","lastz","pasta","iqtree"]
-steps = [sampling,alignment,msa,tree_build]
-extensions = ["*.sample.txt","*.lastz.txt","*.pasta.txt","*.iqtree.txt"]
+step_names = ["sampling", "lastz", "pasta", "raxml"]
+steps = [sampling, alignment, msa, tree_build]
+extensions = ["*.sample.txt", "*.lastz.txt", "*.pasta.txt", "*.raxml.txt"]
+
 for i in range(len(extensions)):
     for filename in glob.glob(os.path.join(path, extensions[i])):
         with open(os.path.join(os.getcwd(), filename), "r") as f:
             lines = f.readlines()
             s = lines[1].strip().split()
-            seconds = float(s[0])/60
-            s2 = filename.split('/')
-            fn = s2[len(s2)-1].replace(extensions[i][1:],'')
-            cpu_time = float(s[len(s)-1])/60
-            steps[i].append((fn,seconds,cpu_time))
-step_totals= []
-cpu_totals= []
+            seconds = float(s[0]) / 60
+            s2 = filename.split("/")
+            fn = s2[len(s2) - 1].replace(extensions[i][1:], "")
+            cpu_time = float(s[len(s) - 1]) / 60
+            steps[i].append((fn, seconds, cpu_time))
+
+step_totals = []
+cpu_totals = []
+
 for i in range(len(steps)):
     sec_total = 0
     cpu_total = 0
     for j in range(len(steps[i])):
         sec_total += steps[i][j][1]
         cpu_total += steps[i][j][2]
-    step_totals.append((step_names[i],sec_total,cpu_total))
+    step_totals.append((step_names[i], sec_total, cpu_total))
+
 tops = []
+
 for i in range(len(steps)):
     if len(steps[i]) > 50:
-        top = sorted(steps[i],key= lambda x: x[1],reverse=True)[:50]
+        top = sorted(steps[i], key=lambda x: x[1], reverse=True)[:50]
         print(top)
         tops.append(top)
     else:
         tops.append(steps[i])
-with open(out_dir+"/step_avg.csv",'w') as w:
+
+with open(out_dir + "/step_avg.csv", "w") as w:
     for s in step_totals:
-        w.write(s[0]+','+str(s[1])+','+str(s[2])+'\n')
-#with open("astral.txt",'r') as f:
-#lines = f.readlines()
-# s = lines[1].strip().split()
-#seconds = float(s[0])
-# cpu = float(s[len(s)-1])
-# step_totals.append(("ASTRAL",seconds,cpu))
-print(step_totals)
+        w.write(s[0] + "," + str(s[1]) + "," + str(s[2]) + "\n")
+# with open("astral.txt",'r') as f:
+# lines = f.readlines()
+# s = lines[1].strip().split()
+# seconds = float(s[0])
+# cpu = float(s[len(s)-1])
+# step_totals.append(("ASTRAL",seconds,cpu))
+
 for i in range(len(tops)):
     print(step_names[i])
-    fig, ax =plt.subplots(figsize=(20,10))
-    df = pd.DataFrame({
-        'job' :[x[0] for x in tops[i]],
-        'minutes' : [x[1] for x in tops[i]],
-        'cpu-time' : [x[2] for x in tops[i]]
-    })
+    fig, ax = plt.subplots(figsize=(20, 10))
+    df = pd.DataFrame(
+        {
+            "job": [x[0] for x in tops[i]],
+            "minutes": [x[1] for x in tops[i]],
+            "cpu-time": [x[2] for x in tops[i]],
+        }
+    )
     print(df.head())
-    m = df.melt(id_vars='job')
+    m = df.melt(id_vars="job")
     print(m.head())
-    sns.barplot(data = m, x = 'job',y= 'value', hue='variable',ax=ax)
+    sns.barplot(data=m, x="job", y="value", hue="variable", ax=ax)
     ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="right")
     ax.set_title(step_names[i])
     plt.tight_layout()
     fig = ax.get_figure()
-    fig.savefig(out_dir+'/'+step_names[i]+'.png')
-fig, ax =plt.subplots(figsize=(20,10))
-print(step_names)
-print(step_totals)
-ax = sns.barplot(x=[k[0] for k in step_totals],y=[k[1] for k in step_totals])
+    fig.savefig(out_dir + "/" + step_names[i] + ".png")
+
+fig, ax = plt.subplots(figsize=(20, 10))
+
+ax = sns.barplot(x=[k[0] for k in step_totals], y=[k[1] for k in step_totals])
 ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="right")
 ax.set_title(step_names[i])
+
 plt.tight_layout()
 fig = ax.get_figure()
-fig.savefig(out_dir+'/step_totals.png')
+fig.savefig(out_dir + "/step_totals.png")
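
With the switch from positional sys.argv arguments to argparse, the script is now called with named flags; a usage sketch with illustrative directory paths:

# --path points at the Snakemake benchmark files; --out_dir receives the PNG plots and step_avg.csv.
python workflow/scripts/benchmark.py --path results/benchmarks --out_dir results/plots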