Merge pull request #16 from TurakhiaLab/converge_latest
Converge latest
ang037 authored Feb 26, 2024
2 parents 96cf180 + d3465b6 commit da62afa
Showing 11 changed files with 189 additions and 89 deletions.
8 changes: 4 additions & 4 deletions config/config.yaml
@@ -3,7 +3,7 @@
 #Path for input genomes
 GENOMES: "/datasets/gzipped_birds_new"
 #Reference tree (optional, default set as null)
-REFERENCE: reference_trees/birds_reftree_363_very_latest.nwk
+REFERENCE: NULL
 #Length of each of the genes
 LENGTH: 500
 #Number of genes per iteration
@@ -30,6 +30,6 @@ STEPS: 1
 FILTERFRAGMENTS: 0.50
 MASKSITES: 0.02
 #Threshold for high support (only used in --converge option)
-SUPPORT_THRESHOLD: 0.7
-#Number of iterations (only used in --converge option)
-ITERATIONS: 3
+SUPPORT_THRESHOLD: 0.95
+#Number of instances to run in parallel
+NUM_INSTANCES: 4
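
For a quick experiment, these converge-related settings can also be overridden at invocation time instead of editing config.yaml. A minimal sketch, assuming Snakemake is called directly (the repository's own driver script may pass configuration differently); the values are illustrative:

# Override individual config keys for one run without touching config.yaml.
snakemake --cores 16 --config SUPPORT_THRESHOLD=0.95 NUM_INSTANCES=4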
7 changes: 4 additions & 3 deletions workflow/rules/align.smk
@@ -10,7 +10,7 @@ rule lastz:
         config["OUT_DIR"]+"/alignments/{sample}.maf"
     benchmark:
         config["OUT_DIR"]+"/benchmarks/{sample}.lastz.txt"
-    threads: 2
+    threads: lambda wildcards: int(config['num_threads'])
     params:
         species = "{sample}",
         identity = config['IDENTITY'],
@@ -64,7 +64,7 @@ rule pasta:
         suffix = "fa.aln"
     benchmark:
         config["OUT_DIR"]+"/benchmarks/{id}.pasta.txt"
-    threads: 4
+    threads: lambda wildcards: int(config['num_threads'])
     conda:
         "../envs/msa.yaml"
     shell:
@@ -77,6 +77,7 @@ pasta:
         all_matched=true
         while IFS= read -r line; do
+            line=$(echo "$line" | tr '[:lower:]' '[:upper:]')
             if [[ "$line" != ">"* ]]; then
                 if [ -z "$reference" ]; then
                     reference="$line"
@@ -90,7 +91,7 @@ pasta:
         if [ "$all_matched" = true ]; then
             cp "$input_file" "$output_file"
         else
-            python pasta/run_pasta.py -i {input} -j {params.prefix} --alignment-suffix={params.suffix} --num-cpus 4
+            python pasta/run_pasta.py -i {input} -j {params.prefix} --alignment-suffix={params.suffix} --num-cpus {threads}
         fi
         fi
         touch {output}
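Both rules now read their thread count from the num_threads config key, so one setting controls per-rule parallelism across the workflow. A minimal sketch of supplying it on the command line, assuming Snakemake is invoked directly (the repository may wrap this in its own run script); the Snakefile path and numbers are illustrative:

# --cores caps the total cores Snakemake may use; each lastz/pasta job requests num_threads of them.
snakemake --snakefile workflow/Snakefile --use-conda --cores 32 --config num_threads=4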
1 change: 0 additions & 1 deletion workflow/rules/fasttree.smk
@@ -7,7 +7,6 @@ rule fasttree:
     params:
         m = config["MIN_ALIGN"],
         max_len = int(100*config["LENGTH"]/config["IDENTITY"])
-    threads: 1
     benchmark:
         config["OUT_DIR"]+"/benchmarks/{id}.fasttree.txt"
     shell:
8 changes: 4 additions & 4 deletions workflow/rules/mashtree.smk
@@ -39,9 +39,9 @@ rule sequence_select:
         THRES=config["UPPER_CASE"]
     benchmark:
         config["OUT_DIR"]+"/benchmarks/{sample}.sample.txt"
+    threads: lambda wildcards: int(config['num_threads'])
     output:
         config["OUT_DIR"]+"/samples/{sample}_temp.fa"
-    threads:workflow.cores
     shell:
         '''
         echo "We are starting to sample {input}"
@@ -72,7 +72,7 @@ rule lastz:
         config["OUT_DIR"]+"/alignments/{sample}.maf"
     benchmark:
         config["OUT_DIR"]+"/benchmarks/{sample}.lastz.txt"
-    threads: 2
+    threads: lambda wildcards: int(config['num_threads'])
     params:
         species = "{sample}",
         identity = config['IDENTITY'],
@@ -127,12 +127,12 @@ rule mashtree:
         gene_dir = config["OUT_DIR"] + "/genes/gene_{id}"
     benchmark:
         config["OUT_DIR"]+"/benchmarks/{id}.mashtree.txt"
-    threads: 8
+    threads: lambda wildcards: int(config['num_threads'])
     shell:
         '''
         if [[ `grep -n '>' {input} | wc -l` -gt {params.m} ]] || [[ `awk 'BEGIN{{l=0;n=0;st=0}}{{if (substr($0,1,1) == ">") {{st=1}} else {{st=2}}; if(st==1) {{n+=1}} else if(st==2) {{l+=length($0)}}}} END{{if (n>0) {{print int((l+n-1)/n)}} else {{print 0}} }}' {input}` -gt {params.max_len} ]]
         then
-            mashtree --mindepth 0 --numcpus 8 --kmerlength 10 {params.gene_dir}/*.fa > {output}
+            mashtree --mindepth 0 --numcpus {threads} --kmerlength 10 {params.gene_dir}/*.fa > {output}
         fi
         touch {output}
         '''
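The guard above runs mashtree only when a gene family has more than MIN_ALIGN sequences or its average sequence length exceeds max_len; the awk one-liner (shown with Snakemake's doubled braces) computes the rounded-up average sequence length of a FASTA. Stand-alone, with single braces and an illustrative input file, the two checks look like this:

# Number of sequences in the FASTA (counts '>' header lines).
grep -n '>' gene_0.fa | wc -l
# Average sequence length, rounded up: sums residue-line lengths and divides by the header count.
awk 'BEGIN{l=0;n=0;st=0}{if (substr($0,1,1) == ">") {st=1} else {st=2}; if(st==1) {n+=1} else if(st==2) {l+=length($0)}} END{if (n>0) {print int((l+n-1)/n)} else {print 0}}' gene_0.fa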
2 changes: 1 addition & 1 deletion workflow/rules/sampling.smk
@@ -37,9 +37,9 @@ rule sequence_select:
         THRES=config["UPPER_CASE"]
     benchmark:
         config["OUT_DIR"]+"/benchmarks/{sample}.sample.txt"
+    threads: lambda wildcards: int(config['num_threads'])
     output:
         config["OUT_DIR"]+"/samples/{sample}_temp.fa"
-    threads:32
     shell:
         '''
         echo "We are starting to sample {input}"
6 changes: 3 additions & 3 deletions workflow/rules/tree.smk
@@ -7,14 +7,14 @@ rule raxmlng:
     params:
         m = config["MIN_ALIGN"],
         max_len = int(100*config["LENGTH"]/config["IDENTITY"])
-    threads: 4
+    threads: lambda wildcards: int(config['num_threads'])
     benchmark:
-        config["OUT_DIR"]+"/benchmarks/{id}.iqtree.txt"
+        config["OUT_DIR"]+"/benchmarks/{id}.raxml.txt"
     shell:
         '''
         if [[ `grep -n '>' {input.msa} | wc -l` -gt {params.m} ]] && [[ `awk 'BEGIN{{l=0;n=0;st=0}}{{if (substr($0,1,1) == ">") {{st=1}} else {{st=2}}; if(st==1) {{n+=1}} else if(st==2) {{l+=length($0)}}}} END{{if (n>0) {{print int((l+n-1)/n)}} else {{print 0}} }}' {input.msa}` -lt {params.max_len} ]]
         then
-            raxml-ng --msa {input.msa} --model GTR+G+F --redo --threads auto{{4}}
+            raxml-ng --msa {input.msa} --model GTR+G+F --redo --threads auto{{threads}} --blopt nr_safe
         else
            touch {output.gene_tree}
         fi
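For reference, raxml-ng's --threads auto{N} auto-detects the number of worker threads but caps it at N, and --blopt nr_safe selects the slower, numerically safer branch-length optimization mode; in a Snakemake shell block, doubled braces are emitted literally, so auto{{4}} reaches the command line as auto{4}. A sketch of the expanded command, with an illustrative alignment path and thread cap:

# Gene-tree inference under GTR+G+F with at most 8 threads; nr_safe trades speed for numerical stability.
raxml-ng --msa gene_0.fa.aln --model GTR+G+F --redo --threads auto{8} --blopt nr_safe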
96 changes: 57 additions & 39 deletions workflow/scripts/benchmark.py
@@ -2,77 +2,95 @@
 import matplotlib.pyplot as plt
 import glob, os, sys
 import pandas as pd
-#benchmark file path
-path = sys.argv[1]
-out_dir = sys.argv[2]
+import argparse
+
+# benchmark file path
+parser = argparse.ArgumentParser(
+    description="Process benchmark files and plot results."
+)
+parser.add_argument("--path", help="Directory containing benchmark files.")
+parser.add_argument("--out_dir", help="Output directory for plots and CSV files.")
+args = parser.parse_args()
+
+path = args.path
+out_dir = args.out_dir
+
 sampling = []
 alignment = []
 msa = []
 tree_build = []
 astral = []
-step_names = ["sampling","lastz","pasta","iqtree"]
-steps = [sampling,alignment,msa,tree_build]
-extensions = ["*.sample.txt","*.lastz.txt","*.pasta.txt","*.iqtree.txt"]
+step_names = ["sampling", "lastz", "pasta", "raxml"]
+steps = [sampling, alignment, msa, tree_build]
+extensions = ["*.sample.txt", "*.lastz.txt", "*.pasta.txt", "*.raxml.txt"]
+
 for i in range(len(extensions)):
     for filename in glob.glob(os.path.join(path, extensions[i])):
         with open(os.path.join(os.getcwd(), filename), "r") as f:
             lines = f.readlines()
             s = lines[1].strip().split()
-            seconds = float(s[0])/60
-            s2 = filename.split('/')
-            fn = s2[len(s2)-1].replace(extensions[i][1:],'')
-            cpu_time = float(s[len(s)-1])/60
-            steps[i].append((fn,seconds,cpu_time))
-step_totals= []
-cpu_totals= []
+            seconds = float(s[0]) / 60
+            s2 = filename.split("/")
+            fn = s2[len(s2) - 1].replace(extensions[i][1:], "")
+            cpu_time = float(s[len(s) - 1]) / 60
+            steps[i].append((fn, seconds, cpu_time))
+
+step_totals = []
+cpu_totals = []
+
 for i in range(len(steps)):
     sec_total = 0
     cpu_total = 0
     for j in range(len(steps[i])):
         sec_total += steps[i][j][1]
         cpu_total += steps[i][j][2]
-    step_totals.append((step_names[i],sec_total,cpu_total))
+    step_totals.append((step_names[i], sec_total, cpu_total))
+
 tops = []
+
 for i in range(len(steps)):
     if len(steps[i]) > 50:
-        top = sorted(steps[i],key= lambda x: x[1],reverse=True)[:50]
+        top = sorted(steps[i], key=lambda x: x[1], reverse=True)[:50]
         print(top)
         tops.append(top)
     else:
         tops.append(steps[i])
-with open(out_dir+"/step_avg.csv",'w') as w:
+
+with open(out_dir + "/step_avg.csv", "w") as w:
     for s in step_totals:
-        w.write(s[0]+','+str(s[1])+','+str(s[2])+'\n')
-#with open("astral.txt",'r') as f:
-#lines = f.readlines()
-# s = lines[1].strip().split()
-#seconds = float(s[0])
-# cpu = float(s[len(s)-1])
-# step_totals.append(("ASTRAL",seconds,cpu))
-print(step_totals)
+        w.write(s[0] + "," + str(s[1]) + "," + str(s[2]) + "\n")
+# with open("astral.txt",'r') as f:
+# lines = f.readlines()
+# s = lines[1].strip().split()
+# seconds = float(s[0])
+# cpu = float(s[len(s)-1])
+# step_totals.append(("ASTRAL",seconds,cpu))
+
 for i in range(len(tops)):
     print(step_names[i])
-    fig, ax =plt.subplots(figsize=(20,10))
-    df = pd.DataFrame({
-        'job' :[x[0] for x in tops[i]],
-        'minutes' : [x[1] for x in tops[i]],
-        'cpu-time' : [x[2] for x in tops[i]]
-    })
+    fig, ax = plt.subplots(figsize=(20, 10))
+    df = pd.DataFrame(
+        {
+            "job": [x[0] for x in tops[i]],
+            "minutes": [x[1] for x in tops[i]],
+            "cpu-time": [x[2] for x in tops[i]],
+        }
+    )
     print(df.head())
-    m = df.melt(id_vars='job')
+    m = df.melt(id_vars="job")
     print(m.head())
-    sns.barplot(data = m, x = 'job',y= 'value', hue='variable',ax=ax)
+    sns.barplot(data=m, x="job", y="value", hue="variable", ax=ax)
     ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="right")
     ax.set_title(step_names[i])
     plt.tight_layout()
     fig = ax.get_figure()
-    fig.savefig(out_dir+'/'+step_names[i]+'.png')
-fig, ax =plt.subplots(figsize=(20,10))
-print(step_names)
-print(step_totals)
-ax = sns.barplot(x=[k[0] for k in step_totals],y=[k[1] for k in step_totals])
+    fig.savefig(out_dir + "/" + step_names[i] + ".png")
+
+fig, ax = plt.subplots(figsize=(20, 10))
+
+ax = sns.barplot(x=[k[0] for k in step_totals], y=[k[1] for k in step_totals])
 ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="right")
 ax.set_title(step_names[i])
+
 plt.tight_layout()
 fig = ax.get_figure()
-fig.savefig(out_dir+'/step_totals.png')
+fig.savefig(out_dir + "/step_totals.png")
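
With the switch from positional sys.argv arguments to argparse, the script is now called with named flags; a usage sketch with illustrative directory paths:

# --path points at the Snakemake benchmark files; --out_dir receives the PNG plots and step_avg.csv.
python workflow/scripts/benchmark.py --path results/benchmarks --out_dir results/plots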