Skip to content

Commit

Permalink
formatting fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
Ubuntu committed Feb 26, 2024
1 parent 259f628 commit d3465b6
Show file tree
Hide file tree
Showing 6 changed files with 135 additions and 68 deletions.
2 changes: 1 addition & 1 deletion workflow/rules/tree.smk
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ rule raxmlng:
max_len = int(100*config["LENGTH"]/config["IDENTITY"])
threads: lambda wildcards: int(config['num_threads'])
benchmark:
config["OUT_DIR"]+"/benchmarks/{id}.iqtree.txt"
config["OUT_DIR"]+"/benchmarks/{id}.raxml.txt"
shell:
'''
if [[ `grep -n '>' {input.msa} | wc -l` -gt {params.m} ]] && [[ `awk 'BEGIN{{l=0;n=0;st=0}}{{if (substr($0,1,1) == ">") {{st=1}} else {{st=2}}; if(st==1) {{n+=1}} else if(st==2) {{l+=length($0)}}}} END{{if (n>0) {{print int((l+n-1)/n)}} else {{print 0}} }}' {input.msa}` -lt {params.max_len} ]]
Expand Down
96 changes: 57 additions & 39 deletions workflow/scripts/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,77 +2,95 @@
import matplotlib.pyplot as plt
import glob, os, sys
import pandas as pd
#benchmark file path
path = sys.argv[1]
out_dir = sys.argv[2]
import argparse

# benchmark file path
parser = argparse.ArgumentParser(
description="Process benchmark files and plot results."
)
parser.add_argument("--path", help="Directory containing benchmark files.")
parser.add_argument("--out_dir", help="Output directory for plots and CSV files.")
args = parser.parse_args()

path = args.path
out_dir = args.out_dir

sampling = []
alignment = []
msa = []
tree_build = []
astral = []
step_names = ["sampling","lastz","pasta","iqtree"]
steps = [sampling,alignment,msa,tree_build]
extensions = ["*.sample.txt","*.lastz.txt","*.pasta.txt","*.iqtree.txt"]
step_names = ["sampling", "lastz", "pasta", "raxml"]
steps = [sampling, alignment, msa, tree_build]
extensions = ["*.sample.txt", "*.lastz.txt", "*.pasta.txt", "*.raxml.txt"]

for i in range(len(extensions)):
for filename in glob.glob(os.path.join(path, extensions[i])):
with open(os.path.join(os.getcwd(), filename), "r") as f:
lines = f.readlines()
s = lines[1].strip().split()
seconds = float(s[0])/60
s2 = filename.split('/')
fn = s2[len(s2)-1].replace(extensions[i][1:],'')
cpu_time = float(s[len(s)-1])/60
steps[i].append((fn,seconds,cpu_time))
step_totals= []
cpu_totals= []
seconds = float(s[0]) / 60
s2 = filename.split("/")
fn = s2[len(s2) - 1].replace(extensions[i][1:], "")
cpu_time = float(s[len(s) - 1]) / 60
steps[i].append((fn, seconds, cpu_time))

step_totals = []
cpu_totals = []

for i in range(len(steps)):
sec_total = 0
cpu_total = 0
for j in range(len(steps[i])):
sec_total += steps[i][j][1]
cpu_total += steps[i][j][2]
step_totals.append((step_names[i],sec_total,cpu_total))
step_totals.append((step_names[i], sec_total, cpu_total))

tops = []

for i in range(len(steps)):
if len(steps[i]) > 50:
top = sorted(steps[i],key= lambda x: x[1],reverse=True)[:50]
top = sorted(steps[i], key=lambda x: x[1], reverse=True)[:50]
print(top)
tops.append(top)
else:
tops.append(steps[i])
with open(out_dir+"/step_avg.csv",'w') as w:

with open(out_dir + "/step_avg.csv", "w") as w:
for s in step_totals:
w.write(s[0]+','+str(s[1])+','+str(s[2])+'\n')
#with open("astral.txt",'r') as f:
#lines = f.readlines()
# s = lines[1].strip().split()
#seconds = float(s[0])
# cpu = float(s[len(s)-1])
# step_totals.append(("ASTRAL",seconds,cpu))
print(step_totals)
w.write(s[0] + "," + str(s[1]) + "," + str(s[2]) + "\n")
# with open("astral.txt",'r') as f:
# lines = f.readlines()
# s = lines[1].strip().split()
# seconds = float(s[0])
# cpu = float(s[len(s)-1])
# step_totals.append(("ASTRAL",seconds,cpu))

for i in range(len(tops)):
print(step_names[i])
fig, ax =plt.subplots(figsize=(20,10))
df = pd.DataFrame({
'job' :[x[0] for x in tops[i]],
'minutes' : [x[1] for x in tops[i]],
'cpu-time' : [x[2] for x in tops[i]]
})
fig, ax = plt.subplots(figsize=(20, 10))
df = pd.DataFrame(
{
"job": [x[0] for x in tops[i]],
"minutes": [x[1] for x in tops[i]],
"cpu-time": [x[2] for x in tops[i]],
}
)
print(df.head())
m = df.melt(id_vars='job')
m = df.melt(id_vars="job")
print(m.head())
sns.barplot(data = m, x = 'job',y= 'value', hue='variable',ax=ax)
sns.barplot(data=m, x="job", y="value", hue="variable", ax=ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="right")
ax.set_title(step_names[i])
plt.tight_layout()
fig = ax.get_figure()
fig.savefig(out_dir+'/'+step_names[i]+'.png')
fig, ax =plt.subplots(figsize=(20,10))
print(step_names)
print(step_totals)
ax = sns.barplot(x=[k[0] for k in step_totals],y=[k[1] for k in step_totals])
fig.savefig(out_dir + "/" + step_names[i] + ".png")

fig, ax = plt.subplots(figsize=(20, 10))

ax = sns.barplot(x=[k[0] for k in step_totals], y=[k[1] for k in step_totals])
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="right")
ax.set_title(step_names[i])

plt.tight_layout()
fig = ax.get_figure()
fig.savefig(out_dir+'/step_totals.png')
fig.savefig(out_dir + "/step_totals.png")
67 changes: 49 additions & 18 deletions workflow/scripts/converge.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,7 @@
from reroot import rerootTree
import yaml
from pathlib import Path
import multiprocessing
from multiprocessing import Pool
import time
import subprocess
import math
import csv

Expand All @@ -26,29 +23,34 @@ def comp_tree(t1, t2):
d = t1.compare(t2)
return d["norm_rf"]


# Function to update the configuration file
def update_config(config_path, iteration, base_gene_count):
def update_config(config_path, base_gene_count):
with open(config_path) as file:
config = yaml.load(file, Loader=yaml.FullLoader)

# Update GENE_COUNT based on the iteration number
config['GENE_COUNT'] = base_gene_count * 2
config["GENE_COUNT"] = base_gene_count * 2

# Save the updated configuration
with open(config_path, 'w') as file:
with open(config_path, "w") as file:
yaml.dump(config, file)


# Function to read the initial GENE_COUNT from the config file
def read_initial_gene_count(config_path):
with open(config_path) as file:
config = yaml.load(file, Loader=yaml.FullLoader)
return config['GENE_COUNT']
return config["GENE_COUNT"]


# function to run snakemake with settings and add to run folder
def run_snakemake(cores, mode, out_dir, run, roadies_dir, config_path, fixed_parallel_instances):
def run_snakemake(
cores, mode, out_dir, run, roadies_dir, config_path, fixed_parallel_instances
):

# Set threads per instance dynamically
num_threads = cores/fixed_parallel_instances
num_threads = cores // fixed_parallel_instances

cmd = [
"snakemake",
Expand Down Expand Up @@ -101,7 +103,18 @@ def combine_iter(out_dir, run, cores):


# function for convergence run
def converge_run(iteration, cores, mode, out_dir, ref_exist, roadies_dir, support_thr, config_path, fixed_parallel_instances):
def converge_run(
iteration,
cores,
mode,
out_dir,
ref_exist,
ref,
roadies_dir,
support_thr,
config_path,
fixed_parallel_instances,
):
os.system("rm -r {0}".format(roadies_dir))
os.system("mkdir {0}".format(roadies_dir))
run = "iteration_"
Expand All @@ -112,9 +125,13 @@ def converge_run(iteration, cores, mode, out_dir, ref_exist, roadies_dir, suppor
run += str(iteration)
# run snakemake with specified gene number and length
if iteration >= 2:
base_gene_count = read_initial_gene_count(config_path) # Read initial GENE_COUNT value
update_config(config_path, iteration, base_gene_count)
run_snakemake(cores, mode, out_dir, run, roadies_dir, config_path, fixed_parallel_instances)
base_gene_count = read_initial_gene_count(
config_path
) # Read initial GENE_COUNT value
update_config(config_path, base_gene_count)
run_snakemake(
cores, mode, out_dir, run, roadies_dir, config_path, fixed_parallel_instances
)
# merging gene trees and mapping files
gene_trees = combine_iter(out_dir, run, cores)
t = Tree(out_dir + "/" + run + ".nwk")
Expand Down Expand Up @@ -202,7 +219,16 @@ def converge_run(iteration, cores, mode, out_dir, ref_exist, roadies_dir, suppor
t_out.write("Start time: " + str(start_time_l) + "\n")
while True:
percent_high_support, num_gt, outputtree = converge_run(
iteration, CORES, MODE, out_dir, ref_exist, roadies_dir, support_thr, config_path, fixed_parallel_instances
iteration,
CORES,
MODE,
out_dir,
ref_exist,
ref,
roadies_dir,
support_thr,
config_path,
fixed_parallel_instances,
)
curr_time = time.time()
curr_time_l = time.asctime(time.localtime(time.time()))
Expand Down Expand Up @@ -233,7 +259,12 @@ def converge_run(iteration, cores, mode, out_dir, ref_exist, roadies_dir, suppor
)

iteration += 1
# if iteration == num_itrs:
# break
if ((iteration == 1) and (percent_high_support == 100)) or ((iteration >= 2) and ((percent_high_support - high_support_list[iteration-2] < 1) or (percent_high_support == 100) or (iteration == 9))):
if ((iteration == 1) and (percent_high_support == 100)) or (
(iteration >= 2)
and (
(percent_high_support - high_support_list[iteration - 2] < 1)
or (percent_high_support == 100)
or (iteration == 9)
)
):
break
3 changes: 1 addition & 2 deletions workflow/scripts/lastz2fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
import matplotlib.pyplot as plt
from collections import OrderedDict
from Bio.Seq import Seq
import re

# get arguments
parser = argparse.ArgumentParser(
Expand Down Expand Up @@ -230,7 +229,7 @@
with open(filename, "w") as w:
w.write("")
sorted(gene_dup)
# output duplicity as csv]
# output duplicity as csv
with open(statdir + "/gene_dup.csv", "w") as f:
for i in range(len(gene_dup)):
f.write(str(gene_dup[i][0]) + "," + str(gene_dup[i][1]) + "\n")
Expand Down
31 changes: 23 additions & 8 deletions workflow/scripts/noconverge.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,7 @@
from reroot import rerootTree
import yaml
from pathlib import Path
import multiprocessing
from multiprocessing import Pool
import time
import subprocess
import math


Expand All @@ -29,7 +26,7 @@ def comp_tree(t1, t2):
def run_snakemake(cores, mode, config_path, fixed_parallel_instances):

# Set threads per instance dynamically
num_threads = cores/fixed_parallel_instances
num_threads = cores // fixed_parallel_instances

cmd = [
"snakemake",
Expand All @@ -52,18 +49,27 @@ def run_snakemake(cores, mode, config_path, fixed_parallel_instances):


# function for convergence run
def converge_run(cores, mode, ref_exist, trees, roadies_dir, config_path, fixed_parallel_instances):
def converge_run(
cores,
mode,
ref_exist,
ref,
trees,
roadies_dir,
config_path,
fixed_parallel_instances,
):
# run snakemake with specified gene number and length
run_snakemake(cores, mode, config_path, fixed_parallel_instances)
# merging gene trees and mapping files
os.system(
"ASTER-Linux/bin/astral-pro -t {1} -i {0}/genetrees/gene_tree_merged.nwk -o {0}/roadies.nwk -a {0}/genes/mapping.txt".format(
roadies_dir,cores
roadies_dir, cores
)
)
os.system(
"ASTER-Linux/bin/astral-pro -t {1} -u 3 -i {0}/genetrees/gene_tree_merged.nwk -o {0}/roadies_stats.nwk -a {0}/genes/mapping.txt".format(
roadies_dir,cores
roadies_dir, cores
)
)
gt = open(roadies_dir + "/genetrees/gene_tree_merged.nwk", "r")
Expand Down Expand Up @@ -131,7 +137,16 @@ def converge_run(cores, mode, ref_exist, trees, roadies_dir, config_path, fixed_
time_stamps.append(start_time)
with open(roadies_dir + "/time_stamps.csv", "a") as t_out:
t_out.write("Start time: " + str(start_time_l) + "\n")
num_gt = converge_run(CORES, MODE, ref_exist, trees, roadies_dir, config_path, fixed_parallel_instances)
num_gt = converge_run(
CORES,
MODE,
ref_exist,
ref,
trees,
roadies_dir,
config_path,
fixed_parallel_instances,
)
curr_time = time.time()
curr_time_l = time.asctime(time.localtime(time.time()))
to_previous = curr_time - time_stamps[len(time_stamps) - 1]
Expand Down
4 changes: 4 additions & 0 deletions workflow/scripts/sequence_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
output = sys.argv[2]
plotdir = sys.argv[3]
species_count = {}

with open(output, "wb") as outfile:
for filename in glob.glob(os.path.join(directory, "*.fa")):
if filename == output or "temp" not in filename:
Expand All @@ -25,11 +26,14 @@
species_count[name] = num
with open(filename, "rb") as readfile:
shutil.copyfileobj(readfile, outfile)

x = list(species_count.keys())
y = list(species_count.values())

ax = sns.barplot(x=x, y=y)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="right")
ax.set_title("Number of Reference Genes from Each Genome")

plt.tight_layout()
fig = ax.get_figure()
fig.savefig(plotdir)

0 comments on commit d3465b6

Please sign in to comment.