From 57f0517a6ed07d31b4b7fc89a1caa700170dac00 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 20 Feb 2024 00:03:47 +0000 Subject: [PATCH 1/4] latest converge logic added --- config/config.yaml | 6 ++---- workflow/rules/align.smk | 3 ++- workflow/rules/mashtree.smk | 2 +- workflow/rules/sampling.smk | 2 +- workflow/rules/tree.smk | 2 +- workflow/scripts/converge.py | 30 +++++++++++++++++++++++++----- workflow/scripts/noconverge.py | 7 ++++++- 7 files changed, 38 insertions(+), 14 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 047eb97d..4cdafebb 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -7,7 +7,7 @@ REFERENCE: reference_trees/birds_reftree_363_very_latest.nwk #Length of each of the genes LENGTH: 500 #Number of genes per iteration -GENE_COUNT: 4000 +GENE_COUNT: 250 #Minimum % uppercase for sampling valid genes UPPER_CASE: 0.90 #ROADIES output directory (current iteration output in --converge option) @@ -30,6 +30,4 @@ STEPS: 1 FILTERFRAGMENTS: 0.50 MASKSITES: 0.02 #Threshold for high support (only used in --converge option) -SUPPORT_THRESHOLD: 0.7 -#Number of iterations (only used in --converge option) -ITERATIONS: 3 \ No newline at end of file +SUPPORT_THRESHOLD: 0.95 diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk index 7509de30..4acd0605 100644 --- a/workflow/rules/align.smk +++ b/workflow/rules/align.smk @@ -10,7 +10,7 @@ rule lastz: config["OUT_DIR"]+"/alignments/{sample}.maf" benchmark: config["OUT_DIR"]+"/benchmarks/{sample}.lastz.txt" - threads: 2 + threads: 4 params: species = "{sample}", identity = config['IDENTITY'], @@ -77,6 +77,7 @@ rule pasta: all_matched=true while IFS= read -r line; do + line=$(echo "$line" | tr '[:lower:]' '[:upper:]') if [[ "$line" != ">"* ]]; then if [ -z "$reference" ]; then reference="$line" diff --git a/workflow/rules/mashtree.smk b/workflow/rules/mashtree.smk index de58d4bd..5e208558 100644 --- a/workflow/rules/mashtree.smk +++ b/workflow/rules/mashtree.smk @@ -72,7 +72,7 @@ rule lastz: config["OUT_DIR"]+"/alignments/{sample}.maf" benchmark: config["OUT_DIR"]+"/benchmarks/{sample}.lastz.txt" - threads: 2 + threads: 4 params: species = "{sample}", identity = config['IDENTITY'], diff --git a/workflow/rules/sampling.smk b/workflow/rules/sampling.smk index 1b90c116..849c657e 100644 --- a/workflow/rules/sampling.smk +++ b/workflow/rules/sampling.smk @@ -39,7 +39,7 @@ rule sequence_select: config["OUT_DIR"]+"/benchmarks/{sample}.sample.txt" output: config["OUT_DIR"]+"/samples/{sample}_temp.fa" - threads:32 + threads:workflow.cores shell: ''' echo "We are starting to sample {input}" diff --git a/workflow/rules/tree.smk b/workflow/rules/tree.smk index ffed0ea6..d2ba948e 100644 --- a/workflow/rules/tree.smk +++ b/workflow/rules/tree.smk @@ -14,7 +14,7 @@ rule raxmlng: ''' if [[ `grep -n '>' {input.msa} | wc -l` -gt {params.m} ]] && [[ `awk 'BEGIN{{l=0;n=0;st=0}}{{if (substr($0,1,1) == ">") {{st=1}} else {{st=2}}; if(st==1) {{n+=1}} else if(st==2) {{l+=length($0)}}}} END{{if (n>0) {{print int((l+n-1)/n)}} else {{print 0}} }}' {input.msa}` -lt {params.max_len} ]] then - raxml-ng --msa {input.msa} --model GTR+G+F --redo --threads auto{{4}} + raxml-ng --msa {input.msa} --model GTR+G+F --redo --threads auto{{4}} --blopt nr_safe else touch {output.gene_tree} fi diff --git a/workflow/scripts/converge.py b/workflow/scripts/converge.py index c953afd3..d22b094c 100644 --- a/workflow/scripts/converge.py +++ b/workflow/scripts/converge.py @@ -26,12 +26,29 @@ def comp_tree(t1, t2): d = t1.compare(t2) return d["norm_rf"] +# Function to update the configuration file +def update_config(config_path, iteration, base_gene_count): + with open(config_path) as file: + config = yaml.load(file, Loader=yaml.FullLoader) + + # Update GENE_COUNT based on the iteration number + config['GENE_COUNT'] = base_gene_count * 2 + + # Save the updated configuration + with open(config_path, 'w') as file: + yaml.dump(config, file) + +# Function to read the initial GENE_COUNT from the config file +def read_initial_gene_count(config_path): + with open(config_path) as file: + config = yaml.load(file, Loader=yaml.FullLoader) + return config['GENE_COUNT'] # function to run snakemake with settings and add to run folder def run_snakemake(cores, mode, out_dir, run, roadies_dir, config_path): cmd = [ "snakemake", - "--core", + "--cores", str(cores), "--jobs", str(cores), @@ -91,6 +108,9 @@ def converge_run(iteration, cores, mode, out_dir, ref_exist, roadies_dir, suppor else: run += str(iteration) # run snakemake with specificed gene number and length + if iteration >= 2: + base_gene_count = read_initial_gene_count(config_path) # Read initial GENE_COUNT value + update_config(config_path, iteration, base_gene_count) run_snakemake(cores, mode, out_dir, run, roadies_dir, config_path) # merging gene trees and mapping files gene_trees = combine_iter(out_dir, run) @@ -152,7 +172,6 @@ def converge_run(iteration, cores, mode, out_dir, ref_exist, roadies_dir, suppor ref = Tree(config["REFERENCE"]) genomes = config["GENOMES"] out_dir = config["ALL_OUT_DIR"] - num_itrs = config["ITERATIONS"] NUM_GENOMES = len(os.listdir(genomes)) NUM_GENES = config["GENE_COUNT"] LENGTH = config["LENGTH"] @@ -185,6 +204,7 @@ def converge_run(iteration, cores, mode, out_dir, ref_exist, roadies_dir, suppor curr_time_l = time.asctime(time.localtime(time.time())) to_previous = curr_time - time_stamps[len(time_stamps) - 1] time_stamps.append(curr_time) + high_support_list.append(percent_high_support) elapsed_time = curr_time - start_time with open(out_dir + "/time_stamps.csv", "a") as t_out: t_out.write( @@ -208,8 +228,8 @@ def converge_run(iteration, cores, mode, out_dir, ref_exist, roadies_dir, suppor str(iteration) + "," + str(num_gt) + "," + str(ref_dist) + "\n" ) - high_support_list.append(percent_high_support) - iteration += 1 - if iteration == num_itrs: + # if iteration == num_itrs: + # break + if ((iteration == 1) and (percent_high_support == 100)) or ((iteration >= 2) and ((percent_high_support - high_support_list[iteration-2] < 1) or (percent_high_support == 100) or (iteration == 9))): break diff --git a/workflow/scripts/noconverge.py b/workflow/scripts/noconverge.py index 11a5a734..8695c7a4 100644 --- a/workflow/scripts/noconverge.py +++ b/workflow/scripts/noconverge.py @@ -29,7 +29,7 @@ def comp_tree(t1, t2): def run_snakemake(cores, mode, config_path): cmd = [ "snakemake", - "--core", + "--cores", str(cores), "--jobs", str(cores), @@ -58,6 +58,11 @@ def converge_run(cores, mode, ref_exist, trees, roadies_dir, config_path): roadies_dir ) ) + os.system( + "ASTER-Linux/bin/astral-pro -t 16 -u 3 -i {0}/genetrees/gene_tree_merged.nwk -o {0}/roadies_stats.nwk -a {0}/genes/mapping.txt".format( + roadies_dir + ) + ) gt = open(roadies_dir + "/genetrees/gene_tree_merged.nwk", "r") gene_trees = gt.readlines() gt.close() From 57be3154309fa41896968db13dac9b63617beffd Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 25 Feb 2024 23:53:56 +0000 Subject: [PATCH 2/4] fixed snakemake multithreading --- config/config.yaml | 6 ++++-- workflow/rules/align.smk | 6 +++--- workflow/rules/fasttree.smk | 1 - workflow/rules/mashtree.smk | 7 +++---- workflow/rules/sampling.smk | 1 - workflow/rules/tree.smk | 4 ++-- workflow/scripts/converge.py | 28 ++++++++++++++++------------ workflow/scripts/noconverge.py | 24 ++++++++++++++---------- 8 files changed, 42 insertions(+), 35 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 4cdafebb..26e3ce34 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -3,11 +3,11 @@ #Path for input genomes GENOMES: "/datasets/gzipped_birds_new" #Reference tree (optional, default set as null) -REFERENCE: reference_trees/birds_reftree_363_very_latest.nwk +REFERENCE: NULL #Length of each of the genes LENGTH: 500 #Number of genes per iteration -GENE_COUNT: 250 +GENE_COUNT: 4000 #Minimum % uppercase for sampling valid genes UPPER_CASE: 0.90 #ROADIES output directory (current iteration output in --converge option) @@ -31,3 +31,5 @@ FILTERFRAGMENTS: 0.50 MASKSITES: 0.02 #Threshold for high support (only used in --converge option) SUPPORT_THRESHOLD: 0.95 +#Number of instances to run in parallel +NUM_INSTANCES: 4 diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk index 4acd0605..27302221 100644 --- a/workflow/rules/align.smk +++ b/workflow/rules/align.smk @@ -10,7 +10,7 @@ rule lastz: config["OUT_DIR"]+"/alignments/{sample}.maf" benchmark: config["OUT_DIR"]+"/benchmarks/{sample}.lastz.txt" - threads: 4 + threads: lambda wildcards: int(config['num_threads']) params: species = "{sample}", identity = config['IDENTITY'], @@ -64,7 +64,7 @@ rule pasta: suffix = "fa.aln" benchmark: config["OUT_DIR"]+"/benchmarks/{id}.pasta.txt" - threads: 4 + threads: lambda wildcards: int(config['num_threads']) conda: "../envs/msa.yaml" shell: @@ -91,7 +91,7 @@ rule pasta: if [ "$all_matched" = true ]; then cp "$input_file" "$output_file" else - python pasta/run_pasta.py -i {input} -j {params.prefix} --alignment-suffix={params.suffix} --num-cpus 4 + python pasta/run_pasta.py -i {input} -j {params.prefix} --alignment-suffix={params.suffix} --num-cpus {threads} fi fi touch {output} diff --git a/workflow/rules/fasttree.smk b/workflow/rules/fasttree.smk index 822a41b6..83e5e2b0 100644 --- a/workflow/rules/fasttree.smk +++ b/workflow/rules/fasttree.smk @@ -7,7 +7,6 @@ rule fasttree: params: m = config["MIN_ALIGN"], max_len = int(100*config["LENGTH"]/config["IDENTITY"]) - threads: 1 benchmark: config["OUT_DIR"]+"/benchmarks/{id}.fasttree.txt" shell: diff --git a/workflow/rules/mashtree.smk b/workflow/rules/mashtree.smk index 5e208558..b02a8a05 100644 --- a/workflow/rules/mashtree.smk +++ b/workflow/rules/mashtree.smk @@ -41,7 +41,6 @@ rule sequence_select: config["OUT_DIR"]+"/benchmarks/{sample}.sample.txt" output: config["OUT_DIR"]+"/samples/{sample}_temp.fa" - threads:workflow.cores shell: ''' echo "We are starting to sample {input}" @@ -72,7 +71,7 @@ rule lastz: config["OUT_DIR"]+"/alignments/{sample}.maf" benchmark: config["OUT_DIR"]+"/benchmarks/{sample}.lastz.txt" - threads: 4 + threads: lambda wildcards: int(config['num_threads']) params: species = "{sample}", identity = config['IDENTITY'], @@ -127,12 +126,12 @@ rule mashtree: gene_dir = config["OUT_DIR"] + "/genes/gene_{id}" benchmark: config["OUT_DIR"]+"/benchmarks/{id}.mashtree.txt" - threads: 8 + threads: lambda wildcards: int(config['num_threads']) shell: ''' if [[ `grep -n '>' {input} | wc -l` -gt {params.m} ]] || [[ `awk 'BEGIN{{l=0;n=0;st=0}}{{if (substr($0,1,1) == ">") {{st=1}} else {{st=2}}; if(st==1) {{n+=1}} else if(st==2) {{l+=length($0)}}}} END{{if (n>0) {{print int((l+n-1)/n)}} else {{print 0}} }}' {input}` -gt {params.max_len} ]] then - mashtree --mindepth 0 --numcpus 8 --kmerlength 10 {params.gene_dir}/*.fa > {output} + mashtree --mindepth 0 --numcpus {threads} --kmerlength 10 {params.gene_dir}/*.fa > {output} fi touch {output} ''' diff --git a/workflow/rules/sampling.smk b/workflow/rules/sampling.smk index 849c657e..a5cca5f3 100644 --- a/workflow/rules/sampling.smk +++ b/workflow/rules/sampling.smk @@ -39,7 +39,6 @@ rule sequence_select: config["OUT_DIR"]+"/benchmarks/{sample}.sample.txt" output: config["OUT_DIR"]+"/samples/{sample}_temp.fa" - threads:workflow.cores shell: ''' echo "We are starting to sample {input}" diff --git a/workflow/rules/tree.smk b/workflow/rules/tree.smk index d2ba948e..06ba7614 100644 --- a/workflow/rules/tree.smk +++ b/workflow/rules/tree.smk @@ -7,14 +7,14 @@ rule raxmlng: params: m = config["MIN_ALIGN"], max_len = int(100*config["LENGTH"]/config["IDENTITY"]) - threads: 4 + threads: lambda wildcards: int(config['num_threads']) benchmark: config["OUT_DIR"]+"/benchmarks/{id}.iqtree.txt" shell: ''' if [[ `grep -n '>' {input.msa} | wc -l` -gt {params.m} ]] && [[ `awk 'BEGIN{{l=0;n=0;st=0}}{{if (substr($0,1,1) == ">") {{st=1}} else {{st=2}}; if(st==1) {{n+=1}} else if(st==2) {{l+=length($0)}}}} END{{if (n>0) {{print int((l+n-1)/n)}} else {{print 0}} }}' {input.msa}` -lt {params.max_len} ]] then - raxml-ng --msa {input.msa} --model GTR+G+F --redo --threads auto{{4}} --blopt nr_safe + raxml-ng --msa {input.msa} --model GTR+G+F --redo --threads auto{{threads}} --blopt nr_safe else touch {output.gene_tree} fi diff --git a/workflow/scripts/converge.py b/workflow/scripts/converge.py index d22b094c..3878b898 100644 --- a/workflow/scripts/converge.py +++ b/workflow/scripts/converge.py @@ -45,16 +45,19 @@ def read_initial_gene_count(config_path): return config['GENE_COUNT'] # function to run snakemake with settings and add to run folder -def run_snakemake(cores, mode, out_dir, run, roadies_dir, config_path): +def run_snakemake(cores, mode, out_dir, run, roadies_dir, config_path, fixed_parallel_instances): + + # Set threads per instance dynamically + num_threads = cores/fixed_parallel_instances + cmd = [ "snakemake", "--cores", str(cores), - "--jobs", - str(cores), "--config", "mode=" + str(mode), "config_path=" + str(config_path), + "num_threads=" + str(num_threads), "--use-conda", "--rerun-incomplete", ] @@ -71,7 +74,7 @@ def run_snakemake(cores, mode, out_dir, run, roadies_dir, config_path): # function to combine gene trees and mapping files from all iterations -def combine_iter(out_dir, run): +def combine_iter(out_dir, run, cores): os.system( "cat {0}/{1}/gene_tree_merged.nwk >> {0}/master_gt.nwk".format(out_dir, run) ) @@ -81,13 +84,13 @@ def combine_iter(out_dir, run): # open both files and get lines, each line is a separate gene tree os.system( - "ASTER-Linux/bin/astral-pro -t 16 -i {0}/master_gt.nwk -o {0}/{1}.nwk -a {0}/master_map.txt".format( - out_dir, run + "ASTER-Linux/bin/astral-pro -t {2} -i {0}/master_gt.nwk -o {0}/{1}.nwk -a {0}/master_map.txt".format( + out_dir, run, cores ) ) os.system( - "ASTER-Linux/bin/astral-pro -t 16 -u 3 -i {0}/master_gt.nwk -o {0}/{1}_stats.nwk -a {0}/master_map.txt".format( - out_dir, run + "ASTER-Linux/bin/astral-pro -t {2} -u 3 -i {0}/master_gt.nwk -o {0}/{1}_stats.nwk -a {0}/master_map.txt".format( + out_dir, run, cores ) ) # open both master files and get gene trees and mapping @@ -98,7 +101,7 @@ def combine_iter(out_dir, run): # function for convergence run -def converge_run(iteration, cores, mode, out_dir, ref_exist, roadies_dir, support_thr, config_path): +def converge_run(iteration, cores, mode, out_dir, ref_exist, roadies_dir, support_thr, config_path, fixed_parallel_instances): os.system("rm -r {0}".format(roadies_dir)) os.system("mkdir {0}".format(roadies_dir)) run = "iteration_" @@ -111,9 +114,9 @@ def converge_run(iteration, cores, mode, out_dir, ref_exist, roadies_dir, suppor if iteration >= 2: base_gene_count = read_initial_gene_count(config_path) # Read initial GENE_COUNT value update_config(config_path, iteration, base_gene_count) - run_snakemake(cores, mode, out_dir, run, roadies_dir, config_path) + run_snakemake(cores, mode, out_dir, run, roadies_dir, config_path, fixed_parallel_instances) # merging gene trees and mapping files - gene_trees = combine_iter(out_dir, run) + gene_trees = combine_iter(out_dir, run, cores) t = Tree(out_dir + "/" + run + ".nwk") # add species tree to tree list if ref_exist: @@ -177,6 +180,7 @@ def converge_run(iteration, cores, mode, out_dir, ref_exist, roadies_dir, suppor LENGTH = config["LENGTH"] support_thr = config["SUPPORT_THRESHOLD"] roadies_dir = config["OUT_DIR"] + fixed_parallel_instances = config["NUM_INSTANCES"] master_gt = out_dir + "/master_gt.nwk" master_map = out_dir + "/master_map.txt" os.system("rm -r {0}".format(out_dir)) @@ -198,7 +202,7 @@ def converge_run(iteration, cores, mode, out_dir, ref_exist, roadies_dir, suppor t_out.write("Start time: " + str(start_time_l) + "\n") while True: percent_high_support, num_gt, outputtree = converge_run( - iteration, CORES, MODE, out_dir, ref_exist, roadies_dir, support_thr, config_path + iteration, CORES, MODE, out_dir, ref_exist, roadies_dir, support_thr, config_path, fixed_parallel_instances ) curr_time = time.time() curr_time_l = time.asctime(time.localtime(time.time())) diff --git a/workflow/scripts/noconverge.py b/workflow/scripts/noconverge.py index 8695c7a4..e98fd6ff 100644 --- a/workflow/scripts/noconverge.py +++ b/workflow/scripts/noconverge.py @@ -26,16 +26,19 @@ def comp_tree(t1, t2): # function to run snakemake with settings and add to run folder -def run_snakemake(cores, mode, config_path): +def run_snakemake(cores, mode, config_path, fixed_parallel_instances): + + # Set threads per instance dynamically + num_threads = cores/fixed_parallel_instances + cmd = [ "snakemake", "--cores", str(cores), - "--jobs", - str(cores), "--config", "mode=" + str(mode), "config_path=" + str(config_path), + "num_threads=" + str(num_threads), "--use-conda", "--rerun-incomplete", ] @@ -49,18 +52,18 @@ def run_snakemake(cores, mode, config_path): # function for convergence run -def converge_run(cores, mode, ref_exist, trees, roadies_dir, config_path): +def converge_run(cores, mode, ref_exist, trees, roadies_dir, config_path, fixed_parallel_instances): # run snakemake with specificed gene number and length - run_snakemake(cores, mode, config_path) + run_snakemake(cores, mode, config_path, fixed_parallel_instances) # merging gene trees and mapping files os.system( - "ASTER-Linux/bin/astral-pro -t 16 -i {0}/genetrees/gene_tree_merged.nwk -o {0}/roadies.nwk -a {0}/genes/mapping.txt".format( - roadies_dir + "ASTER-Linux/bin/astral-pro -t {1} -i {0}/genetrees/gene_tree_merged.nwk -o {0}/roadies.nwk -a {0}/genes/mapping.txt".format( + roadies_dir,cores ) ) os.system( - "ASTER-Linux/bin/astral-pro -t 16 -u 3 -i {0}/genetrees/gene_tree_merged.nwk -o {0}/roadies_stats.nwk -a {0}/genes/mapping.txt".format( - roadies_dir + "ASTER-Linux/bin/astral-pro -t {1} -u 3 -i {0}/genetrees/gene_tree_merged.nwk -o {0}/roadies_stats.nwk -a {0}/genes/mapping.txt".format( + roadies_dir,cores ) ) gt = open(roadies_dir + "/genetrees/gene_tree_merged.nwk", "r") @@ -112,6 +115,7 @@ def converge_run(cores, mode, ref_exist, trees, roadies_dir, config_path): NUM_GENES = config["GENE_COUNT"] LENGTH = config["LENGTH"] roadies_dir = config["OUT_DIR"] + fixed_parallel_instances = config["NUM_INSTANCES"] os.system("rm -r {0}".format(roadies_dir)) os.system("mkdir {0}".format(roadies_dir)) sys.setrecursionlimit(2000) @@ -127,7 +131,7 @@ def converge_run(cores, mode, ref_exist, trees, roadies_dir, config_path): time_stamps.append(start_time) with open(roadies_dir + "/time_stamps.csv", "a") as t_out: t_out.write("Start time: " + str(start_time_l) + "\n") - num_gt = converge_run(CORES, MODE, ref_exist, trees, roadies_dir, config_path) + num_gt = converge_run(CORES, MODE, ref_exist, trees, roadies_dir, config_path, fixed_parallel_instances) curr_time = time.time() curr_time_l = time.asctime(time.localtime(time.time())) to_previous = curr_time - time_stamps[len(time_stamps) - 1] From 259f62891d2ad0e1a0d5c04f8179f2fbc63f3ee3 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 25 Feb 2024 23:57:02 +0000 Subject: [PATCH 3/4] sampling rule threads fixed --- workflow/rules/mashtree.smk | 1 + workflow/rules/sampling.smk | 1 + 2 files changed, 2 insertions(+) diff --git a/workflow/rules/mashtree.smk b/workflow/rules/mashtree.smk index b02a8a05..2e1e9ef6 100644 --- a/workflow/rules/mashtree.smk +++ b/workflow/rules/mashtree.smk @@ -39,6 +39,7 @@ rule sequence_select: THRES=config["UPPER_CASE"] benchmark: config["OUT_DIR"]+"/benchmarks/{sample}.sample.txt" + threads: lambda wildcards: int(config['num_threads']) output: config["OUT_DIR"]+"/samples/{sample}_temp.fa" shell: diff --git a/workflow/rules/sampling.smk b/workflow/rules/sampling.smk index a5cca5f3..90f100e0 100644 --- a/workflow/rules/sampling.smk +++ b/workflow/rules/sampling.smk @@ -37,6 +37,7 @@ rule sequence_select: THRES=config["UPPER_CASE"] benchmark: config["OUT_DIR"]+"/benchmarks/{sample}.sample.txt" + threads: lambda wildcards: int(config['num_threads']) output: config["OUT_DIR"]+"/samples/{sample}_temp.fa" shell: From d3465b69e60b38ade4c879a8c875deded00f6818 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 26 Feb 2024 01:06:36 +0000 Subject: [PATCH 4/4] formatting fixes --- workflow/rules/tree.smk | 2 +- workflow/scripts/benchmark.py | 96 ++++++++++++++++++------------ workflow/scripts/converge.py | 67 +++++++++++++++------ workflow/scripts/lastz2fasta.py | 3 +- workflow/scripts/noconverge.py | 31 +++++++--- workflow/scripts/sequence_merge.py | 4 ++ 6 files changed, 135 insertions(+), 68 deletions(-) diff --git a/workflow/rules/tree.smk b/workflow/rules/tree.smk index 06ba7614..6058b8ad 100644 --- a/workflow/rules/tree.smk +++ b/workflow/rules/tree.smk @@ -9,7 +9,7 @@ rule raxmlng: max_len = int(100*config["LENGTH"]/config["IDENTITY"]) threads: lambda wildcards: int(config['num_threads']) benchmark: - config["OUT_DIR"]+"/benchmarks/{id}.iqtree.txt" + config["OUT_DIR"]+"/benchmarks/{id}.raxml.txt" shell: ''' if [[ `grep -n '>' {input.msa} | wc -l` -gt {params.m} ]] && [[ `awk 'BEGIN{{l=0;n=0;st=0}}{{if (substr($0,1,1) == ">") {{st=1}} else {{st=2}}; if(st==1) {{n+=1}} else if(st==2) {{l+=length($0)}}}} END{{if (n>0) {{print int((l+n-1)/n)}} else {{print 0}} }}' {input.msa}` -lt {params.max_len} ]] diff --git a/workflow/scripts/benchmark.py b/workflow/scripts/benchmark.py index 36712d31..a9f71ef1 100644 --- a/workflow/scripts/benchmark.py +++ b/workflow/scripts/benchmark.py @@ -2,77 +2,95 @@ import matplotlib.pyplot as plt import glob, os, sys import pandas as pd -#benchmark file path -path = sys.argv[1] -out_dir = sys.argv[2] +import argparse + +# benchmark file path +parser = argparse.ArgumentParser( + description="Process benchmark files and plot results." +) +parser.add_argument("--path", help="Directory containing benchmark files.") +parser.add_argument("--out_dir", help="Output directory for plots and CSV files.") +args = parser.parse_args() + +path = args.path +out_dir = args.out_dir + sampling = [] alignment = [] msa = [] tree_build = [] -astral = [] -step_names = ["sampling","lastz","pasta","iqtree"] -steps = [sampling,alignment,msa,tree_build] -extensions = ["*.sample.txt","*.lastz.txt","*.pasta.txt","*.iqtree.txt"] +step_names = ["sampling", "lastz", "pasta", "raxml"] +steps = [sampling, alignment, msa, tree_build] +extensions = ["*.sample.txt", "*.lastz.txt", "*.pasta.txt", "*.raxml.txt"] + for i in range(len(extensions)): for filename in glob.glob(os.path.join(path, extensions[i])): with open(os.path.join(os.getcwd(), filename), "r") as f: lines = f.readlines() s = lines[1].strip().split() - seconds = float(s[0])/60 - s2 = filename.split('/') - fn = s2[len(s2)-1].replace(extensions[i][1:],'') - cpu_time = float(s[len(s)-1])/60 - steps[i].append((fn,seconds,cpu_time)) -step_totals= [] -cpu_totals= [] + seconds = float(s[0]) / 60 + s2 = filename.split("/") + fn = s2[len(s2) - 1].replace(extensions[i][1:], "") + cpu_time = float(s[len(s) - 1]) / 60 + steps[i].append((fn, seconds, cpu_time)) + +step_totals = [] +cpu_totals = [] + for i in range(len(steps)): sec_total = 0 cpu_total = 0 for j in range(len(steps[i])): sec_total += steps[i][j][1] cpu_total += steps[i][j][2] - step_totals.append((step_names[i],sec_total,cpu_total)) + step_totals.append((step_names[i], sec_total, cpu_total)) + tops = [] + for i in range(len(steps)): if len(steps[i]) > 50: - top = sorted(steps[i],key= lambda x: x[1],reverse=True)[:50] + top = sorted(steps[i], key=lambda x: x[1], reverse=True)[:50] print(top) tops.append(top) else: tops.append(steps[i]) -with open(out_dir+"/step_avg.csv",'w') as w: + +with open(out_dir + "/step_avg.csv", "w") as w: for s in step_totals: - w.write(s[0]+','+str(s[1])+','+str(s[2])+'\n') -#with open("astral.txt",'r') as f: - #lines = f.readlines() - # s = lines[1].strip().split() - #seconds = float(s[0]) - # cpu = float(s[len(s)-1]) - # step_totals.append(("ASTRAL",seconds,cpu)) -print(step_totals) + w.write(s[0] + "," + str(s[1]) + "," + str(s[2]) + "\n") +# with open("astral.txt",'r') as f: +# lines = f.readlines() +# s = lines[1].strip().split() +# seconds = float(s[0]) +# cpu = float(s[len(s)-1]) +# step_totals.append(("ASTRAL",seconds,cpu)) + for i in range(len(tops)): print(step_names[i]) - fig, ax =plt.subplots(figsize=(20,10)) - df = pd.DataFrame({ - 'job' :[x[0] for x in tops[i]], - 'minutes' : [x[1] for x in tops[i]], - 'cpu-time' : [x[2] for x in tops[i]] - }) + fig, ax = plt.subplots(figsize=(20, 10)) + df = pd.DataFrame( + { + "job": [x[0] for x in tops[i]], + "minutes": [x[1] for x in tops[i]], + "cpu-time": [x[2] for x in tops[i]], + } + ) print(df.head()) - m = df.melt(id_vars='job') + m = df.melt(id_vars="job") print(m.head()) - sns.barplot(data = m, x = 'job',y= 'value', hue='variable',ax=ax) + sns.barplot(data=m, x="job", y="value", hue="variable", ax=ax) ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="right") ax.set_title(step_names[i]) plt.tight_layout() fig = ax.get_figure() - fig.savefig(out_dir+'/'+step_names[i]+'.png') -fig, ax =plt.subplots(figsize=(20,10)) -print(step_names) -print(step_totals) -ax = sns.barplot(x=[k[0] for k in step_totals],y=[k[1] for k in step_totals]) + fig.savefig(out_dir + "/" + step_names[i] + ".png") + +fig, ax = plt.subplots(figsize=(20, 10)) + +ax = sns.barplot(x=[k[0] for k in step_totals], y=[k[1] for k in step_totals]) ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="right") ax.set_title(step_names[i]) + plt.tight_layout() fig = ax.get_figure() -fig.savefig(out_dir+'/step_totals.png') +fig.savefig(out_dir + "/step_totals.png") diff --git a/workflow/scripts/converge.py b/workflow/scripts/converge.py index 3878b898..09a55827 100644 --- a/workflow/scripts/converge.py +++ b/workflow/scripts/converge.py @@ -13,10 +13,7 @@ from reroot import rerootTree import yaml from pathlib import Path -import multiprocessing -from multiprocessing import Pool import time -import subprocess import math import csv @@ -26,29 +23,34 @@ def comp_tree(t1, t2): d = t1.compare(t2) return d["norm_rf"] + # Function to update the configuration file -def update_config(config_path, iteration, base_gene_count): +def update_config(config_path, base_gene_count): with open(config_path) as file: config = yaml.load(file, Loader=yaml.FullLoader) - + # Update GENE_COUNT based on the iteration number - config['GENE_COUNT'] = base_gene_count * 2 + config["GENE_COUNT"] = base_gene_count * 2 # Save the updated configuration - with open(config_path, 'w') as file: + with open(config_path, "w") as file: yaml.dump(config, file) + # Function to read the initial GENE_COUNT from the config file def read_initial_gene_count(config_path): with open(config_path) as file: config = yaml.load(file, Loader=yaml.FullLoader) - return config['GENE_COUNT'] + return config["GENE_COUNT"] + # function to run snakemake with settings and add to run folder -def run_snakemake(cores, mode, out_dir, run, roadies_dir, config_path, fixed_parallel_instances): +def run_snakemake( + cores, mode, out_dir, run, roadies_dir, config_path, fixed_parallel_instances +): # Set threads per instance dynamically - num_threads = cores/fixed_parallel_instances + num_threads = cores // fixed_parallel_instances cmd = [ "snakemake", @@ -101,7 +103,18 @@ def combine_iter(out_dir, run, cores): # function for convergence run -def converge_run(iteration, cores, mode, out_dir, ref_exist, roadies_dir, support_thr, config_path, fixed_parallel_instances): +def converge_run( + iteration, + cores, + mode, + out_dir, + ref_exist, + ref, + roadies_dir, + support_thr, + config_path, + fixed_parallel_instances, +): os.system("rm -r {0}".format(roadies_dir)) os.system("mkdir {0}".format(roadies_dir)) run = "iteration_" @@ -112,9 +125,13 @@ def converge_run(iteration, cores, mode, out_dir, ref_exist, roadies_dir, suppor run += str(iteration) # run snakemake with specificed gene number and length if iteration >= 2: - base_gene_count = read_initial_gene_count(config_path) # Read initial GENE_COUNT value - update_config(config_path, iteration, base_gene_count) - run_snakemake(cores, mode, out_dir, run, roadies_dir, config_path, fixed_parallel_instances) + base_gene_count = read_initial_gene_count( + config_path + ) # Read initial GENE_COUNT value + update_config(config_path, base_gene_count) + run_snakemake( + cores, mode, out_dir, run, roadies_dir, config_path, fixed_parallel_instances + ) # merging gene trees and mapping files gene_trees = combine_iter(out_dir, run, cores) t = Tree(out_dir + "/" + run + ".nwk") @@ -202,7 +219,16 @@ def converge_run(iteration, cores, mode, out_dir, ref_exist, roadies_dir, suppor t_out.write("Start time: " + str(start_time_l) + "\n") while True: percent_high_support, num_gt, outputtree = converge_run( - iteration, CORES, MODE, out_dir, ref_exist, roadies_dir, support_thr, config_path, fixed_parallel_instances + iteration, + CORES, + MODE, + out_dir, + ref_exist, + ref, + roadies_dir, + support_thr, + config_path, + fixed_parallel_instances, ) curr_time = time.time() curr_time_l = time.asctime(time.localtime(time.time())) @@ -233,7 +259,12 @@ def converge_run(iteration, cores, mode, out_dir, ref_exist, roadies_dir, suppor ) iteration += 1 - # if iteration == num_itrs: - # break - if ((iteration == 1) and (percent_high_support == 100)) or ((iteration >= 2) and ((percent_high_support - high_support_list[iteration-2] < 1) or (percent_high_support == 100) or (iteration == 9))): + if ((iteration == 1) and (percent_high_support == 100)) or ( + (iteration >= 2) + and ( + (percent_high_support - high_support_list[iteration - 2] < 1) + or (percent_high_support == 100) + or (iteration == 9) + ) + ): break diff --git a/workflow/scripts/lastz2fasta.py b/workflow/scripts/lastz2fasta.py index e18a1e37..b572632b 100644 --- a/workflow/scripts/lastz2fasta.py +++ b/workflow/scripts/lastz2fasta.py @@ -12,7 +12,6 @@ import matplotlib.pyplot as plt from collections import OrderedDict from Bio.Seq import Seq -import re # get arguments parser = argparse.ArgumentParser( @@ -230,7 +229,7 @@ with open(filename, "w") as w: w.write("") sorted(gene_dup) -# output duplicity as csv] +# output duplicity as csv with open(statdir + "/gene_dup.csv", "w") as f: for i in range(len(gene_dup)): f.write(str(gene_dup[i][0]) + "," + str(gene_dup[i][1]) + "\n") diff --git a/workflow/scripts/noconverge.py b/workflow/scripts/noconverge.py index e98fd6ff..e97a725a 100644 --- a/workflow/scripts/noconverge.py +++ b/workflow/scripts/noconverge.py @@ -12,10 +12,7 @@ from reroot import rerootTree import yaml from pathlib import Path -import multiprocessing -from multiprocessing import Pool import time -import subprocess import math @@ -29,7 +26,7 @@ def comp_tree(t1, t2): def run_snakemake(cores, mode, config_path, fixed_parallel_instances): # Set threads per instance dynamically - num_threads = cores/fixed_parallel_instances + num_threads = cores // fixed_parallel_instances cmd = [ "snakemake", @@ -52,18 +49,27 @@ def run_snakemake(cores, mode, config_path, fixed_parallel_instances): # function for convergence run -def converge_run(cores, mode, ref_exist, trees, roadies_dir, config_path, fixed_parallel_instances): +def converge_run( + cores, + mode, + ref_exist, + ref, + trees, + roadies_dir, + config_path, + fixed_parallel_instances, +): # run snakemake with specificed gene number and length run_snakemake(cores, mode, config_path, fixed_parallel_instances) # merging gene trees and mapping files os.system( "ASTER-Linux/bin/astral-pro -t {1} -i {0}/genetrees/gene_tree_merged.nwk -o {0}/roadies.nwk -a {0}/genes/mapping.txt".format( - roadies_dir,cores + roadies_dir, cores ) ) os.system( "ASTER-Linux/bin/astral-pro -t {1} -u 3 -i {0}/genetrees/gene_tree_merged.nwk -o {0}/roadies_stats.nwk -a {0}/genes/mapping.txt".format( - roadies_dir,cores + roadies_dir, cores ) ) gt = open(roadies_dir + "/genetrees/gene_tree_merged.nwk", "r") @@ -131,7 +137,16 @@ def converge_run(cores, mode, ref_exist, trees, roadies_dir, config_path, fixed_ time_stamps.append(start_time) with open(roadies_dir + "/time_stamps.csv", "a") as t_out: t_out.write("Start time: " + str(start_time_l) + "\n") - num_gt = converge_run(CORES, MODE, ref_exist, trees, roadies_dir, config_path, fixed_parallel_instances) + num_gt = converge_run( + CORES, + MODE, + ref_exist, + ref, + trees, + roadies_dir, + config_path, + fixed_parallel_instances, + ) curr_time = time.time() curr_time_l = time.asctime(time.localtime(time.time())) to_previous = curr_time - time_stamps[len(time_stamps) - 1] diff --git a/workflow/scripts/sequence_merge.py b/workflow/scripts/sequence_merge.py index 0bd47643..7611d8cd 100644 --- a/workflow/scripts/sequence_merge.py +++ b/workflow/scripts/sequence_merge.py @@ -10,6 +10,7 @@ output = sys.argv[2] plotdir = sys.argv[3] species_count = {} + with open(output, "wb") as outfile: for filename in glob.glob(os.path.join(directory, "*.fa")): if filename == output or "temp" not in filename: @@ -25,11 +26,14 @@ species_count[name] = num with open(filename, "rb") as readfile: shutil.copyfileobj(readfile, outfile) + x = list(species_count.keys()) y = list(species_count.values()) + ax = sns.barplot(x=x, y=y) ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="right") ax.set_title("Number of Reference Genes from Each Genome") + plt.tight_layout() fig = ax.get_figure() fig.savefig(plotdir)