Skip to content

Commit

Permalink
improve snakemake benchmark summary script
Browse files Browse the repository at this point in the history
  • Loading branch information
LMBradford committed Jun 27, 2024
1 parent 86dd7ef commit 325db8d
Showing 1 changed file with 23 additions and 10 deletions.
33 changes: 23 additions & 10 deletions scripts/summarize_sm_benchmarks.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# This script produces a summary table from benchmark files output by Snakemake
## when running the R-ODAF_Health_Canada pipeline
# Run it from the directory that contains the benchmark files


import pandas as pd
import glob
Expand Down Expand Up @@ -40,10 +40,7 @@

# Loop through each file and read it into a DataFrame
for file_path in file_paths:
# Extract the part of the file name between "benchmark." and ".txt"
info_from_file_name = os.path.basename(file_path).split('benchmark.')[1].replace('.txt', '')

# Read the file into a DataFrame
df = pd.read_csv(file_path, sep='\t')

# Add a new column with information from the file name
Expand All @@ -70,19 +67,35 @@
# Count the entries in the "rule" column for each group
# (count() tallies non-null values only)
result_df['rule_count'] = grouped_df['rule'].count()

# Iterate through the specified columns and calculate mean and total
for column in args.columns:
if column in df.columns:
result_df[f'{column}_mean'] = grouped_df[column].mean()
result_df[f'{column}_total'] = grouped_df[column].sum()
# Summary statistics per benchmark column, computed only for the columns
# that were requested in args.columns.
#   - "s" and "io_*" columns: mean and total
#   - "max_*" columns: max
# The plan is ordered; it determines the column order of the result table.
suffix_to_reducer = {'mean': 'mean', 'total': 'sum', 'max': 'max'}
summary_plan = (
    ('s', ('mean', 'total')),
    ('io_in', ('mean', 'total')),
    ('io_out', ('mean', 'total')),
    ('max_rss', ('max',)),
    ('max_vms', ('max',)),
    ('max_uss', ('max',)),
    ('max_pss', ('max',)),
)

for col_name, suffixes in summary_plan:
    if col_name not in args.columns:
        continue
    grouped_col = grouped_df[col_name]
    for suffix in suffixes:
        # e.g. "s_total" is filled by grouped_df['s'].sum()
        reducer = getattr(grouped_col, suffix_to_reducer[suffix])
        result_df[f'{col_name}_{suffix}'] = reducer()


# Reset the index to make "rule" a regular column
# (so it is still written out below, where index=False drops the index)
result_df.reset_index(inplace=True)

# Add tag column: every row gets the same value, taken from args.tag
result_df['Tag'] = args.tag


# Save the result DataFrame to a tab-separated file at args.output_file
result_df.to_csv(args.output_file, sep='\t', index=False)

Expand Down

0 comments on commit 325db8d

Please sign in to comment.