From 815c7bcc1332af0877facfbf61b707aaa4d2b06c Mon Sep 17 00:00:00 2001
From: AhmedBasem
Date: Wed, 24 Jul 2024 16:42:52 +0300
Subject: [PATCH 1/6] Optimize data handling: Implement rounding and gzip compression for CSV output

- Implement rounding to four decimal places for specific numerical columns.
- Apply gzip compression to CSV outputs to reduce file size and bandwidth usage during data transfers.
---
 .github/workflows/website.yml                      |  5 ++-
 .../unit_tests/reduce_output_size.py               | 18 ++++++++++
 website/dashboard/index.html                       |  1 +
 website/dashboard/index.js                         | 34 +++++++++++++------
 4 files changed, 47 insertions(+), 11 deletions(-)
 create mode 100644 tests/IVIMmodels/unit_tests/reduce_output_size.py

diff --git a/.github/workflows/website.yml b/.github/workflows/website.yml
index ac8e4f3..93d25fe 100644
--- a/.github/workflows/website.yml
+++ b/.github/workflows/website.yml
@@ -46,9 +46,12 @@ jobs:
       with:
         name: 'Data'
 
+    - name: 'Filter and compress results file.'
+      run: python tests/IVIMmodels/unit_tests/reduce_output_size.py
+
     - name: move data to the dashboard folder
       run: |
-        mv test_output.csv website/dashboard
+        mv test_output.csv.gz website/dashboard
 
     - name: Build documentation
       run: |
diff --git a/tests/IVIMmodels/unit_tests/reduce_output_size.py b/tests/IVIMmodels/unit_tests/reduce_output_size.py
new file mode 100644
index 0000000..6a3f16d
--- /dev/null
+++ b/tests/IVIMmodels/unit_tests/reduce_output_size.py
@@ -0,0 +1,18 @@
+import pandas as pd
+
+df = pd.read_csv('test_output.csv')
+
+# Columns to be rounded to four decimal places
+columns_to_round = [
+    'f', 'Dp', 'D', 'f_fitted', 'Dp_fitted', 'D_fitted',
+    'bval_0.0', 'bval_1.0', 'bval_2.0', 'bval_5.0', 'bval_10.0', 'bval_20.0',
+    'bval_30.0', 'bval_50.0', 'bval_75.0', 'bval_100.0', 'bval_150.0', 'bval_250.0',
+    'bval_350.0', 'bval_400.0', 'bval_550.0', 'bval_700.0', 'bval_850.0', 'bval_1000.0'
+]
+for column in columns_to_round:
+    df[column] = df[column].round(4)
+
+#df = df.loc[:, ~df.columns.str.startswith('bval')]
+
+#compress and save the file.
+df.to_csv('test_output.csv.gz', compression='gzip', index=False)
diff --git a/website/dashboard/index.html b/website/dashboard/index.html
index eefdcaa..e7ca265 100644
--- a/website/dashboard/index.html
+++ b/website/dashboard/index.html
@@ -8,6 +8,7 @@
 
 
+
 
diff --git a/website/dashboard/index.js b/website/dashboard/index.js
index f0166ae..5e07ab5 100644
--- a/website/dashboard/index.js
+++ b/website/dashboard/index.js
@@ -205,17 +205,31 @@ document.addEventListener('DOMContentLoaded', function() {
 
     showLoading();
 
-    Papa.parse('test_output.csv', {
-        download: true,
-        header: true,
-        complete: results => {
-            data = results;
-            hideLoading();
-            populateOptions(data);
-            drawBoxPlot();
-            drawRegionBoxPlot();
-
+    fetch('test_output.csv.gz')
+        .then(response => {
+            if (!response.ok) {
+                throw new Error('Network response was not ok');
             }
+            return response.arrayBuffer();
+        })
+        .then(buffer => {
+            // Use pako to decompress the data
+            var decompressed = pako.inflate(new Uint8Array(buffer), { to: 'string' });
+            // Now use Papa Parse to parse the decompressed CSV data
+            Papa.parse(decompressed, {
+                header: true,
+                complete: results => {
+                    console.log(results);
+                    data = results;
+                    hideLoading();
+                    populateOptions(data);
+                    drawBoxPlot();
+                    drawRegionBoxPlot();
+                }
+            });
+        })
+        .catch(error => {
+            console.error('There has been a problem with your fetch operation:', error);
         });
 
     function populateOptions(data) {

From 2d6afa3d9f36afee9d28d651b29db7ec4e89c970 Mon Sep 17 00:00:00 2001
From: AhmedBasem
Date: Tue, 6 Aug 2024 22:05:26 +0300
Subject: [PATCH 2/6] Fix unit tests: move reduce_output_size.py to utilities/

---
 .github/workflows/website.yml                                 | 2 +-
 .../IVIMmodels/unit_tests => utilities}/reduce_output_size.py | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename {tests/IVIMmodels/unit_tests => utilities}/reduce_output_size.py (100%)

diff --git a/.github/workflows/website.yml b/.github/workflows/website.yml
index 93d25fe..636c11d 100644
--- a/.github/workflows/website.yml
+++ b/.github/workflows/website.yml
@@ -47,7 +47,7 @@ jobs:
         name: 'Data'
 
     - name: 'Filter and compress results file.'
-      run: python tests/IVIMmodels/unit_tests/reduce_output_size.py
+      run: python utilities/reduce_output_size.py
 
     - name: move data to the dashboard folder
       run: |
diff --git a/tests/IVIMmodels/unit_tests/reduce_output_size.py b/utilities/reduce_output_size.py
similarity index 100%
rename from tests/IVIMmodels/unit_tests/reduce_output_size.py
rename to utilities/reduce_output_size.py

From ae47e2fdf3b2da193db8ee6b7fb7b68f9f137b1a Mon Sep 17 00:00:00 2001
From: AhmedBasem
Date: Tue, 6 Aug 2024 22:15:25 +0300
Subject: [PATCH 3/6] Drop b_values columns from the simplified file.
---
 utilities/reduce_output_size.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/utilities/reduce_output_size.py b/utilities/reduce_output_size.py
index 6a3f16d..0076fbc 100644
--- a/utilities/reduce_output_size.py
+++ b/utilities/reduce_output_size.py
@@ -3,16 +3,12 @@
 df = pd.read_csv('test_output.csv')
 
 # Columns to be rounded to four decimal places
-columns_to_round = [
-    'f', 'Dp', 'D', 'f_fitted', 'Dp_fitted', 'D_fitted',
-    'bval_0.0', 'bval_1.0', 'bval_2.0', 'bval_5.0', 'bval_10.0', 'bval_20.0',
-    'bval_30.0', 'bval_50.0', 'bval_75.0', 'bval_100.0', 'bval_150.0', 'bval_250.0',
-    'bval_350.0', 'bval_400.0', 'bval_550.0', 'bval_700.0', 'bval_850.0', 'bval_1000.0'
-]
+columns_to_round = ['f', 'Dp', 'D', 'f_fitted', 'Dp_fitted', 'D_fitted']
 for column in columns_to_round:
     df[column] = df[column].round(4)
 
-#df = df.loc[:, ~df.columns.str.startswith('bval')]
+#drop b_values columns.
+df = df.loc[:, ~df.columns.str.startswith('bval')]
 
 #compress and save the file.
 df.to_csv('test_output.csv.gz', compression='gzip', index=False)

From 38a68474cb31d775b101992f34acedb8f6d77cc6 Mon Sep 17 00:00:00 2001
From: AhmedBasem
Date: Wed, 7 Aug 2024 02:10:16 +0300
Subject: [PATCH 4/6] Attempt to fix unit tests: skip processing when
 test_output.csv is absent

---
 utilities/reduce_output_size.py | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/utilities/reduce_output_size.py b/utilities/reduce_output_size.py
index 0076fbc..d2401c5 100644
--- a/utilities/reduce_output_size.py
+++ b/utilities/reduce_output_size.py
@@ -1,14 +1,19 @@
 import pandas as pd
+import os
 
-df = pd.read_csv('test_output.csv')
+file_path = 'test_output.csv'
 
-# Columns to be rounded to four decimal places
-columns_to_round = ['f', 'Dp', 'D', 'f_fitted', 'Dp_fitted', 'D_fitted']
-for column in columns_to_round:
-    df[column] = df[column].round(4)
+# Check if the file exists
+if os.path.exists(file_path):
+    df = pd.read_csv(file_path)
 
-#drop b_values columns.
-df = df.loc[:, ~df.columns.str.startswith('bval')]
+    # Columns to be rounded to four decimal places
+    columns_to_round = ['f', 'Dp', 'D', 'f_fitted', 'Dp_fitted', 'D_fitted']
+    for column in columns_to_round:
+        df[column] = df[column].round(4)
 
-#compress and save the file.
-df.to_csv('test_output.csv.gz', compression='gzip', index=False)
+    #drop b_values columns.
+    df = df.loc[:, ~df.columns.str.startswith('bval')]
+
+    #compress and save the file.
+    df.to_csv('test_output.csv.gz', compression='gzip', index=False)

From 9b7286fcc660c672ea70a789cec611727b42e5ac Mon Sep 17 00:00:00 2001
From: AhmedBasem
Date: Wed, 7 Aug 2024 16:24:24 +0300
Subject: [PATCH 5/6] Process the file using the native csv module to avoid
 high memory usage

---
 utilities/reduce_output_size.py | 29 ++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/utilities/reduce_output_size.py b/utilities/reduce_output_size.py
index d2401c5..b9640ce 100644
--- a/utilities/reduce_output_size.py
+++ b/utilities/reduce_output_size.py
@@ -1,19 +1,26 @@
-import pandas as pd
 import os
+import gzip
+import csv
 
 file_path = 'test_output.csv'
 
 # Check if the file exists
 if os.path.exists(file_path):
-    df = pd.read_csv(file_path)
+    # Open the input and output files
+    with open(file_path, 'r') as infile, gzip.open('test_output.csv.gz', 'wt', newline='') as outfile:
+        reader = csv.DictReader(infile)
 
-    # Columns to be rounded to four decimal places
-    columns_to_round = ['f', 'Dp', 'D', 'f_fitted', 'Dp_fitted', 'D_fitted']
-    for column in columns_to_round:
-        df[column] = df[column].round(4)
+        # Drop b_values columns
+        fieldnames = [field for field in reader.fieldnames if not field.startswith('bval_')]
+        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
+        writer.writeheader()
 
-    #drop b_values columns.
-    df = df.loc[:, ~df.columns.str.startswith('bval')]
-
-    #compress and save the file.
-    df.to_csv('test_output.csv.gz', compression='gzip', index=False)
+        columns_to_round = ['f', 'Dp', 'D', 'f_fitted', 'Dp_fitted', 'D_fitted']
+
+        # Process each row
+        for row in reader:
+            filtered_row = {column: row[column] for column in fieldnames}
+            for column in columns_to_round:
+                if column in filtered_row:
+                    filtered_row[column] = round(float(filtered_row[column]), 4)
+            writer.writerow(filtered_row)

From 469e73f1882e3d2e3dd1365db8be6f05f502a1b9 Mon Sep 17 00:00:00 2001
From: AhmedBasem
Date: Fri, 9 Aug 2024 21:38:15 +0300
Subject: [PATCH 6/6] Enhancements: take the input and output paths as
 command-line arguments

---
 .github/workflows/website.yml   |  2 +-
 utilities/reduce_output_size.py | 55 +++++++++++++++++++++------------
 2 files changed, 37 insertions(+), 20 deletions(-)

diff --git a/.github/workflows/website.yml b/.github/workflows/website.yml
index 636c11d..1d1cf99 100644
--- a/.github/workflows/website.yml
+++ b/.github/workflows/website.yml
@@ -47,7 +47,7 @@ jobs:
         name: 'Data'
 
     - name: 'Filter and compress results file.'
-      run: python utilities/reduce_output_size.py
+      run: python utilities/reduce_output_size.py test_output.csv test_output.csv.gz
 
     - name: move data to the dashboard folder
       run: |
diff --git a/utilities/reduce_output_size.py b/utilities/reduce_output_size.py
index b9640ce..cbd9eb8 100644
--- a/utilities/reduce_output_size.py
+++ b/utilities/reduce_output_size.py
@@ -1,26 +1,43 @@
 import os
 import gzip
 import csv
+import sys
 
-file_path = 'test_output.csv'
+def reduce_output_file_size(input_file: str, output_file: str):
+    """
+    Simplify the data generated by the analysis pipeline by retaining only the essential information required for the frontend.
+ """ + if os.path.exists(input_file): + # Open the input and output files + with open(input_file, 'r') as infile, gzip.open(output_file, 'wt', newline='') as outfile: + reader = csv.DictReader(infile) -# Check if the file exists -if os.path.exists(file_path): - # Open the input and output files - with open(file_path, 'r') as infile, gzip.open('test_output.csv.gz', 'wt', newline='') as outfile: - reader = csv.DictReader(infile) + # Drop b_values columns + fieldnames = [field for field in reader.fieldnames if not field.startswith('bval_')] + writer = csv.DictWriter(outfile, fieldnames=fieldnames) + writer.writeheader() - # Drop b_values columns - fieldnames = [field for field in reader.fieldnames if not field.startswith('bval_')] - writer = csv.DictWriter(outfile, fieldnames=fieldnames) - writer.writeheader() + columns_to_round = ['f', 'Dp', 'D', 'f_fitted', 'Dp_fitted', 'D_fitted'] - columns_to_round = ['f', 'Dp', 'D', 'f_fitted', 'Dp_fitted', 'D_fitted'] - - # Process each row - for row in reader: - filtered_row = {column: row[column] for column in fieldnames} - for column in columns_to_round: - if column in filtered_row: - filtered_row[column] = round(float(filtered_row[column]), 4) - writer.writerow(filtered_row) + for row in reader: + #Delete columns starting with 'bval_' + for key in list(row.keys()): + if key.startswith('bval_'): + del row[key] + + # Round values in the remaining relevant columns + for column in columns_to_round: + if column in row: + row[column] = round(float(row[column]), 4) + writer.writerow(row) + else: + print(f"File '{input_file}' not found.") + +if __name__ == '__main__': + if len(sys.argv) != 3: + print("Usage: python reduce_output_size.py ") + sys.exit(1) + + input_file = sys.argv[1] + output_file = sys.argv[2] + reduce_output_file_size(input_file, output_file)