Just sklearn (#23)
* adding dependency pandas

* new sklearn

* Good sklearn

* run sklearn on CI

---------

Co-authored-by: Zhuoxuan Zhang <[email protected]>
Zhuoxuan-Zhang and Zhuoxuan Zhang authored Oct 24, 2024
1 parent e66798a commit 97d9bd7
Showing 26 changed files with 200 additions and 84 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -11,7 +11,7 @@ jobs:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        benchmark: [nlp, file-enc, unix50, log-analysis, max-temp, uniq-ips, media-conv]
        benchmark: [nlp, file-enc, unix50, log-analysis, max-temp, uniq-ips, media-conv, sklearn]

    steps:
      - name: Checkout code
4 changes: 0 additions & 4 deletions sklearn/.gitignore

This file was deleted.

8 changes: 1 addition & 7 deletions sklearn/README.md
@@ -1,5 +1,5 @@
# sklearn benchmark
This benchmark runs a series of scripts that train a model from sklearn (Scikit-Learn). The scripts were obtained by decomposing the sklearn source code by hand. [Original](https://github.com/scikit-learn/scikit-learn/blob/289326704e13f7a5bf4c6c594c038051e968e1fd/sklearn/linear_model/_logistic.py)
This benchmark runs a series of scripts that train a model from sklearn (Scikit-Learn). The scripts were obtained by decomposing the sklearn source code by hand.

## Purpose
This benchmark shows two things for a system like hS: viability in AI workflows and correctness. The first is self-explanatory: if hS can run this benchmark, it has proven that it can glue together a nontrivial ML training workflow.
@@ -8,12 +8,6 @@ The second is correctness. There is a very clear ground truth (the model trained
## Usage
Running fit.sh will generate temporary files in a ./tmp folder

Before running, the user needs to install the packages (possibly in a
virtual environment) with `pip install -r requirements.txt`, and make
sure the result directory exists (`mkdir -p result`). Then run
`run.sh` in an environment where `python` is aliased to the
correct python3 installation (i.e. in a virtual environment).

To parallelize, we want one-vs-rest classification, where we generate one model per class.
Additionally, the forest cover dataset has many more samples than features.
This makes the Newton-Cholesky solver ideal for this task.
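For reference, the ground truth that the decomposed pipeline is compared against can be reproduced with a plain sklearn fit. The following is a minimal sketch, assuming scikit-learn >= 1.2 (for the newton-cholesky solver); the max_iter value is illustrative, since the benchmark passes it to gen_model.py as an argument:

from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Fetch the forest cover dataset and reproduce the benchmark's train/test split.
X, y = datasets.fetch_covtype(data_home="inputs", return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# One-vs-rest logistic regression with the Newton-Cholesky solver, as in gen_model.py.
reg = LogisticRegression(max_iter=100, solver='newton-cholesky', multi_class='ovr')
reg.fit(X_train, y_train)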
13 changes: 13 additions & 0 deletions sklearn/deps.sh
@@ -0,0 +1,13 @@
#!/bin/bash

export PASH_SPEC_TOP=${PASH_SPEC_TOP:-$(git rev-parse --show-toplevel --show-superproject-working-tree)}

benchmark_dir="sklearn"

cd "$(realpath $(dirname "$0"))"
mkdir -p "$PASH_SPEC_TOP/report/resources/sklearn"
mkdir -p "$PASH_SPEC_TOP/report/output/sklearn"

# Currently just dumped the entire dataset, but ideally we actually download it

pip install -r requirements.txt
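For the download step hinted at in the comment above, one possibility would be to fetch the covertype dataset at dependency-setup time instead of checking the pickled inputs into the repository. This is a sketch only, not what deps.sh currently does:

# Hypothetical download step: fetch the covertype dataset into inputs/
# (gen_samples.py later loads it with download_if_missing=False).
from sklearn.datasets import fetch_covtype

fetch_covtype(data_home="inputs", download_if_missing=True)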
8 changes: 0 additions & 8 deletions sklearn/generate.sh

This file was deleted.

4 changes: 1 addition & 3 deletions sklearn/inputs.sh → sklearn/input.sh
@@ -5,6 +5,4 @@ mkdir -p tmp
mkdir -p result
mkdir -p inputs

/usr/bin/env python3 -c "from sklearn.datasets import fetch_kddcup99; fetch_kddcup99(data_home=\"inputs\", percent10=False, download_if_missing=True)"


/usr/bin/env python3 -c "from sklearn.datasets import fetch_kddcup99; fetch_kddcup99(data_home=\"inputs\", percent10=False, download_if_missing=True)"
Binary file added sklearn/inputs/covertype/samples_py3
Binary file added sklearn/inputs/covertype/targets_py3
92 changes: 92 additions & 0 deletions sklearn/run
@@ -0,0 +1,92 @@
#!/usr/bin/env python3

import argparse
from pathlib import Path
import os
import time
from subprocess import run, PIPE

parser = argparse.ArgumentParser(description="Run benchmark")
parser.add_argument('--window', default=5, type=int, help='window size to run hs with')
parser.add_argument('--target', choices=['hs-only', 'sh-only', 'both'],
                    help='to run with sh or hs')
parser.add_argument('--log', choices=['enable', 'disable'], default="enable",
                    help='whether to enable logging for hs')

env = os.environ.copy()
SCRIPT_NAME = "run.sh"


def do_sh_run(test_base: Path, output_base: Path, env: dict):
    before = time.time()
    print(f'Running {test_base / SCRIPT_NAME}')
    result = run(['/bin/sh', test_base / SCRIPT_NAME], stdout=PIPE, env=env)
    duration = time.time() - before
    with open(output_base / "sh_time", 'w') as f:
        f.write(f'{duration}\n')
    os.rename(env["OUTPUT_DIR"] / "trained_model.obj", env["OUTPUT_DIR"] / "sh_trained_model.obj")
    return result.returncode, result.stdout

def do_hs_run(test_base: Path, output_base: Path, hs_base: Path, window: int, env: dict, log: bool):
    cmd = [hs_base / 'pash-spec.sh', '--window', str(window)]
    if log:
        cmd.extend(['-d', '2'])
    cmd.append(test_base / SCRIPT_NAME)
    before = time.time()
    print(f'Running {cmd}')
    with open(output_base / 'hs_log', 'w') as log_file:
        result = run(cmd, stdout=PIPE, stderr=log_file, env=env)
    duration = time.time() - before
    with open(output_base / "hs_time", 'w') as f:
        f.write(f'{duration}\n')
    os.rename(env["OUTPUT_DIR"] / "trained_model.obj", env["OUTPUT_DIR"] / "hs_trained_model.obj")
    return result.returncode, result.stdout

if __name__ == '__main__':
    args = parser.parse_args()
    test_base = Path(__file__).parent.resolve()
    hs_base = test_base.parent.parent.parent

    #######################
    # SPECIFY ENV VARS HERE

    env['TMP'] = hs_base / 'report' / 'resources' / 'sklearn'
    env['RESULT'] = hs_base / 'report' / 'output' / 'sklearn'
    env['OUTPUT_DIR'] = hs_base / 'report' / 'output' / 'sklearn'

    #######################

    bench_base = test_base.parent
    local_name = os.sep.join(test_base.parts[-1:])
    print(local_name)
    output_base = hs_base / "report" / "output" / 'sklearn' / local_name
    run_hs = False
    run_sh = False
    if args.target in ["hs-only", "both"]:
        run_hs = True
    if args.target in ["sh-only", "both"]:
        run_sh = True
    if not run_hs and not run_sh:
        raise ValueError("Not running anything, add --target argument")
    output_base.mkdir(parents=True, exist_ok=True)


    if run_sh:
        output_sh = do_sh_run(test_base, output_base, env)
    if run_hs:
        output_hs = do_hs_run(test_base, output_base, hs_base, args.window, env, args.log == 'enable')
    if run_sh and run_hs:
        with open(output_base / 'error', 'w') as errf:
            print(output_sh[:100])
            if output_sh == output_hs:
                errf.write('')
            else:
                errf.write('error\n')
                errf.write(f'return code {output_sh[0]} vs {output_hs[0]}\n')
                errf.write(f'==== output sh ====\n')
                errf.write(output_sh[1].decode('UTF-8'))
                errf.write(f'==== output hs ====\n')
                errf.write(output_hs[1].decode('UTF-8'))
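Beyond comparing return codes and stdout, the two pickled models themselves can be checked against each other. A sketch, assuming both runs completed and left sh_trained_model.obj and hs_trained_model.obj in OUTPUT_DIR:

# Compare the sh- and hs-trained models coefficient-wise.
import os
import pickle

import numpy as np

out = os.environ["OUTPUT_DIR"]
with open(os.path.join(out, "sh_trained_model.obj"), "rb") as f:
    sh_model = pickle.load(f)
with open(os.path.join(out, "hs_trained_model.obj"), "rb") as f:
    hs_model = pickle.load(f)

same = (np.allclose(sh_model.coef_, hs_model.coef_)
        and np.allclose(sh_model.intercept_, hs_model.intercept_))
print("models match" if same else "models differ")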


48 changes: 25 additions & 23 deletions sklearn/run.sh
@@ -1,10 +1,10 @@
#!/bin/bash

set -e

PYTHON=${PYTHON:-`which python3`}
PYTHON="python3"
OUT=${OUT:-$PWD/result}
TMP=${TMP:-$PWD/tmp}
#export tmp to env
export TMP
SCRIPTS=${SCRIPTS:-$PWD/scripts}

# Ideally, we'll move on to piping rather than writing to a file
@@ -17,6 +17,18 @@ MAX_SQ_SUM=$TMP/max_squared_sum.obj
WARM_COEF=$TMP/warm_start_coef.obj
C_=$TMP/C_.obj

echo $PYTHON >&2
echo "DIR: $DIR" >&2
echo "SCRIPTS: $SCRIPTS" >&2
echo "MODEL: $MODEL" >&2
echo "X: $X" >&2
echo "y: $y" >&2
echo "CLASSES: $CLASSES" >&2
echo "DUAL: $DUAL" >&2
echo "MAX_SQ_SUM: $MAX_SQ_SUM" >&2
echo "WARM_COEF: $WARM_COEF" >&2
echo "C_: $C_" >&2

# TODO: Try this out on a larger dataset
# TODO: Benchmark each phase

@@ -31,8 +43,9 @@ $PYTHON $SCRIPTS/check_solver.py $MODEL
penalty=$($PYTHON $SCRIPTS/penalty.py $MODEL)
$PYTHON $SCRIPTS/val_data.py $MODEL $X $y
$PYTHON $SCRIPTS/classes.py $MODEL $y # This should return a classes with just the unique classes in y
echo "$PYTHON $SCRIPTS/check_multiclass.py $MODEL" >&2
multiclass=$($PYTHON $SCRIPTS/check_multiclass.py $MODEL)

echo "------" >&2
# TODO: Benchmark each step of the pipeline
# Make a modified pipeline where each step writes its output to a file

@@ -41,30 +54,19 @@ $PYTHON $SCRIPTS/rownorm.py $X
n_classes=$($PYTHON $SCRIPTS/reshape_classes.py $MODEL $CLASSES)
$PYTHON $SCRIPTS/warm_start.py $MODEL $multiclass $n_classes # pipes coefficients

# KDD Cup 99 dataset has 23 classes
# Covtype dataset has 7 classes
echo "WARM_COEF: $WARM_COEF" >&2
echo "MAX_SQ_SUM: $MAX_SQ_SUM" >&2

echo "multiclass: $multiclass" >&2
echo "penalty: $penalty" >&2
$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 1
$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 2
$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 3
$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 4
$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 5
$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 6
$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 7
$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 8
$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 9
$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 10
$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 11
$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 12
$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 13
$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 14
$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 15
$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 16
$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 17
$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 18
$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 19
$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 20
$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 21
$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 22
$PYTHON $SCRIPTS/parallel.py $MODEL $X $y $C_ $WARM_COEF $MAX_SQ_SUM $multiclass $penalty 23

$PYTHON $SCRIPTS/zip_coef.py $MODEL $n_classes
$PYTHON $SCRIPTS/adjust_coef.py $MODEL $X $multiclass $n_classes $OUT/trained_model.obj
$PYTHON $SCRIPTS/zip_coef.py $MODEL
$PYTHON $SCRIPTS/adjust_coef.py $MODEL $X $multiclass $n_classes $RESULT/trained_model.obj
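The block of parallel.py invocations above spells out the one-vs-rest fan-out by hand (23 classes for KDD Cup 99; the covertype dataset has 7), which is the parallelization opportunity the README points to. A sketch of the same fan-out as a loop, written in Python purely for illustration; the function name and n_classes default are assumptions, and the arguments mirror the shell variables in run.sh:

import subprocess

def fan_out(python, scripts, model, x, y, c_, warm_coef, max_sq_sum,
            multiclass, penalty, n_classes=7):
    # Run parallel.py once per class index, as the hand-written lines above do.
    for class_idx in range(1, n_classes + 1):
        subprocess.run([python, f"{scripts}/parallel.py", model, x, y, c_,
                        warm_coef, max_sq_sum, multiclass, penalty,
                        str(class_idx)], check=True)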
11 changes: 7 additions & 4 deletions sklearn/scripts/adjust_coef.py
100644 → 100755
@@ -1,9 +1,11 @@
import os
import sys
import pickle
import numpy as np
import os

with open(f'{os.environ.get("TMP","./tmp")}/fold_coef.obj', 'rb') as file:
tmp = os.environ.get('TMP')
filepath = os.path.join(tmp, 'fold_coef.obj')
with open(filepath, 'rb') as file:
    fold_coefs_ = pickle.load(file)

model_file, X_file, multi_class, n_classes, destination = sys.argv[1:6]
@@ -30,5 +32,6 @@
else:
    model.intercept_ = np.zeros(n_classes)

with open(destination, 'wb') as file:
    pickle.dump(model, file)
filepath = os.path.join(tmp, 'trained_model.obj')
with open(filepath, 'wb') as file:
    pickle.dump(model, file)
Empty file modified sklearn/scripts/check_multiclass.py
100644 → 100755
Empty file modified sklearn/scripts/check_solver.py
100644 → 100755
Empty file modified sklearn/scripts/classes.py
100644 → 100755
7 changes: 4 additions & 3 deletions sklearn/scripts/fold_coef.py
100644 → 100755
@@ -62,6 +62,7 @@
    )
    for class_, warm_start_coef_ in zip(classes, warm_start_coef)
)

with open(f'{os.environ.get("TMP","./tmp")}/fold_coef.obj', 'w+b') as file:
    pickle.dump(fold_coefs_, file)
tmp = os.environ.get('TMP')
filepath = os.path.join(tmp, 'fold_coef.obj')
with open(filepath, 'w+b') as file:
    pickle.dump(fold_coefs_, file)
7 changes: 5 additions & 2 deletions sklearn/scripts/gen_model.py
100644 → 100755
@@ -6,5 +6,8 @@
reg = LogisticRegression(max_iter=int(sys.argv[1]),
                         solver='newton-cholesky',
                         multi_class='ovr')
with open(f'{os.environ.get("TMP","./tmp")}/model.obj', 'w+b') as file:
    pickle.dump(reg, file)

tmp = os.environ.get('TMP')
filepath = os.path.join(tmp, 'model.obj')
with open(filepath, 'w+b') as file:
    pickle.dump(reg, file)
19 changes: 8 additions & 11 deletions sklearn/scripts/gen_samples.py
100644 → 100755
@@ -1,22 +1,19 @@
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn import datasets
import pickle
import pandas as pd
import numpy as np
import os

X, y = datasets.fetch_kddcup99(data_home="inputs", percent10=False, return_X_y=True, as_frame=True, download_if_missing=True)
X = pd.DataFrame(X).drop(columns=["protocol_type", "service", "flag"]).astype(float)
X[X.columns] = MinMaxScaler().fit_transform(X[X.columns])
X = X.to_numpy()
y = LabelEncoder().fit_transform(y).astype(np.int32)

data = train_test_split(X,
                        y,
raw_data = datasets.fetch_covtype(data_home="inputs", download_if_missing=False)

data = train_test_split(raw_data.data,
                        raw_data.target,
                        test_size=0.2,
                        random_state=0)
filenames = ['X_train', 'X_test', 'y_train', 'y_test']
tmp = os.environ.get('TMP')
for datum, name in zip(data, filenames):
    with open(f'{os.environ.get("TMP","./tmp")}/{name}.obj', 'w+b') as file:
    filepath = os.path.join(tmp, f'{name}.obj')
    with open(filepath, 'w+b') as file:
        pickle.dump(datum, file)
6 changes: 4 additions & 2 deletions sklearn/scripts/parallel.py
100644 → 100755
@@ -34,5 +34,7 @@
    sample_weight=None,
)

with open(f'{os.environ.get("TMP","./tmp")}/result_{class_}.obj', 'wb') as file:
    pickle.dump(result, file)
tmp = os.environ.get('TMP')
filepath = os.path.join(tmp, f'result_{class_}.obj')
with open(filepath, 'w+b') as file:
    pickle.dump(result, file)
6 changes: 4 additions & 2 deletions sklearn/scripts/penalty.py
100644 → 100755
@@ -1,8 +1,8 @@
import warnings
import sys
import os
import pickle
import numpy as np
import os

with open(sys.argv[1], 'rb') as file:
    model = pickle.load(file)
@@ -34,6 +34,8 @@
C_ = model.C
penalty = model.penalty

with open(f'{os.environ.get("TMP","./tmp")}/C_.obj', 'w+b') as file:
tmp = os.environ.get('TMP')
filepath = os.path.join(tmp, 'C_.obj')
with open(filepath, 'w+b') as file:
    pickle.dump(C_, file)
print(penalty)
Empty file modified sklearn/scripts/reshape_classes.py
100644 → 100755
6 changes: 4 additions & 2 deletions sklearn/scripts/rownorm.py
100644 → 100755
@@ -1,12 +1,14 @@
from sklearn.linear_model import _logistic
import sys
import os
import pickle
import os

with open(sys.argv[1], 'rb') as file:
    X = pickle.load(file)

max_squared_sum = _logistic.row_norms(X, squared=True).max()

with open(f'{os.environ.get("TMP","./tmp")}/max_squared_sum.obj', 'w+b') as file:
tmp = os.environ.get('TMP')
filepath = os.path.join(tmp, 'max_squared_sum.obj')
with open(filepath, 'w+b') as file:
    pickle.dump(max_squared_sum, file)
Empty file modified sklearn/scripts/val_data.py
100644 → 100755
10 changes: 6 additions & 4 deletions sklearn/scripts/warm_start.py
100644 → 100755
@@ -1,7 +1,7 @@
import sys
import os
import numpy as np
import pickle
import os

with open(sys.argv[1], 'rb') as file:
    model = pickle.load(file)
@@ -16,13 +16,15 @@
warm_start_coef = np.append(
    warm_start_coef, model.intercept_[:, np.newaxis], axis=1
)

tmp = os.environ.get('TMP')
if multi_class == "multinomial":
    with open(f'{os.environ.get("TMP","./tmp")}/classes.obj', 'wb') as file:
    filepath = os.path.join(tmp, 'classes.obj')
    with open(filepath, 'wb') as file:
        pickle.dump([None], file)
    warm_start_coef = [warm_start_coef]
if warm_start_coef is None:
    warm_start_coef = [None] * n_classes

with open(f'{os.environ.get("TMP","./tmp")}/warm_start_coef.obj', 'w+b') as file:
filepath = os.path.join(tmp, 'warm_start_coef.obj')
with open(filepath, 'w+b') as file:
    pickle.dump(warm_start_coef, file)
