Skip to content

Commit

Permalink
Merge branch 'release-v0.97'
Browse files Browse the repository at this point in the history
============================== Release Notes: v0.97 ==============================
Support for new layers:
 - Mean absolute error and L1 norm
 - GPU implementation for activation layers
 - Log sigmoid and softsign
 - Channel-wise mean (temporary kludge)

Model portability & usability:
 - Hints for layer output dimensions
 - Confusion matrix callback
 - Metric checking callback

Internal features:
 - Removed target-layer-based features from model zoo
 - Layer unit tests check for expected output values

Retired features:
 - Smooth ReLU, bent identity, and swish layers
 - Target-layer-based metrics
 - Target-layer-based models (sequential, greedy layer-wise autoencoder, Siamese)
  • Loading branch information
bvanessen committed Nov 30, 2018
2 parents cd7350e + f019d34 commit 51d0c7f
Show file tree
Hide file tree
Showing 288 changed files with 7,509 additions and 7,772 deletions.
301 changes: 219 additions & 82 deletions CMakeLists.txt

Large diffs are not rendered by default.

25 changes: 24 additions & 1 deletion ReleaseNotes.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
============================== (Pending) Release Notes: v0.97 ==============================
============================== (Pending) Release Notes: v0.98 ==============================
Support for new training algorithms:

Support for new network structures:
Expand All @@ -15,6 +15,29 @@ I/O & data readers:

Build system:

Retired features:

============================== Release Notes: v0.97 ==============================
Support for new layers:
- Mean absolute error and L1 norm
- GPU implementation for activation layers
- Log sigmoid and softsign
- Channel-wise mean (temporary kludge)

Model portability & usability:
- Hints for layer output dimensions
- Confusion matrix callback
- Metric checking callback

Internal features:
- Removed target-layer-based features from model zoo
- Layer unit tests check for expected output values

Retired features:
- Smooth ReLU, bent identity, and swish layers
- Target-layer-based metrics
- Target-layer-based models (sequential, greedy layer-wise autoencoder, Siamese)

============================== Release Notes: v0.96 ==============================
Support for new layers:
- Log softmax
Expand Down
2 changes: 1 addition & 1 deletion bamboo/common_python/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def get_command(cluster,
command_allocate = ''
# Allocate a node if we don't have one already
# Running the tests manually allows for already having a node allocated
if os.getenv('SLURM_NNODES') == None:
if os.getenv('SLURM_JOB_NUM_NODES') == None:
command_allocate = 'salloc'
option_num_nodes = ''
option_partition = ''
Expand Down
4 changes: 2 additions & 2 deletions bamboo/integration_tests/common_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def get_command(cluster, dir_name, model_folder, model_name, executable,
cluster=cluster, executable=executable, num_nodes=1,
partition=partition, time_limit=time_limit, num_processes=num_processes,
dir_name=dir_name,
data_filedir_default='/p/lscratchf/brainusr/datasets/MNIST',
data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
data_reader_name='mnist', model_folder=model_folder,
model_name=model_name, num_epochs=5, optimizer_name='adagrad',
output_file_name=output_file_name, error_file_name=error_file_name)
Expand Down Expand Up @@ -103,7 +103,7 @@ def extract_data(output_file_name, data_fields, should_log):
for line in output_file:
if should_log:
print('%s: %s' % (output_file_name, line))

# Check if line is reporting model results
is_model = re.search('^Model ([0-9]+)', line)
if is_model:
Expand Down
14 changes: 7 additions & 7 deletions bamboo/integration_tests/test_integration_debug.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,15 @@ def skeleton_mnist_debug(cluster, dir_name, executables, compiler_name, weekly,
command = tools.get_command(
cluster=cluster, executable=executables[compiler_name], num_nodes=1,
partition='pbatch', time_limit=100, dir_name=dir_name,
data_filedir_default='/p/lscratchf/brainusr/datasets/MNIST',
data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
data_reader_name='mnist', model_folder='models/' + model_name,
model_name=model_name, num_epochs=5, optimizer_name='adagrad',
output_file_name=output_file_name, error_file_name=error_file_name)
output_value = common_code.run_lbann(command, model_name, output_file_name, error_file_name)
assert output_value == 0

def skeleton_cifar_debug(cluster, dir_name, executables, compiler_name, weekly, debug, should_log=False):
# If weekly or debug are true, then run the test.
# If weekly or debug are true, then run the test.
if (not weekly) and (not debug):
pytest.skip('Not doing weekly or debug testing')
if cluster == 'ray':
Expand All @@ -38,8 +38,8 @@ def skeleton_cifar_debug(cluster, dir_name, executables, compiler_name, weekly,
command = tools.get_command(
cluster=cluster, executable=executables[compiler_name], num_nodes=1,
partition='pbatch', time_limit=100, dir_name=dir_name,
data_filename_train_default='/p/lscratchf/brainusr/datasets/cifar10-bin/data_all.bin',
data_filename_test_default='/p/lscratchf/brainusr/datasets/cifar10-bin/test_batch.bin',
data_filename_train_default='/p/lscratchh/brainusr/datasets/cifar10-bin/data_all.bin',
data_filename_test_default='/p/lscratchh/brainusr/datasets/cifar10-bin/test_batch.bin',
data_reader_name='cifar10', data_reader_percent=0.01, model_folder='models/' + model_name,
model_name='conv_' + model_name, num_epochs=5, optimizer_name='adagrad',
output_file_name=output_file_name, error_file_name=error_file_name)
Expand All @@ -51,13 +51,13 @@ def test_integration_mnist_clang4_debug(cluster, dirname, exes, weekly, debug):

def test_integration_cifar_clang4_debug(cluster, dirname, exes, weekly, debug):
skeleton_cifar_debug(cluster, dirname, exes, 'clang4_debug', weekly, debug)

def test_integration_mnist_gcc4_debug(cluster, dirname, exes, weekly, debug):
skeleton_mnist_debug(cluster, dirname, exes, 'gcc4_debug', weekly, debug)

def test_integration_cifar_gcc4_debug(cluster, dirname, exes, weekly, debug):
skeleton_cifar_debug(cluster, dirname, exes, 'gcc4_debug', weekly, debug)

def test_integration_mnist_gcc7_debug(cluster, dirname, exes, weekly, debug):
skeleton_mnist_debug(cluster, dirname, exes, 'gcc7_debug', weekly, debug)

Expand Down
2 changes: 1 addition & 1 deletion bamboo/integration_tests/test_integration_io_buffers.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def skeleton_io_buffers(cluster, dir_name, executables, compiler_name, weekly):
command = tools.get_command(
cluster=cluster, executable=executables[compiler_name], num_nodes=2,
num_processes=num_ranks, dir_name=dir_name,
data_filedir_default='/p/lscratchf/brainusr/datasets/MNIST',
data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
data_reader_name='mnist', mini_batch_size=mini_batch_size,
model_folder='tests', model_name=model_name, num_epochs=5,
optimizer_name='adagrad',
Expand Down
4 changes: 2 additions & 2 deletions bamboo/unit_tests/prototext/data_reader_mnist.prototext
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ data_reader {
name: "mnist"
role: "train"
shuffle: true
data_filedir: "/p/lscratchf/brainusr/datasets/MNIST"
data_filedir: "/p/lscratchh/brainusr/datasets/MNIST"
data_filename: "train-images-idx3-ubyte"
label_filename: "train-labels-idx1-ubyte"
validation_percent: 0.1
Expand Down Expand Up @@ -34,7 +34,7 @@ data_reader {
name: "mnist"
role: "test"
shuffle: true
data_filedir: "/p/lscratchf/brainusr/datasets/MNIST"
data_filedir: "/p/lscratchh/brainusr/datasets/MNIST"
data_filename: "t10k-images-idx3-ubyte"
label_filename: "t10k-labels-idx1-ubyte"
validation_percent: 1.0
Expand Down
13 changes: 5 additions & 8 deletions bamboo/unit_tests/test_unit_check_proto_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,6 @@ def skeleton_models(cluster, dir_name, executables, compiler_name):
defective_models = []
working_models = []
for subdir, dirs, files in os.walk(dir_name + '/model_zoo/models/'):
if 'greedy' in subdir:
print('Skipping greedy_layerwise_autoencoder_mnist, kills bamboo agent')
continue
for file_name in files:
if file_name.endswith('.prototext') and "model" in file_name:
model_path = subdir + '/' + file_name
Expand All @@ -30,14 +27,14 @@ def skeleton_models(cluster, dir_name, executables, compiler_name):
print('Skipping %s because motifs are deprecated' % model_path)
continue
elif 'mnist' in file_name:
data_filedir_default = '/p/lscratchf/brainusr/datasets/MNIST'
data_filedir_default = '/p/lscratchh/brainusr/datasets/MNIST'
data_reader_name = 'mnist'
elif 'adversarial' in file_name:
data_filedir_default = '/p/lscratchf/brainusr/datasets/MNIST'
data_filedir_default = '/p/lscratchh/brainusr/datasets/MNIST'
data_reader_path = '%s/model_zoo/models/gan/mnist/adversarial_data.prototext' % (dir_name)
data_reader_name = None
elif 'discriminator' in file_name:
data_filedir_default = '/p/lscratchf/brainusr/datasets/MNIST'
data_filedir_default = '/p/lscratchh/brainusr/datasets/MNIST'
data_reader_path = '%s/model_zoo/models/gan/mnist/discriminator_data.prototext' % (dir_name)
data_reader_name = None
elif 'triplet' in file_name:
Expand Down Expand Up @@ -66,8 +63,8 @@ def skeleton_models(cluster, dir_name, executables, compiler_name):
if 'resnet50' in file_name:
node_count = 8
elif 'cifar' in file_name:
data_filename_train_default = '/p/lscratchf/brainusr/datasets/cifar10-bin/data_all.bin'
data_filename_test_default = '/p/lscratchf/brainusr/datasets/cifar10-bin/test_batch.bin'
data_filename_train_default = '/p/lscratchh/brainusr/datasets/cifar10-bin/data_all.bin'
data_filename_test_default = '/p/lscratchh/brainusr/datasets/cifar10-bin/test_batch.bin'
data_reader_name = 'cifar10'
elif 'char' in file_name:
data_filedir_default = '/p/lscratchh/brainusr/datasets/tinyshakespeare/'
Expand Down
12 changes: 6 additions & 6 deletions bamboo/unit_tests/test_unit_checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, compiler_na
command = tools.get_command(
cluster=cluster, executable=exe, num_nodes=1, num_processes=2,
dir_name=dir_name,
data_filedir_default='/p/lscratchf/brainusr/datasets/MNIST',
data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
data_reader_name='mnist', model_folder='tests',
model_name='lenet_mnist_ckpt', num_epochs=2, optimizer_name='sgd',
output_file_name=output_file_name, error_file_name=error_file_name)
Expand All @@ -28,7 +28,7 @@ def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, compiler_na
command = tools.get_command(
cluster=cluster, executable=exe, num_nodes=1, num_processes=2,
dir_name=dir_name,
data_filedir_default='/p/lscratchf/brainusr/datasets/MNIST',
data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
data_reader_name='mnist', model_folder='tests',
model_name='lenet_mnist_ckpt', num_epochs=1, optimizer_name='sgd',
output_file_name=output_file_name, error_file_name=error_file_name)
Expand All @@ -42,7 +42,7 @@ def skeleton_checkpoint_lenet_shared(cluster, executables, dir_name, compiler_na
command = tools.get_command(
cluster=cluster, executable=exe, num_nodes=1, num_processes=2,
dir_name=dir_name,
data_filedir_default='/p/lscratchf/brainusr/datasets/MNIST',
data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
data_reader_name='mnist', model_folder='tests',
model_name='lenet_mnist_ckpt', num_epochs=2, optimizer_name='sgd',
output_file_name=output_file_name, error_file_name=error_file_name)
Expand All @@ -64,7 +64,7 @@ def skeleton_checkpoint_lenet_distributed(cluster, executables, dir_name, compil
command = tools.get_command(
cluster=cluster, executable=exe, num_nodes=1, num_processes=2,
dir_name=dir_name,
data_filedir_default='/p/lscratchf/brainusr/datasets/MNIST',
data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
data_reader_name='mnist', model_folder='tests',
model_name='lenet_mnist_dist_ckpt', num_epochs=2, optimizer_name='sgd',
output_file_name=output_file_name, error_file_name=error_file_name)
Expand All @@ -79,7 +79,7 @@ def skeleton_checkpoint_lenet_distributed(cluster, executables, dir_name, compil
command = tools.get_command(
cluster=cluster, executable=exe, num_nodes=1, num_processes=2,
dir_name=dir_name,
data_filedir_default='/p/lscratchf/brainusr/datasets/MNIST',
data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
data_reader_name='mnist', model_folder='tests',
model_name='lenet_mnist_dist_ckpt', num_epochs=1, optimizer_name='sgd',
output_file_name=output_file_name, error_file_name=error_file_name)
Expand All @@ -93,7 +93,7 @@ def skeleton_checkpoint_lenet_distributed(cluster, executables, dir_name, compil
command = tools.get_command(
cluster=cluster, executable=exe, num_nodes=1, num_processes=2,
dir_name=dir_name,
data_filedir_default='/p/lscratchf/brainusr/datasets/MNIST',
data_filedir_default='/p/lscratchh/brainusr/datasets/MNIST',
data_reader_name='mnist', model_folder='tests',
model_name='lenet_mnist_dist_ckpt', num_epochs=2, optimizer_name='sgd',
output_file_name=output_file_name, error_file_name=error_file_name)
Expand Down
41 changes: 41 additions & 0 deletions bamboo/unit_tests/test_unit_layer_elu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import sys
sys.path.insert(0, '../common_python')
import tools
import pytest
import os

def skeleton_layer_elu(cluster, executables, dir_name, compiler_name):
    """Run the 'elu' layer test model with one compiler's executable.

    Skips the test when `executables` has no entry for `compiler_name`.
    Otherwise builds a launch command with `tools.get_command`, runs it
    through `os.system`, and asserts that the returned status is zero.
    """
    if compiler_name not in executables:
        pytest.skip('default_exes[%s] does not exist' % compiler_name)

    # Both file paths are keyed on the repository root and the compiler name.
    path_args = (dir_name, compiler_name)
    output_path = '%s/bamboo/unit_tests/output/layer_elu_%s_output.txt' % path_args
    error_path = '%s/bamboo/unit_tests/error/layer_elu_%s_error.txt' % path_args

    launch_command = tools.get_command(
        cluster=cluster,
        executable=executables[compiler_name],
        num_nodes=1,
        num_processes=2,
        dir_name=dir_name,
        data_filedir_default='',
        data_reader_name='synthetic',
        model_folder='tests/layer_tests',
        model_name='elu',
        optimizer_name='sgd',
        output_file_name=output_path,
        error_file_name=error_path)

    exit_status = os.system(launch_command)
    assert exit_status == 0

def test_unit_layer_elu_clang4(cluster, exes, dirname):
    """Run the 'elu' layer test with the 'clang4' executable."""
    skeleton_layer_elu(
        cluster=cluster,
        executables=exes,
        dir_name=dirname,
        compiler_name='clang4')

def test_unit_layer_elu_gcc4_check(cluster, exes, dirname):
    """Run the 'elu' layer test with the 'gcc4' executable.

    The test is skipped on the 'surface' cluster, where it is marked as a
    known failure still to be fixed.
    """
    skipped_clusters = ('surface',)
    if cluster in skipped_clusters:
        # Surface Errors:
        # assert 34304 == 0
        pytest.skip('FIXME')
    skeleton_layer_elu(
        cluster=cluster,
        executables=exes,
        dir_name=dirname,
        compiler_name='gcc4')

def test_unit_layer_elu_gcc7(cluster, exes, dirname):
    """Run the 'elu' layer test with the 'gcc7' executable."""
    skeleton_layer_elu(
        cluster=cluster,
        executables=exes,
        dir_name=dirname,
        compiler_name='gcc7')

def test_unit_layer_elu_intel18(cluster, exes, dirname):
    """Run the 'elu' layer test with the 'intel18' executable."""
    skeleton_layer_elu(
        cluster=cluster,
        executables=exes,
        dir_name=dirname,
        compiler_name='intel18')

# Run with python -m pytest -s test_unit_layer_elu.py -k 'test_unit_layer_elu_exe' --exe=<executable>
def test_unit_layer_elu_exe(cluster, dirname, exe):
    """Run the 'elu' layer test against a user-supplied executable.

    Skipped when `exe` is None. Otherwise the executable is registered
    under the key 'exe' and handed to the shared skeleton.
    """
    # Identity is the correct test for the None singleton (PEP 8); `==`
    # can be overridden by the operand's __eq__.
    if exe is None:
        pytest.skip('Non-local testing')
    exes = {'exe' : exe}
    skeleton_layer_elu(cluster, exes, dirname, 'exe')
41 changes: 41 additions & 0 deletions bamboo/unit_tests/test_unit_layer_identity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import sys
sys.path.insert(0, '../common_python')
import tools
import pytest
import os

def skeleton_layer_identity(cluster, executables, dir_name, compiler_name):
    """Run the 'identity' layer test model with one compiler's executable.

    Skips the test when `executables` has no entry for `compiler_name`.
    Otherwise builds a launch command with `tools.get_command`, runs it
    through `os.system`, and asserts that the returned status is zero.
    """
    if compiler_name not in executables:
        pytest.skip('default_exes[%s] does not exist' % compiler_name)

    # Both file paths are keyed on the repository root and the compiler name.
    path_args = (dir_name, compiler_name)
    output_path = '%s/bamboo/unit_tests/output/layer_identity_%s_output.txt' % path_args
    error_path = '%s/bamboo/unit_tests/error/layer_identity_%s_error.txt' % path_args

    launch_command = tools.get_command(
        cluster=cluster,
        executable=executables[compiler_name],
        num_nodes=1,
        num_processes=2,
        dir_name=dir_name,
        data_filedir_default='',
        data_reader_name='synthetic',
        model_folder='tests/layer_tests',
        model_name='identity',
        optimizer_name='sgd',
        output_file_name=output_path,
        error_file_name=error_path)

    exit_status = os.system(launch_command)
    assert exit_status == 0

def test_unit_layer_identity_clang4(cluster, exes, dirname):
    """Run the 'identity' layer test with the 'clang4' executable."""
    skeleton_layer_identity(
        cluster=cluster,
        executables=exes,
        dir_name=dirname,
        compiler_name='clang4')

def test_unit_layer_identity_gcc4_check(cluster, exes, dirname):
    """Run the 'identity' layer test with the 'gcc4' executable.

    The test is skipped on the 'surface' cluster, where it is marked as a
    known failure still to be fixed.
    """
    skipped_clusters = ('surface',)
    if cluster in skipped_clusters:
        # Surface Errors:
        # assert 34304 == 0
        pytest.skip('FIXME')
    skeleton_layer_identity(
        cluster=cluster,
        executables=exes,
        dir_name=dirname,
        compiler_name='gcc4')

def test_unit_layer_identity_gcc7(cluster, exes, dirname):
    """Run the 'identity' layer test with the 'gcc7' executable."""
    skeleton_layer_identity(
        cluster=cluster,
        executables=exes,
        dir_name=dirname,
        compiler_name='gcc7')

def test_unit_layer_identity_intel18(cluster, exes, dirname):
    """Run the 'identity' layer test with the 'intel18' executable."""
    skeleton_layer_identity(
        cluster=cluster,
        executables=exes,
        dir_name=dirname,
        compiler_name='intel18')

# Run with python -m pytest -s test_unit_layer_identity.py -k 'test_unit_layer_identity_exe' --exe=<executable>
def test_unit_layer_identity_exe(cluster, dirname, exe):
    """Run the 'identity' layer test against a user-supplied executable.

    Skipped when `exe` is None. Otherwise the executable is registered
    under the key 'exe' and handed to the shared skeleton.
    """
    # Identity is the correct test for the None singleton (PEP 8); `==`
    # can be overridden by the operand's __eq__.
    if exe is None:
        pytest.skip('Non-local testing')
    exes = {'exe' : exe}
    skeleton_layer_identity(cluster, exes, dirname, 'exe')
41 changes: 41 additions & 0 deletions bamboo/unit_tests/test_unit_layer_l1_norm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import sys
sys.path.insert(0, '../common_python')
import tools
import pytest
import os

def skeleton_layer_l1_norm(cluster, executables, dir_name, compiler_name):
    """Run the 'l1_norm' layer test model with one compiler's executable.

    Skips the test when `executables` has no entry for `compiler_name`.
    Otherwise builds a launch command with `tools.get_command`, runs it
    through `os.system`, and asserts that the returned status is zero.
    """
    if compiler_name not in executables:
        pytest.skip('default_exes[%s] does not exist' % compiler_name)

    # Both file paths are keyed on the repository root and the compiler name.
    path_args = (dir_name, compiler_name)
    output_path = '%s/bamboo/unit_tests/output/layer_l1_norm_%s_output.txt' % path_args
    error_path = '%s/bamboo/unit_tests/error/layer_l1_norm_%s_error.txt' % path_args

    launch_command = tools.get_command(
        cluster=cluster,
        executable=executables[compiler_name],
        num_nodes=1,
        num_processes=2,
        dir_name=dir_name,
        data_filedir_default='',
        data_reader_name='synthetic',
        model_folder='tests/layer_tests',
        model_name='l1_norm',
        optimizer_name='sgd',
        output_file_name=output_path,
        error_file_name=error_path)

    exit_status = os.system(launch_command)
    assert exit_status == 0

def test_unit_layer_l1_norm_clang4(cluster, exes, dirname):
    """Run the 'l1_norm' layer test with the 'clang4' executable."""
    skeleton_layer_l1_norm(
        cluster=cluster,
        executables=exes,
        dir_name=dirname,
        compiler_name='clang4')

def test_unit_layer_l1_norm_gcc4_check(cluster, exes, dirname):
    """Run the 'l1_norm' layer test with the 'gcc4' executable.

    The test is skipped on the 'surface' cluster, where it is marked as a
    known failure still to be fixed.
    """
    skipped_clusters = ('surface',)
    if cluster in skipped_clusters:
        # Surface Errors:
        # assert 34304 == 0
        pytest.skip('FIXME')
    skeleton_layer_l1_norm(
        cluster=cluster,
        executables=exes,
        dir_name=dirname,
        compiler_name='gcc4')

def test_unit_layer_l1_norm_gcc7(cluster, exes, dirname):
    """Run the 'l1_norm' layer test with the 'gcc7' executable."""
    skeleton_layer_l1_norm(
        cluster=cluster,
        executables=exes,
        dir_name=dirname,
        compiler_name='gcc7')

def test_unit_layer_l1_norm_intel18(cluster, exes, dirname):
    """Run the 'l1_norm' layer test with the 'intel18' executable."""
    skeleton_layer_l1_norm(
        cluster=cluster,
        executables=exes,
        dir_name=dirname,
        compiler_name='intel18')

# Run with python -m pytest -s test_unit_layer_l1_norm.py -k 'test_unit_layer_l1_norm_exe' --exe=<executable>
def test_unit_layer_l1_norm_exe(cluster, dirname, exe):
    """Run the 'l1_norm' layer test against a user-supplied executable.

    Skipped when `exe` is None. Otherwise the executable is registered
    under the key 'exe' and handed to the shared skeleton.
    """
    # Identity is the correct test for the None singleton (PEP 8); `==`
    # can be overridden by the operand's __eq__.
    if exe is None:
        pytest.skip('Non-local testing')
    exes = {'exe' : exe}
    skeleton_layer_l1_norm(cluster, exes, dirname, 'exe')
Loading

0 comments on commit 51d0c7f

Please sign in to comment.