-
Notifications
You must be signed in to change notification settings - Fork 4.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'master' into jeffra/inject_v2
- Loading branch information
Showing
50 changed files
with
1,335 additions
and
253 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
# CI workflow: build DeepSpeed with all ops precompiled and run the unit tests.
# Reconstructed from a garbled diff extraction (table artifacts removed).

name: Tests-w-precompiled-ops

# Controls when the action will run.
on:
  # Allows you to run this workflow manually from the Actions tab.
  workflow_dispatch:

# A workflow run is made up of one or more jobs that can run sequentially or in parallel.
jobs:
  # This workflow contains a single job called "build".
  build:
    # Self-hosted runner: needed for the GPU (nvidia-smi / CUDA checks below).
    runs-on: self-hosted

    # Steps represent a sequence of tasks that will be executed as part of the job.
    steps:
      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it.
      - uses: actions/checkout@v2

      # Sanity-print the toolchain so failures are easy to diagnose from the log.
      - name: environment
        run: |
          nvidia-smi
          which python
          python --version
          which nvcc
          nvcc --version
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      # DS_BUILD_OPS=1 forces all custom ops to be compiled at install time
      # rather than JIT-compiled on first use; ds_report confirms what was built.
      - name: Install deepspeed
        run: |
          DS_BUILD_OPS=1 pip install .[dev]
          ds_report

      - name: Formatting checks
        run: |
          pre-commit run --all-files

      # Fresh TORCH_EXTENSIONS_DIR so stale JIT-built extensions from a
      # previous run cannot mask build problems.
      - name: Unit tests
        run: |
          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
          TORCH_EXTENSIONS_DIR=./torch-extensions pytest --durations=0 --forked --verbose -x tests/unit/
Submodule DeepSpeedExamples
updated
11 files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
#!/usr/bin/env python
"""Inspect a DeepSpeed elasticity configuration.

Pretty-prints the ``elasticity`` section of a DeepSpeed config JSON and,
when a world size is supplied via ``-w``, the batch-size values that
``compute_elastic_config`` derives for that world size.
"""

import argparse
import json

import deepspeed
from deepspeed.elasticity import compute_elastic_config

# Visual divider used between output sections.
_SEPARATOR = '------------------------------------------'


def main():
    """Parse CLI arguments, load the config file, and print the results."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', type=str, help="DeepSpeed config json")
    parser.add_argument('-w',
                        '--world-size',
                        type=int,
                        default=0,
                        help="Intended/current world size")
    args = parser.parse_args()

    # Use a context manager so the file handle is closed promptly
    # (original used a bare open() inside json.load).
    with open(args.config, 'r') as config_file:
        ds_config = json.load(config_file)

    ds_version = deepspeed.__version__

    elastic_config = ds_config['elasticity']
    print(_SEPARATOR)
    print("Elasticity config:")
    print(_SEPARATOR)
    print(json.dumps(elastic_config, indent=4, sort_keys=True))

    if args.world_size > 0:
        # With a concrete world size the micro batch size is also resolved.
        final_batch_size, valid_gpus, micro_batch_size = compute_elastic_config(
            ds_config=ds_config,
            target_deepspeed_version=ds_version,
            world_size=args.world_size)
        print(_SEPARATOR)
        print(f"Calculated results for world size {args.world_size}:")
        print(_SEPARATOR)
        print(f'final_batch_size .... {final_batch_size}')
        print(f'valid_gpus .......... {valid_gpus}')
        print(f'micro_batch_size .... {micro_batch_size}')
    else:
        final_batch_size, valid_gpus = compute_elastic_config(
            ds_config=ds_config, target_deepspeed_version=ds_version)
        print(_SEPARATOR)
        print("Calculated results:")
        print(_SEPARATOR)
        print(f'final_batch_size .... {final_batch_size}')
        print(f'valid_gpus .......... {valid_gpus}')


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
'''
Copyright 2020 The Microsoft DeepSpeed Team
'''

#############################################
# Torch distributed constants
#############################################
# Port used for the torch.distributed rendezvous when the user does not
# specify one; 29500 is the conventional torch.distributed master port.
TORCH_DISTRIBUTED_DEFAULT_PORT = 29500
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from .elasticity import compute_elastic_config, elasticity_enabled, ensure_immutable_elastic_config |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
""" | ||
Copyright 2020 The Microsoft DeepSpeed Team | ||
""" | ||
|
||
import json | ||
from .constants import * | ||
|
||
|
||
class ElasticityError(Exception):
    """Base exception for all elasticity related errors."""
|
||
|
||
class ElasticityConfigError(ElasticityError):
    """Raised when the elasticity configuration is invalid."""
|
||
|
||
class ElasticityIncompatibleWorldSize(ElasticityError):
    """Raised when a world size is incompatible with the given elastic config."""
|
||
|
||
class ElasticityConfig:
    """
    Elastic config object, constructed from a param dictionary that only contains elastic
    config parameters, example below:

    If elasticity is enabled, user must specify (at least) max_train_batch_size
    and micro_batch_sizes.

    {
        "enabled": true,
        "max_train_batch_size": 2000,
        "micro_batch_sizes": [2,4,6],
        "min_gpus": 1,
        "max_gpus": 10000,
        "min_time": 20,
        "ignore_non_elastic_batch_info": false,
        "version": 0.1
    }
    """
    def __init__(self, param_dict):
        """Populate config attributes from ``param_dict``.

        Args:
            param_dict: dict holding only the elasticity section of a
                DeepSpeed config.

        Raises:
            ElasticityConfigError: if elasticity is enabled but either
                max_train_batch_size or micro_batch_sizes is missing.
        """
        self.enabled = param_dict.get(ENABLED, ENABLED_DEFAULT)
        if self.enabled:
            # When elasticity is enabled both knobs are mandatory; fail fast
            # with a message naming the missing key.
            if MAX_ACCEPTABLE_BATCH_SIZE in param_dict:
                self.max_acceptable_batch_size = param_dict[MAX_ACCEPTABLE_BATCH_SIZE]
            else:
                raise ElasticityConfigError(
                    f"Elasticity config missing {MAX_ACCEPTABLE_BATCH_SIZE}")
            if MICRO_BATCHES in param_dict:
                self.micro_batches = param_dict[MICRO_BATCHES]
            else:
                raise ElasticityConfigError(f"Elasticity config missing {MICRO_BATCHES}")
        else:
            self.max_acceptable_batch_size = param_dict.get(
                MAX_ACCEPTABLE_BATCH_SIZE,
                MAX_ACCEPTABLE_BATCH_SIZE_DEFAULT)
            if MICRO_BATCHES in param_dict:
                self.micro_batches = param_dict[MICRO_BATCHES]
            else:
                # Copy the module-level default so instances never share (and
                # accidentally mutate) the same MICRO_BATCHES_DEFAULT list.
                self.micro_batches = list(MICRO_BATCHES_DEFAULT)
        self.min_gpus = param_dict.get(MIN_GPUS, MIN_GPUS_DEFAULT)
        self.max_gpus = param_dict.get(MAX_GPUS, MAX_GPUS_DEFAULT)
        self.min_time = param_dict.get(MIN_TIME, MIN_TIME_DEFAULT)
        self.version = param_dict.get(VERSION, VERSION_DEFAULT)
        self.prefer_larger_batch_size = param_dict.get(PREFER_LARGER_BATCH,
                                                       PREFER_LARGER_BATCH_DEFAULT)
        self.ignore_non_elastic_batch_info = param_dict.get(
            IGNORE_NON_ELASTIC_BATCH_INFO,
            IGNORE_NON_ELASTIC_BATCH_INFO_DEFAULT)

    def repr(self):
        # NOTE: plain attribute dict (not a string); kept for backward
        # compatibility with any caller using config.repr().
        return self.__dict__

    def __repr__(self):
        # JSON form is stable (sorted keys) and human-readable in logs.
        return json.dumps(self.__dict__, sort_keys=True, indent=4)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
""" | ||
Copyright 2020 The Microsoft DeepSpeed Team | ||
""" | ||
|
||
######################################### | ||
# Elasticity | ||
######################################### | ||
''' Elasticity Utility in DeepSpeed can be used to create highly elastic jobs compatible | ||
with a large number of GPUs. For elastic jobs, DeepSpeed will provide a batch size that | ||
can support a large number of GPUs based on the user specified parameters | ||
''' | ||
FORMAT = ''' | ||
Elasticity should be enabled as: | ||
"elasticity": { | ||
"enabled": true, | ||
"max_train_batch_size": 2000, | ||
"micro_batch_sizes": [2,4,6], | ||
"min_gpus": 1, | ||
"max_gpus" : 10000 | ||
"min_time": 20, | ||
"prefer_larger_batch": true, | ||
"ignore_non_elastic_batch_info": false, | ||
"version": 0.1 | ||
} | ||
''' | ||
|
||
ELASTICITY = 'elasticity' | ||
|
||
# Current elasticity version | ||
LATEST_ELASTICITY_VERSION = 0.1 | ||
|
||
ENABLED = 'enabled' | ||
ENABLED_DEFAULT = False | ||
|
||
# Max acceptable train_batch_size | ||
MAX_ACCEPTABLE_BATCH_SIZE = 'max_train_batch_size' | ||
MAX_ACCEPTABLE_BATCH_SIZE_DEFAULT = 2000 | ||
|
||
# Acceptable micro batch sizes, same as train_micro_batch_size_per_gpu | ||
MICRO_BATCHES = 'micro_batch_sizes' | ||
MICRO_BATCHES_DEFAULT = [2, 4, 6] | ||
|
||
# Min/max of GPUs to search over | ||
MIN_GPUS = 'min_gpus' | ||
MIN_GPUS_DEFAULT = 1 | ||
MAX_GPUS = 'max_gpus' | ||
MAX_GPUS_DEFAULT = 10000 | ||
|
||
# Minimum running time (minutes) before the scheduler will scale us | ||
MIN_TIME = "min_time" | ||
MIN_TIME_DEFAULT = "20" | ||
|
||
# When finding a suitable batch size, attempt to find one that is closest | ||
# to the max train batch size given. | ||
PREFER_LARGER_BATCH = 'prefer_larger_batch' | ||
PREFER_LARGER_BATCH_DEFAULT = True | ||
|
||
# In order to reduce confusion, if elastic mode is enabled we | ||
# require (via assert) that no batch info is set outside of the | ||
# elastic config. You can turn off this assert via this config | ||
# but keep in mind that all batch info defined outside the | ||
# elastic mode *will be ignored*. | ||
IGNORE_NON_ELASTIC_BATCH_INFO = 'ignore_non_elastic_batch_info' | ||
IGNORE_NON_ELASTIC_BATCH_INFO_DEFAULT = False | ||
|
||
# Version of elastic logic to use | ||
VERSION = "version" | ||
VERSION_DEFAULT = LATEST_ELASTICITY_VERSION | ||
|
||
# Minimum deepspeed version to use elasticity | ||
MINIMUM_DEEPSPEED_VERSION = "0.3.8" | ||
|
||
# Environment variable storing elastic config from resource scheduler | ||
DEEPSPEED_ELASTICITY_CONFIG = "DEEPSPEED_ELASTICITY_CONFIG" |
Oops, something went wrong.