diff --git a/autodist/autosync/__init__.py b/autodist/autosync/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/autodist/autosync/simulator/__init__.py b/autodist/autosync/simulator/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/autodist/autosync/simulator/base.py b/autodist/autosync/simulator/base.py new file mode 100644 index 0000000..e670cfa --- /dev/null +++ b/autodist/autosync/simulator/base.py @@ -0,0 +1,143 @@ +# Copyright 2020 Petuum. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Simulator base class.""" +from collections import OrderedDict + +import os + +from autodist.graph_item import GraphItem +from autodist.kernel.partitioner import PartitionerConfig +from autodist.resource_spec import ResourceSpec +from autodist.strategy.auto.item import VariableItem, PartItem, ResourceItem + + +class SimulatorBase: + """Simulates strategies for a given graph and resource spec.""" + + def __init__(self, + graph_item=None, + resource_spec=None): + """ + Constructor for simulator base class + Args: + graph_item: a GraphItem object, or a path to a serialized GraphItem object. + resource_spec: a ResourceSpec object, or a path to a resource file. + """ + # check if it is a path + self._graph_item = None + if isinstance(graph_item, GraphItem): + self._graph_item = graph_item + elif isinstance(graph_item, str) and os.path.exists(graph_item): + self._graph_item = GraphItem.deserialize(graph_item) + else: + raise ValueError("Invalid graph_item: {}".format(graph_item)) + + self._resource_spec = None + if isinstance(resource_spec, ResourceSpec): + self._resource_spec = resource_spec + elif isinstance(resource_spec, str) and os.path.exists(resource_spec): + self._resource_spec = ResourceSpec(resource_spec) + else: + raise ValueError("Invalid resource_spec: {}".format(resource_spec)) + + def update_graph_item(self, graph_item): + """Change the default graph_item with this simulator.""" + if not graph_item: + raise ValueError('Empty graph item.') + self._graph_item = graph_item + + def update_resource_spec(self, resource_spec): + """Change the default resource_spec with this simulator.""" + if not resource_spec: + raise ValueError('Empty resource spec.') + self._resource_spec = resource_spec + + def simulate(self, + strategy, + graph_item=None, + resource_spec=None, + *args, + **kwargs): + """Return simulated runtime cost given (strategy, graph_item, resource_spec) tuple.""" + raise NotImplementedError() + + def inference(self, *args, **kwargs): + """Abstract method for simulator inference.""" + raise NotImplementedError() + + def load_checkpoint(self, checkpoint): + """ + Load a checkpoint file as weights of the simulator. + + Args: + checkpoint: path to a checkpoint file. + """ + raise NotImplementedError() + + # def save_checkpoint(self, model, checkpoint): + # """ + # Save a trained weight as a checkpoint file. + # + # Args: + # model: trained model. + # checkpoint: path where to save the checkpoint. 
+ # """ + # raise NotImplementedError() + + def preprocess(self, + strategy, + graph_item=None, + resource_spec=None): + """ + Preprocess a (strategy, graph_item, resource_spec) tuple into pre-features. + + Args: + strategy: a distribution strategy + graph_item: optional graph_item, if not provided, the default one bundled with simulator will be used. + resource_spec: optional resource_spec, if not provided, the default one bundled with simulator will be used. + + Returns: + OrderedDict(): variable/part name to variable/part items. + ResourceItem: + """ + if not graph_item: + if not self._graph_item: + raise ValueError('No graph item provided.') + else: + graph_item = self._graph_item + if not resource_spec: + if not self._resource_spec: + raise ValueError('No resource spec provided.') + else: + resource_spec = self._resource_spec + if not strategy: + raise ValueError('No strategy provided.') + + resource_item = ResourceItem(resource_spec) + name_to_var = {var.name: var for var_op, var in graph_item.trainable_var_op_to_var.items()} + + name_to_items = OrderedDict() + for node in strategy.node_config: + var_name = node.var_name + var = name_to_var[var_name] + if node.partitioner: + pc = PartitionerConfig(partition_str=node.partitioner) + for i, part in enumerate(node.part_config): + part_item = PartItem(var, graph_item, i, pc, part) + name_to_items[part_item.name] = part_item + else: + var_item = VariableItem(var, graph_item, node) + name_to_items[var_item.name] = var_item + return name_to_items, resource_item diff --git a/autodist/autosync/simulator/linear_simulator.py b/autodist/autosync/simulator/linear_simulator.py new file mode 100644 index 0000000..d0ee310 --- /dev/null +++ b/autodist/autosync/simulator/linear_simulator.py @@ -0,0 +1,193 @@ +# Copyright 2020 Petuum Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Predefined simulator with linear model.""" +import os +import pickle as pkl + +import tensorflow as tf +import numpy as np + +from autodist.autosync.simulator.predefined_simulator import PredefinedSimulator +from autodist.proto.synchronizers_pb2 import PSSynchronizer, AllReduceSynchronizer +from autodist.utils import logging + + +class LinearSimulator(PredefinedSimulator): + """Simulates strategies for a given graph and resource spec.""" + + def __init__(self, + graph_item=None, + resource_spec=None, + batch_size=1, + seq_len=1, + checkpoint=None): + super(PredefinedSimulator, self).__init__(graph_item, resource_spec) + logging.debug('A LinearSimulator is instantiated: batch_size_per_gpu is {}'.format(batch_size)) + + self._batch_size_per_gpu = batch_size + self._seq_len = seq_len + + # For loading weights of the linear model. + self._checkpoint = checkpoint + if self._checkpoint: + try: + self._weight = self.load_checkpoint(checkpoint) + except ValueError: + logging.warning('self._checkpoint is invalid') + self._weight = None + + # TODO(Hao): add the default weights here. 
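+        # The default weights are expected to follow the same [W, b] layout returned by
+        # `load_checkpoint`: W holds one coefficient per entry of the 12-dim feature built in
+        # `_extract_feature` (4 aggregates x 3 feature keys) and b is a scalar bias. A placeholder
+        # (untrained, illustrative only) could be: self._default_weights = (np.ones(12), np.zeros(1)).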
+        self._default_weights = None
+
+    def simulate(self,
+                 strategy,
+                 graph_item=None,
+                 resource_spec=None,
+                 checkpoint=None,
+                 *args,
+                 **kwargs):
+        """Return simulated runtime cost given (strategy, graph_item, resource_spec) tuple.
+
+        Args:
+            strategy: the strategy to simulate.
+            graph_item: the graph_item this strategy is generated on.
+            resource_spec: the resource_spec this strategy is on.
+            checkpoint: the checkpoint to perform inference (in place of the default weights).
+
+        Returns:
+            float: the estimated cost (lower is better).
+        """
+        if not strategy:
+            raise ValueError('strategy is None.')
+        if not graph_item:
+            if not self._graph_item:
+                raise ValueError('No graph item provided.')
+            else:
+                graph_item = self._graph_item
+        if not resource_spec:
+            if not self._resource_spec:
+                raise ValueError('No resource spec provided.')
+            else:
+                resource_spec = self._resource_spec
+
+        x = self._extract_feature(strategy, graph_item, resource_spec)
+
+        # The checkpoint lookup priority is:
+        #   simulate(checkpoint) > self._weight > self._default_weights
+        # __init__ only sets self._weight when a checkpoint was provided at construction time.
+        if checkpoint:
+            weights = self.load_checkpoint(checkpoint)
+        elif getattr(self, '_weight', None) is not None:
+            weights = self._weight
+        else:
+            weights = self._default_weights
+
+        cost = self.inference(np.array(x), weights)
+        return cost
+
+    def inference(self, x, weights):
+        """
+        Predict the cost of the extracted feature vector with the linear model.
+
+        Args:
+            x: features extracted from a (strategy, graph_item, resource_spec) tuple.
+            weights: trained linear model weights [W, b].
+
+        Returns:
+            float: ranking score.
+        """
+        # if not isinstance(inputs, tf.Tensor):
+        #     inputs = tf.reshape(tf.convert_to_tensor(inputs), [1, len(inputs)])
+
+        assert len(weights) == 2
+        W, b = weights
+        # Linear model: the weighted sum of the features plus a bias.
+        cost = float(np.dot(np.reshape(np.array(W), [-1]), np.reshape(x, [-1])) + np.sum(b))
+        return cost
+
+    def load_checkpoint(self, checkpoint):
+        """
+        Load trained weights from a checkpoint.
+
+        Args:
+            checkpoint: the file path to an npz file, or a list/array of weights.
+
+        Returns:
+            list: loaded weights [W, b].
+        """
+        logging.info('Loading checkpoint: {}'.format(checkpoint))
+        if isinstance(checkpoint, list):
+            assert len(checkpoint) == 2 or len(checkpoint) == 13
+            if len(checkpoint) == 13:
+                # A flat 13-element list is interpreted as 12 weights followed by the bias.
+                checkpoint = checkpoint[:12], checkpoint[12]
+            return checkpoint
+        elif isinstance(checkpoint, str):
+            if os.path.isfile(checkpoint):
+                weights = np.load(checkpoint)
+                return weights['W'], weights['b']
+        raise ValueError('Unable to load the checkpoint: {}'.format(checkpoint))
+
+    def _extract_feature(self,
+                         strategy,
+                         graph_item,
+                         resource_spec):
+        """Get the feature vector as input to the linear model."""
+        var_name_to_items, resource_item, var_name_to_sync_time = \
+            self.extract_prefeature(strategy, graph_item, resource_spec)
+
+        feature_keys = ['transmission', 'network_overhead', 'gpu_kernel_memory_latency']
+        ps_server_sync_time = {}
+        cc_group_sync_time = {}
+
+        for var_name, var_item in var_name_to_items.items():
+            sync_time = var_name_to_sync_time[var_name]
+
+            # Extract per-server and per-group sync time.
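+            # `sync_time` is a (send_factors, recv_factors) tuple of dicts for PS-synchronized
+            # variables (see `var_ps_time`), or a single dict of factors for collective
+            # (AllReduce) variables (see `var_ar_time`).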
+ if isinstance(var_item.synchronizer, PSSynchronizer): + server = var_item.device + if server not in ps_server_sync_time: + ps_server_sync_time[server] = {key: 0.0 for key in feature_keys} + for key in feature_keys: + ps_server_sync_time[server][key] += sync_time[0][key] + sync_time[1][key] + elif isinstance(var_item.synchronizer, AllReduceSynchronizer): + group = var_item.group + if group not in cc_group_sync_time: + cc_group_sync_time[group] = {key: 0.0 for key in feature_keys} + for key in feature_keys: + cc_group_sync_time[group][key] += sync_time[key] + else: + raise ValueError('Unrecognized type of synchronizer: {}'.format(type(var_item.synchronizer))) + + # Different from predefined modeling, we transform these into feature vectors in this simulator. + # We care about the sum time of all servers/groups, or the slowest (max) server/group. + max_ps_server_sync_time = {key: 0.0 for key in feature_keys} + sum_ps_server_sync_time = {key: 0.0 for key in feature_keys} + max_cc_group_sync_time = {key: 0.0 for key in feature_keys} + sum_cc_group_sync_time = {key: 0.0 for key in feature_keys} + + for key in feature_keys: + max_ps_server_sync_time[key] = \ + max([sync_time[key] for sync_time in ps_server_sync_time.values()] or [0.0]) + sum_ps_server_sync_time[key] = \ + sum([sync_time[key] for sync_time in ps_server_sync_time.values()] or [0.0]) + max_cc_group_sync_time[key] = \ + max([sync_time[key] for sync_time in cc_group_sync_time.values()] or [0.0]) + sum_cc_group_sync_time[key] = \ + sum([sync_time[key] for sync_time in cc_group_sync_time.values()] or [0.0]) + + # concat them to get the feature. + x = [max_ps_server_sync_time[key] for key in feature_keys] + \ + [sum_ps_server_sync_time[key] for key in feature_keys] + \ + [max_cc_group_sync_time[key] for key in feature_keys] + \ + [sum_cc_group_sync_time[key] for key in feature_keys] + return x diff --git a/autodist/autosync/simulator/predefined_simulator.py b/autodist/autosync/simulator/predefined_simulator.py new file mode 100644 index 0000000..b05b50d --- /dev/null +++ b/autodist/autosync/simulator/predefined_simulator.py @@ -0,0 +1,323 @@ +# Copyright 2020 Petuum Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Predefined simulator with linear model.""" + +from collections import OrderedDict + +import tensorflow as tf + +from autodist.proto.synchronizers_pb2 import PSSynchronizer, AllReduceSynchronizer +from autodist.autosync.simulator.base import SimulatorBase +from autodist.autosync.simulator.utils import on_same_host, get_dtype_bits +from autodist.utils import logging + + +class PredefinedSimulator(SimulatorBase): + """ + Simulator that uses a predefined communication model to estimate the runtime of strategies. + + See this paper TODO(Hao): put the paper link. + """ + + def __init__(self, + graph_item=None, + resource_spec=None, + batch_size=1, + seq_len=1, + mode='sum'): + """ + Construct a predefined simulator. 
+ + We need the per-replica batch size and the length of the input sequence to estimate the communication load of + variables that are sparsely accessed (e.g. embeddings). For dense variables, these two arguments have no + influence on estimation. + Note that graph_item and resource_spec are not required to instantiate a simulator object as we allow + transferring a trained simulator on a graph_item (or resource_spec) to a different graph_item (or different + resource_spec). This can be done by passing graph_item or resource_spec + + Args: + graph_item: a GraphItem object, or a path to a serialized GraphItem object. + resource_spec: a ResourceSpec object, or a path to a resource file. + batch_size: the per-replica batch size used to train this model, if there are sparse variables. + seq_len: the average length of input sequences (if there is any). + mode: use the `sum` or `max` of all variable sync time as the cost. + """ + super(PredefinedSimulator, self).__init__(graph_item, resource_spec) + logging.debug('A PredefinedSimualtor is instantiated: batch_size_per_gpu is {}'.format(batch_size)) + self._batch_size_per_gpu = batch_size + self._seq_len = seq_len + self._mode = mode + + # Constants for predefined modeling. + self._network_overhead = 0.0 + self._gpu_kernel_memory_latency = 0.0 + + def simulate(self, + strategy, + graph_item=None, + resource_spec=None, + *args, + **kwargs): + """ + Return simulated runtime cost given (strategy, graph_item, resource_spec) tuple. + + Args: + strategy: the strategy to simulate + graph_item: the graph_item this strategy is generated on. + resource_spec: the resource_spec this strategy is on. + + Returns: + float: the estimated runtime (lower is better). + """ + if not strategy: + raise ValueError('strategy is None.') + if not graph_item: + if not self._graph_item: + raise ValueError('No graph item provided.') + else: + graph_item = self._graph_item + if not resource_spec: + if not self._resource_spec: + raise ValueError('No resource spec provided.') + else: + resource_spec = self._resource_spec + + var_name_to_items, resource_item, var_name_to_sync_time = \ + self.extract_prefeature(strategy, graph_item, resource_spec) + + # Now use the estimated per-variable sync time to calculate the overall sync time. + ps_server_sync_time = {} + cc_group_sync_time = {} + + for var_name, var_item in var_name_to_items.items(): + sync_time = var_name_to_sync_time[var_name] + + # we use a simple formula: + # time = transmission + network_overhead * participating_workers + gpu_memory_latency * max(#gpus) + if isinstance(var_item.synchronizer, PSSynchronizer): + server = var_item.device + if server not in ps_server_sync_time: + ps_server_sync_time[server] = 0.0 + send_time = sync_time[0]['transmission'] + \ + sync_time[0]['network_overhead'] * self._network_overhead + \ + sync_time[0]['gpu_kernel_memory_latency'] * self._gpu_kernel_memory_latency + recv_time = sync_time[1]['transmission'] + \ + sync_time[1]['network_overhead'] * self._network_overhead + \ + sync_time[1]['gpu_kernel_memory_latency'] * self._gpu_kernel_memory_latency + # Then accumulate the time for each variable on this PS. Note this is not necessarily accurate as + # there might exist parallel communication of variables even on one server. 
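+                # Note: with the constants as initialized in __init__ (network_overhead = 0.0 and
+                # gpu_kernel_memory_latency = 0.0), send_time and recv_time reduce to the
+                # transmission term alone.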
+                ps_server_sync_time[server] += send_time
+                ps_server_sync_time[server] += recv_time
+            elif isinstance(var_item.synchronizer, AllReduceSynchronizer):
+                group = var_item.group
+                if group not in cc_group_sync_time:
+                    # Each group of variables is fused into one message, so we count the
+                    # overhead and latency only ONCE per group.
+                    cc_group_sync_time[group] = \
+                        sync_time['network_overhead'] * self._network_overhead + \
+                        sync_time['gpu_kernel_memory_latency'] * self._gpu_kernel_memory_latency
+                cc_group_sync_time[group] += sync_time['transmission']
+            else:
+                raise ValueError('Unrecognized type of synchronizer: {}'.format(type(var_item.synchronizer)))
+
+        sync_time = [v for v in ps_server_sync_time.values()] + [v for v in cc_group_sync_time.values()]
+        if self._mode == 'max':
+            # In `max` mode, we assume all PS and collective groups communicate in parallel, and the PS/group that
+            # takes the longest time to sync bounds the overall per-iter time.
+            per_iter_time = max(sync_time)
+        elif self._mode == 'sum':
+            # In `sum` mode, we assume all PS and collective groups synchronize sequentially, and the overall per-iter
+            # time is the summation of the sync time of all servers and collective groups.
+            # !!Note: both modes make over-simplified assumptions compared with a real system.
+            per_iter_time = sum(sync_time)
+        else:
+            raise ValueError('Unrecognized simulation mode: {}'.format(self._mode))
+        return per_iter_time
+
+    def extract_prefeature(self,
+                           strategy,
+                           graph_item,
+                           resource_spec):
+        """
+        Extract impacting factors of the communication time for each variable.
+
+        Args:
+            strategy: the strategy to simulate.
+            graph_item: the graph_item this strategy is generated for.
+            resource_spec: the resource_spec this strategy is on.
+
+        Returns:
+            OrderedDict: variable/part name to variable/part items.
+            ResourceItem: the resource meta information.
+            OrderedDict: variable/part name (str) to impacting factors (dict).
+        """
+        # TODO(Hao): need to make sure the (strategy, graph_item, resource_spec) match each other.
+        # construct the meta objects
+        name_to_items, resource_item = self.preprocess(strategy, graph_item, resource_spec)
+
+        # Now estimate the per-variable sync time
+        var_sync_time = OrderedDict()
+        for var_name, var_item in name_to_items.items():
+            if isinstance(var_item.synchronizer, PSSynchronizer):
+                var_sync_time[var_name] = self.var_ps_time(var_item, resource_item)
+            elif isinstance(var_item.synchronizer, AllReduceSynchronizer):
+                var_sync_time[var_name] = self.var_ar_time(var_item, resource_item)
+            else:
+                raise ValueError('Unrecognized type of synchronizer: {}'.format(type(var_item.synchronizer)))
+        return name_to_items, resource_item, var_sync_time
+
+    def var_ps_time(self,
+                    var_item,
+                    resource_item):
+        """
+        Estimate the synchronization time of a variable that uses PS synchronizer.
+
+        Args:
+            var_item: the variable meta information.
+            resource_item: the resource meta information.
+
+        Returns:
+            tuple(Dict): two dicts of potential impacting factors, for send and recv time respectively.
+ """ + bits_to_transfer = var_item.bits_to_transfer(self._batch_size_per_gpu, self._seq_len) + placement = var_item.device + p2p_bandwidth = resource_item.p2p_bandwidth + max_num_local_gpu_replica = resource_item.max_num_local_gpu_replica + num_local_replica_on_each_worker = [resource_item.num_local_gpu_replica_on(host) + for host in resource_item.cpu_replicas] + if var_item.is_sparse: + send_time = self._estimate_ps_time(bits_to_transfer, + placement, + p2p_bandwidth, + max_num_local_gpu_replica, + resource_item.cpu_replicas, + num_local_replica_on_each_worker) + recv_time = self._estimate_ps_time(bits_to_transfer, + placement, + p2p_bandwidth, + max_num_local_gpu_replica, + resource_item.gpu_replicas, + [1.0] * len(resource_item.gpu_replicas)) + else: + # In AutoDist, the gradients are always locally accumulated then SENT to parameter server. + send_time = self._estimate_ps_time(bits_to_transfer, + placement, + p2p_bandwidth, + max_num_local_gpu_replica, + resource_item.cpu_replicas, + [1.0] * len(resource_item.cpu_replicas)) + # The communication overhead of receiving parameters from PS depends on `local_replication`. + if var_item.local_replication: + recv_time = self._estimate_ps_time(bits_to_transfer, + placement, + p2p_bandwidth, + max_num_local_gpu_replica, + resource_item.cpu_replicas, + [1.0] * len(resource_item.cpu_replicas)) + else: + recv_time = self._estimate_ps_time(bits_to_transfer, + placement, + p2p_bandwidth, + max_num_local_gpu_replica, + resource_item.gpu_replicas, + [1.0] * len(resource_item.gpu_replicas)) + return send_time, recv_time + + @staticmethod + def _estimate_ps_time(bits_to_transfer, + placement, + p2p_bandwidth, + max_num_local_gpu_replica, + virtual_worker_list, + virtual_num_local_replica): + """ + Estimate the send or receive time of a ps and return multiple impacting factors. + + Args: + bits_to_transfer: the variable whose communication time will be estimated. + placement: the placement of the variable. + p2p_bandwidth: point-to-point bandwidth between divices of the cluster. + max_num_local_gpu_replica: the maximum number of on a single node across the cluster. + virtual_worker_list: A list of virtual workers (could be actual gpu workers, or virtual cpu worker). + virtual_num_local_replica: A list of integers indicating the number of local replica on each virtual worker. + + Returns: + Dict: a dictionary of impacting factors. + """ + transmission_time = 0.0 + + # To estimate network transmission time for the given variable var_item on PS, we simply sum up the time of + # transmitting (or say, synchronizing) this variable across all workers. + # The time is separately estimated as send_time and recv_time by calling this function twice with different + # values of arguments. + # TODO(Hao): didn't consider any parallelization between variables or partitions. + for k, worker in enumerate(virtual_worker_list): + if not on_same_host(placement, worker): + bits_on_this_worker = bits_to_transfer * virtual_num_local_replica[k] + bandwidth = min(p2p_bandwidth[placement][worker], p2p_bandwidth[worker][placement]) + transmission_time += bits_on_this_worker / bandwidth + factors = { + 'transmission': transmission_time, + 'network_overhead': len(virtual_worker_list), + 'gpu_kernel_memory_latency': max_num_local_gpu_replica, # TODO(Hao): Is this correct? + 'constant': 1.0 + } + return factors + + def var_ar_time(self, + var_item, + resource_item, + powersgd_rank=10): + """ + Estimate the synchronization time of a variable that uses collective synchronizer. 
+
+        Due to implementation limitations, we only consider dense variables for now.
+        Args:
+            var_item: the variable meta information.
+            resource_item: the resource meta information.
+            powersgd_rank: the rank used by the PowerSGD compressor (hard-coded to 10 for now).
+
+        Returns:
+            Dict: a dictionary of impacting factors.
+        """
+        # Address cases for different types of compressors
+        if var_item.compressor not in ['PowerSGDCompressor', 'HorovodCompressorEF', 'HorovodCompressor',
+                                       'NoneCompressor', 0, 1, 2, 3]:
+            raise ValueError('Compressor type not recognized: {}'.format(var_item.compressor))
+
+        size_to_transfer = var_item.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu,
+                                                     seq_len=self._seq_len)
+        dtype = var_item.dtype
+
+        if var_item.compressor in ['PowerSGDCompressor', 3, "HorovodCompressorEF", "HorovodCompressor", 1, 2]:
+            # These compressors always use float32 to communicate.
+            dtype = tf.float32
+        if var_item.compressor in ["PowerSGDCompressor", 3]:
+            # For PowerSGDCompressor, we hard-code the rank as 10. It will always use float32 to communicate.
+            if len(var_item.shape) > 1:
+                n = var_item.shape[0]
+                m = 1
+                for d in var_item.shape[1:]:
+                    m *= d
+                size_to_transfer = (m + n) * powersgd_rank
+
+        # We assume ring allreduce, and multiple rings will be constructed and executed serially to synchronize grads.
+        # In one ring, each worker exchanges grads with its next worker in parallel. Hence, the time a single ring
+        # completes is bounded by the slowest pair of workers; the total time spent for all workers to synchronize
+        # grads is bounded by the time all rings finish on the slowest pair of workers.
+        transmission_time = size_to_transfer * get_dtype_bits(dtype) / resource_item.min_bandwidth
+        factors = {
+            'transmission': transmission_time,
+            'network_overhead': 1,  # TODO(Hao): is this correct?
+            'gpu_kernel_memory_latency': resource_item.max_num_local_gpu_replica,
+            'constant': 1.0
+        }
+        return factors
diff --git a/autodist/autosync/simulator/rankrnn_simulator.py b/autodist/autosync/simulator/rankrnn_simulator.py
new file mode 100644
index 0000000..5e08bbd
--- /dev/null
+++ b/autodist/autosync/simulator/rankrnn_simulator.py
@@ -0,0 +1,1027 @@
+"""Strategy RankNetSimulator."""
+import glob
+import json
+import sys
+from datetime import datetime
+from pathlib import Path
+from string import digits
+import time
+
+import numpy as np
+import os
+import tensorflow as tf
+tf.compat.v1.disable_eager_execution()
+
+import arion
+from arion.graph_item import GraphItem
+from arion.proto.synchronizers_pb2 import PSSynchronizer, AllReduceSynchronizer
+from arion.simulator.models.base import SimulatorBase
+from arion.simulator.utils import get_dense_var_bits, get_sparse_var_bits, GIGABITS
+from arion.simulator.utils import _resolve_device_address, _max_num_local_replica, _num_local_replica, _resolved_devices_on_diff_machine
+from arion.strategy.random_sample_strategy import VariableHelper, PartHelper
+from arion.strategy.base import Strategy
+from arion.resource_spec import ResourceSpec
+from arion.cluster import SSHCluster
+from arion.kernel.device.resolver import DeviceResolver
+from arion.kernel.partitioner import PartitionerConfig
+from arion.simulator.models.predefined_simulator import PredefinedSimulator
+
+import torch
+import torch.nn as nn
+
+import multiprocessing
+from multiprocessing import Process, Queue
+
+TORCH_DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+# feature settings
+MAX_NUM_WORKERS = 16
+MAX_NUM_GROUPS = 600
+MAX_NUM_VARS = 500
+MAX_NUM_PARS = 1500
+FEATURE_SIZE = MAX_NUM_WORKERS+MAX_NUM_GROUPS+15
+
+# model size
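+# FEATURE_SIZE above decomposes into a 16-dim one-hot device id, a 600-dim one-hot collective group id,
+# a 3-dim compressor one-hot, a 5-dim synchronizer/sparsity one-hot, and [size_ratio, bandwidth,
+# num_replicas] (see `to_numpy`), plus 4 predefined-simulator features appended later
+# (see `create_predefined_features`). The constants below size the RankRNN model and its training.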
+PARTITION_MLP_HIDDEN = 128 +PARTITION_MLP_OUT = 32 +STEM_RNN_HIDDEN = 128 +BIDIECTIONAL = True +BATCH_SIZE = 96 + +NUM_RNN_LAYERS = 3 +SCORE_TH = 0.005 +LR = 2e-3 +WD = 3e-4 +DATA_AUG = False +IN_LAYERS = 2 +OUT_LAYERS = 1 + +# ncf used: +# ~/projects/pycharm/zhijie/5-9-2020/oceanus-zhijie/arion/simulator/models/model_train_on_ncf-orca_new.ckpt 0.9020 +# noaug +# PARTITION_MLP_HIDDEN = 128 +# PARTITION_MLP_OUT = 32 +# STEM_RNN_HIDDEN = 128 +# BIDIECTIONAL = True +# NUM_RNN_LAYERS = 4 +# BATCH_SIZE = 64 +# LR = 1e-3 +# WD = 4e-4 + +# vgg used: +# ~/projects/pycharm/zhijie/5-9-2020/oceanus-zhijie/arion/simulator/models/model_train_on_vgg16-orca_new_new_new.ckpt 0.8374 +# noaug +# PARTITION_MLP_HIDDEN = 128 +# PARTITION_MLP_OUT = 32 +# STEM_RNN_HIDDEN = 128 +# BIDIECTIONAL = True +# NUM_RNN_LAYERS = 3 +# BATCH_SIZE = 64 +# LR = 1e-3 +# WD = 3e-4 + +GRAPH_ITEM_PATHS = {'ncf':'/users/hzhang2/projects/pycharm/zhijie/5-5-2020/original_graph_item', + 'densenet121': '/users/hzhang2/projects/pycharm/zhijie/graph_items/densenet121_original_graph_item', + 'inceptionv3': '/users/hzhang2/projects/pycharm/zhijie/graph_items/inceptionv3_original_graph_item', + 'resnet101': '/users/hzhang2/projects/pycharm/zhijie/graph_items/resnet101_original_graph_item', + 'resnet50': '/users/hzhang2/projects/pycharm/zhijie/graph_items/resnet50_original_graph_item', + 'vgg16': '/users/hzhang2/projects/pycharm/zhijie/graph_items/vgg16_original_graph_item', + 'bert_12l': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_12l', + 'bert_6l': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_6l', + 'bert_3l': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_3l', + 'bert_large': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_large'} + +def get_model(path_): + if 'densenet121' in path_: + return 'densenet121' + elif 'ncf' in path_: + return 'ncf' + elif 'inceptionv3' in path_: + return 'inceptionv3' + elif 'resnet101' in path_: + return 'resnet101' + elif 'resnet50' in path_: + return 'resnet50' + elif 'vgg16' in path_: + return 'vgg16' + elif 'bert' in path_ and '12l' in path_: + return 'bert_12l' + elif 'bert' in path_ and '6l' in path_: + return 'bert_6l' + elif 'bert' in path_ and '3l' in path_: + return 'bert_3l' + elif 'bert' in path_ and 'large' in path_: + return 'bert_large' + else: + return None + +class RankRNN(nn.Module): + def __init__(self, input_size=FEATURE_SIZE, + partition_mlp_hidden=PARTITION_MLP_HIDDEN, + partition_mlp_out=PARTITION_MLP_OUT, + stem_rnn_hidden=STEM_RNN_HIDDEN, + num_rnn_layers=NUM_RNN_LAYERS, + in_layers=IN_LAYERS, + out_layers=OUT_LAYERS, + bidirectional=BIDIECTIONAL): + super(RankRNN, self).__init__() + self.partition_mlp_out = partition_mlp_out + # self.num_rnn_layers = num_rnn_layers + self.stem_rnn_hidden = stem_rnn_hidden + tmp = [nn.Linear(input_size, partition_mlp_hidden)] + for _ in range(in_layers-2): + tmp.append(nn.ReLU()) + tmp.append(nn.Linear(partition_mlp_hidden, partition_mlp_hidden)) + tmp.append(nn.ReLU()) + tmp.append(nn.Linear(partition_mlp_hidden, partition_mlp_out)) + + self.partition_mlp = nn.Sequential(*tmp) + + self.stem_rnn = nn.LSTM(partition_mlp_out, stem_rnn_hidden, num_rnn_layers, batch_first=True, bidirectional=bidirectional) + + if out_layers == 1: + self.final_fc = nn.Linear(stem_rnn_hidden*num_rnn_layers*(1+int(bidirectional)), 1) + elif out_layers == 2: + self.final_fc = 
nn.Sequential(nn.Linear(stem_rnn_hidden*num_rnn_layers*(1+int(bidirectional)), 128), + nn.ReLU(), + nn.Linear(128, 1)) + + self.relu = nn.ReLU() + + def forward(self, features, par_indices, var_nums, return_feature=False): + # print(features.shape, par_indices.shape, var_nums.shape) + x = features.float() + # x = torch.cat([features[:, :, :MAX_NUM_WORKERS], features[:, :, MAX_NUM_WORKERS+MAX_NUM_GROUPS:]], 2).float() + x = self.partition_mlp(x) + + x1 = torch.zeros(features.shape[0], MAX_NUM_VARS, self.partition_mlp_out, device=TORCH_DEVICE, dtype=x.dtype) + x1.scatter_add_(1, par_indices.long()[:, :, None].expand(par_indices.shape[0], par_indices.shape[1], self.partition_mlp_out), x) + + # Set initial hidden and cell states + # h0 = torch.zeros(self.num_rnn_layers, x.size(0), self.stem_rnn_hidden).to(TORCH_DEVICE) + # c0 = torch.zeros(self.num_rnn_layers, x.size(0), self.stem_rnn_hidden).to(TORCH_DEVICE) + + # Forward propagate LSTM + x1 = torch.nn.utils.rnn.pack_padded_sequence(x1, var_nums.long(), batch_first=True, enforce_sorted=False) + out, (ht, ct) = self.stem_rnn(x1) # out: tensor of shape (batch_size, seq_length, hidden_size) + + # out = torch.nn.utils.rnn.pad_packed_sequence(out, batch_first=True)[0].sum(1) / var_nums[:, None] + out = ht.permute(1, 0, 2).reshape(x.shape[0], -1) + # print(out[0, var_nums[0] -1, [3]], out[0, var_nums[0], [3]]) + # print(ht.permute(1, 0, 2).shape, x.shape) + if return_feature: + return self.final_fc(out), out.div((out**2).sum(1, keepdim=True).sqrt()) + else: + return self.final_fc(out) + +class TrainTensorDataset(torch.utils.data.Dataset): + """TensorDataset with support of transforms. + """ + def __init__(self, tensors): + assert all(tensors[0].size(0) == tensor.size(0) for tensor in tensors) + self.tensors = tensors + + def __getitem__(self, index): + x = self.tensors[0][index] + x = self.perturbe_device_and_group(x) + x1 = self.tensors[1][index] + x2 = self.tensors[2][index] + + y = self.tensors[3][index] + + return x, x1, x2, y + + def __len__(self): + return self.tensors[0].size(0) + + def perturbe_device_and_group(self, x): + if DATA_AUG: + perturbed_device_ids = np.random.permutation(MAX_NUM_WORKERS).astype(np.int32) + perturbed_group_ids = np.random.permutation(MAX_NUM_GROUPS).astype(np.int32) + mat_device = torch.eye(MAX_NUM_WORKERS, device=x.device, dtype=x.dtype)[perturbed_device_ids] + mat_group = torch.eye(MAX_NUM_GROUPS, device=x.device, dtype=x.dtype)[perturbed_group_ids] + x = torch.cat([torch.matmul(x[:, :MAX_NUM_WORKERS], mat_device), torch.matmul(x[:, MAX_NUM_WORKERS:MAX_NUM_WORKERS+MAX_NUM_GROUPS], mat_group), x[:, MAX_NUM_WORKERS+MAX_NUM_GROUPS:]], 1) + return x + + +def to_numpy(synchronizer, device, size_ratio, is_sparse, bd, num_replicas): + ret = [np.zeros(MAX_NUM_WORKERS), np.zeros(MAX_NUM_GROUPS), np.zeros(3), np.zeros(5), np.zeros(3)] + + if device is not None: + ret[0][device] = 1 + + group = getattr(synchronizer, 'group', None) + if group is not None: + assert group < MAX_NUM_GROUPS, group + ret[1][group] = 1 + + compressor = getattr(synchronizer, 'compressor', None) + if compressor is not None: + if compressor in ["PowerSGDCompressor", 3]: + ret[2][2] = 1 + elif compressor in ["HorovodCompressorEF", "HorovodCompressor", 2, 1]: + ret[2][1] = 1 + elif compressor in ["NoneCompressor", 0]: + ret[2][0] = 1 + else: + raise ValueError('Compressor does not exist: {}'.format(compressor)) + + local_replication = getattr(synchronizer, 'local_replication', None) + if isinstance(synchronizer, PSSynchronizer): + synchronizer = 0 + 
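+        # ret[3] one-hot encodes the synchronizer configuration:
+        #   index 0/1: PS without local replication, dense / sparse variable
+        #   index 2/3: PS with local replication, dense / sparse variable
+        #   index 4:   collective (AllReduce) synchronizer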
if int(local_replication) == 0: + if int(is_sparse) == 0: + ret[3][0] = 1 + else: + ret[3][1] = 1 + else: + if int(is_sparse) == 0: + ret[3][2] = 1 + else: + ret[3][3] = 1 + else: + ret[3][4] = 1 + ret[4] = np.array([size_ratio, bd, num_replicas]) + + return np.concatenate(ret) + +def connvert_feature(strategy, resource_spec, graph_item): + + cluster = SSHCluster(resource_spec) + device_resolver = DeviceResolver(cluster) + graph_replicas = [_resolve_device_address(k, device_resolver) for k, v in resource_spec.gpu_devices] + # bandwidth + network_bandwidth = np.array([resource_spec.network_bandwidth[device.split(':')[0]] for device, _ in resource_spec.cpu_devices]) + network_bandwidth = network_bandwidth + min_network_bandwidth = network_bandwidth.min() + # Other information + cpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.cpu_devices] + gpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.gpu_devices] + max_num_local_replica = _max_num_local_replica(graph_replicas, cluster) + total_num_local_replica = len(graph_replicas) + worker_num_replicas = [_num_local_replica(cpu_worker, graph_replicas, cluster) for cpu_worker in cpu_worker_list] + + num_vars = 0 + total_size_vars = 0 + for var_op, var in graph_item.trainable_var_op_to_var.items(): + num_vars += 1 + if var.initial_value.shape.ndims: + var_helper = VariableHelper(var, graph_item) + if var_helper.is_sparse: + total_size_vars += get_sparse_var_bits(np.prod(var_helper.shape)) + else: + total_size_vars += get_dense_var_bits(np.prod(var_helper.shape), var.dtype) + assert num_vars < MAX_NUM_VARS, num_vars + var_partition_features = np.zeros((MAX_NUM_PARS, FEATURE_SIZE-4)).astype(np.float32) + partition_indice = np.ones(MAX_NUM_PARS).astype(np.float32) * (MAX_NUM_VARS - 1) + + cnt = 0 + for node_id, node in enumerate(strategy.node_config): + var_name = node.var_name + for var_op, var in graph_item.trainable_var_op_to_var.items(): + if var.name == var_name: + break + var_helper = VariableHelper(var, graph_item) + + if node.partitioner: + pc = PartitionerConfig(partition_str=node.partitioner) + for i, part in enumerate(node.part_config): + part_helper = PartHelper(i, var, pc) + synchronizer = getattr(part, part.WhichOneof('synchronizer')) + reduction_destination = getattr(synchronizer, 'reduction_destination', None) + device = _resolve_device_address(reduction_destination if reduction_destination else var.device, + device_resolver) + if device == '': + assert(isinstance(synchronizer, AllReduceSynchronizer)) + device = None + bd = min_network_bandwidth + num_replicas = 0 + else: + device = cpu_worker_list.index(device) + bd = network_bandwidth[device] + num_replicas = worker_num_replicas[device] + + if var_helper.is_sparse: + size_ratio = get_sparse_var_bits(np.prod(part_helper.shape))/total_size_vars + else: + size_ratio = get_dense_var_bits(np.prod(part_helper.shape), var_helper.dtype)/total_size_vars + var_partition_features[cnt] = to_numpy(synchronizer, device, size_ratio, var_helper.is_sparse, bd, num_replicas) + partition_indice[cnt] = node_id + cnt += 1 + else: + synchronizer = getattr(node, node.WhichOneof('synchronizer')) + reduction_destination = getattr(synchronizer, 'reduction_destination', None) + device = _resolve_device_address(reduction_destination if reduction_destination else var.device, + device_resolver) + if device == '': + assert(isinstance(synchronizer, AllReduceSynchronizer)) + device = None + bd = min_network_bandwidth + 
num_replicas = 0 + else: + device = cpu_worker_list.index(device) + bd = network_bandwidth[device] + num_replicas = worker_num_replicas[device] + + if var_helper.is_sparse: + size_ratio = get_sparse_var_bits(np.prod(var_helper.shape))/total_size_vars + else: + size_ratio = get_dense_var_bits(np.prod(var_helper.shape), var_helper.dtype)/total_size_vars + var_partition_features[cnt] = to_numpy(synchronizer, device, size_ratio, var_helper.is_sparse, bd, num_replicas) + partition_indice[cnt] = node_id + cnt += 1 + return var_partition_features, partition_indice, np.array(node_id+1) + +def create_predefined_features(strategy, resource_spec, predefined_simulator): + + var_sync_time, vars, resource = predefined_simulator.predefined_sync_time(strategy, resource_spec) + + features = [] + for var_name, sync_time in var_sync_time.items(): + if isinstance(sync_time, list) or isinstance(sync_time, tuple): # send_time, receive_time in PS strategies. + transmission = sync_time[0]['transmission'] + sync_time[1]['transmission'] + sync_time = sync_time[0] + is_ps = True + else: # AR + transmission = sync_time['transmission'] + is_ps = False + + network_overhead = sync_time['network_overhead'] + gpu_kernel_memory_latency = sync_time['gpu_kernel_memory_latency'] + + feat = [transmission, network_overhead, gpu_kernel_memory_latency, float(is_ps)] + features.append(feat) + features = np.array(features, dtype=np.float) + return features + +def extract_graph_item(graph_item): + total_size_vars = 0 + name2var = {} + name2var_helper = {} + for var_op, var in graph_item.trainable_var_op_to_var.items(): + name2var[var.name] = var + var_helper = VariableHelper(var, graph_item) + name2var_helper[var.name] = var_helper + if var.initial_value.shape.ndims: + if var_helper.is_sparse: + total_size_vars += get_sparse_var_bits(np.prod(var_helper.shape)) + else: + total_size_vars += get_dense_var_bits(np.prod(var_helper.shape), var.dtype) + + return total_size_vars, name2var, name2var_helper + +def wrap_fn(queue, idx, run_worker, rs, st): + ret = run_worker(rs, st) + queue.put((idx, ret)) + +def convert_feature_batch(strategys, resource_specs, total_size_vars, name2var, name2var_helper, _batch_size_per_gpu, _seq_len): + + def var_ps_time(var_size_to_transfer, is_sparse, device, dtype, local_replication, network_bandwidth_map, max_num_local_replica, cpu_worker_list, gpu_worker_list, network_overhead=0.0, gpu_kernel_memory_latency=0.0): + """Compute synchronization time of a variable in PS strategy.""" + def _helper(worker_list, worker_num_replicas=None): + if worker_num_replicas is None: + worker_num_replicas = [1.0] * len(worker_list) + + this_server_time = 0 + # network transfer: sum up all workers time. equals to the time cost of this server. 
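+            # Only cross-machine transfers are counted: workers resolved to the same host as the
+            # variable placement contribute no transmission time.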
+ # TODO(Hao): didn't consider any parallelization among partitions + for k, worker in enumerate(worker_list): + if _resolved_devices_on_diff_machine(device, worker): + if is_sparse: + this_worker_size = get_sparse_var_bits(var_size_to_transfer) * worker_num_replicas[k] + else: + this_worker_size = get_dense_var_bits(var_size_to_transfer, dtype) + this_server_time += this_worker_size / network_bandwidth_map[device][worker] + + return { + 'transmission': this_server_time, + 'network_overhead': len(worker_list), + 'gpu_kernel_memory_latency': max_num_local_replica, + } + + send_time = _helper(cpu_worker_list) + if local_replication: + receive_time = _helper(cpu_worker_list) + else: + receive_time = _helper(gpu_worker_list) + + return send_time, receive_time + + def var_ar_time(var_size_to_transfer, og_shape, dtype, compressor, max_num_local_replica, cpu_worker_list, network_bandwidth_map, network_overhead=0.0, gpu_kernel_memory_latency=0.0): + """Compute synchronization time of a variable in AR strategy.""" + worker_list = cpu_worker_list + num_workers = len(worker_list) + min_bandwidth = None + for i in range(num_workers): + for j in range(i, num_workers): + if min_bandwidth is None: + min_bandwidth = network_bandwidth_map[worker_list[j]][worker_list[i]] + else: + min_bandwidth = min(min_bandwidth, network_bandwidth_map[worker_list[j]][worker_list[i]]) + + # Compressor + if compressor == "PowerSGDCompressor" or compressor == 3: + rank = 10 # currently using default value. So hardcode here. # todo: confirm + # assume var must be a dense variable. + ndims = len(og_shape) + if ndims <= 1: # no compress + size_to_transfer = var_size_to_transfer + else: + if ndims > 2: + n = og_shape[0] + m = 1 + for s in og_shape[1:]: + m *= s # tensor's shape (n, m) + else: + n, m = og_shape[0], og_shape[1] + size_to_transfer = n * rank + m * rank + dtype = tf.float32 + elif compressor == "HorovodCompressorEF" or compressor == "HorovodCompressor" \ + or compressor == 2 or compressor == 1: + size_to_transfer = var_size_to_transfer + dtype = tf.float32 + elif compressor == "NoneCompressor" or compressor == 0: + size_to_transfer = var_size_to_transfer + dtype = dtype + else: + raise ValueError('Compressor does not exist: {}'.format(compressor)) + + time = get_dense_var_bits(size_to_transfer, dtype) / min_bandwidth + + return { + 'transmission': time, + 'network_overhead': 1, # len(worker_list), + 'gpu_kernel_memory_latency': max_num_local_replica, + } + + def network_bandwidth2(resource_spec: ResourceSpec, device_resolver: DeviceResolver): + """Calculates all P2P network bandwidths between nodes in the cluster.""" + devices = [device for device, _ in resource_spec.devices] + resolved_devices = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.devices] + gpu_cpu_bw = 10000. 
# hardcode for now + network_bandwidth = {} # key: + for i in range(len(devices)): + if resolved_devices[i] not in network_bandwidth: + network_bandwidth[resolved_devices[i]] = {} + for j in range(i, len(devices)): + if resolved_devices[j] not in network_bandwidth: + network_bandwidth[resolved_devices[j]] = {} + ip_i = devices[i].split(':')[0] + ip_j = devices[j].split(':')[0] + if ip_i != ip_j: + network_bandwidth[resolved_devices[i]][resolved_devices[j]] \ + = GIGABITS * resource_spec.network_bandwidth[ip_i] + network_bandwidth[resolved_devices[j]][resolved_devices[i]] \ + = GIGABITS * resource_spec.network_bandwidth[ip_j] + else: + network_bandwidth[resolved_devices[i]][resolved_devices[j]] = GIGABITS * gpu_cpu_bw + network_bandwidth[resolved_devices[j]][resolved_devices[i]] = GIGABITS * gpu_cpu_bw + return network_bandwidth + + def run_worker(resource_spec, strategy): + cluster = SSHCluster(resource_spec) + device_resolver = DeviceResolver(cluster) + graph_replicas = [_resolve_device_address(k, device_resolver) for k, v in resource_spec.gpu_devices] + # bandwidth + network_bandwidth = np.array([resource_spec.network_bandwidth[device.split(':')[0]] for device, _ in resource_spec.cpu_devices]) + min_network_bandwidth = network_bandwidth.min() + network_bandwidth_map = network_bandwidth2(resource_spec, device_resolver) + # Other information + cpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.cpu_devices] + gpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.gpu_devices] + max_num_local_replica = _max_num_local_replica(graph_replicas, cluster) + total_num_local_replica = len(graph_replicas) + worker_num_replicas = [_num_local_replica(cpu_worker, graph_replicas, cluster) for cpu_worker in cpu_worker_list] + + var_partition_features = np.zeros((MAX_NUM_PARS, FEATURE_SIZE)).astype(np.float32) + partition_indice = np.ones(MAX_NUM_PARS).astype(np.float32) * (MAX_NUM_VARS - 1) + cnt = 0 + for node_id, node in enumerate(strategy.node_config): + var_name = node.var_name + var = name2var[var_name] + var_helper = name2var_helper[var_name] + + if node.partitioner: + pc = PartitionerConfig(partition_str=node.partitioner) + for i, part in enumerate(node.part_config): + synchronizer = getattr(part, part.WhichOneof('synchronizer')) + reduction_destination = getattr(synchronizer, 'reduction_destination', None) + device = _resolve_device_address(reduction_destination if reduction_destination else var.device, + device_resolver) + if device == '': + assert(isinstance(synchronizer, AllReduceSynchronizer)) + device_id = None + bd = min_network_bandwidth + num_replicas = 0 + else: + device_id = cpu_worker_list.index(device) + bd = network_bandwidth[device_id] + num_replicas = worker_num_replicas[device_id] + + par_shape = var.initial_value.shape.as_list() + dim_size = par_shape[pc.axis] // pc.num_shards + extras = par_shape[pc.axis] % pc.num_shards + if i < extras: + dim_size += 1 + par_shape[pc.axis] = dim_size + + size_to_transfer =np.prod(par_shape) + if var_helper.is_sparse: + raise Error + size_ratio = get_sparse_var_bits(size_to_transfer)/total_size_vars + else: + size_ratio = get_dense_var_bits(size_to_transfer, var_helper.dtype)/total_size_vars + + if isinstance(synchronizer, AllReduceSynchronizer): + sync_time = var_ar_time(size_to_transfer, par_shape, var_helper.dtype, getattr(synchronizer, 'compressor', None), max_num_local_replica, cpu_worker_list, network_bandwidth_map) + transmission = 
sync_time['transmission'] + is_ps = False + else: + sync_time = var_ps_time(size_to_transfer, var_helper.is_sparse, device, var_helper.dtype, getattr(synchronizer, 'local_replication', None), network_bandwidth_map, max_num_local_replica, cpu_worker_list, gpu_worker_list) + transmission = sync_time[0]['transmission'] + sync_time[1]['transmission'] + sync_time = sync_time[0] + is_ps = True + network_overhead = sync_time['network_overhead'] + gpu_kernel_memory_latency = sync_time['gpu_kernel_memory_latency'] + + var_partition_features[cnt] = np.concatenate([to_numpy(synchronizer, device_id, size_ratio, var_helper.is_sparse, bd, num_replicas), np.array([transmission, network_overhead, gpu_kernel_memory_latency, float(is_ps)])]) + partition_indice[cnt] = node_id + cnt += 1 + else: + synchronizer = getattr(node, node.WhichOneof('synchronizer')) + reduction_destination = getattr(synchronizer, 'reduction_destination', None) + device = _resolve_device_address(reduction_destination if reduction_destination else var.device, + device_resolver) + if device == '': + assert(isinstance(synchronizer, AllReduceSynchronizer)) + device_id = None + bd = min_network_bandwidth + num_replicas = 0 + else: + device_id = cpu_worker_list.index(device) + bd = network_bandwidth[device_id] + num_replicas = worker_num_replicas[device_id] + + size_to_transfer =np.prod(var_helper.shape) + if var_helper.is_sparse: + raise Error + size_ratio = get_sparse_var_bits(size_to_transfer)/total_size_vars + else: + size_ratio = get_dense_var_bits(size_to_transfer, var_helper.dtype)/total_size_vars + + if isinstance(synchronizer, AllReduceSynchronizer): + sync_time = var_ar_time(size_to_transfer, var.initial_value.shape.as_list(), var_helper.dtype, getattr(synchronizer, 'compressor', None), max_num_local_replica, cpu_worker_list, network_bandwidth_map) + transmission = sync_time['transmission'] + is_ps = False + else: + sync_time = var_ps_time(size_to_transfer, var_helper.is_sparse, device, var_helper.dtype, getattr(synchronizer, 'local_replication', None), network_bandwidth_map, max_num_local_replica, cpu_worker_list, gpu_worker_list) + transmission = sync_time[0]['transmission'] + sync_time[1]['transmission'] + sync_time = sync_time[0] + is_ps = True + network_overhead = sync_time['network_overhead'] + gpu_kernel_memory_latency = sync_time['gpu_kernel_memory_latency'] + + var_partition_features[cnt] = np.concatenate([to_numpy(synchronizer, device_id, size_ratio, var_helper.is_sparse, bd, num_replicas), np.array([transmission, network_overhead, gpu_kernel_memory_latency, float(is_ps)])]) + partition_indice[cnt] = node_id + cnt += 1 + return (var_partition_features, partition_indice, np.array(node_id+1)) + + # t1 =time.time() + # with multiprocessing.Pool(processes=32) as pool: + # results = pool.starmap(run_worker, zip(resource_specs, strategys)) + # ret1, ret2, ret3 = [], [], [] + # for tmp in results: + # ret1.append(tmp[0]); ret2.append(tmp[1]); ret3.append(tmp[2]) + + q = Queue() + rets = [] + prs = [] + for idx, (arg1, arg2) in enumerate(zip(resource_specs, strategys)): + prs.append(Process(target=wrap_fn, args=(q, idx, run_worker, arg1, arg2))) + prs[-1].start() + for pr in prs: + ret = q.get() # will block + rets.append(ret) + for pr in prs: + pr.join() + + ret1, ret2, ret3 = [], [], [] + for tmp in sorted(rets, key=lambda x: x[0]): + ret1.append(tmp[1][0]); ret2.append(tmp[1][1]); ret3.append(tmp[1][2]) + # print(time.time() - t1) + + # t1 =time.time() + # ret1, ret2, ret3 = [], [], [] + # for rs, st in zip(resource_specs, 
strategys): + # tmp = run_worker(rs, st) + # ret1.append(tmp[0]); ret2.append(tmp[1]); ret3.append(tmp[2]) + # print(time.time() - t1) + return np.stack(ret1), np.stack(ret2), np.stack(ret3) + + +class RankRNNSimulatorPenalty(SimulatorBase): + """Simulates strategies for a given graph and resource spec.""" + + def __init__(self, + original_graph_item_path, + num_rnn_layers, + in_layers, + out_layers, + fetches=None, + batch_size=1, + seq_len=1, + checkpoint=None): + + super(RankRNNSimulatorPenalty, self).__init__(original_graph_item_path=original_graph_item_path) + print("It's using RankNet simulator.") + self._fetches = fetches + self._batch_size_per_gpu = batch_size + self._seq_len = seq_len + self._checkpoint = checkpoint + self._predefined_simulator=PredefinedSimulator(original_graph_item_path=original_graph_item_path, + batch_size=self._batch_size_per_gpu, + seq_len=self._seq_len) + if self._checkpoint: + self._model = RankRNN(num_rnn_layers=num_rnn_layers, in_layers=in_layers, out_layers=out_layers).to(TORCH_DEVICE) + self._model.load_state_dict(torch.load(self._checkpoint, map_location=torch.device('cpu'))) + + total_size_vars, name2var, name2var_helper = extract_graph_item(self._original_graph_item) + self.total_size_vars = total_size_vars + self.name2var = name2var + self.name2var_helper = name2var_helper + + def simulate(self, strategy, resource_spec, strategy_path=None, checkpoint=None): + score, feature = self.predict(strategy, resource_spec, strategy_path, checkpoint) + return score.view(-1).data.cpu().numpy(), feature.data.cpu().numpy() + + def predict(self, + strategy, + resource_spec, + strategy_path=None, + checkpoint=None): + if checkpoint is None: + if self._checkpoint is None: + raise ValueError("checkpoint is None: {}".format(checkpoint)) + else: + model = self._model + else: + model = RankRNN().to(TORCH_DEVICE) + model.load_state_dict(torch.load(checkpoint)) + if type(strategy) == list and type(resource_spec) == list: + + var_partition_features, partition_indice, var_num = convert_feature_batch(strategy, resource_spec, self.total_size_vars, self.name2var, self.name2var_helper, self._batch_size_per_gpu, self._seq_len) + + var_partition_features = torch.from_numpy(var_partition_features).to(TORCH_DEVICE) + partition_indice = torch.from_numpy(partition_indice).to(TORCH_DEVICE) + var_num = torch.from_numpy(var_num).to(TORCH_DEVICE) + + return model(var_partition_features, partition_indice, var_num, True) + else: + if strategy_path and os.path.isfile((strategy_path+'.npz').replace('strategies', 'npz')): + loaded = np.load((strategy_path+'.npz').replace('strategies', 'npz')) + var_partition_features, partition_indice, var_num, _ = \ + loaded['x1'], loaded['x2'], loaded['x3'], loaded['y'] + else: + var_partition_features, partition_indice, var_num = \ + connvert_feature(strategy, resource_spec, self._original_graph_item) + + if strategy_path and os.path.isfile((strategy_path+'_pdf.npz').replace('strategies', 'npz')): + loaded = np.load((strategy_path+'_pdf.npz').replace('strategies', 'npz')) + predefined_features = loaded['x4'] + else: + predefined_features = create_predefined_features(strategy, resource_spec, self._predefined_simulator) + + var_partition_features = np.concatenate([var_partition_features, np.concatenate([predefined_features, np.zeros((MAX_NUM_PARS-predefined_features.shape[0], predefined_features.shape[1]))], 0)], 1) + + var_partition_features = torch.from_numpy(var_partition_features).unsqueeze(0).to(TORCH_DEVICE) + partition_indice = 
torch.from_numpy(partition_indice).unsqueeze(0).to(TORCH_DEVICE) + var_num = torch.from_numpy(var_num).unsqueeze(0).to(TORCH_DEVICE) + + return model(var_partition_features, partition_indice, var_num, True) + +class RankNetTrainer(): + + def __init__(self, + batch_size_per_gpu=256, + seq_len=1, + seed=1): + self._batch_size_per_gpu = batch_size_per_gpu + self._seq_len = seq_len + self.graph_items = {k:GraphItem.deserialize(v) for k, v in GRAPH_ITEM_PATHS.items()} + self.predefined_simulators = {k: PredefinedSimulator(original_graph_item_path=v, + batch_size=self._batch_size_per_gpu, + seq_len=self._seq_len) for k, v in GRAPH_ITEM_PATHS.items()} + self.best_acc = 0. + print("It's using RankNet trainer.") + + def load_data(self, path_list, train_patterns=[('ncf', 0)], valid_patterns='same'): + features = {k: [[[], [], [], []], [[], [], [], []]] for k, _ in GRAPH_ITEM_PATHS.items()} + for training_path in path_list: + for path in Path(training_path).rglob('strategies'): + strategy_paths = glob.glob(os.path.join(path, '*')) + # strategy_paths = np.random.permutation(list(strategy_paths)) + for strategy_path in strategy_paths: + if 'json' in strategy_path or \ + 'bert_large_batch_8_orca_16_group_2/' in strategy_path: + continue + model = get_model(strategy_path) + if model is None: + if not ('densenets169' in strategy_path or 'densenets201' in strategy_path): + assert False, strategy_path + continue + rs_path = strategy_path.replace('strategies', 'resource_specs') + runtime_path = strategy_path.replace('strategies', 'runtimes') + npz_path = (strategy_path+'.npz').replace('strategies', 'npz') + if not os.path.isfile(rs_path): + rs_path += '.yml' + if not (os.path.isfile(rs_path) and os.path.isfile(runtime_path)): + continue + if not os.path.exists(os.path.dirname(npz_path)): + os.makedirs(os.path.dirname(npz_path)) + + if not os.path.isfile(npz_path): + strategy = Strategy.deserialize(path=strategy_path) + rs = ResourceSpec(resource_file=rs_path) + var_partition_features, partition_indice, var_num = \ + connvert_feature(strategy, rs, self.graph_items[model]) + label = np.array(json.load(open(runtime_path))['average']) + np.savez_compressed(npz_path, x1=var_partition_features, x2=partition_indice, x3=var_num, y=label) + else: + loaded = np.load(npz_path) + var_partition_features, partition_indice, var_num, label = \ + loaded['x1'], loaded['x2'], loaded['x3'], loaded['y'] + + if not os.path.isfile(npz_path.replace('.npz', '_pdf.npz')): + predefined_features = create_predefined_features(Strategy.deserialize(path=strategy_path), ResourceSpec(resource_file=rs_path), self.predefined_simulators[model]) + np.savez_compressed(npz_path.replace('.npz', '_pdf.npz'), x4=predefined_features) + else: + loaded = np.load(npz_path.replace('.npz', '_pdf.npz')) + predefined_features = loaded['x4'] + var_partition_features = np.concatenate([var_partition_features, np.concatenate([predefined_features, np.zeros((MAX_NUM_PARS-predefined_features.shape[0], predefined_features.shape[1]))], 0)], 1) + + is_aws = int('g3' in strategy_path or 'g4' in strategy_path or 'aws' in strategy_path) # comment here + # is_aws = int('vgg16_orca_11_random_rejection-4_trial-100-_expolre-2000_0.83-model_embedding_sim-weight-1_max-par-40/' in strategy_path) + # print(model, 'orca' if is_aws == 0 else 'aws', strategy_path.split('/')[-3]) + features[model][is_aws][0].append(var_partition_features) + features[model][is_aws][1].append(partition_indice) + features[model][is_aws][2].append(var_num) + features[model][is_aws][3].append(label) + + 
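+        # Stack each per-(model, cluster) list of examples into a float16 array; buckets with at
+        # most one example are set to None so they are skipped when assembling the train/valid splits.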
for k, _ in GRAPH_ITEM_PATHS.items(): + for i1 in range(2): + for i2 in range(4): + if len(features[k][i1][i2]) > 1: + features[k][i1][i2] = np.stack(features[k][i1][i2]).astype(np.float16) + print(k, 'orca' if i1 == 0 else 'aws', features[k][i1][i2].shape) + else: + features[k][i1][i2] = None + + train_features = np.concatenate([features[model_][is_aws_][0] for model_, is_aws_ in train_patterns if features[model_][is_aws_][0] is not None], 0) + train_par_indices = np.concatenate([features[model_][is_aws_][1] for model_, is_aws_ in train_patterns if features[model_][is_aws_][1] is not None], 0) + train_var_nums = np.concatenate([features[model_][is_aws_][2] for model_, is_aws_ in train_patterns if features[model_][is_aws_][2] is not None], 0) + train_labels = np.concatenate([features[model_][is_aws_][3] for model_, is_aws_ in train_patterns if features[model_][is_aws_][3] is not None], 0) + + if type(valid_patterns[0]) == str and valid_patterns[0] == 'same': + rng = np.random.RandomState(1) + permt = rng.permutation(train_features.shape[0]) + split = int(len(permt) * 0.7) + val_features, val_par_indices, val_var_nums, val_labels = train_features[permt[split:]], train_par_indices[permt[split:]], train_var_nums[permt[split:]], train_labels[permt[split:]] + train_features, train_par_indices, train_var_nums, train_labels = train_features[permt[:split]], train_par_indices[permt[:split]], train_var_nums[permt[:split]], train_labels[permt[:split]] + else: + val_features = np.concatenate([features[model_][is_aws_][0] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][0] is not None], 0) + val_par_indices = np.concatenate([features[model_][is_aws_][1] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][1] is not None], 0) + val_var_nums = np.concatenate([features[model_][is_aws_][2] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][2] is not None], 0) + val_labels = np.concatenate([features[model_][is_aws_][3] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][3] is not None], 0) + + # comment here + rng = np.random.RandomState(1) + permt = rng.permutation(val_features.shape[0]) + split = int(len(permt) * 0.7) + train_features, train_par_indices, train_var_nums, train_labels = np.concatenate([train_features, val_features[permt[:split]]], 0), np.concatenate([train_par_indices, val_par_indices[permt[:split]]], 0), np.concatenate([train_var_nums, val_var_nums[permt[:split]]], 0), np.concatenate([train_labels, val_labels[permt[:split]]], 0) + + val_features, val_par_indices, val_var_nums, val_labels = val_features[permt[split:]], val_par_indices[permt[split:]], val_var_nums[permt[split:]], val_labels[permt[split:]] + label_max = max(train_labels.max(), val_labels.max()) + label_min = min(train_labels.min(), val_labels.min()) + train_labels = (train_labels-label_min)/(label_max-label_min) + val_labels = (val_labels-label_min)/(label_max-label_min) + print(train_features.shape, val_features.shape, train_features.max(), train_features.min(), val_features.max(), val_features.min(), train_labels.max(), val_labels.min()) + + ## train the model + trainset = TrainTensorDataset((torch.from_numpy(train_features).half().to(TORCH_DEVICE), torch.from_numpy(train_par_indices).half().to(TORCH_DEVICE), torch.from_numpy(train_var_nums).half().to(TORCH_DEVICE), torch.from_numpy(train_labels).half().to(TORCH_DEVICE))) + testset = torch.utils.data.TensorDataset(torch.from_numpy(val_features).half().to(TORCH_DEVICE), 
torch.from_numpy(val_par_indices).half().to(TORCH_DEVICE), torch.from_numpy(val_var_nums).half().to(TORCH_DEVICE), torch.from_numpy(val_labels).half().to(TORCH_DEVICE)) + self.trainloader = torch.utils.data.DataLoader(dataset=trainset, + batch_size=BATCH_SIZE, + shuffle=True) + self.testloader = torch.utils.data.DataLoader(dataset=testset, + batch_size=32, + shuffle=False) + + def train(self, name='', num_epochs=200, checkpoint=None): + + checkpoint_path = 'model_on_{}.ckpt'.format(name) + print('LSTM layers: ', NUM_RNN_LAYERS, 'score th: ', SCORE_TH, 'lr: ', LR, 'wd: ', WD,'use data aug: ', DATA_AUG, 'OUT_LAYERS: ', OUT_LAYERS, 'IN_LAYERS: ',IN_LAYERS) + + np.random.seed(1) + torch.manual_seed(1) + torch.cuda.manual_seed_all(1) + model = RankRNN(num_rnn_layers=NUM_RNN_LAYERS, out_layers=OUT_LAYERS, in_layers=IN_LAYERS).to(TORCH_DEVICE) + if checkpoint: + model.load_state_dict(torch.load(checkpoint)) + optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=WD) + + best_val_acc = 0. + for epoch in range(num_epochs): + if epoch == int(num_epochs*2./5. - 1): + for param_group in optimizer.param_groups: param_group['lr'] = 3e-4 + if epoch == int(num_epochs*4./5. - 1): + for param_group in optimizer.param_groups: param_group['lr'] = 1e-4 + + labels = [] + outputs = [] + for i, (features_b, par_indices_b, var_nums_b, labels_b) in enumerate(self.trainloader): + + # Forward pass + outputs_b = model(features_b, par_indices_b, var_nums_b).squeeze() + + par_cnt = (par_indices_b.int() != MAX_NUM_VARS - 1).int().sum(1) + + true_comp = ( + (labels_b[:, None]+SCORE_TH>labels_b[None,:]).int()*(par_cnt[:, None] > par_cnt[None, :]).int() + + (labels_b[:, None]-SCORE_TH>labels_b[None,:]).int()*(par_cnt[:, None] < par_cnt[None, :]).int() + + (labels_b[:, None] > labels_b[None,:]).int() * (par_cnt[:, None] == par_cnt[None, :]).int() + ) > 0 + true_comp = true_comp.float() * 2 - 1 + pred_comp = outputs_b[:, None] - outputs_b[None, :] + loss = (1 - true_comp) * pred_comp / 2 + torch.nn.functional.softplus(-pred_comp) + loss = loss.tril(-1).mean() + + # Backward and optimize + optimizer.zero_grad() + loss.backward() + torch.nn.utils.clip_grad_norm_(model.stem_rnn.parameters(), 0.25) + optimizer.step() + + outputs.append(outputs_b) + labels.append(labels_b) + + labels = torch.cat(labels) + outputs = torch.cat(outputs) + true_comp = (labels[:, None] > labels[None, :]) + pred_comp = (outputs[:, None] > outputs[None, :]) + equal = (true_comp == pred_comp).int() + train_acc = equal.tril(-1).sum() * 2. /float(equal.shape[0])/(float(equal.shape[0]) - 1) + + with torch.no_grad(): + labels = [] + outputs = [] + for features_b, par_indices_b, var_nums_b, labels_b in self.testloader: + + # Forward pass + outputs_b = model(features_b, par_indices_b, var_nums_b).squeeze() + outputs.append(outputs_b) + labels.append(labels_b) + + labels = torch.cat(labels) + outputs = torch.cat(outputs) + true_comp = (labels[:, None] > labels[None, :]) + pred_comp = (outputs[:, None] > outputs[None, :]) + equal = (true_comp == pred_comp).int() + acc = equal.tril(-1).sum() * 2. 
/float(equal.shape[0])/(float(equal.shape[0]) - 1) + if acc.item() > best_val_acc: + best_val_acc = acc.item() + if best_val_acc > self.best_acc: + print('Saved model @ acc', best_val_acc) + torch.save(model.state_dict(), checkpoint_path) + self.best_acc = best_val_acc + # print('Saved model to {}'.format(checkpoint_path)) + if epoch == num_epochs - 1: + print('Epoch: {}, training acc: {:.4f}, test acc: {:.4f}, best acc: {:.4f}, overall best acc: {:.4f}'.format(epoch, train_acc.item(), acc.item(), best_val_acc, self.best_acc)) + return checkpoint_path + + +if __name__ == '__main__': + + if False: + trainer = RankNetTrainer() + trainer.load_data([ + '/users/hzhang2/oceanus_cost_model_training_data/vgg16', + # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-11-20', + # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-9-20', + # '/users/hzhang2/oceanus_cost_model_training_data/ncf', + # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-13-20', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_12_orca_16', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_3', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_4', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_5', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_random_orca_11', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert-aws/bert-large-aws4g4', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert-aws/bert_large_random_search_aws_4_ps_only', + # '/users/hzhang2/oceanus_cost_model_training_data/densenet', + # '/users/hzhang2/oceanus_cost_model_training_data/inceptionv3', + # '/users/hzhang2/oceanus_cost_model_training_data/resnet101', + # '/users/hzhang2/oceanus_cost_model_training_data/resnet50', + ], + [ + ('vgg16', 1), #('vgg16', 1), + # ('ncf', 0), #('ncf', 1), + # ('bert_large', 1), #('bert_large', 1), + # not used: + # ('densenet121', 0), ('densenet121', 1), + # ('inceptionv3', 0), ('inceptionv3', 1), + # ('resnet101', 0), ('resnet101', 1), + # ('resnet50', 0), ('resnet50', 1), + # ('bert_12l', 0), ('bert_12l', 1), + # ('bert_6l', 0), ('bert_6l', 1), + # ('bert_3l', 0), ('bert_3l', 1), + ], + [ + # ('vgg16', 1), + # ('ncf', 1), + # ('bert_large', 1), + 'same', + ], + ) + + for p2 in [0.01, 0.03]: + for p3 in [1e-3, 3e-3, 1e-4, 3e-4, 5e-3]: + for p4 in [1e-3, 1e-4, 3e-4, 5e-4, 5e-5, 2e-3, ]: + for p1 in [3, 4, 2]: + for p5 in [2, 3]: + for p6 in [1, 2]: + NUM_RNN_LAYERS, SCORE_TH, LR, WD, IN_LAYERS, OUT_LAYERS = p1, p2, p3, p4, p5, p6 + checkpoint_path = trainer.train(name='vgg-aws-new-2', num_epochs=200) + exit() + else: + checkpoint_path = '/users/hzhang2/projects/pycharm/zhijie/5-9-2020/oceanus-zhijie/arion/simulator/models/model_on_bert-aws-only.ckpt' + test_list = [ + '/users/hzhang2/oceanus_cost_model_training_data/bert/bert-aws/bert_large_random_search_aws_4_ps_only', + # '/users/hzhang2/oceanus_cost_model_training_data/vgg16/vgg16_orca_15', + # '/users/hzhang2/oceanus_cost_model_training_data/vgg16/vgg_random_orca_11', #TARGET: 0.9 + # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-13-20/ncf_large_adam_random_search_aws_4', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_12_orca_16', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16', + # 
'/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_3', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_4', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_5', + ] + + for data_folder in test_list: + simulator = RankRNNSimulatorPenalty3(GRAPH_ITEM_PATHS[get_model(data_folder)], + 4, + 2, + 1, + batch_size=256, + seq_len=1, + checkpoint=checkpoint_path) + + runtimes_folder = os.path.join(data_folder, 'runtimes') + results = {} + averages= [] + scores = [] + strategys = [] + rss = [] + strategy_paths = [] + for name in os.listdir(runtimes_folder): + strategy_path = os.path.join(data_folder, 'strategies', name) + rs_path = os.path.join(data_folder, 'resource_specs', name ) + + if not os.path.isfile(rs_path): + rs_path += '.yml' + runtime_path = os.path.join(runtimes_folder, name) + + strategy_paths.append(strategy_path) + + with open(runtime_path, 'r') as f: + runtimes = json.load(f) + average = np.array(runtimes['average']) + + s = Strategy.deserialize(strategy_path) + rs = ResourceSpec(resource_file=rs_path) + strategys.append(s) + rss.append(rs) + + averages.append(average) + + # for tmp1, tmp2, tmp3 in zip(strategys, rss, strategy_paths): + # scores.append(simulator.simulate(tmp1, tmp2, tmp3)[0]) + # print(np.stack(scores).reshape(-1)) + + scores = simulator.simulate(strategys, rss)[0] + print(scores) + + # sorted_by_runtime = {k: v for k, v in sorted(results.items(), key=lambda item: item[1][0])} + # # sorted_by_scores = {k: v for k, v in sorted(res.items(), key=lambda item: item[1][1])} + # # sorted_by_latency = {k: v for k, v in sorted(res.items(), key=lambda item: item[1][2])} + # print('Sorted by runtime.......................') + # for _, (rt, prediction) in sorted_by_runtime.items(): + # print('runtime {} prediction {}'.format(rt, prediction)) + + y_train = np.array(averages) + test_score = np.array(scores) + true_comp = (y_train.ravel()[:, None] > y_train.ravel()[None, :]) + pred_comp = (test_score.ravel()[:, None] > test_score.ravel()[None, :]) + equal = (true_comp == pred_comp).astype(np.int) + test_acc = np.tril(equal, -1).sum() * 2. 
/ float(equal.shape[0]) / (float(equal.shape[0]) - 1) + + print('Test {} on {}, acc {:.4f}'.format(checkpoint_path, data_folder.split('/')[-1], test_acc)) diff --git a/autodist/autosync/simulator/train_linear.py b/autodist/autosync/simulator/train_linear.py new file mode 100644 index 0000000..c7e9438 --- /dev/null +++ b/autodist/autosync/simulator/train_linear.py @@ -0,0 +1,123 @@ +import os +import glob +import json +import numpy as np +from collections import OrderedDict +from os.path import expanduser +from sklearn import linear_model +from sklearn.linear_model import Ridge +from arion.simulator.utils import split_dataset + +def create_features(simulation): + runtime_coefficients = simulation['runtime_coefficients'] + var_sync_time = simulation['var_sync_time'] # dict: + + res = OrderedDict({ + 'network_overhead': 0.0, + 'gpu_kenrel_memory_latency': 0.0, + 'constant_factor': 0.0, + 'allreduce_factor': 0.0, + }) + for var_name, sim_time in var_sync_time.items(): + if isinstance(sim_time, list): + # PS strategies + send_time, receive_time = sim_time + res['constant_factor'] += send_time['transmission'] + receive_time['transmission'] + res['network_overhead'] += send_time['network_overhead'] + receive_time['network_overhead'] + res['gpu_kenrel_memory_latency'] += send_time['gpu_kenrel_memory_latency'] + receive_time['gpu_kenrel_memory_latency'] + elif isinstance(sim_time, dict): + # Allreduce strategy + res['allreduce_factor'] += sim_time['transmission'] + res['network_overhead'] += sim_time['network_overhead'] + res['gpu_kenrel_memory_latency'] += sim_time['gpu_kenrel_memory_latency'] + else: + raise ValueError + + # runtime_coefficients = { + # 'transmission': slowest_server_time, + # 'network_overhead': len(worker_list), + # 'gpu_kenrel_memory_latency': max_num_local_replica, + # 'constant': 1.0, + # # possible affecting factors. 
+ # 'var_name': var_name, + # 'strategy': 'ps', + # 'local_proxy': local_proxy, + # 'is_sparse': is_sparse, + # 'server_list': [partition.to_dict() for partition in server_list], + # 'worker_list': worker_list, + # 'cpu_worker_list': cpu_worker_list, + # 'gpu_worker_list': gpu_worker_list, + # 'worker_num_replicas': worker_num_replicas, + # 'max_num_local_replica': max_num_local_replica, + # } + # runtime_coefficients = [ + # runtime_coefficients['transmission'], + # runtime_coefficients['network_overhead'], + # runtime_coefficients['gpu_kenrel_memory_latency'], + # ] + return list(res.values()) + +def load_trial_run_data(data_dir): + runtimes_folders = glob.glob("{}/**/runtimes".format(data_dir), recursive=True) + X = [] + Y = [] + for runtimes_folder in runtimes_folders: + print(runtimes_folder) + runtimes_files = glob.glob(os.path.join(runtimes_folder, '*')) + for runtimes_file in runtimes_files: + # Target + runtime = json.load(open(runtimes_file, 'r')) + y = runtime['average'] + # Features + simulation_file = '/'.join(runtimes_file.split('/')[:-2]) + '/simulations/' + runtimes_file.split('/')[-1] + assert os.path.isfile(simulation_file), 'simulation_file {} does not exist'.format(simulation_file) + simulation = json.load(open(simulation_file, 'r')) + x = create_features(simulation) + X.append(x) + Y.append(y) + return X, Y + +data_dir = os.path.join(expanduser('~'), 'oceanus_simulator/lm1b-patchon') +X, Y = load_trial_run_data(data_dir) +X_train, Y_train, X_valid, Y_valid = split_dataset(X, Y) +print('X_train', X_train.shape, 'Y_train', Y_train.shape, 'X_valid', X_valid.shape, 'Y_valid', Y_valid.shape) + +# Linear regression +lm = linear_model.LinearRegression() +model = lm.fit(X_train, Y_train) +predictions = lm.predict(X_valid) +print('predictions, targets: ') +pt = zip(predictions, Y_valid) +pt = sorted(pt, key=lambda x: x[1]) +for p, t in pt: + print(p, t) +train_score = lm.score(X_train, Y_train) +valid_score = lm.score(X_valid, Y_valid) +print('Linear train_score', train_score) +print('Linear valid_score', valid_score) + +# Ridge regression +ridge = Ridge(alpha=1.0) +ridge.fit(X_train, Y_train) +predictions = ridge.predict(X_valid) +train_score = ridge.score(X_train, Y_train) +valid_score = ridge.score(X_valid, Y_valid) +print('Ridge train_score', train_score) +print('Ridge valid_score', valid_score) + + +# Lasso +lasso = linear_model.Lasso(alpha=0.1) +lasso.fit(X_train, Y_train) +train_score = lasso.score(X_train, Y_train) +valid_score = lasso.score(X_valid, Y_valid) +print('Lasso train_score', train_score) +print('Lasso valid_score', valid_score) + +# ElasticNet +elastic = linear_model.ElasticNet(random_state=0) +elastic.fit(X_train, Y_train) +train_score = elastic.score(X_train, Y_train) +valid_score = elastic.score(X_valid, Y_valid) +print('ElasticNet train_score', train_score) +print('ElasticNet valid_score', valid_score) diff --git a/autodist/autosync/simulator/train_predefined_simulator.py b/autodist/autosync/simulator/train_predefined_simulator.py new file mode 100644 index 0000000..43bb08b --- /dev/null +++ b/autodist/autosync/simulator/train_predefined_simulator.py @@ -0,0 +1,343 @@ +import sys +import os +import numpy as np +import tensorflow as tf +from os.path import expanduser +import tqdm + +from tensorflow.python.eager import context +import tensorflow_ranking as tfr + +from arion.strategy.base import Strategy +from arion.resource_spec import ResourceSpec +from arion.simulator import utils +from arion.simulator.models.predefined_simulator import 
PredefinedSimulator +from arion.simulator.utils import RankingLossKeys + +class TFRIterator: + def __init__(self, X, Y, list_size, batch_size, split, baseline=0.0, scale=1.0): + assert len(X) > 0, 'data: {}'.format(len(X)) + self.X = X + self.Y = Y + self.list_size = list_size + self.baseline = baseline + self.scale = scale + self.batch_size = batch_size + self.split = split + self.n = len(X) + self.num_examples = self.get_num_examples() + print('Split: {},\tnumber of samples: {},\tnumber of examples: {},\tmin of y: {}'.format( + split, len(X), self.num_examples, self.get_min_y())) + + def get_min_y(self): + return np.min(self.Y) + + def get_num_examples(self): + n_examples = 1 + for i in range(self.list_size): + n_examples *= (len(self.X) -1) + return n_examples + + def get_next(self): + xs = [[] for _ in range(self.list_size)] + ys = [] + for i in range(self.batch_size): + y =[] + for j in range(self.list_size): + ri = np.random.randint(self.n) + rx = self.X[ri] + ry = self.Y[ri] + xs[j].append(np.array(rx, dtype=np.float32)) + y.append(ry) + assert ry * self.scale - self.baseline > 0, '{}, {}, {}'.format(ry, self.scale, self.baseline) + ys.append(y) + xs = [np.array(xx, dtype=np.float32) for xx in xs] + ys = np.array(ys, dtype=np.float32) + if self.split == 'train': # normalize y as its used for loss weights. + ys = (ys * self.scale - self.baseline) + + return xs + [ys] + +model_params = { + 'ncf_large_adam_dense': { + 'model_batch_size': 256, + 'model_seq_len': 1, + 'data_dir': [ + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_orca_16_random_search_ar_only', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf-5-11-20/ncf_large_adam_dense_ar_only_by_chunk', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf-5-11-20/ncf_large_adam_dense_ar_only_christy', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf-5-11-20/ncf_large_adam_dense_ar_only_ordered_balanced', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf-5-11-20/ncf_large_adam_dense_ar_only_ordered_balanced_12_12', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf-5-11-20/ncf_large_adam_dense_ar_only_ordered_balanced_20_50', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf-5-11-20/ncf_large_adam_dense_sorted_christy_ordered_balanced_30_50', + '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf-5-11-20/ncf_large_adam_dense_sorted_christy_ordered_balanced_30_50_2', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_orca_16', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_orca_16_random_search_christy_lb', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_orca_16_random_search_christy_lb_ps_only', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_orca_16_real_random', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_orca_8', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_random_search_orca_4', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_random_search_orca_16', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_random_search_linear_cost_model', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_random_search_linear_cost_model_2', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_random_search_linear_cost_model_2', + # 
'/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf-5-9-20/ncf_large_adam_dense_orca_16_christy_lb_if_partition_lb_linear_cost_ps_only', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf-5-9-20/ncf_large_adam_dense_orca_16_christy_lb_if_partition_lb_num_partition_2_32_linear_cost_ps_only', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf-5-9-20/ncf_large_adam_dense_random_search_christy_lb_ps_only_if_partition_lb_ranknet_simulator_2', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf-5-9-20/ncf_large_adam_dense_random_search_christy_lb_ps_only_ranknet_simulator', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_g3.4.25.1_g3.4.25.2', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_g3.4.25.6_g3.4.25.7_g3.4.25.8_g3.4.25.9', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_g3.4.25.1_g3.4.25.2_g3.4.25.3_g3.4.25.4_3.4.25.6_g3.4.25.7_g3.4.25.8_g3.4.25.9', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_g3.4.25.1', + ], + 'original_graph_item_path': '/home/christy.li/oceanus_cost_model_training_data/ncf/original_graph_item', + 'save_dir': os.path.join(expanduser('~'), 'oceanus_cost_model_training_data/ncf/predefined_checkpoints'), + 'save_prefix': 'ckpV1_ncf_large_adam_dense_orca_all', + # 'save_prefix': 'ckpV2_ncf_large_adam_dense_orca', + 'baseline': 0.15, + # 'baseline': 0.0, + 'scale': 0.5, + 'learning_rate': 0.01, + 'list_size': 2, + 'batch_size': 100, + 'ranking_loss_key': 'pairwise_logistic_loss', + 'model_version': 'v1', + # 'model_version': 'v2', + 'do_train': False, + 'do_test': True, + 'checkpoint': '/home/christy.li/oceanus_cost_model_training_data/ncf/predefined_checkpoints/ckpV1_ncf_large_adam_dense_orca_all_600_0.83249_0.84517', + }, + 'bert': { + 'model_batch_size': 32, + 'model_seq_len': 128, + 'data_dir': [ + '/home/christy.li/oceanus_cost_model_training_data/bert/bert_3l_orca_16', + # '/home/christy.li/oceanus_cost_model_training_data/bert/bert_6l_orca_15', + # '/home/christy.li/oceanus_cost_model_training_data/bert/bert_12l_orca_15', + # '/home/christy.li/oceanus_cost_model_training_data/bert/bert.12l_g4.4.50.1_g4.4.50.2', + # '/home/christy.li/oceanus_cost_model_training_data/bert/bert.6l_g4.4.50.1_g4.4.50.2', + ], + 'original_graph_item_path': '/home/hao.zhang/oceanus_cost_model_training_data/bert/bert_original_graph_item_3l', + 'save_dir': '/home/christy.li/oceanus_cost_model_training_data/bert/predefined_checkpoints', + 'save_prefix': 'ckpV1_bert_orca', + 'baseline': 0.04, + 'scale': 0.5, + 'learning_rate': 0.01, + 'list_size': 2, + 'batch_size': 100, + 'ranking_loss_key': 'pairwise_logistic_loss', + 'do_train': False, + 'do_test': True, + 'model_version': 'v1', + # 'model_version': 'v2', + # 'checkpoint': '/home/christy.li/oceanus_cost_model_training_data/ncf/predefined_checkpoints/checkpoint_500', + # 'checkpoint': '/home/christy.li/oceanus_cost_model_training_data/ncf/predefined_checkpoints/ckpV1_ncf_large_adam_dense_orca_16_300_0.90684_0.91947', + # 'checkpoint': '/home/christy.li/oceanus_cost_model_training_data/ncf/predefined_checkpoints/ckpV1_ncf_large_adam_dense_orca_16_600_0.87000_0.71000', + # 'checkpoint': '/home/christy.li/oceanus_cost_model_training_data/ncf/predefined_checkpoints/ckpV1_ncf_large_adam_dense_all_200_0.80568_0.81116', + # 'checkpoint': '/home/christy.li/oceanus_cost_model_training_data/ncf/predefined_checkpoints/ckpV1_ncf_large_adam_dense_orca_200_0.81503_0.82009', + # 'checkpoint': 
'/home/christy.li/oceanus_cost_model_training_data/ncf/predefined_checkpoints/ckpV2_ncf_large_adam_dense_orca_16_600_0.89737_0.92842', + # 'checkpoint': '/home/christy.li/oceanus_cost_model_training_data/ncf/predefined_checkpoints/ckpV2_ncf_large_adam_dense_all_500_0.87666_0.85391', + 'checkpoint': '/home/christy.li/oceanus_cost_model_training_data/bert/predefined_checkpoints/ckpV1_bert_orca_400_0.93600_0.93889', + }, + 'resnet101': { + 'model_batch_size': 32, + 'model_seq_len': 1, + 'baseline': 0.5, + 'scale': 0.5, + 'data_dir': '', + 'learning_rate': 0.01, + 'list_size': 2, + 'batch_size': 100, + 'ranking_loss_key': 'pairwise_logistic_loss', + }, +} + +def main(_): + np.random.seed(110) + + # Hyperparameters + # model_to_simulate = 'bert' + model_to_simulate = 'ncf_large_adam_dense' + data_dir = model_params[model_to_simulate]['data_dir'] + original_graph_item_path = model_params[model_to_simulate]['original_graph_item_path'] + batch_size = model_params[model_to_simulate]['batch_size'] + ranking_loss_key = model_params[model_to_simulate]['ranking_loss_key'] + learning_rate = model_params[model_to_simulate]['learning_rate'] + list_size = model_params[model_to_simulate]['list_size'] + baseline = model_params[model_to_simulate]['baseline'] + scale = model_params[model_to_simulate]['scale'] + save_dir = model_params[model_to_simulate]['save_dir'] + save_prefix = model_params[model_to_simulate]['save_prefix'] + do_train = model_params[model_to_simulate]['do_train'] + do_test = model_params[model_to_simulate]['do_test'] + checkpoint = model_params[model_to_simulate]['checkpoint'] + model_version = model_params[model_to_simulate]['model_version'] + + # Create simulator + simulator = PredefinedSimulator(original_graph_item_path, + batch_size=model_params[model_to_simulate]['model_batch_size'], + seq_len=model_params[model_to_simulate]['model_seq_len']) + + # Create features + strategy_resource_files, Y = utils.laod_from_folders(data_dir) + print("Createing features...") + X = [] + with context.graph_mode(): + for strategy_file, resource_file in tqdm.tqdm(strategy_resource_files): + x = simulator.create_features(Strategy.deserialize(strategy_file), ResourceSpec(resource_file)) + X.append(x) + X = np.array(X, dtype=np.float) + print("Finished createing features.") + + # Create model + hidden_dim = 12 + W = tf.Variable(tf.random.uniform([hidden_dim, 1]), name='W', dtype=tf.float32) + b = tf.Variable(0.0, name='b', dtype=tf.float32) + if model_version == 'v2': + W0 = tf.Variable(tf.random.uniform([hidden_dim, hidden_dim]), name='W0', dtype=tf.float32) + b0 = tf.Variable(0.0, name='b0', dtype=tf.float32) + loss_fn = tfr.losses.make_loss_fn(RankingLossKeys[ranking_loss_key]) + major_version, _, _ = tf.version.VERSION.split('.') + if major_version == '1': + optimizer = tf.train.GradientDescentOptimizer(learning_rate) + else: + optimizer = tf.optimizers.Adam(learning_rate) + + def forward(xs): + rs = [] + for x in xs: + if model_version == 'v2': + x = tf.nn.elu(tf.matmul(x, W0) + b0) + r = tf.matmul(x, W) + b + rs.append(r) + r = tf.concat(rs, axis=1, name='logits') + return r + + @tf.function + def train_steps(inputs_iterator, total_steps): + + def train_step(input): + with tf.GradientTape() as tape: + logits = forward(input[:-1]) + loss = loss_fn(labels=input[-1], logits=logits, features={}) + vs = [W0, b0, W, b] if model_version == 'v2' else [W, b] + gradients = tape.gradient(loss, vs) + train_op = optimizer.apply_gradients(zip(gradients, vs)) + pred = tf.squeeze(tf.argmax(logits, axis=1)) + labels = 
tf.squeeze(tf.argmax(input[-1], axis=1)) + acc = tf.equal(pred, labels) + return loss, acc + + losses = [] + accs = [] + for step in range(total_steps): + l, a = train_step(inputs_iterator.get_next()) + losses.append(l) + accs.append(a) + return losses, accs + + @tf.function + def eval_step(input): + logits = forward(input[:-1]) + preds = tf.squeeze(tf.argmax(logits, axis=1)) + labels = tf.squeeze(tf.argmax(input[-1], axis=1)) + acc = tf.equal(preds, labels) + return acc, labels, preds, input[-1], logits + + def eval_steps(iterator, total_test_steps): + test_acc = [] + test_preds = [] + test_labels = [] + test_logits = [] + test_scores = [] + for step in range(total_test_steps): + acc, labels, preds, scores, logits = eval_step(iterator.get_next()) + test_acc.append(acc) + test_labels.append(labels) + test_preds.append(preds) + test_scores.append(scores) + test_logits.append(logits) + test_acc = tf.concat(test_acc, axis=0) + test_acc = tf.cast(test_acc, tf.float32) + avg_test_acc = tf.math.reduce_mean(test_acc) + test_labels = tf.concat(test_labels, axis=0) + test_preds = tf.concat(test_preds, axis=0) + test_scores = tf.concat(test_scores, axis=0) + test_logits = tf.concat(test_logits, axis=0) + return avg_test_acc, test_acc, test_labels, test_preds, test_scores, test_logits + + if do_train: + train_set, valid_set, test_set = utils.split_dataset([X, Y], shuffle=True, train_ratio=0.7, test_ratio=0.15) + X_train, Y_train = train_set + X_valid, Y_valid = valid_set + X_test, Y_test = test_set + inputs_iterator = TFRIterator(X=X_train, Y=Y_train, list_size=list_size, batch_size=batch_size, split='train', + baseline=baseline, scale=scale) + valid_iterator = TFRIterator(X=X_valid, Y=Y_valid, list_size=list_size, batch_size=batch_size, split='valid') + test_iterator = TFRIterator(X=X_test, Y=Y_test, list_size=list_size, batch_size=batch_size, split='test') + total_train_steps = max(1, min(inputs_iterator.get_num_examples() // batch_size, 100)) + total_valid_steps = max(1, valid_iterator.get_num_examples() // batch_size) + total_test_steps = max(1, test_iterator.get_num_examples() // batch_size) + print("Total train steps per epoch: {}".format(total_train_steps)) + print("Total valid steps per epoch: {}".format(total_valid_steps)) + print("Total test steps: {}".format(total_test_steps)) + EPOCHS = 2000 + eval_every_epochs = 100 + save_every_epochs = 100 + + print("\nTrain model...") + losses = [] + for epoch in range(EPOCHS): + loss, acc = train_steps(inputs_iterator, total_train_steps) + losses.extend(loss) + avgloss = sum(losses) / float(len(losses)) + print('Step: {}, avgloss: {:.5f}'.format(epoch, avgloss)) + if (epoch+1) % eval_every_epochs == 0: + print("\nEvaluate on valid set...") + avg_valid_acc, *_= eval_steps(valid_iterator, total_valid_steps) + print('avg_valid_acc: {}'.format(avg_valid_acc.numpy())) + print("Evaluate on test set...") + avg_test_acc, *_= eval_steps(test_iterator, total_test_steps) + print('avg_test_acc: {}\n'.format(avg_test_acc.numpy())) + print('W', W.numpy()) + print('b', b.numpy()) + + if (epoch+1) % save_every_epochs == 0: + if not os.path.exists(save_dir): + os.mkdir(save_dir) + checkpoint = '{}/{}_{}_{:.5f}_{:.5f}'.format(save_dir, save_prefix, epoch+1, + avg_valid_acc, avg_test_acc) + print("Save to {}".format(checkpoint)) + simulator.save_checkpoint([W0, b0, W, b] if model_version == 'v2' else [W, b], checkpoint) + + elif do_test: + print("Load from {}".format(checkpoint)) + weights = simulator.load_checkpoint(checkpoint) + if model_version == 'v2' and 
len(weights) == 4: + W0, b0, W, b = weights + elif model_version == 'v1' and len(weights) == 2: + W, b = weights + else: + raise ValueError + + test_iterator = TFRIterator(X=X, Y=Y, list_size=list_size, batch_size=batch_size, split='test') + total_test_steps = max(1, test_iterator.get_num_examples() // batch_size) + print("\nEvaluate on test set...") + avg_test_acc, test_acc, test_labels, test_preds, test_scores, test_logits = eval_steps(test_iterator, total_test_steps) + for i, labels, preds, scores, logits in zip(range(100), test_labels, test_preds, test_scores, test_logits): + print('labels', labels.numpy(), 'preds', preds.numpy(), 'scores', scores.numpy(), 'logits', logits.numpy()) + print('avg_test_acc', avg_test_acc.numpy()) + + test_iterator_single = TFRIterator(X=X, Y=Y, list_size=1, batch_size=len(X), split='test') + print("\nEvaluate each example in test set...") + avg_test_acc, test_acc, test_labels, test_preds, test_scores, test_logits = eval_steps(test_iterator_single, 1) + for i, labels, preds, scores, logits in zip(range(100), test_labels, test_preds, test_scores, test_logits): + print('labels', labels.numpy(), 'preds', preds.numpy(), 'scores', scores.numpy(), 'logits', logits.numpy()) + test_logits = sorted(list(test_logits.numpy())) + top_10_persent = test_logits[:int(len(test_logits)*0.1)] + print('top_10_persent', top_10_persent) + print('top_10_persent threshold', top_10_persent[-1]) + print('test_logits', test_logits) + + +main(sys.argv) diff --git a/autodist/autosync/simulator/utils.py b/autodist/autosync/simulator/utils.py new file mode 100644 index 0000000..d0c6436 --- /dev/null +++ b/autodist/autosync/simulator/utils.py @@ -0,0 +1,374 @@ +# Copyright 2020 Petuum. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Simulator-related utility functions.""" + +import glob +import json +import os +import numpy as np + +import tensorflow as tf +from tensorflow.python.framework import device_spec +import tensorflow_ranking as tfr + +from autodist.utils import logging +from autodist.resource_spec import ResourceSpec +from autodist.strategy.base import Strategy +# from autodist.const import DEFAULT_RUNTIME_SERIALIZATION_DIR, DEFAULT_SERIALIZATION_DIR, \ +# DEFAULT_STRATEGY_JSON_SERIALIZATION_DIR, DEFAULT_RESOURCE_SERIALIZATION_DIR +from autodist.kernel.device.resolver import DeviceResolver + + +RankingLossKeys = { + # Names for the ranking based loss functions. 
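+    # Each entry maps the short name used in `model_params` (e.g.
+    # 'pairwise_logistic_loss', a RankNet-style pairwise loss) to the
+    # tensorflow_ranking constant passed to tfr.losses.make_loss_fn().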
+ 'pairwise_hinge_loss': tfr.losses.RankingLossKey.PAIRWISE_HINGE_LOSS, + 'pairwise_logistic_loss': tfr.losses.RankingLossKey.PAIRWISE_LOGISTIC_LOSS, + 'pairwise_soft_zero_one_loss': tfr.losses.RankingLossKey.PAIRWISE_SOFT_ZERO_ONE_LOSS, + 'softmax_loss': tfr.losses.RankingLossKey.SOFTMAX_LOSS, + 'sigmoid_cross_entropy_loss': tfr.losses.RankingLossKey.SIGMOID_CROSS_ENTROPY_LOSS, + 'mean_squared_loss': tfr.losses.RankingLossKey.MEAN_SQUARED_LOSS, + 'list_mle_loss': tfr.losses.RankingLossKey.LIST_MLE_LOSS, + 'approx_ndcg_loss': tfr.losses.RankingLossKey.APPROX_NDCG_LOSS, +} + +######### +# Online +######### + +def laod_from_one_folder(data_folder): + strategy_folder = '{}/strategies'.format(data_folder) + strategy_files = glob.glob(os.path.join(strategy_folder, '*')) + X = [] + Y = [] + for strategy_file in strategy_files: + # Target + runtime_file = '/'.join(strategy_file.split('/')[:-2]) + '/runtimes/' + strategy_file.split('/')[-1] + if not os.path.exists(runtime_file) or not os.path.isfile(runtime_file): + print('runtime_file does not exist: {}.'.format(runtime_file)) + continue + runtime = json.load(open(runtime_file, 'r')) + y = runtime['average'] + resource_file = strategy_file.replace('strategies', 'resource_specs') + if not os.path.exists(resource_file): + resource_file += '.yml' + if not os.path.exists(resource_file): + resource_file = os.path.join(data_folder, 'resource_spec_files/resource_spec.yml') + if not os.path.exists(resource_file): + continue + Y.append(y) + X.append([strategy_file, resource_file]) + print('Data points:{}, data_folder: {}'.format(len(X), data_folder)) + return X, Y + + +def laod_from_folders(data_dir): + if isinstance(data_dir, str): + data_folders = glob.glob("{}/*".format(data_dir), recursive=True) + elif isinstance(data_dir, list): + data_folders = data_dir + else: + raise ValueError + print('data_folders', data_folders) + X = [] + Y = [] + for data_folder in data_folders: + x, y = laod_from_one_folder(data_folder) + if len(x) == 0: + print('strategy_folder does not have files: {}, skipping it.'.format(data_folder)) + continue + Y.extend(y) + X.extend(x) + # Y = np.concatenate(Y, axis=0) + if len(Y) > 0: + Y = np.array(Y, dtype=np.float) + miny = np.min(Y) + print('min of all Y values: {}'.format(miny)) + else: + print("no files loaded.") + return X, Y + + +########## +# Offline +########## + +def laod_from_one_folder_offline(simulation_folder): + simulation_files = glob.glob(os.path.join(simulation_folder, '*'), recursive=True) + X = [] + Y = [] + for simulation_file in simulation_files: + # Features + try: + simulation = json.load(open(simulation_file, 'r')) + except: + print("Can not read simulation_file: ", simulation_file) + continue + x = simulation_file + # Target + runtime_file = '/'.join(simulation_file.split('/')[:-2]) + '/runtimes/' + simulation_file.split('/')[-1] + if not os.path.exists(runtime_file) or not os.path.isfile(runtime_file): + print('runtime_file does not exist: {}.'.format(runtime_file)) + continue + runtime = json.load(open(runtime_file, 'r')) + y = runtime['average'] + Y.append(y) + X.append(x) + Y = np.array(Y, dtype=np.float) + print('Data points:{}, simulation_folder: {}'.format(len(X), simulation_folder)) + return X, Y + + +def laod_from_folders_offline(data_dir): + simulation_folders = glob.glob("{}/*/simulations".format(data_dir), recursive=True) + print('simulation_folders', simulation_folders) + X = [] + Y = [] + for simulation_folder in simulation_folders: + x, y = laod_from_one_folder_offline(simulation_folder) + if 
len(x) == 0: + print('simulation folder does not have files: {}, skipping it.'.format(simulation_folder)) + continue + Y.append(y) + X.append(x) + Y = np.concatenate(Y, axis=0) + miny = np.min(Y) + print('min of Y values: {}'.format(miny)) + return X, Y + + +def split_dataset(inputs, shuffle=True, train_ratio=0.7, test_ratio=0.15): + assert isinstance(inputs, list) + nb_elements = len(inputs) + nb_samples = len(inputs[0]) + n_train = int(nb_samples * train_ratio) + n_test = int(nb_samples * test_ratio) + shuffled = [] + train = [] + valid = [] + test = [] + + if shuffle: + random_indices = np.random.permutation(list(range(nb_samples))) + for i in range(nb_elements): + shuffled_i = [inputs[i][j] for j in random_indices] + train.append(shuffled_i[:n_train]) + valid.append(shuffled_i[n_train:-n_test]) + test.append(shuffled_i[-n_test:]) + else: + for i in range(nb_elements): + train.append(inputs[i][:n_train]) + valid.append(inputs[i][n_train:-n_test]) + test.append(inputs[i][-n_test:]) + + return train, valid, test + +def read_trial_runs(): + runtime_files = glob.glob(os.path.join(DEFAULT_RUNTIME_SERIALIZATION_DIR, '*')) + strategy_files = glob.glob(os.path.join(DEFAULT_SERIALIZATION_DIR, '*')) + strategy_json_files = glob.glob(os.path.join(DEFAULT_STRATEGY_JSON_SERIALIZATION_DIR, '*')) + resource_files = glob.glob(os.path.join(DEFAULT_RESOURCE_SERIALIZATION_DIR, '*')) + logging.info(len(runtime_files), len(strategy_files), len(strategy_json_files), len(resource_files)) + + trialruns = {} + for runtime_file in runtime_files: + strategy_id = runtime_file.split('/')[-1] + strategy_file = os.path.join(DEFAULT_SERIALIZATION_DIR, strategy_id) + strategy_json_file = os.path.join(DEFAULT_STRATEGY_JSON_SERIALIZATION_DIR, strategy_id) + resource_file = os.path.join(DEFAULT_RESOURCE_SERIALIZATION_DIR, strategy_id) + if not os.path.exists(strategy_file): + logging.info("strategy_file not found, skip it: {}".format(strategy_file)) + continue + if not os.path.exists(strategy_json_file): + logging.info("strategy_json_file not found, skip it: {}".format(strategy_json_file)) + continue + if not os.path.exists(resource_file): + logging.info("resource_file not found, skip it: {}".format(resource_file)) + continue + + trialruns[strategy_id] = { + 'runtime': json.load(open(runtime_file, 'r')), + 'strategy': Strategy.deserialize(strategy_id), + 'strategy_json': json.load(open(strategy_json_file, 'r')), + 'resource_spec': ResourceSpec(resource_file=resource_file), + } + + logging.info("Total number of trials: {}".format(len(trialruns))) + return trialruns + + +DTYPE2BITS = { + tf.float16: 16, + "tf.float16": 16, + "": 16, + tf.float32: 32, + 'tf.float32': 32, + "": 32, + "": 32, + tf.float64: 64, + 'tf.float64': 64, + "": 64, + tf.bfloat16: 16, + 'tf.bfloat16': 16, + "": 16, + tf.complex64: 64, + 'tf.complex64': 64, + "": 64, + tf.complex128: 128, + 'tf.complex128': 128, + "": 128, + tf.int8: 8, + 'tf.int8': 8, + "": 8, + tf.uint8: 8, + 'tf.uint8': 8, + "": 8, + tf.uint16: 16, + 'tf.uint16': 16, + "": 16, + tf.uint32: 32, + 'tf.uint32': 32, + "": 32, + tf.uint64: 64, + 'tf.uint64': 64, + "": 64, + tf.int16: 16, + 'tf.int16': 16, + "": 16, + tf.int32: 32, + 'tf.int32': 32, + "": 32, + tf.int64: 64, + 'tf.int64': 64, + "": 64, + tf.bool: 1, + 'tf.bool': 1, + "": 1, + tf.string: 1, # todo: confirm + 'tf.string': 1, # todo: confirm + "": 1, # todo: confirm + tf.quint8: 8, + 'tf.quint8': 8, + tf.qint8: 8, + 'tf.qint8': 8, + "": 8, + "": 8, + tf.qint16: 16, + 'tf.qint16': 16, + "": 16, + tf.quint16: 16, + 'tf.quint16': 
16, + "": 16, + tf.qint32: 32, + 'tf.qint32': 32, + "": 32, + tf.resource: 0, # its tensor shape is either [] or [None] todo: confirm + 'tf.resource': 0, # its tensor shape is either [] or [None] todo: confirm + "": 0, # its tensor shape is either [] or [None] todo: confirm +} + +GIGABITS = np.float(1e+9) +INFINITY = 1e+9 +NUM_RUNS = 500 +GPU_TO_CPU_BANDWIDTH = 1000 # Gbps + + +def pad_list(l, max_len): + return l + [0.0] * (max_len - len(l)) + + +def get_dtype_bits(dtype): + return DTYPE2BITS[dtype] if dtype in DTYPE2BITS else DTYPE2BITS[str(dtype)] + + +def get_dense_var_bits(size, dtype): + return size * get_dtype_bits(dtype) + + +def get_sparse_var_bits(size): + # same size of values, indices, dense_shape + return size * (get_dtype_bits(tf.float32) + 2 * get_dtype_bits(tf.int64)) \ + + 2 * get_dtype_bits(tf.int64) + + +def on_same_host(device_str1, device_str2): + """ + Return True if d1 and d2 are on the same host. + + Args: + device_str1 (string): the first device as a TF device string, e.g. /job:worker/task:0/device:CPU:0. + device_str2 (string): the first device as a TF device string, e.g. /job:worker/task:0/device:GPU:0. + + Returns: + Bool: True if they are on the same host, otherwise False. + """ + host1 = '/'.join(device_str1.split('/')[:-1]) + host2 = '/'.join(device_str2.split('/')[:-1]) + return host1 == host2 + +# def _resolved_devices_on_diff_machine(device1, device2): +# # e.g., '/job:worker/task:1/device:CPU:0', '/job:worker/task:1/GPU:0' +# node1 = ':'.join(device1.split('/')[:-1]) +# node2 = ':'.join(device2.split('/')[:-1]) +# return node1 != node2 + + +# def _resolve_device_address(device: str, device_resolver: DeviceResolver): +# # change real ip address to /job:worker/task:0 +# if not device: +# return device +# parts = device.split(':') +# if parts and parts[0] in device_resolver._address_to_tasks: +# resolved_device = device_resolver._address_to_tasks[parts[0]][0] +# resolved = '/job:{}/task:{}/device:'.format(resolved_device['job'], resolved_device['task']) +# resolved = resolved + ':'.join(parts[-2:]) +# return resolved +# else: +# raise ValueError("cannot resolve device: {} using device_resolver: {}".format( +# device, device_resolver._address_to_tasks)) + + +# def _num_local_replica(host, replicas, cluster): +# # host: e.g., '/job:worker/task:0/device:CPU:0' +# replica_devices = {device_spec.DeviceSpecV2.from_string(r) for r in replicas} +# host_device = device_spec.DeviceSpecV2.from_string(host) +# num_local_replica = sum(1 for d in replica_devices +# if cluster.get_address_from_task(d.job, d.task) == +# cluster.get_address_from_task(host_device.job, host_device.task)) +# return num_local_replica +# +# +# def _max_num_local_replica(replicas, cluster): +# replica_devices = {device_spec.DeviceSpecV2.from_string(r) for r in replicas} +# replica_hosts = {cluster.get_address_from_task(d.job, d.task) for d in replica_devices} +# max_num_local_replica = 0 +# for host in replica_hosts: +# num_local_replica = sum(1 for d in replica_devices +# if cluster.get_address_from_task(d.job, d.task) == host) +# max_num_local_replica = max(max_num_local_replica, num_local_replica) +# return max_num_local_replica + + +def _strip_var_name(name): + # strip prefix + if not name: + return name + name = name.split('/') + if 'Replica' in name[0]: # remove prefix + name = name[1:] + if name and 'part' in name[-1]: # remove '/part_1' if using partitioned ps + name = name[:-1] + name = '/'.join(name) + name = name.split(':')[0] # remove ':0'. 
+ return name diff --git a/autodist/kernel/device/resolver.py b/autodist/kernel/device/resolver.py index 609f471..8fcfecf 100644 --- a/autodist/kernel/device/resolver.py +++ b/autodist/kernel/device/resolver.py @@ -45,6 +45,15 @@ def _get_address_to_tasks(cluster): return d def resolve_to_device_spec(self, device): + """ + Resolve an AutoDist DeviceSpec or string to a TensorFlow DeviceSpec. + + Args: + device: (a container of) AutoDist DeviceSpec or DeviceSpec string. + + Returns: + device_spec, List(device_spec), or Set(device_spec) + """ """Resolve an AutoDist DeviceSpec or its string to a TensorFlow DeviceSpec.""" if isinstance(device, (list, set)): return type(device)(self.resolve_to_device_spec(d) for d in device) @@ -59,7 +68,15 @@ def resolve_to_device_spec(self, device): ) def resolve_to_device_str(self, device): - """Resolve an AutoDist DeviceSpec or its string to a TensorFlow device string.""" + """Resolve an AutoDist DeviceSpec or its string to a TensorFlow device string. + + E.g. 192.168.0.1:GPU:0 or localhost:CPU:1 -> job:worker/task:0/device:GPU:0 + Args: + device: (a container of) AutoDist DeviceSpec or DeviceSpec string. + + Returns: + str, List(str), or Set(str) + """ if isinstance(device, (list, set)): return type(device)(self.resolve_to_device_spec(d).to_string() for d in device) elif isinstance(device, RepeatedScalarContainer): diff --git a/autodist/resource_spec.py b/autodist/resource_spec.py index 017faea..be1d570 100644 --- a/autodist/resource_spec.py +++ b/autodist/resource_spec.py @@ -72,6 +72,7 @@ def __init__(self, resource_file=None): self.__chief_address = None self.__ssh_config_map = dict() self.__ssh_group = dict() + self.__network_bandwidth = dict() # set self.__devices self._from_resource_info(resource_file) @@ -147,6 +148,11 @@ def ssh_group(self): """SSH Group for each node.""" return self.__ssh_group + @property + def network_bandwidth(self): + """Network bandwidth of each node.""" + return self.__network_bandwidth + def _add_device(self, device_spec): if device_spec.name_string() not in self.__devices: self.__devices[device_spec.name_string()] = device_spec @@ -200,6 +206,14 @@ def _parse_node(self, node, num_nodes): self.__ssh_group[host_address] = node.get('ssh_config') if self.__ssh_group[host_address] is None and self.__chief_address != host_address: raise ValueError("Need to define SSH groups for all non-chief nodes.") + # network bandwidth + if node.get('network_bandwidth'): + self.__network_bandwidth[host_address] = node.get('network_bandwidth') + else: + # TODO (Hao): we could also raise ValueError here. + logging.warning('Bandwidth for {} is undefined and set as default. ' + 'Caution when using AutoStrategy.'.format(host_address)) + self.__network_bandwidth[host_address] = 1 class DeviceSpec: diff --git a/autodist/strategy/__init__.py b/autodist/strategy/__init__.py index 3be1c34..fe6a366 100644 --- a/autodist/strategy/__init__.py +++ b/autodist/strategy/__init__.py @@ -25,3 +25,4 @@ from .partitioned_all_reduce_strategy import PartitionedAR from .random_axis_partition_all_reduce_strategy import RandomAxisPartitionAR from .uneven_partition_ps_strategy import UnevenPartitionedPS +# from .auto_strategy import AutoStrategy diff --git a/autodist/strategy/auto/ar_group_assigner.py b/autodist/strategy/auto/ar_group_assigner.py new file mode 100644 index 0000000..7a529d3 --- /dev/null +++ b/autodist/strategy/auto/ar_group_assigner.py @@ -0,0 +1,83 @@ +# Copyright 2020 Petuum. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Collective group assigners."""
+
+from collections import OrderedDict
+
+import numpy as np
+
+
+def chunk_group_assigner(ar_shards, chunk_size=1):
+    """
+    Assigner that groups consecutive shards into chunks of a fixed size.
+
+    Args:
+        ar_shards: ordered mapping of all-reduce shard names.
+        chunk_size: number of consecutive shards assigned to the same group.
+
+    Returns:
+        dict: mapping from shard name to collective group id.
+    """
+    assignments = {}
+    for i, shard_name in enumerate(ar_shards):
+        assignments[shard_name] = i // chunk_size
+    assert len(ar_shards) == len(assignments)
+    return assignments
+
+
+def christy_group_assigner(ar_shards, var_helpers, num_group):
+    """A probabilistic assigner that tries to balance the total message size across groups."""
+    assignments = {}
+
+    sorted_ar_shards = OrderedDict(sorted(ar_shards.items(), key=lambda x: var_helpers[x[0]].byte_size, reverse=True))
+    cur_loads = [0.0 for i in range(num_group)]
+    for shard_name in sorted_ar_shards:
+        total_loads = sum(cur_loads)
+        balanced_loads = [total_loads / num_group for _ in range(num_group)]
+        space = np.array([balanced_load - cur_load for balanced_load, cur_load in zip(balanced_loads, cur_loads)])
+
+        e_x = np.exp(space - np.max(space))
+        accept_prob = e_x / e_x.sum()  # softmax over each group's remaining capacity: emptier groups are favored
+
+        des = np.random.choice(range(0, num_group), 1, p=accept_prob)[0]
+        assignments[shard_name] = des
+        cur_loads[des] += var_helpers[shard_name].byte_size
+    assert len(ar_shards) == len(assignments)
+    # entropy = calcuate_entropy(cur_loads)
+    # best_entropy = calcuate_entropy(balanced_loads)
+    # print('entropy {} vs. max entropy {}'.format(entropy, best_entropy))
+    return assignments
+
+def ordered_balanced_group_assigner(ar_shards, var_helpers, num_group):
+    """Greedy assigner that creates balanced loads following the given variable order."""
+    assignments = {}
+
+    # get total size
+    total_loads = 0.0
+    for shard_name in ar_shards:
+        total_loads += var_helpers[shard_name].byte_size
+
+    avg_load = total_loads / num_group
+
+    cur_bucket = 0
+    loads = [0 for _ in range(num_group)]
+    for shard_name in ar_shards:
+        if loads[cur_bucket] >= avg_load:
+            cur_bucket += 1
+        if loads[cur_bucket] < avg_load:
+            assignments[shard_name] = cur_bucket
+            loads[cur_bucket] += var_helpers[shard_name].byte_size
+    assert len(ar_shards) == len(assignments)
+    return assignments
diff --git a/autodist/strategy/auto/base.py b/autodist/strategy/auto/base.py
new file mode 100644
index 0000000..05f0be4
--- /dev/null
+++ b/autodist/strategy/auto/base.py
@@ -0,0 +1,110 @@
+# Copyright 2020 Petuum. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +"""A base class to implementating different auto strategies.""" + +from multiprocessing import Process, Queue + +import numpy as np + +from autodist.strategy.auto.strategy_sampler import RandomStrategySampler, \ + default_space, default_heuristics +from autodist.strategy.base import StrategyBuilder +from autodist.utils import logging + + +class AutoStrategyBase(StrategyBuilder): + """AutoStrategy Base class.""" + + def __init__(self, + space=None, + heuristics=None, + num_proposals=1000, + simulator=None, + train_simulator=False): + # space and heuristics params + if not space: + self._space = default_space + if not heuristics: + self._heuristics = default_heuristics + + # params + self._num_proposals = num_proposals + self._sampler = RandomStrategySampler(self._space, + self._heuristics) + if train_simulator: + raise NotImplementedError() + self._simulator = simulator + + def build(self, graph_item, resource_spec): + raise NotImplementedError() + + def propose_one(self, graph_item, resource_spec): + """ + Sequentially generate `self._num_proposals` strategies. + + Args: + graph_item: + resource_spec: + + Returns: + Strategy + """ + proposal = self._sampler.build(graph_item, resource_spec) + return proposal + + def propose_n(self, + graph_item, + resource_spec, + num_proposals, + num_threads=1): + """ + Proposal `num_proposals` strategies using multi-threading. + + Args: + graph_item: + resource_spec: + num_proposals: + num_threads: + + Returns: + List(Strategy) + """ + if num_threads > 1: + def sampler_worker(q, sampler, graph_item, resource_spec): + np.random.seed() + expr = sampler.build(graph_item, resource_spec) + q.put(expr) + + proposals = [] + while len(proposals) < num_proposals: + # create thread-safe objects before multi-threading + samplers = [RandomStrategySampler(graph_item, resource_spec) for _ in range(num_threads)] + graph_items = [graph_item for _ in range(num_threads)] + resource_specs = [resource_spec for _ in range(num_threads)] + q = Queue() + threads = [] + try: + for sampler, gi, rs in zip(samplers, graph_items, resource_specs): + thread = Process(target=sampler_worker, args=(q,sampler, gi, rs)) + thread.start() + threads.append(thread) + batch = [q.get() for _ in threads] + proposals.extend(batch) + for thread in threads: + thread.join() + except: + logging.error('Error when proposing strategies with {} threads'.format(num_threads)) + raise + else: + proposals = [self.propose_one(graph_item, resource_spec) for i in range(num_proposals)] + return proposals diff --git a/autodist/strategy/auto/item.py b/autodist/strategy/auto/item.py new file mode 100644 index 0000000..94c316b --- /dev/null +++ b/autodist/strategy/auto/item.py @@ -0,0 +1,562 @@ +# Copyright 2020 Petuum. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Helper classes and functions for automatic strategy generation.""" + +from enum import Enum + +import tensorflow as tf +from tensorflow.python.framework import ops, device_spec + +from autodist.kernel.common.utils import get_op_name, get_consumers +from autodist.kernel.device.resolver import DeviceResolver +from autodist.graph_item import cached_property +from autodist.strategy.base import byte_size_load_fn +from autodist.utils import logging +from autodist.cluster import SSHCluster +from autodist.autosync.simulator.utils import GPU_TO_CPU_BANDWIDTH, GIGABITS, get_dtype_bits + + +class VarType(Enum): + SPARSE = 0 + DENSE = 1 + + +class VariableItem: + """Helper class to include meta information about a variable.""" + def __init__(self, + var, + graph_item, + node_config=None): + self.var = var + self.graph_item = graph_item + self._var_op_name = get_op_name(var.name) + self._grad = graph_item.var_op_name_to_grad_info[self._var_op_name][0] + + self._config = None + if node_config: + self.update_config(node_config) + else: + logging.warning('Item with name {} has empty config.'.format(self.name)) + + def update_config(self, config): + """ + Update the nodeconfig of this variable. + + Args: + config: + """ + assert not config + self._node_config = config + + @property + def var_type(self): + """ + Return the type of the variable (VarType.SPARSE or VarType.DENSE). + + Returns: + VarType + """ + return VarType.DENSE if isinstance(self._grad, ops.Tensor) else VarType.SPARSE + + @property + def name(self): + """ + Return the name of the variable. + + Returns: + String + """ + return self.var.name + + @property + def is_sparse(self): + """ + Return whether the variable is sparse. + + Returns: + Bool + """ + return True if self.var_type == VarType.SPARSE else False + + @property + def is_embedding(self): + """ + Return whether the variable corresponds to an embedding. + + Returns: + Bool + """ + # TODO (Hao): better way to determine is_embedding? + for op in get_consumers(self.var.op): + if op.type == "ResourceGather": + return True + return False + + @property + def shape(self): + """ + Return the shape of the variable, or None if it does not emit a tensor (e.g. scalar). + + Returns: + List(int) + """ + return self.original_shape + + @property + def original_shape(self): + if self.var.initial_value.shape.ndims: + return self.var.initial_value.shape.as_list() + else: + return None + + @property + def size(self): + size = 1 + if self.shape: + for s in self.shape: + size *= s + return size + + @property + def original_size(self): + size = 1 + if self.original_shape: + for s in self.original_shape: + size *= s + return size + + def size_to_transfer(self, batch_size_per_gpu=1, seq_len=1): + """ + Return the number of elements (e.g. float, integer) to transfer for this variable per iteration. + + To estimate the size to transfer for sparse variables, batch_size_per_gpu and seq_len are required. + Args: + batch_size_per_gpu: batch size used on each GPU replica. + seq_len: the length of the sequence of each input example. 
+ + Returns: + integer + """ + if not self.is_sparse: + return self.size + else: + if not self.shape: # scalar + return 1 + + emb_size = 1 + if len(self.shape) > 1: + # infer the embedding size from original shape + for i in range(1, len(self.original_shape)): + emb_size *= self.original_shape[i] + + sparse_data_size = batch_size_per_gpu * seq_len * emb_size + + # estimate the embedding of this partition simply using a proportional formula + return sparse_data_size * float(self.size) / float(self.original_size) + + @property + def bits_to_transfer(self, batch_size_per_gpu=1, seq_len=1): + """ + Estimate the bits to transfer across the network per iteration. + + For sparse variables, this is an over-estimation as we think all columns corresponded to this batch + is unique. + Args: + batch_size_per_gpu: + seq_len: + + Returns: + integer + """ + s = self.size_to_transfer(batch_size_per_gpu, seq_len) + if self.is_sparse: # IndexSlices: values, indices, dense_shape + bits = s * get_dtype_bits(self.dtype) + \ + batch_size_per_gpu * seq_len * self.size / self.original_size * get_dtype_bits(tf.int64) + \ + 2 * get_dtype_bits(tf.int64) + return bits + else: # Tensor + return s * get_dtype_bits(self.dtype) + + @property + def partitionable_axes(self): + """ + Return the list of available axes that are legitimate to partition along. + + Returns: + List(int) + """ + valid_axes = [] + + # scalar + if not self.shape: + return valid_axes + + # Sparse variable can only be partition along the 0th axis in current implementation. + if self.is_sparse or self.is_embedding: + valid_axes = [0] + return valid_axes + for idx, dim in enumerate(self.shape): + if dim > 1: + valid_axes.append(idx) + return valid_axes + + @property + def byte_size(self): + """ + Return the byte size of the variable. + + Returns: + float + """ + return float(byte_size_load_fn(self.var)) + + @property + def dtype(self): + """ + Return the dtype of the variable. + + Returns: + dtype + """ + return self.var.dtype + + @property + def synchronizer(self): + """ + Return the synchronizer protobuf in the config of this variable. + + Returns: + NodeConfig + """ + if not self._node_config: + raise ValueError('Node config is unset.') + if self._node_config.partitioner: + logging.warning('This variable will be partitioned') + return None + return getattr(self._node_config, self._node_config.WhichOneOf('synchronizer')) + + @property + def group(self): + """ + Return the group in the node config of this variable. + + Returns: + int: group + """ + if not self._node_config: + raise ValueError('Node config is unset.') + if self._node_config.partitioner: + logging.warning('This variable will be partitioned') + return None + return getattr(self.synchronizer, 'group', 0) + + @property + def compressor(self): + """ + Return the compressor in the node config of this variable. + + Returns: + Compressor type. + """ + if not self._node_config: + raise ValueError('Node config is unset.') + if self._node_config.partitioner: + logging.warning('This variable will be partitioned') + return None + return getattr(self.synchronizer, 'compressor', None) + + @property + def reduction_destination(self): + """ + Return the reduction_destination in the node config of this variable. + + Returns: + str. 
+ """ + if not self._node_config: + raise ValueError('Node config is unset.') + if self._node_config.partitioner: + logging.warning('This variable will be partitioned') + return None + return getattr(self.synchronizer, 'reduction_destination', None) + + def device(self, resolver): + device_str = self.reduction_destination if self.reduction_destination else self.var.device + if device_str: + device_str = resolver.resolve_to_device_str(device_str) + return device_str + + @property + def local_replication(self): + """ + Return the local_replication in the node config of this variable. + + Returns: + bool + """ + if not self._node_config: + raise ValueError('Node config is unset.') + if self._node_config.partitioner: + logging.warning('This variable will be partitioned') + return None + return getattr(self.synchronizer, 'local_replication', False) + + +class PartItem(VariableItem): + """Helper class to include meta information about a variable partition.""" + def __init__(self, + var, + graph_item, + part_idx, + pc, + part_config=None): + super(PartItem, self).__init__(var, graph_item, part_config) + + self.part_idx = part_idx + self.pc = pc + + @property + def name(self): + """ + Return the name of this partition. + + Returns: + String + """ + name = '{}/part_{}:0'.format(get_op_name(self.var.name), self.part_idx) + return name + + @property + def partition_str(self): + return self.pc.partition_str + + @property + def shape(self): + """ + Return the shape of this partition. + + Returns: + List(int) + + """ + shape = self.original_shape + if shape: + dim_size = shape[self.pc.axis] // self.pc.num_shards + extras = shape[self.pc.axis] % self.pc.num_shards + if self.part_idx < extras: + dim_size += 1 + shape[self.pc.axis] = dim_size + return shape + + @property + def partitionable_axes(self): + """ + Return the list of available axes that are legitimate to partition along. + + Returns: + None: because this is a partition (not allowed to be partitioned further). + """ + return [] + + @property + def byte_size(self): + """ + Return the byte size of this partition. + + Returns: + float + """ + return float(byte_size_load_fn(self.var)) \ + * float(self.shape[self.pc.axis]) / float(self.original_shape[self.pc.axis]) + + @property + def synchronizer(self): + """ + + Returns: + + """ + if not self._node_config: + raise ValueError('Node config is unset.') + if not self._node_config.partitioner: + raise ValueError('Partitioner field is empty for a variable partition.') + return getattr(self._node_config, self._node_config.WhichOneOf('synchronizer')) + + @property + def group(self): + """ + Return the group in the node config of this variable. + + Returns: + int: group + """ + if not self._node_config: + raise ValueError('Node config is unset.') + if not self._node_config.partitioner: + raise ValueError('Partitioner field is empty for a variable partition.') + return getattr(self.synchronizer, 'group', 0) + + @property + def compressor(self): + """ + Return the compressor in the node config of this variable partition. + + Returns: + Compressor. + """ + if not self._node_config: + raise ValueError('Node config is unset.') + if not self._node_config.partitioner: + raise ValueError('Partitioner field is empty for a variable partition.') + return getattr(self.synchronizer, 'compressor', None) + + @property + def reduction_destination(self): + """ + Return the reduction_destination in the node config of this variable partition. + + Returns: + Reduction destination. 
+ """ + if not self._node_config: + raise ValueError('Node config is unset.') + if not self._node_config.partitioner: + logging.warning('Partitioner field is empty for a variable partition.') + return None + return getattr(self.synchronizer, 'reduction_destination', None) + + @property + def local_replication(self): + """ + Return the local_replication in the node config of this variable partition. + + Returns: + bool + """ + if not self._node_config: + raise ValueError('Node config is unset.') + if not self._node_config.partitioner: + logging.warning('Partitioner field is empty for a variable partition.') + return None + return getattr(self.synchronizer, 'local_replication', False) + + +class ResourceItem: + """ResourceItem. + + Helper class that includes meta information about a resource spec. All addresses are resolved (in TF format). + + TODO(Hao): merge ResourceItem class with ResourceSpec. + """ + + def __init__(self, resource_spec): + self._resource_spec = resource_spec + self._cluster = SSHCluster(resource_spec) + self._device_resolver = DeviceResolver(self._cluster) + + @property + def device_resolver(self): + """Resolver of this resource_spec that resolves an AutoDist device to TF device.""" + return self._device_resolver + + @property + def replicas(self): + """Return the list of replicas in the format of TF device string, e.g. job:worker/task:0/device:gpu:0.""" + device_strs = [k for k, _ in self._resource_spec.devices] + return self._device_resolver.resolve_to_device_str(device_strs) + + @property + def gpu_replicas(self): + """ + Return the list of GPU replicas in the format of TF device string, e.g. job:worker/task:0/device:gpu:0. + + Returns: + List(string) + """ + # device_str is autodist device string, e.g. 192.168.0.1:CPU:0 + device_strs = [k for k, _ in self._resource_spec.gpu_devices] + return self.device_resolver.resolve_to_device_str(device_strs) + + @property + def cpu_replicas(self): + """ + Return the list of CPU replicas in the format of TF device string, e.g. job:worker/task:0/device:cpu:0. + + Returns: + List(string) + """ + device_strs = [k for k, _ in self._resource_spec.cpu_devices] + return self.device_resolver.resolve_to_device_str(device_strs) + + @property + def total_num_gpu_replica(self): + return len(self.gpu_replicas) + + def num_local_gpu_replica_on(self, host): + """ + Return the number of gpu replica on a TF host address, e.g. '/job:worker/task:0/device:CPU:0'. + + Args: + host: TF host address,e .g. '/job:worker/task:0/device:CPU:0' + + Returns: + int + """ + gpu_device_specs = {device_spec.DeviceSpecV2.from_string(d) for d in self.gpu_replicas} + num = 0 + host_device_spec = device_spec.DeviceSpecV2.from_string(host) + for d in gpu_device_specs: + if self._cluster.get_address_from_task(d.job, d.task) \ + == self._cluster.get_address_from_task(host_device_spec.job, host_device_spec.task): + num += 1 + return num + + @property + def max_num_local_gpu_replica(self): + """Return the max number of local gpu replicas on the cluster.""" + return max([self.num_local_gpu_replica_on(host) for host in self.cpu_replicas]) + + @cached_property + def p2p_bandwidth(self): + """Calculates P2P network bandwidth between nodes in the cluster. + + Note that this is NOT a symmetric matrix. 
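+
+        For example (hypothetical numbers), if host A reports a network_bandwidth of 10
+        and host B reports 25, then bw[device_on_A][device_on_B] = 10 * GIGABITS while
+        bw[device_on_B][device_on_A] = 25 * GIGABITS; devices on the same host are
+        assumed to communicate at GPU_TO_CPU_BANDWIDTH (scaled by GIGABITS).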
+ """ + bw = {} # key: (device1, device2) + devices = [device for device, _ in self._resource_spec.devices] + resolved_devices = self.replicas + + for i in range(len(self.replicas)): + ip_i = devices[i].split(':')[0] + d_i = resolved_devices[i] + if d_i not in bw: + bw[d_i] = {} + for j in range(i, len(self.replicas)): + ip_j = devices[j].split(':')[0] + d_j = resolved_devices[j] + if d_j not in bw: + bw[d_j] = {} + if ip_i != ip_j: + bw[d_i][d_j] = GIGABITS * self._resource_spec.network_bandwidth[ip_i] + bw[d_j][d_i] = GIGABITS * self._resource_spec.network_bandwidth[ip_j] + else: + bw[d_i][d_j] = GIGABITS * GPU_TO_CPU_BANDWIDTH + bw[d_j][d_i] = GIGABITS * GPU_TO_CPU_BANDWIDTH + return bw + + @cached_property + def min_bandwidth(self): + """Return the minimum bandwidth (bottleneck) of all p2p connections on this cluster.""" + return min([min(v.values()) for k, v in self.p2p_bandwidth]) diff --git a/autodist/strategy/auto/ps_load_balancer.py b/autodist/strategy/auto/ps_load_balancer.py new file mode 100644 index 0000000..55a3d6e --- /dev/null +++ b/autodist/strategy/auto/ps_load_balancer.py @@ -0,0 +1,109 @@ +# Copyright 2020 Petuum. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""PS load balancers.""" + +from collections import OrderedDict + +import numpy as np + + +def calcuate_entropy(loads): + distribution = loads / np.sum(loads) + distribution = distribution + 1e-4 + entropy = - np.sum(distribution * np.log2(distribution)) + return entropy + + +def greedy_load_balancer(ps_shards, resource_spec, var_helpers, sort_by_size=False): + """ + A greedy load balancer that places the next largest load on the least loaded server. + Args: + ps_shards: + resource_spec: + var_helpers: + sort_by_size: + + Returns: + + """ + # no randomness + assignments = {} + reduction_device_names = [k for k, _ in resource_spec.cpu_devices] + loads = {ps: 0.0 for ps in reduction_device_names} + + sorted_ps_shards = ps_shards + if sort_by_size: + sorted_ps_shards = OrderedDict(sorted(ps_shards.items(), + key=lambda x: var_helpers[x[0]].byte_size, reverse=True)) + + for shard_name in sorted_ps_shards: + sorted_ps = sorted(loads, key=loads.get) + destination = sorted_ps[0] + assignments[shard_name] = destination + loads[destination] += var_helpers[shard_name].byte_size + return assignments + + +def christy_load_balancer(ps_shards, resource_spec, var_helpers, sort_by_size=False): + """ + A randomized greedy load balancer. It places the variable by sampling from a multinomial distribution + correlated with their current load status -- node with least loads will have highest probability being + sampled. 
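+
+    Concretely, each shard samples its destination from softmax(balanced_load - cur_load),
+    where balanced_load_j = total_load * bandwidth_j / total_bandwidth, so servers that are
+    under-loaded relative to their bandwidth share receive a higher probability.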
+ + Args: + ps_shards: + resource_spec: + var_helpers: + sort_by_size: + + Returns: + + """ + # Sample destination based on a distributed calculated based on loads and available bandwidth + reduction_device_names = [k for k, _ in resource_spec.cpu_devices] + loads = {ps: 0.0 for ps in reduction_device_names} + assignments = {} + + loads = sorted(list(loads.items()), key=lambda x: x[0]) + ps = [load[0] for load in loads] + bandwidth = [resource_spec.network_bandwidth[p.split(':')[0]] for p in ps] + total_bandwidth = sum(bandwidth) + cur_loads = [float(load[1]) for load in loads] + + sorted_ps_shards = ps_shards + if sort_by_size: + sorted_ps_shards = OrderedDict(sorted(ps_shards.items(), + key=lambda x: var_helpers[x[0]].byte_size, reverse=True)) + + for shard_name in sorted_ps_shards: + total_load = sum(cur_loads) # + var_load + balanced_loads = [total_load * b / total_bandwidth for b in bandwidth] + space = np.array([balanced_load - cur_load for balanced_load, cur_load in zip(balanced_loads, cur_loads)]) + + # softmax + e_x = np.exp(space - np.max(space)) + accept_prob = e_x / e_x.sum() + + # sample according to current load + des = np.random.choice(ps, 1, p=accept_prob)[0] + assignments[shard_name] = des + + cur_loads[ps.index(des)] += var_helpers[shard_name].byte_size + assert (len(ps_shards) == len(assignments)) + + # entropy = calcuate_entropy(cur_loads) + # best_entropy = calcuate_entropy(balanced_loads) + # print('entropy {} vs. max entropy {}'.format(entropy, best_entropy)) + return assignments diff --git a/autodist/strategy/auto/strategy_sampler.py b/autodist/strategy/auto/strategy_sampler.py new file mode 100644 index 0000000..1ebb76e --- /dev/null +++ b/autodist/strategy/auto/strategy_sampler.py @@ -0,0 +1,570 @@ +# Copyright 2020 Petuum. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Strategy sampler that generates random strategies given model and resource spec.""" + +from collections import OrderedDict + +import numpy as np + +from autodist.kernel.common.utils import get_op_name +from autodist.kernel.partitioner import PartitionerConfig +from autodist.proto import strategy_pb2, synchronizers_pb2 +from autodist.strategy.base import Strategy +from autodist.strategy.auto.item import VariableItem, PartItem +from autodist.strategy.auto.ps_load_balancer import greedy_load_balancer, christy_load_balancer +from autodist.strategy.auto.ar_group_assigner import chunk_group_assigner, christy_group_assigner, \ + ordered_balanced_group_assigner +from autodist.const import MAX_INT32 + + +class RandomStrategySampler(): + """ + Random Strategy Sampler. + + This StrategyBuilder samples a strategy given graph_item and resource_spec. The sampling process is + constrained by `space`, and guided by `heuristics`, both as required arguments of its constructor. + """ + def __init__(self, space, heuristics): + """ + + Args: + space (dict): the strategy space that the random strategy should be drawn from. 
An example of the space + can be found at + heuristics (dict): heuristics used to guide the random sampling process. + """ + if not space: + raise ValueError('Space to perform strategy sampling is not provided.') + if not heuristics: + raise ValueError('Heuristic to guide strategy sampling is not provided.') + self.space = space + self.heuristics = heuristics + + def build(self, graph_item, resource_spec): + """Generate a randomized strategy given model and resource spec.""" + expr = Strategy() + + # number of graph replica is equal to number of GPU devices + expr.graph_config.replicas.extend([k for k, v in resource_spec.gpu_devices]) + variables = graph_item.trainable_var_op_to_var.values() + name_to_item = OrderedDict() + + # Perform MCMC to generate each node configs + node_config = [] + for var in variables: + var_item = VariableItem(var, graph_item) + name_to_item[var_item.name] = var_item + + node = strategy_pb2.Strategy.Node() + node.var_name = var_item.name + + # Step 1: determine whether or not to partition + # TODO(Hao): some factor is not considered, e.g. number of reduction_device_names + maybe_partition = sample_if_partition(var_item, resource_spec, self.space, self.heuristics) + + # Step 2.1: if not partition, sample a synchronizer type for it + if not maybe_partition: # no partition + sample_var_synchronizer(node, var_item, resource_spec, self.space) + else: # Step 2.2: else partition + # Step 2.2.1: sample a partitioner config + pc = sample_partition_config(var_item, resource_spec, self.space, self.heuristics) + node.partitioner = pc.partition_str + + # step 2.2.2: sample a synchronizer type for each partition + parts = [] + for i in range(pc.num_shards): + part = strategy_pb2.Strategy.Node() + part_item = PartItem(var, graph_item, i, pc) + part.var_name = '{}/part_{}:0'.format(get_op_name(var.name), i) + name_to_item[part.var_name] = part_item + parts.append(part) + sample_parts_synchronizers(parts, var_item, resource_spec, self.space, self.heuristics) + node.part_config.extend(parts) + node_config.append(node) + + # Step 3: Post-assign group or placement. + sample_group_and_reduction_destinations(node_config, resource_spec, name_to_item, self.heuristics) + + expr.node_config.extend(node_config) + self._reset() + return expr + + def _reset(self): + """Reset the helpers every time a strategy is sampled.""" + self.helpers = {} + + +def sample_if_partition(var_item, resource_spec, space, heuristics): + """ + Sample a bool value determining whether to partition a variable or not. + + Args: + var_item: the variable item. + resource_spec: the target cluster spec. + space: the space argument controlling where to sample from. + heuristics: the heuristics argument guiding the sampling process. 
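+
+    Example (illustrative numbers): with maybe_partition_bounds = [1e6, 1e8] and
+    maybe_partition_by_size enabled, a 4e6-byte variable is partitioned with probability
+    (4e6 - 1e6) / (1e8 - 1e6), roughly 0.03; variables at or below the lower bound are
+    never partitioned and those at or above the upper bound always are.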
+ + Returns: + Bool + """ + reduction_device_names = [k for k, _ in resource_spec.cpu_devices] + if len(space['maybe_partition']) == 1: + return space['maybe_partition'] + if heuristics['enable_single_node_no_partition'] and len(reduction_device_names) <= 1: + return False + + # intersection of variable's partitonable axis and global constraints + if var_item.partitionable_axes: + if space['partitionable_axes']: + a = set(var_item.partitionable_axes) & set(space['partitionable_axes']) + if len(a) < 1: + return False + else: + return False + + # lower bound for abandoning partitioning + lb = heuristics['maybe_partition_bounds'][0] + ub = heuristics['maybe_partition_bounds'][1] + if var_item.byte_size <= lb: + return False + if var_item.byte_size >= ub: + return True + assert (len(space['maybe_partition']) == 2) + + if heuristics['maybe_partition_by_size']: + # By variable size -- a large variable has a higher chance to be partitioned + # TODO (Hao): MAX_INT32 is too large, reconsider later... + chance = float(var_item.byte_size - lb) / float(ub - lb) + return binary_sample(boundary=chance) + else: + return uniform_sample_by_choices(space['maybe_partition']) + + +def sample_var_synchronizer(node, var_helper, resource_spec, space): + """ + Sample a synchronizer (and all associated aspects) for an unpartitioned variable, + leaving merge_group or reduction_destination as empty. + + Args: + node (strategy_pb2.Strategy.Node): the corresponded node_config to be rewritten. + var_helper (VariableHelper): the variable helper corresponded to the variable. + resource_spec (ResourceSpec): the target cluster spec + space (dict): space. + """ + # We ALWAYS use PS for sparse variables + synchronizer_type = 'PS' if var_helper.var_type == VarType.SPARSE \ + else uniform_sample_by_choices(space['synchronizer_types']) + if synchronizer_type == 'PS': + node.PSSynchronizer.sync = True # we don't consider async at this moment + node.PSSynchronizer.staleness = 0 + node.PSSynchronizer.local_replication = sample_if_local_replication(space['local_replication'], + resource_spec) + else: + # no other option for spec + node.AllReduceSynchronizer.spec = synchronizers_pb2.AllReduceSynchronizer.Spec.Value('AUTO') + node.AllReduceSynchronizer.compressor = \ + synchronizers_pb2.AllReduceSynchronizer.Compressor.Value( + sample_ar_compressor(space['compressor'])) + + +def sample_parts_synchronizers(parts, var_helper, resource_spec, space, heuristics): + """ + Sample synchronizers for all the partitions of a variable. 
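+
+    Sparse variables always use PS for every partition. For dense variables, when
+    heuristics['same_synchronizer_for_parts'] is set, a single synchronizer type is drawn
+    once and shared by all partitions; otherwise each partition draws its own type.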
+ + Args: + parts: + var_helper: + resource_spec: + space: + heuristics: + + Returns: + """ + if var_helper.var_type == VarType.SPARSE: + synchronizer_types = ['PS'] * len(parts) + else: + if heuristics['same_synchronizer_for_parts']: + type = uniform_sample_by_choices(space['synchronizer_types']) + synchronizer_types = [type] * len(parts) + else: + synchronizer_types = [uniform_sample_by_choices(space['synchronizer_types']) + for part in parts] + for i, part in enumerate(parts): + if synchronizer_types[i] == 'PS': + part.PSSynchronizer.sync = True # we don't consider async at this moment + part.PSSynchronizer.staleness = 0 + part.PSSynchronizer.local_replication = sample_if_local_replication(space['local_replication'], + resource_spec) + else: + # no other option for spec + part.AllReduceSynchronizer.spec = synchronizers_pb2.AllReduceSynchronizer.Spec.Value('AUTO') + part.AllReduceSynchronizer.compressor = \ + synchronizers_pb2.AllReduceSynchronizer.Compressor.Value( + sample_ar_compressor(space['compressor'])) + + +def sample_partition_config(var_helper, resource_spec, space, heuristics): + """ + Sample the PartitionerConfig of a variable (that is to be partitioned). + + Args: + var_helper: + resource_spec: + space: + heuristics: + + Returns: + """ + # Arion only support partitioning along one axis -- we first sample a partition axis, + # then sample the number of partitions along that axis, and obtain the partition config. + assert len(var_helper.partitionable_axes) > 0, 'No partition axis available' + # sample partition axis + # TODO(Hao): some heursitics here available? + valid_axis = var_helper.partitionable_axes + if space['partitionable_axes']: + valid_axis = list(set(valid_axis) & set(space['partitionable_axes'])) + partition_axis = uniform_sample_by_choices(valid_axis) + + # sample how many partition to go + num_nodes = resource_spec.num_cpus + dim_size = var_helper.shape[partition_axis] + if heuristics['num_partition_bounds'][1] == 'num_nodes': + max_shards = min(dim_size, num_nodes) + elif isinstance(heuristics['num_partition_bounds'][1], int): + max_shards = min(dim_size, heuristics['num_partition_bounds'][1]) + else: + raise ValueError('unseen num_partition_bounds config') + + min_shards = 2 + if isinstance(heuristics['num_partition_bounds'][0], int): + min_shards = max(min_shards, heuristics['num_partition_bounds'][0]) + elif heuristics['num_partition_bounds'][0] == 'num_nodes': + min_shards = max(min_shards, heuristics['num_partition_bounds'][0]) + else: + raise ValueError('unseen num_partition_bounds config') + + # sample from [min_shards, max_shards] + num_shards = uniform_sample_by_choices(list(range(min_shards, max_shards + 1))) + + # construct a PartitionerConfig (pc) + partition_list = [1] * len(var_helper.shape) + partition_list[partition_axis] = num_shards + pc = PartitionerConfig(partition_list=partition_list) + return pc + + +def sample_if_local_replication(local_replication_space, resource_spec): + """ + Sample whether to perform local replication. + + Local replication is a PS-specific semantic; it represents whether to transfer parameters or updates + via a transfer device. + + Args: + local_replication_space: + resource_spec: + + Returns: + + """ + if resource_spec.num_gpus <= resource_spec.num_cpus: + # meaning every machine has at most 1 GPU + return False + return uniform_sample_by_choices(local_replication_space) + + +def sample_ar_compressor(compressor_space): + """ + Sample the type of the compressor being applied with collective ops. 
+ + Available options include `NoneCompressor`, `HorovodCompressor`, `HorovodCompressorEF`, + `PowerSGDCompressor`, but `HorovodCompressorEF`, `PowerSGDCompressor` will change gradient value. + Args: + compressor_space: + + Returns: + """ + # TODO(Hao): try to use all four options + return uniform_sample_by_choices(compressor_space) + + +def sample_group_and_reduction_destinations(node_config, resource_spec, helpers, heuristics): + """ + Sample the merge group or parameter placement (a.k.a. reduction_destination) after all other semantics + have been determined. + + Args: + node_config: + resource_spec: + helpers: + heuristics: + + Returns: + + """ + ps_shards = OrderedDict() + ar_shards = OrderedDict() + idx = 0 + for node in node_config: + if node.partitioner: + for part in node.part_config: + synchronizer = getattr(part, part.WhichOneof('synchronizer')) + if hasattr(synchronizer, 'compressor'): + ar_shards[part.var_name] = (idx,) + else: + ps_shards[part.var_name] = (idx,) + idx += 1 + else: + synchronizer = getattr(node, node.WhichOneof('synchronizer')) + if hasattr(synchronizer, 'compressor'): + ar_shards[node.var_name] = (idx,) + else: + ps_shards[node.var_name] = (idx,) + idx += 1 + + if len(ps_shards) > 0: + sample_ps_reduction_destinations(node_config, ps_shards, resource_spec, helpers, heuristics) + if len(ar_shards) > 0: + sample_ar_groups(node_config, ar_shards, helpers, heuristics) + + +def sample_ps_reduction_destinations(node_config, ps_shards, resource_spec, helpers, heuristics): + """ + Sample the placement of shared parameter variables (a.k.a. reduction destinations). + + Args: + node_config: + ps_shards: + resource_spec: + helpers: + heuristics: + + Returns: + + """ + load_balancer = heuristics['ps_load_balancer'] + reduction_device_names = [k for k, _ in resource_spec.cpu_devices] + if not load_balancer: + destinations = {} + for shard_name in ps_shards: + destinations[shard_name] = uniform_sample_by_choices(reduction_device_names) + elif load_balancer == 'greedy': + destinations = greedy_load_balancer(ps_shards, resource_spec, helpers) + elif load_balancer == 'christy': + # copy Christy's partitionedPS + destinations = christy_load_balancer(ps_shards, resource_spec, helpers) + elif load_balancer == 'sorted_christy': + destinations = christy_load_balancer(ps_shards, resource_spec, helpers, sort_by_size=True) + elif load_balancer == 'sorted_greedy': + destinations = greedy_load_balancer(ps_shards, resource_spec, helpers, sort_by_size=True) + else: + raise ValueError('Cannot recognize load balancer') + + for shard_name, (idx, ) in ps_shards.items(): + ps_shards[shard_name] = (idx, destinations[shard_name]) + + assign_ps_reduction_destinations(node_config, ps_shards) + + +def assign_ps_reduction_destinations(node_config, ps_shards): + """ + Assign the sampled reduction destinations to node_config. + + Args: + node_config: + ps_shards: + + Returns: + + """ + for node in node_config: + if node.partitioner: + for part in node.part_config: + synchronizer = getattr(part, part.WhichOneof('synchronizer')) + if hasattr(synchronizer, 'reduction_destination'): + synchronizer.reduction_destination = ps_shards[part.var_name][1] + else: + synchronizer = getattr(node, node.WhichOneof('synchronizer')) + if hasattr(synchronizer, 'reduction_destination'): + synchronizer.reduction_destination = ps_shards[node.var_name][1] + + +def sample_ar_groups(node_config, ar_shards, helpers, heuristics): + """ + Sample the group of collective operations. 
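+
+    Depending on heuristics['merge_scheme'], shards are grouped by a fixed or sampled
+    chunk size ('by_chunk'), at random into a sampled number of groups (None/'random'),
+    or by one of the load-aware assigners ('christy', 'ordered_balanced'); the resulting
+    group ids are then written back into each node config.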
+ + Args: + node_config: + ar_shards: + helpers: + heuristics: + + Returns: + + """ + merge_scheme = heuristics['merge_scheme'] + if merge_scheme == 'by_chunk': + if 'chunk_size' in heuristics and heuristics['chunk_size'] > 0: + chunk_size_or_num_group = heuristics['chunk_size'] + else: + chunk_size_or_num_group = sample_chunk_size(len(ar_shards)) + else: + chunk_size_or_num_group = sample_num_ar_groups(ar_shards, + heuristics['num_group_bounds'][0], + heuristics['num_group_bounds'][1]) + assert chunk_size_or_num_group > 0, "chunk_size or num_groups need to > 1..." + + if merge_scheme in ['random', None]: + tmp_assignments = sample_merge_group(chunk_size_or_num_group, len(ar_shards)) + group_assignments = OrderedDict() + for i, shard_name in enumerate(ar_shards): + group_assignments[shard_name] = tmp_assignments[i] + elif merge_scheme == 'by_chunk': + # sample chunk_size + group_assignments = chunk_group_assigner(ar_shards, chunk_size_or_num_group) + elif merge_scheme == 'christy': + group_assignments = christy_group_assigner(ar_shards, + helpers, + chunk_size_or_num_group) + elif merge_scheme == 'ordered_balanced': + group_assignments = ordered_balanced_group_assigner(ar_shards, + helpers, + chunk_size_or_num_group) + else: + raise ValueError('unseen merge scheme..') + + for shard_name, (idx,) in ar_shards.items(): + ar_shards[shard_name] = (idx, group_assignments[shard_name]) + assign_ar_group(node_config, ar_shards) + + +def sample_num_ar_groups(ar_shards, lb, ub): + """ + Sample the number of collective groups. + + Args: + ar_shards: + lb: + ub: + + Returns: + + """ + min_num_group = max(1, lb) + max_num_group = min(len(ar_shards), ub) + num_group = uniform_sample_by_choices(list(range(min_num_group, max_num_group + 1))) + return num_group + + +def sample_chunk_size(num_ar_shards): + """ + Sample the chunk_size if following a chunk-based merge scheme. + + Args: + num_ar_shards: + + Returns: + + """ + chunk_size = uniform_sample_by_choices(list(range(1, num_ar_shards + 1))) + return chunk_size + + +def assign_ar_group(node_config, ar_shards): + """ + Assign the sampled group values to node configs. + + Args: + node_config: + ar_shards: + + Returns: + + """ + for node in node_config: + if node.partitioner: + for part in node.part_config: + synchronizer = getattr(part, part.WhichOneof('synchronizer')) + if hasattr(synchronizer, 'compressor'): + synchronizer.group = ar_shards[part.var_name][1] + else: + synchronizer = getattr(node, node.WhichOneof('synchronizer')) + if hasattr(synchronizer, 'compressor'): + synchronizer.group = ar_shards[node.var_name][1] + + +def uniform_sample_by_choices(choices): + """ + Uniformly sample an option from a list of options. + + Args: + choices (list): a list of values to be sampled from. + + Returns: + choice: the sampled value. 
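+
+    Example:
+        uniform_sample_by_choices(['PS', 'AR']) returns 'PS' or 'AR', each with
+        probability 1/2.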
+ + """ + assert choices + p = np.random.uniform() + t = 1.0 / len(choices) + sample = choices[0] + for i, c in enumerate(choices): + if p < t * (i+1): + sample = c + break + return sample + + +def binary_sample(boundary=0.5): + p = np.random.uniform() + if p < boundary: + return True + else: + return False + + +def sample_merge_group(num_group, num_candidates): + + def is_valid(assignment): + unique_assignment = np.unique(assignment) + if unique_assignment.shape[0] == num_group: + return True + return False + + assignment = np.random.randint(1, num_group+1, [num_candidates]) + while not is_valid(assignment): + assignment = np.random.randint(1, num_group+1, [num_candidates]) + return assignment + + +default_space = { + 'synchronizer_types': ['PS', 'AR'], + 'maybe_partition': [True, False], + 'compressor': ['HorovodCompressor', 'NoneCompressor', 'HorovodCompressorEF'], + 'local_replication': [False], + 'partitionable_axes': [] +} + + +default_heuristics = { + 'ps_load_balancer': None, # None, 'christy', 'greedy', 'LP' + 'merge_scheme': None, # random, by_chunk, christy, ordered_balanced + 'chunk_size': -1, + 'num_group_bounds': [-1, MAX_INT32], + 'maybe_partition_bounds': [0, MAX_INT32], + 'maybe_partition_by_size': None, + 'num_partition_bounds': [2, MAX_INT32], + 'enable_single_node_no_partition': False, + 'same_synchronizer_for_parts': False, +} diff --git a/autodist/strategy/auto_strategy.py b/autodist/strategy/auto_strategy.py new file mode 100644 index 0000000..3b215ee --- /dev/null +++ b/autodist/strategy/auto_strategy.py @@ -0,0 +1,60 @@ +# Copyright 2020 Petuum. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""An AutoStrategy using a trained linear simulator.""" + +from autodist.strategy.auto.base import AutoStrategyBase +from autodist.autosync.simulator.linear_simulator import LinearSimulator + +class AutoStrategy(AutoStrategyBase): + """ + AutoStrategy builder using a trained linear simulator + + It generates a suitable Strategy based on graph_item and resource_spec using a pretrained simulator weight. + This implementation currenlty provides a linear simulator weight trained on > 9000 data points. 
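+
+    At build time, it proposes `num_proposals` candidate strategies, scores each with the
+    simulator, and returns the candidate with the lowest predicted runtime cost.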
+ """ + + def __init__(self): + space = { + 'synchronizer_types': ['PS', 'AR'], + 'maybe_partition': [True, False], + 'compressor': ['HorovodCompressor', 'NoneCompressor'], + 'local_replication': [True, False], + 'partitionable_axis': [], + } + heuristics = { + 'ps_load_balancer': 'sorted_christy', # None, 'christy', 'greedy', 'LP' + 'merge_scheme': 'ordered_balanced', # random, by_chunk, christy, ordered_balanced + 'num_group_bounds': [-1, 20], + 'num_partition_bounds': [2, 40], + 'enable_single_node_no_partition': False, + 'same_synchronizer_for_parts': False, + } + + simulator = LinearSimulator() + + super(AutoStrategy, self).__init__( + space=space, + heuristics=heuristics, + num_proposals=2000, + simulator=simulator + ) + + def build(self, graph_item, resource_spec): + candidates = self.propose_n(graph_item, resource_spec, self._num_proposals) + + # Assess all candidates and simply pick the highest-scored one + features, scores = self._simulator.inference(candidates) + best_index = scores.index(min(scores)) + return candidates[best_index] diff --git a/autodist/strategy/base.py b/autodist/strategy/base.py index 965e1ff..df562da 100644 --- a/autodist/strategy/base.py +++ b/autodist/strategy/base.py @@ -18,6 +18,8 @@ from abc import ABC, abstractmethod from datetime import datetime +from tensorflow.python.framework import tensor_shape + from autodist.const import DEFAULT_SERIALIZATION_DIR from autodist.graph_item import GraphItem from autodist.kernel.common.utils import get_op_name @@ -166,3 +168,37 @@ def compile(self, strategy): if self._device_resolver: strategy = self._resolve_devices(strategy) return strategy + + +def byte_size_load_fn(op): + """ + Load function that computes the byte size of a single-output `Operation`. + + Copied (with modifications) from tensorflow.contrib.training.python.training.device_setter. + + This is intended to be used with `"Variable"` ops, which have a single + `Tensor` output with the contents of the variable. However, it can also be + used for calculating the size of any op that has a single output. + + Intended to be used with `GreedyLoadBalancingStrategy`. + + Args: + op: An `Operation` with a single output, typically a "Variable" op. + + Returns: + The number of bytes in the output `Tensor`. + + Raises: + ValueError: if `op` does not have a single output, or if the shape of the + single output is not fully-defined. + """ + elem_size = op.dtype.size + shape = op.get_shape() + if not shape.is_fully_defined(): + # Due to legacy behavior, scalar "Variable" ops have output Tensors that + # have unknown shape when the op is created (and hence passed to this + # load function for placement), even though the scalar shape is set + # explicitly immediately afterward. 
+ shape = tensor_shape.TensorShape(op.get_attr("shape")) + shape.assert_is_fully_defined() + return shape.num_elements() * elem_size diff --git a/autodist/strategy/partitioned_ps_strategy.py b/autodist/strategy/partitioned_ps_strategy.py index b1259a6..ecca253 100644 --- a/autodist/strategy/partitioned_ps_strategy.py +++ b/autodist/strategy/partitioned_ps_strategy.py @@ -15,13 +15,12 @@ """Partitioned PS StrategyBuilder with Greedy Load Balancer.""" from math import ceil -from tensorflow.python.framework import tensor_shape from autodist.const import ENV from autodist.kernel.common.op_info import CONTROL_FLOW_OPS from autodist.kernel.common.utils import get_consumers, get_op_name from autodist.kernel.partitioner import PartitionerConfig -from autodist.strategy.base import Strategy, StrategyBuilder +from autodist.strategy.base import Strategy, StrategyBuilder, byte_size_load_fn from autodist.proto import strategy_pb2 @@ -133,37 +132,3 @@ def get_num_shards(var): if n % i == 0: return i return n - - -def byte_size_load_fn(op): - """ - Load function that computes the byte size of a single-output `Operation`. - - Copied (with modifications) from tensorflow.contrib.training.python.training.device_setter. - - This is intended to be used with `"Variable"` ops, which have a single - `Tensor` output with the contents of the variable. However, it can also be - used for calculating the size of any op that has a single output. - - Intended to be used with `GreedyLoadBalancingStrategy`. - - Args: - op: An `Operation` with a single output, typically a "Variable" op. - - Returns: - The number of bytes in the output `Tensor`. - - Raises: - ValueError: if `op` does not have a single output, or if the shape of the - single output is not fully-defined. - """ - elem_size = op.dtype.size - shape = op.get_shape() - if not shape.is_fully_defined(): - # Due to legacy behavior, scalar "Variable" ops have output Tensors that - # have unknown shape when the op is created (and hence passed to this - # load function for placement), even though the scalar shape is set - # explicitly immediately afterward. 
- shape = tensor_shape.TensorShape(op.get_attr("shape")) - shape.assert_is_fully_defined() - return shape.num_elements() * elem_size diff --git a/examples/linear_regression.py b/examples/linear_regression.py index d14a3f8..4145626 100644 --- a/examples/linear_regression.py +++ b/examples/linear_regression.py @@ -7,12 +7,14 @@ from autodist import AutoDist from autodist.strategy import PS, PSLoadBalancing, PartitionedPS, AllReduce, Parallax +from autodist.strategy import AutoStrategy resource_spec_file = os.path.join(os.path.dirname(__file__), 'resource_spec.yml') def main(_): - autodist = AutoDist(resource_spec_file, AllReduce(128)) + # autodist = AutoDist(resource_spec_file, AllReduce(128)) + autodist = AutoDist(resource_spec_file, AutoStrategy()) TRUE_W = 3.0 TRUE_b = 2.0 diff --git a/test.py b/test.py new file mode 100644 index 0000000..7c6be13 --- /dev/null +++ b/test.py @@ -0,0 +1,48 @@ +import tensorflow as tf +import autodist + +with tf.Graph().as_default(), autodist.scope(): +########################################################################## + + train_dataset = tf.data.Dataset.from_tensor_slices( + (train_images, train_labels)).repeat(EPOCHS).shuffle(len(train_images)//2).batch(BATCH_SIZE) + + train_iterator = tf.compat.v1.data.make_one_shot_iterator(train_dataset).get_next() + + model = tf.keras.Sequential([ + tf.keras.layers.Conv2D(32, 3, activation='relu'), + tf.keras.layers.MaxPooling2D(), + tf.keras.layers.Flatten(), + tf.keras.layers.Dropout(0.1), + tf.keras.layers.Dense(10, activation='softmax') + ]) + loss_fn = tf.keras.losses.SparseCategoricalCrossentropy() + optimizer = tf.keras.optimizers.SGD() + + def train_step(inputs): + x, y = inputs + y_hat = model(x, training=True) + loss = loss_fn(y, y_hat) + all_vars = [] + for v in model.trainable_variables: + all_vars.append(v) + grads = tf.gradients(loss, all_vars) + update = optimizer.apply_gradients(zip(grads, all_vars)) + + return loss, update + + fetches = train_step(train_iterator) + ##################################################################### + # Change 3: Create distributed session. + # Instead of using the original TensorFlow session for graph execution, + # let's use AutoDist's distributed session, in which a computational + # graph for distributed training is constructed. + # + # [original line] + # >>> sess = tf.compat.v1.Session() + # + sess = autodist.create_distributed_session() + ##################################################################### + for _ in range(min(10, len(train_images) // BATCH_SIZE * EPOCHS)): + loss, _ = sess.run(fetches) + print(f"train_loss: {loss}") \ No newline at end of file diff --git a/tests/test_simulator.py b/tests/test_simulator.py new file mode 100644 index 0000000..7b3d7ed --- /dev/null +++ b/tests/test_simulator.py @@ -0,0 +1,7 @@ +from autodist.resource_spec import ResourceSpec +from autodist.simulator.utils import _resolve_device_address + +from autodist.cluster import SSHCluster +from autodist.kernel.device.resolver import DeviceResolver +from autodist.resource_spec import ResourceSpec +
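+
+
+# A minimal smoke-test sketch; the resource spec path below is a placeholder and the
+# test only checks that AutoDist device strings resolve to TF-style device strings.
+def test_device_resolution():
+    spec = ResourceSpec('resource_spec.yml')
+    cluster = SSHCluster(spec)
+    resolver = DeviceResolver(cluster)
+    devices = [name for name, _ in spec.devices]
+    resolved = resolver.resolve_to_device_str(devices)
+    assert len(resolved) == len(devices)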