From 245a9d8a11459c6e1a9e3c430272b5592f4b8db5 Mon Sep 17 00:00:00 2001 From: Hao Zhang Date: Wed, 15 Jul 2020 16:37:15 -0400 Subject: [PATCH 01/11] initial commit of necessary files --- autodist/search/__init__.py | 0 autodist/search/random_search.py | 336 ++++++ autodist/simulator/__init__.py | 0 autodist/simulator/models/__init__.py | 0 autodist/simulator/models/base.py | 406 +++++++ .../simulator/models/rankrnn_simulator.py | 634 ++++++++++ .../models/rankrnn_simulator_penalty.py | 729 ++++++++++++ .../models/rankrnn_simulator_penalty_fast.py | 1027 +++++++++++++++++ autodist/simulator/test.py | 17 + autodist/simulator/train_linear.py | 123 ++ .../simulator/train_predefined_simulator.py | 343 ++++++ autodist/simulator/utils.py | 342 ++++++ autodist/strategy/auto/ar_group_assigner.py | 57 + autodist/strategy/auto/auto_strategy.py | 0 autodist/strategy/auto/ps_load_balancer.py | 67 ++ autodist/strategy/auto/random_strategy.py | 443 +++++++ 16 files changed, 4524 insertions(+) create mode 100644 autodist/search/__init__.py create mode 100644 autodist/search/random_search.py create mode 100644 autodist/simulator/__init__.py create mode 100644 autodist/simulator/models/__init__.py create mode 100644 autodist/simulator/models/base.py create mode 100644 autodist/simulator/models/rankrnn_simulator.py create mode 100644 autodist/simulator/models/rankrnn_simulator_penalty.py create mode 100644 autodist/simulator/models/rankrnn_simulator_penalty_fast.py create mode 100644 autodist/simulator/test.py create mode 100644 autodist/simulator/train_linear.py create mode 100644 autodist/simulator/train_predefined_simulator.py create mode 100644 autodist/simulator/utils.py create mode 100644 autodist/strategy/auto/ar_group_assigner.py create mode 100644 autodist/strategy/auto/auto_strategy.py create mode 100644 autodist/strategy/auto/ps_load_balancer.py create mode 100644 autodist/strategy/auto/random_strategy.py diff --git a/autodist/search/__init__.py b/autodist/search/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/autodist/search/random_search.py b/autodist/search/random_search.py new file mode 100644 index 0000000..38fcd67 --- /dev/null +++ b/autodist/search/random_search.py @@ -0,0 +1,336 @@ +import json +import time +from multiprocessing import Process, Queue + +import copy +import numpy as np +import os + +from arion.const import DEFAULT_RANDOM_SEARCH_DIR +from arion.graph_item import GraphItem +from arion.resource_spec import ResourceSpec +from arion.strategy import RandomStrategy, AllReduce +from arion.utils import logging + + +def build_worker(queue, builder, gi, rs): + np.random.seed() + ret = builder.build(gi, rs) + queue.put(ret) + +def get_resource_specs(trial_resource_spec_dir): + resource_specs = [] + if os.path.isdir(trial_resource_spec_dir): + for file_name in os.listdir(trial_resource_spec_dir): + file_path = os.path.join(trial_resource_spec_dir, file_name) + if os.path.isfile(file_path) and file_path.endswith('.yml'): + resource_specs.append(file_path) + elif os.path.isfile(trial_resource_spec_dir): + resource_specs.append(trial_resource_spec_dir) + else: + raise ValueError("Cannot find valid files in {}".format(trial_resource_spec_dir)) + return resource_specs + + +def get_strategies(strategies_dir): + strategies = [] + if os.path.isdir(strategies_dir): + for file_name in os.listdir(strategies_dir): + file_path = os.path.join(strategies_dir, file_name) + if os.path.isfile(file_path) and file_path.split('/')[-1].startswith('2020'): + strategies.append(file_path) + 
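+            # Note: this filter assumes serialized strategy files are named by a
+            # timestamp-derived strategy id (see _serialize_candidate_strategies
+            # below, which uses strategy.id as the file name).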
elif os.path.isfile(strategies_dir): + strategies.append(strategies_dir) + else: + raise ValueError("Cannot find valid files in {}".format(strategies_dir)) + return strategies + + +class RandomSearch: + def __init__(self, + space, + heuristics, + search_params, + original_graph_item_path, + resource_file, + simulator=None, + trial_run_fn=None): + + self.space = space + self.heuristics = heuristics + self.search_params = search_params + + self.original_graph_item_path = original_graph_item_path + self.resource_file = resource_file + + self.simulator = simulator + self.trial_run_fn = trial_run_fn + + self._resource_spec = ResourceSpec(self.resource_file) + self._original_graph_item = GraphItem.deserialize(original_graph_item_path) + + def search(self): + # candidates, scores, features = self.propose(self.search_params['num_candidate_explore']) + candidates, scores, features = self.batch_propose(self.search_params['num_candidate_explore']) + n_pick = self.search_params['num_candidate_per_trial'] + + # cast them to be np arrays + if self.search_params['diversity_metric'] == 'embedding': + picked_candidates = self.submodular_pick_by_embedding(np.array(scores), + candidates, + np.stack(features), + n_pick, + self.search_params['simulation_weight'], + self.search_params['diversity_weight']) + elif self.search_params['diversity_metric'] == 'expression': + picked_candidates = self.submodular_pick_by_expression(np.array(scores), + candidates, + n_pick, + self.search_params['simulation_weight'], + self.search_params['diversity_weight']) + else: + raise ValueError('Unrecognized diversity metric...') + if self.trial_run_fn: + self.trial_run(picked_candidates, search_iteration=0) + + def propose(self, num_proposal, use_simulator=True): + builder = RandomStrategy(self.space, self.heuristics) + candidates = [] + features = [] + scores = [] + # np.random.seed(1) + idx = 0 + + while len(candidates) < num_proposal: + logging.info('Sampling strategy {}'.format(idx)) + start_time = time.time() + expr = builder.build(self._original_graph_item, self._resource_spec) + elapsed = time.time() - start_time + logging.info('Sampling strategy takes {}'.format(elapsed)) + builder.reset() + idx += 1 + logging.info('Progress {}/{}'.format(len(candidates), num_proposal)) + if self.simulator and use_simulator: + start_time = time.time() + score, feature = self.simulator.simulate(expr, self._resource_spec) + elapsed = time.time() - start_time + logging.info('Inference strategy takes {}'.format(elapsed)) + if score > self.search_params['rejection_score']: + logging.info('strategy {} has score {} > {}, ' + 'rejected..'.format(idx, score, self.search_params['rejection_score'])) + continue + else: + candidates.append(expr) + features.append(feature) + scores.append(score[0]) + else: + candidates.append(expr) + features.append([]) + scores.append(0) + logging.info('rejection ratio: {}'.format(1 - num_proposal / float(idx))) + return candidates, scores, features + + def batch_propose(self, num_proposal, batch_size=32, use_simulator=True): + + builders = [RandomStrategy(self.space, self.heuristics) for _ in range(batch_size)] + graph_items = [self._original_graph_item for _ in range(batch_size)] + rss = [ResourceSpec(self.resource_file) for _ in range(batch_size)] + candidates = [] + features = [] + scores = [] + # np.random.seed(1) + idx = 0 + + while len(candidates) < num_proposal: + logging.info('Sampling strategy {}'.format(idx)) + start_time = time.time() + + q = Queue() + exprs = [] + prs = [] + for obj, arg1, arg2 in 
zip(builders, graph_items, rss): + prs.append(Process(target=build_worker, args=(q, obj, arg1, arg2))) + prs[-1].start() + for pr in prs: + expr = q.get() # will block + exprs.append(expr) + for pr in prs: + pr.join() + + elapsed = time.time() - start_time + logging.info('Sampling strategy takes {}'.format(elapsed)) + for builder in builders: builder.reset() + logging.info('Progress {}/{}'.format(len(candidates), num_proposal)) + if self.simulator and use_simulator: + start_time = time.time() + batch_score, batch_feature = self.simulator.simulate(exprs, rss) + elapsed = time.time() - start_time + logging.info('Inference strategy takes {}'.format(elapsed)) + for ite, expr in enumerate(exprs): + # print(batch_score[ite], batch_feature[ite].shape) + if batch_score[ite] > self.search_params['rejection_score']: + logging.info('strategy {} has score {} > {}, ' + 'rejected..'.format(idx+ite, batch_score[ite], self.search_params['rejection_score'])) + else: + candidates.append(expr) + features.append(batch_feature[ite]) + scores.append(batch_score[ite]) + else: + for ite, expr in enumerate(exprs): + candidates.append(expr) + features.append([]) + scores.append(0) + idx += batch_size + logging.info('rejection ratio: {}'.format(1 - num_proposal / float(idx))) + return candidates[:num_proposal], scores[:num_proposal], features[:num_proposal] + + def submodular_pick_by_embedding(self, + scores, + candidates, + candidate_features, + n_pick, + beta=1.0, + alpha=1.0): + n = len(scores) + assert n == len(candidate_features) + + ret = [] + sim = np.dot(candidate_features, candidate_features.T) + remain = list(range(len(scores))) + + for _ in range(n_pick): + tmp_delta = -scores[remain] * beta + if len(ret) > 0: + tmp_delta -= alpha * (sim[remain, :][:, ret]).mean(1) + max_x = tmp_delta.argmax() + max_x = remain[max_x] + + ret.append(max_x) + remain.remove(max_x) + + return [candidates[i] for i in ret] + + def submodular_pick_by_expression(self, + scores, + candidates, + n_pick, + beta=1.0, + alpha=1.0): + + def remove_group_or_reduction_destination(strategy): + tmp_strategy = copy.deepcopy(strategy) + for node in tmp_strategy.node_config: + if node.partitioner: + for part in node.part_config: + synchronizer = getattr(part, part.WhichOneof('synchronizer')) + if hasattr(synchronizer, 'reduction_destination'): + synchronizer.reduction_destination = '' + else: + synchronizer.group = 0 + else: + synchronizer = getattr(node, node.WhichOneof('synchronizer')) + if hasattr(synchronizer, 'reduction_destination'): + synchronizer.reduction_destination = '' + else: + synchronizer.group = 0 + return tmp_strategy + + def estimate_difference(strategy, node_config_set): + score = 0 + for i, node in enumerate(strategy.node_config): + if_seen = False + for seen_node in node_config_set[i]: + if seen_node == node: + if_seen = True + break + if not if_seen: + score += 1 + return score + + assert len(scores) == len(candidates) + + node_config_set = [list() for _ in candidates[0].node_config] + remain = list(range(len(scores))) + ret = [] + for _ in range(n_pick): + max_x = -1 + max_delta = -1e9 + max_strategy_copy = None + + for x in remain: + tmp_strategy = remove_group_or_reduction_destination(candidates[x]) + diff_score = estimate_difference(tmp_strategy, node_config_set) + assert(diff_score <= len(tmp_strategy.node_config)) + # print('diff score {}..'.format(diff_score)) + tmp_delta = - scores[x] * beta + diff_score * alpha + if tmp_delta > max_delta: + max_delta, max_x, max_strategy_copy = tmp_delta, x, tmp_strategy + 
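+                    # The picked candidate maximizes
+                    #   tmp_delta = alpha * diff_score - beta * score
+                    # i.e. it trades off the simulator's predicted runtime against
+                    # diversity from the already-selected node configs; the two
+                    # terms are tracked separately below only for the progress printout.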
max_diff_score = diff_score *alpha + max_simulation_score= -scores[x] + + print('Add one candidate with max score: {}, {}, {}'.format(max_simulation_score, max_diff_score, max_delta)) + ret.append(max_x) + remain.remove(max_x) + + # update the node config set + for i, node in enumerate(max_strategy_copy.node_config): + if_seen = False + for seen_node in node_config_set[i]: + if seen_node == node: + if_seen = True + break + if not if_seen: + node_config_set[i].append(node) + + return [candidates[i] for i in ret] + + def trial_run(self, + candidate_strategies=None, + search_iteration=0): + # serialize all candidates to folder + target_dir = os.path.join(DEFAULT_RANDOM_SEARCH_DIR, str(search_iteration)) + os.makedirs(target_dir, exist_ok=False) + self._serialize_candidate_strategies(candidate_strategies, target_dir) + self._save_hyperparams(target_dir) + + # launch trial run + self._launch_trial_run(target_dir) + + @staticmethod + def _serialize_candidate_strategies(candidate_strategies, target_dir): + for strategy in candidate_strategies: + path = os.path.join(target_dir, strategy.id) + strategy.serialize(path) + + def _launch_trial_run(self, strategies_dir): + strategies = get_strategies(strategies_dir) + + # this will launch distributed processes and take very long + self.trial_run_fn([self.resource_file], strategies) + + def _save_hyperparams(self, target_dir): + # copy the constraint file as well + space_file = os.path.join(target_dir, 'space.json') + with open(space_file, 'w') as f: + json.dump(self.space, f) + heuristics_file = os.path.join(target_dir, 'heuristics.json') + with open(heuristics_file, 'w') as f: + json.dump(self.heuristics, f) + search_params_file = os.path.join(target_dir, 'search_params.json') + with open(search_params_file, 'w') as f: + json.dump(self.search_params, f) + + def check_if_visited(self): + raise NotImplementedError() + + def check_if_trial_run(self): + raise NotImplementedError() + + # Don't use, only for debug. 
+ def _single_run(self): + # builder = BalancedPartitionedPS() + # builder = PartitionedAR(chunk_size=1) + builder = AllReduce() + expr = builder.build(self._original_graph_item, self._resource_spec) + logging.info(expr) + self.trial_run([expr], search_iteration=0) diff --git a/autodist/simulator/__init__.py b/autodist/simulator/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/autodist/simulator/models/__init__.py b/autodist/simulator/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/autodist/simulator/models/base.py b/autodist/simulator/models/base.py new file mode 100644 index 0000000..a12c147 --- /dev/null +++ b/autodist/simulator/models/base.py @@ -0,0 +1,406 @@ +"""Strategy Simulator.""" +import time +from collections import defaultdict +import numpy as np + +import tensorflow as tf +from tensorflow.python.client import timeline + +from arion.simulator.utils import NUM_RUNS +from arion.cluster import SSHCluster +from arion.graph_item import GraphItem +from arion.kernel.device.resolver import DeviceResolver +from arion.kernel.partitioner import PartitionerConfig +from arion.proto.synchronizers_pb2 import AllReduceSynchronizer +from arion.resource_spec import ResourceSpec +from arion.strategy.base import Strategy +from arion.simulator.utils import _resolve_device_address, GIGABITS, _max_num_local_replica, _num_local_replica +from arion.strategy.random_sample_strategy import VariableHelper, PartHelper +from arion.simulator.utils import INFINITY + +# tf.compat.v1.disable_eager_execution() + +class Var: + def __init__(self, + name=None, + is_sparse=False, + synchronizer=None, + shape=None, + dtype=None, + device=None, + compressor=None): + self.name = name + self.is_sparse = is_sparse + self.synchronizer = synchronizer + self.shape = shape + self.dtype = dtype + self.device = device + self.compressor = compressor + self.device = device + self.is_partition = False + + self.original_shape = self.shape + + @property + def var_size(self): + size = 1 + if self.shape: + for s in self.shape: + size *= s + return size + + @property + def original_var_size(self): + size = 1 + if self.original_shape: + for s in self.original_shape: + size *= s + return size + + def size_to_transfer(self, batch_size_per_gpu=1, seq_len=1): + if not self.is_sparse: + return self.var_size + else: + if not self.shape: # scalar + return 1 + + emb_size = 1 + if len(self.shape) > 1: + for i in range(1, len(self.original_shape)): + emb_size = emb_size * self.original_shape[i] + + sparse_data_size = batch_size_per_gpu * seq_len * emb_size + + # estimate the embedding of this partition simply using a proportional formula + ret = sparse_data_size * self.var_size / self.original_var_size + return ret + +class Partition(Var): + def __init__(self, + name=None, + is_sparse=False, + synchronizer=None, + shape=None, + dtype=None, + device=None, + compressor=None, + part_id=0, + original_shape=None, + partition_str=None, + num_shards=1): + super(Partition, self).__init__(name, is_sparse, synchronizer, shape, dtype, device, compressor) + self.is_partition = True + self.part_id = part_id + self.partition_str = partition_str + self.original_shape = original_shape + self.num_shards = num_shards + +class Resource: + def __init__(self, cluster, device_resolver, graph_replicas, network_bandwidth, cpu_worker_list, + gpu_worker_list, max_num_local_replica, total_num_local_replica, worker_num_replicas): + self.cluster=cluster + self.device_resolver=device_resolver + 
self.graph_replicas=graph_replicas + self.network_bandwidth=network_bandwidth + self.cpu_worker_list=cpu_worker_list + self.gpu_worker_list=gpu_worker_list + self.max_num_local_replica=max_num_local_replica + self.total_num_local_replica=total_num_local_replica + self.worker_num_replicas=worker_num_replicas + +class SimulatorBase: + """Simulates strategies for a given graph and resource spec.""" + + def __init__(self, original_graph_item_path): + self._original_graph_item_path = original_graph_item_path + self._original_graph_item = GraphItem.deserialize(original_graph_item_path) + # self._resource_file = resource_file + # self._resource_spec = ResourceSpec(resource_file) + # self._cluster = SSHCluster(self._resource_spec) + # self._device_resolver = DeviceResolver(self._cluster) + # + # self._graph_replicas = [_resolve_device_address(k, self._device_resolver) + # for k, v in self._resource_spec.gpu_devices] + # + # # bandwidth + # self._network_bandwidth = self.network_bandwidth(self._resource_spec, self._device_resolver) + # # Other information + # self._cpu_worker_list = [_resolve_device_address(device, self._device_resolver) + # for device, _ in self._resource_spec.cpu_devices] + # self._gpu_worker_list = [_resolve_device_address(device, self._device_resolver) + # for device, _ in self._resource_spec.gpu_devices] + # self._max_num_local_replica = _max_num_local_replica(self._graph_replicas, self._cluster) + # self._total_num_local_replica = len(self._graph_replicas) + # self._worker_num_replicas = [_num_local_replica(cpu_worker, self._graph_replicas, self._cluster) + # for cpu_worker in self._cpu_worker_list] + + def simulate(self, strategy: Strategy, resource_spec: ResourceSpec, checkpoint: str): + """Return simulated runtime value by feeding features to the cost model.""" + raise NotImplementedError() + + def inference(self, inputs, checkpoint): + raise NotImplementedError() + + def load_checkpoint(self, checkpoint): + raise NotImplementedError() + + def save_checkpoint(self, model, checkpoint): + raise NotImplementedError() + + def create_features(self, strategy: Strategy, resource_spec: ResourceSpec): + raise NotImplementedError() + + def extract_pre_feature(self, strategy: Strategy, resource_spec: ResourceSpec): + resource = self.setup_resource(resource_spec) + + name2var = {var.name: var for var_op, var in self._original_graph_item.trainable_var_op_to_var.items()} + + meta = defaultdict() + for node in strategy.node_config: + var_name = node.var_name + # for var_op, var in self._original_graph_item.trainable_var_op_to_var.items(): + # if var.name == var_name: + # break + var = name2var[var_name] + var_helper = VariableHelper(var, self._original_graph_item) + + if node.partitioner: + pc = PartitionerConfig(partition_str=node.partitioner) + for i, part in enumerate(node.part_config): + part_helper = PartHelper(i, var, pc) + synchronizer = getattr(part, part.WhichOneof('synchronizer')) + compressor = getattr(synchronizer, 'compressor', None) + reduction_destination = getattr(synchronizer, 'reduction_destination', None) + device = _resolve_device_address(reduction_destination if reduction_destination else var.device, + resource.device_resolver) + + part_meta = Partition(name=part.var_name, + is_sparse=var_helper.is_sparse, + shape=part_helper.shape, + dtype=var_helper.dtype, + synchronizer=synchronizer, + part_id=i, + num_shards=pc.num_shards, + partition_str=pc.partition_str, + original_shape=var_helper.shape, + compressor=compressor, + device=device) + meta[part_meta.name] = 
part_meta + else: + synchronizer = getattr(node, node.WhichOneof('synchronizer')) + compressor = getattr(synchronizer, 'compressor', None) + reduction_destination = getattr(synchronizer, 'reduction_destination', None) + device = _resolve_device_address(reduction_destination if reduction_destination else var.device, + resource.device_resolver) + + var_meta = Var(name=var_name, + is_sparse=var_helper.is_sparse, + shape=var_helper.shape, + dtype=var_helper.dtype, + synchronizer=synchronizer, + compressor=compressor, + device=device) + meta[var_meta.name] = var_meta + return meta, resource + + def extract_pre_feature_legacy(self, strategy): + """Don't use now!!!""" + meta = defaultdict() + for node in strategy.node_config: + var_name = node.var_name + for var_op, var in self._original_graph_item.trainable_var_op_to_var.items(): + if var.name == var_name: + break + var_op_name = var_op.name + var_helper = VariableHelper(var, self._original_graph_item) + synchronizer = getattr(node, node.WhichOneof('synchronizer')) + compressor = getattr(synchronizer, 'compressor', None) + if compressor is not None: + compressor = AllReduceSynchronizer.Compressor.Name(compressor) + reduction_destinations = getattr(synchronizer, 'reduction_destinations', None) + if not reduction_destinations or len(reduction_destinations) <= 1: + # this variable is not partitioned + device = reduction_destinations[0] if reduction_destinations else var.device + var_meta = Var(name=var_name, + is_sparse=var_helper.is_sparse, + shape=var_helper.shape, + dtype=var_helper.dtype, + synchronizer=synchronizer, + compressor=compressor, + device=device) + meta[var_meta.name] = var_meta + else: + # this variable is partitioned + num_partitions = len(reduction_destinations) + partition_list = [1] * len(var_helper.shape) + partition_list[0] = num_partitions + pc = PartitionerConfig(partition_list=partition_list) + for i, device in enumerate(reduction_destinations): + part_helper = PartHelper(i, var, pc) + part_meta = Partition(name='{}/part_{}:0'.format(var_op_name, i), + is_sparse=var_helper.is_sparse, + shape=part_helper.shape, + dtype=var_helper.dtype, + synchronizer=synchronizer, + part_id=i, + partition_str=pc.partition_str, + original_shape=var_helper.shape, + compressor=compressor, + device=device) + meta[part_meta.name] = part_meta + return meta + + def setup_resource(self, resource_spec: ResourceSpec): + cluster = SSHCluster(resource_spec) + device_resolver = DeviceResolver(cluster) + graph_replicas = [_resolve_device_address(k, device_resolver) for k, v in resource_spec.gpu_devices] + # bandwidth + network_bandwidth = self.network_bandwidth(resource_spec, device_resolver) + # Other information + cpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.cpu_devices] + gpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.gpu_devices] + max_num_local_replica = _max_num_local_replica(graph_replicas, cluster) + total_num_local_replica = len(graph_replicas) + worker_num_replicas = [_num_local_replica(cpu_worker, graph_replicas, cluster) for cpu_worker in cpu_worker_list] + resource = Resource(cluster=cluster, + device_resolver=device_resolver, + graph_replicas=graph_replicas, + network_bandwidth=network_bandwidth, + cpu_worker_list=cpu_worker_list, + gpu_worker_list=gpu_worker_list, + max_num_local_replica=max_num_local_replica, + total_num_local_replica=total_num_local_replica, + worker_num_replicas=worker_num_replicas) + return resource + + 
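+    # Illustrative usage of the helpers above (a sketch only; `MySimulator` is a
+    # hypothetical subclass that implements `simulate`/`create_features`):
+    #
+    #   sim = MySimulator(original_graph_item_path='path/to/original_graph_item')
+    #   meta, resource = sim.extract_pre_feature(strategy, ResourceSpec(resource_file))
+    #   for name, v in meta.items():
+    #       # rough number of elements this variable/partition transmits per step
+    #       n_elems = v.size_to_transfer(batch_size_per_gpu=64, seq_len=1)
+    #       # combine with resource.network_bandwidth to estimate sync cost
+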
@staticmethod + def network_bandwidth(resource_spec: ResourceSpec, device_resolver: DeviceResolver): + """Calculates all P2P network bandwidths between nodes in the cluster.""" + devices = [device for device, _ in resource_spec.devices] + resolved_devices = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.devices] + gpu_cpu_bw = 10000. # hardcode for now + network_bandwidth = {} # key: + for i in range(len(devices)): + if resolved_devices[i] not in network_bandwidth: + network_bandwidth[resolved_devices[i]] = {} + for j in range(i, len(devices)): + if resolved_devices[j] not in network_bandwidth: + network_bandwidth[resolved_devices[j]] = {} + ip_i = devices[i].split(':')[0] + ip_j = devices[j].split(':')[0] + if ip_i != ip_j: + network_bandwidth[resolved_devices[i]][resolved_devices[j]] \ + = GIGABITS * resource_spec.network_bandwidth[ip_i] + network_bandwidth[resolved_devices[j]][resolved_devices[i]] \ + = GIGABITS * resource_spec.network_bandwidth[ip_j] + else: + network_bandwidth[resolved_devices[i]][resolved_devices[j]] = GIGABITS * gpu_cpu_bw + network_bandwidth[resolved_devices[j]][resolved_devices[i]] = GIGABITS * gpu_cpu_bw + + return network_bandwidth + + @staticmethod + def min_bandwitdh(worker_list, bandwidth): + min_bandwidth = INFINITY + num_workers = len(worker_list) + for i in range(num_workers): + for j in range(i, num_workers): + min_bandwidth = min(min_bandwidth, bandwidth[worker_list[j]][worker_list[i]]) + + @property + def original_graph_item_path(self): + return self._original_graph_item_path + + # @property + # def resource_file(self): + # return self._resource_file + + @staticmethod + def calculate_op_timings(fetches): + # Simple implementation. Calculate averaged run time of certain steps. + init_op = tf.compat.v1.initialize_all_variables() + outside_times = [] + + with tf.compat.v1.Session() as sess: + sess.run(init_op) + for i in range(NUM_RUNS): + start = time.time() + sess.run(fetches) + end = time.time() + outside_times.append(end - start) + comp_time_in_sec = np.mean(np.array(outside_times[1:])) + return comp_time_in_sec + + @staticmethod + def profile_on_single_machine(fetches): + # calculate computation time of every op + init_op = tf.compat.v1.initialize_all_variables() + op_name2runtime = defaultdict(list) + outside_times = [] + all_times = [] + + options = tf.compat.v1.RunOptions(trace_level=tf.compat.v1.RunOptions.FULL_TRACE) + run_metadata = tf.compat.v1.RunMetadata() + with tf.compat.v1.Session() as sess: + sess.run(init_op) + for i in range(NUM_RUNS): + start = time.time() * 1000 + sess.run(fetches) + end = time.time() * 1000 + outside_times.append(end - start) + + sess.run(fetches, options=options, run_metadata=run_metadata) + + fetched_timeline = timeline.Timeline(run_metadata.step_stats) + chrome_trace = fetched_timeline.generate_chrome_trace_format() # necessary + for event in fetched_timeline._chrome_trace._events: + # print('\n') + # print(list(event.keys())) + # for key in list(event.keys()): + # print(key, event[key]) + if 'dur' in event: + op_name2runtime[event['args']['name']].append(event['dur']) + # todo: to be more accurate, add tid (thread/lanes id) + + mean_outside_time = np.mean(np.array(outside_times[1:])) + print('mean outside_times: ', mean_outside_time) + print(outside_times) + # print('average all_times: ', np.mean(np.array(all_times))) + + op_name2meanruntime = {} + for op_name, runtimes in op_name2runtime.items(): + runtimes = np.array(runtimes) + if len(runtimes) > 1: # Do not compute operations 
that only run once for all steps. + mean = np.mean(np.array(runtimes[1:])) + op_name2meanruntime[op_name] = mean + print(op_name, mean) + # print(op_name2runtime[op_name]) + + total_op_time = sum([mean_runtime for op_name, mean_runtime in op_name2meanruntime.items()]) + print('total_op_time', total_op_time / 1000.) + # total_op_time = [sum([runtime[i] for op_name, runtime in op_name2runtime.items()]) + # for i in range(self.num_runs)] + # print('total_op_time', np.mean(np.array(total_op_time)), total_op_time) + + return mean_outside_time + + # @staticmethod + # def _calculate_op_timings(graph_item: GraphItem): + # """ + # Given a graph, calculates an expected running time for each (op, input_size) pair. + # + # Args: + # graph_item (GraphItem): The input graph. + # + # Returns: + # Dict mapping (op, input_size) to time. + # """ + # all_ops = {} + # for op in graph_item.graph.get_operations(): + # input_shapes = tuple((tuple(inp.shape.dims) for inp in op.inputs)) + # op_type = op.type + # all_ops[(op_type, input_shapes)] = ops.Graph() + # + # for ((op, shape), graph) in all_ops.items(): + # with graph.as_default(): + # getattr(tensorflow.raw_ops, op) diff --git a/autodist/simulator/models/rankrnn_simulator.py b/autodist/simulator/models/rankrnn_simulator.py new file mode 100644 index 0000000..4459515 --- /dev/null +++ b/autodist/simulator/models/rankrnn_simulator.py @@ -0,0 +1,634 @@ +"""Strategy RankNetSimulator.""" +import glob +import json +import sys +from datetime import datetime +from pathlib import Path +from string import digits + +import numpy as np +import os +import tensorflow as tf +tf.compat.v1.disable_eager_execution() + +import arion +from arion.graph_item import GraphItem +from arion.proto.synchronizers_pb2 import PSSynchronizer, AllReduceSynchronizer +from arion.simulator.models.base import SimulatorBase +from arion.simulator.utils import get_dense_var_bits, get_sparse_var_bits, GIGABITS +from arion.simulator.utils import _resolve_device_address, _max_num_local_replica, _num_local_replica +from arion.strategy.random_sample_strategy import VariableHelper, PartHelper +from arion.strategy.base import Strategy +from arion.resource_spec import ResourceSpec +from arion.cluster import SSHCluster +from arion.kernel.device.resolver import DeviceResolver +from arion.kernel.partitioner import PartitionerConfig +from arion.simulator.models.predefined_simulator import PredefinedSimulator + +import torch +import torch.nn as nn + +TORCH_DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + +# feature settings +MAX_NUM_WORKERS = 16 +MAX_NUM_GROUPS = 600 +MAX_NUM_VARS = 500 +MAX_NUM_PARS = 1500 + +# model size +FEATURE_SIZE = MAX_NUM_WORKERS+MAX_NUM_GROUPS+15 +PARTITION_MLP_HIDDEN = 128 +PARTITION_MLP_OUT = 32 +STEM_RNN_HIDDEN = 128 +BIDIECTIONAL = True +NUM_RNN_LAYERS = 3 + +# trainer setting +BATCH_SIZE = 64 +LR = 3e-4 +WD = 3e-4 + +GRAPH_ITEM_PATHS = {'ncf':'/users/hzhang2/projects/pycharm/zhijie/5-5-2020/original_graph_item', + 'densenet121': '/users/hzhang2/projects/pycharm/zhijie/graph_items/densenet121_original_graph_item', + 'inceptionv3': '/users/hzhang2/projects/pycharm/zhijie/graph_items/inceptionv3_original_graph_item', + 'resnet101': '/users/hzhang2/projects/pycharm/zhijie/graph_items/resnet101_original_graph_item', + 'resnet50': '/users/hzhang2/projects/pycharm/zhijie/graph_items/resnet50_original_graph_item', + 'vgg16': '/users/hzhang2/projects/pycharm/zhijie/graph_items/vgg16_original_graph_item', + 'bert_12l': 
'/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_12l', + 'bert_6l': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_6l', + 'bert_3l': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_3l', + 'bert_large': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_large'} + +def get_model(path_): + if 'densenet121' in path_: + return 'densenet121' + elif 'ncf' in path_: + return 'ncf' + elif 'inceptionv3' in path_: + return 'inceptionv3' + elif 'resnet101' in path_: + return 'resnet101' + elif 'resnet50' in path_: + return 'resnet50' + elif 'vgg16' in path_: + return 'vgg16' + elif 'bert' in path_ and '12l' in path_: + return 'bert_12l' + elif 'bert' in path_ and '6l' in path_: + return 'bert_6l' + elif 'bert' in path_ and '3l' in path_: + return 'bert_3l' + elif 'bert' in path_ and 'large' in path_: + return 'bert_large' + else: + return None + +class RankRNN(nn.Module): + def __init__(self, input_size=FEATURE_SIZE, + partition_mlp_hidden=PARTITION_MLP_HIDDEN, + partition_mlp_out=PARTITION_MLP_OUT, + stem_rnn_hidden=STEM_RNN_HIDDEN, + num_rnn_layers=NUM_RNN_LAYERS, + bidirectional=BIDIECTIONAL): + super(RankRNN, self).__init__() + self.partition_mlp_out = partition_mlp_out + # self.num_rnn_layers = num_rnn_layers + self.stem_rnn_hidden = stem_rnn_hidden + self.partition_mlp = nn.Sequential(nn.Linear(input_size, partition_mlp_hidden), + nn.ReLU(), + # nn.Linear(partition_mlp_hidden, partition_mlp_hidden), + # nn.ReLU(), + nn.Linear(partition_mlp_hidden, partition_mlp_out), + ) + + self.stem_rnn = nn.LSTM(partition_mlp_out, stem_rnn_hidden, num_rnn_layers, batch_first=True, bidirectional=bidirectional) + self.final_fc = nn.Linear(stem_rnn_hidden*num_rnn_layers*(1+int(bidirectional)), 1) + + self.relu = nn.ReLU() + + def forward(self, features, par_indices, var_nums): + + x = features.float() + # x = torch.cat([features[:, :, :MAX_NUM_WORKERS], features[:, :, MAX_NUM_WORKERS+MAX_NUM_GROUPS:]], 2).float() + x = self.partition_mlp(x) + + x1 = torch.zeros(features.shape[0], MAX_NUM_VARS, self.partition_mlp_out, device=TORCH_DEVICE, dtype=x.dtype) + x1.scatter_add_(1, par_indices.long()[:, :, None].expand(par_indices.shape[0], par_indices.shape[1], self.partition_mlp_out), x) + + # Set initial hidden and cell states + # h0 = torch.zeros(self.num_rnn_layers, x.size(0), self.stem_rnn_hidden).to(TORCH_DEVICE) + # c0 = torch.zeros(self.num_rnn_layers, x.size(0), self.stem_rnn_hidden).to(TORCH_DEVICE) + + # Forward propagate LSTM + x1 = torch.nn.utils.rnn.pack_padded_sequence(x1, var_nums.long(), batch_first=True, enforce_sorted=False) + out, (ht, ct) = self.stem_rnn(x1) # out: tensor of shape (batch_size, seq_length, hidden_size) + + # out = torch.nn.utils.rnn.pad_packed_sequence(out, batch_first=True)[0].sum(1) / var_nums[:, None] + out = ht.permute(1, 0, 2).reshape(x.shape[0], -1) + # print(out[0, var_nums[0] -1, [3]], out[0, var_nums[0], [3]]) + # print(ht.permute(1, 0, 2).shape, x.shape) + out = self.final_fc(out) + return out + +class TrainTensorDataset(torch.utils.data.Dataset): + """TensorDataset with support of transforms. 
+ """ + def __init__(self, tensors): + assert all(tensors[0].size(0) == tensor.size(0) for tensor in tensors) + self.tensors = tensors + + def __getitem__(self, index): + x = self.tensors[0][index] + x = self.perturbe_device_and_group(x) + x1 = self.tensors[1][index] + x2 = self.tensors[2][index] + + y = self.tensors[3][index] + + return x, x1, x2, y + + def __len__(self): + return self.tensors[0].size(0) + + def perturbe_device_and_group(self, x): + # perturbed_device_ids = np.random.permutation(MAX_NUM_WORKERS).astype(np.int32) + # perturbed_group_ids = np.random.permutation(MAX_NUM_GROUPS).astype(np.int32) + # mat_device = torch.eye(MAX_NUM_WORKERS, device=x.device, dtype=x.dtype)[perturbed_device_ids] + # mat_group = torch.eye(MAX_NUM_GROUPS, device=x.device, dtype=x.dtype)[perturbed_group_ids] + # x = torch.cat([torch.matmul(x[:, :MAX_NUM_WORKERS], mat_device), torch.matmul(x[:, MAX_NUM_WORKERS:MAX_NUM_WORKERS+MAX_NUM_GROUPS], mat_group), x[:, MAX_NUM_WORKERS+MAX_NUM_GROUPS:]], 1) + return x + + +def to_numpy(synchronizer, device, size_ratio, is_sparse, bd, num_replicas): + ret = [np.zeros(MAX_NUM_WORKERS), np.zeros(MAX_NUM_GROUPS), np.zeros(3), np.zeros(5), np.zeros(3)] + + if device is not None: + ret[0][device] = 1 + + group = getattr(synchronizer, 'group', None) + if group is not None: + assert group < MAX_NUM_GROUPS, group + ret[1][group] = 1 + + compressor = getattr(synchronizer, 'compressor', None) + if compressor is not None: + if compressor in ["PowerSGDCompressor", 3]: + ret[2][2] = 1 + elif compressor in ["HorovodCompressorEF", "HorovodCompressor", 2, 1]: + ret[2][1] = 1 + elif compressor in ["NoneCompressor", 0]: + ret[2][0] = 1 + else: + raise ValueError('Compressor does not exist: {}'.format(compressor)) + + local_replication = getattr(synchronizer, 'local_replication', None) + if isinstance(synchronizer, PSSynchronizer): + synchronizer = 0 + if int(local_replication) == 0: + if int(is_sparse) == 0: + ret[3][0] = 1 + else: + ret[3][1] = 1 + else: + if int(is_sparse) == 0: + ret[3][2] = 1 + else: + ret[3][3] = 1 + else: + ret[3][4] = 1 + ret[4] = np.array([size_ratio, bd, num_replicas]) + + return np.concatenate(ret) + +def connvert_feature(strategy, resource_spec, graph_item): + + cluster = SSHCluster(resource_spec) + device_resolver = DeviceResolver(cluster) + graph_replicas = [_resolve_device_address(k, device_resolver) for k, v in resource_spec.gpu_devices] + # bandwidth + network_bandwidth = np.array([resource_spec.network_bandwidth[device.split(':')[0]] for device, _ in resource_spec.cpu_devices]) + network_bandwidth = network_bandwidth + min_network_bandwidth = network_bandwidth.min() + # Other information + cpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.cpu_devices] + gpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.gpu_devices] + max_num_local_replica = _max_num_local_replica(graph_replicas, cluster) + total_num_local_replica = len(graph_replicas) + worker_num_replicas = [_num_local_replica(cpu_worker, graph_replicas, cluster) for cpu_worker in cpu_worker_list] + + num_vars = 0 + total_size_vars = 0 + for var_op, var in graph_item.trainable_var_op_to_var.items(): + num_vars += 1 + if var.initial_value.shape.ndims: + var_helper = VariableHelper(var, graph_item) + if var_helper.is_sparse: + total_size_vars += get_sparse_var_bits(np.prod(var_helper.shape)) + else: + total_size_vars += get_dense_var_bits(np.prod(var_helper.shape), var.dtype) + assert num_vars < 
MAX_NUM_VARS, num_vars + var_partition_features = np.zeros((MAX_NUM_PARS, FEATURE_SIZE-4)).astype(np.float32) + partition_indice = np.ones(MAX_NUM_PARS).astype(np.float32) * (MAX_NUM_VARS - 1) + + cnt = 0 + for node_id, node in enumerate(strategy.node_config): + var_name = node.var_name + for var_op, var in graph_item.trainable_var_op_to_var.items(): + if var.name == var_name: + break + var_helper = VariableHelper(var, graph_item) + + if node.partitioner: + pc = PartitionerConfig(partition_str=node.partitioner) + for i, part in enumerate(node.part_config): + part_helper = PartHelper(i, var, pc) + synchronizer = getattr(part, part.WhichOneof('synchronizer')) + reduction_destination = getattr(synchronizer, 'reduction_destination', None) + device = _resolve_device_address(reduction_destination if reduction_destination else var.device, + device_resolver) + if device == '': + assert(isinstance(synchronizer, AllReduceSynchronizer)) + device = None + bd = min_network_bandwidth + num_replicas = 0 + else: + device = cpu_worker_list.index(device) + bd = network_bandwidth[device] + num_replicas = worker_num_replicas[device] + + if var_helper.is_sparse: + size_ratio = get_sparse_var_bits(np.prod(part_helper.shape))/total_size_vars + else: + size_ratio = get_dense_var_bits(np.prod(part_helper.shape), var_helper.dtype)/total_size_vars + var_partition_features[cnt] = to_numpy(synchronizer, device, size_ratio, var_helper.is_sparse, bd, num_replicas) + partition_indice[cnt] = node_id + cnt += 1 + else: + synchronizer = getattr(node, node.WhichOneof('synchronizer')) + reduction_destination = getattr(synchronizer, 'reduction_destination', None) + device = _resolve_device_address(reduction_destination if reduction_destination else var.device, + device_resolver) + if device == '': + assert(isinstance(synchronizer, AllReduceSynchronizer)) + device = None + bd = min_network_bandwidth + num_replicas = 0 + else: + device = cpu_worker_list.index(device) + bd = network_bandwidth[device] + num_replicas = worker_num_replicas[device] + + if var_helper.is_sparse: + size_ratio = get_sparse_var_bits(np.prod(var_helper.shape))/total_size_vars + else: + size_ratio = get_dense_var_bits(np.prod(var_helper.shape), var_helper.dtype)/total_size_vars + var_partition_features[cnt] = to_numpy(synchronizer, device, size_ratio, var_helper.is_sparse, bd, num_replicas) + partition_indice[cnt] = node_id + cnt += 1 + return var_partition_features, partition_indice, np.array(node_id+1) + +def create_predefined_features(strategy, resource_spec, predefined_simulator): + + var_sync_time, vars, resource = predefined_simulator.predefined_sync_time(strategy, resource_spec) + + features = [] + for var_name, sync_time in var_sync_time.items(): + if isinstance(sync_time, list) or isinstance(sync_time, tuple): # send_time, receive_time in PS strategies. 
+ transmission = sync_time[0]['transmission'] + sync_time[1]['transmission'] + sync_time = sync_time[0] + is_ps = True + else: # AR + transmission = sync_time['transmission'] + is_ps = False + + network_overhead = sync_time['network_overhead'] + gpu_kernel_memory_latency = sync_time['gpu_kernel_memory_latency'] + + feat = [transmission, network_overhead, gpu_kernel_memory_latency, float(is_ps)] + features.append(feat) + features = np.array(features, dtype=np.float) + return features + +class RankRNNSimulator(SimulatorBase): + """Simulates strategies for a given graph and resource spec.""" + + def __init__(self, + original_graph_item_path, + fetches=None, + batch_size=1, + seq_len=1, + checkpoint=None): + + super(RankRNNSimulator, self).__init__(original_graph_item_path=original_graph_item_path) + print("It's using RankNet simulator.") + self._fetches = fetches + self._batch_size_per_gpu = batch_size + self._seq_len = seq_len + self._checkpoint = checkpoint + self._predefined_simulator=PredefinedSimulator(original_graph_item_path=original_graph_item_path, + batch_size=self._batch_size_per_gpu, + seq_len=self._seq_len) + if self._checkpoint: + self._model = RankRNN().to(TORCH_DEVICE) + self._model.load_state_dict(torch.load(self._checkpoint, map_location=torch.device('cpu'))) + + def simulate(self, strategy, resource_spec, strategy_path=None, checkpoint=None): + cost = self.predict(strategy, resource_spec, strategy_path, checkpoint) + return cost + + def predict(self, + strategy, + resource_spec, + strategy_path=None, + checkpoint=None): + if checkpoint is None: + if self._checkpoint is None: + raise ValueError("checkpoint is None: {}".format(checkpoint)) + else: + model = self._model + else: + model = RankRNN().to(TORCH_DEVICE) + model.load_state_dict(torch.load(checkpoint)) + if strategy_path and os.path.isfile((strategy_path+'.npz').replace('strategies', 'npz')): + loaded = np.load((strategy_path+'.npz').replace('strategies', 'npz')) + var_partition_features, partition_indice, var_num, _ = \ + loaded['x1'], loaded['x2'], loaded['x3'], loaded['y'] + else: + var_partition_features, partition_indice, var_num = \ + connvert_feature(strategy, resource_spec, self._original_graph_item) + + if strategy_path and os.path.isfile((strategy_path+'_pdf.npz').replace('strategies', 'npz')): + loaded = np.load((strategy_path+'_pdf.npz').replace('strategies', 'npz')) + predefined_features = loaded['x4'] + else: + predefined_features = create_predefined_features(strategy, resource_spec, self._predefined_simulator) + + var_partition_features = np.concatenate([var_partition_features, np.concatenate([predefined_features, np.zeros((MAX_NUM_PARS-predefined_features.shape[0], predefined_features.shape[1]))], 0)], 1) + + var_partition_features = torch.from_numpy(var_partition_features).unsqueeze(0).to(TORCH_DEVICE) + partition_indice = torch.from_numpy(partition_indice).unsqueeze(0).to(TORCH_DEVICE) + var_num = torch.from_numpy(var_num).unsqueeze(0).to(TORCH_DEVICE) + + return model(var_partition_features, partition_indice, var_num).view(-1).data.cpu().numpy() + +class RankNetTrainer(): + + def __init__(self, + checkpoint=None, + batch_size_per_gpu=256, + seq_len=1, + seed=1): + self._batch_size_per_gpu = batch_size_per_gpu + self._seq_len = seq_len + self.graph_items = {k:GraphItem.deserialize(v) for k, v in GRAPH_ITEM_PATHS.items()} + self.predefined_simulators = {k: PredefinedSimulator(original_graph_item_path=v, + batch_size=self._batch_size_per_gpu, + seq_len=self._seq_len) for k, v in 
GRAPH_ITEM_PATHS.items()} + self.model = RankRNN().to(TORCH_DEVICE) + if checkpoint: + self.model.load_state_dict(torch.load(checkpoint)) + self.optimizer = torch.optim.Adam(self.model.parameters(), lr=LR, weight_decay=WD) + print("It's using RankNet trainer.") + + def train(self, path_list, train_patterns=[('ncf', 0)], valid_patterns='same', num_epochs=200): + + features = {k: [[[], [], [], []], [[], [], [], []]] for k, _ in GRAPH_ITEM_PATHS.items()} + for training_path in path_list: + for path in Path(training_path).rglob('strategies'): + strategy_paths = glob.glob(os.path.join(path, '*')) + for strategy_path in strategy_paths: + if 'json' in strategy_path or \ + 'bert_large_batch_8_orca_16_group_2/' in strategy_path: + continue + model = get_model(strategy_path) + if model is None: + if not ('densenets169' in strategy_path or 'densenets201' in strategy_path): + assert False, strategy_path + continue + rs_path = strategy_path.replace('strategies', 'resource_specs') + runtime_path = strategy_path.replace('strategies', 'runtimes') + npz_path = (strategy_path+'.npz').replace('strategies', 'npz') + if not os.path.isfile(rs_path): + rs_path += '.yml' + if not (os.path.isfile(rs_path) and os.path.isfile(runtime_path)): + continue + if not os.path.exists(os.path.dirname(npz_path)): + os.makedirs(os.path.dirname(npz_path)) + + if not os.path.isfile(npz_path): + strategy = Strategy.deserialize(path=strategy_path) + rs = ResourceSpec(resource_file=rs_path) + var_partition_features, partition_indice, var_num = \ + connvert_feature(strategy, rs, self.graph_items[model]) + label = np.array(json.load(open(runtime_path))['average']) + np.savez_compressed(npz_path, x1=var_partition_features, x2=partition_indice, x3=var_num, y=label) + else: + loaded = np.load(npz_path) + var_partition_features, partition_indice, var_num, label = \ + loaded['x1'], loaded['x2'], loaded['x3'], loaded['y'] + + if not os.path.isfile(npz_path.replace('.npz', '_pdf.npz')): + predefined_features = create_predefined_features(Strategy.deserialize(path=strategy_path), ResourceSpec(resource_file=rs_path), self.predefined_simulators[model]) + np.savez_compressed(npz_path.replace('.npz', '_pdf.npz'), x4=predefined_features) + else: + loaded = np.load(npz_path.replace('.npz', '_pdf.npz')) + predefined_features = loaded['x4'] + var_partition_features = np.concatenate([var_partition_features, np.concatenate([predefined_features, np.zeros((MAX_NUM_PARS-predefined_features.shape[0], predefined_features.shape[1]))], 0)], 1) + + is_aws = int('g3' in strategy_path or 'g4' in strategy_path or 'aws' in strategy_path or 'vgg_random_orca_11' in strategy_path) # comment here + print(model, 'orca' if is_aws == 0 else 'aws', strategy_path.split('/')[-3]) + features[model][is_aws][0].append(var_partition_features) + features[model][is_aws][1].append(partition_indice) + features[model][is_aws][2].append(var_num) + features[model][is_aws][3].append(label) + + for k, _ in GRAPH_ITEM_PATHS.items(): + for i1 in range(2): + for i2 in range(4): + if len(features[k][i1][i2]) > 1: + features[k][i1][i2] = np.stack(features[k][i1][i2]).astype(np.float16) + print(k, 'orca' if i1 == 0 else 'aws', features[k][i1][i2].shape) + else: + features[k][i1][i2] = None + + train_features = np.concatenate([features[model_][is_aws_][0] for model_, is_aws_ in train_patterns if features[model_][is_aws_][0] is not None], 0) + train_par_indices = np.concatenate([features[model_][is_aws_][1] for model_, is_aws_ in train_patterns if features[model_][is_aws_][1] is not None], 0) + 
train_var_nums = np.concatenate([features[model_][is_aws_][2] for model_, is_aws_ in train_patterns if features[model_][is_aws_][2] is not None], 0) + train_labels = np.concatenate([features[model_][is_aws_][3] for model_, is_aws_ in train_patterns if features[model_][is_aws_][3] is not None], 0) + + if type(valid_patterns) == str and valid_patterns == 'same': + permt = np.random.permutation(train_features.shape[0]) + split = int(len(permt) * 0.8) + val_features, val_par_indices, val_var_nums, val_labels = train_features[permt[split:]], train_par_indices[permt[split:]], train_var_nums[permt[split:]], train_labels[permt[split:]] + train_features, train_par_indices, train_var_nums, train_labels = train_features[permt[:split]], train_par_indices[permt[:split]], train_var_nums[permt[:split]], train_labels[permt[:split]] + else: + val_features = np.concatenate([features[model_][is_aws_][0] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][0] is not None], 0) + val_par_indices = np.concatenate([features[model_][is_aws_][1] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][1] is not None], 0) + val_var_nums = np.concatenate([features[model_][is_aws_][2] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][2] is not None], 0) + val_labels = np.concatenate([features[model_][is_aws_][3] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][3] is not None], 0) + + # comment here + permt = np.random.permutation(val_features.shape[0]) + split = int(len(permt) * 0.7) + train_features, train_par_indices, train_var_nums, train_labels = np.concatenate([train_features, val_features[permt[:split]]], 0), np.concatenate([train_par_indices, val_par_indices[permt[:split]]], 0), np.concatenate([train_var_nums, val_var_nums[permt[:split]]], 0), np.concatenate([train_labels, val_labels[permt[:split]]], 0) + + val_features, val_par_indices, val_var_nums, val_labels = val_features[permt[split:]], val_par_indices[permt[split:]], val_var_nums[permt[split:]], val_labels[permt[split:]] + + print(train_features.shape, val_features.shape, train_features.max(), train_features.min(), val_features.max(), val_features.min()) + + ## train the model + trainset = TrainTensorDataset((torch.from_numpy(train_features).half().to(TORCH_DEVICE), torch.from_numpy(train_par_indices).half().to(TORCH_DEVICE), torch.from_numpy(train_var_nums).half().to(TORCH_DEVICE), torch.from_numpy(train_labels).half().to(TORCH_DEVICE))) + testset = torch.utils.data.TensorDataset(torch.from_numpy(val_features).half().to(TORCH_DEVICE), torch.from_numpy(val_par_indices).half().to(TORCH_DEVICE), torch.from_numpy(val_var_nums).half().to(TORCH_DEVICE), torch.from_numpy(val_labels).half().to(TORCH_DEVICE)) + trainloader = torch.utils.data.DataLoader(dataset=trainset, + batch_size=BATCH_SIZE, + shuffle=True) + testloader = torch.utils.data.DataLoader(dataset=testset, + batch_size=32, + shuffle=False) + best_val_acc = 0. + checkpoint_path = 'model_train_on_{}-{}_new.ckpt'.format(train_patterns[0][0], 'orca' if train_patterns[0][1] == 0 else 'aws') + for epoch in range(num_epochs): + if epoch == int(num_epochs*2./5. - 1): + for param_group in self.optimizer.param_groups: param_group['lr'] = 3e-4 + if epoch == int(num_epochs*4./5. 
- 1): + for param_group in self.optimizer.param_groups: param_group['lr'] = 1e-4 + + labels = [] + outputs = [] + for i, (features_b, par_indices_b, var_nums_b, labels_b) in enumerate(trainloader): + + # Forward pass + outputs_b = self.model(features_b, par_indices_b, var_nums_b).squeeze() + + true_comp = (labels_b[:, None] > labels_b[None, :]).float() * 2 - 1 + pred_comp = outputs_b[:, None] - outputs_b[None, :] + loss = (1 - true_comp) * pred_comp / 2 + torch.nn.functional.softplus(-pred_comp) + loss = loss.tril(-1).mean() + + # Backward and optimize + self.optimizer.zero_grad() + loss.backward() + torch.nn.utils.clip_grad_norm_(self.model.stem_rnn.parameters(), 0.25) + self.optimizer.step() + + outputs.append(outputs_b) + labels.append(labels_b) + + labels = torch.cat(labels) + outputs = torch.cat(outputs) + true_comp = (labels[:, None] > labels[None, :]) + pred_comp = (outputs[:, None] > outputs[None, :]) + equal = (true_comp == pred_comp).int() + train_acc = equal.tril(-1).sum() * 2. /float(equal.shape[0])/(float(equal.shape[0]) - 1) + + with torch.no_grad(): + labels = [] + outputs = [] + for features_b, par_indices_b, var_nums_b, labels_b in testloader: + + # Forward pass + outputs_b = self.model(features_b, par_indices_b, var_nums_b).squeeze() + outputs.append(outputs_b) + labels.append(labels_b) + + labels = torch.cat(labels) + outputs = torch.cat(outputs) + true_comp = (labels[:, None] > labels[None, :]) + pred_comp = (outputs[:, None] > outputs[None, :]) + equal = (true_comp == pred_comp).int() + acc = equal.tril(-1).sum() * 2. /float(equal.shape[0])/(float(equal.shape[0]) - 1) + if acc.item() > best_val_acc: + best_val_acc = acc.item() + torch.save(self.model.state_dict(), checkpoint_path) + print('Saved model to {}'.format(checkpoint_path)) + print('Epoch: {}, training acc: {:.4f}, test acc: {:.4f}, best acc: {:.4f}'.format(epoch, train_acc.item(), acc.item(), best_val_acc)) + return checkpoint_path + + +if __name__ == '__main__': + + trainer = RankNetTrainer() + checkpoint_path = trainer.train( + [ + # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-11-20', + # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-9-20', + # '/users/hzhang2/oceanus_cost_model_training_data/ncf', + # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-13-20', + # '/users/hzhang2/oceanus_cost_model_training_data/densenet', + # '/users/hzhang2/oceanus_cost_model_training_data/inceptionv3', + # '/users/hzhang2/oceanus_cost_model_training_data/resnet101', + # '/users/hzhang2/oceanus_cost_model_training_data/resnet50', + '/users/hzhang2/oceanus_cost_model_training_data/vgg16', + # '/users/hzhang2/oceanus_cost_model_training_data/bert', + ], + [ + # ('ncf', 0), #('ncf', 1), + # ('densenet121', 0), ('densenet121', 1), + # ('inceptionv3', 0), ('inceptionv3', 1), + # ('resnet101', 0), ('resnet101', 1), + # ('resnet50', 0), ('resnet50', 1), + # ('bert_12l', 0), ('bert_12l', 1), + # ('bert_6l', 0), ('bert_6l', 1), + # ('bert_3l', 0), ('bert_3l', 1), + # ('bert_large', 0), ('bert_large', 1), + ('vgg16', 0), #('vgg16', 1), + ], + [('vgg16', 1)], + num_epochs=200) + # checkpoint_path = 'model_train_on_vgg16-orca.ckpt' + test_list = [ + '/users/hzhang2/oceanus_cost_model_training_data/vgg16/vgg16_orca_15', + '/users/hzhang2/oceanus_cost_model_training_data/vgg16/vgg_random_orca_11', #TARGET: 0.9 + # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-13-20/ncf_large_adam_random_search_aws_4', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_12_orca_16', + # 
'/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_3', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_4', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_5', + ] + + for data_folder in test_list: + simulator = RankRNNSimulator(GRAPH_ITEM_PATHS[get_model(data_folder)], + batch_size=256, + seq_len=1, + checkpoint=checkpoint_path) + + runtimes_folder = os.path.join(data_folder, 'runtimes') + results = {} + averages= [] + scores = [] + for name in os.listdir(runtimes_folder): + strategy_path = os.path.join(data_folder, 'strategies', name) + rs_path = os.path.join(data_folder, 'resource_specs', name ) + if not os.path.isfile(rs_path): + rs_path += '.yml' + runtime_path = os.path.join(runtimes_folder, name) + + with open(runtime_path, 'r') as f: + runtimes = json.load(f) + average = np.array(runtimes['average']) + + s = Strategy.deserialize(strategy_path) + rs = ResourceSpec(resource_file=rs_path) + score = simulator.simulate(s, rs, strategy_path) + + results[name] = (average, score) + averages.append(average) + scores.append(score) + + # sorted_by_runtime = {k: v for k, v in sorted(results.items(), key=lambda item: item[1][0])} + # # sorted_by_scores = {k: v for k, v in sorted(res.items(), key=lambda item: item[1][1])} + # # sorted_by_latency = {k: v for k, v in sorted(res.items(), key=lambda item: item[1][2])} + # print('Sorted by runtime.......................') + # for _, (rt, prediction) in sorted_by_runtime.items(): + # print('runtime {} prediction {}'.format(rt, prediction)) + + y_train = np.array(averages) + test_score = np.array(scores) + true_comp = (y_train.ravel()[:, None] > y_train.ravel()[None, :]) + pred_comp = (test_score.ravel()[:, None] > test_score.ravel()[None, :]) + equal = (true_comp == pred_comp).astype(np.int) + test_acc = np.tril(equal, -1).sum() * 2. 
/ float(equal.shape[0]) / (float(equal.shape[0]) - 1) + + print('Test {} on {}, acc {:.4f}'.format(checkpoint_path, data_folder.split('/')[-1], test_acc)) diff --git a/autodist/simulator/models/rankrnn_simulator_penalty.py b/autodist/simulator/models/rankrnn_simulator_penalty.py new file mode 100644 index 0000000..380fa10 --- /dev/null +++ b/autodist/simulator/models/rankrnn_simulator_penalty.py @@ -0,0 +1,729 @@ +"""Strategy RankNetSimulator.""" +import glob +import json +import sys +from datetime import datetime +from pathlib import Path +from string import digits + +import numpy as np +import os +import tensorflow as tf +tf.compat.v1.disable_eager_execution() + +import arion +from arion.graph_item import GraphItem +from arion.proto.synchronizers_pb2 import PSSynchronizer, AllReduceSynchronizer +from arion.simulator.models.base import SimulatorBase +from arion.simulator.utils import get_dense_var_bits, get_sparse_var_bits, GIGABITS +from arion.simulator.utils import _resolve_device_address, _max_num_local_replica, _num_local_replica +from arion.strategy.random_sample_strategy import VariableHelper, PartHelper +from arion.strategy.base import Strategy +from arion.resource_spec import ResourceSpec +from arion.cluster import SSHCluster +from arion.kernel.device.resolver import DeviceResolver +from arion.kernel.partitioner import PartitionerConfig +from arion.simulator.models.predefined_simulator import PredefinedSimulator + +import torch +import torch.nn as nn + +TORCH_DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + +# feature settings +MAX_NUM_WORKERS = 16 +MAX_NUM_GROUPS = 600 +MAX_NUM_VARS = 500 +MAX_NUM_PARS = 1500 +FEATURE_SIZE = MAX_NUM_WORKERS+MAX_NUM_GROUPS+15 + +# model size +PARTITION_MLP_HIDDEN = 128 +PARTITION_MLP_OUT = 32 +STEM_RNN_HIDDEN = 128 +BIDIECTIONAL = True +BATCH_SIZE = 96 + +NUM_RNN_LAYERS = 3 +SCORE_TH = 0.005 +LR = 2e-3 +WD = 3e-4 +DATA_AUG = False +IN_LAYERS = 2 +OUT_LAYERS = 1 + +# ncf used: +# ~/projects/pycharm/zhijie/5-9-2020/oceanus-zhijie/arion/simulator/models/model_train_on_ncf-orca_new.ckpt 0.9020 +# noaug +# PARTITION_MLP_HIDDEN = 128 +# PARTITION_MLP_OUT = 32 +# STEM_RNN_HIDDEN = 128 +# BIDIECTIONAL = True +# NUM_RNN_LAYERS = 4 +# BATCH_SIZE = 64 +# LR = 1e-3 +# WD = 4e-4 + +# vgg used: +# ~/projects/pycharm/zhijie/5-9-2020/oceanus-zhijie/arion/simulator/models/model_train_on_vgg16-orca_new_new_new.ckpt 0.8374 +# noaug +# PARTITION_MLP_HIDDEN = 128 +# PARTITION_MLP_OUT = 32 +# STEM_RNN_HIDDEN = 128 +# BIDIECTIONAL = True +# NUM_RNN_LAYERS = 3 +# BATCH_SIZE = 64 +# LR = 1e-3 +# WD = 3e-4 + +GRAPH_ITEM_PATHS = {'ncf':'/users/hzhang2/projects/pycharm/zhijie/5-5-2020/original_graph_item', + 'densenet121': '/users/hzhang2/projects/pycharm/zhijie/graph_items/densenet121_original_graph_item', + 'inceptionv3': '/users/hzhang2/projects/pycharm/zhijie/graph_items/inceptionv3_original_graph_item', + 'resnet101': '/users/hzhang2/projects/pycharm/zhijie/graph_items/resnet101_original_graph_item', + 'resnet50': '/users/hzhang2/projects/pycharm/zhijie/graph_items/resnet50_original_graph_item', + 'vgg16': '/users/hzhang2/projects/pycharm/zhijie/graph_items/vgg16_original_graph_item', + 'bert_12l': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_12l', + 'bert_6l': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_6l', + 'bert_3l': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_3l', + 'bert_large': 
'/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_large'} + +def get_model(path_): + if 'densenet121' in path_: + return 'densenet121' + elif 'ncf' in path_: + return 'ncf' + elif 'inceptionv3' in path_: + return 'inceptionv3' + elif 'resnet101' in path_: + return 'resnet101' + elif 'resnet50' in path_: + return 'resnet50' + elif 'vgg16' in path_: + return 'vgg16' + elif 'bert' in path_ and '12l' in path_: + return 'bert_12l' + elif 'bert' in path_ and '6l' in path_: + return 'bert_6l' + elif 'bert' in path_ and '3l' in path_: + return 'bert_3l' + elif 'bert' in path_ and 'large' in path_: + return 'bert_large' + else: + return None + +class RankRNN(nn.Module): + def __init__(self, input_size=FEATURE_SIZE, + partition_mlp_hidden=PARTITION_MLP_HIDDEN, + partition_mlp_out=PARTITION_MLP_OUT, + stem_rnn_hidden=STEM_RNN_HIDDEN, + num_rnn_layers=NUM_RNN_LAYERS, + in_layers=IN_LAYERS, + out_layers=OUT_LAYERS, + bidirectional=BIDIECTIONAL): + super(RankRNN, self).__init__() + self.partition_mlp_out = partition_mlp_out + # self.num_rnn_layers = num_rnn_layers + self.stem_rnn_hidden = stem_rnn_hidden + tmp = [nn.Linear(input_size, partition_mlp_hidden)] + for _ in range(in_layers-2): + tmp.append(nn.ReLU()) + tmp.append(nn.Linear(partition_mlp_hidden, partition_mlp_hidden)) + tmp.append(nn.ReLU()) + tmp.append(nn.Linear(partition_mlp_hidden, partition_mlp_out)) + + self.partition_mlp = nn.Sequential(*tmp) + + self.stem_rnn = nn.LSTM(partition_mlp_out, stem_rnn_hidden, num_rnn_layers, batch_first=True, bidirectional=bidirectional) + + if out_layers == 1: + self.final_fc = nn.Linear(stem_rnn_hidden*num_rnn_layers*(1+int(bidirectional)), 1) + elif out_layers == 2: + self.final_fc = nn.Sequential(nn.Linear(stem_rnn_hidden*num_rnn_layers*(1+int(bidirectional)), 128), + nn.ReLU(), + nn.Linear(128, 1)) + + self.relu = nn.ReLU() + + def forward(self, features, par_indices, var_nums, return_feature=False): + + x = features.float() + # x = torch.cat([features[:, :, :MAX_NUM_WORKERS], features[:, :, MAX_NUM_WORKERS+MAX_NUM_GROUPS:]], 2).float() + x = self.partition_mlp(x) + + x1 = torch.zeros(features.shape[0], MAX_NUM_VARS, self.partition_mlp_out, device=TORCH_DEVICE, dtype=x.dtype) + x1.scatter_add_(1, par_indices.long()[:, :, None].expand(par_indices.shape[0], par_indices.shape[1], self.partition_mlp_out), x) + + # Set initial hidden and cell states + # h0 = torch.zeros(self.num_rnn_layers, x.size(0), self.stem_rnn_hidden).to(TORCH_DEVICE) + # c0 = torch.zeros(self.num_rnn_layers, x.size(0), self.stem_rnn_hidden).to(TORCH_DEVICE) + + # Forward propagate LSTM + x1 = torch.nn.utils.rnn.pack_padded_sequence(x1, var_nums.long(), batch_first=True, enforce_sorted=False) + out, (ht, ct) = self.stem_rnn(x1) # out: tensor of shape (batch_size, seq_length, hidden_size) + + # out = torch.nn.utils.rnn.pad_packed_sequence(out, batch_first=True)[0].sum(1) / var_nums[:, None] + out = ht.permute(1, 0, 2).reshape(x.shape[0], -1) + # print(out[0, var_nums[0] -1, [3]], out[0, var_nums[0], [3]]) + # print(ht.permute(1, 0, 2).shape, x.shape) + if return_feature: + return self.final_fc(out), out.div((out**2).sum(1, keepdim=True).sqrt()) + else: + return self.final_fc(out) + +class TrainTensorDataset(torch.utils.data.Dataset): + """TensorDataset with support of transforms. 
+ """ + def __init__(self, tensors): + assert all(tensors[0].size(0) == tensor.size(0) for tensor in tensors) + self.tensors = tensors + + def __getitem__(self, index): + x = self.tensors[0][index] + x = self.perturbe_device_and_group(x) + x1 = self.tensors[1][index] + x2 = self.tensors[2][index] + + y = self.tensors[3][index] + + return x, x1, x2, y + + def __len__(self): + return self.tensors[0].size(0) + + def perturbe_device_and_group(self, x): + if DATA_AUG: + perturbed_device_ids = np.random.permutation(MAX_NUM_WORKERS).astype(np.int32) + perturbed_group_ids = np.random.permutation(MAX_NUM_GROUPS).astype(np.int32) + mat_device = torch.eye(MAX_NUM_WORKERS, device=x.device, dtype=x.dtype)[perturbed_device_ids] + mat_group = torch.eye(MAX_NUM_GROUPS, device=x.device, dtype=x.dtype)[perturbed_group_ids] + x = torch.cat([torch.matmul(x[:, :MAX_NUM_WORKERS], mat_device), torch.matmul(x[:, MAX_NUM_WORKERS:MAX_NUM_WORKERS+MAX_NUM_GROUPS], mat_group), x[:, MAX_NUM_WORKERS+MAX_NUM_GROUPS:]], 1) + return x + + +def to_numpy(synchronizer, device, size_ratio, is_sparse, bd, num_replicas): + ret = [np.zeros(MAX_NUM_WORKERS), np.zeros(MAX_NUM_GROUPS), np.zeros(3), np.zeros(5), np.zeros(3)] + + if device is not None: + ret[0][device] = 1 + + group = getattr(synchronizer, 'group', None) + if group is not None: + assert group < MAX_NUM_GROUPS, group + ret[1][group] = 1 + + compressor = getattr(synchronizer, 'compressor', None) + if compressor is not None: + if compressor in ["PowerSGDCompressor", 3]: + ret[2][2] = 1 + elif compressor in ["HorovodCompressorEF", "HorovodCompressor", 2, 1]: + ret[2][1] = 1 + elif compressor in ["NoneCompressor", 0]: + ret[2][0] = 1 + else: + raise ValueError('Compressor does not exist: {}'.format(compressor)) + + local_replication = getattr(synchronizer, 'local_replication', None) + if isinstance(synchronizer, PSSynchronizer): + synchronizer = 0 + if int(local_replication) == 0: + if int(is_sparse) == 0: + ret[3][0] = 1 + else: + ret[3][1] = 1 + else: + if int(is_sparse) == 0: + ret[3][2] = 1 + else: + ret[3][3] = 1 + else: + ret[3][4] = 1 + ret[4] = np.array([size_ratio, bd, num_replicas]) + + return np.concatenate(ret) + +def connvert_feature(strategy, resource_spec, graph_item): + + cluster = SSHCluster(resource_spec) + device_resolver = DeviceResolver(cluster) + graph_replicas = [_resolve_device_address(k, device_resolver) for k, v in resource_spec.gpu_devices] + # bandwidth + network_bandwidth = np.array([resource_spec.network_bandwidth[device.split(':')[0]] for device, _ in resource_spec.cpu_devices]) + network_bandwidth = network_bandwidth + min_network_bandwidth = network_bandwidth.min() + # Other information + cpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.cpu_devices] + gpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.gpu_devices] + max_num_local_replica = _max_num_local_replica(graph_replicas, cluster) + total_num_local_replica = len(graph_replicas) + worker_num_replicas = [_num_local_replica(cpu_worker, graph_replicas, cluster) for cpu_worker in cpu_worker_list] + + num_vars = 0 + total_size_vars = 0 + for var_op, var in graph_item.trainable_var_op_to_var.items(): + num_vars += 1 + if var.initial_value.shape.ndims: + var_helper = VariableHelper(var, graph_item) + if var_helper.is_sparse: + total_size_vars += get_sparse_var_bits(np.prod(var_helper.shape)) + else: + total_size_vars += get_dense_var_bits(np.prod(var_helper.shape), var.dtype) + assert num_vars < 
MAX_NUM_VARS, num_vars + var_partition_features = np.zeros((MAX_NUM_PARS, FEATURE_SIZE-4)).astype(np.float32) + partition_indice = np.ones(MAX_NUM_PARS).astype(np.float32) * (MAX_NUM_VARS - 1) + + cnt = 0 + for node_id, node in enumerate(strategy.node_config): + var_name = node.var_name + for var_op, var in graph_item.trainable_var_op_to_var.items(): + if var.name == var_name: + break + var_helper = VariableHelper(var, graph_item) + + if node.partitioner: + pc = PartitionerConfig(partition_str=node.partitioner) + for i, part in enumerate(node.part_config): + part_helper = PartHelper(i, var, pc) + synchronizer = getattr(part, part.WhichOneof('synchronizer')) + reduction_destination = getattr(synchronizer, 'reduction_destination', None) + device = _resolve_device_address(reduction_destination if reduction_destination else var.device, + device_resolver) + if device == '': + assert(isinstance(synchronizer, AllReduceSynchronizer)) + device = None + bd = min_network_bandwidth + num_replicas = 0 + else: + device = cpu_worker_list.index(device) + bd = network_bandwidth[device] + num_replicas = worker_num_replicas[device] + + if var_helper.is_sparse: + size_ratio = get_sparse_var_bits(np.prod(part_helper.shape))/total_size_vars + else: + size_ratio = get_dense_var_bits(np.prod(part_helper.shape), var_helper.dtype)/total_size_vars + var_partition_features[cnt] = to_numpy(synchronizer, device, size_ratio, var_helper.is_sparse, bd, num_replicas) + partition_indice[cnt] = node_id + cnt += 1 + else: + synchronizer = getattr(node, node.WhichOneof('synchronizer')) + reduction_destination = getattr(synchronizer, 'reduction_destination', None) + device = _resolve_device_address(reduction_destination if reduction_destination else var.device, + device_resolver) + if device == '': + assert(isinstance(synchronizer, AllReduceSynchronizer)) + device = None + bd = min_network_bandwidth + num_replicas = 0 + else: + device = cpu_worker_list.index(device) + bd = network_bandwidth[device] + num_replicas = worker_num_replicas[device] + + if var_helper.is_sparse: + size_ratio = get_sparse_var_bits(np.prod(var_helper.shape))/total_size_vars + else: + size_ratio = get_dense_var_bits(np.prod(var_helper.shape), var_helper.dtype)/total_size_vars + var_partition_features[cnt] = to_numpy(synchronizer, device, size_ratio, var_helper.is_sparse, bd, num_replicas) + partition_indice[cnt] = node_id + cnt += 1 + return var_partition_features, partition_indice, np.array(node_id+1) + +def create_predefined_features(strategy, resource_spec, predefined_simulator): + + var_sync_time, vars, resource = predefined_simulator.predefined_sync_time(strategy, resource_spec) + + features = [] + for var_name, sync_time in var_sync_time.items(): + if isinstance(sync_time, list) or isinstance(sync_time, tuple): # send_time, receive_time in PS strategies. 
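+ # Descriptive note (added): for PS-synchronized variables the predefined simulator returns a
+ # (send_time, receive_time) pair, so total transmission below sums both legs, while the
+ # overhead terms are read from the send-time dict; AR variables return a single dict.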
+ transmission = sync_time[0]['transmission'] + sync_time[1]['transmission'] + sync_time = sync_time[0] + is_ps = True + else: # AR + transmission = sync_time['transmission'] + is_ps = False + + network_overhead = sync_time['network_overhead'] + gpu_kernel_memory_latency = sync_time['gpu_kernel_memory_latency'] + + feat = [transmission, network_overhead, gpu_kernel_memory_latency, float(is_ps)] + features.append(feat) + features = np.array(features, dtype=np.float) + return features + +class RankRNNSimulatorPenalty(SimulatorBase): + """Simulates strategies for a given graph and resource spec.""" + + def __init__(self, + original_graph_item_path, + num_rnn_layers, + in_layers, + out_layers, + fetches=None, + batch_size=1, + seq_len=1, + checkpoint=None): + + super(RankRNNSimulatorPenalty, self).__init__(original_graph_item_path=original_graph_item_path) + print("It's using RankNet simulator.") + self._fetches = fetches + self._batch_size_per_gpu = batch_size + self._seq_len = seq_len + self._checkpoint = checkpoint + self._predefined_simulator=PredefinedSimulator(original_graph_item_path=original_graph_item_path, + batch_size=self._batch_size_per_gpu, + seq_len=self._seq_len) + if self._checkpoint: + self._model = RankRNN(num_rnn_layers=num_rnn_layers, in_layers=in_layers, out_layers=out_layers).to(TORCH_DEVICE) + self._model.load_state_dict(torch.load(self._checkpoint)) + + def simulate(self, strategy, resource_spec, strategy_path=None, checkpoint=None): + score, feature = self.predict(strategy, resource_spec, strategy_path, checkpoint) + return score.view(-1).data.cpu().numpy(), feature.view(-1).data.cpu().numpy() + + + def predict(self, + strategy, + resource_spec, + strategy_path=None, + checkpoint=None): + if checkpoint is None: + if self._checkpoint is None: + raise ValueError("checkpoint is None: {}".format(checkpoint)) + else: + model = self._model + else: + model = RankRNN().to(TORCH_DEVICE) + model.load_state_dict(torch.load(checkpoint)) + if strategy_path and os.path.isfile((strategy_path+'.npz').replace('strategies', 'npz')): + loaded = np.load((strategy_path+'.npz').replace('strategies', 'npz')) + var_partition_features, partition_indice, var_num, _ = \ + loaded['x1'], loaded['x2'], loaded['x3'], loaded['y'] + else: + var_partition_features, partition_indice, var_num = \ + connvert_feature(strategy, resource_spec, self._original_graph_item) + + if strategy_path and os.path.isfile((strategy_path+'_pdf.npz').replace('strategies', 'npz')): + loaded = np.load((strategy_path+'_pdf.npz').replace('strategies', 'npz')) + predefined_features = loaded['x4'] + else: + predefined_features = create_predefined_features(strategy, resource_spec, self._predefined_simulator) + + var_partition_features = np.concatenate([var_partition_features, np.concatenate([predefined_features, np.zeros((MAX_NUM_PARS-predefined_features.shape[0], predefined_features.shape[1]))], 0)], 1) + + var_partition_features = torch.from_numpy(var_partition_features).unsqueeze(0).to(TORCH_DEVICE) + partition_indice = torch.from_numpy(partition_indice).unsqueeze(0).to(TORCH_DEVICE) + var_num = torch.from_numpy(var_num).unsqueeze(0).to(TORCH_DEVICE) + + return model(var_partition_features, partition_indice, var_num, True) + +class RankNetTrainer(): + + def __init__(self, + batch_size_per_gpu=256, + seq_len=1, + seed=1): + self._batch_size_per_gpu = batch_size_per_gpu + self._seq_len = seq_len + self.graph_items = {k:GraphItem.deserialize(v) for k, v in GRAPH_ITEM_PATHS.items()} + self.predefined_simulators = {k: 
PredefinedSimulator(original_graph_item_path=v, + batch_size=self._batch_size_per_gpu, + seq_len=self._seq_len) for k, v in GRAPH_ITEM_PATHS.items()} + self.best_acc = 0. + print("It's using RankNet trainer.") + + def load_data(self, path_list, train_patterns=[('ncf', 0)], valid_patterns='same'): + features = {k: [[[], [], [], []], [[], [], [], []]] for k, _ in GRAPH_ITEM_PATHS.items()} + for training_path in path_list: + for path in Path(training_path).rglob('strategies'): + strategy_paths = glob.glob(os.path.join(path, '*')) + # strategy_paths = np.random.permutation(list(strategy_paths)) + for strategy_path in strategy_paths: + if 'json' in strategy_path or \ + 'bert_large_batch_8_orca_16_group_2/' in strategy_path: + continue + model = get_model(strategy_path) + if model is None: + if not ('densenets169' in strategy_path or 'densenets201' in strategy_path): + assert False, strategy_path + continue + rs_path = strategy_path.replace('strategies', 'resource_specs') + runtime_path = strategy_path.replace('strategies', 'runtimes') + npz_path = (strategy_path+'.npz').replace('strategies', 'npz') + if not os.path.isfile(rs_path): + rs_path += '.yml' + if not (os.path.isfile(rs_path) and os.path.isfile(runtime_path)): + continue + if not os.path.exists(os.path.dirname(npz_path)): + os.makedirs(os.path.dirname(npz_path)) + + if not os.path.isfile(npz_path): + strategy = Strategy.deserialize(path=strategy_path) + rs = ResourceSpec(resource_file=rs_path) + var_partition_features, partition_indice, var_num = \ + connvert_feature(strategy, rs, self.graph_items[model]) + label = np.array(json.load(open(runtime_path))['average']) + np.savez_compressed(npz_path, x1=var_partition_features, x2=partition_indice, x3=var_num, y=label) + else: + loaded = np.load(npz_path) + var_partition_features, partition_indice, var_num, label = \ + loaded['x1'], loaded['x2'], loaded['x3'], loaded['y'] + + if not os.path.isfile(npz_path.replace('.npz', '_pdf.npz')): + predefined_features = create_predefined_features(Strategy.deserialize(path=strategy_path), ResourceSpec(resource_file=rs_path), self.predefined_simulators[model]) + np.savez_compressed(npz_path.replace('.npz', '_pdf.npz'), x4=predefined_features) + else: + loaded = np.load(npz_path.replace('.npz', '_pdf.npz')) + predefined_features = loaded['x4'] + var_partition_features = np.concatenate([var_partition_features, np.concatenate([predefined_features, np.zeros((MAX_NUM_PARS-predefined_features.shape[0], predefined_features.shape[1]))], 0)], 1) + + # is_aws = int('g3' in strategy_path or 'g4' in strategy_path or 'aws' in strategy_path) # comment here + is_aws = int('vgg16_orca_11_random_rejection-4_trial-100-_expolre-2000_0.83-model_embedding_sim-weight-1_max-par-40/' in strategy_path) + # print(model, 'orca' if is_aws == 0 else 'aws', strategy_path.split('/')[-3]) + features[model][is_aws][0].append(var_partition_features) + features[model][is_aws][1].append(partition_indice) + features[model][is_aws][2].append(var_num) + features[model][is_aws][3].append(label) + + for k, _ in GRAPH_ITEM_PATHS.items(): + for i1 in range(2): + for i2 in range(4): + if len(features[k][i1][i2]) > 1: + features[k][i1][i2] = np.stack(features[k][i1][i2]).astype(np.float16) + print(k, 'orca' if i1 == 0 else 'aws', features[k][i1][i2].shape) + else: + features[k][i1][i2] = None + + train_features = np.concatenate([features[model_][is_aws_][0] for model_, is_aws_ in train_patterns if features[model_][is_aws_][0] is not None], 0) + train_par_indices = 
np.concatenate([features[model_][is_aws_][1] for model_, is_aws_ in train_patterns if features[model_][is_aws_][1] is not None], 0) + train_var_nums = np.concatenate([features[model_][is_aws_][2] for model_, is_aws_ in train_patterns if features[model_][is_aws_][2] is not None], 0) + train_labels = np.concatenate([features[model_][is_aws_][3] for model_, is_aws_ in train_patterns if features[model_][is_aws_][3] is not None], 0) + + if type(valid_patterns[0]) == str and valid_patterns[0] == 'same': + rng = np.random.RandomState(1) + permt = rng.permutation(train_features.shape[0]) + split = int(len(permt) * 0.7) + val_features, val_par_indices, val_var_nums, val_labels = train_features[permt[split:]], train_par_indices[permt[split:]], train_var_nums[permt[split:]], train_labels[permt[split:]] + train_features, train_par_indices, train_var_nums, train_labels = train_features[permt[:split]], train_par_indices[permt[:split]], train_var_nums[permt[:split]], train_labels[permt[:split]] + else: + val_features = np.concatenate([features[model_][is_aws_][0] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][0] is not None], 0) + val_par_indices = np.concatenate([features[model_][is_aws_][1] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][1] is not None], 0) + val_var_nums = np.concatenate([features[model_][is_aws_][2] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][2] is not None], 0) + val_labels = np.concatenate([features[model_][is_aws_][3] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][3] is not None], 0) + + # comment here + rng = np.random.RandomState(1) + permt = rng.permutation(val_features.shape[0]) + split = int(len(permt) * 0.7) + train_features, train_par_indices, train_var_nums, train_labels = np.concatenate([train_features, val_features[permt[:split]]], 0), np.concatenate([train_par_indices, val_par_indices[permt[:split]]], 0), np.concatenate([train_var_nums, val_var_nums[permt[:split]]], 0), np.concatenate([train_labels, val_labels[permt[:split]]], 0) + + val_features, val_par_indices, val_var_nums, val_labels = val_features[permt[split:]], val_par_indices[permt[split:]], val_var_nums[permt[split:]], val_labels[permt[split:]] + label_max = max(train_labels.max(), val_labels.max()) + label_min = min(train_labels.min(), val_labels.min()) + train_labels = (train_labels-label_min)/(label_max-label_min) + val_labels = (val_labels-label_min)/(label_max-label_min) + print(train_features.shape, val_features.shape, train_features.max(), train_features.min(), val_features.max(), val_features.min(), train_labels.max(), val_labels.min()) + + ## train the model + trainset = TrainTensorDataset((torch.from_numpy(train_features).half().to(TORCH_DEVICE), torch.from_numpy(train_par_indices).half().to(TORCH_DEVICE), torch.from_numpy(train_var_nums).half().to(TORCH_DEVICE), torch.from_numpy(train_labels).half().to(TORCH_DEVICE))) + testset = torch.utils.data.TensorDataset(torch.from_numpy(val_features).half().to(TORCH_DEVICE), torch.from_numpy(val_par_indices).half().to(TORCH_DEVICE), torch.from_numpy(val_var_nums).half().to(TORCH_DEVICE), torch.from_numpy(val_labels).half().to(TORCH_DEVICE)) + self.trainloader = torch.utils.data.DataLoader(dataset=trainset, + batch_size=BATCH_SIZE, + shuffle=True) + self.testloader = torch.utils.data.DataLoader(dataset=testset, + batch_size=32, + shuffle=False) + + def train(self, name='', num_epochs=200, checkpoint=None): + + checkpoint_path = 'model_on_{}.ckpt'.format(name) + print('LSTM 
layers: ', NUM_RNN_LAYERS, 'score th: ', SCORE_TH, 'lr: ', LR, 'wd: ', WD,'use data aug: ', DATA_AUG, 'OUT_LAYERS: ', OUT_LAYERS, 'IN_LAYERS: ',IN_LAYERS) + + np.random.seed(1) + torch.manual_seed(1) + torch.cuda.manual_seed_all(1) + model = RankRNN(num_rnn_layers=NUM_RNN_LAYERS, out_layers=OUT_LAYERS, in_layers=IN_LAYERS).to(TORCH_DEVICE) + if checkpoint: + model.load_state_dict(torch.load(checkpoint)) + optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=WD) + + best_val_acc = 0. + for epoch in range(num_epochs): + if epoch == int(num_epochs*2./5. - 1): + for param_group in optimizer.param_groups: param_group['lr'] = 3e-4 + if epoch == int(num_epochs*4./5. - 1): + for param_group in optimizer.param_groups: param_group['lr'] = 1e-4 + + labels = [] + outputs = [] + for i, (features_b, par_indices_b, var_nums_b, labels_b) in enumerate(self.trainloader): + + # Forward pass + outputs_b = model(features_b, par_indices_b, var_nums_b).squeeze() + + par_cnt = (par_indices_b.int() != MAX_NUM_VARS - 1).int().sum(1) + + true_comp = ( + (labels_b[:, None]+SCORE_TH>labels_b[None,:]).int()*(par_cnt[:, None] > par_cnt[None, :]).int() + + (labels_b[:, None]-SCORE_TH>labels_b[None,:]).int()*(par_cnt[:, None] < par_cnt[None, :]).int() + + (labels_b[:, None] > labels_b[None,:]).int() * (par_cnt[:, None] == par_cnt[None, :]).int() + ) > 0 + true_comp = true_comp.float() * 2 - 1 + pred_comp = outputs_b[:, None] - outputs_b[None, :] + loss = (1 - true_comp) * pred_comp / 2 + torch.nn.functional.softplus(-pred_comp) + loss = loss.tril(-1).mean() + + # Backward and optimize + optimizer.zero_grad() + loss.backward() + torch.nn.utils.clip_grad_norm_(model.stem_rnn.parameters(), 0.25) + optimizer.step() + + outputs.append(outputs_b) + labels.append(labels_b) + + labels = torch.cat(labels) + outputs = torch.cat(outputs) + true_comp = (labels[:, None] > labels[None, :]) + pred_comp = (outputs[:, None] > outputs[None, :]) + equal = (true_comp == pred_comp).int() + train_acc = equal.tril(-1).sum() * 2. /float(equal.shape[0])/(float(equal.shape[0]) - 1) + + with torch.no_grad(): + labels = [] + outputs = [] + for features_b, par_indices_b, var_nums_b, labels_b in self.testloader: + + # Forward pass + outputs_b = model(features_b, par_indices_b, var_nums_b).squeeze() + outputs.append(outputs_b) + labels.append(labels_b) + + labels = torch.cat(labels) + outputs = torch.cat(outputs) + true_comp = (labels[:, None] > labels[None, :]) + pred_comp = (outputs[:, None] > outputs[None, :]) + equal = (true_comp == pred_comp).int() + acc = equal.tril(-1).sum() * 2. 
/float(equal.shape[0])/(float(equal.shape[0]) - 1) + if acc.item() > best_val_acc: + best_val_acc = acc.item() + if best_val_acc > self.best_acc: + print('Saved model @ acc', best_val_acc) + torch.save(model.state_dict(), checkpoint_path) + self.best_acc = best_val_acc + # print('Saved model to {}'.format(checkpoint_path)) + if epoch == num_epochs - 1: + print('Epoch: {}, training acc: {:.4f}, test acc: {:.4f}, best acc: {:.4f}, overall best acc: {:.4f}'.format(epoch, train_acc.item(), acc.item(), best_val_acc, self.best_acc)) + return checkpoint_path + + +if __name__ == '__main__': + + if True: + trainer = RankNetTrainer() + trainer.load_data([ + '/users/hzhang2/oceanus_cost_model_training_data/vgg16', + # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-11-20', + # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-9-20', + # '/users/hzhang2/oceanus_cost_model_training_data/ncf', + # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-13-20', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_12_orca_16', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_3', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_4', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_5', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_random_orca_11', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert-aws/bert-large-aws4g4', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert-aws/bert_large_random_search_aws_4_ps_only', + # '/users/hzhang2/oceanus_cost_model_training_data/densenet', + # '/users/hzhang2/oceanus_cost_model_training_data/inceptionv3', + # '/users/hzhang2/oceanus_cost_model_training_data/resnet101', + # '/users/hzhang2/oceanus_cost_model_training_data/resnet50', + ], + [ + ('vgg16', 0), #('vgg16', 1), + # ('ncf', 0), #('ncf', 1), + # ('bert_large', 1), #('bert_large', 1), + # not used: + # ('densenet121', 0), ('densenet121', 1), + # ('inceptionv3', 0), ('inceptionv3', 1), + # ('resnet101', 0), ('resnet101', 1), + # ('resnet50', 0), ('resnet50', 1), + # ('bert_12l', 0), ('bert_12l', 1), + # ('bert_6l', 0), ('bert_6l', 1), + # ('bert_3l', 0), ('bert_3l', 1), + ], + [ + ('vgg16', 1), + # ('ncf', 1), + # ('bert_large', 1), + # 'same', + ], + ) + + for p2 in [0.01, 0.03]: + for p3 in [1e-3, 3e-3, 1e-4, 3e-4, 5e-3]: + for p4 in [1e-3, 2e-3, 1e-4, 3e-4, 5e-4, 5e-5]: + for p1 in [3, 4, 2]: + for p5 in [2, 3]: + for p6 in [1, 2]: + NUM_RNN_LAYERS, SCORE_TH, LR, WD, IN_LAYERS, OUT_LAYERS = p1, p2, p3, p4, p5, p6 + checkpoint_path = trainer.train(name='vgg-orca-validon-0.83-sim1', num_epochs=200) + exit() + else: + checkpoint_path = '/users/hzhang2/projects/pycharm/zhijie/5-9-2020/oceanus-zhijie/arion/simulator/models/model_on_vgg-orca.ckpt' + test_list = [ + # '/users/hzhang2/oceanus_cost_model_training_data/vgg16/vgg16_orca_15', + # '/users/hzhang2/oceanus_cost_model_training_data/vgg16/vgg_random_orca_11', #TARGET: 0.9 + '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-13-20/ncf_large_adam_random_search_aws_4', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_12_orca_16', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_3', + # 
'/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_4', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_5', + ] + + for data_folder in test_list: + simulator = RankRNNSimulator(GRAPH_ITEM_PATHS[get_model(data_folder)], + num_rnn_layers=3, + batch_size=256, + seq_len=1, + checkpoint=checkpoint_path) + + runtimes_folder = os.path.join(data_folder, 'runtimes') + results = {} + averages= [] + scores = [] + for name in os.listdir(runtimes_folder): + strategy_path = os.path.join(data_folder, 'strategies', name) + rs_path = os.path.join(data_folder, 'resource_specs', name ) + if not os.path.isfile(rs_path): + rs_path += '.yml' + runtime_path = os.path.join(runtimes_folder, name) + + with open(runtime_path, 'r') as f: + runtimes = json.load(f) + average = np.array(runtimes['average']) + + s = Strategy.deserialize(strategy_path) + rs = ResourceSpec(resource_file=rs_path) + score = simulator.simulate(s, rs, strategy_path) + + results[name] = (average, score) + averages.append(average) + scores.append(score) + + # sorted_by_runtime = {k: v for k, v in sorted(results.items(), key=lambda item: item[1][0])} + # # sorted_by_scores = {k: v for k, v in sorted(res.items(), key=lambda item: item[1][1])} + # # sorted_by_latency = {k: v for k, v in sorted(res.items(), key=lambda item: item[1][2])} + # print('Sorted by runtime.......................') + # for _, (rt, prediction) in sorted_by_runtime.items(): + # print('runtime {} prediction {}'.format(rt, prediction)) + + y_train = np.array(averages) + test_score = np.array(scores) + true_comp = (y_train.ravel()[:, None] > y_train.ravel()[None, :]) + pred_comp = (test_score.ravel()[:, None] > test_score.ravel()[None, :]) + equal = (true_comp == pred_comp).astype(np.int) + test_acc = np.tril(equal, -1).sum() * 2. 
/ float(equal.shape[0]) / (float(equal.shape[0]) - 1) + + print('Test {} on {}, acc {:.4f}'.format(checkpoint_path, data_folder.split('/')[-1], test_acc)) diff --git a/autodist/simulator/models/rankrnn_simulator_penalty_fast.py b/autodist/simulator/models/rankrnn_simulator_penalty_fast.py new file mode 100644 index 0000000..5e08bbd --- /dev/null +++ b/autodist/simulator/models/rankrnn_simulator_penalty_fast.py @@ -0,0 +1,1027 @@ +"""Strategy RankNetSimulator.""" +import glob +import json +import sys +from datetime import datetime +from pathlib import Path +from string import digits +import time + +import numpy as np +import os +import tensorflow as tf +tf.compat.v1.disable_eager_execution() + +import arion +from arion.graph_item import GraphItem +from arion.proto.synchronizers_pb2 import PSSynchronizer, AllReduceSynchronizer +from arion.simulator.models.base import SimulatorBase +from arion.simulator.utils import get_dense_var_bits, get_sparse_var_bits, GIGABITS +from arion.simulator.utils import _resolve_device_address, _max_num_local_replica, _num_local_replica, _resolved_devices_on_diff_machine +from arion.strategy.random_sample_strategy import VariableHelper, PartHelper +from arion.strategy.base import Strategy +from arion.resource_spec import ResourceSpec +from arion.cluster import SSHCluster +from arion.kernel.device.resolver import DeviceResolver +from arion.kernel.partitioner import PartitionerConfig +from arion.simulator.models.predefined_simulator import PredefinedSimulator + +import torch +import torch.nn as nn + +import multiprocessing +from multiprocessing import Process, Queue + +TORCH_DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + +# feature settings +MAX_NUM_WORKERS = 16 +MAX_NUM_GROUPS = 600 +MAX_NUM_VARS = 500 +MAX_NUM_PARS = 1500 +FEATURE_SIZE = MAX_NUM_WORKERS+MAX_NUM_GROUPS+15 + +# model size +PARTITION_MLP_HIDDEN = 128 +PARTITION_MLP_OUT = 32 +STEM_RNN_HIDDEN = 128 +BIDIECTIONAL = True +BATCH_SIZE = 96 + +NUM_RNN_LAYERS = 3 +SCORE_TH = 0.005 +LR = 2e-3 +WD = 3e-4 +DATA_AUG = False +IN_LAYERS = 2 +OUT_LAYERS = 1 + +# ncf used: +# ~/projects/pycharm/zhijie/5-9-2020/oceanus-zhijie/arion/simulator/models/model_train_on_ncf-orca_new.ckpt 0.9020 +# noaug +# PARTITION_MLP_HIDDEN = 128 +# PARTITION_MLP_OUT = 32 +# STEM_RNN_HIDDEN = 128 +# BIDIECTIONAL = True +# NUM_RNN_LAYERS = 4 +# BATCH_SIZE = 64 +# LR = 1e-3 +# WD = 4e-4 + +# vgg used: +# ~/projects/pycharm/zhijie/5-9-2020/oceanus-zhijie/arion/simulator/models/model_train_on_vgg16-orca_new_new_new.ckpt 0.8374 +# noaug +# PARTITION_MLP_HIDDEN = 128 +# PARTITION_MLP_OUT = 32 +# STEM_RNN_HIDDEN = 128 +# BIDIECTIONAL = True +# NUM_RNN_LAYERS = 3 +# BATCH_SIZE = 64 +# LR = 1e-3 +# WD = 3e-4 + +GRAPH_ITEM_PATHS = {'ncf':'/users/hzhang2/projects/pycharm/zhijie/5-5-2020/original_graph_item', + 'densenet121': '/users/hzhang2/projects/pycharm/zhijie/graph_items/densenet121_original_graph_item', + 'inceptionv3': '/users/hzhang2/projects/pycharm/zhijie/graph_items/inceptionv3_original_graph_item', + 'resnet101': '/users/hzhang2/projects/pycharm/zhijie/graph_items/resnet101_original_graph_item', + 'resnet50': '/users/hzhang2/projects/pycharm/zhijie/graph_items/resnet50_original_graph_item', + 'vgg16': '/users/hzhang2/projects/pycharm/zhijie/graph_items/vgg16_original_graph_item', + 'bert_12l': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_12l', + 'bert_6l': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_6l', + 'bert_3l': 
'/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_3l', + 'bert_large': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_large'} + +def get_model(path_): + if 'densenet121' in path_: + return 'densenet121' + elif 'ncf' in path_: + return 'ncf' + elif 'inceptionv3' in path_: + return 'inceptionv3' + elif 'resnet101' in path_: + return 'resnet101' + elif 'resnet50' in path_: + return 'resnet50' + elif 'vgg16' in path_: + return 'vgg16' + elif 'bert' in path_ and '12l' in path_: + return 'bert_12l' + elif 'bert' in path_ and '6l' in path_: + return 'bert_6l' + elif 'bert' in path_ and '3l' in path_: + return 'bert_3l' + elif 'bert' in path_ and 'large' in path_: + return 'bert_large' + else: + return None + +class RankRNN(nn.Module): + def __init__(self, input_size=FEATURE_SIZE, + partition_mlp_hidden=PARTITION_MLP_HIDDEN, + partition_mlp_out=PARTITION_MLP_OUT, + stem_rnn_hidden=STEM_RNN_HIDDEN, + num_rnn_layers=NUM_RNN_LAYERS, + in_layers=IN_LAYERS, + out_layers=OUT_LAYERS, + bidirectional=BIDIECTIONAL): + super(RankRNN, self).__init__() + self.partition_mlp_out = partition_mlp_out + # self.num_rnn_layers = num_rnn_layers + self.stem_rnn_hidden = stem_rnn_hidden + tmp = [nn.Linear(input_size, partition_mlp_hidden)] + for _ in range(in_layers-2): + tmp.append(nn.ReLU()) + tmp.append(nn.Linear(partition_mlp_hidden, partition_mlp_hidden)) + tmp.append(nn.ReLU()) + tmp.append(nn.Linear(partition_mlp_hidden, partition_mlp_out)) + + self.partition_mlp = nn.Sequential(*tmp) + + self.stem_rnn = nn.LSTM(partition_mlp_out, stem_rnn_hidden, num_rnn_layers, batch_first=True, bidirectional=bidirectional) + + if out_layers == 1: + self.final_fc = nn.Linear(stem_rnn_hidden*num_rnn_layers*(1+int(bidirectional)), 1) + elif out_layers == 2: + self.final_fc = nn.Sequential(nn.Linear(stem_rnn_hidden*num_rnn_layers*(1+int(bidirectional)), 128), + nn.ReLU(), + nn.Linear(128, 1)) + + self.relu = nn.ReLU() + + def forward(self, features, par_indices, var_nums, return_feature=False): + # print(features.shape, par_indices.shape, var_nums.shape) + x = features.float() + # x = torch.cat([features[:, :, :MAX_NUM_WORKERS], features[:, :, MAX_NUM_WORKERS+MAX_NUM_GROUPS:]], 2).float() + x = self.partition_mlp(x) + + x1 = torch.zeros(features.shape[0], MAX_NUM_VARS, self.partition_mlp_out, device=TORCH_DEVICE, dtype=x.dtype) + x1.scatter_add_(1, par_indices.long()[:, :, None].expand(par_indices.shape[0], par_indices.shape[1], self.partition_mlp_out), x) + + # Set initial hidden and cell states + # h0 = torch.zeros(self.num_rnn_layers, x.size(0), self.stem_rnn_hidden).to(TORCH_DEVICE) + # c0 = torch.zeros(self.num_rnn_layers, x.size(0), self.stem_rnn_hidden).to(TORCH_DEVICE) + + # Forward propagate LSTM + x1 = torch.nn.utils.rnn.pack_padded_sequence(x1, var_nums.long(), batch_first=True, enforce_sorted=False) + out, (ht, ct) = self.stem_rnn(x1) # out: tensor of shape (batch_size, seq_length, hidden_size) + + # out = torch.nn.utils.rnn.pad_packed_sequence(out, batch_first=True)[0].sum(1) / var_nums[:, None] + out = ht.permute(1, 0, 2).reshape(x.shape[0], -1) + # print(out[0, var_nums[0] -1, [3]], out[0, var_nums[0], [3]]) + # print(ht.permute(1, 0, 2).shape, x.shape) + if return_feature: + return self.final_fc(out), out.div((out**2).sum(1, keepdim=True).sqrt()) + else: + return self.final_fc(out) + +class TrainTensorDataset(torch.utils.data.Dataset): + """TensorDataset with support of transforms. 
+ """ + def __init__(self, tensors): + assert all(tensors[0].size(0) == tensor.size(0) for tensor in tensors) + self.tensors = tensors + + def __getitem__(self, index): + x = self.tensors[0][index] + x = self.perturbe_device_and_group(x) + x1 = self.tensors[1][index] + x2 = self.tensors[2][index] + + y = self.tensors[3][index] + + return x, x1, x2, y + + def __len__(self): + return self.tensors[0].size(0) + + def perturbe_device_and_group(self, x): + if DATA_AUG: + perturbed_device_ids = np.random.permutation(MAX_NUM_WORKERS).astype(np.int32) + perturbed_group_ids = np.random.permutation(MAX_NUM_GROUPS).astype(np.int32) + mat_device = torch.eye(MAX_NUM_WORKERS, device=x.device, dtype=x.dtype)[perturbed_device_ids] + mat_group = torch.eye(MAX_NUM_GROUPS, device=x.device, dtype=x.dtype)[perturbed_group_ids] + x = torch.cat([torch.matmul(x[:, :MAX_NUM_WORKERS], mat_device), torch.matmul(x[:, MAX_NUM_WORKERS:MAX_NUM_WORKERS+MAX_NUM_GROUPS], mat_group), x[:, MAX_NUM_WORKERS+MAX_NUM_GROUPS:]], 1) + return x + + +def to_numpy(synchronizer, device, size_ratio, is_sparse, bd, num_replicas): + ret = [np.zeros(MAX_NUM_WORKERS), np.zeros(MAX_NUM_GROUPS), np.zeros(3), np.zeros(5), np.zeros(3)] + + if device is not None: + ret[0][device] = 1 + + group = getattr(synchronizer, 'group', None) + if group is not None: + assert group < MAX_NUM_GROUPS, group + ret[1][group] = 1 + + compressor = getattr(synchronizer, 'compressor', None) + if compressor is not None: + if compressor in ["PowerSGDCompressor", 3]: + ret[2][2] = 1 + elif compressor in ["HorovodCompressorEF", "HorovodCompressor", 2, 1]: + ret[2][1] = 1 + elif compressor in ["NoneCompressor", 0]: + ret[2][0] = 1 + else: + raise ValueError('Compressor does not exist: {}'.format(compressor)) + + local_replication = getattr(synchronizer, 'local_replication', None) + if isinstance(synchronizer, PSSynchronizer): + synchronizer = 0 + if int(local_replication) == 0: + if int(is_sparse) == 0: + ret[3][0] = 1 + else: + ret[3][1] = 1 + else: + if int(is_sparse) == 0: + ret[3][2] = 1 + else: + ret[3][3] = 1 + else: + ret[3][4] = 1 + ret[4] = np.array([size_ratio, bd, num_replicas]) + + return np.concatenate(ret) + +def connvert_feature(strategy, resource_spec, graph_item): + + cluster = SSHCluster(resource_spec) + device_resolver = DeviceResolver(cluster) + graph_replicas = [_resolve_device_address(k, device_resolver) for k, v in resource_spec.gpu_devices] + # bandwidth + network_bandwidth = np.array([resource_spec.network_bandwidth[device.split(':')[0]] for device, _ in resource_spec.cpu_devices]) + network_bandwidth = network_bandwidth + min_network_bandwidth = network_bandwidth.min() + # Other information + cpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.cpu_devices] + gpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.gpu_devices] + max_num_local_replica = _max_num_local_replica(graph_replicas, cluster) + total_num_local_replica = len(graph_replicas) + worker_num_replicas = [_num_local_replica(cpu_worker, graph_replicas, cluster) for cpu_worker in cpu_worker_list] + + num_vars = 0 + total_size_vars = 0 + for var_op, var in graph_item.trainable_var_op_to_var.items(): + num_vars += 1 + if var.initial_value.shape.ndims: + var_helper = VariableHelper(var, graph_item) + if var_helper.is_sparse: + total_size_vars += get_sparse_var_bits(np.prod(var_helper.shape)) + else: + total_size_vars += get_dense_var_bits(np.prod(var_helper.shape), var.dtype) + assert num_vars < 
MAX_NUM_VARS, num_vars + var_partition_features = np.zeros((MAX_NUM_PARS, FEATURE_SIZE-4)).astype(np.float32) + partition_indice = np.ones(MAX_NUM_PARS).astype(np.float32) * (MAX_NUM_VARS - 1) + + cnt = 0 + for node_id, node in enumerate(strategy.node_config): + var_name = node.var_name + for var_op, var in graph_item.trainable_var_op_to_var.items(): + if var.name == var_name: + break + var_helper = VariableHelper(var, graph_item) + + if node.partitioner: + pc = PartitionerConfig(partition_str=node.partitioner) + for i, part in enumerate(node.part_config): + part_helper = PartHelper(i, var, pc) + synchronizer = getattr(part, part.WhichOneof('synchronizer')) + reduction_destination = getattr(synchronizer, 'reduction_destination', None) + device = _resolve_device_address(reduction_destination if reduction_destination else var.device, + device_resolver) + if device == '': + assert(isinstance(synchronizer, AllReduceSynchronizer)) + device = None + bd = min_network_bandwidth + num_replicas = 0 + else: + device = cpu_worker_list.index(device) + bd = network_bandwidth[device] + num_replicas = worker_num_replicas[device] + + if var_helper.is_sparse: + size_ratio = get_sparse_var_bits(np.prod(part_helper.shape))/total_size_vars + else: + size_ratio = get_dense_var_bits(np.prod(part_helper.shape), var_helper.dtype)/total_size_vars + var_partition_features[cnt] = to_numpy(synchronizer, device, size_ratio, var_helper.is_sparse, bd, num_replicas) + partition_indice[cnt] = node_id + cnt += 1 + else: + synchronizer = getattr(node, node.WhichOneof('synchronizer')) + reduction_destination = getattr(synchronizer, 'reduction_destination', None) + device = _resolve_device_address(reduction_destination if reduction_destination else var.device, + device_resolver) + if device == '': + assert(isinstance(synchronizer, AllReduceSynchronizer)) + device = None + bd = min_network_bandwidth + num_replicas = 0 + else: + device = cpu_worker_list.index(device) + bd = network_bandwidth[device] + num_replicas = worker_num_replicas[device] + + if var_helper.is_sparse: + size_ratio = get_sparse_var_bits(np.prod(var_helper.shape))/total_size_vars + else: + size_ratio = get_dense_var_bits(np.prod(var_helper.shape), var_helper.dtype)/total_size_vars + var_partition_features[cnt] = to_numpy(synchronizer, device, size_ratio, var_helper.is_sparse, bd, num_replicas) + partition_indice[cnt] = node_id + cnt += 1 + return var_partition_features, partition_indice, np.array(node_id+1) + +def create_predefined_features(strategy, resource_spec, predefined_simulator): + + var_sync_time, vars, resource = predefined_simulator.predefined_sync_time(strategy, resource_spec) + + features = [] + for var_name, sync_time in var_sync_time.items(): + if isinstance(sync_time, list) or isinstance(sync_time, tuple): # send_time, receive_time in PS strategies. 
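+ # Descriptive note (added): PS sync times arrive as a (send_time, receive_time) pair, AR as a
+ # single dict; the branch below sums both transmission legs for PS and flags is_ps accordingly.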
+ transmission = sync_time[0]['transmission'] + sync_time[1]['transmission'] + sync_time = sync_time[0] + is_ps = True + else: # AR + transmission = sync_time['transmission'] + is_ps = False + + network_overhead = sync_time['network_overhead'] + gpu_kernel_memory_latency = sync_time['gpu_kernel_memory_latency'] + + feat = [transmission, network_overhead, gpu_kernel_memory_latency, float(is_ps)] + features.append(feat) + features = np.array(features, dtype=np.float) + return features + +def extract_graph_item(graph_item): + total_size_vars = 0 + name2var = {} + name2var_helper = {} + for var_op, var in graph_item.trainable_var_op_to_var.items(): + name2var[var.name] = var + var_helper = VariableHelper(var, graph_item) + name2var_helper[var.name] = var_helper + if var.initial_value.shape.ndims: + if var_helper.is_sparse: + total_size_vars += get_sparse_var_bits(np.prod(var_helper.shape)) + else: + total_size_vars += get_dense_var_bits(np.prod(var_helper.shape), var.dtype) + + return total_size_vars, name2var, name2var_helper + +def wrap_fn(queue, idx, run_worker, rs, st): + ret = run_worker(rs, st) + queue.put((idx, ret)) + +def convert_feature_batch(strategys, resource_specs, total_size_vars, name2var, name2var_helper, _batch_size_per_gpu, _seq_len): + + def var_ps_time(var_size_to_transfer, is_sparse, device, dtype, local_replication, network_bandwidth_map, max_num_local_replica, cpu_worker_list, gpu_worker_list, network_overhead=0.0, gpu_kernel_memory_latency=0.0): + """Compute synchronization time of a variable in PS strategy.""" + def _helper(worker_list, worker_num_replicas=None): + if worker_num_replicas is None: + worker_num_replicas = [1.0] * len(worker_list) + + this_server_time = 0 + # network transfer: sum up all workers time. equals to the time cost of this server. + # TODO(Hao): didn't consider any parallelization among partitions + for k, worker in enumerate(worker_list): + if _resolved_devices_on_diff_machine(device, worker): + if is_sparse: + this_worker_size = get_sparse_var_bits(var_size_to_transfer) * worker_num_replicas[k] + else: + this_worker_size = get_dense_var_bits(var_size_to_transfer, dtype) + this_server_time += this_worker_size / network_bandwidth_map[device][worker] + + return { + 'transmission': this_server_time, + 'network_overhead': len(worker_list), + 'gpu_kernel_memory_latency': max_num_local_replica, + } + + send_time = _helper(cpu_worker_list) + if local_replication: + receive_time = _helper(cpu_worker_list) + else: + receive_time = _helper(gpu_worker_list) + + return send_time, receive_time + + def var_ar_time(var_size_to_transfer, og_shape, dtype, compressor, max_num_local_replica, cpu_worker_list, network_bandwidth_map, network_overhead=0.0, gpu_kernel_memory_latency=0.0): + """Compute synchronization time of a variable in AR strategy.""" + worker_list = cpu_worker_list + num_workers = len(worker_list) + min_bandwidth = None + for i in range(num_workers): + for j in range(i, num_workers): + if min_bandwidth is None: + min_bandwidth = network_bandwidth_map[worker_list[j]][worker_list[i]] + else: + min_bandwidth = min(min_bandwidth, network_bandwidth_map[worker_list[j]][worker_list[i]]) + + # Compressor + if compressor == "PowerSGDCompressor" or compressor == 3: + rank = 10 # currently using default value. So hardcode here. # todo: confirm + # assume var must be a dense variable. 
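+ # Descriptive note (added): PowerSGD transmits two low-rank factors instead of the full (n, m)
+ # gradient, so the size estimate below is n*rank + m*rank float32 values; tensors with more than
+ # two dims are first flattened to (n, m), and vectors/scalars are sent uncompressed.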
+ ndims = len(og_shape) + if ndims <= 1: # no compress + size_to_transfer = var_size_to_transfer + else: + if ndims > 2: + n = og_shape[0] + m = 1 + for s in og_shape[1:]: + m *= s # tensor's shape (n, m) + else: + n, m = og_shape[0], og_shape[1] + size_to_transfer = n * rank + m * rank + dtype = tf.float32 + elif compressor == "HorovodCompressorEF" or compressor == "HorovodCompressor" \ + or compressor == 2 or compressor == 1: + size_to_transfer = var_size_to_transfer + dtype = tf.float32 + elif compressor == "NoneCompressor" or compressor == 0: + size_to_transfer = var_size_to_transfer + dtype = dtype + else: + raise ValueError('Compressor does not exist: {}'.format(compressor)) + + time = get_dense_var_bits(size_to_transfer, dtype) / min_bandwidth + + return { + 'transmission': time, + 'network_overhead': 1, # len(worker_list), + 'gpu_kernel_memory_latency': max_num_local_replica, + } + + def network_bandwidth2(resource_spec: ResourceSpec, device_resolver: DeviceResolver): + """Calculates all P2P network bandwidths between nodes in the cluster.""" + devices = [device for device, _ in resource_spec.devices] + resolved_devices = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.devices] + gpu_cpu_bw = 10000. # hardcode for now + network_bandwidth = {} # key: + for i in range(len(devices)): + if resolved_devices[i] not in network_bandwidth: + network_bandwidth[resolved_devices[i]] = {} + for j in range(i, len(devices)): + if resolved_devices[j] not in network_bandwidth: + network_bandwidth[resolved_devices[j]] = {} + ip_i = devices[i].split(':')[0] + ip_j = devices[j].split(':')[0] + if ip_i != ip_j: + network_bandwidth[resolved_devices[i]][resolved_devices[j]] \ + = GIGABITS * resource_spec.network_bandwidth[ip_i] + network_bandwidth[resolved_devices[j]][resolved_devices[i]] \ + = GIGABITS * resource_spec.network_bandwidth[ip_j] + else: + network_bandwidth[resolved_devices[i]][resolved_devices[j]] = GIGABITS * gpu_cpu_bw + network_bandwidth[resolved_devices[j]][resolved_devices[i]] = GIGABITS * gpu_cpu_bw + return network_bandwidth + + def run_worker(resource_spec, strategy): + cluster = SSHCluster(resource_spec) + device_resolver = DeviceResolver(cluster) + graph_replicas = [_resolve_device_address(k, device_resolver) for k, v in resource_spec.gpu_devices] + # bandwidth + network_bandwidth = np.array([resource_spec.network_bandwidth[device.split(':')[0]] for device, _ in resource_spec.cpu_devices]) + min_network_bandwidth = network_bandwidth.min() + network_bandwidth_map = network_bandwidth2(resource_spec, device_resolver) + # Other information + cpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.cpu_devices] + gpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.gpu_devices] + max_num_local_replica = _max_num_local_replica(graph_replicas, cluster) + total_num_local_replica = len(graph_replicas) + worker_num_replicas = [_num_local_replica(cpu_worker, graph_replicas, cluster) for cpu_worker in cpu_worker_list] + + var_partition_features = np.zeros((MAX_NUM_PARS, FEATURE_SIZE)).astype(np.float32) + partition_indice = np.ones(MAX_NUM_PARS).astype(np.float32) * (MAX_NUM_VARS - 1) + cnt = 0 + for node_id, node in enumerate(strategy.node_config): + var_name = node.var_name + var = name2var[var_name] + var_helper = name2var_helper[var_name] + + if node.partitioner: + pc = PartitionerConfig(partition_str=node.partitioner) + for i, part in enumerate(node.part_config): + 
synchronizer = getattr(part, part.WhichOneof('synchronizer')) + reduction_destination = getattr(synchronizer, 'reduction_destination', None) + device = _resolve_device_address(reduction_destination if reduction_destination else var.device, + device_resolver) + if device == '': + assert(isinstance(synchronizer, AllReduceSynchronizer)) + device_id = None + bd = min_network_bandwidth + num_replicas = 0 + else: + device_id = cpu_worker_list.index(device) + bd = network_bandwidth[device_id] + num_replicas = worker_num_replicas[device_id] + + par_shape = var.initial_value.shape.as_list() + dim_size = par_shape[pc.axis] // pc.num_shards + extras = par_shape[pc.axis] % pc.num_shards + if i < extras: + dim_size += 1 + par_shape[pc.axis] = dim_size + + size_to_transfer =np.prod(par_shape) + if var_helper.is_sparse: + raise Error + size_ratio = get_sparse_var_bits(size_to_transfer)/total_size_vars + else: + size_ratio = get_dense_var_bits(size_to_transfer, var_helper.dtype)/total_size_vars + + if isinstance(synchronizer, AllReduceSynchronizer): + sync_time = var_ar_time(size_to_transfer, par_shape, var_helper.dtype, getattr(synchronizer, 'compressor', None), max_num_local_replica, cpu_worker_list, network_bandwidth_map) + transmission = sync_time['transmission'] + is_ps = False + else: + sync_time = var_ps_time(size_to_transfer, var_helper.is_sparse, device, var_helper.dtype, getattr(synchronizer, 'local_replication', None), network_bandwidth_map, max_num_local_replica, cpu_worker_list, gpu_worker_list) + transmission = sync_time[0]['transmission'] + sync_time[1]['transmission'] + sync_time = sync_time[0] + is_ps = True + network_overhead = sync_time['network_overhead'] + gpu_kernel_memory_latency = sync_time['gpu_kernel_memory_latency'] + + var_partition_features[cnt] = np.concatenate([to_numpy(synchronizer, device_id, size_ratio, var_helper.is_sparse, bd, num_replicas), np.array([transmission, network_overhead, gpu_kernel_memory_latency, float(is_ps)])]) + partition_indice[cnt] = node_id + cnt += 1 + else: + synchronizer = getattr(node, node.WhichOneof('synchronizer')) + reduction_destination = getattr(synchronizer, 'reduction_destination', None) + device = _resolve_device_address(reduction_destination if reduction_destination else var.device, + device_resolver) + if device == '': + assert(isinstance(synchronizer, AllReduceSynchronizer)) + device_id = None + bd = min_network_bandwidth + num_replicas = 0 + else: + device_id = cpu_worker_list.index(device) + bd = network_bandwidth[device_id] + num_replicas = worker_num_replicas[device_id] + + size_to_transfer =np.prod(var_helper.shape) + if var_helper.is_sparse: + raise Error + size_ratio = get_sparse_var_bits(size_to_transfer)/total_size_vars + else: + size_ratio = get_dense_var_bits(size_to_transfer, var_helper.dtype)/total_size_vars + + if isinstance(synchronizer, AllReduceSynchronizer): + sync_time = var_ar_time(size_to_transfer, var.initial_value.shape.as_list(), var_helper.dtype, getattr(synchronizer, 'compressor', None), max_num_local_replica, cpu_worker_list, network_bandwidth_map) + transmission = sync_time['transmission'] + is_ps = False + else: + sync_time = var_ps_time(size_to_transfer, var_helper.is_sparse, device, var_helper.dtype, getattr(synchronizer, 'local_replication', None), network_bandwidth_map, max_num_local_replica, cpu_worker_list, gpu_worker_list) + transmission = sync_time[0]['transmission'] + sync_time[1]['transmission'] + sync_time = sync_time[0] + is_ps = True + network_overhead = sync_time['network_overhead'] + 
gpu_kernel_memory_latency = sync_time['gpu_kernel_memory_latency'] + + var_partition_features[cnt] = np.concatenate([to_numpy(synchronizer, device_id, size_ratio, var_helper.is_sparse, bd, num_replicas), np.array([transmission, network_overhead, gpu_kernel_memory_latency, float(is_ps)])]) + partition_indice[cnt] = node_id + cnt += 1 + return (var_partition_features, partition_indice, np.array(node_id+1)) + + # t1 =time.time() + # with multiprocessing.Pool(processes=32) as pool: + # results = pool.starmap(run_worker, zip(resource_specs, strategys)) + # ret1, ret2, ret3 = [], [], [] + # for tmp in results: + # ret1.append(tmp[0]); ret2.append(tmp[1]); ret3.append(tmp[2]) + + q = Queue() + rets = [] + prs = [] + for idx, (arg1, arg2) in enumerate(zip(resource_specs, strategys)): + prs.append(Process(target=wrap_fn, args=(q, idx, run_worker, arg1, arg2))) + prs[-1].start() + for pr in prs: + ret = q.get() # will block + rets.append(ret) + for pr in prs: + pr.join() + + ret1, ret2, ret3 = [], [], [] + for tmp in sorted(rets, key=lambda x: x[0]): + ret1.append(tmp[1][0]); ret2.append(tmp[1][1]); ret3.append(tmp[1][2]) + # print(time.time() - t1) + + # t1 =time.time() + # ret1, ret2, ret3 = [], [], [] + # for rs, st in zip(resource_specs, strategys): + # tmp = run_worker(rs, st) + # ret1.append(tmp[0]); ret2.append(tmp[1]); ret3.append(tmp[2]) + # print(time.time() - t1) + return np.stack(ret1), np.stack(ret2), np.stack(ret3) + + +class RankRNNSimulatorPenalty(SimulatorBase): + """Simulates strategies for a given graph and resource spec.""" + + def __init__(self, + original_graph_item_path, + num_rnn_layers, + in_layers, + out_layers, + fetches=None, + batch_size=1, + seq_len=1, + checkpoint=None): + + super(RankRNNSimulatorPenalty, self).__init__(original_graph_item_path=original_graph_item_path) + print("It's using RankNet simulator.") + self._fetches = fetches + self._batch_size_per_gpu = batch_size + self._seq_len = seq_len + self._checkpoint = checkpoint + self._predefined_simulator=PredefinedSimulator(original_graph_item_path=original_graph_item_path, + batch_size=self._batch_size_per_gpu, + seq_len=self._seq_len) + if self._checkpoint: + self._model = RankRNN(num_rnn_layers=num_rnn_layers, in_layers=in_layers, out_layers=out_layers).to(TORCH_DEVICE) + self._model.load_state_dict(torch.load(self._checkpoint, map_location=torch.device('cpu'))) + + total_size_vars, name2var, name2var_helper = extract_graph_item(self._original_graph_item) + self.total_size_vars = total_size_vars + self.name2var = name2var + self.name2var_helper = name2var_helper + + def simulate(self, strategy, resource_spec, strategy_path=None, checkpoint=None): + score, feature = self.predict(strategy, resource_spec, strategy_path, checkpoint) + return score.view(-1).data.cpu().numpy(), feature.data.cpu().numpy() + + def predict(self, + strategy, + resource_spec, + strategy_path=None, + checkpoint=None): + if checkpoint is None: + if self._checkpoint is None: + raise ValueError("checkpoint is None: {}".format(checkpoint)) + else: + model = self._model + else: + model = RankRNN().to(TORCH_DEVICE) + model.load_state_dict(torch.load(checkpoint)) + if type(strategy) == list and type(resource_spec) == list: + + var_partition_features, partition_indice, var_num = convert_feature_batch(strategy, resource_spec, self.total_size_vars, self.name2var, self.name2var_helper, self._batch_size_per_gpu, self._seq_len) + + var_partition_features = torch.from_numpy(var_partition_features).to(TORCH_DEVICE) + partition_indice = 
torch.from_numpy(partition_indice).to(TORCH_DEVICE) + var_num = torch.from_numpy(var_num).to(TORCH_DEVICE) + + return model(var_partition_features, partition_indice, var_num, True) + else: + if strategy_path and os.path.isfile((strategy_path+'.npz').replace('strategies', 'npz')): + loaded = np.load((strategy_path+'.npz').replace('strategies', 'npz')) + var_partition_features, partition_indice, var_num, _ = \ + loaded['x1'], loaded['x2'], loaded['x3'], loaded['y'] + else: + var_partition_features, partition_indice, var_num = \ + connvert_feature(strategy, resource_spec, self._original_graph_item) + + if strategy_path and os.path.isfile((strategy_path+'_pdf.npz').replace('strategies', 'npz')): + loaded = np.load((strategy_path+'_pdf.npz').replace('strategies', 'npz')) + predefined_features = loaded['x4'] + else: + predefined_features = create_predefined_features(strategy, resource_spec, self._predefined_simulator) + + var_partition_features = np.concatenate([var_partition_features, np.concatenate([predefined_features, np.zeros((MAX_NUM_PARS-predefined_features.shape[0], predefined_features.shape[1]))], 0)], 1) + + var_partition_features = torch.from_numpy(var_partition_features).unsqueeze(0).to(TORCH_DEVICE) + partition_indice = torch.from_numpy(partition_indice).unsqueeze(0).to(TORCH_DEVICE) + var_num = torch.from_numpy(var_num).unsqueeze(0).to(TORCH_DEVICE) + + return model(var_partition_features, partition_indice, var_num, True) + +class RankNetTrainer(): + + def __init__(self, + batch_size_per_gpu=256, + seq_len=1, + seed=1): + self._batch_size_per_gpu = batch_size_per_gpu + self._seq_len = seq_len + self.graph_items = {k:GraphItem.deserialize(v) for k, v in GRAPH_ITEM_PATHS.items()} + self.predefined_simulators = {k: PredefinedSimulator(original_graph_item_path=v, + batch_size=self._batch_size_per_gpu, + seq_len=self._seq_len) for k, v in GRAPH_ITEM_PATHS.items()} + self.best_acc = 0. 
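+ # Descriptive note (added): best_acc tracks the best validation accuracy seen across all train()
+ # calls in a hyper-parameter sweep; train() only writes a checkpoint when this value improves.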
+ print("It's using RankNet trainer.") + + def load_data(self, path_list, train_patterns=[('ncf', 0)], valid_patterns='same'): + features = {k: [[[], [], [], []], [[], [], [], []]] for k, _ in GRAPH_ITEM_PATHS.items()} + for training_path in path_list: + for path in Path(training_path).rglob('strategies'): + strategy_paths = glob.glob(os.path.join(path, '*')) + # strategy_paths = np.random.permutation(list(strategy_paths)) + for strategy_path in strategy_paths: + if 'json' in strategy_path or \ + 'bert_large_batch_8_orca_16_group_2/' in strategy_path: + continue + model = get_model(strategy_path) + if model is None: + if not ('densenets169' in strategy_path or 'densenets201' in strategy_path): + assert False, strategy_path + continue + rs_path = strategy_path.replace('strategies', 'resource_specs') + runtime_path = strategy_path.replace('strategies', 'runtimes') + npz_path = (strategy_path+'.npz').replace('strategies', 'npz') + if not os.path.isfile(rs_path): + rs_path += '.yml' + if not (os.path.isfile(rs_path) and os.path.isfile(runtime_path)): + continue + if not os.path.exists(os.path.dirname(npz_path)): + os.makedirs(os.path.dirname(npz_path)) + + if not os.path.isfile(npz_path): + strategy = Strategy.deserialize(path=strategy_path) + rs = ResourceSpec(resource_file=rs_path) + var_partition_features, partition_indice, var_num = \ + connvert_feature(strategy, rs, self.graph_items[model]) + label = np.array(json.load(open(runtime_path))['average']) + np.savez_compressed(npz_path, x1=var_partition_features, x2=partition_indice, x3=var_num, y=label) + else: + loaded = np.load(npz_path) + var_partition_features, partition_indice, var_num, label = \ + loaded['x1'], loaded['x2'], loaded['x3'], loaded['y'] + + if not os.path.isfile(npz_path.replace('.npz', '_pdf.npz')): + predefined_features = create_predefined_features(Strategy.deserialize(path=strategy_path), ResourceSpec(resource_file=rs_path), self.predefined_simulators[model]) + np.savez_compressed(npz_path.replace('.npz', '_pdf.npz'), x4=predefined_features) + else: + loaded = np.load(npz_path.replace('.npz', '_pdf.npz')) + predefined_features = loaded['x4'] + var_partition_features = np.concatenate([var_partition_features, np.concatenate([predefined_features, np.zeros((MAX_NUM_PARS-predefined_features.shape[0], predefined_features.shape[1]))], 0)], 1) + + is_aws = int('g3' in strategy_path or 'g4' in strategy_path or 'aws' in strategy_path) # comment here + # is_aws = int('vgg16_orca_11_random_rejection-4_trial-100-_expolre-2000_0.83-model_embedding_sim-weight-1_max-par-40/' in strategy_path) + # print(model, 'orca' if is_aws == 0 else 'aws', strategy_path.split('/')[-3]) + features[model][is_aws][0].append(var_partition_features) + features[model][is_aws][1].append(partition_indice) + features[model][is_aws][2].append(var_num) + features[model][is_aws][3].append(label) + + for k, _ in GRAPH_ITEM_PATHS.items(): + for i1 in range(2): + for i2 in range(4): + if len(features[k][i1][i2]) > 1: + features[k][i1][i2] = np.stack(features[k][i1][i2]).astype(np.float16) + print(k, 'orca' if i1 == 0 else 'aws', features[k][i1][i2].shape) + else: + features[k][i1][i2] = None + + train_features = np.concatenate([features[model_][is_aws_][0] for model_, is_aws_ in train_patterns if features[model_][is_aws_][0] is not None], 0) + train_par_indices = np.concatenate([features[model_][is_aws_][1] for model_, is_aws_ in train_patterns if features[model_][is_aws_][1] is not None], 0) + train_var_nums = np.concatenate([features[model_][is_aws_][2] for 
model_, is_aws_ in train_patterns if features[model_][is_aws_][2] is not None], 0) + train_labels = np.concatenate([features[model_][is_aws_][3] for model_, is_aws_ in train_patterns if features[model_][is_aws_][3] is not None], 0) + + if type(valid_patterns[0]) == str and valid_patterns[0] == 'same': + rng = np.random.RandomState(1) + permt = rng.permutation(train_features.shape[0]) + split = int(len(permt) * 0.7) + val_features, val_par_indices, val_var_nums, val_labels = train_features[permt[split:]], train_par_indices[permt[split:]], train_var_nums[permt[split:]], train_labels[permt[split:]] + train_features, train_par_indices, train_var_nums, train_labels = train_features[permt[:split]], train_par_indices[permt[:split]], train_var_nums[permt[:split]], train_labels[permt[:split]] + else: + val_features = np.concatenate([features[model_][is_aws_][0] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][0] is not None], 0) + val_par_indices = np.concatenate([features[model_][is_aws_][1] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][1] is not None], 0) + val_var_nums = np.concatenate([features[model_][is_aws_][2] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][2] is not None], 0) + val_labels = np.concatenate([features[model_][is_aws_][3] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][3] is not None], 0) + + # comment here + rng = np.random.RandomState(1) + permt = rng.permutation(val_features.shape[0]) + split = int(len(permt) * 0.7) + train_features, train_par_indices, train_var_nums, train_labels = np.concatenate([train_features, val_features[permt[:split]]], 0), np.concatenate([train_par_indices, val_par_indices[permt[:split]]], 0), np.concatenate([train_var_nums, val_var_nums[permt[:split]]], 0), np.concatenate([train_labels, val_labels[permt[:split]]], 0) + + val_features, val_par_indices, val_var_nums, val_labels = val_features[permt[split:]], val_par_indices[permt[split:]], val_var_nums[permt[split:]], val_labels[permt[split:]] + label_max = max(train_labels.max(), val_labels.max()) + label_min = min(train_labels.min(), val_labels.min()) + train_labels = (train_labels-label_min)/(label_max-label_min) + val_labels = (val_labels-label_min)/(label_max-label_min) + print(train_features.shape, val_features.shape, train_features.max(), train_features.min(), val_features.max(), val_features.min(), train_labels.max(), val_labels.min()) + + ## train the model + trainset = TrainTensorDataset((torch.from_numpy(train_features).half().to(TORCH_DEVICE), torch.from_numpy(train_par_indices).half().to(TORCH_DEVICE), torch.from_numpy(train_var_nums).half().to(TORCH_DEVICE), torch.from_numpy(train_labels).half().to(TORCH_DEVICE))) + testset = torch.utils.data.TensorDataset(torch.from_numpy(val_features).half().to(TORCH_DEVICE), torch.from_numpy(val_par_indices).half().to(TORCH_DEVICE), torch.from_numpy(val_var_nums).half().to(TORCH_DEVICE), torch.from_numpy(val_labels).half().to(TORCH_DEVICE)) + self.trainloader = torch.utils.data.DataLoader(dataset=trainset, + batch_size=BATCH_SIZE, + shuffle=True) + self.testloader = torch.utils.data.DataLoader(dataset=testset, + batch_size=32, + shuffle=False) + + def train(self, name='', num_epochs=200, checkpoint=None): + + checkpoint_path = 'model_on_{}.ckpt'.format(name) + print('LSTM layers: ', NUM_RNN_LAYERS, 'score th: ', SCORE_TH, 'lr: ', LR, 'wd: ', WD,'use data aug: ', DATA_AUG, 'OUT_LAYERS: ', OUT_LAYERS, 'IN_LAYERS: ',IN_LAYERS) + + np.random.seed(1) + torch.manual_seed(1) + 
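+        # The loop below optimizes a RankNet-style pairwise objective: within each mini-batch every
+        # pair of strategies is compared, and softplus(-(s_i - s_j)) penalizes pairs whose predicted
+        # ordering disagrees with the measured runtimes. When two runtimes are within SCORE_TH of
+        # each other, the strategy with more partitions is treated as the slower one -- the
+        # "penalty" that discourages excessive partitioning.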
torch.cuda.manual_seed_all(1) + model = RankRNN(num_rnn_layers=NUM_RNN_LAYERS, out_layers=OUT_LAYERS, in_layers=IN_LAYERS).to(TORCH_DEVICE) + if checkpoint: + model.load_state_dict(torch.load(checkpoint)) + optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=WD) + + best_val_acc = 0. + for epoch in range(num_epochs): + if epoch == int(num_epochs*2./5. - 1): + for param_group in optimizer.param_groups: param_group['lr'] = 3e-4 + if epoch == int(num_epochs*4./5. - 1): + for param_group in optimizer.param_groups: param_group['lr'] = 1e-4 + + labels = [] + outputs = [] + for i, (features_b, par_indices_b, var_nums_b, labels_b) in enumerate(self.trainloader): + + # Forward pass + outputs_b = model(features_b, par_indices_b, var_nums_b).squeeze() + + par_cnt = (par_indices_b.int() != MAX_NUM_VARS - 1).int().sum(1) + + true_comp = ( + (labels_b[:, None]+SCORE_TH>labels_b[None,:]).int()*(par_cnt[:, None] > par_cnt[None, :]).int() + + (labels_b[:, None]-SCORE_TH>labels_b[None,:]).int()*(par_cnt[:, None] < par_cnt[None, :]).int() + + (labels_b[:, None] > labels_b[None,:]).int() * (par_cnt[:, None] == par_cnt[None, :]).int() + ) > 0 + true_comp = true_comp.float() * 2 - 1 + pred_comp = outputs_b[:, None] - outputs_b[None, :] + loss = (1 - true_comp) * pred_comp / 2 + torch.nn.functional.softplus(-pred_comp) + loss = loss.tril(-1).mean() + + # Backward and optimize + optimizer.zero_grad() + loss.backward() + torch.nn.utils.clip_grad_norm_(model.stem_rnn.parameters(), 0.25) + optimizer.step() + + outputs.append(outputs_b) + labels.append(labels_b) + + labels = torch.cat(labels) + outputs = torch.cat(outputs) + true_comp = (labels[:, None] > labels[None, :]) + pred_comp = (outputs[:, None] > outputs[None, :]) + equal = (true_comp == pred_comp).int() + train_acc = equal.tril(-1).sum() * 2. /float(equal.shape[0])/(float(equal.shape[0]) - 1) + + with torch.no_grad(): + labels = [] + outputs = [] + for features_b, par_indices_b, var_nums_b, labels_b in self.testloader: + + # Forward pass + outputs_b = model(features_b, par_indices_b, var_nums_b).squeeze() + outputs.append(outputs_b) + labels.append(labels_b) + + labels = torch.cat(labels) + outputs = torch.cat(outputs) + true_comp = (labels[:, None] > labels[None, :]) + pred_comp = (outputs[:, None] > outputs[None, :]) + equal = (true_comp == pred_comp).int() + acc = equal.tril(-1).sum() * 2. 
/float(equal.shape[0])/(float(equal.shape[0]) - 1) + if acc.item() > best_val_acc: + best_val_acc = acc.item() + if best_val_acc > self.best_acc: + print('Saved model @ acc', best_val_acc) + torch.save(model.state_dict(), checkpoint_path) + self.best_acc = best_val_acc + # print('Saved model to {}'.format(checkpoint_path)) + if epoch == num_epochs - 1: + print('Epoch: {}, training acc: {:.4f}, test acc: {:.4f}, best acc: {:.4f}, overall best acc: {:.4f}'.format(epoch, train_acc.item(), acc.item(), best_val_acc, self.best_acc)) + return checkpoint_path + + +if __name__ == '__main__': + + if False: + trainer = RankNetTrainer() + trainer.load_data([ + '/users/hzhang2/oceanus_cost_model_training_data/vgg16', + # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-11-20', + # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-9-20', + # '/users/hzhang2/oceanus_cost_model_training_data/ncf', + # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-13-20', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_12_orca_16', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_3', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_4', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_5', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_random_orca_11', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert-aws/bert-large-aws4g4', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert-aws/bert_large_random_search_aws_4_ps_only', + # '/users/hzhang2/oceanus_cost_model_training_data/densenet', + # '/users/hzhang2/oceanus_cost_model_training_data/inceptionv3', + # '/users/hzhang2/oceanus_cost_model_training_data/resnet101', + # '/users/hzhang2/oceanus_cost_model_training_data/resnet50', + ], + [ + ('vgg16', 1), #('vgg16', 1), + # ('ncf', 0), #('ncf', 1), + # ('bert_large', 1), #('bert_large', 1), + # not used: + # ('densenet121', 0), ('densenet121', 1), + # ('inceptionv3', 0), ('inceptionv3', 1), + # ('resnet101', 0), ('resnet101', 1), + # ('resnet50', 0), ('resnet50', 1), + # ('bert_12l', 0), ('bert_12l', 1), + # ('bert_6l', 0), ('bert_6l', 1), + # ('bert_3l', 0), ('bert_3l', 1), + ], + [ + # ('vgg16', 1), + # ('ncf', 1), + # ('bert_large', 1), + 'same', + ], + ) + + for p2 in [0.01, 0.03]: + for p3 in [1e-3, 3e-3, 1e-4, 3e-4, 5e-3]: + for p4 in [1e-3, 1e-4, 3e-4, 5e-4, 5e-5, 2e-3, ]: + for p1 in [3, 4, 2]: + for p5 in [2, 3]: + for p6 in [1, 2]: + NUM_RNN_LAYERS, SCORE_TH, LR, WD, IN_LAYERS, OUT_LAYERS = p1, p2, p3, p4, p5, p6 + checkpoint_path = trainer.train(name='vgg-aws-new-2', num_epochs=200) + exit() + else: + checkpoint_path = '/users/hzhang2/projects/pycharm/zhijie/5-9-2020/oceanus-zhijie/arion/simulator/models/model_on_bert-aws-only.ckpt' + test_list = [ + '/users/hzhang2/oceanus_cost_model_training_data/bert/bert-aws/bert_large_random_search_aws_4_ps_only', + # '/users/hzhang2/oceanus_cost_model_training_data/vgg16/vgg16_orca_15', + # '/users/hzhang2/oceanus_cost_model_training_data/vgg16/vgg_random_orca_11', #TARGET: 0.9 + # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-13-20/ncf_large_adam_random_search_aws_4', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_12_orca_16', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16', + # 
'/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_3', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_4', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_5', + ] + + for data_folder in test_list: + simulator = RankRNNSimulatorPenalty3(GRAPH_ITEM_PATHS[get_model(data_folder)], + 4, + 2, + 1, + batch_size=256, + seq_len=1, + checkpoint=checkpoint_path) + + runtimes_folder = os.path.join(data_folder, 'runtimes') + results = {} + averages= [] + scores = [] + strategys = [] + rss = [] + strategy_paths = [] + for name in os.listdir(runtimes_folder): + strategy_path = os.path.join(data_folder, 'strategies', name) + rs_path = os.path.join(data_folder, 'resource_specs', name ) + + if not os.path.isfile(rs_path): + rs_path += '.yml' + runtime_path = os.path.join(runtimes_folder, name) + + strategy_paths.append(strategy_path) + + with open(runtime_path, 'r') as f: + runtimes = json.load(f) + average = np.array(runtimes['average']) + + s = Strategy.deserialize(strategy_path) + rs = ResourceSpec(resource_file=rs_path) + strategys.append(s) + rss.append(rs) + + averages.append(average) + + # for tmp1, tmp2, tmp3 in zip(strategys, rss, strategy_paths): + # scores.append(simulator.simulate(tmp1, tmp2, tmp3)[0]) + # print(np.stack(scores).reshape(-1)) + + scores = simulator.simulate(strategys, rss)[0] + print(scores) + + # sorted_by_runtime = {k: v for k, v in sorted(results.items(), key=lambda item: item[1][0])} + # # sorted_by_scores = {k: v for k, v in sorted(res.items(), key=lambda item: item[1][1])} + # # sorted_by_latency = {k: v for k, v in sorted(res.items(), key=lambda item: item[1][2])} + # print('Sorted by runtime.......................') + # for _, (rt, prediction) in sorted_by_runtime.items(): + # print('runtime {} prediction {}'.format(rt, prediction)) + + y_train = np.array(averages) + test_score = np.array(scores) + true_comp = (y_train.ravel()[:, None] > y_train.ravel()[None, :]) + pred_comp = (test_score.ravel()[:, None] > test_score.ravel()[None, :]) + equal = (true_comp == pred_comp).astype(np.int) + test_acc = np.tril(equal, -1).sum() * 2. 
/ float(equal.shape[0]) / (float(equal.shape[0]) - 1) + + print('Test {} on {}, acc {:.4f}'.format(checkpoint_path, data_folder.split('/')[-1], test_acc)) diff --git a/autodist/simulator/test.py b/autodist/simulator/test.py new file mode 100644 index 0000000..b481208 --- /dev/null +++ b/autodist/simulator/test.py @@ -0,0 +1,17 @@ +from arion.simulator.simulator import Simulator +from arion.strategy import base +from arion.graph_item import GraphItem + +resource_spec_file = '/home/hao.zhang/project/pycharm/ncf-trial/official/recommendation/trial/trialrun_resource_specs/resource_spec_2.yml' +strategy_path = '/home/hao.zhang/oceanus_simulator/ncf_3/strategies/20200505T174311M650364' +original_graph_item_path = '/home/hao.zhang/oceanus_simulator/ncf/strategies/original_graph_item' + +s = base.Strategy.deserialize(strategy_path) + + +simulator = Simulator(resource_file=resource_spec_file, + original_graph_item_path=original_graph_item_path) + +ret = simulator.simulate(s) + +print('finished') diff --git a/autodist/simulator/train_linear.py b/autodist/simulator/train_linear.py new file mode 100644 index 0000000..c7e9438 --- /dev/null +++ b/autodist/simulator/train_linear.py @@ -0,0 +1,123 @@ +import os +import glob +import json +import numpy as np +from collections import OrderedDict +from os.path import expanduser +from sklearn import linear_model +from sklearn.linear_model import Ridge +from arion.simulator.utils import split_dataset + +def create_features(simulation): + runtime_coefficients = simulation['runtime_coefficients'] + var_sync_time = simulation['var_sync_time'] # dict: + + res = OrderedDict({ + 'network_overhead': 0.0, + 'gpu_kenrel_memory_latency': 0.0, + 'constant_factor': 0.0, + 'allreduce_factor': 0.0, + }) + for var_name, sim_time in var_sync_time.items(): + if isinstance(sim_time, list): + # PS strategies + send_time, receive_time = sim_time + res['constant_factor'] += send_time['transmission'] + receive_time['transmission'] + res['network_overhead'] += send_time['network_overhead'] + receive_time['network_overhead'] + res['gpu_kenrel_memory_latency'] += send_time['gpu_kenrel_memory_latency'] + receive_time['gpu_kenrel_memory_latency'] + elif isinstance(sim_time, dict): + # Allreduce strategy + res['allreduce_factor'] += sim_time['transmission'] + res['network_overhead'] += sim_time['network_overhead'] + res['gpu_kenrel_memory_latency'] += sim_time['gpu_kenrel_memory_latency'] + else: + raise ValueError + + # runtime_coefficients = { + # 'transmission': slowest_server_time, + # 'network_overhead': len(worker_list), + # 'gpu_kenrel_memory_latency': max_num_local_replica, + # 'constant': 1.0, + # # possible affecting factors. 
+ # 'var_name': var_name, + # 'strategy': 'ps', + # 'local_proxy': local_proxy, + # 'is_sparse': is_sparse, + # 'server_list': [partition.to_dict() for partition in server_list], + # 'worker_list': worker_list, + # 'cpu_worker_list': cpu_worker_list, + # 'gpu_worker_list': gpu_worker_list, + # 'worker_num_replicas': worker_num_replicas, + # 'max_num_local_replica': max_num_local_replica, + # } + # runtime_coefficients = [ + # runtime_coefficients['transmission'], + # runtime_coefficients['network_overhead'], + # runtime_coefficients['gpu_kenrel_memory_latency'], + # ] + return list(res.values()) + +def load_trial_run_data(data_dir): + runtimes_folders = glob.glob("{}/**/runtimes".format(data_dir), recursive=True) + X = [] + Y = [] + for runtimes_folder in runtimes_folders: + print(runtimes_folder) + runtimes_files = glob.glob(os.path.join(runtimes_folder, '*')) + for runtimes_file in runtimes_files: + # Target + runtime = json.load(open(runtimes_file, 'r')) + y = runtime['average'] + # Features + simulation_file = '/'.join(runtimes_file.split('/')[:-2]) + '/simulations/' + runtimes_file.split('/')[-1] + assert os.path.isfile(simulation_file), 'simulation_file {} does not exist'.format(simulation_file) + simulation = json.load(open(simulation_file, 'r')) + x = create_features(simulation) + X.append(x) + Y.append(y) + return X, Y + +data_dir = os.path.join(expanduser('~'), 'oceanus_simulator/lm1b-patchon') +X, Y = load_trial_run_data(data_dir) +X_train, Y_train, X_valid, Y_valid = split_dataset(X, Y) +print('X_train', X_train.shape, 'Y_train', Y_train.shape, 'X_valid', X_valid.shape, 'Y_valid', Y_valid.shape) + +# Linear regression +lm = linear_model.LinearRegression() +model = lm.fit(X_train, Y_train) +predictions = lm.predict(X_valid) +print('predictions, targets: ') +pt = zip(predictions, Y_valid) +pt = sorted(pt, key=lambda x: x[1]) +for p, t in pt: + print(p, t) +train_score = lm.score(X_train, Y_train) +valid_score = lm.score(X_valid, Y_valid) +print('Linear train_score', train_score) +print('Linear valid_score', valid_score) + +# Ridge regression +ridge = Ridge(alpha=1.0) +ridge.fit(X_train, Y_train) +predictions = ridge.predict(X_valid) +train_score = ridge.score(X_train, Y_train) +valid_score = ridge.score(X_valid, Y_valid) +print('Ridge train_score', train_score) +print('Ridge valid_score', valid_score) + + +# Lasso +lasso = linear_model.Lasso(alpha=0.1) +lasso.fit(X_train, Y_train) +train_score = lasso.score(X_train, Y_train) +valid_score = lasso.score(X_valid, Y_valid) +print('Lasso train_score', train_score) +print('Lasso valid_score', valid_score) + +# ElasticNet +elastic = linear_model.ElasticNet(random_state=0) +elastic.fit(X_train, Y_train) +train_score = elastic.score(X_train, Y_train) +valid_score = elastic.score(X_valid, Y_valid) +print('ElasticNet train_score', train_score) +print('ElasticNet valid_score', valid_score) diff --git a/autodist/simulator/train_predefined_simulator.py b/autodist/simulator/train_predefined_simulator.py new file mode 100644 index 0000000..43bb08b --- /dev/null +++ b/autodist/simulator/train_predefined_simulator.py @@ -0,0 +1,343 @@ +import sys +import os +import numpy as np +import tensorflow as tf +from os.path import expanduser +import tqdm + +from tensorflow.python.eager import context +import tensorflow_ranking as tfr + +from arion.strategy.base import Strategy +from arion.resource_spec import ResourceSpec +from arion.simulator import utils +from arion.simulator.models.predefined_simulator import PredefinedSimulator +from 
arion.simulator.utils import RankingLossKeys + +class TFRIterator: + def __init__(self, X, Y, list_size, batch_size, split, baseline=0.0, scale=1.0): + assert len(X) > 0, 'data: {}'.format(len(X)) + self.X = X + self.Y = Y + self.list_size = list_size + self.baseline = baseline + self.scale = scale + self.batch_size = batch_size + self.split = split + self.n = len(X) + self.num_examples = self.get_num_examples() + print('Split: {},\tnumber of samples: {},\tnumber of examples: {},\tmin of y: {}'.format( + split, len(X), self.num_examples, self.get_min_y())) + + def get_min_y(self): + return np.min(self.Y) + + def get_num_examples(self): + n_examples = 1 + for i in range(self.list_size): + n_examples *= (len(self.X) -1) + return n_examples + + def get_next(self): + xs = [[] for _ in range(self.list_size)] + ys = [] + for i in range(self.batch_size): + y =[] + for j in range(self.list_size): + ri = np.random.randint(self.n) + rx = self.X[ri] + ry = self.Y[ri] + xs[j].append(np.array(rx, dtype=np.float32)) + y.append(ry) + assert ry * self.scale - self.baseline > 0, '{}, {}, {}'.format(ry, self.scale, self.baseline) + ys.append(y) + xs = [np.array(xx, dtype=np.float32) for xx in xs] + ys = np.array(ys, dtype=np.float32) + if self.split == 'train': # normalize y as its used for loss weights. + ys = (ys * self.scale - self.baseline) + + return xs + [ys] + +model_params = { + 'ncf_large_adam_dense': { + 'model_batch_size': 256, + 'model_seq_len': 1, + 'data_dir': [ + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_orca_16_random_search_ar_only', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf-5-11-20/ncf_large_adam_dense_ar_only_by_chunk', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf-5-11-20/ncf_large_adam_dense_ar_only_christy', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf-5-11-20/ncf_large_adam_dense_ar_only_ordered_balanced', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf-5-11-20/ncf_large_adam_dense_ar_only_ordered_balanced_12_12', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf-5-11-20/ncf_large_adam_dense_ar_only_ordered_balanced_20_50', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf-5-11-20/ncf_large_adam_dense_sorted_christy_ordered_balanced_30_50', + '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf-5-11-20/ncf_large_adam_dense_sorted_christy_ordered_balanced_30_50_2', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_orca_16', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_orca_16_random_search_christy_lb', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_orca_16_random_search_christy_lb_ps_only', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_orca_16_real_random', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_orca_8', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_random_search_orca_4', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_random_search_orca_16', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_random_search_linear_cost_model', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_random_search_linear_cost_model_2', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_random_search_linear_cost_model_2', + # 
'/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf-5-9-20/ncf_large_adam_dense_orca_16_christy_lb_if_partition_lb_linear_cost_ps_only', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf-5-9-20/ncf_large_adam_dense_orca_16_christy_lb_if_partition_lb_num_partition_2_32_linear_cost_ps_only', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf-5-9-20/ncf_large_adam_dense_random_search_christy_lb_ps_only_if_partition_lb_ranknet_simulator_2', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf-5-9-20/ncf_large_adam_dense_random_search_christy_lb_ps_only_ranknet_simulator', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_g3.4.25.1_g3.4.25.2', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_g3.4.25.6_g3.4.25.7_g3.4.25.8_g3.4.25.9', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_g3.4.25.1_g3.4.25.2_g3.4.25.3_g3.4.25.4_3.4.25.6_g3.4.25.7_g3.4.25.8_g3.4.25.9', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_g3.4.25.1', + ], + 'original_graph_item_path': '/home/christy.li/oceanus_cost_model_training_data/ncf/original_graph_item', + 'save_dir': os.path.join(expanduser('~'), 'oceanus_cost_model_training_data/ncf/predefined_checkpoints'), + 'save_prefix': 'ckpV1_ncf_large_adam_dense_orca_all', + # 'save_prefix': 'ckpV2_ncf_large_adam_dense_orca', + 'baseline': 0.15, + # 'baseline': 0.0, + 'scale': 0.5, + 'learning_rate': 0.01, + 'list_size': 2, + 'batch_size': 100, + 'ranking_loss_key': 'pairwise_logistic_loss', + 'model_version': 'v1', + # 'model_version': 'v2', + 'do_train': False, + 'do_test': True, + 'checkpoint': '/home/christy.li/oceanus_cost_model_training_data/ncf/predefined_checkpoints/ckpV1_ncf_large_adam_dense_orca_all_600_0.83249_0.84517', + }, + 'bert': { + 'model_batch_size': 32, + 'model_seq_len': 128, + 'data_dir': [ + '/home/christy.li/oceanus_cost_model_training_data/bert/bert_3l_orca_16', + # '/home/christy.li/oceanus_cost_model_training_data/bert/bert_6l_orca_15', + # '/home/christy.li/oceanus_cost_model_training_data/bert/bert_12l_orca_15', + # '/home/christy.li/oceanus_cost_model_training_data/bert/bert.12l_g4.4.50.1_g4.4.50.2', + # '/home/christy.li/oceanus_cost_model_training_data/bert/bert.6l_g4.4.50.1_g4.4.50.2', + ], + 'original_graph_item_path': '/home/hao.zhang/oceanus_cost_model_training_data/bert/bert_original_graph_item_3l', + 'save_dir': '/home/christy.li/oceanus_cost_model_training_data/bert/predefined_checkpoints', + 'save_prefix': 'ckpV1_bert_orca', + 'baseline': 0.04, + 'scale': 0.5, + 'learning_rate': 0.01, + 'list_size': 2, + 'batch_size': 100, + 'ranking_loss_key': 'pairwise_logistic_loss', + 'do_train': False, + 'do_test': True, + 'model_version': 'v1', + # 'model_version': 'v2', + # 'checkpoint': '/home/christy.li/oceanus_cost_model_training_data/ncf/predefined_checkpoints/checkpoint_500', + # 'checkpoint': '/home/christy.li/oceanus_cost_model_training_data/ncf/predefined_checkpoints/ckpV1_ncf_large_adam_dense_orca_16_300_0.90684_0.91947', + # 'checkpoint': '/home/christy.li/oceanus_cost_model_training_data/ncf/predefined_checkpoints/ckpV1_ncf_large_adam_dense_orca_16_600_0.87000_0.71000', + # 'checkpoint': '/home/christy.li/oceanus_cost_model_training_data/ncf/predefined_checkpoints/ckpV1_ncf_large_adam_dense_all_200_0.80568_0.81116', + # 'checkpoint': '/home/christy.li/oceanus_cost_model_training_data/ncf/predefined_checkpoints/ckpV1_ncf_large_adam_dense_orca_200_0.81503_0.82009', + # 'checkpoint': 
'/home/christy.li/oceanus_cost_model_training_data/ncf/predefined_checkpoints/ckpV2_ncf_large_adam_dense_orca_16_600_0.89737_0.92842', + # 'checkpoint': '/home/christy.li/oceanus_cost_model_training_data/ncf/predefined_checkpoints/ckpV2_ncf_large_adam_dense_all_500_0.87666_0.85391', + 'checkpoint': '/home/christy.li/oceanus_cost_model_training_data/bert/predefined_checkpoints/ckpV1_bert_orca_400_0.93600_0.93889', + }, + 'resnet101': { + 'model_batch_size': 32, + 'model_seq_len': 1, + 'baseline': 0.5, + 'scale': 0.5, + 'data_dir': '', + 'learning_rate': 0.01, + 'list_size': 2, + 'batch_size': 100, + 'ranking_loss_key': 'pairwise_logistic_loss', + }, +} + +def main(_): + np.random.seed(110) + + # Hyperparameters + # model_to_simulate = 'bert' + model_to_simulate = 'ncf_large_adam_dense' + data_dir = model_params[model_to_simulate]['data_dir'] + original_graph_item_path = model_params[model_to_simulate]['original_graph_item_path'] + batch_size = model_params[model_to_simulate]['batch_size'] + ranking_loss_key = model_params[model_to_simulate]['ranking_loss_key'] + learning_rate = model_params[model_to_simulate]['learning_rate'] + list_size = model_params[model_to_simulate]['list_size'] + baseline = model_params[model_to_simulate]['baseline'] + scale = model_params[model_to_simulate]['scale'] + save_dir = model_params[model_to_simulate]['save_dir'] + save_prefix = model_params[model_to_simulate]['save_prefix'] + do_train = model_params[model_to_simulate]['do_train'] + do_test = model_params[model_to_simulate]['do_test'] + checkpoint = model_params[model_to_simulate]['checkpoint'] + model_version = model_params[model_to_simulate]['model_version'] + + # Create simulator + simulator = PredefinedSimulator(original_graph_item_path, + batch_size=model_params[model_to_simulate]['model_batch_size'], + seq_len=model_params[model_to_simulate]['model_seq_len']) + + # Create features + strategy_resource_files, Y = utils.laod_from_folders(data_dir) + print("Createing features...") + X = [] + with context.graph_mode(): + for strategy_file, resource_file in tqdm.tqdm(strategy_resource_files): + x = simulator.create_features(Strategy.deserialize(strategy_file), ResourceSpec(resource_file)) + X.append(x) + X = np.array(X, dtype=np.float) + print("Finished createing features.") + + # Create model + hidden_dim = 12 + W = tf.Variable(tf.random.uniform([hidden_dim, 1]), name='W', dtype=tf.float32) + b = tf.Variable(0.0, name='b', dtype=tf.float32) + if model_version == 'v2': + W0 = tf.Variable(tf.random.uniform([hidden_dim, hidden_dim]), name='W0', dtype=tf.float32) + b0 = tf.Variable(0.0, name='b0', dtype=tf.float32) + loss_fn = tfr.losses.make_loss_fn(RankingLossKeys[ranking_loss_key]) + major_version, _, _ = tf.version.VERSION.split('.') + if major_version == '1': + optimizer = tf.train.GradientDescentOptimizer(learning_rate) + else: + optimizer = tf.optimizers.Adam(learning_rate) + + def forward(xs): + rs = [] + for x in xs: + if model_version == 'v2': + x = tf.nn.elu(tf.matmul(x, W0) + b0) + r = tf.matmul(x, W) + b + rs.append(r) + r = tf.concat(rs, axis=1, name='logits') + return r + + @tf.function + def train_steps(inputs_iterator, total_steps): + + def train_step(input): + with tf.GradientTape() as tape: + logits = forward(input[:-1]) + loss = loss_fn(labels=input[-1], logits=logits, features={}) + vs = [W0, b0, W, b] if model_version == 'v2' else [W, b] + gradients = tape.gradient(loss, vs) + train_op = optimizer.apply_gradients(zip(gradients, vs)) + pred = tf.squeeze(tf.argmax(logits, axis=1)) + labels = 
tf.squeeze(tf.argmax(input[-1], axis=1)) + acc = tf.equal(pred, labels) + return loss, acc + + losses = [] + accs = [] + for step in range(total_steps): + l, a = train_step(inputs_iterator.get_next()) + losses.append(l) + accs.append(a) + return losses, accs + + @tf.function + def eval_step(input): + logits = forward(input[:-1]) + preds = tf.squeeze(tf.argmax(logits, axis=1)) + labels = tf.squeeze(tf.argmax(input[-1], axis=1)) + acc = tf.equal(preds, labels) + return acc, labels, preds, input[-1], logits + + def eval_steps(iterator, total_test_steps): + test_acc = [] + test_preds = [] + test_labels = [] + test_logits = [] + test_scores = [] + for step in range(total_test_steps): + acc, labels, preds, scores, logits = eval_step(iterator.get_next()) + test_acc.append(acc) + test_labels.append(labels) + test_preds.append(preds) + test_scores.append(scores) + test_logits.append(logits) + test_acc = tf.concat(test_acc, axis=0) + test_acc = tf.cast(test_acc, tf.float32) + avg_test_acc = tf.math.reduce_mean(test_acc) + test_labels = tf.concat(test_labels, axis=0) + test_preds = tf.concat(test_preds, axis=0) + test_scores = tf.concat(test_scores, axis=0) + test_logits = tf.concat(test_logits, axis=0) + return avg_test_acc, test_acc, test_labels, test_preds, test_scores, test_logits + + if do_train: + train_set, valid_set, test_set = utils.split_dataset([X, Y], shuffle=True, train_ratio=0.7, test_ratio=0.15) + X_train, Y_train = train_set + X_valid, Y_valid = valid_set + X_test, Y_test = test_set + inputs_iterator = TFRIterator(X=X_train, Y=Y_train, list_size=list_size, batch_size=batch_size, split='train', + baseline=baseline, scale=scale) + valid_iterator = TFRIterator(X=X_valid, Y=Y_valid, list_size=list_size, batch_size=batch_size, split='valid') + test_iterator = TFRIterator(X=X_test, Y=Y_test, list_size=list_size, batch_size=batch_size, split='test') + total_train_steps = max(1, min(inputs_iterator.get_num_examples() // batch_size, 100)) + total_valid_steps = max(1, valid_iterator.get_num_examples() // batch_size) + total_test_steps = max(1, test_iterator.get_num_examples() // batch_size) + print("Total train steps per epoch: {}".format(total_train_steps)) + print("Total valid steps per epoch: {}".format(total_valid_steps)) + print("Total test steps: {}".format(total_test_steps)) + EPOCHS = 2000 + eval_every_epochs = 100 + save_every_epochs = 100 + + print("\nTrain model...") + losses = [] + for epoch in range(EPOCHS): + loss, acc = train_steps(inputs_iterator, total_train_steps) + losses.extend(loss) + avgloss = sum(losses) / float(len(losses)) + print('Step: {}, avgloss: {:.5f}'.format(epoch, avgloss)) + if (epoch+1) % eval_every_epochs == 0: + print("\nEvaluate on valid set...") + avg_valid_acc, *_= eval_steps(valid_iterator, total_valid_steps) + print('avg_valid_acc: {}'.format(avg_valid_acc.numpy())) + print("Evaluate on test set...") + avg_test_acc, *_= eval_steps(test_iterator, total_test_steps) + print('avg_test_acc: {}\n'.format(avg_test_acc.numpy())) + print('W', W.numpy()) + print('b', b.numpy()) + + if (epoch+1) % save_every_epochs == 0: + if not os.path.exists(save_dir): + os.mkdir(save_dir) + checkpoint = '{}/{}_{}_{:.5f}_{:.5f}'.format(save_dir, save_prefix, epoch+1, + avg_valid_acc, avg_test_acc) + print("Save to {}".format(checkpoint)) + simulator.save_checkpoint([W0, b0, W, b] if model_version == 'v2' else [W, b], checkpoint) + + elif do_test: + print("Load from {}".format(checkpoint)) + weights = simulator.load_checkpoint(checkpoint) + if model_version == 'v2' and 
len(weights) == 4: + W0, b0, W, b = weights + elif model_version == 'v1' and len(weights) == 2: + W, b = weights + else: + raise ValueError + + test_iterator = TFRIterator(X=X, Y=Y, list_size=list_size, batch_size=batch_size, split='test') + total_test_steps = max(1, test_iterator.get_num_examples() // batch_size) + print("\nEvaluate on test set...") + avg_test_acc, test_acc, test_labels, test_preds, test_scores, test_logits = eval_steps(test_iterator, total_test_steps) + for i, labels, preds, scores, logits in zip(range(100), test_labels, test_preds, test_scores, test_logits): + print('labels', labels.numpy(), 'preds', preds.numpy(), 'scores', scores.numpy(), 'logits', logits.numpy()) + print('avg_test_acc', avg_test_acc.numpy()) + + test_iterator_single = TFRIterator(X=X, Y=Y, list_size=1, batch_size=len(X), split='test') + print("\nEvaluate each example in test set...") + avg_test_acc, test_acc, test_labels, test_preds, test_scores, test_logits = eval_steps(test_iterator_single, 1) + for i, labels, preds, scores, logits in zip(range(100), test_labels, test_preds, test_scores, test_logits): + print('labels', labels.numpy(), 'preds', preds.numpy(), 'scores', scores.numpy(), 'logits', logits.numpy()) + test_logits = sorted(list(test_logits.numpy())) + top_10_persent = test_logits[:int(len(test_logits)*0.1)] + print('top_10_persent', top_10_persent) + print('top_10_persent threshold', top_10_persent[-1]) + print('test_logits', test_logits) + + +main(sys.argv) diff --git a/autodist/simulator/utils.py b/autodist/simulator/utils.py new file mode 100644 index 0000000..a668e75 --- /dev/null +++ b/autodist/simulator/utils.py @@ -0,0 +1,342 @@ +import glob +import json +import os +import numpy as np + +import tensorflow_ranking as tfr +import tensorflow as tf +from tensorflow.python.framework import device_spec + +from arion.utils import logging +from arion.resource_spec import ResourceSpec +from arion.strategy.base import Strategy +from arion.const import DEFAULT_RUNTIME_SERIALIZATION_DIR, DEFAULT_SERIALIZATION_DIR, \ + DEFAULT_STRATEGY_JSON_SERIALIZATION_DIR, DEFAULT_RESOURCE_SERIALIZATION_DIR +from arion.kernel.device.resolver import DeviceResolver + + +RankingLossKeys = { + # Names for the ranking based loss functions. 
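+    # Each entry maps a readable name to the corresponding tensorflow_ranking RankingLossKey, so a
+    # config can select a loss via e.g. tfr.losses.make_loss_fn(RankingLossKeys['pairwise_logistic_loss']).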
+ 'pairwise_hinge_loss': tfr.losses.RankingLossKey.PAIRWISE_HINGE_LOSS, + 'pairwise_logistic_loss': tfr.losses.RankingLossKey.PAIRWISE_LOGISTIC_LOSS, + 'pairwise_soft_zero_one_loss': tfr.losses.RankingLossKey.PAIRWISE_SOFT_ZERO_ONE_LOSS, + 'softmax_loss': tfr.losses.RankingLossKey.SOFTMAX_LOSS, + 'sigmoid_cross_entropy_loss': tfr.losses.RankingLossKey.SIGMOID_CROSS_ENTROPY_LOSS, + 'mean_squared_loss': tfr.losses.RankingLossKey.MEAN_SQUARED_LOSS, + 'list_mle_loss': tfr.losses.RankingLossKey.LIST_MLE_LOSS, + 'approx_ndcg_loss': tfr.losses.RankingLossKey.APPROX_NDCG_LOSS, +} + +######### +# Online +######### + +def laod_from_one_folder(data_folder): + strategy_folder = '{}/strategies'.format(data_folder) + strategy_files = glob.glob(os.path.join(strategy_folder, '*')) + X = [] + Y = [] + for strategy_file in strategy_files: + # Target + runtime_file = '/'.join(strategy_file.split('/')[:-2]) + '/runtimes/' + strategy_file.split('/')[-1] + if not os.path.exists(runtime_file) or not os.path.isfile(runtime_file): + print('runtime_file does not exist: {}.'.format(runtime_file)) + continue + runtime = json.load(open(runtime_file, 'r')) + y = runtime['average'] + resource_file = strategy_file.replace('strategies', 'resource_specs') + if not os.path.exists(resource_file): + resource_file += '.yml' + if not os.path.exists(resource_file): + resource_file = os.path.join(data_folder, 'resource_spec_files/resource_spec.yml') + if not os.path.exists(resource_file): + continue + Y.append(y) + X.append([strategy_file, resource_file]) + print('Data points:{}, data_folder: {}'.format(len(X), data_folder)) + return X, Y + + +def laod_from_folders(data_dir): + if isinstance(data_dir, str): + data_folders = glob.glob("{}/*".format(data_dir), recursive=True) + elif isinstance(data_dir, list): + data_folders = data_dir + else: + raise ValueError + print('data_folders', data_folders) + X = [] + Y = [] + for data_folder in data_folders: + x, y = laod_from_one_folder(data_folder) + if len(x) == 0: + print('strategy_folder does not have files: {}, skipping it.'.format(data_folder)) + continue + Y.extend(y) + X.extend(x) + # Y = np.concatenate(Y, axis=0) + if len(Y) > 0: + Y = np.array(Y, dtype=np.float) + miny = np.min(Y) + print('min of all Y values: {}'.format(miny)) + else: + print("no files loaded.") + return X, Y + + +########## +# Offline +########## + +def laod_from_one_folder_offline(simulation_folder): + simulation_files = glob.glob(os.path.join(simulation_folder, '*'), recursive=True) + X = [] + Y = [] + for simulation_file in simulation_files: + # Features + try: + simulation = json.load(open(simulation_file, 'r')) + except: + print("Can not read simulation_file: ", simulation_file) + continue + x = simulation_file + # Target + runtime_file = '/'.join(simulation_file.split('/')[:-2]) + '/runtimes/' + simulation_file.split('/')[-1] + if not os.path.exists(runtime_file) or not os.path.isfile(runtime_file): + print('runtime_file does not exist: {}.'.format(runtime_file)) + continue + runtime = json.load(open(runtime_file, 'r')) + y = runtime['average'] + Y.append(y) + X.append(x) + Y = np.array(Y, dtype=np.float) + print('Data points:{}, simulation_folder: {}'.format(len(X), simulation_folder)) + return X, Y + + +def laod_from_folders_offline(data_dir): + simulation_folders = glob.glob("{}/*/simulations".format(data_dir), recursive=True) + print('simulation_folders', simulation_folders) + X = [] + Y = [] + for simulation_folder in simulation_folders: + x, y = laod_from_one_folder_offline(simulation_folder) + if 
len(x) == 0: + print('simulation folder does not have files: {}, skipping it.'.format(simulation_folder)) + continue + Y.append(y) + X.append(x) + Y = np.concatenate(Y, axis=0) + miny = np.min(Y) + print('min of Y values: {}'.format(miny)) + return X, Y + + +def split_dataset(inputs, shuffle=True, train_ratio=0.7, test_ratio=0.15): + assert isinstance(inputs, list) + nb_elements = len(inputs) + nb_samples = len(inputs[0]) + n_train = int(nb_samples * train_ratio) + n_test = int(nb_samples * test_ratio) + shuffled = [] + train = [] + valid = [] + test = [] + + if shuffle: + random_indices = np.random.permutation(list(range(nb_samples))) + for i in range(nb_elements): + shuffled_i = [inputs[i][j] for j in random_indices] + train.append(shuffled_i[:n_train]) + valid.append(shuffled_i[n_train:-n_test]) + test.append(shuffled_i[-n_test:]) + else: + for i in range(nb_elements): + train.append(inputs[i][:n_train]) + valid.append(inputs[i][n_train:-n_test]) + test.append(inputs[i][-n_test:]) + + return train, valid, test + +def read_trial_runs(): + runtime_files = glob.glob(os.path.join(DEFAULT_RUNTIME_SERIALIZATION_DIR, '*')) + strategy_files = glob.glob(os.path.join(DEFAULT_SERIALIZATION_DIR, '*')) + strategy_json_files = glob.glob(os.path.join(DEFAULT_STRATEGY_JSON_SERIALIZATION_DIR, '*')) + resource_files = glob.glob(os.path.join(DEFAULT_RESOURCE_SERIALIZATION_DIR, '*')) + logging.info(len(runtime_files), len(strategy_files), len(strategy_json_files), len(resource_files)) + + trialruns = {} + for runtime_file in runtime_files: + strategy_id = runtime_file.split('/')[-1] + strategy_file = os.path.join(DEFAULT_SERIALIZATION_DIR, strategy_id) + strategy_json_file = os.path.join(DEFAULT_STRATEGY_JSON_SERIALIZATION_DIR, strategy_id) + resource_file = os.path.join(DEFAULT_RESOURCE_SERIALIZATION_DIR, strategy_id) + if not os.path.exists(strategy_file): + logging.info("strategy_file not found, skip it: {}".format(strategy_file)) + continue + if not os.path.exists(strategy_json_file): + logging.info("strategy_json_file not found, skip it: {}".format(strategy_json_file)) + continue + if not os.path.exists(resource_file): + logging.info("resource_file not found, skip it: {}".format(resource_file)) + continue + + trialruns[strategy_id] = { + 'runtime': json.load(open(runtime_file, 'r')), + 'strategy': Strategy.deserialize(strategy_id), + 'strategy_json': json.load(open(strategy_json_file, 'r')), + 'resource_spec': ResourceSpec(resource_file=resource_file), + } + + logging.info("Total number of trials: {}".format(len(trialruns))) + return trialruns + + +DTYPE2BITS = { + tf.float16: 16, + "tf.float16": 16, + "": 16, + tf.float32: 32, + 'tf.float32': 32, + "": 32, + "": 32, + tf.float64: 64, + 'tf.float64': 64, + "": 64, + tf.bfloat16: 16, + 'tf.bfloat16': 16, + "": 16, + tf.complex64: 64, + 'tf.complex64': 64, + "": 64, + tf.complex128: 128, + 'tf.complex128': 128, + "": 128, + tf.int8: 8, + 'tf.int8': 8, + "": 8, + tf.uint8: 8, + 'tf.uint8': 8, + "": 8, + tf.uint16: 16, + 'tf.uint16': 16, + "": 16, + tf.uint32: 32, + 'tf.uint32': 32, + "": 32, + tf.uint64: 64, + 'tf.uint64': 64, + "": 64, + tf.int16: 16, + 'tf.int16': 16, + "": 16, + tf.int32: 32, + 'tf.int32': 32, + "": 32, + tf.int64: 64, + 'tf.int64': 64, + "": 64, + tf.bool: 1, + 'tf.bool': 1, + "": 1, + tf.string: 1, # todo: confirm + 'tf.string': 1, # todo: confirm + "": 1, # todo: confirm + tf.qint8: 8, + 'tf.qint8': 8, + "": 8, + tf.quint8: 8, + 'tf.quint8': 8, + "": 8, + tf.qint16: 16, + 'tf.qint16': 16, + "": 16, + tf.quint16: 16, + 'tf.quint16': 
16, + "": 16, + tf.qint32: 32, + 'tf.qint32': 32, + "": 32, + tf.resource: 0, # its tensor shape is either [] or [None] todo: confirm + 'tf.resource': 0, # its tensor shape is either [] or [None] todo: confirm + "": 0, # its tensor shape is either [] or [None] todo: confirm +} + +GIGABITS = np.float(1e+9) +INFINITY = 1e+9 +NUM_RUNS = 500 + + +def pad_list(l, max_len): + return l + [0.0] * (max_len - len(l)) + + +def get_dtype_bits(dtype): + return DTYPE2BITS[dtype] if dtype in DTYPE2BITS else DTYPE2BITS[str(dtype)] + + +def get_dense_var_bits(size, dtype): + return size * get_dtype_bits(dtype) + + +def get_sparse_var_bits(size): + # same size of values, indices, dense_shape + return size * (get_dtype_bits(tf.float32) + 2 * get_dtype_bits(tf.int64)) \ + + 2 * get_dtype_bits(tf.int64) + + +def _resolved_devices_on_diff_machine(device1, device2): + # e.g., '/job:worker/task:1/device:CPU:0', '/job:worker/task:1/GPU:0' + node1 = ':'.join(device1.split('/')[:-1]) + node2 = ':'.join(device2.split('/')[:-1]) + return node1 != node2 + + +def _resolve_device_address(device: str, device_resolver: DeviceResolver): + # change real ip address to /job:worker/task:0 + if not device: + return device + parts = device.split(':') + if parts and parts[0] in device_resolver._address_to_tasks: + resolved_device = device_resolver._address_to_tasks[parts[0]][0] + resolved = '/job:{}/task:{}/device:'.format(resolved_device['job'], resolved_device['task']) + resolved = resolved + ':'.join(parts[-2:]) + return resolved + else: + raise ValueError("cannot resolve device: {} using device_resolver: {}".format( + device, device_resolver._address_to_tasks)) + + +def _num_local_replica(host, replicas, cluster): + # host: e.g., '/job:worker/task:0/device:CPU:0' + replica_devices = {device_spec.DeviceSpecV2.from_string(r) for r in replicas} + host_device = device_spec.DeviceSpecV2.from_string(host) + num_local_replica = sum(1 for d in replica_devices + if cluster.get_address_from_task(d.job, d.task) == + cluster.get_address_from_task(host_device.job, host_device.task)) + return num_local_replica + + +def _max_num_local_replica(replicas, cluster): + replica_devices = {device_spec.DeviceSpecV2.from_string(r) for r in replicas} + replica_hosts = {cluster.get_address_from_task(d.job, d.task) for d in replica_devices} + max_num_local_replica = 0 + for host in replica_hosts: + num_local_replica = sum(1 for d in replica_devices + if cluster.get_address_from_task(d.job, d.task) == host) + max_num_local_replica = max(max_num_local_replica, num_local_replica) + return max_num_local_replica + + +def _strip_var_name(name): + # strip prefix + if not name: + return name + name = name.split('/') + if 'Replica' in name[0]: # remove prefix + name = name[1:] + if name and 'part' in name[-1]: # remove '/part_1' if using partitioned ps + name = name[:-1] + name = '/'.join(name) + name = name.split(':')[0] # remove ':0'. 
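+    # e.g. 'Replica-1/dense/kernel/part_0:0' -> 'dense/kernel'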
+ return name diff --git a/autodist/strategy/auto/ar_group_assigner.py b/autodist/strategy/auto/ar_group_assigner.py new file mode 100644 index 0000000..c2d59b6 --- /dev/null +++ b/autodist/strategy/auto/ar_group_assigner.py @@ -0,0 +1,57 @@ +from collections import OrderedDict + +import numpy as np + + +def chunk_group_assigner(ar_shards, chunk_size=1): + assignments = {} + for i, shard_name in enumerate(ar_shards): + assignments[shard_name] = i // chunk_size + assert(len(ar_shards)) == len(assignments) + return assignments + + +def christy_group_assigner(ar_shards, var_helpers, num_group): + """A probabilistic assigner that tries to put each ring with balanced message size""" + assignments = {} + + sorted_ar_shards = OrderedDict(sorted(ar_shards.items(), key=lambda x: var_helpers[x[0]].byte_size, reverse=True)) + cur_loads = [0.0 for i in range(num_group)] + for shard_name in sorted_ar_shards: + total_loads = sum(cur_loads) + balanced_loads = [total_loads / num_group for _ in range(num_group)] + space = np.array([balanced_load - cur_load for balanced_load, cur_load in zip(balanced_loads, cur_loads)]) + + e_x = np.exp(space-np.max(space)) + accept_prob = e_x / e_x.sum() + + des = np.random.choice(range(0, num_group), 1, p=accept_prob)[0] + assignments[shard_name] = des + cur_loads[des] += var_helpers[shard_name].byte_size + assert(len(ar_shards)) == len(assignments) + # entropy = calcuate_entropy(cur_loads) + # best_entropy = calcuate_entropy(balanced_loads) + # print('entropy {} vs. max entropy {}'.format(entropy, best_entropy)) + return assignments + +def ordered_balanced_group_assigner(ar_shards, var_helpers, num_group): + """Greedy assigner that create balanced loads following a given var order.""" + assignments = {} + + # get total size + total_loads = 0.0 + for shard_name in ar_shards: + total_loads += var_helpers[shard_name].byte_size + + avg_load = total_loads / num_group + + cur_bucket = 0 + loads = [0 for _ in range(num_group)] + for shard_name in ar_shards: + if loads[cur_bucket] >= avg_load: + cur_bucket += 1 + if loads[cur_bucket] < avg_load: + assignments[shard_name] = cur_bucket + loads[cur_bucket] += var_helpers[shard_name].byte_size + assert(len(ar_shards) == len(assignments)) + return assignments \ No newline at end of file diff --git a/autodist/strategy/auto/auto_strategy.py b/autodist/strategy/auto/auto_strategy.py new file mode 100644 index 0000000..e69de29 diff --git a/autodist/strategy/auto/ps_load_balancer.py b/autodist/strategy/auto/ps_load_balancer.py new file mode 100644 index 0000000..dc770d8 --- /dev/null +++ b/autodist/strategy/auto/ps_load_balancer.py @@ -0,0 +1,67 @@ +from collections import OrderedDict + +import numpy as np + + +def calcuate_entropy(loads): + distribution = loads / np.sum(loads) + distribution = distribution + 1e-4 + entropy = - np.sum(distribution * np.log2(distribution)) + return entropy + +def greedy_load_balancer(ps_shards, resource_spec, var_helpers, sort_by_size=False): + # no randomness + assignments = {} + reduction_device_names = [k for k, _ in resource_spec.cpu_devices] + loads = {ps: 0.0 for ps in reduction_device_names} + + sorted_ps_shards = ps_shards + if sort_by_size: + sorted_ps_shards = OrderedDict(sorted(ps_shards.items(), + key=lambda x: var_helpers[x[0]].byte_size, reverse=True)) + + for shard_name in sorted_ps_shards: + sorted_ps = sorted(loads, key=loads.get) + destination = sorted_ps[0] + assignments[shard_name] = destination + loads[destination] += var_helpers[shard_name].byte_size + return assignments + +def 
christy_load_balancer(ps_shards, resource_spec, var_helpers, sort_by_size=False): + # Sample destination based on a distributed calculated based on loads and available bandwidth + reduction_device_names = [k for k, _ in resource_spec.cpu_devices] + loads = {ps: 0.0 for ps in reduction_device_names} + assignments = {} + + loads = sorted(list(loads.items()), key=lambda x: x[0]) + ps = [load[0] for load in loads] + bandwidth = [resource_spec.network_bandwidth[p.split(':')[0]] for p in ps] + total_bandwidth = sum(bandwidth) + cur_loads = [float(load[1]) for load in loads] + + sorted_ps_shards = ps_shards + if sort_by_size: + sorted_ps_shards = OrderedDict(sorted(ps_shards.items(), + key=lambda x: var_helpers[x[0]].byte_size, reverse=True)) + + for shard_name in sorted_ps_shards: + total_load = sum(cur_loads) # + var_load + balanced_loads = [total_load * b / total_bandwidth for b in bandwidth] + space = np.array([balanced_load - cur_load for balanced_load, cur_load in zip(balanced_loads, cur_loads)]) + + # softmax + e_x = np.exp(space - np.max(space)) + accept_prob = e_x / e_x.sum() + + # sample according to current load + des = np.random.choice(ps, 1, p=accept_prob)[0] + assignments[shard_name] = des + + cur_loads[ps.index(des)] += var_helpers[shard_name].byte_size + assert (len(ps_shards) == len(assignments)) + + # entropy = calcuate_entropy(cur_loads) + # best_entropy = calcuate_entropy(balanced_loads) + # print('entropy {} vs. max entropy {}'.format(entropy, best_entropy)) + return assignments + diff --git a/autodist/strategy/auto/random_strategy.py b/autodist/strategy/auto/random_strategy.py new file mode 100644 index 0000000..24150dd --- /dev/null +++ b/autodist/strategy/auto/random_strategy.py @@ -0,0 +1,443 @@ +# Copyright 2020 Petuum. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""AllReduce StrategyBuilder.""" +from collections import OrderedDict + +from enum import Enum +from tensorflow.python.framework import ops + +from arion.kernel.common.utils import get_op_name, get_consumers +from arion.kernel.partitioner import PartitionerConfig +from arion.proto import strategy_pb2, synchronizers_pb2 +from arion.search import sample_util +from arion.strategy.base import Strategy, StrategyBuilder +from arion.strategy.base import byte_size_load_fn +from arion.strategy.component.ps_load_balancer import greedy_load_balancer, christy_load_balancer +from arion.strategy.component.ar_group_assigner import chunk_group_assigner, \ + christy_group_assigner, ordered_balanced_group_assigner + +class VarType(Enum): + SPARSE = 0 + DENSE = 1 + + +class VariableHelper: + def __init__(self, var, graph_item): + self.var = var + self.graph_item = graph_item + self._var_op_name = get_op_name(var.name) + self._grad = graph_item.var_op_name_to_grad_info[self._var_op_name][0] + + @property + def var_type(self): + return VarType.DENSE if isinstance(self._grad, ops.Tensor) else VarType.SPARSE + + @property + def is_sparse(self): + return True if self.var_type == VarType.SPARSE else False + + @property + def is_embedding(self): + for op in get_consumers(self.var.op): + if op.type == "ResourceGather": + return True + # op = new_graph_item.graph.get_operation_by_name( + # ops.prepend_name_scope(op.name, ARION_TO_DELETE_SCOPE) + # ) + return False + + @property + def shape(self): + if self.var.initial_value.shape.ndims: + return self.var.initial_value.shape.as_list() + else: + return None + + @property + def partitionable_axis(self): + valid_axis = [] + if not self.shape: + return valid_axis + # Sparse variable can only be partition along the 0th axis + # only sample axis for dense variables + if self.is_sparse or self.is_embedding: + valid_axis = [0] + return valid_axis + for idx, dim in enumerate(self.shape): + if dim > 1: + valid_axis.append(idx) + return valid_axis + + @property + def byte_size(self): + return float(byte_size_load_fn(self.var)) + + @property + def dtype(self): + return self.var.dtype + + +class PartHelper: + def __init__(self, part_idx, var, pc): + self.var = var + self.part_idx = part_idx + self.pc = pc + + @property + def shape(self): + shape = self.var.initial_value.shape.as_list() + dim_size = shape[self.pc.axis] // self.pc.num_shards + extras = shape[self.pc.axis] % self.pc.num_shards + if self.part_idx < extras: + dim_size += 1 + shape[self.pc.axis] = dim_size + return shape + + @property + def var_shape(self): + return self.var.initial_value.shape.as_list() + + @property + def byte_size(self): + return float(byte_size_load_fn(self.var)) \ + * float(self.shape[self.pc.axis]) / float(self.var_shape[self.pc.axis]) + + +class RandomStrategy(StrategyBuilder): + def __init__(self, space, heuristics): + """ + + Args: + self: + enable_ps_load_balancer: + enable_chunk: + + Returns: + + """ + self.space = space + self.heuristics = heuristics + self.helpers = {} + + def reset(self): + self.helpers = {} + + def build(self, graph_item, resource_spec): + expr = Strategy() + + # number of graph replica is equal to number of GPU devices + expr.graph_config.replicas.extend([k for k, v in resource_spec.gpu_devices]) + variables = graph_item.trainable_var_op_to_var.values() + + # A fully MCMC process to generate node configs + node_config = [] + for var in variables: + var_helper = VariableHelper(var, graph_item) + self.helpers[var_helper.var.name] = var_helper + + node = 
strategy_pb2.Strategy.Node() + node.var_name = var_helper.var.name + + # Step 1: determine whether or not to partition + # TODO(Hao): other factor not considered -- number of reduction_device_names + maybe_partition = sample_if_partition(var_helper, resource_spec, self.space, self.heuristics) + + # Step 2.1: if not partition, sample a synchronizer type for it + if not maybe_partition: # no partition + sample_var_synchronizer(node, var_helper, resource_spec, self.space) + else: # Step 2.2: if partition + # Step 2.2.1: sample a partitioner config + pc = sample_partition_config(var_helper, resource_spec, self.space, self.heuristics) + node.partitioner = pc.partition_str + + # step 2.2.2: sample a synchornizer type for each partition + parts = [] + for i in range(pc.num_shards): + part = strategy_pb2.Strategy.Node() + part.var_name = '{}/part_{}:0'.format(get_op_name(var.name), i) + self.helpers[part.var_name] = PartHelper(i, var, pc) + parts.append(part) + sample_parts_synchronizers(parts, var_helper, resource_spec, self.space, self.heuristics) + node.part_config.extend(parts) + node_config.append(node) + + sample_group_and_reduction_destinations(node_config, resource_spec, self.helpers, self.heuristics) + # Mark each variable to be synchronized with a Parameter Server + expr.node_config.extend(node_config) + return expr + + +def sample_if_partition(var_helper, resource_spec, space, heuristics): + reduction_device_names = [k for k, _ in resource_spec.cpu_devices] + if len(space['maybe_partition']) == 1: + return space['maybe_partition'] + if heuristics['enable_single_node_no_partition'] and len(reduction_device_names) <= 1: + return False + + # intersection of variable's partitonable axis and global constraints + if var_helper.partitionable_axis: + if space['partitionable_axis']: + a = set(var_helper.partitionable_axis) & set(space['partitionable_axis']) + if len(a) < 1: + return False + else: + return False + + # lower bound for abandoning partitioning + lb = heuristics['maybe_partition_bounds'][0] + ub = heuristics['maybe_partition_bounds'][1] + if var_helper.byte_size <= lb: + return False + if var_helper.byte_size >= ub: + return True + assert (len(space['maybe_partition']) == 2) + + if heuristics['maybe_partition_by_size']: + # By variable size -- a large variable has a higher chance to be partitioned + # TODO (Hao): MAX_INT32 is too large, reconsider later... 
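RandomStrategy is driven entirely by the space and heuristics dictionaries. The keys below are the ones the sampling functions in this file dereference; the concrete values are only illustrative and are not AutoDist defaults (those would be supplied by the caller, e.g. an auto-strategy builder):

# Keys taken from the sampling code in this file; values are hypothetical examples.
space = {
    'synchronizer_types': ['PS', 'AR'],              # anything other than 'PS' maps to AllReduce
    'maybe_partition': [True, False],                # a single-element list pins the partition decision
    'compressor': ['NoneCompressor', 'HorovodCompressor'],
    'local_replication': [False, True],
    'partitionable_axis': [],                        # empty list = no global axis constraint
}

heuristics = {
    'ps_load_balancer': 'christy',                   # None | 'greedy' | 'christy' | 'sorted_christy' | 'sorted_greedy'
    'merge_scheme': 'by_chunk',                      # None | 'random' | 'by_chunk' | 'christy' | 'ordered_balanced'
    'chunk_size': 64,                                # only read when merge_scheme == 'by_chunk'
    'num_group_bounds': [1, 40],
    'enable_single_node_no_partition': True,
    'same_synchronizer_for_parts': False,
    'maybe_partition_by_size': True,
    'maybe_partition_bounds': [2 ** 10, 2 ** 28],    # bytes: never partition below lb, always above ub
    'num_partition_bounds': [2, 'num_nodes'],
}

# Assuming the module is importable from its location in this patch:
# from autodist.strategy.auto.random_strategy import RandomStrategy
# builder = RandomStrategy(space, heuristics)
# strategy = builder.build(graph_item, resource_spec)  # GraphItem and ResourceSpec supplied by AutoDist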
+ chance = float(var_helper.byte_size - lb) / float(ub - lb) + return sample_util.binary_sample(boundary=chance) + else: + return sample_util.uniform_sample_by_choices(space['maybe_partition']) + + +def sample_var_synchronizer(node, var_helper, resource_spec, space): + # sample a single synchornizer for an unpartitioned variable, + # will eave merge_group of reduction_destination as empty + + # We ALWAYS use PS for sparse variables + synchronizer_type = 'PS' if var_helper.var_type == VarType.SPARSE \ + else sample_util.uniform_sample_by_choices(space['synchronizer_types']) + if synchronizer_type == 'PS': + node.PSSynchronizer.sync = True # we don't consider async at this moment + node.PSSynchronizer.staleness = 0 + node.PSSynchronizer.local_replication = sample_if_local_replication(space['local_replication'], + resource_spec) + else: + # no other option for spec + node.AllReduceSynchronizer.spec = synchronizers_pb2.AllReduceSynchronizer.Spec.Value('AUTO') + node.AllReduceSynchronizer.compressor = \ + synchronizers_pb2.AllReduceSynchronizer.Compressor.Value( + sample_ar_compressor(space['compressor'])) + + +def sample_parts_synchronizers(parts, var_helper, resource_spec, space, heuristics): + # sample synchornizer for a group of variable partitions + + if var_helper.var_type == VarType.SPARSE: + synchronizer_types = ['PS'] * len(parts) + else: + if heuristics['same_synchronizer_for_parts']: + type = sample_util.uniform_sample_by_choices(space['synchronizer_types']) + synchronizer_types = [type] * len(parts) + else: + synchronizer_types = [sample_util.uniform_sample_by_choices(space['synchronizer_types']) + for part in parts] + for i, part in enumerate(parts): + if synchronizer_types[i] == 'PS': + part.PSSynchronizer.sync = True # we don't consider async at this moment + part.PSSynchronizer.staleness = 0 + part.PSSynchronizer.local_replication = sample_if_local_replication(space['local_replication'], + resource_spec) + else: + # no other option for spec + part.AllReduceSynchronizer.spec = synchronizers_pb2.AllReduceSynchronizer.Spec.Value('AUTO') + part.AllReduceSynchronizer.compressor = \ + synchronizers_pb2.AllReduceSynchronizer.Compressor.Value( + sample_ar_compressor(space['compressor'])) + + +def sample_partition_config(var_helper, resource_spec, space, heuristics): + # Since Arion only support parttion along one axis, + # we first sample a partition axis, then sammple #partition along that axis, we obtain the partition config. + assert len(var_helper.partitionable_axis) > 0, 'No partition axis available' + # sample partition axis + # TODO(Hao): some heursitics here available? 
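When maybe_partition_by_size is set, the decision above is a Bernoulli draw whose success probability grows linearly with the variable's byte size between the two bounds; once a variable is partitioned, each shard's shape follows the PartHelper arithmetic, with the remainder spread over the first shards. A short self-contained sketch with hypothetical numbers:

import numpy as np

# Size-interpolated partition probability (mirrors the `chance` computation above).
lb, ub = 2 ** 10, 2 ** 28                     # hypothetical maybe_partition_bounds, in bytes
byte_size = 64 * 2 ** 20                      # a 64 MiB variable
chance = float(byte_size - lb) / float(ub - lb)        # roughly 0.25 for these numbers
partition = np.random.uniform() < chance               # same rule as sample_util.binary_sample(chance)
print('partition probability {:.3f}, sampled: {}'.format(chance, partition))

# Shard shapes produced by partitioning shape [10, 300] into 3 shards along axis 0
# (same arithmetic as PartHelper.shape: the extras go to the first shards).
shape, axis, num_shards = [10, 300], 0, 3
dim, extras = shape[axis] // num_shards, shape[axis] % num_shards
shard_shapes = [[dim + (1 if i < extras else 0) if d == axis else s
                 for d, s in enumerate(shape)]
                for i in range(num_shards)]
print(shard_shapes)                                    # [[4, 300], [3, 300], [3, 300]]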
+ valid_axis = var_helper.partitionable_axis + if space['partitionable_axis']: + valid_axis = list(set(valid_axis) & set(space['partitionable_axis'])) + partition_axis = sample_util.uniform_sample_by_choices(valid_axis) + + # sample how many partition to go + num_nodes = resource_spec.num_cpus + dim_size = var_helper.shape[partition_axis] + if heuristics['num_partition_bounds'][1] == 'num_nodes': + max_shards = min(dim_size, num_nodes) + elif isinstance(heuristics['num_partition_bounds'][1], int): + max_shards = min(dim_size, heuristics['num_partition_bounds'][1]) + else: + raise ValueError('unseen num_partition_bounds config') + + min_shards = 2 + if isinstance(heuristics['num_partition_bounds'][0], int): + min_shards = max(min_shards, heuristics['num_partition_bounds'][0]) + elif heuristics['num_partition_bounds'][0] == 'num_nodes': + min_shards = max(min_shards, heuristics['num_partition_bounds'][0]) + else: + raise ValueError('unseen num_partition_bounds config') + + # sample from [min_shards, max_shards] + num_shards = sample_util.uniform_sample_by_choices(range(min_shards, max_shards + 1)) + + # construct a PartitionerConfig (pc) + partition_list = [1] * len(var_helper.shape) + partition_list[partition_axis] = num_shards + pc = PartitionerConfig(partition_list=partition_list) + return pc + + +def sample_if_local_replication(local_replication_space, resource_spec): + # Local replication is a PS-specific semantic; it represents whether to use hierarchical PS + if resource_spec.num_gpus <= resource_spec.num_cpus: + # meaning every machine has at most 1 GPU + return False + return sample_util.uniform_sample_by_choices(local_replication_space) + + +def sample_ar_compressor(compressor_space): + # [NoneCompressor, HorovodCompressor, HorovodCompressorEF, PowerSGDCompressor] + # [ HorovodCompressorEF, PowerSGDCompressor] will change gradient value + # so only two choices here + # TODO(Hao): try to use all four options + return sample_util.uniform_sample_by_choices(compressor_space) + + +def sample_group_and_reduction_destinations(node_config, resource_spec, helpers, heuristics): + ps_shards = OrderedDict() + ar_shards = OrderedDict() + idx = 0 + for node in node_config: + if node.partitioner: + for part in node.part_config: + synchronizer = getattr(part, part.WhichOneof('synchronizer')) + if hasattr(synchronizer, 'compressor'): + ar_shards[part.var_name] = (idx,) + else: + ps_shards[part.var_name] = (idx,) + idx += 1 + else: + synchronizer = getattr(node, node.WhichOneof('synchronizer')) + if hasattr(synchronizer, 'compressor'): + ar_shards[node.var_name] = (idx,) + else: + ps_shards[node.var_name] = (idx,) + idx += 1 + + if len(ps_shards) > 0: + sample_ps_reduction_destinations(node_config, ps_shards, resource_spec, helpers, heuristics) + + # step 4: assign ar merge groups globally + if len(ar_shards) > 0: + sample_ar_groups(node_config, ar_shards, helpers, heuristics) + + +def sample_ps_reduction_destinations(node_config, ps_shards, resource_spec, helpers, heuristics): + load_balancer = heuristics['ps_load_balancer'] + reduction_device_names = [k for k, _ in resource_spec.cpu_devices] + if not load_balancer: + destinations = {} + for shard_name in ps_shards: + destinations[shard_name] = sample_util.uniform_sample_by_choices(reduction_device_names) + elif load_balancer == 'greedy': + destinations = greedy_load_balancer(ps_shards, resource_spec, helpers) + elif load_balancer == 'christy': + # copy Christy's partitionedPS + destinations = christy_load_balancer(ps_shards, resource_spec, 
helpers) + elif load_balancer == 'sorted_christy': + destinations = christy_load_balancer(ps_shards, resource_spec, helpers, sort_by_size=True) + elif load_balancer == 'sorted_greedy': + destinations = greedy_load_balancer(ps_shards, resource_spec, helpers, sort_by_size=True) + else: + raise ValueError('Cannot recognize load balancer') + + for shard_name, (idx, ) in ps_shards.items(): + ps_shards[shard_name] = (idx, destinations[shard_name]) + + assign_ps_reduction_destinations(node_config, ps_shards) + + +def assign_ps_reduction_destinations(node_config, ps_shards): + for node in node_config: + if node.partitioner: + for part in node.part_config: + synchronizer = getattr(part, part.WhichOneof('synchronizer')) + if hasattr(synchronizer, 'reduction_destination'): + synchronizer.reduction_destination = ps_shards[part.var_name][1] + else: + synchronizer = getattr(node, node.WhichOneof('synchronizer')) + if hasattr(synchronizer, 'reduction_destination'): + synchronizer.reduction_destination = ps_shards[node.var_name][1] + + +def sample_ar_groups(node_config, ar_shards, helpers, heuristics): + merge_scheme = heuristics['merge_scheme'] + if merge_scheme == 'by_chunk': + if 'chunk_size' in heuristics and heuristics['chunk_size'] > 0: + chunk_size_or_num_group = heuristics['chunk_size'] + else: + chunk_size_or_num_group = sample_chunk_size(len(ar_shards)) + else: + chunk_size_or_num_group = sample_num_ar_groups(ar_shards, + heuristics['num_group_bounds'][0], + heuristics['num_group_bounds'][1]) + assert chunk_size_or_num_group > 0, "chunk_size or num_groups need to > 1..." + + if merge_scheme in ['random', None]: + tmp_assignments = sample_util.sample_merge_group(chunk_size_or_num_group, len(ar_shards)) + group_assignments = OrderedDict() + for i, shard_name in enumerate(ar_shards): + group_assignments[shard_name] = tmp_assignments[i] + elif merge_scheme == 'by_chunk': + # sample chunk_size + group_assignments = chunk_group_assigner(ar_shards, chunk_size_or_num_group) + elif merge_scheme == 'christy': + group_assignments = christy_group_assigner(ar_shards, + helpers, + chunk_size_or_num_group) + elif merge_scheme == 'ordered_balanced': + group_assignments = ordered_balanced_group_assigner(ar_shards, + helpers, + chunk_size_or_num_group) + else: + raise ValueError('unseen merge scheme..') + + for shard_name, (idx,) in ar_shards.items(): + ar_shards[shard_name] = (idx, group_assignments[shard_name]) + assign_ar_group(node_config, ar_shards) + + +def sample_num_ar_groups(ar_shards, lb, ub): + min_num_group = max(1, lb) + max_num_group = min(len(ar_shards), ub) + num_group = sample_util.uniform_sample_by_choices(range(min_num_group, max_num_group + 1)) + return num_group + + +def sample_chunk_size(num_ar_shards): + chunk_size = sample_util.uniform_sample_by_choices(range(1, num_ar_shards + 1)) + return chunk_size + + +def assign_ar_group(node_config, ar_shards): + for node in node_config: + if node.partitioner: + for part in node.part_config: + synchronizer = getattr(part, part.WhichOneof('synchronizer')) + if hasattr(synchronizer, 'compressor'): + synchronizer.group = ar_shards[part.var_name][1] + else: + synchronizer = getattr(node, node.WhichOneof('synchronizer')) + if hasattr(synchronizer, 'compressor'): + synchronizer.group = ar_shards[node.var_name][1] From fd2128dac36a2f647c7ac34af0d778ad3968dfb9 Mon Sep 17 00:00:00 2001 From: Hao Zhang Date: Thu, 16 Jul 2020 00:47:42 -0400 Subject: [PATCH 02/11] add docstrings for strategy sampler and minor improvements --- 
autodist/strategy/auto/sample_util.py | 61 ++++ ...random_strategy.py => strategy_sampler.py} | 307 +++++++++++++++--- autodist/strategy/base.py | 36 ++ autodist/strategy/partitioned_ps_strategy.py | 37 +-- 4 files changed, 356 insertions(+), 85 deletions(-) create mode 100644 autodist/strategy/auto/sample_util.py rename autodist/strategy/auto/{random_strategy.py => strategy_sampler.py} (67%) diff --git a/autodist/strategy/auto/sample_util.py b/autodist/strategy/auto/sample_util.py new file mode 100644 index 0000000..2547304 --- /dev/null +++ b/autodist/strategy/auto/sample_util.py @@ -0,0 +1,61 @@ +# Copyright 2020 Petuum. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Sample utility functions.""" + +import numpy as np + + +def uniform_sample_by_choices(choices): + """ + Uniformly sample an option from a list of options. + + Args: + choices (list): a list of values to be sampled from. + + Returns: + choice: the sampled value. + + """ + assert choices + p = np.random.uniform() + t = 1.0 / len(choices) + sample = choices[0] + for i, c in enumerate(choices): + if p < t * (i+1): + sample = c + break + return sample + + +def binary_sample(boundary=0.5): + p = np.random.uniform() + if p < boundary: + return True + else: + return False + + +def sample_merge_group(num_group, num_candidates): + + def is_valid(assignment): + unique_assignment = np.unique(assignment) + if unique_assignment.shape[0] == num_group: + return True + return False + + assignment = np.random.randint(1, num_group+1, [num_candidates]) + while not is_valid(assignment): + assignment = np.random.randint(1, num_group+1, [num_candidates]) + return assignment diff --git a/autodist/strategy/auto/random_strategy.py b/autodist/strategy/auto/strategy_sampler.py similarity index 67% rename from autodist/strategy/auto/random_strategy.py rename to autodist/strategy/auto/strategy_sampler.py index 24150dd..3281c4b 100644 --- a/autodist/strategy/auto/random_strategy.py +++ b/autodist/strategy/auto/strategy_sampler.py @@ -12,21 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. 
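The three helpers in sample_util.py cover everything the sampler draws on: a uniform pick from a finite choice list, a biased coin flip, and a rejection-sampled assignment of candidates to exactly num_group non-empty groups. A quick usage sketch; outputs vary from run to run, and the seed is only there to make the example repeatable:

import numpy as np
from autodist.strategy.auto import sample_util

np.random.seed(0)
print(sample_util.uniform_sample_by_choices(['PS', 'AR']))   # e.g. 'AR'
print(sample_util.binary_sample(boundary=0.9))               # True with probability 0.9
print(sample_util.sample_merge_group(num_group=3, num_candidates=8))
# e.g. [2 1 3 1 2 3 3 1] -- every group id in 1..3 appears at least once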
-"""AllReduce StrategyBuilder.""" +"""Strategy sampler that generates random strategies given model and resource spec.""" + from collections import OrderedDict from enum import Enum from tensorflow.python.framework import ops -from arion.kernel.common.utils import get_op_name, get_consumers -from arion.kernel.partitioner import PartitionerConfig -from arion.proto import strategy_pb2, synchronizers_pb2 -from arion.search import sample_util -from arion.strategy.base import Strategy, StrategyBuilder -from arion.strategy.base import byte_size_load_fn -from arion.strategy.component.ps_load_balancer import greedy_load_balancer, christy_load_balancer -from arion.strategy.component.ar_group_assigner import chunk_group_assigner, \ - christy_group_assigner, ordered_balanced_group_assigner +from autodist.kernel.common.utils import get_op_name, get_consumers +from autodist.kernel.partitioner import PartitionerConfig +from autodist.proto import strategy_pb2, synchronizers_pb2 +from autodist.strategy.base import Strategy, StrategyBuilder, byte_size_load_fn +from autodist.strategy.auto.ps_load_balancer import greedy_load_balancer, christy_load_balancer +from autodist.strategy.auto.ar_group_assigner import chunk_group_assigner, christy_group_assigner, \ + ordered_balanced_group_assigner +from autodist.strategy.auto import sample_util + class VarType(Enum): SPARSE = 0 @@ -34,6 +35,7 @@ class VarType(Enum): class VariableHelper: + """Helper class to include meta information about a variable.""" def __init__(self, var, graph_item): self.var = var self.graph_item = graph_item @@ -42,54 +44,97 @@ def __init__(self, var, graph_item): @property def var_type(self): + """ + Return the type of the variable (VarType.SPARSE or VarType.DENSE). + + Returns: + VarType + """ return VarType.DENSE if isinstance(self._grad, ops.Tensor) else VarType.SPARSE @property def is_sparse(self): + """ + Return whether the variable is sparse. + + Returns: + Bool + """ return True if self.var_type == VarType.SPARSE else False @property def is_embedding(self): + """ + Return whether the variable corresponds to an embedding. + + Returns: + Bool + """ + # TODO (Hao): better way to determine is_embedding? for op in get_consumers(self.var.op): if op.type == "ResourceGather": return True - # op = new_graph_item.graph.get_operation_by_name( - # ops.prepend_name_scope(op.name, ARION_TO_DELETE_SCOPE) - # ) return False @property def shape(self): + """ + Return the shape of the variable, or None if it does not emit a tensor (e.g. scalar). + + Returns: + List(int) + """ if self.var.initial_value.shape.ndims: return self.var.initial_value.shape.as_list() else: return None @property - def partitionable_axis(self): - valid_axis = [] + def partitionable_axes(self): + """ + Return the list of available axes that are legitimate to partition along. + + Returns: + List(int) + """ + valid_axes = [] + + # scalar if not self.shape: - return valid_axis - # Sparse variable can only be partition along the 0th axis - # only sample axis for dense variables + return valid_axes + + # Sparse variable can only be partition along the 0th axis in current implementation. if self.is_sparse or self.is_embedding: - valid_axis = [0] - return valid_axis + valid_axes = [0] + return valid_axes for idx, dim in enumerate(self.shape): if dim > 1: - valid_axis.append(idx) - return valid_axis + valid_axes.append(idx) + return valid_axes @property def byte_size(self): + """ + Return the byte size of the variable. 
+ + Returns: + float + """ return float(byte_size_load_fn(self.var)) @property def dtype(self): + """ + Return the dtype of the variable. + + Returns: + dtype + """ return self.var.dtype class PartHelper: + """Helper class to include meta information about a variable partition.""" def __init__(self, part_idx, var, pc): self.var = var self.part_idx = part_idx @@ -97,6 +142,13 @@ def __init__(self, part_idx, var, pc): @property def shape(self): + """ + Return the shape of this partition. + + Returns: + List(int) + + """ shape = self.var.initial_value.shape.as_list() dim_size = shape[self.pc.axis] // self.pc.num_shards extras = shape[self.pc.axis] % self.pc.num_shards @@ -107,41 +159,62 @@ def shape(self): @property def var_shape(self): + """ + Return the shape of the original value this part belonged to. + + Returns: + List(int) + """ return self.var.initial_value.shape.as_list() @property def byte_size(self): + """ + Return the byte size of this partition. + + Returns: + float + """ return float(byte_size_load_fn(self.var)) \ * float(self.shape[self.pc.axis]) / float(self.var_shape[self.pc.axis]) -class RandomStrategy(StrategyBuilder): +class RandomStrategySampler(StrategyBuilder): + """ + Random Strategy Sampler. + + This StrategyBuilder samples a strategy given graph_item and resource_spec. The sampling process is + constrained by `space`, and guided by `heuristics`, both as required arguments of its constructor. + """ def __init__(self, space, heuristics): """ Args: - self: - enable_ps_load_balancer: - enable_chunk: - - Returns: - + space (dict): the strategy space that the random strategy should be drawn from. An example of the space + can be found at TODO(Hao). + heuristics (dict): heuristics used to guide the random sampling process. """ + if not space: + raise ValueError('Space to perform strategy sampling is not provided.') + if not heuristics: + raise ValueError('Heuristic to guide strategy sampling is not provided.') self.space = space self.heuristics = heuristics self.helpers = {} def reset(self): + """Reset the helpers every time a strategy is sampled.""" self.helpers = {} def build(self, graph_item, resource_spec): + """Generate a randomized strategy given model and resource spec.""" expr = Strategy() # number of graph replica is equal to number of GPU devices expr.graph_config.replicas.extend([k for k, v in resource_spec.gpu_devices]) variables = graph_item.trainable_var_op_to_var.values() - # A fully MCMC process to generate node configs + # Perform MCMC to generate each node configs node_config = [] for var in variables: var_helper = VariableHelper(var, graph_item) @@ -151,18 +224,18 @@ def build(self, graph_item, resource_spec): node.var_name = var_helper.var.name # Step 1: determine whether or not to partition - # TODO(Hao): other factor not considered -- number of reduction_device_names + # TODO(Hao): some factor is not considered, e.g. 
number of reduction_device_names maybe_partition = sample_if_partition(var_helper, resource_spec, self.space, self.heuristics) # Step 2.1: if not partition, sample a synchronizer type for it if not maybe_partition: # no partition sample_var_synchronizer(node, var_helper, resource_spec, self.space) - else: # Step 2.2: if partition + else: # Step 2.2: else partition # Step 2.2.1: sample a partitioner config pc = sample_partition_config(var_helper, resource_spec, self.space, self.heuristics) node.partitioner = pc.partition_str - # step 2.2.2: sample a synchornizer type for each partition + # step 2.2.2: sample a synchronizer type for each partition parts = [] for i in range(pc.num_shards): part = strategy_pb2.Strategy.Node() @@ -173,13 +246,26 @@ def build(self, graph_item, resource_spec): node.part_config.extend(parts) node_config.append(node) + # Step 3: Post-assign group or placement. sample_group_and_reduction_destinations(node_config, resource_spec, self.helpers, self.heuristics) - # Mark each variable to be synchronized with a Parameter Server + expr.node_config.extend(node_config) return expr def sample_if_partition(var_helper, resource_spec, space, heuristics): + """ + Sample a bool value determining whether to partition a variable or not. + + Args: + var_helper: the variable helper corresponded to the variable of interest. + resource_spec: the target cluster spec. + space: the space argument controlling where to sample from. + heuristics: the heuristics argument guiding the sampling process. + + Returns: + Bool + """ reduction_device_names = [k for k, _ in resource_spec.cpu_devices] if len(space['maybe_partition']) == 1: return space['maybe_partition'] @@ -214,9 +300,16 @@ def sample_if_partition(var_helper, resource_spec, space, heuristics): def sample_var_synchronizer(node, var_helper, resource_spec, space): - # sample a single synchornizer for an unpartitioned variable, - # will eave merge_group of reduction_destination as empty - + """ + Sample a synchronizer (and all associated aspects) for an unpartitioned variable, + leaving merge_group or reduction_destination as empty. + + Args: + node (strategy_pb2.Strategy.Node): the corresponded node_config to be rewritten. + var_helper (VariableHelper): the variable helper corresponded to the variable. + resource_spec (ResourceSpec): the target cluster spec + space (dict): space. + """ # We ALWAYS use PS for sparse variables synchronizer_type = 'PS' if var_helper.var_type == VarType.SPARSE \ else sample_util.uniform_sample_by_choices(space['synchronizer_types']) @@ -234,8 +327,18 @@ def sample_var_synchronizer(node, var_helper, resource_spec, space): def sample_parts_synchronizers(parts, var_helper, resource_spec, space, heuristics): - # sample synchornizer for a group of variable partitions - + """ + Sample synchronizers for all the partitions of a variable. + + Args: + parts: + var_helper: + resource_spec: + space: + heuristics: + + Returns: + """ if var_helper.var_type == VarType.SPARSE: synchronizer_types = ['PS'] * len(parts) else: @@ -260,8 +363,19 @@ def sample_parts_synchronizers(parts, var_helper, resource_spec, space, heuristi def sample_partition_config(var_helper, resource_spec, space, heuristics): - # Since Arion only support parttion along one axis, - # we first sample a partition axis, then sammple #partition along that axis, we obtain the partition config. + """ + Sample the PartitionerConfig of a variable (that is to be partitioned). 
+ + Args: + var_helper: + resource_spec: + space: + heuristics: + + Returns: + """ + # Arion only support partitioning along one axis -- we first sample a partition axis, + # then sample the number of partitions along that axis, and obtain the partition config. assert len(var_helper.partitionable_axis) > 0, 'No partition axis available' # sample partition axis # TODO(Hao): some heursitics here available? @@ -299,7 +413,19 @@ def sample_partition_config(var_helper, resource_spec, space, heuristics): def sample_if_local_replication(local_replication_space, resource_spec): - # Local replication is a PS-specific semantic; it represents whether to use hierarchical PS + """ + Sample whether to perform local replication. + + Local replication is a PS-specific semantic; it represents whether to transfer parameters or updates + via a transfer device. + + Args: + local_replication_space: + resource_spec: + + Returns: + + """ if resource_spec.num_gpus <= resource_spec.num_cpus: # meaning every machine has at most 1 GPU return False @@ -307,14 +433,34 @@ def sample_if_local_replication(local_replication_space, resource_spec): def sample_ar_compressor(compressor_space): - # [NoneCompressor, HorovodCompressor, HorovodCompressorEF, PowerSGDCompressor] - # [ HorovodCompressorEF, PowerSGDCompressor] will change gradient value - # so only two choices here + """ + Sample the type of the compressor being applied with collective ops. + + Available options include `NoneCompressor`, `HorovodCompressor`, `HorovodCompressorEF`, + `PowerSGDCompressor`, but `HorovodCompressorEF`, `PowerSGDCompressor` will change gradient value. + Args: + compressor_space: + + Returns: + """ # TODO(Hao): try to use all four options return sample_util.uniform_sample_by_choices(compressor_space) def sample_group_and_reduction_destinations(node_config, resource_spec, helpers, heuristics): + """ + Sample the merge group or parameter placement (a.k.a. reduction_destination) after all other semantics + have been determined. + + Args: + node_config: + resource_spec: + helpers: + heuristics: + + Returns: + + """ ps_shards = OrderedDict() ar_shards = OrderedDict() idx = 0 @@ -337,13 +483,24 @@ def sample_group_and_reduction_destinations(node_config, resource_spec, helpers, if len(ps_shards) > 0: sample_ps_reduction_destinations(node_config, ps_shards, resource_spec, helpers, heuristics) - - # step 4: assign ar merge groups globally if len(ar_shards) > 0: sample_ar_groups(node_config, ar_shards, helpers, heuristics) def sample_ps_reduction_destinations(node_config, ps_shards, resource_spec, helpers, heuristics): + """ + Sample the placement of shared parameter variables (a.k.a. reduction destinations). + + Args: + node_config: + ps_shards: + resource_spec: + helpers: + heuristics: + + Returns: + + """ load_balancer = heuristics['ps_load_balancer'] reduction_device_names = [k for k, _ in resource_spec.cpu_devices] if not load_balancer: @@ -369,6 +526,16 @@ def sample_ps_reduction_destinations(node_config, ps_shards, resource_spec, help def assign_ps_reduction_destinations(node_config, ps_shards): + """ + Assign the sampled reduction destinations to node_config. + + Args: + node_config: + ps_shards: + + Returns: + + """ for node in node_config: if node.partitioner: for part in node.part_config: @@ -382,6 +549,18 @@ def assign_ps_reduction_destinations(node_config, ps_shards): def sample_ar_groups(node_config, ar_shards, helpers, heuristics): + """ + Sample the group of collective operations. 
+ + Args: + node_config: + ar_shards: + helpers: + heuristics: + + Returns: + + """ merge_scheme = heuristics['merge_scheme'] if merge_scheme == 'by_chunk': if 'chunk_size' in heuristics and heuristics['chunk_size'] > 0: @@ -419,18 +598,48 @@ def sample_ar_groups(node_config, ar_shards, helpers, heuristics): def sample_num_ar_groups(ar_shards, lb, ub): + """ + Sample the number of collective groups. + + Args: + ar_shards: + lb: + ub: + + Returns: + + """ min_num_group = max(1, lb) max_num_group = min(len(ar_shards), ub) - num_group = sample_util.uniform_sample_by_choices(range(min_num_group, max_num_group + 1)) + num_group = sample_util.uniform_sample_by_choices(list(range(min_num_group, max_num_group + 1))) return num_group def sample_chunk_size(num_ar_shards): - chunk_size = sample_util.uniform_sample_by_choices(range(1, num_ar_shards + 1)) + """ + Sample the chunk_size if following a chunk-based merge scheme. + + Args: + num_ar_shards: + + Returns: + + """ + chunk_size = sample_util.uniform_sample_by_choices(list(range(1, num_ar_shards + 1))) return chunk_size def assign_ar_group(node_config, ar_shards): + """ + Assign the sampled group values to node configs. + + Args: + node_config: + ar_shards: + + Returns: + + """ for node in node_config: if node.partitioner: for part in node.part_config: diff --git a/autodist/strategy/base.py b/autodist/strategy/base.py index 965e1ff..df562da 100644 --- a/autodist/strategy/base.py +++ b/autodist/strategy/base.py @@ -18,6 +18,8 @@ from abc import ABC, abstractmethod from datetime import datetime +from tensorflow.python.framework import tensor_shape + from autodist.const import DEFAULT_SERIALIZATION_DIR from autodist.graph_item import GraphItem from autodist.kernel.common.utils import get_op_name @@ -166,3 +168,37 @@ def compile(self, strategy): if self._device_resolver: strategy = self._resolve_devices(strategy) return strategy + + +def byte_size_load_fn(op): + """ + Load function that computes the byte size of a single-output `Operation`. + + Copied (with modifications) from tensorflow.contrib.training.python.training.device_setter. + + This is intended to be used with `"Variable"` ops, which have a single + `Tensor` output with the contents of the variable. However, it can also be + used for calculating the size of any op that has a single output. + + Intended to be used with `GreedyLoadBalancingStrategy`. + + Args: + op: An `Operation` with a single output, typically a "Variable" op. + + Returns: + The number of bytes in the output `Tensor`. + + Raises: + ValueError: if `op` does not have a single output, or if the shape of the + single output is not fully-defined. + """ + elem_size = op.dtype.size + shape = op.get_shape() + if not shape.is_fully_defined(): + # Due to legacy behavior, scalar "Variable" ops have output Tensors that + # have unknown shape when the op is created (and hence passed to this + # load function for placement), even though the scalar shape is set + # explicitly immediately afterward. 
+ shape = tensor_shape.TensorShape(op.get_attr("shape")) + shape.assert_is_fully_defined() + return shape.num_elements() * elem_size diff --git a/autodist/strategy/partitioned_ps_strategy.py b/autodist/strategy/partitioned_ps_strategy.py index b1259a6..ecca253 100644 --- a/autodist/strategy/partitioned_ps_strategy.py +++ b/autodist/strategy/partitioned_ps_strategy.py @@ -15,13 +15,12 @@ """Partitioned PS StrategyBuilder with Greedy Load Balancer.""" from math import ceil -from tensorflow.python.framework import tensor_shape from autodist.const import ENV from autodist.kernel.common.op_info import CONTROL_FLOW_OPS from autodist.kernel.common.utils import get_consumers, get_op_name from autodist.kernel.partitioner import PartitionerConfig -from autodist.strategy.base import Strategy, StrategyBuilder +from autodist.strategy.base import Strategy, StrategyBuilder, byte_size_load_fn from autodist.proto import strategy_pb2 @@ -133,37 +132,3 @@ def get_num_shards(var): if n % i == 0: return i return n - - -def byte_size_load_fn(op): - """ - Load function that computes the byte size of a single-output `Operation`. - - Copied (with modifications) from tensorflow.contrib.training.python.training.device_setter. - - This is intended to be used with `"Variable"` ops, which have a single - `Tensor` output with the contents of the variable. However, it can also be - used for calculating the size of any op that has a single output. - - Intended to be used with `GreedyLoadBalancingStrategy`. - - Args: - op: An `Operation` with a single output, typically a "Variable" op. - - Returns: - The number of bytes in the output `Tensor`. - - Raises: - ValueError: if `op` does not have a single output, or if the shape of the - single output is not fully-defined. - """ - elem_size = op.dtype.size - shape = op.get_shape() - if not shape.is_fully_defined(): - # Due to legacy behavior, scalar "Variable" ops have output Tensors that - # have unknown shape when the op is created (and hence passed to this - # load function for placement), even though the scalar shape is set - # explicitly immediately afterward. 
- shape = tensor_shape.TensorShape(op.get_attr("shape")) - shape.assert_is_fully_defined() - return shape.num_elements() * elem_size From 59fb2a371dda68ea406abb5f09b72a39b7adf57c Mon Sep 17 00:00:00 2001 From: Hao Zhang Date: Thu, 16 Jul 2020 02:15:16 -0400 Subject: [PATCH 03/11] add a few more comments and predefined simulator --- autodist/simulator/{models => }/base.py | 92 +-- autodist/simulator/models/__init__.py | 0 .../simulator/models/rankrnn_simulator.py | 634 --------------- .../models/rankrnn_simulator_penalty.py | 729 ------------------ autodist/simulator/predefined_simulator.py | 374 +++++++++ ...r_penalty_fast.py => rankrnn_simulator.py} | 0 autodist/strategy/auto/ar_group_assigner.py | 28 +- autodist/strategy/auto/auto_strategy.py | 249 ++++++ autodist/strategy/auto/ps_load_balancer.py | 44 +- 9 files changed, 739 insertions(+), 1411 deletions(-) rename autodist/simulator/{models => }/base.py (83%) delete mode 100644 autodist/simulator/models/__init__.py delete mode 100644 autodist/simulator/models/rankrnn_simulator.py delete mode 100644 autodist/simulator/models/rankrnn_simulator_penalty.py create mode 100644 autodist/simulator/predefined_simulator.py rename autodist/simulator/{models/rankrnn_simulator_penalty_fast.py => rankrnn_simulator.py} (100%) diff --git a/autodist/simulator/models/base.py b/autodist/simulator/base.py similarity index 83% rename from autodist/simulator/models/base.py rename to autodist/simulator/base.py index a12c147..964302b 100644 --- a/autodist/simulator/models/base.py +++ b/autodist/simulator/base.py @@ -203,52 +203,52 @@ def extract_pre_feature(self, strategy: Strategy, resource_spec: ResourceSpec): meta[var_meta.name] = var_meta return meta, resource - def extract_pre_feature_legacy(self, strategy): - """Don't use now!!!""" - meta = defaultdict() - for node in strategy.node_config: - var_name = node.var_name - for var_op, var in self._original_graph_item.trainable_var_op_to_var.items(): - if var.name == var_name: - break - var_op_name = var_op.name - var_helper = VariableHelper(var, self._original_graph_item) - synchronizer = getattr(node, node.WhichOneof('synchronizer')) - compressor = getattr(synchronizer, 'compressor', None) - if compressor is not None: - compressor = AllReduceSynchronizer.Compressor.Name(compressor) - reduction_destinations = getattr(synchronizer, 'reduction_destinations', None) - if not reduction_destinations or len(reduction_destinations) <= 1: - # this variable is not partitioned - device = reduction_destinations[0] if reduction_destinations else var.device - var_meta = Var(name=var_name, - is_sparse=var_helper.is_sparse, - shape=var_helper.shape, - dtype=var_helper.dtype, - synchronizer=synchronizer, - compressor=compressor, - device=device) - meta[var_meta.name] = var_meta - else: - # this variable is partitioned - num_partitions = len(reduction_destinations) - partition_list = [1] * len(var_helper.shape) - partition_list[0] = num_partitions - pc = PartitionerConfig(partition_list=partition_list) - for i, device in enumerate(reduction_destinations): - part_helper = PartHelper(i, var, pc) - part_meta = Partition(name='{}/part_{}:0'.format(var_op_name, i), - is_sparse=var_helper.is_sparse, - shape=part_helper.shape, - dtype=var_helper.dtype, - synchronizer=synchronizer, - part_id=i, - partition_str=pc.partition_str, - original_shape=var_helper.shape, - compressor=compressor, - device=device) - meta[part_meta.name] = part_meta - return meta + # def extract_pre_feature_legacy(self, strategy): + # """Don't use now!!!""" + # 
meta = defaultdict() + # for node in strategy.node_config: + # var_name = node.var_name + # for var_op, var in self._original_graph_item.trainable_var_op_to_var.items(): + # if var.name == var_name: + # break + # var_op_name = var_op.name + # var_helper = VariableHelper(var, self._original_graph_item) + # synchronizer = getattr(node, node.WhichOneof('synchronizer')) + # compressor = getattr(synchronizer, 'compressor', None) + # if compressor is not None: + # compressor = AllReduceSynchronizer.Compressor.Name(compressor) + # reduction_destinations = getattr(synchronizer, 'reduction_destinations', None) + # if not reduction_destinations or len(reduction_destinations) <= 1: + # # this variable is not partitioned + # device = reduction_destinations[0] if reduction_destinations else var.device + # var_meta = Var(name=var_name, + # is_sparse=var_helper.is_sparse, + # shape=var_helper.shape, + # dtype=var_helper.dtype, + # synchronizer=synchronizer, + # compressor=compressor, + # device=device) + # meta[var_meta.name] = var_meta + # else: + # # this variable is partitioned + # num_partitions = len(reduction_destinations) + # partition_list = [1] * len(var_helper.shape) + # partition_list[0] = num_partitions + # pc = PartitionerConfig(partition_list=partition_list) + # for i, device in enumerate(reduction_destinations): + # part_helper = PartHelper(i, var, pc) + # part_meta = Partition(name='{}/part_{}:0'.format(var_op_name, i), + # is_sparse=var_helper.is_sparse, + # shape=part_helper.shape, + # dtype=var_helper.dtype, + # synchronizer=synchronizer, + # part_id=i, + # partition_str=pc.partition_str, + # original_shape=var_helper.shape, + # compressor=compressor, + # device=device) + # meta[part_meta.name] = part_meta + # return meta def setup_resource(self, resource_spec: ResourceSpec): cluster = SSHCluster(resource_spec) diff --git a/autodist/simulator/models/__init__.py b/autodist/simulator/models/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/autodist/simulator/models/rankrnn_simulator.py b/autodist/simulator/models/rankrnn_simulator.py deleted file mode 100644 index 4459515..0000000 --- a/autodist/simulator/models/rankrnn_simulator.py +++ /dev/null @@ -1,634 +0,0 @@ -"""Strategy RankNetSimulator.""" -import glob -import json -import sys -from datetime import datetime -from pathlib import Path -from string import digits - -import numpy as np -import os -import tensorflow as tf -tf.compat.v1.disable_eager_execution() - -import arion -from arion.graph_item import GraphItem -from arion.proto.synchronizers_pb2 import PSSynchronizer, AllReduceSynchronizer -from arion.simulator.models.base import SimulatorBase -from arion.simulator.utils import get_dense_var_bits, get_sparse_var_bits, GIGABITS -from arion.simulator.utils import _resolve_device_address, _max_num_local_replica, _num_local_replica -from arion.strategy.random_sample_strategy import VariableHelper, PartHelper -from arion.strategy.base import Strategy -from arion.resource_spec import ResourceSpec -from arion.cluster import SSHCluster -from arion.kernel.device.resolver import DeviceResolver -from arion.kernel.partitioner import PartitionerConfig -from arion.simulator.models.predefined_simulator import PredefinedSimulator - -import torch -import torch.nn as nn - -TORCH_DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - -# feature settings -MAX_NUM_WORKERS = 16 -MAX_NUM_GROUPS = 600 -MAX_NUM_VARS = 500 -MAX_NUM_PARS = 1500 - -# model size -FEATURE_SIZE = MAX_NUM_WORKERS+MAX_NUM_GROUPS+15 
-PARTITION_MLP_HIDDEN = 128 -PARTITION_MLP_OUT = 32 -STEM_RNN_HIDDEN = 128 -BIDIECTIONAL = True -NUM_RNN_LAYERS = 3 - -# trainer setting -BATCH_SIZE = 64 -LR = 3e-4 -WD = 3e-4 - -GRAPH_ITEM_PATHS = {'ncf':'/users/hzhang2/projects/pycharm/zhijie/5-5-2020/original_graph_item', - 'densenet121': '/users/hzhang2/projects/pycharm/zhijie/graph_items/densenet121_original_graph_item', - 'inceptionv3': '/users/hzhang2/projects/pycharm/zhijie/graph_items/inceptionv3_original_graph_item', - 'resnet101': '/users/hzhang2/projects/pycharm/zhijie/graph_items/resnet101_original_graph_item', - 'resnet50': '/users/hzhang2/projects/pycharm/zhijie/graph_items/resnet50_original_graph_item', - 'vgg16': '/users/hzhang2/projects/pycharm/zhijie/graph_items/vgg16_original_graph_item', - 'bert_12l': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_12l', - 'bert_6l': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_6l', - 'bert_3l': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_3l', - 'bert_large': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_large'} - -def get_model(path_): - if 'densenet121' in path_: - return 'densenet121' - elif 'ncf' in path_: - return 'ncf' - elif 'inceptionv3' in path_: - return 'inceptionv3' - elif 'resnet101' in path_: - return 'resnet101' - elif 'resnet50' in path_: - return 'resnet50' - elif 'vgg16' in path_: - return 'vgg16' - elif 'bert' in path_ and '12l' in path_: - return 'bert_12l' - elif 'bert' in path_ and '6l' in path_: - return 'bert_6l' - elif 'bert' in path_ and '3l' in path_: - return 'bert_3l' - elif 'bert' in path_ and 'large' in path_: - return 'bert_large' - else: - return None - -class RankRNN(nn.Module): - def __init__(self, input_size=FEATURE_SIZE, - partition_mlp_hidden=PARTITION_MLP_HIDDEN, - partition_mlp_out=PARTITION_MLP_OUT, - stem_rnn_hidden=STEM_RNN_HIDDEN, - num_rnn_layers=NUM_RNN_LAYERS, - bidirectional=BIDIECTIONAL): - super(RankRNN, self).__init__() - self.partition_mlp_out = partition_mlp_out - # self.num_rnn_layers = num_rnn_layers - self.stem_rnn_hidden = stem_rnn_hidden - self.partition_mlp = nn.Sequential(nn.Linear(input_size, partition_mlp_hidden), - nn.ReLU(), - # nn.Linear(partition_mlp_hidden, partition_mlp_hidden), - # nn.ReLU(), - nn.Linear(partition_mlp_hidden, partition_mlp_out), - ) - - self.stem_rnn = nn.LSTM(partition_mlp_out, stem_rnn_hidden, num_rnn_layers, batch_first=True, bidirectional=bidirectional) - self.final_fc = nn.Linear(stem_rnn_hidden*num_rnn_layers*(1+int(bidirectional)), 1) - - self.relu = nn.ReLU() - - def forward(self, features, par_indices, var_nums): - - x = features.float() - # x = torch.cat([features[:, :, :MAX_NUM_WORKERS], features[:, :, MAX_NUM_WORKERS+MAX_NUM_GROUPS:]], 2).float() - x = self.partition_mlp(x) - - x1 = torch.zeros(features.shape[0], MAX_NUM_VARS, self.partition_mlp_out, device=TORCH_DEVICE, dtype=x.dtype) - x1.scatter_add_(1, par_indices.long()[:, :, None].expand(par_indices.shape[0], par_indices.shape[1], self.partition_mlp_out), x) - - # Set initial hidden and cell states - # h0 = torch.zeros(self.num_rnn_layers, x.size(0), self.stem_rnn_hidden).to(TORCH_DEVICE) - # c0 = torch.zeros(self.num_rnn_layers, x.size(0), self.stem_rnn_hidden).to(TORCH_DEVICE) - - # Forward propagate LSTM - x1 = torch.nn.utils.rnn.pack_padded_sequence(x1, var_nums.long(), batch_first=True, enforce_sorted=False) - out, (ht, ct) = self.stem_rnn(x1) # out: tensor of shape (batch_size, seq_length, hidden_size) - 
- # out = torch.nn.utils.rnn.pad_packed_sequence(out, batch_first=True)[0].sum(1) / var_nums[:, None] - out = ht.permute(1, 0, 2).reshape(x.shape[0], -1) - # print(out[0, var_nums[0] -1, [3]], out[0, var_nums[0], [3]]) - # print(ht.permute(1, 0, 2).shape, x.shape) - out = self.final_fc(out) - return out - -class TrainTensorDataset(torch.utils.data.Dataset): - """TensorDataset with support of transforms. - """ - def __init__(self, tensors): - assert all(tensors[0].size(0) == tensor.size(0) for tensor in tensors) - self.tensors = tensors - - def __getitem__(self, index): - x = self.tensors[0][index] - x = self.perturbe_device_and_group(x) - x1 = self.tensors[1][index] - x2 = self.tensors[2][index] - - y = self.tensors[3][index] - - return x, x1, x2, y - - def __len__(self): - return self.tensors[0].size(0) - - def perturbe_device_and_group(self, x): - # perturbed_device_ids = np.random.permutation(MAX_NUM_WORKERS).astype(np.int32) - # perturbed_group_ids = np.random.permutation(MAX_NUM_GROUPS).astype(np.int32) - # mat_device = torch.eye(MAX_NUM_WORKERS, device=x.device, dtype=x.dtype)[perturbed_device_ids] - # mat_group = torch.eye(MAX_NUM_GROUPS, device=x.device, dtype=x.dtype)[perturbed_group_ids] - # x = torch.cat([torch.matmul(x[:, :MAX_NUM_WORKERS], mat_device), torch.matmul(x[:, MAX_NUM_WORKERS:MAX_NUM_WORKERS+MAX_NUM_GROUPS], mat_group), x[:, MAX_NUM_WORKERS+MAX_NUM_GROUPS:]], 1) - return x - - -def to_numpy(synchronizer, device, size_ratio, is_sparse, bd, num_replicas): - ret = [np.zeros(MAX_NUM_WORKERS), np.zeros(MAX_NUM_GROUPS), np.zeros(3), np.zeros(5), np.zeros(3)] - - if device is not None: - ret[0][device] = 1 - - group = getattr(synchronizer, 'group', None) - if group is not None: - assert group < MAX_NUM_GROUPS, group - ret[1][group] = 1 - - compressor = getattr(synchronizer, 'compressor', None) - if compressor is not None: - if compressor in ["PowerSGDCompressor", 3]: - ret[2][2] = 1 - elif compressor in ["HorovodCompressorEF", "HorovodCompressor", 2, 1]: - ret[2][1] = 1 - elif compressor in ["NoneCompressor", 0]: - ret[2][0] = 1 - else: - raise ValueError('Compressor does not exist: {}'.format(compressor)) - - local_replication = getattr(synchronizer, 'local_replication', None) - if isinstance(synchronizer, PSSynchronizer): - synchronizer = 0 - if int(local_replication) == 0: - if int(is_sparse) == 0: - ret[3][0] = 1 - else: - ret[3][1] = 1 - else: - if int(is_sparse) == 0: - ret[3][2] = 1 - else: - ret[3][3] = 1 - else: - ret[3][4] = 1 - ret[4] = np.array([size_ratio, bd, num_replicas]) - - return np.concatenate(ret) - -def connvert_feature(strategy, resource_spec, graph_item): - - cluster = SSHCluster(resource_spec) - device_resolver = DeviceResolver(cluster) - graph_replicas = [_resolve_device_address(k, device_resolver) for k, v in resource_spec.gpu_devices] - # bandwidth - network_bandwidth = np.array([resource_spec.network_bandwidth[device.split(':')[0]] for device, _ in resource_spec.cpu_devices]) - network_bandwidth = network_bandwidth - min_network_bandwidth = network_bandwidth.min() - # Other information - cpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.cpu_devices] - gpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.gpu_devices] - max_num_local_replica = _max_num_local_replica(graph_replicas, cluster) - total_num_local_replica = len(graph_replicas) - worker_num_replicas = [_num_local_replica(cpu_worker, graph_replicas, cluster) for cpu_worker in cpu_worker_list] - 
- num_vars = 0 - total_size_vars = 0 - for var_op, var in graph_item.trainable_var_op_to_var.items(): - num_vars += 1 - if var.initial_value.shape.ndims: - var_helper = VariableHelper(var, graph_item) - if var_helper.is_sparse: - total_size_vars += get_sparse_var_bits(np.prod(var_helper.shape)) - else: - total_size_vars += get_dense_var_bits(np.prod(var_helper.shape), var.dtype) - assert num_vars < MAX_NUM_VARS, num_vars - var_partition_features = np.zeros((MAX_NUM_PARS, FEATURE_SIZE-4)).astype(np.float32) - partition_indice = np.ones(MAX_NUM_PARS).astype(np.float32) * (MAX_NUM_VARS - 1) - - cnt = 0 - for node_id, node in enumerate(strategy.node_config): - var_name = node.var_name - for var_op, var in graph_item.trainable_var_op_to_var.items(): - if var.name == var_name: - break - var_helper = VariableHelper(var, graph_item) - - if node.partitioner: - pc = PartitionerConfig(partition_str=node.partitioner) - for i, part in enumerate(node.part_config): - part_helper = PartHelper(i, var, pc) - synchronizer = getattr(part, part.WhichOneof('synchronizer')) - reduction_destination = getattr(synchronizer, 'reduction_destination', None) - device = _resolve_device_address(reduction_destination if reduction_destination else var.device, - device_resolver) - if device == '': - assert(isinstance(synchronizer, AllReduceSynchronizer)) - device = None - bd = min_network_bandwidth - num_replicas = 0 - else: - device = cpu_worker_list.index(device) - bd = network_bandwidth[device] - num_replicas = worker_num_replicas[device] - - if var_helper.is_sparse: - size_ratio = get_sparse_var_bits(np.prod(part_helper.shape))/total_size_vars - else: - size_ratio = get_dense_var_bits(np.prod(part_helper.shape), var_helper.dtype)/total_size_vars - var_partition_features[cnt] = to_numpy(synchronizer, device, size_ratio, var_helper.is_sparse, bd, num_replicas) - partition_indice[cnt] = node_id - cnt += 1 - else: - synchronizer = getattr(node, node.WhichOneof('synchronizer')) - reduction_destination = getattr(synchronizer, 'reduction_destination', None) - device = _resolve_device_address(reduction_destination if reduction_destination else var.device, - device_resolver) - if device == '': - assert(isinstance(synchronizer, AllReduceSynchronizer)) - device = None - bd = min_network_bandwidth - num_replicas = 0 - else: - device = cpu_worker_list.index(device) - bd = network_bandwidth[device] - num_replicas = worker_num_replicas[device] - - if var_helper.is_sparse: - size_ratio = get_sparse_var_bits(np.prod(var_helper.shape))/total_size_vars - else: - size_ratio = get_dense_var_bits(np.prod(var_helper.shape), var_helper.dtype)/total_size_vars - var_partition_features[cnt] = to_numpy(synchronizer, device, size_ratio, var_helper.is_sparse, bd, num_replicas) - partition_indice[cnt] = node_id - cnt += 1 - return var_partition_features, partition_indice, np.array(node_id+1) - -def create_predefined_features(strategy, resource_spec, predefined_simulator): - - var_sync_time, vars, resource = predefined_simulator.predefined_sync_time(strategy, resource_spec) - - features = [] - for var_name, sync_time in var_sync_time.items(): - if isinstance(sync_time, list) or isinstance(sync_time, tuple): # send_time, receive_time in PS strategies. 
- transmission = sync_time[0]['transmission'] + sync_time[1]['transmission'] - sync_time = sync_time[0] - is_ps = True - else: # AR - transmission = sync_time['transmission'] - is_ps = False - - network_overhead = sync_time['network_overhead'] - gpu_kernel_memory_latency = sync_time['gpu_kernel_memory_latency'] - - feat = [transmission, network_overhead, gpu_kernel_memory_latency, float(is_ps)] - features.append(feat) - features = np.array(features, dtype=np.float) - return features - -class RankRNNSimulator(SimulatorBase): - """Simulates strategies for a given graph and resource spec.""" - - def __init__(self, - original_graph_item_path, - fetches=None, - batch_size=1, - seq_len=1, - checkpoint=None): - - super(RankRNNSimulator, self).__init__(original_graph_item_path=original_graph_item_path) - print("It's using RankNet simulator.") - self._fetches = fetches - self._batch_size_per_gpu = batch_size - self._seq_len = seq_len - self._checkpoint = checkpoint - self._predefined_simulator=PredefinedSimulator(original_graph_item_path=original_graph_item_path, - batch_size=self._batch_size_per_gpu, - seq_len=self._seq_len) - if self._checkpoint: - self._model = RankRNN().to(TORCH_DEVICE) - self._model.load_state_dict(torch.load(self._checkpoint, map_location=torch.device('cpu'))) - - def simulate(self, strategy, resource_spec, strategy_path=None, checkpoint=None): - cost = self.predict(strategy, resource_spec, strategy_path, checkpoint) - return cost - - def predict(self, - strategy, - resource_spec, - strategy_path=None, - checkpoint=None): - if checkpoint is None: - if self._checkpoint is None: - raise ValueError("checkpoint is None: {}".format(checkpoint)) - else: - model = self._model - else: - model = RankRNN().to(TORCH_DEVICE) - model.load_state_dict(torch.load(checkpoint)) - if strategy_path and os.path.isfile((strategy_path+'.npz').replace('strategies', 'npz')): - loaded = np.load((strategy_path+'.npz').replace('strategies', 'npz')) - var_partition_features, partition_indice, var_num, _ = \ - loaded['x1'], loaded['x2'], loaded['x3'], loaded['y'] - else: - var_partition_features, partition_indice, var_num = \ - connvert_feature(strategy, resource_spec, self._original_graph_item) - - if strategy_path and os.path.isfile((strategy_path+'_pdf.npz').replace('strategies', 'npz')): - loaded = np.load((strategy_path+'_pdf.npz').replace('strategies', 'npz')) - predefined_features = loaded['x4'] - else: - predefined_features = create_predefined_features(strategy, resource_spec, self._predefined_simulator) - - var_partition_features = np.concatenate([var_partition_features, np.concatenate([predefined_features, np.zeros((MAX_NUM_PARS-predefined_features.shape[0], predefined_features.shape[1]))], 0)], 1) - - var_partition_features = torch.from_numpy(var_partition_features).unsqueeze(0).to(TORCH_DEVICE) - partition_indice = torch.from_numpy(partition_indice).unsqueeze(0).to(TORCH_DEVICE) - var_num = torch.from_numpy(var_num).unsqueeze(0).to(TORCH_DEVICE) - - return model(var_partition_features, partition_indice, var_num).view(-1).data.cpu().numpy() - -class RankNetTrainer(): - - def __init__(self, - checkpoint=None, - batch_size_per_gpu=256, - seq_len=1, - seed=1): - self._batch_size_per_gpu = batch_size_per_gpu - self._seq_len = seq_len - self.graph_items = {k:GraphItem.deserialize(v) for k, v in GRAPH_ITEM_PATHS.items()} - self.predefined_simulators = {k: PredefinedSimulator(original_graph_item_path=v, - batch_size=self._batch_size_per_gpu, - seq_len=self._seq_len) for k, v in 
GRAPH_ITEM_PATHS.items()} - self.model = RankRNN().to(TORCH_DEVICE) - if checkpoint: - self.model.load_state_dict(torch.load(checkpoint)) - self.optimizer = torch.optim.Adam(self.model.parameters(), lr=LR, weight_decay=WD) - print("It's using RankNet trainer.") - - def train(self, path_list, train_patterns=[('ncf', 0)], valid_patterns='same', num_epochs=200): - - features = {k: [[[], [], [], []], [[], [], [], []]] for k, _ in GRAPH_ITEM_PATHS.items()} - for training_path in path_list: - for path in Path(training_path).rglob('strategies'): - strategy_paths = glob.glob(os.path.join(path, '*')) - for strategy_path in strategy_paths: - if 'json' in strategy_path or \ - 'bert_large_batch_8_orca_16_group_2/' in strategy_path: - continue - model = get_model(strategy_path) - if model is None: - if not ('densenets169' in strategy_path or 'densenets201' in strategy_path): - assert False, strategy_path - continue - rs_path = strategy_path.replace('strategies', 'resource_specs') - runtime_path = strategy_path.replace('strategies', 'runtimes') - npz_path = (strategy_path+'.npz').replace('strategies', 'npz') - if not os.path.isfile(rs_path): - rs_path += '.yml' - if not (os.path.isfile(rs_path) and os.path.isfile(runtime_path)): - continue - if not os.path.exists(os.path.dirname(npz_path)): - os.makedirs(os.path.dirname(npz_path)) - - if not os.path.isfile(npz_path): - strategy = Strategy.deserialize(path=strategy_path) - rs = ResourceSpec(resource_file=rs_path) - var_partition_features, partition_indice, var_num = \ - connvert_feature(strategy, rs, self.graph_items[model]) - label = np.array(json.load(open(runtime_path))['average']) - np.savez_compressed(npz_path, x1=var_partition_features, x2=partition_indice, x3=var_num, y=label) - else: - loaded = np.load(npz_path) - var_partition_features, partition_indice, var_num, label = \ - loaded['x1'], loaded['x2'], loaded['x3'], loaded['y'] - - if not os.path.isfile(npz_path.replace('.npz', '_pdf.npz')): - predefined_features = create_predefined_features(Strategy.deserialize(path=strategy_path), ResourceSpec(resource_file=rs_path), self.predefined_simulators[model]) - np.savez_compressed(npz_path.replace('.npz', '_pdf.npz'), x4=predefined_features) - else: - loaded = np.load(npz_path.replace('.npz', '_pdf.npz')) - predefined_features = loaded['x4'] - var_partition_features = np.concatenate([var_partition_features, np.concatenate([predefined_features, np.zeros((MAX_NUM_PARS-predefined_features.shape[0], predefined_features.shape[1]))], 0)], 1) - - is_aws = int('g3' in strategy_path or 'g4' in strategy_path or 'aws' in strategy_path or 'vgg_random_orca_11' in strategy_path) # comment here - print(model, 'orca' if is_aws == 0 else 'aws', strategy_path.split('/')[-3]) - features[model][is_aws][0].append(var_partition_features) - features[model][is_aws][1].append(partition_indice) - features[model][is_aws][2].append(var_num) - features[model][is_aws][3].append(label) - - for k, _ in GRAPH_ITEM_PATHS.items(): - for i1 in range(2): - for i2 in range(4): - if len(features[k][i1][i2]) > 1: - features[k][i1][i2] = np.stack(features[k][i1][i2]).astype(np.float16) - print(k, 'orca' if i1 == 0 else 'aws', features[k][i1][i2].shape) - else: - features[k][i1][i2] = None - - train_features = np.concatenate([features[model_][is_aws_][0] for model_, is_aws_ in train_patterns if features[model_][is_aws_][0] is not None], 0) - train_par_indices = np.concatenate([features[model_][is_aws_][1] for model_, is_aws_ in train_patterns if features[model_][is_aws_][1] is not None], 0) - 
train_var_nums = np.concatenate([features[model_][is_aws_][2] for model_, is_aws_ in train_patterns if features[model_][is_aws_][2] is not None], 0) - train_labels = np.concatenate([features[model_][is_aws_][3] for model_, is_aws_ in train_patterns if features[model_][is_aws_][3] is not None], 0) - - if type(valid_patterns) == str and valid_patterns == 'same': - permt = np.random.permutation(train_features.shape[0]) - split = int(len(permt) * 0.8) - val_features, val_par_indices, val_var_nums, val_labels = train_features[permt[split:]], train_par_indices[permt[split:]], train_var_nums[permt[split:]], train_labels[permt[split:]] - train_features, train_par_indices, train_var_nums, train_labels = train_features[permt[:split]], train_par_indices[permt[:split]], train_var_nums[permt[:split]], train_labels[permt[:split]] - else: - val_features = np.concatenate([features[model_][is_aws_][0] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][0] is not None], 0) - val_par_indices = np.concatenate([features[model_][is_aws_][1] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][1] is not None], 0) - val_var_nums = np.concatenate([features[model_][is_aws_][2] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][2] is not None], 0) - val_labels = np.concatenate([features[model_][is_aws_][3] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][3] is not None], 0) - - # comment here - permt = np.random.permutation(val_features.shape[0]) - split = int(len(permt) * 0.7) - train_features, train_par_indices, train_var_nums, train_labels = np.concatenate([train_features, val_features[permt[:split]]], 0), np.concatenate([train_par_indices, val_par_indices[permt[:split]]], 0), np.concatenate([train_var_nums, val_var_nums[permt[:split]]], 0), np.concatenate([train_labels, val_labels[permt[:split]]], 0) - - val_features, val_par_indices, val_var_nums, val_labels = val_features[permt[split:]], val_par_indices[permt[split:]], val_var_nums[permt[split:]], val_labels[permt[split:]] - - print(train_features.shape, val_features.shape, train_features.max(), train_features.min(), val_features.max(), val_features.min()) - - ## train the model - trainset = TrainTensorDataset((torch.from_numpy(train_features).half().to(TORCH_DEVICE), torch.from_numpy(train_par_indices).half().to(TORCH_DEVICE), torch.from_numpy(train_var_nums).half().to(TORCH_DEVICE), torch.from_numpy(train_labels).half().to(TORCH_DEVICE))) - testset = torch.utils.data.TensorDataset(torch.from_numpy(val_features).half().to(TORCH_DEVICE), torch.from_numpy(val_par_indices).half().to(TORCH_DEVICE), torch.from_numpy(val_var_nums).half().to(TORCH_DEVICE), torch.from_numpy(val_labels).half().to(TORCH_DEVICE)) - trainloader = torch.utils.data.DataLoader(dataset=trainset, - batch_size=BATCH_SIZE, - shuffle=True) - testloader = torch.utils.data.DataLoader(dataset=testset, - batch_size=32, - shuffle=False) - best_val_acc = 0. - checkpoint_path = 'model_train_on_{}-{}_new.ckpt'.format(train_patterns[0][0], 'orca' if train_patterns[0][1] == 0 else 'aws') - for epoch in range(num_epochs): - if epoch == int(num_epochs*2./5. - 1): - for param_group in self.optimizer.param_groups: param_group['lr'] = 3e-4 - if epoch == int(num_epochs*4./5. 
- 1): - for param_group in self.optimizer.param_groups: param_group['lr'] = 1e-4 - - labels = [] - outputs = [] - for i, (features_b, par_indices_b, var_nums_b, labels_b) in enumerate(trainloader): - - # Forward pass - outputs_b = self.model(features_b, par_indices_b, var_nums_b).squeeze() - - true_comp = (labels_b[:, None] > labels_b[None, :]).float() * 2 - 1 - pred_comp = outputs_b[:, None] - outputs_b[None, :] - loss = (1 - true_comp) * pred_comp / 2 + torch.nn.functional.softplus(-pred_comp) - loss = loss.tril(-1).mean() - - # Backward and optimize - self.optimizer.zero_grad() - loss.backward() - torch.nn.utils.clip_grad_norm_(self.model.stem_rnn.parameters(), 0.25) - self.optimizer.step() - - outputs.append(outputs_b) - labels.append(labels_b) - - labels = torch.cat(labels) - outputs = torch.cat(outputs) - true_comp = (labels[:, None] > labels[None, :]) - pred_comp = (outputs[:, None] > outputs[None, :]) - equal = (true_comp == pred_comp).int() - train_acc = equal.tril(-1).sum() * 2. /float(equal.shape[0])/(float(equal.shape[0]) - 1) - - with torch.no_grad(): - labels = [] - outputs = [] - for features_b, par_indices_b, var_nums_b, labels_b in testloader: - - # Forward pass - outputs_b = self.model(features_b, par_indices_b, var_nums_b).squeeze() - outputs.append(outputs_b) - labels.append(labels_b) - - labels = torch.cat(labels) - outputs = torch.cat(outputs) - true_comp = (labels[:, None] > labels[None, :]) - pred_comp = (outputs[:, None] > outputs[None, :]) - equal = (true_comp == pred_comp).int() - acc = equal.tril(-1).sum() * 2. /float(equal.shape[0])/(float(equal.shape[0]) - 1) - if acc.item() > best_val_acc: - best_val_acc = acc.item() - torch.save(self.model.state_dict(), checkpoint_path) - print('Saved model to {}'.format(checkpoint_path)) - print('Epoch: {}, training acc: {:.4f}, test acc: {:.4f}, best acc: {:.4f}'.format(epoch, train_acc.item(), acc.item(), best_val_acc)) - return checkpoint_path - - -if __name__ == '__main__': - - trainer = RankNetTrainer() - checkpoint_path = trainer.train( - [ - # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-11-20', - # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-9-20', - # '/users/hzhang2/oceanus_cost_model_training_data/ncf', - # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-13-20', - # '/users/hzhang2/oceanus_cost_model_training_data/densenet', - # '/users/hzhang2/oceanus_cost_model_training_data/inceptionv3', - # '/users/hzhang2/oceanus_cost_model_training_data/resnet101', - # '/users/hzhang2/oceanus_cost_model_training_data/resnet50', - '/users/hzhang2/oceanus_cost_model_training_data/vgg16', - # '/users/hzhang2/oceanus_cost_model_training_data/bert', - ], - [ - # ('ncf', 0), #('ncf', 1), - # ('densenet121', 0), ('densenet121', 1), - # ('inceptionv3', 0), ('inceptionv3', 1), - # ('resnet101', 0), ('resnet101', 1), - # ('resnet50', 0), ('resnet50', 1), - # ('bert_12l', 0), ('bert_12l', 1), - # ('bert_6l', 0), ('bert_6l', 1), - # ('bert_3l', 0), ('bert_3l', 1), - # ('bert_large', 0), ('bert_large', 1), - ('vgg16', 0), #('vgg16', 1), - ], - [('vgg16', 1)], - num_epochs=200) - # checkpoint_path = 'model_train_on_vgg16-orca.ckpt' - test_list = [ - '/users/hzhang2/oceanus_cost_model_training_data/vgg16/vgg16_orca_15', - '/users/hzhang2/oceanus_cost_model_training_data/vgg16/vgg_random_orca_11', #TARGET: 0.9 - # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-13-20/ncf_large_adam_random_search_aws_4', - # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_12_orca_16', - # 
'/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16', - # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_3', - # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_4', - # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_5', - ] - - for data_folder in test_list: - simulator = RankRNNSimulator(GRAPH_ITEM_PATHS[get_model(data_folder)], - batch_size=256, - seq_len=1, - checkpoint=checkpoint_path) - - runtimes_folder = os.path.join(data_folder, 'runtimes') - results = {} - averages= [] - scores = [] - for name in os.listdir(runtimes_folder): - strategy_path = os.path.join(data_folder, 'strategies', name) - rs_path = os.path.join(data_folder, 'resource_specs', name ) - if not os.path.isfile(rs_path): - rs_path += '.yml' - runtime_path = os.path.join(runtimes_folder, name) - - with open(runtime_path, 'r') as f: - runtimes = json.load(f) - average = np.array(runtimes['average']) - - s = Strategy.deserialize(strategy_path) - rs = ResourceSpec(resource_file=rs_path) - score = simulator.simulate(s, rs, strategy_path) - - results[name] = (average, score) - averages.append(average) - scores.append(score) - - # sorted_by_runtime = {k: v for k, v in sorted(results.items(), key=lambda item: item[1][0])} - # # sorted_by_scores = {k: v for k, v in sorted(res.items(), key=lambda item: item[1][1])} - # # sorted_by_latency = {k: v for k, v in sorted(res.items(), key=lambda item: item[1][2])} - # print('Sorted by runtime.......................') - # for _, (rt, prediction) in sorted_by_runtime.items(): - # print('runtime {} prediction {}'.format(rt, prediction)) - - y_train = np.array(averages) - test_score = np.array(scores) - true_comp = (y_train.ravel()[:, None] > y_train.ravel()[None, :]) - pred_comp = (test_score.ravel()[:, None] > test_score.ravel()[None, :]) - equal = (true_comp == pred_comp).astype(np.int) - test_acc = np.tril(equal, -1).sum() * 2. 
/ float(equal.shape[0]) / (float(equal.shape[0]) - 1) - - print('Test {} on {}, acc {:.4f}'.format(checkpoint_path, data_folder.split('/')[-1], test_acc)) diff --git a/autodist/simulator/models/rankrnn_simulator_penalty.py b/autodist/simulator/models/rankrnn_simulator_penalty.py deleted file mode 100644 index 380fa10..0000000 --- a/autodist/simulator/models/rankrnn_simulator_penalty.py +++ /dev/null @@ -1,729 +0,0 @@ -"""Strategy RankNetSimulator.""" -import glob -import json -import sys -from datetime import datetime -from pathlib import Path -from string import digits - -import numpy as np -import os -import tensorflow as tf -tf.compat.v1.disable_eager_execution() - -import arion -from arion.graph_item import GraphItem -from arion.proto.synchronizers_pb2 import PSSynchronizer, AllReduceSynchronizer -from arion.simulator.models.base import SimulatorBase -from arion.simulator.utils import get_dense_var_bits, get_sparse_var_bits, GIGABITS -from arion.simulator.utils import _resolve_device_address, _max_num_local_replica, _num_local_replica -from arion.strategy.random_sample_strategy import VariableHelper, PartHelper -from arion.strategy.base import Strategy -from arion.resource_spec import ResourceSpec -from arion.cluster import SSHCluster -from arion.kernel.device.resolver import DeviceResolver -from arion.kernel.partitioner import PartitionerConfig -from arion.simulator.models.predefined_simulator import PredefinedSimulator - -import torch -import torch.nn as nn - -TORCH_DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - -# feature settings -MAX_NUM_WORKERS = 16 -MAX_NUM_GROUPS = 600 -MAX_NUM_VARS = 500 -MAX_NUM_PARS = 1500 -FEATURE_SIZE = MAX_NUM_WORKERS+MAX_NUM_GROUPS+15 - -# model size -PARTITION_MLP_HIDDEN = 128 -PARTITION_MLP_OUT = 32 -STEM_RNN_HIDDEN = 128 -BIDIECTIONAL = True -BATCH_SIZE = 96 - -NUM_RNN_LAYERS = 3 -SCORE_TH = 0.005 -LR = 2e-3 -WD = 3e-4 -DATA_AUG = False -IN_LAYERS = 2 -OUT_LAYERS = 1 - -# ncf used: -# ~/projects/pycharm/zhijie/5-9-2020/oceanus-zhijie/arion/simulator/models/model_train_on_ncf-orca_new.ckpt 0.9020 -# noaug -# PARTITION_MLP_HIDDEN = 128 -# PARTITION_MLP_OUT = 32 -# STEM_RNN_HIDDEN = 128 -# BIDIECTIONAL = True -# NUM_RNN_LAYERS = 4 -# BATCH_SIZE = 64 -# LR = 1e-3 -# WD = 4e-4 - -# vgg used: -# ~/projects/pycharm/zhijie/5-9-2020/oceanus-zhijie/arion/simulator/models/model_train_on_vgg16-orca_new_new_new.ckpt 0.8374 -# noaug -# PARTITION_MLP_HIDDEN = 128 -# PARTITION_MLP_OUT = 32 -# STEM_RNN_HIDDEN = 128 -# BIDIECTIONAL = True -# NUM_RNN_LAYERS = 3 -# BATCH_SIZE = 64 -# LR = 1e-3 -# WD = 3e-4 - -GRAPH_ITEM_PATHS = {'ncf':'/users/hzhang2/projects/pycharm/zhijie/5-5-2020/original_graph_item', - 'densenet121': '/users/hzhang2/projects/pycharm/zhijie/graph_items/densenet121_original_graph_item', - 'inceptionv3': '/users/hzhang2/projects/pycharm/zhijie/graph_items/inceptionv3_original_graph_item', - 'resnet101': '/users/hzhang2/projects/pycharm/zhijie/graph_items/resnet101_original_graph_item', - 'resnet50': '/users/hzhang2/projects/pycharm/zhijie/graph_items/resnet50_original_graph_item', - 'vgg16': '/users/hzhang2/projects/pycharm/zhijie/graph_items/vgg16_original_graph_item', - 'bert_12l': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_12l', - 'bert_6l': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_6l', - 'bert_3l': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_3l', - 'bert_large': 
'/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_large'} - -def get_model(path_): - if 'densenet121' in path_: - return 'densenet121' - elif 'ncf' in path_: - return 'ncf' - elif 'inceptionv3' in path_: - return 'inceptionv3' - elif 'resnet101' in path_: - return 'resnet101' - elif 'resnet50' in path_: - return 'resnet50' - elif 'vgg16' in path_: - return 'vgg16' - elif 'bert' in path_ and '12l' in path_: - return 'bert_12l' - elif 'bert' in path_ and '6l' in path_: - return 'bert_6l' - elif 'bert' in path_ and '3l' in path_: - return 'bert_3l' - elif 'bert' in path_ and 'large' in path_: - return 'bert_large' - else: - return None - -class RankRNN(nn.Module): - def __init__(self, input_size=FEATURE_SIZE, - partition_mlp_hidden=PARTITION_MLP_HIDDEN, - partition_mlp_out=PARTITION_MLP_OUT, - stem_rnn_hidden=STEM_RNN_HIDDEN, - num_rnn_layers=NUM_RNN_LAYERS, - in_layers=IN_LAYERS, - out_layers=OUT_LAYERS, - bidirectional=BIDIECTIONAL): - super(RankRNN, self).__init__() - self.partition_mlp_out = partition_mlp_out - # self.num_rnn_layers = num_rnn_layers - self.stem_rnn_hidden = stem_rnn_hidden - tmp = [nn.Linear(input_size, partition_mlp_hidden)] - for _ in range(in_layers-2): - tmp.append(nn.ReLU()) - tmp.append(nn.Linear(partition_mlp_hidden, partition_mlp_hidden)) - tmp.append(nn.ReLU()) - tmp.append(nn.Linear(partition_mlp_hidden, partition_mlp_out)) - - self.partition_mlp = nn.Sequential(*tmp) - - self.stem_rnn = nn.LSTM(partition_mlp_out, stem_rnn_hidden, num_rnn_layers, batch_first=True, bidirectional=bidirectional) - - if out_layers == 1: - self.final_fc = nn.Linear(stem_rnn_hidden*num_rnn_layers*(1+int(bidirectional)), 1) - elif out_layers == 2: - self.final_fc = nn.Sequential(nn.Linear(stem_rnn_hidden*num_rnn_layers*(1+int(bidirectional)), 128), - nn.ReLU(), - nn.Linear(128, 1)) - - self.relu = nn.ReLU() - - def forward(self, features, par_indices, var_nums, return_feature=False): - - x = features.float() - # x = torch.cat([features[:, :, :MAX_NUM_WORKERS], features[:, :, MAX_NUM_WORKERS+MAX_NUM_GROUPS:]], 2).float() - x = self.partition_mlp(x) - - x1 = torch.zeros(features.shape[0], MAX_NUM_VARS, self.partition_mlp_out, device=TORCH_DEVICE, dtype=x.dtype) - x1.scatter_add_(1, par_indices.long()[:, :, None].expand(par_indices.shape[0], par_indices.shape[1], self.partition_mlp_out), x) - - # Set initial hidden and cell states - # h0 = torch.zeros(self.num_rnn_layers, x.size(0), self.stem_rnn_hidden).to(TORCH_DEVICE) - # c0 = torch.zeros(self.num_rnn_layers, x.size(0), self.stem_rnn_hidden).to(TORCH_DEVICE) - - # Forward propagate LSTM - x1 = torch.nn.utils.rnn.pack_padded_sequence(x1, var_nums.long(), batch_first=True, enforce_sorted=False) - out, (ht, ct) = self.stem_rnn(x1) # out: tensor of shape (batch_size, seq_length, hidden_size) - - # out = torch.nn.utils.rnn.pad_packed_sequence(out, batch_first=True)[0].sum(1) / var_nums[:, None] - out = ht.permute(1, 0, 2).reshape(x.shape[0], -1) - # print(out[0, var_nums[0] -1, [3]], out[0, var_nums[0], [3]]) - # print(ht.permute(1, 0, 2).shape, x.shape) - if return_feature: - return self.final_fc(out), out.div((out**2).sum(1, keepdim=True).sqrt()) - else: - return self.final_fc(out) - -class TrainTensorDataset(torch.utils.data.Dataset): - """TensorDataset with support of transforms. 
- """ - def __init__(self, tensors): - assert all(tensors[0].size(0) == tensor.size(0) for tensor in tensors) - self.tensors = tensors - - def __getitem__(self, index): - x = self.tensors[0][index] - x = self.perturbe_device_and_group(x) - x1 = self.tensors[1][index] - x2 = self.tensors[2][index] - - y = self.tensors[3][index] - - return x, x1, x2, y - - def __len__(self): - return self.tensors[0].size(0) - - def perturbe_device_and_group(self, x): - if DATA_AUG: - perturbed_device_ids = np.random.permutation(MAX_NUM_WORKERS).astype(np.int32) - perturbed_group_ids = np.random.permutation(MAX_NUM_GROUPS).astype(np.int32) - mat_device = torch.eye(MAX_NUM_WORKERS, device=x.device, dtype=x.dtype)[perturbed_device_ids] - mat_group = torch.eye(MAX_NUM_GROUPS, device=x.device, dtype=x.dtype)[perturbed_group_ids] - x = torch.cat([torch.matmul(x[:, :MAX_NUM_WORKERS], mat_device), torch.matmul(x[:, MAX_NUM_WORKERS:MAX_NUM_WORKERS+MAX_NUM_GROUPS], mat_group), x[:, MAX_NUM_WORKERS+MAX_NUM_GROUPS:]], 1) - return x - - -def to_numpy(synchronizer, device, size_ratio, is_sparse, bd, num_replicas): - ret = [np.zeros(MAX_NUM_WORKERS), np.zeros(MAX_NUM_GROUPS), np.zeros(3), np.zeros(5), np.zeros(3)] - - if device is not None: - ret[0][device] = 1 - - group = getattr(synchronizer, 'group', None) - if group is not None: - assert group < MAX_NUM_GROUPS, group - ret[1][group] = 1 - - compressor = getattr(synchronizer, 'compressor', None) - if compressor is not None: - if compressor in ["PowerSGDCompressor", 3]: - ret[2][2] = 1 - elif compressor in ["HorovodCompressorEF", "HorovodCompressor", 2, 1]: - ret[2][1] = 1 - elif compressor in ["NoneCompressor", 0]: - ret[2][0] = 1 - else: - raise ValueError('Compressor does not exist: {}'.format(compressor)) - - local_replication = getattr(synchronizer, 'local_replication', None) - if isinstance(synchronizer, PSSynchronizer): - synchronizer = 0 - if int(local_replication) == 0: - if int(is_sparse) == 0: - ret[3][0] = 1 - else: - ret[3][1] = 1 - else: - if int(is_sparse) == 0: - ret[3][2] = 1 - else: - ret[3][3] = 1 - else: - ret[3][4] = 1 - ret[4] = np.array([size_ratio, bd, num_replicas]) - - return np.concatenate(ret) - -def connvert_feature(strategy, resource_spec, graph_item): - - cluster = SSHCluster(resource_spec) - device_resolver = DeviceResolver(cluster) - graph_replicas = [_resolve_device_address(k, device_resolver) for k, v in resource_spec.gpu_devices] - # bandwidth - network_bandwidth = np.array([resource_spec.network_bandwidth[device.split(':')[0]] for device, _ in resource_spec.cpu_devices]) - network_bandwidth = network_bandwidth - min_network_bandwidth = network_bandwidth.min() - # Other information - cpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.cpu_devices] - gpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.gpu_devices] - max_num_local_replica = _max_num_local_replica(graph_replicas, cluster) - total_num_local_replica = len(graph_replicas) - worker_num_replicas = [_num_local_replica(cpu_worker, graph_replicas, cluster) for cpu_worker in cpu_worker_list] - - num_vars = 0 - total_size_vars = 0 - for var_op, var in graph_item.trainable_var_op_to_var.items(): - num_vars += 1 - if var.initial_value.shape.ndims: - var_helper = VariableHelper(var, graph_item) - if var_helper.is_sparse: - total_size_vars += get_sparse_var_bits(np.prod(var_helper.shape)) - else: - total_size_vars += get_dense_var_bits(np.prod(var_helper.shape), var.dtype) - assert num_vars < 
MAX_NUM_VARS, num_vars - var_partition_features = np.zeros((MAX_NUM_PARS, FEATURE_SIZE-4)).astype(np.float32) - partition_indice = np.ones(MAX_NUM_PARS).astype(np.float32) * (MAX_NUM_VARS - 1) - - cnt = 0 - for node_id, node in enumerate(strategy.node_config): - var_name = node.var_name - for var_op, var in graph_item.trainable_var_op_to_var.items(): - if var.name == var_name: - break - var_helper = VariableHelper(var, graph_item) - - if node.partitioner: - pc = PartitionerConfig(partition_str=node.partitioner) - for i, part in enumerate(node.part_config): - part_helper = PartHelper(i, var, pc) - synchronizer = getattr(part, part.WhichOneof('synchronizer')) - reduction_destination = getattr(synchronizer, 'reduction_destination', None) - device = _resolve_device_address(reduction_destination if reduction_destination else var.device, - device_resolver) - if device == '': - assert(isinstance(synchronizer, AllReduceSynchronizer)) - device = None - bd = min_network_bandwidth - num_replicas = 0 - else: - device = cpu_worker_list.index(device) - bd = network_bandwidth[device] - num_replicas = worker_num_replicas[device] - - if var_helper.is_sparse: - size_ratio = get_sparse_var_bits(np.prod(part_helper.shape))/total_size_vars - else: - size_ratio = get_dense_var_bits(np.prod(part_helper.shape), var_helper.dtype)/total_size_vars - var_partition_features[cnt] = to_numpy(synchronizer, device, size_ratio, var_helper.is_sparse, bd, num_replicas) - partition_indice[cnt] = node_id - cnt += 1 - else: - synchronizer = getattr(node, node.WhichOneof('synchronizer')) - reduction_destination = getattr(synchronizer, 'reduction_destination', None) - device = _resolve_device_address(reduction_destination if reduction_destination else var.device, - device_resolver) - if device == '': - assert(isinstance(synchronizer, AllReduceSynchronizer)) - device = None - bd = min_network_bandwidth - num_replicas = 0 - else: - device = cpu_worker_list.index(device) - bd = network_bandwidth[device] - num_replicas = worker_num_replicas[device] - - if var_helper.is_sparse: - size_ratio = get_sparse_var_bits(np.prod(var_helper.shape))/total_size_vars - else: - size_ratio = get_dense_var_bits(np.prod(var_helper.shape), var_helper.dtype)/total_size_vars - var_partition_features[cnt] = to_numpy(synchronizer, device, size_ratio, var_helper.is_sparse, bd, num_replicas) - partition_indice[cnt] = node_id - cnt += 1 - return var_partition_features, partition_indice, np.array(node_id+1) - -def create_predefined_features(strategy, resource_spec, predefined_simulator): - - var_sync_time, vars, resource = predefined_simulator.predefined_sync_time(strategy, resource_spec) - - features = [] - for var_name, sync_time in var_sync_time.items(): - if isinstance(sync_time, list) or isinstance(sync_time, tuple): # send_time, receive_time in PS strategies. 
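# A minimal illustrative sketch (toy values; the helper name _to_feature_row is hypothetical)
# of the flattening performed below: a PS entry arrives as a (send_time, receive_time) pair of
# dicts whose 'transmission' terms are summed, an AllReduce entry is a single dict, and each
# becomes one row [transmission, network_overhead, gpu_kernel_memory_latency, is_ps].
#
#     def _to_feature_row(entry):
#         if isinstance(entry, (list, tuple)):   # PS: sum send + receive transmission
#             transmission = entry[0]['transmission'] + entry[1]['transmission']
#             head, is_ps = entry[0], True
#         else:                                  # AllReduce: single dict
#             transmission, head, is_ps = entry['transmission'], entry, False
#         return [transmission, head['network_overhead'],
#                 head['gpu_kernel_memory_latency'], float(is_ps)]
#
#     ps_entry = ({'transmission': 0.8, 'network_overhead': 4, 'gpu_kernel_memory_latency': 2},
#                 {'transmission': 0.5, 'network_overhead': 4, 'gpu_kernel_memory_latency': 2})
#     ar_entry = {'transmission': 0.3, 'network_overhead': 1, 'gpu_kernel_memory_latency': 2}
#     _to_feature_row(ps_entry)   # -> [1.3, 4, 2, 1.0]
#     _to_feature_row(ar_entry)   # -> [0.3, 1, 2, 0.0]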
- transmission = sync_time[0]['transmission'] + sync_time[1]['transmission'] - sync_time = sync_time[0] - is_ps = True - else: # AR - transmission = sync_time['transmission'] - is_ps = False - - network_overhead = sync_time['network_overhead'] - gpu_kernel_memory_latency = sync_time['gpu_kernel_memory_latency'] - - feat = [transmission, network_overhead, gpu_kernel_memory_latency, float(is_ps)] - features.append(feat) - features = np.array(features, dtype=np.float) - return features - -class RankRNNSimulatorPenalty(SimulatorBase): - """Simulates strategies for a given graph and resource spec.""" - - def __init__(self, - original_graph_item_path, - num_rnn_layers, - in_layers, - out_layers, - fetches=None, - batch_size=1, - seq_len=1, - checkpoint=None): - - super(RankRNNSimulatorPenalty, self).__init__(original_graph_item_path=original_graph_item_path) - print("It's using RankNet simulator.") - self._fetches = fetches - self._batch_size_per_gpu = batch_size - self._seq_len = seq_len - self._checkpoint = checkpoint - self._predefined_simulator=PredefinedSimulator(original_graph_item_path=original_graph_item_path, - batch_size=self._batch_size_per_gpu, - seq_len=self._seq_len) - if self._checkpoint: - self._model = RankRNN(num_rnn_layers=num_rnn_layers, in_layers=in_layers, out_layers=out_layers).to(TORCH_DEVICE) - self._model.load_state_dict(torch.load(self._checkpoint)) - - def simulate(self, strategy, resource_spec, strategy_path=None, checkpoint=None): - score, feature = self.predict(strategy, resource_spec, strategy_path, checkpoint) - return score.view(-1).data.cpu().numpy(), feature.view(-1).data.cpu().numpy() - - - def predict(self, - strategy, - resource_spec, - strategy_path=None, - checkpoint=None): - if checkpoint is None: - if self._checkpoint is None: - raise ValueError("checkpoint is None: {}".format(checkpoint)) - else: - model = self._model - else: - model = RankRNN().to(TORCH_DEVICE) - model.load_state_dict(torch.load(checkpoint)) - if strategy_path and os.path.isfile((strategy_path+'.npz').replace('strategies', 'npz')): - loaded = np.load((strategy_path+'.npz').replace('strategies', 'npz')) - var_partition_features, partition_indice, var_num, _ = \ - loaded['x1'], loaded['x2'], loaded['x3'], loaded['y'] - else: - var_partition_features, partition_indice, var_num = \ - connvert_feature(strategy, resource_spec, self._original_graph_item) - - if strategy_path and os.path.isfile((strategy_path+'_pdf.npz').replace('strategies', 'npz')): - loaded = np.load((strategy_path+'_pdf.npz').replace('strategies', 'npz')) - predefined_features = loaded['x4'] - else: - predefined_features = create_predefined_features(strategy, resource_spec, self._predefined_simulator) - - var_partition_features = np.concatenate([var_partition_features, np.concatenate([predefined_features, np.zeros((MAX_NUM_PARS-predefined_features.shape[0], predefined_features.shape[1]))], 0)], 1) - - var_partition_features = torch.from_numpy(var_partition_features).unsqueeze(0).to(TORCH_DEVICE) - partition_indice = torch.from_numpy(partition_indice).unsqueeze(0).to(TORCH_DEVICE) - var_num = torch.from_numpy(var_num).unsqueeze(0).to(TORCH_DEVICE) - - return model(var_partition_features, partition_indice, var_num, True) - -class RankNetTrainer(): - - def __init__(self, - batch_size_per_gpu=256, - seq_len=1, - seed=1): - self._batch_size_per_gpu = batch_size_per_gpu - self._seq_len = seq_len - self.graph_items = {k:GraphItem.deserialize(v) for k, v in GRAPH_ITEM_PATHS.items()} - self.predefined_simulators = {k: 
PredefinedSimulator(original_graph_item_path=v, - batch_size=self._batch_size_per_gpu, - seq_len=self._seq_len) for k, v in GRAPH_ITEM_PATHS.items()} - self.best_acc = 0. - print("It's using RankNet trainer.") - - def load_data(self, path_list, train_patterns=[('ncf', 0)], valid_patterns='same'): - features = {k: [[[], [], [], []], [[], [], [], []]] for k, _ in GRAPH_ITEM_PATHS.items()} - for training_path in path_list: - for path in Path(training_path).rglob('strategies'): - strategy_paths = glob.glob(os.path.join(path, '*')) - # strategy_paths = np.random.permutation(list(strategy_paths)) - for strategy_path in strategy_paths: - if 'json' in strategy_path or \ - 'bert_large_batch_8_orca_16_group_2/' in strategy_path: - continue - model = get_model(strategy_path) - if model is None: - if not ('densenets169' in strategy_path or 'densenets201' in strategy_path): - assert False, strategy_path - continue - rs_path = strategy_path.replace('strategies', 'resource_specs') - runtime_path = strategy_path.replace('strategies', 'runtimes') - npz_path = (strategy_path+'.npz').replace('strategies', 'npz') - if not os.path.isfile(rs_path): - rs_path += '.yml' - if not (os.path.isfile(rs_path) and os.path.isfile(runtime_path)): - continue - if not os.path.exists(os.path.dirname(npz_path)): - os.makedirs(os.path.dirname(npz_path)) - - if not os.path.isfile(npz_path): - strategy = Strategy.deserialize(path=strategy_path) - rs = ResourceSpec(resource_file=rs_path) - var_partition_features, partition_indice, var_num = \ - connvert_feature(strategy, rs, self.graph_items[model]) - label = np.array(json.load(open(runtime_path))['average']) - np.savez_compressed(npz_path, x1=var_partition_features, x2=partition_indice, x3=var_num, y=label) - else: - loaded = np.load(npz_path) - var_partition_features, partition_indice, var_num, label = \ - loaded['x1'], loaded['x2'], loaded['x3'], loaded['y'] - - if not os.path.isfile(npz_path.replace('.npz', '_pdf.npz')): - predefined_features = create_predefined_features(Strategy.deserialize(path=strategy_path), ResourceSpec(resource_file=rs_path), self.predefined_simulators[model]) - np.savez_compressed(npz_path.replace('.npz', '_pdf.npz'), x4=predefined_features) - else: - loaded = np.load(npz_path.replace('.npz', '_pdf.npz')) - predefined_features = loaded['x4'] - var_partition_features = np.concatenate([var_partition_features, np.concatenate([predefined_features, np.zeros((MAX_NUM_PARS-predefined_features.shape[0], predefined_features.shape[1]))], 0)], 1) - - # is_aws = int('g3' in strategy_path or 'g4' in strategy_path or 'aws' in strategy_path) # comment here - is_aws = int('vgg16_orca_11_random_rejection-4_trial-100-_expolre-2000_0.83-model_embedding_sim-weight-1_max-par-40/' in strategy_path) - # print(model, 'orca' if is_aws == 0 else 'aws', strategy_path.split('/')[-3]) - features[model][is_aws][0].append(var_partition_features) - features[model][is_aws][1].append(partition_indice) - features[model][is_aws][2].append(var_num) - features[model][is_aws][3].append(label) - - for k, _ in GRAPH_ITEM_PATHS.items(): - for i1 in range(2): - for i2 in range(4): - if len(features[k][i1][i2]) > 1: - features[k][i1][i2] = np.stack(features[k][i1][i2]).astype(np.float16) - print(k, 'orca' if i1 == 0 else 'aws', features[k][i1][i2].shape) - else: - features[k][i1][i2] = None - - train_features = np.concatenate([features[model_][is_aws_][0] for model_, is_aws_ in train_patterns if features[model_][is_aws_][0] is not None], 0) - train_par_indices = 
np.concatenate([features[model_][is_aws_][1] for model_, is_aws_ in train_patterns if features[model_][is_aws_][1] is not None], 0) - train_var_nums = np.concatenate([features[model_][is_aws_][2] for model_, is_aws_ in train_patterns if features[model_][is_aws_][2] is not None], 0) - train_labels = np.concatenate([features[model_][is_aws_][3] for model_, is_aws_ in train_patterns if features[model_][is_aws_][3] is not None], 0) - - if type(valid_patterns[0]) == str and valid_patterns[0] == 'same': - rng = np.random.RandomState(1) - permt = rng.permutation(train_features.shape[0]) - split = int(len(permt) * 0.7) - val_features, val_par_indices, val_var_nums, val_labels = train_features[permt[split:]], train_par_indices[permt[split:]], train_var_nums[permt[split:]], train_labels[permt[split:]] - train_features, train_par_indices, train_var_nums, train_labels = train_features[permt[:split]], train_par_indices[permt[:split]], train_var_nums[permt[:split]], train_labels[permt[:split]] - else: - val_features = np.concatenate([features[model_][is_aws_][0] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][0] is not None], 0) - val_par_indices = np.concatenate([features[model_][is_aws_][1] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][1] is not None], 0) - val_var_nums = np.concatenate([features[model_][is_aws_][2] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][2] is not None], 0) - val_labels = np.concatenate([features[model_][is_aws_][3] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][3] is not None], 0) - - # comment here - rng = np.random.RandomState(1) - permt = rng.permutation(val_features.shape[0]) - split = int(len(permt) * 0.7) - train_features, train_par_indices, train_var_nums, train_labels = np.concatenate([train_features, val_features[permt[:split]]], 0), np.concatenate([train_par_indices, val_par_indices[permt[:split]]], 0), np.concatenate([train_var_nums, val_var_nums[permt[:split]]], 0), np.concatenate([train_labels, val_labels[permt[:split]]], 0) - - val_features, val_par_indices, val_var_nums, val_labels = val_features[permt[split:]], val_par_indices[permt[split:]], val_var_nums[permt[split:]], val_labels[permt[split:]] - label_max = max(train_labels.max(), val_labels.max()) - label_min = min(train_labels.min(), val_labels.min()) - train_labels = (train_labels-label_min)/(label_max-label_min) - val_labels = (val_labels-label_min)/(label_max-label_min) - print(train_features.shape, val_features.shape, train_features.max(), train_features.min(), val_features.max(), val_features.min(), train_labels.max(), val_labels.min()) - - ## train the model - trainset = TrainTensorDataset((torch.from_numpy(train_features).half().to(TORCH_DEVICE), torch.from_numpy(train_par_indices).half().to(TORCH_DEVICE), torch.from_numpy(train_var_nums).half().to(TORCH_DEVICE), torch.from_numpy(train_labels).half().to(TORCH_DEVICE))) - testset = torch.utils.data.TensorDataset(torch.from_numpy(val_features).half().to(TORCH_DEVICE), torch.from_numpy(val_par_indices).half().to(TORCH_DEVICE), torch.from_numpy(val_var_nums).half().to(TORCH_DEVICE), torch.from_numpy(val_labels).half().to(TORCH_DEVICE)) - self.trainloader = torch.utils.data.DataLoader(dataset=trainset, - batch_size=BATCH_SIZE, - shuffle=True) - self.testloader = torch.utils.data.DataLoader(dataset=testset, - batch_size=32, - shuffle=False) - - def train(self, name='', num_epochs=200, checkpoint=None): - - checkpoint_path = 'model_on_{}.ckpt'.format(name) - print('LSTM 
layers: ', NUM_RNN_LAYERS, 'score th: ', SCORE_TH, 'lr: ', LR, 'wd: ', WD,'use data aug: ', DATA_AUG, 'OUT_LAYERS: ', OUT_LAYERS, 'IN_LAYERS: ',IN_LAYERS) - - np.random.seed(1) - torch.manual_seed(1) - torch.cuda.manual_seed_all(1) - model = RankRNN(num_rnn_layers=NUM_RNN_LAYERS, out_layers=OUT_LAYERS, in_layers=IN_LAYERS).to(TORCH_DEVICE) - if checkpoint: - model.load_state_dict(torch.load(checkpoint)) - optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=WD) - - best_val_acc = 0. - for epoch in range(num_epochs): - if epoch == int(num_epochs*2./5. - 1): - for param_group in optimizer.param_groups: param_group['lr'] = 3e-4 - if epoch == int(num_epochs*4./5. - 1): - for param_group in optimizer.param_groups: param_group['lr'] = 1e-4 - - labels = [] - outputs = [] - for i, (features_b, par_indices_b, var_nums_b, labels_b) in enumerate(self.trainloader): - - # Forward pass - outputs_b = model(features_b, par_indices_b, var_nums_b).squeeze() - - par_cnt = (par_indices_b.int() != MAX_NUM_VARS - 1).int().sum(1) - - true_comp = ( - (labels_b[:, None]+SCORE_TH>labels_b[None,:]).int()*(par_cnt[:, None] > par_cnt[None, :]).int() - + (labels_b[:, None]-SCORE_TH>labels_b[None,:]).int()*(par_cnt[:, None] < par_cnt[None, :]).int() - + (labels_b[:, None] > labels_b[None,:]).int() * (par_cnt[:, None] == par_cnt[None, :]).int() - ) > 0 - true_comp = true_comp.float() * 2 - 1 - pred_comp = outputs_b[:, None] - outputs_b[None, :] - loss = (1 - true_comp) * pred_comp / 2 + torch.nn.functional.softplus(-pred_comp) - loss = loss.tril(-1).mean() - - # Backward and optimize - optimizer.zero_grad() - loss.backward() - torch.nn.utils.clip_grad_norm_(model.stem_rnn.parameters(), 0.25) - optimizer.step() - - outputs.append(outputs_b) - labels.append(labels_b) - - labels = torch.cat(labels) - outputs = torch.cat(outputs) - true_comp = (labels[:, None] > labels[None, :]) - pred_comp = (outputs[:, None] > outputs[None, :]) - equal = (true_comp == pred_comp).int() - train_acc = equal.tril(-1).sum() * 2. /float(equal.shape[0])/(float(equal.shape[0]) - 1) - - with torch.no_grad(): - labels = [] - outputs = [] - for features_b, par_indices_b, var_nums_b, labels_b in self.testloader: - - # Forward pass - outputs_b = model(features_b, par_indices_b, var_nums_b).squeeze() - outputs.append(outputs_b) - labels.append(labels_b) - - labels = torch.cat(labels) - outputs = torch.cat(outputs) - true_comp = (labels[:, None] > labels[None, :]) - pred_comp = (outputs[:, None] > outputs[None, :]) - equal = (true_comp == pred_comp).int() - acc = equal.tril(-1).sum() * 2. 
/float(equal.shape[0])/(float(equal.shape[0]) - 1) - if acc.item() > best_val_acc: - best_val_acc = acc.item() - if best_val_acc > self.best_acc: - print('Saved model @ acc', best_val_acc) - torch.save(model.state_dict(), checkpoint_path) - self.best_acc = best_val_acc - # print('Saved model to {}'.format(checkpoint_path)) - if epoch == num_epochs - 1: - print('Epoch: {}, training acc: {:.4f}, test acc: {:.4f}, best acc: {:.4f}, overall best acc: {:.4f}'.format(epoch, train_acc.item(), acc.item(), best_val_acc, self.best_acc)) - return checkpoint_path - - -if __name__ == '__main__': - - if True: - trainer = RankNetTrainer() - trainer.load_data([ - '/users/hzhang2/oceanus_cost_model_training_data/vgg16', - # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-11-20', - # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-9-20', - # '/users/hzhang2/oceanus_cost_model_training_data/ncf', - # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-13-20', - # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_12_orca_16', - # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16', - # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_3', - # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_4', - # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_5', - # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_random_orca_11', - # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert-aws/bert-large-aws4g4', - # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert-aws/bert_large_random_search_aws_4_ps_only', - # '/users/hzhang2/oceanus_cost_model_training_data/densenet', - # '/users/hzhang2/oceanus_cost_model_training_data/inceptionv3', - # '/users/hzhang2/oceanus_cost_model_training_data/resnet101', - # '/users/hzhang2/oceanus_cost_model_training_data/resnet50', - ], - [ - ('vgg16', 0), #('vgg16', 1), - # ('ncf', 0), #('ncf', 1), - # ('bert_large', 1), #('bert_large', 1), - # not used: - # ('densenet121', 0), ('densenet121', 1), - # ('inceptionv3', 0), ('inceptionv3', 1), - # ('resnet101', 0), ('resnet101', 1), - # ('resnet50', 0), ('resnet50', 1), - # ('bert_12l', 0), ('bert_12l', 1), - # ('bert_6l', 0), ('bert_6l', 1), - # ('bert_3l', 0), ('bert_3l', 1), - ], - [ - ('vgg16', 1), - # ('ncf', 1), - # ('bert_large', 1), - # 'same', - ], - ) - - for p2 in [0.01, 0.03]: - for p3 in [1e-3, 3e-3, 1e-4, 3e-4, 5e-3]: - for p4 in [1e-3, 2e-3, 1e-4, 3e-4, 5e-4, 5e-5]: - for p1 in [3, 4, 2]: - for p5 in [2, 3]: - for p6 in [1, 2]: - NUM_RNN_LAYERS, SCORE_TH, LR, WD, IN_LAYERS, OUT_LAYERS = p1, p2, p3, p4, p5, p6 - checkpoint_path = trainer.train(name='vgg-orca-validon-0.83-sim1', num_epochs=200) - exit() - else: - checkpoint_path = '/users/hzhang2/projects/pycharm/zhijie/5-9-2020/oceanus-zhijie/arion/simulator/models/model_on_vgg-orca.ckpt' - test_list = [ - # '/users/hzhang2/oceanus_cost_model_training_data/vgg16/vgg16_orca_15', - # '/users/hzhang2/oceanus_cost_model_training_data/vgg16/vgg_random_orca_11', #TARGET: 0.9 - '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-13-20/ncf_large_adam_random_search_aws_4', - # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_12_orca_16', - # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16', - # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_3', - # 
'/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_4', - # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_5', - ] - - for data_folder in test_list: - simulator = RankRNNSimulator(GRAPH_ITEM_PATHS[get_model(data_folder)], - num_rnn_layers=3, - batch_size=256, - seq_len=1, - checkpoint=checkpoint_path) - - runtimes_folder = os.path.join(data_folder, 'runtimes') - results = {} - averages= [] - scores = [] - for name in os.listdir(runtimes_folder): - strategy_path = os.path.join(data_folder, 'strategies', name) - rs_path = os.path.join(data_folder, 'resource_specs', name ) - if not os.path.isfile(rs_path): - rs_path += '.yml' - runtime_path = os.path.join(runtimes_folder, name) - - with open(runtime_path, 'r') as f: - runtimes = json.load(f) - average = np.array(runtimes['average']) - - s = Strategy.deserialize(strategy_path) - rs = ResourceSpec(resource_file=rs_path) - score = simulator.simulate(s, rs, strategy_path) - - results[name] = (average, score) - averages.append(average) - scores.append(score) - - # sorted_by_runtime = {k: v for k, v in sorted(results.items(), key=lambda item: item[1][0])} - # # sorted_by_scores = {k: v for k, v in sorted(res.items(), key=lambda item: item[1][1])} - # # sorted_by_latency = {k: v for k, v in sorted(res.items(), key=lambda item: item[1][2])} - # print('Sorted by runtime.......................') - # for _, (rt, prediction) in sorted_by_runtime.items(): - # print('runtime {} prediction {}'.format(rt, prediction)) - - y_train = np.array(averages) - test_score = np.array(scores) - true_comp = (y_train.ravel()[:, None] > y_train.ravel()[None, :]) - pred_comp = (test_score.ravel()[:, None] > test_score.ravel()[None, :]) - equal = (true_comp == pred_comp).astype(np.int) - test_acc = np.tril(equal, -1).sum() * 2. / float(equal.shape[0]) / (float(equal.shape[0]) - 1) - - print('Test {} on {}, acc {:.4f}'.format(checkpoint_path, data_folder.split('/')[-1], test_acc)) diff --git a/autodist/simulator/predefined_simulator.py b/autodist/simulator/predefined_simulator.py new file mode 100644 index 0000000..91519c6 --- /dev/null +++ b/autodist/simulator/predefined_simulator.py @@ -0,0 +1,374 @@ +"""Strategy Simulator.""" + +import numpy as np +import json +import pickle as pkl + +import tensorflow as tf +from tensorflow.python.eager import context + +from arion.strategy.base import Strategy +from arion.resource_spec import ResourceSpec +from arion.proto.synchronizers_pb2 import PSSynchronizer, AllReduceSynchronizer +from arion.simulator.models.base import SimulatorBase +from arion.simulator.utils import _resolve_device_address, _resolved_devices_on_diff_machine, \ + get_dense_var_bits, get_sparse_var_bits + +class PredefinedSimulator(SimulatorBase): + """Simulates strategies for a given graph and resource spec.""" + + def __init__(self, + original_graph_item_path, + fetches=None, + batch_size=1, + seq_len=1, + get_coef=True, + checkpoint=None): + + super(PredefinedSimulator, self).__init__(original_graph_item_path=original_graph_item_path) + + print("It's using predefined simulator. 
batch_size_per_gpu is {}".format(batch_size)) + self._fetches = fetches + self._batch_size_per_gpu = batch_size + self._seq_len = seq_len + self._get_coef = get_coef + self._checkpoint = checkpoint + self._weights = None + with context.eager_mode(): + if self._checkpoint: + self._weights = self.load_checkpoint(self._checkpoint) + + def simulate(self, strategy: Strategy, resource_spec: ResourceSpec, checkpoint=None): + """Return simulated runtime value.""" + inputs = self.create_features(strategy, resource_spec) + with context.eager_mode(): + cost = self.inference(inputs, checkpoint) + return cost + + def inference(self, inputs, checkpoint=None): + if checkpoint is not None: + weights = self.load_checkpoint(checkpoint) + elif self._weights is not None: + weights = self._weights + else: + raise ValueError("No checkpoint provided in either initialization or inference.") + + if not isinstance(inputs, tf.Tensor): + inputs = tf.reshape(tf.convert_to_tensor(inputs), [1, len(inputs)]) + + if len(weights) == 4: + W0, b0, W, b = weights + inputs = tf.nn.elu(tf.matmul(inputs, W0) + b0) + cost = tf.matmul(inputs, W) + b + elif len(weights) == 2: + W, b = weights + cost = tf.matmul(inputs, W) + b + else: + raise ValueError + return cost + + def load_checkpoint(self, checkpoint=None): + if checkpoint is None: + if self._checkpoint is not None: + checkpoint = self._checkpoint + else: + raise ValueError("checkpoint is None: {}".format(checkpoint)) + self._weights = pkl.load(open(checkpoint, 'rb')) + # self._weights = json.load(open(checkpoint, 'r')) + print("Load checkpoint: ") + print(self._weights) + return self._weights + + def save_checkpoint(self, model, checkpoint): + pkl.dump(model, open(checkpoint, 'wb')) + self._checkpoint = checkpoint + self._weights = model + + def create_features_v0(self, strategy: Strategy, resource_spec: ResourceSpec): + var_sync_time, vars, resource = self.predefined_sync_time(strategy, resource_spec) + + # Add up sync time per device to find the slowest server time. 
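# Worked toy example (made-up numbers) of the aggregation below: suppose two PS servers end up
# with accumulated {transmission, network_overhead, gpu_kernel_memory_latency} totals of
# (1.2, 8, 4) and (0.6, 8, 4), and the AllReduce variables sum to (0.9, 3, 12). Then
#     max over PS servers -> [1.2, 8, 4]
#     sum over PS servers -> [1.8, 16, 8]
#     sum over AR vars    -> [0.9, 3, 12]
# and the returned feature vector is [1.2, 8, 4, 1.8, 16, 8, 0.9, 3, 12], which the linear or
# small-MLP weights loaded in inference() then map to a scalar runtime estimate.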
+ feature_keys = ['transmission', 'network_overhead', 'gpu_kernel_memory_latency'] + device_ps_sync_time = {} + var_ar_sync_time = {} + for var_name, sync_time in var_sync_time.items(): + if isinstance(vars[var_name].synchronizer, PSSynchronizer): + device = vars[var_name].device + if device not in device_ps_sync_time: + device_ps_sync_time[device] = {key: 0.0 for key in feature_keys} + for key in feature_keys: + device_ps_sync_time[device][key] += sync_time[0][key] + sync_time[1][key] + + else: # AllReduce + if var_name not in var_ar_sync_time: + var_ar_sync_time[var_name] = {key: 0.0 for key in feature_keys} + for key in feature_keys: + var_ar_sync_time[var_name][key] += sync_time[key] + + max_device_ps_sync_time = {key: 0.0 for key in feature_keys} + sum_device_ps_sync_time = {key: 0.0 for key in feature_keys} + sum_var_ar_sync_time = {key: 0.0 for key in feature_keys} + for key in feature_keys: + max_device_ps_sync_time[key] = max([d[key] for d in device_ps_sync_time.values()] or [0.0]) + sum_device_ps_sync_time[key] = sum([d[key] for d in device_ps_sync_time.values()] or [0.0]) + sum_var_ar_sync_time[key] = sum([d[key] for d in var_ar_sync_time.values()] or [0.0]) + + feat = [max_device_ps_sync_time[key] for key in feature_keys] \ + + [sum_device_ps_sync_time[key] for key in feature_keys] \ + + [sum_var_ar_sync_time[key] for key in feature_keys] + + return feat + + def create_features(self, strategy: Strategy, resource_spec: ResourceSpec): + # var_sync_time, vars, resource = self.predefined_sync_time(strategy, resource_spec) + + vars, resource = self.extract_pre_feature(strategy=strategy, resource_spec=resource_spec) + + feature_keys = ['transmission', 'network_overhead', 'gpu_kernel_memory_latency'] + device_ps_sync_time = {} + group_ar_sync_time = {} + + for var_name, var in vars.items(): + if isinstance(var.synchronizer, PSSynchronizer): + sync_time = self.var_ps_time(var, resource) + device = vars[var_name].device + if device not in device_ps_sync_time: + device_ps_sync_time[device] = {key: 0.0 for key in feature_keys} + for key in feature_keys: + device_ps_sync_time[device][key] += sync_time[0][key] + sync_time[1][key] + elif isinstance(var.synchronizer, AllReduceSynchronizer): + sync_time = self.var_ar_time(var, resource) + var_group = sync_time['group'] + if var_group not in group_ar_sync_time: + group_ar_sync_time[var_group] = {key: 0.0 for key in feature_keys} + for key in feature_keys: + group_ar_sync_time[var_group][key] += sync_time[key] + else: + raise ValueError('{}'.format(type(var.synchronizer))) + + max_device_ps_sync_time = {key: 0.0 for key in feature_keys} + sum_device_ps_sync_time = {key: 0.0 for key in feature_keys} + max_group_ar_sync_time = {key: 0.0 for key in feature_keys} + sum_group_ar_sync_time = {key: 0.0 for key in feature_keys} + for key in feature_keys: + max_device_ps_sync_time[key] = max([d[key] for d in device_ps_sync_time.values()] or [0.0]) + sum_device_ps_sync_time[key] = sum([d[key] for d in device_ps_sync_time.values()] or [0.0]) + max_group_ar_sync_time[key] = max([d[key] for d in group_ar_sync_time.values()] or [0.0]) + sum_group_ar_sync_time[key] = sum([d[key] for d in group_ar_sync_time.values()] or [0.0]) + + feat = [max_device_ps_sync_time[key] for key in feature_keys] \ + + [sum_device_ps_sync_time[key] for key in feature_keys] \ + + [max_group_ar_sync_time[key] for key in feature_keys] \ + + [sum_group_ar_sync_time[key] for key in feature_keys] + + return feat + + def predefined_sync_time(self, strategy, resource_spec): + """ 
graph_item: transformed graph item """ + vars, resource = self.extract_pre_feature(strategy=strategy, resource_spec=resource_spec) + # Compute synchronization time for every var + var_sync_time = {} + for var_name, var in vars.items(): + if isinstance(var.synchronizer, PSSynchronizer): + var_sync_time[var_name] = self.var_ps_time(var, resource) + elif isinstance(var.synchronizer, AllReduceSynchronizer): + var_sync_time[var_name] = self.var_ar_time(var, resource) + else: + raise ValueError('{}'.format(type(var.synchronizer))) + return var_sync_time, vars, resource + + def var_ps_time(self, var, resource, network_overhead=0.0, gpu_kernel_memory_latency=0.0): + """Compute synchronization time of a variable in PS strategy.""" + def _helper(worker_list, worker_num_replicas=None): + if worker_num_replicas is None: + worker_num_replicas = [1.0] * len(worker_list) + + this_server_time = 0 + # network transfer: sum up all workers time. equals to the time cost of this server. + # TODO(Hao): didn't consider any parallelization among partitions + for k, worker in enumerate(worker_list): + if _resolved_devices_on_diff_machine(var.device, worker): + if var.is_sparse: + this_worker_size = get_sparse_var_bits(var_size_to_transfer) * worker_num_replicas[k] + else: + this_worker_size = get_dense_var_bits(var_size_to_transfer, var.dtype) + this_server_time += this_worker_size / resource.network_bandwidth[var.device][worker] + + if self._get_coef: + return { + 'transmission': this_server_time, + 'network_overhead': len(worker_list), + 'gpu_kernel_memory_latency': resource.max_num_local_replica, + 'constant': 1.0, + # possible affecting factors. + 'var_name': var.name, + 'strategy': 'ps', + 'local_proxy': var.synchronizer.local_replication, + 'is_sparse': var.is_sparse, + 'size_to_transfer': var_size_to_transfer, + 'dtype': str(var.dtype), + # 'server_list': [partition.to_dict() for partition in server_list], + 'worker_list': worker_list, + 'cpu_worker_list': resource.cpu_worker_list, + 'gpu_worker_list': resource.gpu_worker_list, + 'worker_num_replicas': worker_num_replicas, + 'max_num_local_replica': resource.max_num_local_replica, + 'is_ps': True, + } + else: + return this_server_time + len(worker_list) * network_overhead + \ + gpu_kernel_memory_latency * resource.max_num_local_replica + + var_size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, + seq_len=self._seq_len) + + if var.is_sparse: + send_time = _helper(resource.cpu_worker_list, worker_num_replicas=resource.worker_num_replicas) + receive_time = _helper(resource.gpu_worker_list) + else: + send_time = _helper(resource.cpu_worker_list) + if var.synchronizer.local_replication: + receive_time = _helper(resource.cpu_worker_list) + else: + receive_time = _helper(resource.gpu_worker_list) + + return send_time, receive_time + + def var_ar_time(self, var, resource, network_overhead=0.0, gpu_kernel_memory_latency=0.0): + """Compute synchronization time of a variable in AR strategy.""" + worker_list = resource.cpu_worker_list + num_workers = len(worker_list) + min_bandwidth = None + for i in range(num_workers): + for j in range(i, num_workers): + if min_bandwidth is None: + min_bandwidth = resource.network_bandwidth[worker_list[j]][worker_list[i]] + else: + min_bandwidth = min(min_bandwidth, resource.network_bandwidth[worker_list[j]][worker_list[i]]) + + # Compressor + if var.compressor == "PowerSGDCompressor" or var.compressor == 3: + rank = 10 # currently using default value. So hardcode here. 
# todo: confirm + # assume var must be a dense variable. + og_shape = var.shape + ndims = len(og_shape) + if ndims <= 1: # no compress + size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, + seq_len=self._seq_len) + else: + if ndims > 2: + n = og_shape[0] + m = 1 + for s in og_shape[1:]: + m *= s # tensor's shape (n, m) + else: + n, m = og_shape[0], og_shape[1] + size_to_transfer = n * rank + m * rank + dtype = tf.float32 + elif var.compressor == "HorovodCompressorEF" or var.compressor == "HorovodCompressor" \ + or var.compressor == 2 or var.compressor == 1: + size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, + seq_len=self._seq_len) + dtype = tf.float32 + elif var.compressor == "NoneCompressor" or var.compressor == 0: + size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, + seq_len=self._seq_len) + dtype = var.dtype + else: + raise ValueError('Compressor does not exist: {}'.format(var.compressor)) + + # todo: chunk_size + # AllReduce communication time + # time = 2 * (num_workers - 1) * get_dense_var_bits(size_to_transfer, dtype) / (min_bandwidth * num_workers) + time = get_dense_var_bits(size_to_transfer, dtype) / min_bandwidth + + if self._get_coef: + return { + 'transmission': time, + 'network_overhead': 1, # len(worker_list), + 'gpu_kernel_memory_latency': resource.max_num_local_replica, + 'constant': 1.0, + # possible affecting factors. + 'var_name': var.name, + 'group': var.synchronizer.group, + 'strategy': 'allreduce', + 'is_sparse': False, + # 'chunk_size': chunk_size, + 'spec': 'NCCL', # default + 'compressor': var.compressor, + 'worker_list': worker_list, + 'num_workers': num_workers, + 'size_to_transfer': size_to_transfer, + 'dtype': str(dtype), + 'min_bandwidth': min_bandwidth, + 'max_num_local_replica': resource.max_num_local_replica, + 'is_ps': False, + } + else: + return time + network_overhead * len(worker_list) \ + + gpu_kernel_memory_latency * resource.max_num_local_replica + + + + # @staticmethod + # def var_ps_time(var_name, is_sparse, local_proxy, server_list, cpu_worker_list, gpu_worker_list, + # max_num_local_replica, worker_num_replicas, network_bandwidth, get_coef, + # network_overhead=0.0, gpu_kernel_memory_latency=0.0): + # """Compute synchrinzation time of a variable in PS strategy.""" + # + # def _helper(worker_list, worker_num_replicas=None): + # if worker_num_replicas is None: + # worker_num_replicas = [1.0] * len(worker_list) + # # Compute the slowest server + # slowest_server_time = 0 + # for j, server in enumerate(server_list): + # if server.size_to_transfer == 0: + # continue + # # network transfer: sum up all workers time. equals to the time cost of this server. + # this_server_time = 0 + # for k, worker in enumerate(worker_list): + # if _resolved_devices_on_diff_machine(server.device, worker): + # if is_sparse: + # this_worker_size = get_sparse_var_bits(server.size_to_transfer) * worker_num_replicas[k] + # else: + # this_worker_size = get_dense_var_bits(server.size_to_transfer, server.dtype) + # this_server_time += this_worker_size / network_bandwidth[server.device][worker] + # slowest_server_time = max(slowest_server_time, this_server_time) + # + # if get_coef: + # return { + # 'transmission': slowest_server_time, + # 'network_overhead': len(worker_list), + # 'gpu_kernel_memory_latency': max_num_local_replica, + # 'constant': 1.0, + # # possible affecting factors. 
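# Worked example (hypothetical shapes) for the PowerSGDCompressor branch of var_ar_time above:
# a dense (1024, 4096) variable with the hard-coded rank of 10 is approximated by factors of
# shapes (1024, 10) and (4096, 10), so size_to_transfer = 1024*10 + 4096*10 = 51200 float32
# elements instead of 1024*4096 = 4194304, roughly an 80x reduction. The transmission term is
# then get_dense_var_bits(51200, tf.float32) / min_bandwidth; the commented-out ring-allreduce
# variant would additionally scale it by 2 * (num_workers - 1) / num_workers, e.g. 1.5 for 4 workers.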
+ # 'var_name': var_name, + # 'strategy': 'ps', + # 'local_proxy': local_proxy, + # 'is_sparse': is_sparse, + # 'server_list': [partition.to_dict() for partition in server_list], + # 'worker_list': worker_list, + # 'cpu_worker_list': cpu_worker_list, + # 'gpu_worker_list': gpu_worker_list, + # 'worker_num_replicas': worker_num_replicas, + # 'max_num_local_replica': max_num_local_replica, + # } + # else: + # return slowest_server_time + len(worker_list) * network_overhead + \ + # gpu_kernel_memory_latency * max_num_local_replica + # + # if is_sparse: + # send_time = _helper(cpu_worker_list, worker_num_replicas=worker_num_replicas) + # receive_time = _helper(gpu_worker_list) + # else: + # send_time = _helper(cpu_worker_list) + # if local_proxy: + # receive_time = _helper(cpu_worker_list) + # else: + # receive_time = _helper(gpu_worker_list) + # + # if get_coef: + # # return {key: send_time[key]+receive_time[key] for key in send_time.keys()} + # return send_time, receive_time + # else: + # return send_time, receive_time diff --git a/autodist/simulator/models/rankrnn_simulator_penalty_fast.py b/autodist/simulator/rankrnn_simulator.py similarity index 100% rename from autodist/simulator/models/rankrnn_simulator_penalty_fast.py rename to autodist/simulator/rankrnn_simulator.py diff --git a/autodist/strategy/auto/ar_group_assigner.py b/autodist/strategy/auto/ar_group_assigner.py index c2d59b6..7a529d3 100644 --- a/autodist/strategy/auto/ar_group_assigner.py +++ b/autodist/strategy/auto/ar_group_assigner.py @@ -1,9 +1,35 @@ +# Copyright 2020 Petuum. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Collective group assigners.""" + from collections import OrderedDict import numpy as np def chunk_group_assigner(ar_shards, chunk_size=1): + """ + Assigner that determines the group following a chunk parameter. + + Args: + ar_shards: + chunk_size: + + Returns: + + """ assignments = {} for i, shard_name in enumerate(ar_shards): assignments[shard_name] = i // chunk_size @@ -54,4 +80,4 @@ def ordered_balanced_group_assigner(ar_shards, var_helpers, num_group): assignments[shard_name] = cur_bucket loads[cur_bucket] += var_helpers[shard_name].byte_size assert(len(ar_shards) == len(assignments)) - return assignments \ No newline at end of file + return assignments diff --git a/autodist/strategy/auto/auto_strategy.py b/autodist/strategy/auto/auto_strategy.py index e69de29..260c7be 100644 --- a/autodist/strategy/auto/auto_strategy.py +++ b/autodist/strategy/auto/auto_strategy.py @@ -0,0 +1,249 @@ +# Copyright 2020 Petuum. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""PS StrategyBuilder.""" + +from autodist.strategy.base import Strategy, StrategyBuilder +from autodist.proto import strategy_pb2 +from autodist.strategy.auto.strategy_sampler import RandomStrategySampler + + +class AutoStrategy(StrategyBuilder): + """ + AutoStrategy Builder. + + It generates a suitable Strategy based on graph_item and resource_spec following the AutoSync framework. + """ + + def __init__(self): + return + + def build(self, graph_item, resource_spec): + # TODO: merge the code in search and propose here. + return + + def search(self): + # candidates, scores, features = self.propose(self.search_params['num_candidate_explore']) + candidates, scores, features = self.batch_propose(self.search_params['num_candidate_explore']) + n_pick = self.search_params['num_candidate_per_trial'] + + # cast them to be np arrays + if self.search_params['diversity_metric'] == 'embedding': + picked_candidates = self.submodular_pick_by_embedding(np.array(scores), + candidates, + np.stack(features), + n_pick, + self.search_params['simulation_weight'], + self.search_params['diversity_weight']) + elif self.search_params['diversity_metric'] == 'expression': + picked_candidates = self.submodular_pick_by_expression(np.array(scores), + candidates, + n_pick, + self.search_params['simulation_weight'], + self.search_params['diversity_weight']) + else: + raise ValueError('Unrecognized diversity metric...') + if self.trial_run_fn: + self.trial_run(picked_candidates, search_iteration=0) + + def propose(self, num_proposal, use_simulator=True): + builder = RandomStrategy(self.space, self.heuristics) + candidates = [] + features = [] + scores = [] + # np.random.seed(1) + idx = 0 + + while len(candidates) < num_proposal: + logging.info('Sampling strategy {}'.format(idx)) + start_time = time.time() + expr = builder.build(self._original_graph_item, self._resource_spec) + elapsed = time.time() - start_time + logging.info('Sampling strategy takes {}'.format(elapsed)) + builder.reset() + idx += 1 + logging.info('Progress {}/{}'.format(len(candidates), num_proposal)) + if self.simulator and use_simulator: + start_time = time.time() + score, feature = self.simulator.simulate(expr, self._resource_spec) + elapsed = time.time() - start_time + logging.info('Inference strategy takes {}'.format(elapsed)) + if score > self.search_params['rejection_score']: + logging.info('strategy {} has score {} > {}, ' + 'rejected..'.format(idx, score, self.search_params['rejection_score'])) + continue + else: + candidates.append(expr) + features.append(feature) + scores.append(score[0]) + else: + candidates.append(expr) + features.append([]) + scores.append(0) + logging.info('rejection ratio: {}'.format(1 - num_proposal / float(idx))) + return candidates, scores, features + + def batch_propose(self, num_proposal, batch_size=32, use_simulator=True): + + builders = [RandomStrategy(self.space, self.heuristics) for _ in range(batch_size)] + graph_items = [self._original_graph_item for _ in range(batch_size)] + rss = [ResourceSpec(self.resource_file) for _ in range(batch_size)] + candidates = [] + features = [] + scores = [] + # np.random.seed(1) + idx = 0 + + while len(candidates) < num_proposal: + logging.info('Sampling strategy {}'.format(idx)) + start_time = time.time() + + q = Queue() + exprs = [] + prs = [] + for obj, arg1, arg2 in zip(builders, graph_items, rss): + prs.append(Process(target=build_worker, args=(q, obj, arg1, arg2))) 
+ prs[-1].start() + for pr in prs: + expr = q.get() # will block + exprs.append(expr) + for pr in prs: + pr.join() + + elapsed = time.time() - start_time + logging.info('Sampling strategy takes {}'.format(elapsed)) + for builder in builders: builder.reset() + logging.info('Progress {}/{}'.format(len(candidates), num_proposal)) + if self.simulator and use_simulator: + start_time = time.time() + batch_score, batch_feature = self.simulator.simulate(exprs, rss) + elapsed = time.time() - start_time + logging.info('Inference strategy takes {}'.format(elapsed)) + for ite, expr in enumerate(exprs): + # print(batch_score[ite], batch_feature[ite].shape) + if batch_score[ite] > self.search_params['rejection_score']: + logging.info('strategy {} has score {} > {}, ' + 'rejected..'.format(idx+ite, batch_score[ite], self.search_params['rejection_score'])) + else: + candidates.append(expr) + features.append(batch_feature[ite]) + scores.append(batch_score[ite]) + else: + for ite, expr in enumerate(exprs): + candidates.append(expr) + features.append([]) + scores.append(0) + idx += batch_size + logging.info('rejection ratio: {}'.format(1 - num_proposal / float(idx))) + return candidates[:num_proposal], scores[:num_proposal], features[:num_proposal] + + def submodular_pick_by_embedding(self, + scores, + candidates, + candidate_features, + n_pick, + beta=1.0, + alpha=1.0): + n = len(scores) + assert n == len(candidate_features) + + ret = [] + sim = np.dot(candidate_features, candidate_features.T) + remain = list(range(len(scores))) + + for _ in range(n_pick): + tmp_delta = -scores[remain] * beta + if len(ret) > 0: + tmp_delta -= alpha * (sim[remain, :][:, ret]).mean(1) + max_x = tmp_delta.argmax() + max_x = remain[max_x] + + ret.append(max_x) + remain.remove(max_x) + + return [candidates[i] for i in ret] + + def submodular_pick_by_expression(self, + scores, + candidates, + n_pick, + beta=1.0, + alpha=1.0): + + def remove_group_or_reduction_destination(strategy): + tmp_strategy = copy.deepcopy(strategy) + for node in tmp_strategy.node_config: + if node.partitioner: + for part in node.part_config: + synchronizer = getattr(part, part.WhichOneof('synchronizer')) + if hasattr(synchronizer, 'reduction_destination'): + synchronizer.reduction_destination = '' + else: + synchronizer.group = 0 + else: + synchronizer = getattr(node, node.WhichOneof('synchronizer')) + if hasattr(synchronizer, 'reduction_destination'): + synchronizer.reduction_destination = '' + else: + synchronizer.group = 0 + return tmp_strategy + + def estimate_difference(strategy, node_config_set): + score = 0 + for i, node in enumerate(strategy.node_config): + if_seen = False + for seen_node in node_config_set[i]: + if seen_node == node: + if_seen = True + break + if not if_seen: + score += 1 + return score + + assert len(scores) == len(candidates) + + node_config_set = [list() for _ in candidates[0].node_config] + remain = list(range(len(scores))) + ret = [] + for _ in range(n_pick): + max_x = -1 + max_delta = -1e9 + max_strategy_copy = None + + for x in remain: + tmp_strategy = remove_group_or_reduction_destination(candidates[x]) + diff_score = estimate_difference(tmp_strategy, node_config_set) + assert(diff_score <= len(tmp_strategy.node_config)) + # print('diff score {}..'.format(diff_score)) + tmp_delta = - scores[x] * beta + diff_score * alpha + if tmp_delta > max_delta: + max_delta, max_x, max_strategy_copy = tmp_delta, x, tmp_strategy + max_diff_score = diff_score *alpha + max_simulation_score= -scores[x] + + print('Add one candidate with max 
score: {}, {}, {}'.format(max_simulation_score, max_diff_score, max_delta)) + ret.append(max_x) + remain.remove(max_x) + + # update the node config set + for i, node in enumerate(max_strategy_copy.node_config): + if_seen = False + for seen_node in node_config_set[i]: + if seen_node == node: + if_seen = True + break + if not if_seen: + node_config_set[i].append(node) + + return [candidates[i] for i in ret] diff --git a/autodist/strategy/auto/ps_load_balancer.py b/autodist/strategy/auto/ps_load_balancer.py index dc770d8..55a3d6e 100644 --- a/autodist/strategy/auto/ps_load_balancer.py +++ b/autodist/strategy/auto/ps_load_balancer.py @@ -1,3 +1,19 @@ +# Copyright 2020 Petuum. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""PS load balancers.""" + from collections import OrderedDict import numpy as np @@ -9,7 +25,19 @@ def calcuate_entropy(loads): entropy = - np.sum(distribution * np.log2(distribution)) return entropy + def greedy_load_balancer(ps_shards, resource_spec, var_helpers, sort_by_size=False): + """ + A greedy load balancer that places the next largest load on the least loaded server. + Args: + ps_shards: + resource_spec: + var_helpers: + sort_by_size: + + Returns: + + """ # no randomness assignments = {} reduction_device_names = [k for k, _ in resource_spec.cpu_devices] @@ -27,7 +55,22 @@ def greedy_load_balancer(ps_shards, resource_spec, var_helpers, sort_by_size=Fal loads[destination] += var_helpers[shard_name].byte_size return assignments + def christy_load_balancer(ps_shards, resource_spec, var_helpers, sort_by_size=False): + """ + A randomized greedy load balancer. It places the variable by sampling from a multinomial distribution + correlated with their current load status -- node with least loads will have highest probability being + sampled. + + Args: + ps_shards: + resource_spec: + var_helpers: + sort_by_size: + + Returns: + + """ # Sample destination based on a distributed calculated based on loads and available bandwidth reduction_device_names = [k for k, _ in resource_spec.cpu_devices] loads = {ps: 0.0 for ps in reduction_device_names} @@ -64,4 +107,3 @@ def christy_load_balancer(ps_shards, resource_spec, var_helpers, sort_by_size=Fa # best_entropy = calcuate_entropy(balanced_loads) # print('entropy {} vs. max entropy {}'.format(entropy, best_entropy)) return assignments - From f74e650d8797afd1ade627fb1660e521ea9a2cf8 Mon Sep 17 00:00:00 2001 From: Hao Zhang Date: Thu, 16 Jul 2020 20:08:46 -0400 Subject: [PATCH 04/11] temporally remove op profiling code --- autodist/simulator/base.py | 135 +++++++------------------------------ 1 file changed, 24 insertions(+), 111 deletions(-) diff --git a/autodist/simulator/base.py b/autodist/simulator/base.py index 964302b..bac33d5 100644 --- a/autodist/simulator/base.py +++ b/autodist/simulator/base.py @@ -1,24 +1,31 @@ +# Copyright 2020 Petuum. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Strategy Simulator.""" -import time -from collections import defaultdict -import numpy as np -import tensorflow as tf -from tensorflow.python.client import timeline +from collections import defaultdict -from arion.simulator.utils import NUM_RUNS -from arion.cluster import SSHCluster -from arion.graph_item import GraphItem -from arion.kernel.device.resolver import DeviceResolver -from arion.kernel.partitioner import PartitionerConfig -from arion.proto.synchronizers_pb2 import AllReduceSynchronizer -from arion.resource_spec import ResourceSpec -from arion.strategy.base import Strategy -from arion.simulator.utils import _resolve_device_address, GIGABITS, _max_num_local_replica, _num_local_replica -from arion.strategy.random_sample_strategy import VariableHelper, PartHelper -from arion.simulator.utils import INFINITY +from autodist.cluster import SSHCluster +from autodist.graph_item import GraphItem +from autodist.kernel.device.resolver import DeviceResolver +from autodist.kernel.partitioner import PartitionerConfig +from autodist.resource_spec import ResourceSpec +from autodist.strategy.base import Strategy +from autodist.simulator.utils import _resolve_device_address, GIGABITS, _max_num_local_replica, _num_local_replica +from autodist.strategy.auto.strategy_sampler import VariableHelper, PartHelper +from autodist.simulator.utils import INFINITY -# tf.compat.v1.disable_eager_execution() class Var: def __init__(self, @@ -310,97 +317,3 @@ def min_bandwitdh(worker_list, bandwidth): @property def original_graph_item_path(self): return self._original_graph_item_path - - # @property - # def resource_file(self): - # return self._resource_file - - @staticmethod - def calculate_op_timings(fetches): - # Simple implementation. Calculate averaged run time of certain steps. 
- init_op = tf.compat.v1.initialize_all_variables() - outside_times = [] - - with tf.compat.v1.Session() as sess: - sess.run(init_op) - for i in range(NUM_RUNS): - start = time.time() - sess.run(fetches) - end = time.time() - outside_times.append(end - start) - comp_time_in_sec = np.mean(np.array(outside_times[1:])) - return comp_time_in_sec - - @staticmethod - def profile_on_single_machine(fetches): - # calculate computation time of every op - init_op = tf.compat.v1.initialize_all_variables() - op_name2runtime = defaultdict(list) - outside_times = [] - all_times = [] - - options = tf.compat.v1.RunOptions(trace_level=tf.compat.v1.RunOptions.FULL_TRACE) - run_metadata = tf.compat.v1.RunMetadata() - with tf.compat.v1.Session() as sess: - sess.run(init_op) - for i in range(NUM_RUNS): - start = time.time() * 1000 - sess.run(fetches) - end = time.time() * 1000 - outside_times.append(end - start) - - sess.run(fetches, options=options, run_metadata=run_metadata) - - fetched_timeline = timeline.Timeline(run_metadata.step_stats) - chrome_trace = fetched_timeline.generate_chrome_trace_format() # necessary - for event in fetched_timeline._chrome_trace._events: - # print('\n') - # print(list(event.keys())) - # for key in list(event.keys()): - # print(key, event[key]) - if 'dur' in event: - op_name2runtime[event['args']['name']].append(event['dur']) - # todo: to be more accurate, add tid (thread/lanes id) - - mean_outside_time = np.mean(np.array(outside_times[1:])) - print('mean outside_times: ', mean_outside_time) - print(outside_times) - # print('average all_times: ', np.mean(np.array(all_times))) - - op_name2meanruntime = {} - for op_name, runtimes in op_name2runtime.items(): - runtimes = np.array(runtimes) - if len(runtimes) > 1: # Do not compute operations that only run once for all steps. - mean = np.mean(np.array(runtimes[1:])) - op_name2meanruntime[op_name] = mean - print(op_name, mean) - # print(op_name2runtime[op_name]) - - total_op_time = sum([mean_runtime for op_name, mean_runtime in op_name2meanruntime.items()]) - print('total_op_time', total_op_time / 1000.) - # total_op_time = [sum([runtime[i] for op_name, runtime in op_name2runtime.items()]) - # for i in range(self.num_runs)] - # print('total_op_time', np.mean(np.array(total_op_time)), total_op_time) - - return mean_outside_time - - # @staticmethod - # def _calculate_op_timings(graph_item: GraphItem): - # """ - # Given a graph, calculates an expected running time for each (op, input_size) pair. - # - # Args: - # graph_item (GraphItem): The input graph. - # - # Returns: - # Dict mapping (op, input_size) to time. 
- # """ - # all_ops = {} - # for op in graph_item.graph.get_operations(): - # input_shapes = tuple((tuple(inp.shape.dims) for inp in op.inputs)) - # op_type = op.type - # all_ops[(op_type, input_shapes)] = ops.Graph() - # - # for ((op, shape), graph) in all_ops.items(): - # with graph.as_default(): - # getattr(tensorflow.raw_ops, op) From 98ec2260b4dae1cb02b3581675f8ed54804a96c8 Mon Sep 17 00:00:00 2001 From: Hao Zhang Date: Sat, 18 Jul 2020 02:00:00 -0400 Subject: [PATCH 05/11] some refactoring on AutoStrategy interface --- autodist/simulator/linear_simulator.py | 21 ++ autodist/simulator/predefined_simulator.py | 29 ++- autodist/simulator/utils.py | 30 ++- autodist/strategy/auto/auto_strategy.py | 249 ------------------- autodist/strategy/auto/base.py | 112 +++++++++ autodist/strategy/auto/default_constraint.py | 0 autodist/strategy/auto/strategy_sampler.py | 34 ++- autodist/strategy/auto_strategy.py | 55 ++++ autodist/simulator/test.py => test.py | 0 9 files changed, 261 insertions(+), 269 deletions(-) create mode 100644 autodist/simulator/linear_simulator.py delete mode 100644 autodist/strategy/auto/auto_strategy.py create mode 100644 autodist/strategy/auto/base.py create mode 100644 autodist/strategy/auto/default_constraint.py create mode 100644 autodist/strategy/auto_strategy.py rename autodist/simulator/test.py => test.py (100%) diff --git a/autodist/simulator/linear_simulator.py b/autodist/simulator/linear_simulator.py new file mode 100644 index 0000000..527d923 --- /dev/null +++ b/autodist/simulator/linear_simulator.py @@ -0,0 +1,21 @@ +# Copyright 2020 Petuum. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Linear simulator.""" + +from autodist.simulator.base import SimulatorBase + +class LinearSimulator(SimulatorBase): + def __init__(self): + super(LinearSimulator, self).__init__() diff --git a/autodist/simulator/predefined_simulator.py b/autodist/simulator/predefined_simulator.py index 91519c6..973fbef 100644 --- a/autodist/simulator/predefined_simulator.py +++ b/autodist/simulator/predefined_simulator.py @@ -1,18 +1,31 @@ -"""Strategy Simulator.""" +# Copyright 2020 Petuum. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Predefined simulator with linear model.""" -import numpy as np -import json import pickle as pkl import tensorflow as tf from tensorflow.python.eager import context -from arion.strategy.base import Strategy -from arion.resource_spec import ResourceSpec -from arion.proto.synchronizers_pb2 import PSSynchronizer, AllReduceSynchronizer -from arion.simulator.models.base import SimulatorBase -from arion.simulator.utils import _resolve_device_address, _resolved_devices_on_diff_machine, \ +from autodist.proto.synchronizers_pb2 import PSSynchronizer, AllReduceSynchronizer +from autodist.resource_spec import ResourceSpec +from autodist.simulator.base import SimulatorBase +from autodist.simulator.utils import _resolved_devices_on_diff_machine, \ get_dense_var_bits, get_sparse_var_bits +from autodist.strategy.base import Strategy + class PredefinedSimulator(SimulatorBase): """Simulates strategies for a given graph and resource spec.""" diff --git a/autodist/simulator/utils.py b/autodist/simulator/utils.py index a668e75..2febd63 100644 --- a/autodist/simulator/utils.py +++ b/autodist/simulator/utils.py @@ -1,18 +1,34 @@ +# Copyright 2020 Petuum. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Simulator-related utility functions.""" + import glob import json import os import numpy as np -import tensorflow_ranking as tfr import tensorflow as tf from tensorflow.python.framework import device_spec +import tensorflow_ranking as tfr -from arion.utils import logging -from arion.resource_spec import ResourceSpec -from arion.strategy.base import Strategy -from arion.const import DEFAULT_RUNTIME_SERIALIZATION_DIR, DEFAULT_SERIALIZATION_DIR, \ +from autodist.utils import logging +from autodist.resource_spec import ResourceSpec +from autodist.strategy.base import Strategy +from autodist.const import DEFAULT_RUNTIME_SERIALIZATION_DIR, DEFAULT_SERIALIZATION_DIR, \ DEFAULT_STRATEGY_JSON_SERIALIZATION_DIR, DEFAULT_RESOURCE_SERIALIZATION_DIR -from arion.kernel.device.resolver import DeviceResolver +from autodist.kernel.device.resolver import DeviceResolver RankingLossKeys = { @@ -268,7 +284,7 @@ def read_trial_runs(): def pad_list(l, max_len): - return l + [0.0] * (max_len - len(l)) + return l + [0.0] * (max_len - len(l)) def get_dtype_bits(dtype): diff --git a/autodist/strategy/auto/auto_strategy.py b/autodist/strategy/auto/auto_strategy.py deleted file mode 100644 index 260c7be..0000000 --- a/autodist/strategy/auto/auto_strategy.py +++ /dev/null @@ -1,249 +0,0 @@ -# Copyright 2020 Petuum. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""PS StrategyBuilder.""" - -from autodist.strategy.base import Strategy, StrategyBuilder -from autodist.proto import strategy_pb2 -from autodist.strategy.auto.strategy_sampler import RandomStrategySampler - - -class AutoStrategy(StrategyBuilder): - """ - AutoStrategy Builder. - - It generates a suitable Strategy based on graph_item and resource_spec following the AutoSync framework. - """ - - def __init__(self): - return - - def build(self, graph_item, resource_spec): - # TODO: merge the code in search and propose here. - return - - def search(self): - # candidates, scores, features = self.propose(self.search_params['num_candidate_explore']) - candidates, scores, features = self.batch_propose(self.search_params['num_candidate_explore']) - n_pick = self.search_params['num_candidate_per_trial'] - - # cast them to be np arrays - if self.search_params['diversity_metric'] == 'embedding': - picked_candidates = self.submodular_pick_by_embedding(np.array(scores), - candidates, - np.stack(features), - n_pick, - self.search_params['simulation_weight'], - self.search_params['diversity_weight']) - elif self.search_params['diversity_metric'] == 'expression': - picked_candidates = self.submodular_pick_by_expression(np.array(scores), - candidates, - n_pick, - self.search_params['simulation_weight'], - self.search_params['diversity_weight']) - else: - raise ValueError('Unrecognized diversity metric...') - if self.trial_run_fn: - self.trial_run(picked_candidates, search_iteration=0) - - def propose(self, num_proposal, use_simulator=True): - builder = RandomStrategy(self.space, self.heuristics) - candidates = [] - features = [] - scores = [] - # np.random.seed(1) - idx = 0 - - while len(candidates) < num_proposal: - logging.info('Sampling strategy {}'.format(idx)) - start_time = time.time() - expr = builder.build(self._original_graph_item, self._resource_spec) - elapsed = time.time() - start_time - logging.info('Sampling strategy takes {}'.format(elapsed)) - builder.reset() - idx += 1 - logging.info('Progress {}/{}'.format(len(candidates), num_proposal)) - if self.simulator and use_simulator: - start_time = time.time() - score, feature = self.simulator.simulate(expr, self._resource_spec) - elapsed = time.time() - start_time - logging.info('Inference strategy takes {}'.format(elapsed)) - if score > self.search_params['rejection_score']: - logging.info('strategy {} has score {} > {}, ' - 'rejected..'.format(idx, score, self.search_params['rejection_score'])) - continue - else: - candidates.append(expr) - features.append(feature) - scores.append(score[0]) - else: - candidates.append(expr) - features.append([]) - scores.append(0) - logging.info('rejection ratio: {}'.format(1 - num_proposal / float(idx))) - return candidates, scores, features - - def batch_propose(self, num_proposal, batch_size=32, use_simulator=True): - - builders = [RandomStrategy(self.space, self.heuristics) for _ in range(batch_size)] - graph_items = [self._original_graph_item for _ in range(batch_size)] - rss = [ResourceSpec(self.resource_file) for _ in range(batch_size)] - candidates = [] - features = [] - scores = [] - # np.random.seed(1) - idx = 0 - - while len(candidates) < num_proposal: - logging.info('Sampling strategy {}'.format(idx)) - start_time = time.time() - - q = Queue() - exprs = [] - prs = [] - for obj, arg1, arg2 in zip(builders, graph_items, rss): - prs.append(Process(target=build_worker, args=(q, obj, arg1, arg2))) 
- prs[-1].start() - for pr in prs: - expr = q.get() # will block - exprs.append(expr) - for pr in prs: - pr.join() - - elapsed = time.time() - start_time - logging.info('Sampling strategy takes {}'.format(elapsed)) - for builder in builders: builder.reset() - logging.info('Progress {}/{}'.format(len(candidates), num_proposal)) - if self.simulator and use_simulator: - start_time = time.time() - batch_score, batch_feature = self.simulator.simulate(exprs, rss) - elapsed = time.time() - start_time - logging.info('Inference strategy takes {}'.format(elapsed)) - for ite, expr in enumerate(exprs): - # print(batch_score[ite], batch_feature[ite].shape) - if batch_score[ite] > self.search_params['rejection_score']: - logging.info('strategy {} has score {} > {}, ' - 'rejected..'.format(idx+ite, batch_score[ite], self.search_params['rejection_score'])) - else: - candidates.append(expr) - features.append(batch_feature[ite]) - scores.append(batch_score[ite]) - else: - for ite, expr in enumerate(exprs): - candidates.append(expr) - features.append([]) - scores.append(0) - idx += batch_size - logging.info('rejection ratio: {}'.format(1 - num_proposal / float(idx))) - return candidates[:num_proposal], scores[:num_proposal], features[:num_proposal] - - def submodular_pick_by_embedding(self, - scores, - candidates, - candidate_features, - n_pick, - beta=1.0, - alpha=1.0): - n = len(scores) - assert n == len(candidate_features) - - ret = [] - sim = np.dot(candidate_features, candidate_features.T) - remain = list(range(len(scores))) - - for _ in range(n_pick): - tmp_delta = -scores[remain] * beta - if len(ret) > 0: - tmp_delta -= alpha * (sim[remain, :][:, ret]).mean(1) - max_x = tmp_delta.argmax() - max_x = remain[max_x] - - ret.append(max_x) - remain.remove(max_x) - - return [candidates[i] for i in ret] - - def submodular_pick_by_expression(self, - scores, - candidates, - n_pick, - beta=1.0, - alpha=1.0): - - def remove_group_or_reduction_destination(strategy): - tmp_strategy = copy.deepcopy(strategy) - for node in tmp_strategy.node_config: - if node.partitioner: - for part in node.part_config: - synchronizer = getattr(part, part.WhichOneof('synchronizer')) - if hasattr(synchronizer, 'reduction_destination'): - synchronizer.reduction_destination = '' - else: - synchronizer.group = 0 - else: - synchronizer = getattr(node, node.WhichOneof('synchronizer')) - if hasattr(synchronizer, 'reduction_destination'): - synchronizer.reduction_destination = '' - else: - synchronizer.group = 0 - return tmp_strategy - - def estimate_difference(strategy, node_config_set): - score = 0 - for i, node in enumerate(strategy.node_config): - if_seen = False - for seen_node in node_config_set[i]: - if seen_node == node: - if_seen = True - break - if not if_seen: - score += 1 - return score - - assert len(scores) == len(candidates) - - node_config_set = [list() for _ in candidates[0].node_config] - remain = list(range(len(scores))) - ret = [] - for _ in range(n_pick): - max_x = -1 - max_delta = -1e9 - max_strategy_copy = None - - for x in remain: - tmp_strategy = remove_group_or_reduction_destination(candidates[x]) - diff_score = estimate_difference(tmp_strategy, node_config_set) - assert(diff_score <= len(tmp_strategy.node_config)) - # print('diff score {}..'.format(diff_score)) - tmp_delta = - scores[x] * beta + diff_score * alpha - if tmp_delta > max_delta: - max_delta, max_x, max_strategy_copy = tmp_delta, x, tmp_strategy - max_diff_score = diff_score *alpha - max_simulation_score= -scores[x] - - print('Add one candidate with max 
score: {}, {}, {}'.format(max_simulation_score, max_diff_score, max_delta)) - ret.append(max_x) - remain.remove(max_x) - - # update the node config set - for i, node in enumerate(max_strategy_copy.node_config): - if_seen = False - for seen_node in node_config_set[i]: - if seen_node == node: - if_seen = True - break - if not if_seen: - node_config_set[i].append(node) - - return [candidates[i] for i in ret] diff --git a/autodist/strategy/auto/base.py b/autodist/strategy/auto/base.py new file mode 100644 index 0000000..05f0be4 --- /dev/null +++ b/autodist/strategy/auto/base.py @@ -0,0 +1,112 @@ +# Copyright 2020 Petuum. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""A base class to implementating different auto strategies.""" + +from multiprocessing import Process, Queue + +import numpy as np + +from autodist.strategy.auto.strategy_sampler import RandomStrategySampler, \ + default_space, default_heuristics +from autodist.strategy.base import StrategyBuilder +from autodist.utils import logging + + +class AutoStrategyBase(StrategyBuilder): + """AutoStrategy Base class.""" + + def __init__(self, + space=None, + heuristics=None, + num_proposals=1000, + simulator=None, + train_simulator=False): + # space and heuristics params + if not space: + self._space = default_space + if not heuristics: + self._heuristics = default_heuristics + + # params + self._num_proposals = num_proposals + self._sampler = RandomStrategySampler(self._space, + self._heuristics) + if train_simulator: + raise NotImplementedError() + self._simulator = simulator + + def build(self, graph_item, resource_spec): + raise NotImplementedError() + + def propose_one(self, graph_item, resource_spec): + """ + Sequentially generate `self._num_proposals` strategies. + + Args: + graph_item: + resource_spec: + + Returns: + Strategy + """ + proposal = self._sampler.build(graph_item, resource_spec) + return proposal + + def propose_n(self, + graph_item, + resource_spec, + num_proposals, + num_threads=1): + """ + Proposal `num_proposals` strategies using multi-threading. 
+ + Args: + graph_item: + resource_spec: + num_proposals: + num_threads: + + Returns: + List(Strategy) + """ + if num_threads > 1: + def sampler_worker(q, sampler, graph_item, resource_spec): + np.random.seed() + expr = sampler.build(graph_item, resource_spec) + q.put(expr) + + proposals = [] + while len(proposals) < num_proposals: + # create thread-safe objects before multi-threading + samplers = [RandomStrategySampler(graph_item, resource_spec) for _ in range(num_threads)] + graph_items = [graph_item for _ in range(num_threads)] + resource_specs = [resource_spec for _ in range(num_threads)] + q = Queue() + threads = [] + try: + for sampler, gi, rs in zip(samplers, graph_items, resource_specs): + thread = Process(target=sampler_worker, args=(q,sampler, gi, rs)) + thread.start() + threads.append(thread) + batch = [q.get() for _ in threads] + proposals.extend(batch) + for thread in threads: + thread.join() + except: + logging.error('Error when proposing strategies with {} threads'.format(num_threads)) + raise + else: + proposals = [self.propose_one(graph_item, resource_spec) for i in range(num_proposals)] + return proposals diff --git a/autodist/strategy/auto/default_constraint.py b/autodist/strategy/auto/default_constraint.py new file mode 100644 index 0000000..e69de29 diff --git a/autodist/strategy/auto/strategy_sampler.py b/autodist/strategy/auto/strategy_sampler.py index 3281c4b..a317089 100644 --- a/autodist/strategy/auto/strategy_sampler.py +++ b/autodist/strategy/auto/strategy_sampler.py @@ -27,6 +27,7 @@ from autodist.strategy.auto.ar_group_assigner import chunk_group_assigner, christy_group_assigner, \ ordered_balanced_group_assigner from autodist.strategy.auto import sample_util +from autodist.const import MAX_INT32 class VarType(Enum): @@ -179,7 +180,7 @@ def byte_size(self): * float(self.shape[self.pc.axis]) / float(self.var_shape[self.pc.axis]) -class RandomStrategySampler(StrategyBuilder): +class RandomStrategySampler(): """ Random Strategy Sampler. 
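(For illustration only, not part of the patch: a minimal, standalone sketch of the process/queue fan-out that `propose_n` above relies on. `my_samplers`, `my_graph_item`, and `my_resource_spec` are hypothetical placeholders, and each sampler is assumed to expose the `build(graph_item, resource_spec)` method introduced in this series.)

from multiprocessing import Process, Queue
import numpy as np

def sampler_worker(q, sampler, graph_item, resource_spec):
    # Re-seed inside each worker process so the workers draw different random strategies.
    np.random.seed()
    q.put(sampler.build(graph_item, resource_spec))

def propose_batch(samplers, graph_item, resource_spec):
    q = Queue()
    procs = [Process(target=sampler_worker, args=(q, s, graph_item, resource_spec))
             for s in samplers]
    for p in procs:
        p.start()
    # Drain the queue before joining; q.get() blocks until each worker has put a result.
    batch = [q.get() for _ in procs]
    for p in procs:
        p.join()
    return batch

# Hypothetical usage: strategies = propose_batch(my_samplers, my_graph_item, my_resource_spec)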
@@ -202,10 +203,6 @@ def __init__(self, space, heuristics): self.heuristics = heuristics self.helpers = {} - def reset(self): - """Reset the helpers every time a strategy is sampled.""" - self.helpers = {} - def build(self, graph_item, resource_spec): """Generate a randomized strategy given model and resource spec.""" expr = Strategy() @@ -250,8 +247,13 @@ def build(self, graph_item, resource_spec): sample_group_and_reduction_destinations(node_config, resource_spec, self.helpers, self.heuristics) expr.node_config.extend(node_config) + self._reset() return expr + def _reset(self): + """Reset the helpers every time a strategy is sampled.""" + self.helpers = {} + def sample_if_partition(var_helper, resource_spec, space, heuristics): """ @@ -650,3 +652,25 @@ def assign_ar_group(node_config, ar_shards): synchronizer = getattr(node, node.WhichOneof('synchronizer')) if hasattr(synchronizer, 'compressor'): synchronizer.group = ar_shards[node.var_name][1] + + +default_space = { + 'synchronizer_types': ['PS', 'AR'], + 'maybe_partition': [True, False], + 'compressor': ['HorovodCompressor', 'NoneCompressor', 'HorovodCompressorEF'], + 'local_replication': [False], + 'partitionable_axis': [] +} + + +default_heuristics = { + 'ps_load_balancer': None, # None, 'christy', 'greedy', 'LP' + 'merge_scheme': None, # random, by_chunk, christy, ordered_balanced + 'chunk_size': -1, + 'num_group_bounds': [-1, MAX_INT32], + 'maybe_partition_bounds': [0, MAX_INT32], + 'maybe_partition_by_size': None, + 'num_partition_bounds': [2, MAX_INT32], + 'enable_single_node_no_partition': False, + 'same_synchronizer_for_parts': False, +} diff --git a/autodist/strategy/auto_strategy.py b/autodist/strategy/auto_strategy.py new file mode 100644 index 0000000..5d6b78f --- /dev/null +++ b/autodist/strategy/auto_strategy.py @@ -0,0 +1,55 @@ +# Copyright 2020 Petuum. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""An AutoStrategy using a trained linear simulator.""" + +from autodist.strategy.auto.base import AutoStrategyBase +from autodist.simulator.linear_simulator import LinearSimulator + +class AutoStrategy(AutoStrategyBase): + """ + AutoStrategy builder using a trained linear simulator + + It generates a suitable Strategy based on graph_item and resource_spec using a pretrained simulator weight. + This implementation currenlty provides a linear simulator weight trained on > 9000 data points. 
+ """ + + def __init__(self): + space = { + 'synchronizer_types': ['PS', 'AR'], + 'maybe_partition': [True, False], + 'compressor': ['HorovodCompressor', 'NoneCompressor'], + 'local_replication': [True, False], + 'partitionable_axis': [], + } + heuristics = { + 'ps_load_balancer': 'sorted_christy', # None, 'christy', 'greedy', 'LP' + 'merge_scheme': 'ordered_balanced', # random, by_chunk, christy, ordered_balanced + 'num_group_bounds': [-1, 20], + 'num_partition_bounds': [2, 40], + 'enable_single_node_no_partition': False, + 'same_synchronizer_for_parts': False, + } + + simulator = LinearSimulator() + + super(AutoStrategy, self).__init__( + space=space, + heuristics=heuristics, + num_proposals=2000, + simulator=simulator + ) + + def build(self, graph_item, resource_spec): + return \ No newline at end of file diff --git a/autodist/simulator/test.py b/test.py similarity index 100% rename from autodist/simulator/test.py rename to test.py From d404f7fa715bd6d5cfde888656e1ed731fb6032b Mon Sep 17 00:00:00 2001 From: Hao Zhang Date: Mon, 20 Jul 2020 23:31:57 -0400 Subject: [PATCH 06/11] refactor some simulator and autostrategy utilities --- autodist/kernel/device/resolver.py | 19 +- autodist/resource_spec.py | 14 + autodist/simulator/base.py | 394 +++++----------- autodist/simulator/predefined_simulator.py | 8 +- autodist/simulator/utils.py | 73 +-- autodist/strategy/__init__.py | 1 + autodist/strategy/auto/default_constraint.py | 0 autodist/strategy/auto/item.py | 463 +++++++++++++++++++ autodist/strategy/auto/sample_util.py | 61 --- autodist/strategy/auto/strategy_sampler.py | 278 ++++------- autodist/strategy/auto_strategy.py | 7 +- examples/linear_regression.py | 4 +- tests/test_simulator.py | 27 ++ 13 files changed, 780 insertions(+), 569 deletions(-) delete mode 100644 autodist/strategy/auto/default_constraint.py create mode 100644 autodist/strategy/auto/item.py delete mode 100644 autodist/strategy/auto/sample_util.py create mode 100644 tests/test_simulator.py diff --git a/autodist/kernel/device/resolver.py b/autodist/kernel/device/resolver.py index 609f471..8fcfecf 100644 --- a/autodist/kernel/device/resolver.py +++ b/autodist/kernel/device/resolver.py @@ -45,6 +45,15 @@ def _get_address_to_tasks(cluster): return d def resolve_to_device_spec(self, device): + """ + Resolve an AutoDist DeviceSpec or string to a TensorFlow DeviceSpec. + + Args: + device: (a container of) AutoDist DeviceSpec or DeviceSpec string. + + Returns: + device_spec, List(device_spec), or Set(device_spec) + """ """Resolve an AutoDist DeviceSpec or its string to a TensorFlow DeviceSpec.""" if isinstance(device, (list, set)): return type(device)(self.resolve_to_device_spec(d) for d in device) @@ -59,7 +68,15 @@ def resolve_to_device_spec(self, device): ) def resolve_to_device_str(self, device): - """Resolve an AutoDist DeviceSpec or its string to a TensorFlow device string.""" + """Resolve an AutoDist DeviceSpec or its string to a TensorFlow device string. + + E.g. 192.168.0.1:GPU:0 or localhost:CPU:1 -> job:worker/task:0/device:GPU:0 + Args: + device: (a container of) AutoDist DeviceSpec or DeviceSpec string. 
+ + Returns: + str, List(str), or Set(str) + """ if isinstance(device, (list, set)): return type(device)(self.resolve_to_device_spec(d).to_string() for d in device) elif isinstance(device, RepeatedScalarContainer): diff --git a/autodist/resource_spec.py b/autodist/resource_spec.py index 017faea..be1d570 100644 --- a/autodist/resource_spec.py +++ b/autodist/resource_spec.py @@ -72,6 +72,7 @@ def __init__(self, resource_file=None): self.__chief_address = None self.__ssh_config_map = dict() self.__ssh_group = dict() + self.__network_bandwidth = dict() # set self.__devices self._from_resource_info(resource_file) @@ -147,6 +148,11 @@ def ssh_group(self): """SSH Group for each node.""" return self.__ssh_group + @property + def network_bandwidth(self): + """Network bandwidth of each node.""" + return self.__network_bandwidth + def _add_device(self, device_spec): if device_spec.name_string() not in self.__devices: self.__devices[device_spec.name_string()] = device_spec @@ -200,6 +206,14 @@ def _parse_node(self, node, num_nodes): self.__ssh_group[host_address] = node.get('ssh_config') if self.__ssh_group[host_address] is None and self.__chief_address != host_address: raise ValueError("Need to define SSH groups for all non-chief nodes.") + # network bandwidth + if node.get('network_bandwidth'): + self.__network_bandwidth[host_address] = node.get('network_bandwidth') + else: + # TODO (Hao): we could also raise ValueError here. + logging.warning('Bandwidth for {} is undefined and set as default. ' + 'Caution when using AutoStrategy.'.format(host_address)) + self.__network_bandwidth[host_address] = 1 class DeviceSpec: diff --git a/autodist/simulator/base.py b/autodist/simulator/base.py index bac33d5..31e2d1a 100644 --- a/autodist/simulator/base.py +++ b/autodist/simulator/base.py @@ -12,308 +12,156 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-"""Strategy Simulator.""" +"""Simulator base class.""" +import os +from collections import OrderedDict -from collections import defaultdict - -from autodist.cluster import SSHCluster from autodist.graph_item import GraphItem -from autodist.kernel.device.resolver import DeviceResolver from autodist.kernel.partitioner import PartitionerConfig from autodist.resource_spec import ResourceSpec from autodist.strategy.base import Strategy -from autodist.simulator.utils import _resolve_device_address, GIGABITS, _max_num_local_replica, _num_local_replica -from autodist.strategy.auto.strategy_sampler import VariableHelper, PartHelper -from autodist.simulator.utils import INFINITY - - -class Var: - def __init__(self, - name=None, - is_sparse=False, - synchronizer=None, - shape=None, - dtype=None, - device=None, - compressor=None): - self.name = name - self.is_sparse = is_sparse - self.synchronizer = synchronizer - self.shape = shape - self.dtype = dtype - self.device = device - self.compressor = compressor - self.device = device - self.is_partition = False +from autodist.strategy.auto.item import VariableItem, PartItem, ResourceItem - self.original_shape = self.shape - - @property - def var_size(self): - size = 1 - if self.shape: - for s in self.shape: - size *= s - return size - - @property - def original_var_size(self): - size = 1 - if self.original_shape: - for s in self.original_shape: - size *= s - return size - - def size_to_transfer(self, batch_size_per_gpu=1, seq_len=1): - if not self.is_sparse: - return self.var_size - else: - if not self.shape: # scalar - return 1 - emb_size = 1 - if len(self.shape) > 1: - for i in range(1, len(self.original_shape)): - emb_size = emb_size * self.original_shape[i] - - sparse_data_size = batch_size_per_gpu * seq_len * emb_size - - # estimate the embedding of this partition simply using a proportional formula - ret = sparse_data_size * self.var_size / self.original_var_size - return ret +class SimulatorBase: + """Simulates strategies for a given graph and resource spec.""" -class Partition(Var): def __init__(self, - name=None, - is_sparse=False, - synchronizer=None, - shape=None, - dtype=None, - device=None, - compressor=None, - part_id=0, - original_shape=None, - partition_str=None, - num_shards=1): - super(Partition, self).__init__(name, is_sparse, synchronizer, shape, dtype, device, compressor) - self.is_partition = True - self.part_id = part_id - self.partition_str = partition_str - self.original_shape = original_shape - self.num_shards = num_shards + graph_item=None, + resource_spec=None): + """ + Constructor for simulator base class + Args: + graph_item: a GraphItem object, or a path to a serialized GraphItem object. + resource_spec: a ResourceSpec object, or a path to a resource file. 
+ """ + # check if it is a path + self._graph_item = None + if isinstance(graph_item, GraphItem): + self._graph_item = graph_item + elif isinstance(graph_item, str) and os.path.exists(graph_item): + self._graph_item = GraphItem.deserialize(graph_item) + else: + raise ValueError("Invalid graph_item: {}".format(graph_item)) -class Resource: - def __init__(self, cluster, device_resolver, graph_replicas, network_bandwidth, cpu_worker_list, - gpu_worker_list, max_num_local_replica, total_num_local_replica, worker_num_replicas): - self.cluster=cluster - self.device_resolver=device_resolver - self.graph_replicas=graph_replicas - self.network_bandwidth=network_bandwidth - self.cpu_worker_list=cpu_worker_list - self.gpu_worker_list=gpu_worker_list - self.max_num_local_replica=max_num_local_replica - self.total_num_local_replica=total_num_local_replica - self.worker_num_replicas=worker_num_replicas + self._resource_spec = None + if isinstance(resource_spec, ResourceSpec): + self._resource_spec = resource_spec + elif isinstance(resource_spec, str) and os.path.exists(resource_spec): + self._resource_spec = ResourceSpec(resource_spec) + else: + raise ValueError("Invalid resource_spec: {}".format(resource_spec)) + + def update_graph_item(self, graph_item): + """Change the default graph_item with this simulator.""" + if not graph_item: + raise ValueError('Empty graph item.') + self._graph_item = graph_item + + def update_resource_spec(self, resource_spec): + """Change the default resource_spec with this simulator.""" + if not resource_spec: + raise ValueError('Empty resource spec.') + self._resource_spec = resource_spec + + def simulate(self, + strategy, + graph_item=None, + resource_spec=None, + checkpoint=None): + """ + Return simulated runtime cost given (Strategy, GraphItem, ResourceSpec) tuple. + + Args: + strategy: + graph_item: + resource_spec: + checkpoint: + + Returns: + float + """ + raise NotImplementedError() -class SimulatorBase: - """Simulates strategies for a given graph and resource spec.""" + def inference(self, + features, + checkpoint=None): + """ + Abstract method for simulator inference. - def __init__(self, original_graph_item_path): - self._original_graph_item_path = original_graph_item_path - self._original_graph_item = GraphItem.deserialize(original_graph_item_path) - # self._resource_file = resource_file - # self._resource_spec = ResourceSpec(resource_file) - # self._cluster = SSHCluster(self._resource_spec) - # self._device_resolver = DeviceResolver(self._cluster) - # - # self._graph_replicas = [_resolve_device_address(k, self._device_resolver) - # for k, v in self._resource_spec.gpu_devices] - # - # # bandwidth - # self._network_bandwidth = self.network_bandwidth(self._resource_spec, self._device_resolver) - # # Other information - # self._cpu_worker_list = [_resolve_device_address(device, self._device_resolver) - # for device, _ in self._resource_spec.cpu_devices] - # self._gpu_worker_list = [_resolve_device_address(device, self._device_resolver) - # for device, _ in self._resource_spec.gpu_devices] - # self._max_num_local_replica = _max_num_local_replica(self._graph_replicas, self._cluster) - # self._total_num_local_replica = len(self._graph_replicas) - # self._worker_num_replicas = [_num_local_replica(cpu_worker, self._graph_replicas, self._cluster) - # for cpu_worker in self._cpu_worker_list] + Args: + features: feature input extracted from (GraphItem, ResourceSpec, Strategy) tuple. + checkpoint: optional simulator weight. 
- def simulate(self, strategy: Strategy, resource_spec: ResourceSpec, checkpoint: str): - """Return simulated runtime value by feeding features to the cost model.""" + Returns: + float + """ raise NotImplementedError() - def inference(self, inputs, checkpoint): - raise NotImplementedError() + def load_checkpoint(self, checkpoint=None): + """ + Load a checkpoint file as weights of the simulator. - def load_checkpoint(self, checkpoint): + Args: + checkpoint: path to a checkpoint file. + """ raise NotImplementedError() def save_checkpoint(self, model, checkpoint): + """ + Save a trained weight as a checkpoint file. + + Args: + model: trained model. + checkpoint: path where to save the checkpoint. + """ raise NotImplementedError() def create_features(self, strategy: Strategy, resource_spec: ResourceSpec): raise NotImplementedError() - def extract_pre_feature(self, strategy: Strategy, resource_spec: ResourceSpec): - resource = self.setup_resource(resource_spec) + def preprocess(self, + strategy, + graph_item=None, + resource_spec=None): + """ + Preprocess a (strategy, graph_item, resource_spec) tuple into pre-features. + + Args: + strategy: a distribution strategy + graph_item: optional graph_item, if not provided, the default one bundled with simulator will be used. + resource_spec: optional resource_spec, if not provided, the default one bundled with simulator will be used. + + Returns: + OrderedDict(): variable/part name to variable/part items. + ResourceItem: + """ + if not graph_item: + if not self._graph_item: + raise ValueError('No graph item provided.') + else: + graph_item = self._graph_item + if not resource_spec: + if not self._resource_spec: + raise ValueError('No resource spec provided.') + else: + resource_spec = self._resource_spec + if not strategy: + raise ValueError('No strategy provided.') - name2var = {var.name: var for var_op, var in self._original_graph_item.trainable_var_op_to_var.items()} + resource_item = ResourceItem(resource_spec) + name_to_var = {var.name: var for var_op, var in graph_item.trainable_var_op_to_var.items()} - meta = defaultdict() + name_to_items = OrderedDict() for node in strategy.node_config: var_name = node.var_name - # for var_op, var in self._original_graph_item.trainable_var_op_to_var.items(): - # if var.name == var_name: - # break - var = name2var[var_name] - var_helper = VariableHelper(var, self._original_graph_item) - + var = name_to_var[var_name] if node.partitioner: pc = PartitionerConfig(partition_str=node.partitioner) for i, part in enumerate(node.part_config): - part_helper = PartHelper(i, var, pc) - synchronizer = getattr(part, part.WhichOneof('synchronizer')) - compressor = getattr(synchronizer, 'compressor', None) - reduction_destination = getattr(synchronizer, 'reduction_destination', None) - device = _resolve_device_address(reduction_destination if reduction_destination else var.device, - resource.device_resolver) - - part_meta = Partition(name=part.var_name, - is_sparse=var_helper.is_sparse, - shape=part_helper.shape, - dtype=var_helper.dtype, - synchronizer=synchronizer, - part_id=i, - num_shards=pc.num_shards, - partition_str=pc.partition_str, - original_shape=var_helper.shape, - compressor=compressor, - device=device) - meta[part_meta.name] = part_meta + part_item = PartItem(var, graph_item, i, pc, part) + name_to_items[part_item.name] = part_item else: - synchronizer = getattr(node, node.WhichOneof('synchronizer')) - compressor = getattr(synchronizer, 'compressor', None) - reduction_destination = getattr(synchronizer, 
'reduction_destination', None) - device = _resolve_device_address(reduction_destination if reduction_destination else var.device, - resource.device_resolver) - - var_meta = Var(name=var_name, - is_sparse=var_helper.is_sparse, - shape=var_helper.shape, - dtype=var_helper.dtype, - synchronizer=synchronizer, - compressor=compressor, - device=device) - meta[var_meta.name] = var_meta - return meta, resource - - # def extract_pre_feature_legacy(self, strategy): - # """Don't use now!!!""" - # meta = defaultdict() - # for node in strategy.node_config: - # var_name = node.var_name - # for var_op, var in self._original_graph_item.trainable_var_op_to_var.items(): - # if var.name == var_name: - # break - # var_op_name = var_op.name - # var_helper = VariableHelper(var, self._original_graph_item) - # synchronizer = getattr(node, node.WhichOneof('synchronizer')) - # compressor = getattr(synchronizer, 'compressor', None) - # if compressor is not None: - # compressor = AllReduceSynchronizer.Compressor.Name(compressor) - # reduction_destinations = getattr(synchronizer, 'reduction_destinations', None) - # if not reduction_destinations or len(reduction_destinations) <= 1: - # # this variable is not partitioned - # device = reduction_destinations[0] if reduction_destinations else var.device - # var_meta = Var(name=var_name, - # is_sparse=var_helper.is_sparse, - # shape=var_helper.shape, - # dtype=var_helper.dtype, - # synchronizer=synchronizer, - # compressor=compressor, - # device=device) - # meta[var_meta.name] = var_meta - # else: - # # this variable is partitioned - # num_partitions = len(reduction_destinations) - # partition_list = [1] * len(var_helper.shape) - # partition_list[0] = num_partitions - # pc = PartitionerConfig(partition_list=partition_list) - # for i, device in enumerate(reduction_destinations): - # part_helper = PartHelper(i, var, pc) - # part_meta = Partition(name='{}/part_{}:0'.format(var_op_name, i), - # is_sparse=var_helper.is_sparse, - # shape=part_helper.shape, - # dtype=var_helper.dtype, - # synchronizer=synchronizer, - # part_id=i, - # partition_str=pc.partition_str, - # original_shape=var_helper.shape, - # compressor=compressor, - # device=device) - # meta[part_meta.name] = part_meta - # return meta - - def setup_resource(self, resource_spec: ResourceSpec): - cluster = SSHCluster(resource_spec) - device_resolver = DeviceResolver(cluster) - graph_replicas = [_resolve_device_address(k, device_resolver) for k, v in resource_spec.gpu_devices] - # bandwidth - network_bandwidth = self.network_bandwidth(resource_spec, device_resolver) - # Other information - cpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.cpu_devices] - gpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.gpu_devices] - max_num_local_replica = _max_num_local_replica(graph_replicas, cluster) - total_num_local_replica = len(graph_replicas) - worker_num_replicas = [_num_local_replica(cpu_worker, graph_replicas, cluster) for cpu_worker in cpu_worker_list] - resource = Resource(cluster=cluster, - device_resolver=device_resolver, - graph_replicas=graph_replicas, - network_bandwidth=network_bandwidth, - cpu_worker_list=cpu_worker_list, - gpu_worker_list=gpu_worker_list, - max_num_local_replica=max_num_local_replica, - total_num_local_replica=total_num_local_replica, - worker_num_replicas=worker_num_replicas) - return resource - - @staticmethod - def network_bandwidth(resource_spec: ResourceSpec, device_resolver: DeviceResolver): - 
"""Calculates all P2P network bandwidths between nodes in the cluster.""" - devices = [device for device, _ in resource_spec.devices] - resolved_devices = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.devices] - gpu_cpu_bw = 10000. # hardcode for now - network_bandwidth = {} # key: - for i in range(len(devices)): - if resolved_devices[i] not in network_bandwidth: - network_bandwidth[resolved_devices[i]] = {} - for j in range(i, len(devices)): - if resolved_devices[j] not in network_bandwidth: - network_bandwidth[resolved_devices[j]] = {} - ip_i = devices[i].split(':')[0] - ip_j = devices[j].split(':')[0] - if ip_i != ip_j: - network_bandwidth[resolved_devices[i]][resolved_devices[j]] \ - = GIGABITS * resource_spec.network_bandwidth[ip_i] - network_bandwidth[resolved_devices[j]][resolved_devices[i]] \ - = GIGABITS * resource_spec.network_bandwidth[ip_j] - else: - network_bandwidth[resolved_devices[i]][resolved_devices[j]] = GIGABITS * gpu_cpu_bw - network_bandwidth[resolved_devices[j]][resolved_devices[i]] = GIGABITS * gpu_cpu_bw - - return network_bandwidth - - @staticmethod - def min_bandwitdh(worker_list, bandwidth): - min_bandwidth = INFINITY - num_workers = len(worker_list) - for i in range(num_workers): - for j in range(i, num_workers): - min_bandwidth = min(min_bandwidth, bandwidth[worker_list[j]][worker_list[i]]) - - @property - def original_graph_item_path(self): - return self._original_graph_item_path + var_item = VariableItem(var, graph_item, node) + name_to_items[var_item.name] = var_item + return name_to_items, resource_item diff --git a/autodist/simulator/predefined_simulator.py b/autodist/simulator/predefined_simulator.py index 973fbef..a419126 100644 --- a/autodist/simulator/predefined_simulator.py +++ b/autodist/simulator/predefined_simulator.py @@ -31,8 +31,8 @@ class PredefinedSimulator(SimulatorBase): """Simulates strategies for a given graph and resource spec.""" def __init__(self, - original_graph_item_path, - fetches=None, + graph_item=None, + resource_spec=None, batch_size=1, seq_len=1, get_coef=True, @@ -135,7 +135,7 @@ def create_features_v0(self, strategy: Strategy, resource_spec: ResourceSpec): def create_features(self, strategy: Strategy, resource_spec: ResourceSpec): # var_sync_time, vars, resource = self.predefined_sync_time(strategy, resource_spec) - vars, resource = self.extract_pre_feature(strategy=strategy, resource_spec=resource_spec) + vars, resource = self.preprocess(strategy=strategy, resource_spec=resource_spec) feature_keys = ['transmission', 'network_overhead', 'gpu_kernel_memory_latency'] device_ps_sync_time = {} @@ -178,7 +178,7 @@ def create_features(self, strategy: Strategy, resource_spec: ResourceSpec): def predefined_sync_time(self, strategy, resource_spec): """ graph_item: transformed graph item """ - vars, resource = self.extract_pre_feature(strategy=strategy, resource_spec=resource_spec) + vars, resource = self.preprocess(strategy=strategy, resource_spec=resource_spec) # Compute synchronization time for every var var_sync_time = {} for var_name, var in vars.items(): diff --git a/autodist/simulator/utils.py b/autodist/simulator/utils.py index 2febd63..b200007 100644 --- a/autodist/simulator/utils.py +++ b/autodist/simulator/utils.py @@ -26,8 +26,8 @@ from autodist.utils import logging from autodist.resource_spec import ResourceSpec from autodist.strategy.base import Strategy -from autodist.const import DEFAULT_RUNTIME_SERIALIZATION_DIR, DEFAULT_SERIALIZATION_DIR, \ - DEFAULT_STRATEGY_JSON_SERIALIZATION_DIR, 
DEFAULT_RESOURCE_SERIALIZATION_DIR +# from autodist.const import DEFAULT_RUNTIME_SERIALIZATION_DIR, DEFAULT_SERIALIZATION_DIR, \ +# DEFAULT_STRATEGY_JSON_SERIALIZATION_DIR, DEFAULT_RESOURCE_SERIALIZATION_DIR from autodist.kernel.device.resolver import DeviceResolver @@ -281,6 +281,7 @@ def read_trial_runs(): GIGABITS = np.float(1e+9) INFINITY = 1e+9 NUM_RUNS = 500 +GPU_TO_CPU_BANDWIDTH = 1000 # Gbps def pad_list(l, max_len): @@ -308,40 +309,40 @@ def _resolved_devices_on_diff_machine(device1, device2): return node1 != node2 -def _resolve_device_address(device: str, device_resolver: DeviceResolver): - # change real ip address to /job:worker/task:0 - if not device: - return device - parts = device.split(':') - if parts and parts[0] in device_resolver._address_to_tasks: - resolved_device = device_resolver._address_to_tasks[parts[0]][0] - resolved = '/job:{}/task:{}/device:'.format(resolved_device['job'], resolved_device['task']) - resolved = resolved + ':'.join(parts[-2:]) - return resolved - else: - raise ValueError("cannot resolve device: {} using device_resolver: {}".format( - device, device_resolver._address_to_tasks)) - - -def _num_local_replica(host, replicas, cluster): - # host: e.g., '/job:worker/task:0/device:CPU:0' - replica_devices = {device_spec.DeviceSpecV2.from_string(r) for r in replicas} - host_device = device_spec.DeviceSpecV2.from_string(host) - num_local_replica = sum(1 for d in replica_devices - if cluster.get_address_from_task(d.job, d.task) == - cluster.get_address_from_task(host_device.job, host_device.task)) - return num_local_replica - - -def _max_num_local_replica(replicas, cluster): - replica_devices = {device_spec.DeviceSpecV2.from_string(r) for r in replicas} - replica_hosts = {cluster.get_address_from_task(d.job, d.task) for d in replica_devices} - max_num_local_replica = 0 - for host in replica_hosts: - num_local_replica = sum(1 for d in replica_devices - if cluster.get_address_from_task(d.job, d.task) == host) - max_num_local_replica = max(max_num_local_replica, num_local_replica) - return max_num_local_replica +# def _resolve_device_address(device: str, device_resolver: DeviceResolver): +# # change real ip address to /job:worker/task:0 +# if not device: +# return device +# parts = device.split(':') +# if parts and parts[0] in device_resolver._address_to_tasks: +# resolved_device = device_resolver._address_to_tasks[parts[0]][0] +# resolved = '/job:{}/task:{}/device:'.format(resolved_device['job'], resolved_device['task']) +# resolved = resolved + ':'.join(parts[-2:]) +# return resolved +# else: +# raise ValueError("cannot resolve device: {} using device_resolver: {}".format( +# device, device_resolver._address_to_tasks)) + + +# def _num_local_replica(host, replicas, cluster): +# # host: e.g., '/job:worker/task:0/device:CPU:0' +# replica_devices = {device_spec.DeviceSpecV2.from_string(r) for r in replicas} +# host_device = device_spec.DeviceSpecV2.from_string(host) +# num_local_replica = sum(1 for d in replica_devices +# if cluster.get_address_from_task(d.job, d.task) == +# cluster.get_address_from_task(host_device.job, host_device.task)) +# return num_local_replica +# +# +# def _max_num_local_replica(replicas, cluster): +# replica_devices = {device_spec.DeviceSpecV2.from_string(r) for r in replicas} +# replica_hosts = {cluster.get_address_from_task(d.job, d.task) for d in replica_devices} +# max_num_local_replica = 0 +# for host in replica_hosts: +# num_local_replica = sum(1 for d in replica_devices +# if cluster.get_address_from_task(d.job, d.task) == host) +# 
max_num_local_replica = max(max_num_local_replica, num_local_replica) +# return max_num_local_replica def _strip_var_name(name): diff --git a/autodist/strategy/__init__.py b/autodist/strategy/__init__.py index 3be1c34..fe6a366 100644 --- a/autodist/strategy/__init__.py +++ b/autodist/strategy/__init__.py @@ -25,3 +25,4 @@ from .partitioned_all_reduce_strategy import PartitionedAR from .random_axis_partition_all_reduce_strategy import RandomAxisPartitionAR from .uneven_partition_ps_strategy import UnevenPartitionedPS +# from .auto_strategy import AutoStrategy diff --git a/autodist/strategy/auto/default_constraint.py b/autodist/strategy/auto/default_constraint.py deleted file mode 100644 index e69de29..0000000 diff --git a/autodist/strategy/auto/item.py b/autodist/strategy/auto/item.py new file mode 100644 index 0000000..d2377e6 --- /dev/null +++ b/autodist/strategy/auto/item.py @@ -0,0 +1,463 @@ +# Copyright 2020 Petuum. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Helper classes and functions for automatic strategy generation.""" + +from enum import Enum + +from tensorflow.python.framework import ops, device_spec + +from autodist.kernel.common.utils import get_op_name, get_consumers +from autodist.kernel.device.resolver import DeviceResolver +from autodist.graph_item import cached_property +from autodist.strategy.base import byte_size_load_fn +from autodist.utils import logging +from autodist.cluster import SSHCluster +from autodist.simulator.utils import GPU_TO_CPU_BANDWIDTH, GIGABITS + + +class VarType(Enum): + SPARSE = 0 + DENSE = 1 + + +class VariableItem: + """Helper class to include meta information about a variable.""" + def __init__(self, + var, + graph_item, + node_config=None): + self.var = var + self.graph_item = graph_item + self._var_op_name = get_op_name(var.name) + self._grad = graph_item.var_op_name_to_grad_info[self._var_op_name][0] + + self._config = None + if node_config: + self.update_config(node_config) + else: + logging.warning('Item with name {} has empty config.'.format(self.name)) + + def update_config(self, config): + """ + Update the nodeconfig of this variable. + + Args: + config: + """ + assert not config + self._node_config = config + + @property + def var_type(self): + """ + Return the type of the variable (VarType.SPARSE or VarType.DENSE). + + Returns: + VarType + """ + return VarType.DENSE if isinstance(self._grad, ops.Tensor) else VarType.SPARSE + + @property + def name(self): + """ + Return the name of the variable. + + Returns: + String + """ + return self.var.name + + @property + def is_sparse(self): + """ + Return whether the variable is sparse. + + Returns: + Bool + """ + return True if self.var_type == VarType.SPARSE else False + + @property + def is_embedding(self): + """ + Return whether the variable corresponds to an embedding. + + Returns: + Bool + """ + # TODO (Hao): better way to determine is_embedding? 
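+        # Current heuristic: a variable whose op is consumed by a ResourceGather op
+        # is read through an embedding lookup, so it is treated as an embedding.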
+ for op in get_consumers(self.var.op): + if op.type == "ResourceGather": + return True + return False + + @property + def shape(self): + """ + Return the shape of the variable, or None if it does not emit a tensor (e.g. scalar). + + Returns: + List(int) + """ + return self.original_shape + + @property + def original_shape(self): + if self.var.initial_value.shape.ndims: + return self.var.initial_value.shape.as_list() + else: + return None + + @property + def size(self): + size = 1 + if self.shape: + for s in self.shape: + size *= s + return size + + @property + def original_size(self): + size = 1 + if self.original_shape: + for s in self.original_shape: + size *= s + return size + + @property + def size_to_transfer(self, batch_size_per_gpu=1, seq_len=1): + if not self.is_sparse: + return self.size + else: + if not self.shape: # scalar + return 1 + + emb_size = 1 + if len(self.shape) > 1: + # infer the embedding size from original shape + for i in range(1, len(self.original_shape)): + emb_size *= self.original_shape[i] + + sparse_data_size = batch_size_per_gpu * seq_len * emb_size + + # estimate the embedding of this partition simply using a proportional formula + return sparse_data_size * self.size / self.original_size + + @property + def partitionable_axes(self): + """ + Return the list of available axes that are legitimate to partition along. + + Returns: + List(int) + """ + valid_axes = [] + + # scalar + if not self.shape: + return valid_axes + + # Sparse variable can only be partition along the 0th axis in current implementation. + if self.is_sparse or self.is_embedding: + valid_axes = [0] + return valid_axes + for idx, dim in enumerate(self.shape): + if dim > 1: + valid_axes.append(idx) + return valid_axes + + @property + def byte_size(self): + """ + Return the byte size of the variable. + + Returns: + float + """ + return float(byte_size_load_fn(self.var)) + + @property + def dtype(self): + """ + Return the dtype of the variable. + + Returns: + dtype + """ + return self.var.dtype + + @property + def synchronizer(self): + """ + Return the synchronizer protobuf in the config of this variable. + + Returns: + NodeConfig + """ + if not self._node_config: + raise ValueError('Node config is unset.') + if self._node_config.partitioner: + logging.warning('This variable will be partitioned') + return None + return getattr(self._node_config, self._node_config.WhichOneOf('synchronizer')) + + @property + def compressor(self): + """ + Return the compressor in the node config of this variable. + + Returns: + Compressor type. + """ + if not self._node_config: + raise ValueError('Node config is unset.') + if self._node_config.partitioner: + logging.warning('This variable will be partitioned') + return None + return getattr(self.synchronizer, 'compressor', None) + + @property + def reduction_destination(self): + """ + Return the reduction_destination in the node config of this variable. + + Returns: + Reduction destinaiton. 
+ """ + if not self._node_config: + raise ValueError('Node config is unset.') + if self._node_config.partitioner: + logging.warning('This variable will be partitioned') + return None + return getattr(self.synchronizer, 'reduction_destination', None) + + def device(self, resolver): + device_str = self.reduction_destination if self.reduction_destination else self.var.device + if device_str: + device_str = resolver.resolve_to_device_str(device_str) + return device_str + +class PartItem(VariableItem): + """Helper class to include meta information about a variable partition.""" + def __init__(self, + var, + graph_item, + part_idx, + pc, + part_config=None): + super(PartItem, self).__init__(var, graph_item, part_config) + + self.part_idx = part_idx + self.pc = pc + + @property + def name(self): + """ + Return the name of this partition. + + Returns: + String + """ + name = '{}/part_{}:0'.format(get_op_name(self.var.name), self.part_idx) + return name + + @property + def partition_str(self): + return self.pc.partition_str + + @property + def shape(self): + """ + Return the shape of this partition. + + Returns: + List(int) + + """ + shape = self.original_shape + if shape: + dim_size = shape[self.pc.axis] // self.pc.num_shards + extras = shape[self.pc.axis] % self.pc.num_shards + if self.part_idx < extras: + dim_size += 1 + shape[self.pc.axis] = dim_size + return shape + + @property + def partitionable_axes(self): + """ + Return the list of available axes that are legitimate to partition along. + + Returns: + None: because this is a partition (not allowed to be partitioned further). + """ + return [] + + @property + def byte_size(self): + """ + Return the byte size of this partition. + + Returns: + float + """ + return float(byte_size_load_fn(self.var)) \ + * float(self.shape[self.pc.axis]) / float(self.original_shape[self.pc.axis]) + + @property + def synchronizer(self): + """ + + Returns: + + """ + if not self._node_config: + raise ValueError('Node config is unset.') + if not self._node_config.partitioner: + raise ValueError('Partitioner field is empty for a variable partition.') + return getattr(self._node_config, self._node_config.WhichOneOf('synchronizer')) + + @property + def compressor(self): + """ + Return the compressor in the node config of this variable partition. + + Returns: + Compressor. + """ + if not self._node_config: + raise ValueError('Node config is unset.') + if not self._node_config.partitioner: + raise ValueError('Partitioner field is empty for a variable partition.') + return getattr(self.synchronizer, 'compressor', None) + + @property + def reduction_destination(self): + """ + Return the reduction_destination in the node config of this variable partition. + + Returns: + Reduction destination. + """ + if not self._node_config: + raise ValueError('Node config is unset.') + if not self._node_config.partitioner: + logging.warning('Partitioner field is empty for a variable partition.') + return None + return getattr(self.synchronizer, 'reduction_destination', None) + + +class ResourceItem: + """ResourceItem. + + Helper class that includes meta information about a resource spec. All addresses are resolved (in TF format). + + TODO(zhisbug): merge ResourceItem class with ResourceSpec. + """ + + def __init__(self, resource_spec): + self._resource_spec = resource_spec + self._cluster = SSHCluster(resource_spec) + self._device_resolver = DeviceResolver(self._cluster) + + @property + def replicas(self): + """Return the list of replicas in the format of TF device string, e.g. 
job:worker/task:0/device:gpu:0."""
+        device_strs = [k for k, _ in self._resource_spec.devices]
+        return self._device_resolver.resolve_to_device_str(device_strs)
+
+    @property
+    def gpu_replicas(self):
+        """
+        Return the list of GPU replicas in the format of TF device string, e.g. job:worker/task:0/device:gpu:0.
+
+        Returns:
+            List(string)
+        """
+        # device_str is autodist device string, e.g. 192.168.0.1:CPU:0
+        device_strs = [k for k, _ in self._resource_spec.gpu_devices]
+        return self._device_resolver.resolve_to_device_str(device_strs)
+
+    @property
+    def cpu_replicas(self):
+        """
+        Return the list of CPU replicas in the format of TF device string, e.g. job:worker/task:0/device:cpu:0.
+
+        Returns:
+            List(string)
+        """
+        device_strs = [k for k, _ in self._resource_spec.cpu_devices]
+        return self._device_resolver.resolve_to_device_str(device_strs)
+
+    @property
+    def total_num_gpu_replica(self):
+        return len(self.gpu_replicas)
+
+    def num_local_gpu_replica(self, host):
+        """
+        Return the number of GPU replicas on a TF host address, e.g. '/job:worker/task:0/device:CPU:0'.
+
+        Args:
+            host: TF host address, e.g. '/job:worker/task:0/device:CPU:0'
+
+        Returns:
+            int
+        """
+        gpu_device_specs = {device_spec.DeviceSpecV2.from_string(d) for d in self.gpu_replicas}
+        num = 0
+        host_device_spec = device_spec.DeviceSpecV2.from_string(host)
+        for d in gpu_device_specs:
+            if self._cluster.get_address_from_task(d.job, d.task) \
+                    == self._cluster.get_address_from_task(host_device_spec.job, host_device_spec.task):
+                num += 1
+        return num
+
+    @property
+    def max_num_local_gpu_replica(self):
+        """Return the max number of local gpu replicas on the cluster."""
+        return max([self.num_local_gpu_replica(host) for host in self.cpu_replicas])
+
+    @cached_property
+    def p2p_bandwidth(self):
+        """Calculates P2P network bandwidth between nodes in the cluster.
+
+        Note that the result is NOT necessarily symmetric: bw[d1][d2] may differ from bw[d2][d1].
+        """
+        bw = {}  # key: (device1, device2)
+        devices = [device for device, _ in self._resource_spec.devices]
+        resolved_devices = self.replicas
+
+        for i in range(len(self.replicas)):
+            ip_i = devices[i].split(':')[0]
+            d_i = resolved_devices[i]
+            if d_i not in bw:
+                bw[d_i] = {}
+            for j in range(i, len(self.replicas)):
+                ip_j = devices[j].split(':')[0]
+                d_j = resolved_devices[j]
+                if d_j not in bw:
+                    bw[d_j] = {}
+                if ip_i != ip_j:
+                    bw[d_i][d_j] = GIGABITS * self._resource_spec.network_bandwidth[ip_i]
+                    bw[d_j][d_i] = GIGABITS * self._resource_spec.network_bandwidth[ip_j]
+                else:
+                    bw[d_i][d_j] = GIGABITS * GPU_TO_CPU_BANDWIDTH
+                    bw[d_j][d_i] = GIGABITS * GPU_TO_CPU_BANDWIDTH
+        return bw
+
+    @cached_property
+    def min_bandwidth(self):
+        """Return the minimum bandwidth (bottleneck) of all p2p connections on this cluster."""
+        return min([min(v.values()) for v in self.p2p_bandwidth.values()])
diff --git a/autodist/strategy/auto/sample_util.py b/autodist/strategy/auto/sample_util.py
deleted file mode 100644
index 2547304..0000000
--- a/autodist/strategy/auto/sample_util.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Copyright 2020 Petuum. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Sample utility functions.""" - -import numpy as np - - -def uniform_sample_by_choices(choices): - """ - Uniformly sample an option from a list of options. - - Args: - choices (list): a list of values to be sampled from. - - Returns: - choice: the sampled value. - - """ - assert choices - p = np.random.uniform() - t = 1.0 / len(choices) - sample = choices[0] - for i, c in enumerate(choices): - if p < t * (i+1): - sample = c - break - return sample - - -def binary_sample(boundary=0.5): - p = np.random.uniform() - if p < boundary: - return True - else: - return False - - -def sample_merge_group(num_group, num_candidates): - - def is_valid(assignment): - unique_assignment = np.unique(assignment) - if unique_assignment.shape[0] == num_group: - return True - return False - - assignment = np.random.randint(1, num_group+1, [num_candidates]) - while not is_valid(assignment): - assignment = np.random.randint(1, num_group+1, [num_candidates]) - return assignment diff --git a/autodist/strategy/auto/strategy_sampler.py b/autodist/strategy/auto/strategy_sampler.py index a317089..1ebb76e 100644 --- a/autodist/strategy/auto/strategy_sampler.py +++ b/autodist/strategy/auto/strategy_sampler.py @@ -16,170 +16,19 @@ from collections import OrderedDict -from enum import Enum -from tensorflow.python.framework import ops +import numpy as np -from autodist.kernel.common.utils import get_op_name, get_consumers +from autodist.kernel.common.utils import get_op_name from autodist.kernel.partitioner import PartitionerConfig from autodist.proto import strategy_pb2, synchronizers_pb2 -from autodist.strategy.base import Strategy, StrategyBuilder, byte_size_load_fn +from autodist.strategy.base import Strategy +from autodist.strategy.auto.item import VariableItem, PartItem from autodist.strategy.auto.ps_load_balancer import greedy_load_balancer, christy_load_balancer from autodist.strategy.auto.ar_group_assigner import chunk_group_assigner, christy_group_assigner, \ ordered_balanced_group_assigner -from autodist.strategy.auto import sample_util from autodist.const import MAX_INT32 -class VarType(Enum): - SPARSE = 0 - DENSE = 1 - - -class VariableHelper: - """Helper class to include meta information about a variable.""" - def __init__(self, var, graph_item): - self.var = var - self.graph_item = graph_item - self._var_op_name = get_op_name(var.name) - self._grad = graph_item.var_op_name_to_grad_info[self._var_op_name][0] - - @property - def var_type(self): - """ - Return the type of the variable (VarType.SPARSE or VarType.DENSE). - - Returns: - VarType - """ - return VarType.DENSE if isinstance(self._grad, ops.Tensor) else VarType.SPARSE - - @property - def is_sparse(self): - """ - Return whether the variable is sparse. - - Returns: - Bool - """ - return True if self.var_type == VarType.SPARSE else False - - @property - def is_embedding(self): - """ - Return whether the variable corresponds to an embedding. - - Returns: - Bool - """ - # TODO (Hao): better way to determine is_embedding? - for op in get_consumers(self.var.op): - if op.type == "ResourceGather": - return True - return False - - @property - def shape(self): - """ - Return the shape of the variable, or None if it does not emit a tensor (e.g. scalar). 
- - Returns: - List(int) - """ - if self.var.initial_value.shape.ndims: - return self.var.initial_value.shape.as_list() - else: - return None - - @property - def partitionable_axes(self): - """ - Return the list of available axes that are legitimate to partition along. - - Returns: - List(int) - """ - valid_axes = [] - - # scalar - if not self.shape: - return valid_axes - - # Sparse variable can only be partition along the 0th axis in current implementation. - if self.is_sparse or self.is_embedding: - valid_axes = [0] - return valid_axes - for idx, dim in enumerate(self.shape): - if dim > 1: - valid_axes.append(idx) - return valid_axes - - @property - def byte_size(self): - """ - Return the byte size of the variable. - - Returns: - float - """ - return float(byte_size_load_fn(self.var)) - - @property - def dtype(self): - """ - Return the dtype of the variable. - - Returns: - dtype - """ - return self.var.dtype - - -class PartHelper: - """Helper class to include meta information about a variable partition.""" - def __init__(self, part_idx, var, pc): - self.var = var - self.part_idx = part_idx - self.pc = pc - - @property - def shape(self): - """ - Return the shape of this partition. - - Returns: - List(int) - - """ - shape = self.var.initial_value.shape.as_list() - dim_size = shape[self.pc.axis] // self.pc.num_shards - extras = shape[self.pc.axis] % self.pc.num_shards - if self.part_idx < extras: - dim_size += 1 - shape[self.pc.axis] = dim_size - return shape - - @property - def var_shape(self): - """ - Return the shape of the original value this part belonged to. - - Returns: - List(int) - """ - return self.var.initial_value.shape.as_list() - - @property - def byte_size(self): - """ - Return the byte size of this partition. - - Returns: - float - """ - return float(byte_size_load_fn(self.var)) \ - * float(self.shape[self.pc.axis]) / float(self.var_shape[self.pc.axis]) - - class RandomStrategySampler(): """ Random Strategy Sampler. @@ -192,7 +41,7 @@ def __init__(self, space, heuristics): Args: space (dict): the strategy space that the random strategy should be drawn from. An example of the space - can be found at TODO(Hao). + can be found at heuristics (dict): heuristics used to guide the random sampling process. """ if not space: @@ -201,7 +50,6 @@ def __init__(self, space, heuristics): raise ValueError('Heuristic to guide strategy sampling is not provided.') self.space = space self.heuristics = heuristics - self.helpers = {} def build(self, graph_item, resource_spec): """Generate a randomized strategy given model and resource spec.""" @@ -210,41 +58,43 @@ def build(self, graph_item, resource_spec): # number of graph replica is equal to number of GPU devices expr.graph_config.replicas.extend([k for k, v in resource_spec.gpu_devices]) variables = graph_item.trainable_var_op_to_var.values() + name_to_item = OrderedDict() # Perform MCMC to generate each node configs node_config = [] for var in variables: - var_helper = VariableHelper(var, graph_item) - self.helpers[var_helper.var.name] = var_helper + var_item = VariableItem(var, graph_item) + name_to_item[var_item.name] = var_item node = strategy_pb2.Strategy.Node() - node.var_name = var_helper.var.name + node.var_name = var_item.name # Step 1: determine whether or not to partition # TODO(Hao): some factor is not considered, e.g. 
number of reduction_device_names - maybe_partition = sample_if_partition(var_helper, resource_spec, self.space, self.heuristics) + maybe_partition = sample_if_partition(var_item, resource_spec, self.space, self.heuristics) # Step 2.1: if not partition, sample a synchronizer type for it if not maybe_partition: # no partition - sample_var_synchronizer(node, var_helper, resource_spec, self.space) + sample_var_synchronizer(node, var_item, resource_spec, self.space) else: # Step 2.2: else partition # Step 2.2.1: sample a partitioner config - pc = sample_partition_config(var_helper, resource_spec, self.space, self.heuristics) + pc = sample_partition_config(var_item, resource_spec, self.space, self.heuristics) node.partitioner = pc.partition_str # step 2.2.2: sample a synchronizer type for each partition parts = [] for i in range(pc.num_shards): part = strategy_pb2.Strategy.Node() + part_item = PartItem(var, graph_item, i, pc) part.var_name = '{}/part_{}:0'.format(get_op_name(var.name), i) - self.helpers[part.var_name] = PartHelper(i, var, pc) + name_to_item[part.var_name] = part_item parts.append(part) - sample_parts_synchronizers(parts, var_helper, resource_spec, self.space, self.heuristics) + sample_parts_synchronizers(parts, var_item, resource_spec, self.space, self.heuristics) node.part_config.extend(parts) node_config.append(node) # Step 3: Post-assign group or placement. - sample_group_and_reduction_destinations(node_config, resource_spec, self.helpers, self.heuristics) + sample_group_and_reduction_destinations(node_config, resource_spec, name_to_item, self.heuristics) expr.node_config.extend(node_config) self._reset() @@ -255,12 +105,12 @@ def _reset(self): self.helpers = {} -def sample_if_partition(var_helper, resource_spec, space, heuristics): +def sample_if_partition(var_item, resource_spec, space, heuristics): """ Sample a bool value determining whether to partition a variable or not. Args: - var_helper: the variable helper corresponded to the variable of interest. + var_item: the variable item. resource_spec: the target cluster spec. space: the space argument controlling where to sample from. heuristics: the heuristics argument guiding the sampling process. @@ -275,9 +125,9 @@ def sample_if_partition(var_helper, resource_spec, space, heuristics): return False # intersection of variable's partitonable axis and global constraints - if var_helper.partitionable_axis: - if space['partitionable_axis']: - a = set(var_helper.partitionable_axis) & set(space['partitionable_axis']) + if var_item.partitionable_axes: + if space['partitionable_axes']: + a = set(var_item.partitionable_axes) & set(space['partitionable_axes']) if len(a) < 1: return False else: @@ -286,19 +136,19 @@ def sample_if_partition(var_helper, resource_spec, space, heuristics): # lower bound for abandoning partitioning lb = heuristics['maybe_partition_bounds'][0] ub = heuristics['maybe_partition_bounds'][1] - if var_helper.byte_size <= lb: + if var_item.byte_size <= lb: return False - if var_helper.byte_size >= ub: + if var_item.byte_size >= ub: return True assert (len(space['maybe_partition']) == 2) if heuristics['maybe_partition_by_size']: # By variable size -- a large variable has a higher chance to be partitioned # TODO (Hao): MAX_INT32 is too large, reconsider later... 
- chance = float(var_helper.byte_size - lb) / float(ub - lb) - return sample_util.binary_sample(boundary=chance) + chance = float(var_item.byte_size - lb) / float(ub - lb) + return binary_sample(boundary=chance) else: - return sample_util.uniform_sample_by_choices(space['maybe_partition']) + return uniform_sample_by_choices(space['maybe_partition']) def sample_var_synchronizer(node, var_helper, resource_spec, space): @@ -314,7 +164,7 @@ def sample_var_synchronizer(node, var_helper, resource_spec, space): """ # We ALWAYS use PS for sparse variables synchronizer_type = 'PS' if var_helper.var_type == VarType.SPARSE \ - else sample_util.uniform_sample_by_choices(space['synchronizer_types']) + else uniform_sample_by_choices(space['synchronizer_types']) if synchronizer_type == 'PS': node.PSSynchronizer.sync = True # we don't consider async at this moment node.PSSynchronizer.staleness = 0 @@ -345,10 +195,10 @@ def sample_parts_synchronizers(parts, var_helper, resource_spec, space, heuristi synchronizer_types = ['PS'] * len(parts) else: if heuristics['same_synchronizer_for_parts']: - type = sample_util.uniform_sample_by_choices(space['synchronizer_types']) + type = uniform_sample_by_choices(space['synchronizer_types']) synchronizer_types = [type] * len(parts) else: - synchronizer_types = [sample_util.uniform_sample_by_choices(space['synchronizer_types']) + synchronizer_types = [uniform_sample_by_choices(space['synchronizer_types']) for part in parts] for i, part in enumerate(parts): if synchronizer_types[i] == 'PS': @@ -378,13 +228,13 @@ def sample_partition_config(var_helper, resource_spec, space, heuristics): """ # Arion only support partitioning along one axis -- we first sample a partition axis, # then sample the number of partitions along that axis, and obtain the partition config. - assert len(var_helper.partitionable_axis) > 0, 'No partition axis available' + assert len(var_helper.partitionable_axes) > 0, 'No partition axis available' # sample partition axis # TODO(Hao): some heursitics here available? 
- valid_axis = var_helper.partitionable_axis - if space['partitionable_axis']: - valid_axis = list(set(valid_axis) & set(space['partitionable_axis'])) - partition_axis = sample_util.uniform_sample_by_choices(valid_axis) + valid_axis = var_helper.partitionable_axes + if space['partitionable_axes']: + valid_axis = list(set(valid_axis) & set(space['partitionable_axes'])) + partition_axis = uniform_sample_by_choices(valid_axis) # sample how many partition to go num_nodes = resource_spec.num_cpus @@ -405,7 +255,7 @@ def sample_partition_config(var_helper, resource_spec, space, heuristics): raise ValueError('unseen num_partition_bounds config') # sample from [min_shards, max_shards] - num_shards = sample_util.uniform_sample_by_choices(range(min_shards, max_shards + 1)) + num_shards = uniform_sample_by_choices(list(range(min_shards, max_shards + 1))) # construct a PartitionerConfig (pc) partition_list = [1] * len(var_helper.shape) @@ -431,7 +281,7 @@ def sample_if_local_replication(local_replication_space, resource_spec): if resource_spec.num_gpus <= resource_spec.num_cpus: # meaning every machine has at most 1 GPU return False - return sample_util.uniform_sample_by_choices(local_replication_space) + return uniform_sample_by_choices(local_replication_space) def sample_ar_compressor(compressor_space): @@ -446,7 +296,7 @@ def sample_ar_compressor(compressor_space): Returns: """ # TODO(Hao): try to use all four options - return sample_util.uniform_sample_by_choices(compressor_space) + return uniform_sample_by_choices(compressor_space) def sample_group_and_reduction_destinations(node_config, resource_spec, helpers, heuristics): @@ -508,7 +358,7 @@ def sample_ps_reduction_destinations(node_config, ps_shards, resource_spec, help if not load_balancer: destinations = {} for shard_name in ps_shards: - destinations[shard_name] = sample_util.uniform_sample_by_choices(reduction_device_names) + destinations[shard_name] = uniform_sample_by_choices(reduction_device_names) elif load_balancer == 'greedy': destinations = greedy_load_balancer(ps_shards, resource_spec, helpers) elif load_balancer == 'christy': @@ -576,7 +426,7 @@ def sample_ar_groups(node_config, ar_shards, helpers, heuristics): assert chunk_size_or_num_group > 0, "chunk_size or num_groups need to > 1..." if merge_scheme in ['random', None]: - tmp_assignments = sample_util.sample_merge_group(chunk_size_or_num_group, len(ar_shards)) + tmp_assignments = sample_merge_group(chunk_size_or_num_group, len(ar_shards)) group_assignments = OrderedDict() for i, shard_name in enumerate(ar_shards): group_assignments[shard_name] = tmp_assignments[i] @@ -613,7 +463,7 @@ def sample_num_ar_groups(ar_shards, lb, ub): """ min_num_group = max(1, lb) max_num_group = min(len(ar_shards), ub) - num_group = sample_util.uniform_sample_by_choices(list(range(min_num_group, max_num_group + 1))) + num_group = uniform_sample_by_choices(list(range(min_num_group, max_num_group + 1))) return num_group @@ -627,7 +477,7 @@ def sample_chunk_size(num_ar_shards): Returns: """ - chunk_size = sample_util.uniform_sample_by_choices(list(range(1, num_ar_shards + 1))) + chunk_size = uniform_sample_by_choices(list(range(1, num_ar_shards + 1))) return chunk_size @@ -654,12 +504,56 @@ def assign_ar_group(node_config, ar_shards): synchronizer.group = ar_shards[node.var_name][1] +def uniform_sample_by_choices(choices): + """ + Uniformly sample an option from a list of options. + + Args: + choices (list): a list of values to be sampled from. + + Returns: + choice: the sampled value. 
+ + """ + assert choices + p = np.random.uniform() + t = 1.0 / len(choices) + sample = choices[0] + for i, c in enumerate(choices): + if p < t * (i+1): + sample = c + break + return sample + + +def binary_sample(boundary=0.5): + p = np.random.uniform() + if p < boundary: + return True + else: + return False + + +def sample_merge_group(num_group, num_candidates): + + def is_valid(assignment): + unique_assignment = np.unique(assignment) + if unique_assignment.shape[0] == num_group: + return True + return False + + assignment = np.random.randint(1, num_group+1, [num_candidates]) + while not is_valid(assignment): + assignment = np.random.randint(1, num_group+1, [num_candidates]) + return assignment + + default_space = { 'synchronizer_types': ['PS', 'AR'], 'maybe_partition': [True, False], 'compressor': ['HorovodCompressor', 'NoneCompressor', 'HorovodCompressorEF'], 'local_replication': [False], - 'partitionable_axis': [] + 'partitionable_axes': [] } diff --git a/autodist/strategy/auto_strategy.py b/autodist/strategy/auto_strategy.py index 5d6b78f..354d62d 100644 --- a/autodist/strategy/auto_strategy.py +++ b/autodist/strategy/auto_strategy.py @@ -52,4 +52,9 @@ def __init__(self): ) def build(self, graph_item, resource_spec): - return \ No newline at end of file + candidates = self.propose_n(graph_item, resource_spec, self._num_proposals) + + # Assess all candidates and simply pick the highest-scored one + features, scores = self._simulator.inference(candidates) + best_index = scores.index(min(scores)) + return candidates[best_index] diff --git a/examples/linear_regression.py b/examples/linear_regression.py index d14a3f8..4145626 100644 --- a/examples/linear_regression.py +++ b/examples/linear_regression.py @@ -7,12 +7,14 @@ from autodist import AutoDist from autodist.strategy import PS, PSLoadBalancing, PartitionedPS, AllReduce, Parallax +from autodist.strategy import AutoStrategy resource_spec_file = os.path.join(os.path.dirname(__file__), 'resource_spec.yml') def main(_): - autodist = AutoDist(resource_spec_file, AllReduce(128)) + # autodist = AutoDist(resource_spec_file, AllReduce(128)) + autodist = AutoDist(resource_spec_file, AutoStrategy()) TRUE_W = 3.0 TRUE_b = 2.0 diff --git a/tests/test_simulator.py b/tests/test_simulator.py new file mode 100644 index 0000000..f2aaeb1 --- /dev/null +++ b/tests/test_simulator.py @@ -0,0 +1,27 @@ +from autodist.simulator.utils import _resolve_device_address +from autodist.resource_spec import ResourceSpec +from autodist.cluster import SSHCluster +from autodist.kernel.device.resolver import DeviceResolver +from autodist.simulator.base import SimulatorBase +from autodist.simulator.utils import _resolve_device_address + +# def test_resolve_device_address(): +# resource_spec_file = '/home/hao.zhang/project/pycharm/autodist/examples/resource_spec.yml' +# rs = ResourceSpec(resource_spec_file) +# cluster = SSHCluster(rs) +# resolver = DeviceResolver(cluster) +# return True + +def test_resolve(): + resource_spec_file = '/home/hao.zhang/project/pycharm/autodist/examples/resource_spec.yml' + rs = ResourceSpec(resource_spec_file) + cluster = SSHCluster(rs) + resolver = DeviceResolver(cluster) + SimulatorBase.network_bandwidth(rs, resolver) + devices = [device for device, _ in rs.devices] + + resolved_devices_1 = [_resolve_device_address(device, resolver) for device, _ in rs.devices] + devices = resolver.resolve_to_device_str(devices) + + for d1, d2 in zip(resolved_devices_1, devices): + assert d1 == d2 \ No newline at end of file From 
15e491f1d1d5498cd00ddd82a15a1f63afae2ed0 Mon Sep 17 00:00:00 2001 From: Hao Zhang Date: Tue, 28 Jul 2020 02:45:26 -0400 Subject: [PATCH 07/11] update predefined simulator and linear simulator --- autodist/simulator/base.py | 11 +- autodist/simulator/linear_simulator.py | 374 +++++++++- autodist/simulator/predefined_simulator.py | 795 +++++++++++---------- autodist/strategy/auto/item.py | 7 +- 4 files changed, 814 insertions(+), 373 deletions(-) diff --git a/autodist/simulator/base.py b/autodist/simulator/base.py index 31e2d1a..19b965d 100644 --- a/autodist/simulator/base.py +++ b/autodist/simulator/base.py @@ -68,9 +68,10 @@ def simulate(self, strategy, graph_item=None, resource_spec=None, - checkpoint=None): + *args, + **kwargs): """ - Return simulated runtime cost given (Strategy, GraphItem, ResourceSpec) tuple. + Return simulated runtime cost given (strategy, graph_item, resource_spec) tuple. Args: strategy: @@ -84,8 +85,7 @@ def simulate(self, raise NotImplementedError() def inference(self, - features, - checkpoint=None): + features): """ Abstract method for simulator inference. @@ -117,9 +117,6 @@ def save_checkpoint(self, model, checkpoint): """ raise NotImplementedError() - def create_features(self, strategy: Strategy, resource_spec: ResourceSpec): - raise NotImplementedError() - def preprocess(self, strategy, graph_item=None, diff --git a/autodist/simulator/linear_simulator.py b/autodist/simulator/linear_simulator.py index 527d923..5dc2e6b 100644 --- a/autodist/simulator/linear_simulator.py +++ b/autodist/simulator/linear_simulator.py @@ -1,4 +1,4 @@ -# Copyright 2020 Petuum. All Rights Reserved. +# Copyright 2020 Petuum Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,10 +12,376 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Linear simulator.""" +"""Predefined simulator with linear model.""" +import pickle as pkl + +import tensorflow as tf +from tensorflow.python.eager import context + +from autodist.proto.synchronizers_pb2 import PSSynchronizer, AllReduceSynchronizer +from autodist.resource_spec import ResourceSpec from autodist.simulator.base import SimulatorBase +from autodist.simulator.utils import _resolved_devices_on_diff_machine, \ + get_dense_var_bits, get_sparse_var_bits +from autodist.strategy.base import Strategy + class LinearSimulator(SimulatorBase): - def __init__(self): - super(LinearSimulator, self).__init__() + """Simulates strategies for a given graph and resource spec.""" + + def __init__(self, + graph_item=None, + resource_spec=None, + batch_size=1, + seq_len=1, + get_coef=True, + checkpoint=None): + + super(PredefinedSimulator, self).__init__(original_graph_item_path=original_graph_item_path) + + print("It's using predefined simulator. 
batch_size_per_gpu is {}".format(batch_size))
+        self._fetches = None
+        self._batch_size_per_gpu = batch_size
+        self._seq_len = seq_len
+        self._get_coef = get_coef
+        self._checkpoint = checkpoint
+        self._weights = None
+        with context.eager_mode():
+            if self._checkpoint:
+                self._weights = self.load_checkpoint(self._checkpoint)
+
+    def simulate(self, strategy: Strategy, resource_spec: ResourceSpec, checkpoint=None):
+        """Return simulated runtime value."""
+        inputs = self.create_features(strategy, resource_spec)
+        with context.eager_mode():
+            cost = self.inference(inputs, checkpoint)
+        return cost
+
+    def inference(self, inputs, checkpoint=None):
+        if checkpoint is not None:
+            weights = self.load_checkpoint(checkpoint)
+        elif self._weights is not None:
+            weights = self._weights
+        else:
+            raise ValueError("No checkpoint provided in either initialization or inference.")
+
+        if not isinstance(inputs, tf.Tensor):
+            inputs = tf.reshape(tf.convert_to_tensor(inputs), [1, len(inputs)])
+
+        if len(weights) == 4:
+            W0, b0, W, b = weights
+            inputs = tf.nn.elu(tf.matmul(inputs, W0) + b0)
+            cost = tf.matmul(inputs, W) + b
+        elif len(weights) == 2:
+            W, b = weights
+            cost = tf.matmul(inputs, W) + b
+        else:
+            raise ValueError("Expected 2 or 4 weight arrays, got {}".format(len(weights)))
+        return cost
+
+    def load_checkpoint(self, checkpoint=None):
+        if checkpoint is None:
+            if self._checkpoint is not None:
+                checkpoint = self._checkpoint
+            else:
+                raise ValueError("checkpoint is None: {}".format(checkpoint))
+        self._weights = pkl.load(open(checkpoint, 'rb'))
+        # self._weights = json.load(open(checkpoint, 'r'))
+        print("Load checkpoint: ")
+        print(self._weights)
+        return self._weights
+
+    def save_checkpoint(self, model, checkpoint):
+        pkl.dump(model, open(checkpoint, 'wb'))
+        self._checkpoint = checkpoint
+        self._weights = model
+
+    def create_features_v0(self, strategy: Strategy, resource_spec: ResourceSpec):
+        var_sync_time, vars, resource = self.predefined_sync_time(strategy, resource_spec)
+
+        # Add up sync time per device to find the slowest server time. 
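+        # PS-synchronized variables are accumulated per server device (the busiest
+        # server dominates), while AllReduce-synchronized variables are kept per variable.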
+ feature_keys = ['transmission', 'network_overhead', 'gpu_kernel_memory_latency'] + device_ps_sync_time = {} + var_ar_sync_time = {} + for var_name, sync_time in var_sync_time.items(): + if isinstance(vars[var_name].synchronizer, PSSynchronizer): + device = vars[var_name].device + if device not in device_ps_sync_time: + device_ps_sync_time[device] = {key: 0.0 for key in feature_keys} + for key in feature_keys: + device_ps_sync_time[device][key] += sync_time[0][key] + sync_time[1][key] + + else: # AllReduce + if var_name not in var_ar_sync_time: + var_ar_sync_time[var_name] = {key: 0.0 for key in feature_keys} + for key in feature_keys: + var_ar_sync_time[var_name][key] += sync_time[key] + + max_device_ps_sync_time = {key: 0.0 for key in feature_keys} + sum_device_ps_sync_time = {key: 0.0 for key in feature_keys} + sum_var_ar_sync_time = {key: 0.0 for key in feature_keys} + for key in feature_keys: + max_device_ps_sync_time[key] = max([d[key] for d in device_ps_sync_time.values()] or [0.0]) + sum_device_ps_sync_time[key] = sum([d[key] for d in device_ps_sync_time.values()] or [0.0]) + sum_var_ar_sync_time[key] = sum([d[key] for d in var_ar_sync_time.values()] or [0.0]) + + feat = [max_device_ps_sync_time[key] for key in feature_keys] \ + + [sum_device_ps_sync_time[key] for key in feature_keys] \ + + [sum_var_ar_sync_time[key] for key in feature_keys] + + return feat + + def create_features(self, strategy: Strategy, resource_spec: ResourceSpec): + # var_sync_time, vars, resource = self.predefined_sync_time(strategy, resource_spec) + + vars, resource = self.preprocess(strategy=strategy, resource_spec=resource_spec) + + feature_keys = ['transmission', 'network_overhead', 'gpu_kernel_memory_latency'] + device_ps_sync_time = {} + group_ar_sync_time = {} + + for var_name, var in vars.items(): + if isinstance(var.synchronizer, PSSynchronizer): + sync_time = self.var_ps_time(var, resource) + device = vars[var_name].device + if device not in device_ps_sync_time: + device_ps_sync_time[device] = {key: 0.0 for key in feature_keys} + for key in feature_keys: + device_ps_sync_time[device][key] += sync_time[0][key] + sync_time[1][key] + elif isinstance(var.synchronizer, AllReduceSynchronizer): + sync_time = self.var_ar_time(var, resource) + var_group = sync_time['group'] + if var_group not in group_ar_sync_time: + group_ar_sync_time[var_group] = {key: 0.0 for key in feature_keys} + for key in feature_keys: + group_ar_sync_time[var_group][key] += sync_time[key] + else: + raise ValueError('{}'.format(type(var.synchronizer))) + + max_device_ps_sync_time = {key: 0.0 for key in feature_keys} + sum_device_ps_sync_time = {key: 0.0 for key in feature_keys} + max_group_ar_sync_time = {key: 0.0 for key in feature_keys} + sum_group_ar_sync_time = {key: 0.0 for key in feature_keys} + for key in feature_keys: + max_device_ps_sync_time[key] = max([d[key] for d in device_ps_sync_time.values()] or [0.0]) + sum_device_ps_sync_time[key] = sum([d[key] for d in device_ps_sync_time.values()] or [0.0]) + max_group_ar_sync_time[key] = max([d[key] for d in group_ar_sync_time.values()] or [0.0]) + sum_group_ar_sync_time[key] = sum([d[key] for d in group_ar_sync_time.values()] or [0.0]) + + feat = [max_device_ps_sync_time[key] for key in feature_keys] \ + + [sum_device_ps_sync_time[key] for key in feature_keys] \ + + [max_group_ar_sync_time[key] for key in feature_keys] \ + + [sum_group_ar_sync_time[key] for key in feature_keys] + + return feat + + def predefined_sync_time(self, strategy, resource_spec): + """ graph_item: 
transformed graph item """ + vars, resource = self.preprocess(strategy=strategy, resource_spec=resource_spec) + # Compute synchronization time for every var + var_sync_time = {} + for var_name, var in vars.items(): + if isinstance(var.synchronizer, PSSynchronizer): + var_sync_time[var_name] = self.var_ps_time(var, resource) + elif isinstance(var.synchronizer, AllReduceSynchronizer): + var_sync_time[var_name] = self.var_ar_time(var, resource) + else: + raise ValueError('{}'.format(type(var.synchronizer))) + return var_sync_time, vars, resource + + def var_ps_time(self, var, resource, network_overhead=0.0, gpu_kernel_memory_latency=0.0): + """Compute synchronization time of a variable in PS strategy.""" + def _helper(worker_list, worker_num_replicas=None): + if worker_num_replicas is None: + worker_num_replicas = [1.0] * len(worker_list) + + this_server_time = 0 + # network transfer: sum up all workers time. equals to the time cost of this server. + # TODO(Hao): didn't consider any parallelization among partitions + for k, worker in enumerate(worker_list): + if _resolved_devices_on_diff_machine(var.device, worker): + if var.is_sparse: + this_worker_size = get_sparse_var_bits(var_size_to_transfer) * worker_num_replicas[k] + else: + this_worker_size = get_dense_var_bits(var_size_to_transfer, var.dtype) + this_server_time += this_worker_size / resource.network_bandwidth[var.device][worker] + + if self._get_coef: + return { + 'transmission': this_server_time, + 'network_overhead': len(worker_list), + 'gpu_kernel_memory_latency': resource.max_num_local_replica, + 'constant': 1.0, + # possible affecting factors. + 'var_name': var.name, + 'strategy': 'ps', + 'local_proxy': var.synchronizer.local_replication, + 'is_sparse': var.is_sparse, + 'size_to_transfer': var_size_to_transfer, + 'dtype': str(var.dtype), + # 'server_list': [partition.to_dict() for partition in server_list], + 'worker_list': worker_list, + 'cpu_worker_list': resource.cpu_worker_list, + 'gpu_worker_list': resource.gpu_worker_list, + 'worker_num_replicas': worker_num_replicas, + 'max_num_local_replica': resource.max_num_local_replica, + 'is_ps': True, + } + else: + return this_server_time + len(worker_list) * network_overhead + \ + gpu_kernel_memory_latency * resource.max_num_local_replica + + var_size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, + seq_len=self._seq_len) + + if var.is_sparse: + send_time = _helper(resource.cpu_worker_list, worker_num_replicas=resource.worker_num_replicas) + receive_time = _helper(resource.gpu_worker_list) + else: + send_time = _helper(resource.cpu_worker_list) + if var.synchronizer.local_replication: + receive_time = _helper(resource.cpu_worker_list) + else: + receive_time = _helper(resource.gpu_worker_list) + + return send_time, receive_time + + def var_ar_time(self, var, resource, network_overhead=0.0, gpu_kernel_memory_latency=0.0): + """Compute synchronization time of a variable in AR strategy.""" + worker_list = resource.cpu_worker_list + num_workers = len(worker_list) + min_bandwidth = None + for i in range(num_workers): + for j in range(i, num_workers): + if min_bandwidth is None: + min_bandwidth = resource.network_bandwidth[worker_list[j]][worker_list[i]] + else: + min_bandwidth = min(min_bandwidth, resource.network_bandwidth[worker_list[j]][worker_list[i]]) + + # Compressor + if var.compressor == "PowerSGDCompressor" or var.compressor == 3: + rank = 10 # currently using default value. So hardcode here. 
# todo: confirm + # assume var must be a dense variable. + og_shape = var.shape + ndims = len(og_shape) + if ndims <= 1: # no compress + size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, + seq_len=self._seq_len) + else: + if ndims > 2: + n = og_shape[0] + m = 1 + for s in og_shape[1:]: + m *= s # tensor's shape (n, m) + else: + n, m = og_shape[0], og_shape[1] + size_to_transfer = n * rank + m * rank + dtype = tf.float32 + elif var.compressor == "HorovodCompressorEF" or var.compressor == "HorovodCompressor" \ + or var.compressor == 2 or var.compressor == 1: + size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, + seq_len=self._seq_len) + dtype = tf.float32 + elif var.compressor == "NoneCompressor" or var.compressor == 0: + size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, + seq_len=self._seq_len) + dtype = var.dtype + else: + raise ValueError('Compressor does not exist: {}'.format(var.compressor)) + + # todo: chunk_size + # AllReduce communication time + # time = 2 * (num_workers - 1) * get_dense_var_bits(size_to_transfer, dtype) / (min_bandwidth * num_workers) + time = get_dense_var_bits(size_to_transfer, dtype) / min_bandwidth + + if self._get_coef: + return { + 'transmission': time, + 'network_overhead': 1, # len(worker_list), + 'gpu_kernel_memory_latency': resource.max_num_local_replica, + 'constant': 1.0, + # possible affecting factors. + 'var_name': var.name, + 'group': var.synchronizer.group, + 'strategy': 'allreduce', + 'is_sparse': False, + # 'chunk_size': chunk_size, + 'spec': 'NCCL', # default + 'compressor': var.compressor, + 'worker_list': worker_list, + 'num_workers': num_workers, + 'size_to_transfer': size_to_transfer, + 'dtype': str(dtype), + 'min_bandwidth': min_bandwidth, + 'max_num_local_replica': resource.max_num_local_replica, + 'is_ps': False, + } + else: + return time + network_overhead * len(worker_list) \ + + gpu_kernel_memory_latency * resource.max_num_local_replica + + + + # @staticmethod + # def var_ps_time(var_name, is_sparse, local_proxy, server_list, cpu_worker_list, gpu_worker_list, + # max_num_local_replica, worker_num_replicas, network_bandwidth, get_coef, + # network_overhead=0.0, gpu_kernel_memory_latency=0.0): + # """Compute synchrinzation time of a variable in PS strategy.""" + # + # def _helper(worker_list, worker_num_replicas=None): + # if worker_num_replicas is None: + # worker_num_replicas = [1.0] * len(worker_list) + # # Compute the slowest server + # slowest_server_time = 0 + # for j, server in enumerate(server_list): + # if server.size_to_transfer == 0: + # continue + # # network transfer: sum up all workers time. equals to the time cost of this server. + # this_server_time = 0 + # for k, worker in enumerate(worker_list): + # if _resolved_devices_on_diff_machine(server.device, worker): + # if is_sparse: + # this_worker_size = get_sparse_var_bits(server.size_to_transfer) * worker_num_replicas[k] + # else: + # this_worker_size = get_dense_var_bits(server.size_to_transfer, server.dtype) + # this_server_time += this_worker_size / network_bandwidth[server.device][worker] + # slowest_server_time = max(slowest_server_time, this_server_time) + # + # if get_coef: + # return { + # 'transmission': slowest_server_time, + # 'network_overhead': len(worker_list), + # 'gpu_kernel_memory_latency': max_num_local_replica, + # 'constant': 1.0, + # # possible affecting factors. 
+ # 'var_name': var_name, + # 'strategy': 'ps', + # 'local_proxy': local_proxy, + # 'is_sparse': is_sparse, + # 'server_list': [partition.to_dict() for partition in server_list], + # 'worker_list': worker_list, + # 'cpu_worker_list': cpu_worker_list, + # 'gpu_worker_list': gpu_worker_list, + # 'worker_num_replicas': worker_num_replicas, + # 'max_num_local_replica': max_num_local_replica, + # } + # else: + # return slowest_server_time + len(worker_list) * network_overhead + \ + # gpu_kernel_memory_latency * max_num_local_replica + # + # if is_sparse: + # send_time = _helper(cpu_worker_list, worker_num_replicas=worker_num_replicas) + # receive_time = _helper(gpu_worker_list) + # else: + # send_time = _helper(cpu_worker_list) + # if local_proxy: + # receive_time = _helper(cpu_worker_list) + # else: + # receive_time = _helper(gpu_worker_list) + # + # if get_coef: + # # return {key: send_time[key]+receive_time[key] for key in send_time.keys()} + # return send_time, receive_time + # else: + # return send_time, receive_time diff --git a/autodist/simulator/predefined_simulator.py b/autodist/simulator/predefined_simulator.py index a419126..6b141c8 100644 --- a/autodist/simulator/predefined_simulator.py +++ b/autodist/simulator/predefined_simulator.py @@ -1,4 +1,4 @@ -# Copyright 2020 Petuum. All Rights Reserved. +# Copyright 2020 Petuum Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ """Predefined simulator with linear model.""" import pickle as pkl +from collections import OrderedDict import tensorflow as tf from tensorflow.python.eager import context @@ -23,365 +24,441 @@ from autodist.resource_spec import ResourceSpec from autodist.simulator.base import SimulatorBase from autodist.simulator.utils import _resolved_devices_on_diff_machine, \ - get_dense_var_bits, get_sparse_var_bits + get_dense_var_bits, get_sparse_var_bits from autodist.strategy.base import Strategy +from autodist.utils import logging class PredefinedSimulator(SimulatorBase): - """Simulates strategies for a given graph and resource spec.""" - - def __init__(self, - graph_item=None, - resource_spec=None, - batch_size=1, - seq_len=1, - get_coef=True, - checkpoint=None): - - super(PredefinedSimulator, self).__init__(original_graph_item_path=original_graph_item_path) - - print("It's using predefined simulator. 
batch_size_per_gpu is {}".format(batch_size)) - self._fetches = fetches - self._batch_size_per_gpu = batch_size - self._seq_len = seq_len - self._get_coef = get_coef - self._checkpoint = checkpoint - self._weights = None - with context.eager_mode(): - if self._checkpoint: - self._weights = self.load_checkpoint(self._checkpoint) - - def simulate(self, strategy: Strategy, resource_spec: ResourceSpec, checkpoint=None): - """Return simulated runtime value.""" - inputs = self.create_features(strategy, resource_spec) - with context.eager_mode(): - cost = self.inference(inputs, checkpoint) - return cost - - def inference(self, inputs, checkpoint=None): - if checkpoint is not None: - weights = self.load_checkpoint(checkpoint) - elif self._weights is not None: - weights = self._weights - else: - raise ValueError("No checkpoint provided in either initialization or inference.") - - if not isinstance(inputs, tf.Tensor): - inputs = tf.reshape(tf.convert_to_tensor(inputs), [1, len(inputs)]) - - if len(weights) == 4: - W0, b0, W, b = weights - inputs = tf.nn.elu(tf.matmul(inputs, W0) + b0) - cost = tf.matmul(inputs, W) + b - elif len(weights) == 2: - W, b = weights - cost = tf.matmul(inputs, W) + b - else: - raise ValueError - return cost - - def load_checkpoint(self, checkpoint=None): - if checkpoint is None: - if self._checkpoint is not None: - checkpoint = self._checkpoint - else: - raise ValueError("checkpoint is None: {}".format(checkpoint)) - self._weights = pkl.load(open(checkpoint, 'rb')) - # self._weights = json.load(open(checkpoint, 'r')) - print("Load checkpoint: ") - print(self._weights) - return self._weights - - def save_checkpoint(self, model, checkpoint): - pkl.dump(model, open(checkpoint, 'wb')) - self._checkpoint = checkpoint - self._weights = model - - def create_features_v0(self, strategy: Strategy, resource_spec: ResourceSpec): - var_sync_time, vars, resource = self.predefined_sync_time(strategy, resource_spec) - - # Add up sync time per device to find the slowest server time. 
- feature_keys = ['transmission', 'network_overhead', 'gpu_kernel_memory_latency'] - device_ps_sync_time = {} - var_ar_sync_time = {} - for var_name, sync_time in var_sync_time.items(): - if isinstance(vars[var_name].synchronizer, PSSynchronizer): - device = vars[var_name].device - if device not in device_ps_sync_time: - device_ps_sync_time[device] = {key: 0.0 for key in feature_keys} - for key in feature_keys: - device_ps_sync_time[device][key] += sync_time[0][key] + sync_time[1][key] - - else: # AllReduce - if var_name not in var_ar_sync_time: - var_ar_sync_time[var_name] = {key: 0.0 for key in feature_keys} - for key in feature_keys: - var_ar_sync_time[var_name][key] += sync_time[key] - - max_device_ps_sync_time = {key: 0.0 for key in feature_keys} - sum_device_ps_sync_time = {key: 0.0 for key in feature_keys} - sum_var_ar_sync_time = {key: 0.0 for key in feature_keys} - for key in feature_keys: - max_device_ps_sync_time[key] = max([d[key] for d in device_ps_sync_time.values()] or [0.0]) - sum_device_ps_sync_time[key] = sum([d[key] for d in device_ps_sync_time.values()] or [0.0]) - sum_var_ar_sync_time[key] = sum([d[key] for d in var_ar_sync_time.values()] or [0.0]) - - feat = [max_device_ps_sync_time[key] for key in feature_keys] \ - + [sum_device_ps_sync_time[key] for key in feature_keys] \ - + [sum_var_ar_sync_time[key] for key in feature_keys] - - return feat - - def create_features(self, strategy: Strategy, resource_spec: ResourceSpec): - # var_sync_time, vars, resource = self.predefined_sync_time(strategy, resource_spec) - - vars, resource = self.preprocess(strategy=strategy, resource_spec=resource_spec) - - feature_keys = ['transmission', 'network_overhead', 'gpu_kernel_memory_latency'] - device_ps_sync_time = {} - group_ar_sync_time = {} - - for var_name, var in vars.items(): - if isinstance(var.synchronizer, PSSynchronizer): - sync_time = self.var_ps_time(var, resource) - device = vars[var_name].device - if device not in device_ps_sync_time: - device_ps_sync_time[device] = {key: 0.0 for key in feature_keys} - for key in feature_keys: - device_ps_sync_time[device][key] += sync_time[0][key] + sync_time[1][key] - elif isinstance(var.synchronizer, AllReduceSynchronizer): - sync_time = self.var_ar_time(var, resource) - var_group = sync_time['group'] - if var_group not in group_ar_sync_time: - group_ar_sync_time[var_group] = {key: 0.0 for key in feature_keys} - for key in feature_keys: - group_ar_sync_time[var_group][key] += sync_time[key] - else: - raise ValueError('{}'.format(type(var.synchronizer))) - - max_device_ps_sync_time = {key: 0.0 for key in feature_keys} - sum_device_ps_sync_time = {key: 0.0 for key in feature_keys} - max_group_ar_sync_time = {key: 0.0 for key in feature_keys} - sum_group_ar_sync_time = {key: 0.0 for key in feature_keys} - for key in feature_keys: - max_device_ps_sync_time[key] = max([d[key] for d in device_ps_sync_time.values()] or [0.0]) - sum_device_ps_sync_time[key] = sum([d[key] for d in device_ps_sync_time.values()] or [0.0]) - max_group_ar_sync_time[key] = max([d[key] for d in group_ar_sync_time.values()] or [0.0]) - sum_group_ar_sync_time[key] = sum([d[key] for d in group_ar_sync_time.values()] or [0.0]) - - feat = [max_device_ps_sync_time[key] for key in feature_keys] \ - + [sum_device_ps_sync_time[key] for key in feature_keys] \ - + [max_group_ar_sync_time[key] for key in feature_keys] \ - + [sum_group_ar_sync_time[key] for key in feature_keys] - - return feat - - def predefined_sync_time(self, strategy, resource_spec): - """ graph_item: 
transformed graph item """ - vars, resource = self.preprocess(strategy=strategy, resource_spec=resource_spec) - # Compute synchronization time for every var - var_sync_time = {} - for var_name, var in vars.items(): - if isinstance(var.synchronizer, PSSynchronizer): - var_sync_time[var_name] = self.var_ps_time(var, resource) - elif isinstance(var.synchronizer, AllReduceSynchronizer): - var_sync_time[var_name] = self.var_ar_time(var, resource) - else: - raise ValueError('{}'.format(type(var.synchronizer))) - return var_sync_time, vars, resource - - def var_ps_time(self, var, resource, network_overhead=0.0, gpu_kernel_memory_latency=0.0): - """Compute synchronization time of a variable in PS strategy.""" - def _helper(worker_list, worker_num_replicas=None): - if worker_num_replicas is None: - worker_num_replicas = [1.0] * len(worker_list) - - this_server_time = 0 - # network transfer: sum up all workers time. equals to the time cost of this server. - # TODO(Hao): didn't consider any parallelization among partitions - for k, worker in enumerate(worker_list): - if _resolved_devices_on_diff_machine(var.device, worker): - if var.is_sparse: - this_worker_size = get_sparse_var_bits(var_size_to_transfer) * worker_num_replicas[k] - else: - this_worker_size = get_dense_var_bits(var_size_to_transfer, var.dtype) - this_server_time += this_worker_size / resource.network_bandwidth[var.device][worker] - - if self._get_coef: - return { - 'transmission': this_server_time, - 'network_overhead': len(worker_list), - 'gpu_kernel_memory_latency': resource.max_num_local_replica, - 'constant': 1.0, - # possible affecting factors. - 'var_name': var.name, - 'strategy': 'ps', - 'local_proxy': var.synchronizer.local_replication, - 'is_sparse': var.is_sparse, - 'size_to_transfer': var_size_to_transfer, - 'dtype': str(var.dtype), - # 'server_list': [partition.to_dict() for partition in server_list], - 'worker_list': worker_list, - 'cpu_worker_list': resource.cpu_worker_list, - 'gpu_worker_list': resource.gpu_worker_list, - 'worker_num_replicas': worker_num_replicas, - 'max_num_local_replica': resource.max_num_local_replica, - 'is_ps': True, - } - else: - return this_server_time + len(worker_list) * network_overhead + \ - gpu_kernel_memory_latency * resource.max_num_local_replica - - var_size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, - seq_len=self._seq_len) - - if var.is_sparse: - send_time = _helper(resource.cpu_worker_list, worker_num_replicas=resource.worker_num_replicas) - receive_time = _helper(resource.gpu_worker_list) - else: - send_time = _helper(resource.cpu_worker_list) - if var.synchronizer.local_replication: - receive_time = _helper(resource.cpu_worker_list) - else: - receive_time = _helper(resource.gpu_worker_list) - - return send_time, receive_time - - def var_ar_time(self, var, resource, network_overhead=0.0, gpu_kernel_memory_latency=0.0): - """Compute synchronization time of a variable in AR strategy.""" - worker_list = resource.cpu_worker_list - num_workers = len(worker_list) - min_bandwidth = None - for i in range(num_workers): - for j in range(i, num_workers): - if min_bandwidth is None: - min_bandwidth = resource.network_bandwidth[worker_list[j]][worker_list[i]] - else: - min_bandwidth = min(min_bandwidth, resource.network_bandwidth[worker_list[j]][worker_list[i]]) - - # Compressor - if var.compressor == "PowerSGDCompressor" or var.compressor == 3: - rank = 10 # currently using default value. So hardcode here. 
# todo: confirm - # assume var must be a dense variable. - og_shape = var.shape - ndims = len(og_shape) - if ndims <= 1: # no compress - size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, - seq_len=self._seq_len) - else: - if ndims > 2: - n = og_shape[0] - m = 1 - for s in og_shape[1:]: - m *= s # tensor's shape (n, m) - else: - n, m = og_shape[0], og_shape[1] - size_to_transfer = n * rank + m * rank - dtype = tf.float32 - elif var.compressor == "HorovodCompressorEF" or var.compressor == "HorovodCompressor" \ - or var.compressor == 2 or var.compressor == 1: - size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, - seq_len=self._seq_len) - dtype = tf.float32 - elif var.compressor == "NoneCompressor" or var.compressor == 0: - size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, - seq_len=self._seq_len) - dtype = var.dtype - else: - raise ValueError('Compressor does not exist: {}'.format(var.compressor)) - - # todo: chunk_size - # AllReduce communication time - # time = 2 * (num_workers - 1) * get_dense_var_bits(size_to_transfer, dtype) / (min_bandwidth * num_workers) - time = get_dense_var_bits(size_to_transfer, dtype) / min_bandwidth - - if self._get_coef: - return { - 'transmission': time, - 'network_overhead': 1, # len(worker_list), - 'gpu_kernel_memory_latency': resource.max_num_local_replica, - 'constant': 1.0, - # possible affecting factors. - 'var_name': var.name, - 'group': var.synchronizer.group, - 'strategy': 'allreduce', - 'is_sparse': False, - # 'chunk_size': chunk_size, - 'spec': 'NCCL', # default - 'compressor': var.compressor, - 'worker_list': worker_list, - 'num_workers': num_workers, - 'size_to_transfer': size_to_transfer, - 'dtype': str(dtype), - 'min_bandwidth': min_bandwidth, - 'max_num_local_replica': resource.max_num_local_replica, - 'is_ps': False, - } - else: - return time + network_overhead * len(worker_list) \ - + gpu_kernel_memory_latency * resource.max_num_local_replica - - - - # @staticmethod - # def var_ps_time(var_name, is_sparse, local_proxy, server_list, cpu_worker_list, gpu_worker_list, - # max_num_local_replica, worker_num_replicas, network_bandwidth, get_coef, - # network_overhead=0.0, gpu_kernel_memory_latency=0.0): - # """Compute synchrinzation time of a variable in PS strategy.""" - # - # def _helper(worker_list, worker_num_replicas=None): - # if worker_num_replicas is None: - # worker_num_replicas = [1.0] * len(worker_list) - # # Compute the slowest server - # slowest_server_time = 0 - # for j, server in enumerate(server_list): - # if server.size_to_transfer == 0: - # continue - # # network transfer: sum up all workers time. equals to the time cost of this server. - # this_server_time = 0 - # for k, worker in enumerate(worker_list): - # if _resolved_devices_on_diff_machine(server.device, worker): - # if is_sparse: - # this_worker_size = get_sparse_var_bits(server.size_to_transfer) * worker_num_replicas[k] - # else: - # this_worker_size = get_dense_var_bits(server.size_to_transfer, server.dtype) - # this_server_time += this_worker_size / network_bandwidth[server.device][worker] - # slowest_server_time = max(slowest_server_time, this_server_time) - # - # if get_coef: - # return { - # 'transmission': slowest_server_time, - # 'network_overhead': len(worker_list), - # 'gpu_kernel_memory_latency': max_num_local_replica, - # 'constant': 1.0, - # # possible affecting factors. 
- # 'var_name': var_name, - # 'strategy': 'ps', - # 'local_proxy': local_proxy, - # 'is_sparse': is_sparse, - # 'server_list': [partition.to_dict() for partition in server_list], - # 'worker_list': worker_list, - # 'cpu_worker_list': cpu_worker_list, - # 'gpu_worker_list': gpu_worker_list, - # 'worker_num_replicas': worker_num_replicas, - # 'max_num_local_replica': max_num_local_replica, - # } - # else: - # return slowest_server_time + len(worker_list) * network_overhead + \ - # gpu_kernel_memory_latency * max_num_local_replica - # - # if is_sparse: - # send_time = _helper(cpu_worker_list, worker_num_replicas=worker_num_replicas) - # receive_time = _helper(gpu_worker_list) - # else: - # send_time = _helper(cpu_worker_list) - # if local_proxy: - # receive_time = _helper(cpu_worker_list) - # else: - # receive_time = _helper(gpu_worker_list) - # - # if get_coef: - # # return {key: send_time[key]+receive_time[key] for key in send_time.keys()} - # return send_time, receive_time - # else: - # return send_time, receive_time + """ + Simulator that uses a predefined communication model to estimate the runtime of strategies. + + See this paper TODO(Hao): put the paper link. + """ + def __init__(self, + graph_item=None, + resource_spec=None, + batch_size=1, + seq_len=1): + """ + Construct a predefined simulator. + + The reason we need the per-replica batch size and the length of the inputsequence is to estimate + the communication load of variables that are sparsely access (e.g. embeddings). For dense variables, + these two arguments have no influence on estimation. + + Args: + graph_item: a GraphItem object, or a path to a serialized GraphItem object. + resource_spec: a ResourceSpec object, or a path to a resource file. + batch_size: the per-replica batch size used to train this model, if there are sparse variables. + seq_len: the average length of input sequences (if there is any). 
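+
+        A minimal usage sketch (the graph item, resource spec and strategy objects below are
+        illustrative and assumed to be constructed elsewhere):
+
+            simulator = PredefinedSimulator(graph_item=my_graph_item,
+                                            resource_spec=my_resource_spec,
+                                            batch_size=32,
+                                            seq_len=128)
+            cost = simulator.simulate(candidate_strategy, my_graph_item, my_resource_spec)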
+ """ + super(PredefinedSimulator, self).__init__(graph_item, resource_spec) + logging.debug('A PredefinedSimualtor is instantiated: batch_size_per_gpu is {}'.format(batch_size)) + self._batch_size_per_gpu = batch_size + self._seq_len = seq_len + + def simulate(self, + strategy, + graph_item=None, + resource_spec=None, + *args, + **kwargs): + """Return simulated runtime cost given (strategy, graph_item, resource_spec) tuple.""" + inputs = self.create_features(strategy, resource_spec) + with context.eager_mode(): + cost = self.inference(inputs, checkpoint) + return cost + + def inference(self, inputs, checkpoint=None): + if checkpoint is not None: + weights = self.load_checkpoint(checkpoint) + elif self._weights is not None: + weights = self._weights + else: + raise ValueError("No checkpoint provided in either initialization or inference.") + + if not isinstance(inputs, tf.Tensor): + inputs = tf.reshape(tf.convert_to_tensor(inputs), [1, len(inputs)]) + + if len(weights) == 4: + W0, b0, W, b = weights + inputs = tf.nn.elu(tf.matmul(inputs, W0) + b0) + cost = tf.matmul(inputs, W) + b + elif len(weights) == 2: + W, b = weights + cost = tf.matmul(inputs, W) + b + else: + raise ValueError + return cost + + def estimate_sync_time(self, + strategy, + graph_item=None, + resource_spec=None): + if not strategy: + raise ValueError('strategy is None.') + if not graph_item: + if not self._graph_item: + raise ValueError('No graph item provided.') + else: + graph_item = self._graph_item + if not resource_spec: + if not self._resource_spec: + raise ValueError('No resource spec provided.') + else: + resource_spec = self._resource_spec + + # construct the meta objects + name_to_items, resource_item = self.preprocess(strategy, graph_item, resource_spec) + + # Now estimate the per-variable sync time + var_sync_time = OrderedDict() + for var_name, var_item in name_to_items.items(): + if isinstance(var_item.synchronizer, PSSynchronizer): + var_sync_time[var_name] = self.var_ps_time(var_item, resource_item) + elif isinstance(var_item.synchronizer, AllReduceSynchronizer): + var_sync_time = self.var_ar_time(var_item, resource_item) + else: + raise ValueError('{}'.format(type(var_item.synchronizer))) + return var_sync_time + + + + + + + def create_features(self, + strategy, + resource_spec): + # var_sync_time, vars, resource = self.predefined_sync_time(strategy, resource_spec) + + vars, resource = self.preprocess(strategy=strategy, resource_spec=resource_spec) + + feature_keys = ['transmission', 'network_overhead', 'gpu_kernel_memory_latency'] + device_ps_sync_time = {} + group_ar_sync_time = {} + + for var_name, var in vars.items(): + if isinstance(var.synchronizer, PSSynchronizer): + sync_time = self.var_ps_time(var, resource) + device = vars[var_name].device + if device not in device_ps_sync_time: + device_ps_sync_time[device] = {key: 0.0 for key in feature_keys} + for key in feature_keys: + device_ps_sync_time[device][key] += sync_time[0][key] + sync_time[1][key] + elif isinstance(var.synchronizer, AllReduceSynchronizer): + sync_time = self.var_ar_time(var, resource) + var_group = sync_time['group'] + if var_group not in group_ar_sync_time: + group_ar_sync_time[var_group] = {key: 0.0 for key in feature_keys} + for key in feature_keys: + group_ar_sync_time[var_group][key] += sync_time[key] + else: + raise ValueError('{}'.format(type(var.synchronizer))) + + max_device_ps_sync_time = {key: 0.0 for key in feature_keys} + sum_device_ps_sync_time = {key: 0.0 for key in feature_keys} + max_group_ar_sync_time = {key: 
0.0 for key in feature_keys} + sum_group_ar_sync_time = {key: 0.0 for key in feature_keys} + for key in feature_keys: + max_device_ps_sync_time[key] = max([d[key] for d in device_ps_sync_time.values()] or [0.0]) + sum_device_ps_sync_time[key] = sum([d[key] for d in device_ps_sync_time.values()] or [0.0]) + max_group_ar_sync_time[key] = max([d[key] for d in group_ar_sync_time.values()] or [0.0]) + sum_group_ar_sync_time[key] = sum([d[key] for d in group_ar_sync_time.values()] or [0.0]) + + feat = [max_device_ps_sync_time[key] for key in feature_keys] \ + + [sum_device_ps_sync_time[key] for key in feature_keys] \ + + [max_group_ar_sync_time[key] for key in feature_keys] \ + + [sum_group_ar_sync_time[key] for key in feature_keys] + + return feat + + + + + # def predefined_sync_time(self, strategy, resource_spec): + # """ graph_item: transformed graph item """ + # vars, resource = self.preprocess(strategy=strategy, resource_spec=resource_spec) + # # Compute synchronization time for every var + # var_sync_time = {} + # for var_name, var in vars.items(): + # if isinstance(var.synchronizer, PSSynchronizer): + # var_sync_time[var_name] = self.var_ps_time(var, resource) + # elif isinstance(var.synchronizer, AllReduceSynchronizer): + # var_sync_time[var_name] = self.var_ar_time(var, resource) + # else: + # raise ValueError('{}'.format(type(var.synchronizer))) + # return var_sync_time, vars, resource + + + def var_ps_time(self, + var_item, + resource_item, + network_overhead=0.0, + gpu_kernel_memory_latency=0.0, + get_coef=False): + """ + Estimate the synchronization time of a variable with PS synchronizer. + + Args: + var_item: + resource_item: + network_overhead: + gpu_kernel_memory_latency: + get_coef: return the + + Returns: + + """ + + def _helper(worker_list, worker_num_replicas=None): + if worker_num_replicas is None: + worker_num_replicas = [1.0] * len(worker_list) + + this_server_time = 0 + # network transfer: sum up all workers time. equals to the time cost of this server. + # TODO(Hao): didn't consider any parallelization among partitions + for k, worker in enumerate(worker_list): + if _resolved_devices_on_diff_machine(var.device, worker): + if var.is_sparse: + this_worker_size = get_sparse_var_bits(var_size_to_transfer) * worker_num_replicas[k] + else: + this_worker_size = get_dense_var_bits(var_size_to_transfer, var.dtype) + this_server_time += this_worker_size / resource.network_bandwidth[var.device][worker] + + if get_coef: + return { + 'transmission': this_server_time, + 'network_overhead': len(worker_list), + 'gpu_kernel_memory_latency': resource.max_num_local_replica, + 'constant': 1.0, + # possible affecting factors. 
+ 'var_name': var.name, + 'strategy': 'ps', + 'local_proxy': var.synchronizer.local_replication, + 'is_sparse': var.is_sparse, + 'size_to_transfer': var_size_to_transfer, + 'dtype': str(var.dtype), + # 'server_list': [partition.to_dict() for partition in server_list], + 'worker_list': worker_list, + 'cpu_worker_list': resource.cpu_worker_list, + 'gpu_worker_list': resource.gpu_worker_list, + 'worker_num_replicas': worker_num_replicas, + 'max_num_local_replica': resource.max_num_local_replica, + 'is_ps': True, + } + else: + return this_server_time + len(worker_list) * network_overhead + \ + gpu_kernel_memory_latency * resource.max_num_local_replica + + var_size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, + seq_len=self._seq_len) + + if var.is_sparse: + send_time = _helper(resource.cpu_worker_list, worker_num_replicas=resource.worker_num_replicas) + receive_time = _helper(resource.gpu_worker_list) + else: + send_time = _helper(resource.cpu_worker_list) + if var.synchronizer.local_replication: + receive_time = _helper(resource.cpu_worker_list) + else: + receive_time = _helper(resource.gpu_worker_list) + + return send_time, receive_time + + @staticmethod + def _estimate_ps_send_receive_time(var_item, + resource_item, + hosts, + virtual_num_local_replica): + """ + Estimate the send and receive time of a ps and return multiple impacting factors. + + Args: + var_item: + resource_item: + hosts: + virtual_num_local_replica: + + Returns: + Dict: a dictionary of impacting factors. + """ + if worker_num_replicas is None: + worker_num_replicas = [1.0] * len(worker_list) + + this_server_time = 0 + # network transfer: sum up all workers time. equals to the time cost of this server. + # TODO(Hao): didn't consider any parallelization among partitions + for k, worker in enumerate(worker_list): + if _resolved_devices_on_diff_machine(var.device, worker): + if var.is_sparse: + this_worker_size = get_sparse_var_bits(var_size_to_transfer) * worker_num_replicas[k] + else: + this_worker_size = get_dense_var_bits(var_size_to_transfer, var.dtype) + this_server_time += this_worker_size / resource.network_bandwidth[var.device][worker] + + if get_coef: + return { + 'transmission': this_server_time, + 'network_overhead': len(worker_list), + 'gpu_kernel_memory_latency': resource.max_num_local_replica, + 'constant': 1.0, + # possible affecting factors. 
+ 'var_name': var.name, + 'strategy': 'ps', + 'local_proxy': var.synchronizer.local_replication, + 'is_sparse': var.is_sparse, + 'size_to_transfer': var_size_to_transfer, + 'dtype': str(var.dtype), + # 'server_list': [partition.to_dict() for partition in server_list], + 'worker_list': worker_list, + 'cpu_worker_list': resource.cpu_worker_list, + 'gpu_worker_list': resource.gpu_worker_list, + 'worker_num_replicas': worker_num_replicas, + 'max_num_local_replica': resource.max_num_local_replica, + 'is_ps': True, + } + else: + return this_server_time + len(worker_list) * network_overhead + \ + gpu_kernel_memory_latency * resource.max_num_local_replica + + + def var_ar_time(self, var, resource, network_overhead=0.0, gpu_kernel_memory_latency=0.0, get_coef=False): + """Compute synchronization time of a variable in AR strategy.""" + worker_list = resource.cpu_worker_list + num_workers = len(worker_list) + min_bandwidth = None + for i in range(num_workers): + for j in range(i, num_workers): + if min_bandwidth is None: + min_bandwidth = resource.network_bandwidth[worker_list[j]][worker_list[i]] + else: + min_bandwidth = min(min_bandwidth, resource.network_bandwidth[worker_list[j]][worker_list[i]]) + + # Compressor + if var.compressor == "PowerSGDCompressor" or var.compressor == 3: + rank = 10 # currently using default value. So hardcode here. # todo: confirm + # assume var must be a dense variable. + og_shape = var.shape + ndims = len(og_shape) + if ndims <= 1: # no compress + size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, + seq_len=self._seq_len) + else: + if ndims > 2: + n = og_shape[0] + m = 1 + for s in og_shape[1:]: + m *= s # tensor's shape (n, m) + else: + n, m = og_shape[0], og_shape[1] + size_to_transfer = n * rank + m * rank + dtype = tf.float32 + elif var.compressor == "HorovodCompressorEF" or var.compressor == "HorovodCompressor" \ + or var.compressor == 2 or var.compressor == 1: + size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, + seq_len=self._seq_len) + dtype = tf.float32 + elif var.compressor == "NoneCompressor" or var.compressor == 0: + size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, + seq_len=self._seq_len) + dtype = var.dtype + else: + raise ValueError('Compressor does not exist: {}'.format(var.compressor)) + + # todo: chunk_size + # AllReduce communication time + # time = 2 * (num_workers - 1) * get_dense_var_bits(size_to_transfer, dtype) / (min_bandwidth * num_workers) + time = get_dense_var_bits(size_to_transfer, dtype) / min_bandwidth + + if get_coef: + return { + 'transmission': time, + 'network_overhead': 1, # len(worker_list), + 'gpu_kernel_memory_latency': resource.max_num_local_replica, + 'constant': 1.0, + # possible affecting factors. 
+ 'var_name': var.name, + 'group': var.synchronizer.group, + 'strategy': 'allreduce', + 'is_sparse': False, + # 'chunk_size': chunk_size, + 'spec': 'NCCL', # default + 'compressor': var.compressor, + 'worker_list': worker_list, + 'num_workers': num_workers, + 'size_to_transfer': size_to_transfer, + 'dtype': str(dtype), + 'min_bandwidth': min_bandwidth, + 'max_num_local_replica': resource.max_num_local_replica, + 'is_ps': False, + } + else: + return time + network_overhead * len(worker_list) \ + + gpu_kernel_memory_latency * resource.max_num_local_replica + + + +# @staticmethod +# def var_ps_time(var_name, is_sparse, local_proxy, server_list, cpu_worker_list, gpu_worker_list, +# max_num_local_replica, worker_num_replicas, network_bandwidth, get_coef, +# network_overhead=0.0, gpu_kernel_memory_latency=0.0): +# """Compute synchrinzation time of a variable in PS strategy.""" +# +# def _helper(worker_list, worker_num_replicas=None): +# if worker_num_replicas is None: +# worker_num_replicas = [1.0] * len(worker_list) +# # Compute the slowest server +# slowest_server_time = 0 +# for j, server in enumerate(server_list): +# if server.size_to_transfer == 0: +# continue +# # network transfer: sum up all workers time. equals to the time cost of this server. +# this_server_time = 0 +# for k, worker in enumerate(worker_list): +# if _resolved_devices_on_diff_machine(server.device, worker): +# if is_sparse: +# this_worker_size = get_sparse_var_bits(server.size_to_transfer) * worker_num_replicas[k] +# else: +# this_worker_size = get_dense_var_bits(server.size_to_transfer, server.dtype) +# this_server_time += this_worker_size / network_bandwidth[server.device][worker] +# slowest_server_time = max(slowest_server_time, this_server_time) +# +# if get_coef: +# return { +# 'transmission': slowest_server_time, +# 'network_overhead': len(worker_list), +# 'gpu_kernel_memory_latency': max_num_local_replica, +# 'constant': 1.0, +# # possible affecting factors. +# 'var_name': var_name, +# 'strategy': 'ps', +# 'local_proxy': local_proxy, +# 'is_sparse': is_sparse, +# 'server_list': [partition.to_dict() for partition in server_list], +# 'worker_list': worker_list, +# 'cpu_worker_list': cpu_worker_list, +# 'gpu_worker_list': gpu_worker_list, +# 'worker_num_replicas': worker_num_replicas, +# 'max_num_local_replica': max_num_local_replica, +# } +# else: +# return slowest_server_time + len(worker_list) * network_overhead + \ +# gpu_kernel_memory_latency * max_num_local_replica +# +# if is_sparse: +# send_time = _helper(cpu_worker_list, worker_num_replicas=worker_num_replicas) +# receive_time = _helper(gpu_worker_list) +# else: +# send_time = _helper(cpu_worker_list) +# if local_proxy: +# receive_time = _helper(cpu_worker_list) +# else: +# receive_time = _helper(gpu_worker_list) +# +# if get_coef: +# # return {key: send_time[key]+receive_time[key] for key in send_time.keys()} +# return send_time, receive_time +# else: +# return send_time, receive_time diff --git a/autodist/strategy/auto/item.py b/autodist/strategy/auto/item.py index d2377e6..6d2d871 100644 --- a/autodist/strategy/auto/item.py +++ b/autodist/strategy/auto/item.py @@ -249,6 +249,7 @@ def device(self, resolver): device_str = resolver.resolve_to_device_str(device_str) return device_str + class PartItem(VariableItem): """Helper class to include meta information about a variable partition.""" def __init__(self, @@ -364,7 +365,7 @@ class ResourceItem: Helper class that includes meta information about a resource spec. All addresses are resolved (in TF format). 
- TODO(zhisbug): merge ResourceItem class with ResourceSpec. + TODO(Hao): merge ResourceItem class with ResourceSpec. """ def __init__(self, resource_spec): @@ -405,7 +406,7 @@ def cpu_replicas(self): def total_num_gpu_replica(self): return len(self.gpu_replicas) - def num_local_gpu_replica(self, host): + def num_local_gpu_replica_on(self, host): """ Return the number of gpu replica on a TF host address, e.g. '/job:worker/task:0/device:CPU:0'. @@ -427,7 +428,7 @@ def num_local_gpu_replica(self, host): @property def max_num_local_gpu_replica(self): """Return the max number of local gpu replicas on the cluster.""" - return max([self.num_local_gpu_replica(host) for host in self.cpu_replicas]) + return max([self.num_local_gpu_replica_on(host) for host in self.cpu_replicas]) @cached_property def p2p_bandwidth(self): From 668127a2c59570bb8f9931e913a857595a177251 Mon Sep 17 00:00:00 2001 From: Hao Zhang Date: Wed, 29 Jul 2020 02:50:46 -0400 Subject: [PATCH 08/11] improve the estimator for ps syncers, some minor changes --- autodist/simulator/predefined_simulator.py | 216 ++++++++++----------- autodist/simulator/utils.py | 29 ++- autodist/strategy/auto/item.py | 87 ++++++++- 3 files changed, 208 insertions(+), 124 deletions(-) diff --git a/autodist/simulator/predefined_simulator.py b/autodist/simulator/predefined_simulator.py index 6b141c8..f6f37c6 100644 --- a/autodist/simulator/predefined_simulator.py +++ b/autodist/simulator/predefined_simulator.py @@ -23,7 +23,7 @@ from autodist.proto.synchronizers_pb2 import PSSynchronizer, AllReduceSynchronizer from autodist.resource_spec import ResourceSpec from autodist.simulator.base import SimulatorBase -from autodist.simulator.utils import _resolved_devices_on_diff_machine, \ +from autodist.simulator.utils import on_same_host, \ get_dense_var_bits, get_sparse_var_bits from autodist.strategy.base import Strategy from autodist.utils import logging @@ -196,133 +196,89 @@ def var_ps_time(self, var_item, resource_item, network_overhead=0.0, - gpu_kernel_memory_latency=0.0, - get_coef=False): + gpu_kernel_memory_latency=0.0): """ - Estimate the synchronization time of a variable with PS synchronizer. + Estimate the synchronization time of a variable using PS synchronizer. Args: var_item: resource_item: network_overhead: gpu_kernel_memory_latency: - get_coef: return the Returns: - + tuple(dict) """ - - def _helper(worker_list, worker_num_replicas=None): - if worker_num_replicas is None: - worker_num_replicas = [1.0] * len(worker_list) - - this_server_time = 0 - # network transfer: sum up all workers time. equals to the time cost of this server. - # TODO(Hao): didn't consider any parallelization among partitions - for k, worker in enumerate(worker_list): - if _resolved_devices_on_diff_machine(var.device, worker): - if var.is_sparse: - this_worker_size = get_sparse_var_bits(var_size_to_transfer) * worker_num_replicas[k] - else: - this_worker_size = get_dense_var_bits(var_size_to_transfer, var.dtype) - this_server_time += this_worker_size / resource.network_bandwidth[var.device][worker] - - if get_coef: - return { - 'transmission': this_server_time, - 'network_overhead': len(worker_list), - 'gpu_kernel_memory_latency': resource.max_num_local_replica, - 'constant': 1.0, - # possible affecting factors. 
- 'var_name': var.name, - 'strategy': 'ps', - 'local_proxy': var.synchronizer.local_replication, - 'is_sparse': var.is_sparse, - 'size_to_transfer': var_size_to_transfer, - 'dtype': str(var.dtype), - # 'server_list': [partition.to_dict() for partition in server_list], - 'worker_list': worker_list, - 'cpu_worker_list': resource.cpu_worker_list, - 'gpu_worker_list': resource.gpu_worker_list, - 'worker_num_replicas': worker_num_replicas, - 'max_num_local_replica': resource.max_num_local_replica, - 'is_ps': True, - } - else: - return this_server_time + len(worker_list) * network_overhead + \ - gpu_kernel_memory_latency * resource.max_num_local_replica - - var_size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, - seq_len=self._seq_len) - - if var.is_sparse: - send_time = _helper(resource.cpu_worker_list, worker_num_replicas=resource.worker_num_replicas) - receive_time = _helper(resource.gpu_worker_list) + bits_to_transfer = var_item.bits_to_transfer(self._batch_size_per_gpu, self._seq_len) + + num_local_replica_on_each_worker = [resource_item.num_local_gpu_replica_on(host) + for host in resource_item.cpu_replicas] + if var_item.is_sparse: + send_time = self._estimate_ps_time(var_item, + resource_item, + resource_item.cpu_replicas, + num_local_replica_on_each_worker) + recv_time = self._estimate_ps_time(var_item, + resource_item, + resource_item.gpu_replicas, + [1.0] * len(resource_item.gpu_replicas)) else: - send_time = _helper(resource.cpu_worker_list) - if var.synchronizer.local_replication: - receive_time = _helper(resource.cpu_worker_list) + send_time = self._estimate_ps_time(var_item, + resource_item, + resource_item.cpu_replicas, + [1.0] * len(resource_item.cpu_replicas)) + if var_item.local_replication: + recv_time = self._estimate_ps_time(var_item, + resource_item, + resource_item.cpu_replicas, + [1.0] * len(resource_item.cpu_replicas)) else: - receive_time = _helper(resource.gpu_worker_list) - - return send_time, receive_time - - @staticmethod - def _estimate_ps_send_receive_time(var_item, - resource_item, - hosts, - virtual_num_local_replica): + recv_time = self._estimate_ps_time(var_item, + resource_item, + resource_item.gpu_replicas, + [1.0] * len(resource_item.gpu_replicas)) + return send_time, recv_time + + def _estimate_ps_time(self, + var_item, + resource_item, + virtual_worker_list, + virtual_num_local_replica): """ - Estimate the send and receive time of a ps and return multiple impacting factors. + Estimate the send or receive time of a ps and return multiple impacting factors. Args: - var_item: + var_item: the variable whose communication time will be estimated. resource_item: - hosts: - virtual_num_local_replica: + virtual_worker_list: A list of virtual workers (could be actual gpu workers, or virtual cpu worker). + virtual_num_local_replica: A list of integers indicating the number of local replica on each virtual worker. Returns: Dict: a dictionary of impacting factors. """ - if worker_num_replicas is None: - worker_num_replicas = [1.0] * len(worker_list) - - this_server_time = 0 - # network transfer: sum up all workers time. equals to the time cost of this server. 
- # TODO(Hao): didn't consider any parallelization among partitions - for k, worker in enumerate(worker_list): - if _resolved_devices_on_diff_machine(var.device, worker): - if var.is_sparse: - this_worker_size = get_sparse_var_bits(var_size_to_transfer) * worker_num_replicas[k] - else: - this_worker_size = get_dense_var_bits(var_size_to_transfer, var.dtype) - this_server_time += this_worker_size / resource.network_bandwidth[var.device][worker] - - if get_coef: - return { - 'transmission': this_server_time, - 'network_overhead': len(worker_list), - 'gpu_kernel_memory_latency': resource.max_num_local_replica, - 'constant': 1.0, - # possible affecting factors. - 'var_name': var.name, - 'strategy': 'ps', - 'local_proxy': var.synchronizer.local_replication, - 'is_sparse': var.is_sparse, - 'size_to_transfer': var_size_to_transfer, - 'dtype': str(var.dtype), - # 'server_list': [partition.to_dict() for partition in server_list], - 'worker_list': worker_list, - 'cpu_worker_list': resource.cpu_worker_list, - 'gpu_worker_list': resource.gpu_worker_list, - 'worker_num_replicas': worker_num_replicas, - 'max_num_local_replica': resource.max_num_local_replica, - 'is_ps': True, - } - else: - return this_server_time + len(worker_list) * network_overhead + \ - gpu_kernel_memory_latency * resource.max_num_local_replica - + transmission_time = 0.0 + + # To estimate network transmission time for the given variable var_item on PS, we simply sum up the time of + # transmitting (or say, synchronizing) this variable across all workers. + # The time is separately estimated as send_time and recv_time by calling this function twice with different + # values of arguments. + # TODO(Hao): didn't consider any parallelization between variables or partitions. + for k, worker in enumerate(virtual_worker_list): + if not on_same_host(var_item.device, worker): + bits_on_this_worker = var_item.bits_to_transfer(self._batch_size_per_gpu, self._seq_len) * \ + virtual_num_local_replica[k] + bandwidth = min(resource_item.p2p_bandwidth[var_item.device][worker], + resource_item.p2p_bandwidth[worker][var_item.device]) + transmission_time += bits_on_this_worker / bandwidth + + + factors = { + 'transmission': transmission_time, + 'network_overhead': len(virtual_worker_list), + 'gpu_kernel_memory_latency': resource_item.max_num_local_gpu_replica, + 'constant': 1.0 + } + return factors def var_ar_time(self, var, resource, network_overhead=0.0, gpu_kernel_memory_latency=0.0, get_coef=False): """Compute synchronization time of a variable in AR strategy.""" @@ -400,6 +356,7 @@ def var_ar_time(self, var, resource, network_overhead=0.0, gpu_kernel_memory_lat + # @staticmethod # def var_ps_time(var_name, is_sparse, local_proxy, server_list, cpu_worker_list, gpu_worker_list, # max_num_local_replica, worker_num_replicas, network_bandwidth, get_coef, @@ -462,3 +419,46 @@ def var_ar_time(self, var, resource, network_overhead=0.0, gpu_kernel_memory_lat # return send_time, receive_time # else: # return send_time, receive_time + + + + + # def _helper(worker_list, worker_num_replicas=None): + # if worker_num_replicas is None: + # worker_num_replicas = [1.0] * len(worker_list) + # + # this_server_time = 0 + # # network transfer: sum up all workers time. equals to the time cost of this server. 
+ # # TODO(Hao): didn't consider any parallelization among partitions + # for k, worker in enumerate(worker_list): + # if _resolved_devices_on_diff_machine(var.device, worker): + # if var.is_sparse: + # this_worker_size = get_sparse_var_bits(var_size_to_transfer) * worker_num_replicas[k] + # else: + # this_worker_size = get_dense_var_bits(var_size_to_transfer, var.dtype) + # this_server_time += this_worker_size / resource.network_bandwidth[var.device][worker] + # + # if get_coef: + # return { + # 'transmission': this_server_time, + # 'network_overhead': len(worker_list), + # 'gpu_kernel_memory_latency': resource.max_num_local_replica, + # 'constant': 1.0, + # # possible affecting factors. + # 'var_name': var.name, + # 'strategy': 'ps', + # 'local_proxy': var.synchronizer.local_replication, + # 'is_sparse': var.is_sparse, + # 'size_to_transfer': var_size_to_transfer, + # 'dtype': str(var.dtype), + # # 'server_list': [partition.to_dict() for partition in server_list], + # 'worker_list': worker_list, + # 'cpu_worker_list': resource.cpu_worker_list, + # 'gpu_worker_list': resource.gpu_worker_list, + # 'worker_num_replicas': worker_num_replicas, + # 'max_num_local_replica': resource.max_num_local_replica, + # 'is_ps': True, + # } + # else: + # return this_server_time + len(worker_list) * network_overhead + \ + # gpu_kernel_memory_latency * resource.max_num_local_replica \ No newline at end of file diff --git a/autodist/simulator/utils.py b/autodist/simulator/utils.py index b200007..d0c6436 100644 --- a/autodist/simulator/utils.py +++ b/autodist/simulator/utils.py @@ -258,11 +258,11 @@ def read_trial_runs(): tf.string: 1, # todo: confirm 'tf.string': 1, # todo: confirm "": 1, # todo: confirm + tf.quint8: 8, + 'tf.quint8': 8, tf.qint8: 8, 'tf.qint8': 8, "": 8, - tf.quint8: 8, - 'tf.quint8': 8, "": 8, tf.qint16: 16, 'tf.qint16': 16, @@ -302,11 +302,26 @@ def get_sparse_var_bits(size): + 2 * get_dtype_bits(tf.int64) -def _resolved_devices_on_diff_machine(device1, device2): - # e.g., '/job:worker/task:1/device:CPU:0', '/job:worker/task:1/GPU:0' - node1 = ':'.join(device1.split('/')[:-1]) - node2 = ':'.join(device2.split('/')[:-1]) - return node1 != node2 +def on_same_host(device_str1, device_str2): + """ + Return True if d1 and d2 are on the same host. + + Args: + device_str1 (string): the first device as a TF device string, e.g. /job:worker/task:0/device:CPU:0. + device_str2 (string): the first device as a TF device string, e.g. /job:worker/task:0/device:GPU:0. + + Returns: + Bool: True if they are on the same host, otherwise False. 
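+
+    For example (device strings are illustrative):
+        on_same_host('/job:worker/task:0/device:CPU:0', '/job:worker/task:0/device:GPU:0')  # -> True
+        on_same_host('/job:worker/task:0/device:CPU:0', '/job:worker/task:1/device:CPU:0')  # -> False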
+ """ + host1 = '/'.join(device_str1.split('/')[:-1]) + host2 = '/'.join(device_str2.split('/')[:-1]) + return host1 == host2 + +# def _resolved_devices_on_diff_machine(device1, device2): +# # e.g., '/job:worker/task:1/device:CPU:0', '/job:worker/task:1/GPU:0' +# node1 = ':'.join(device1.split('/')[:-1]) +# node2 = ':'.join(device2.split('/')[:-1]) +# return node1 != node2 # def _resolve_device_address(device: str, device_resolver: DeviceResolver): diff --git a/autodist/strategy/auto/item.py b/autodist/strategy/auto/item.py index 6d2d871..263ebba 100644 --- a/autodist/strategy/auto/item.py +++ b/autodist/strategy/auto/item.py @@ -16,6 +16,7 @@ from enum import Enum +import tensorflow as tf from tensorflow.python.framework import ops, device_spec from autodist.kernel.common.utils import get_op_name, get_consumers @@ -24,7 +25,7 @@ from autodist.strategy.base import byte_size_load_fn from autodist.utils import logging from autodist.cluster import SSHCluster -from autodist.simulator.utils import GPU_TO_CPU_BANDWIDTH, GIGABITS +from autodist.simulator.utils import GPU_TO_CPU_BANDWIDTH, GIGABITS, get_dtype_bits class VarType(Enum): @@ -136,8 +137,18 @@ def original_size(self): size *= s return size - @property def size_to_transfer(self, batch_size_per_gpu=1, seq_len=1): + """ + Return the number of elements (e.g. float, integer) to transfer for this variable per iteration. + + To estimate the size to transfer for sparse variables, batch_size_per_gpu and seq_len are required. + Args: + batch_size_per_gpu: batch size used on each GPU replica. + seq_len: the length of the sequence of each input example. + + Returns: + integer + """ if not self.is_sparse: return self.size else: @@ -153,7 +164,30 @@ def size_to_transfer(self, batch_size_per_gpu=1, seq_len=1): sparse_data_size = batch_size_per_gpu * seq_len * emb_size # estimate the embedding of this partition simply using a proportional formula - return sparse_data_size * self.size / self.original_size + return sparse_data_size * float(self.size) / float(self.original_size) + + @property + def bits_to_transfer(self, batch_size_per_gpu=1, seq_len=1): + """ + Estimate the bits to transfer across the network per iteration. + + For sparse variables, this is an over-estimation as we think all columns corresponded to this batch + is unique. + Args: + batch_size_per_gpu: + seq_len: + + Returns: + integer + """ + s = self.size_to_transfer(batch_size_per_gpu, seq_len) + if self.is_sparse: # IndexSlices: values, indices, dense_shape + bits = s * get_dtype_bits(self.dtype) + \ + batch_size_per_gpu * seq_len * self.size / self.original_size * get_dtype_bits(tf.int64) + \ + 2 * get_dtype_bits(tf.int64) + return bits + else: # Tensor + return s * get_dtype_bits(self.dtype) @property def partitionable_axes(self): @@ -234,7 +268,7 @@ def reduction_destination(self): Return the reduction_destination in the node config of this variable. Returns: - Reduction destinaiton. + str. """ if not self._node_config: raise ValueError('Node config is unset.') @@ -248,6 +282,21 @@ def device(self, resolver): if device_str: device_str = resolver.resolve_to_device_str(device_str) return device_str + + @property + def local_replication(self): + """ + Return the local_replication in the node config of this variable. 
+ + Returns: + bool + """ + if not self._node_config: + raise ValueError('Node config is unset.') + if self._node_config.partitioner: + logging.warning('This variable will be partitioned') + return None + return getattr(self.synchronizer, 'local_replication', False) class PartItem(VariableItem): @@ -359,6 +408,21 @@ def reduction_destination(self): return None return getattr(self.synchronizer, 'reduction_destination', None) + @property + def local_replication(self): + """ + Return the local_replication in the node config of this variable partition. + + Returns: + bool + """ + if not self._node_config: + raise ValueError('Node config is unset.') + if not self._node_config.partitioner: + logging.warning('Partitioner field is empty for a variable partition.') + return None + return getattr(self.synchronizer, 'local_replication', False) + class ResourceItem: """ResourceItem. @@ -373,6 +437,11 @@ def __init__(self, resource_spec): self._cluster = SSHCluster(resource_spec) self._device_resolver = DeviceResolver(self._cluster) + @property + def device_resolver(self): + """Resolver of this resource_spec that resolves an AutoDist device to TF device.""" + return self._device_resolver + @property def replicas(self): """Return the list of replicas in the format of TF device string, e.g. job:worker/task:0/device:gpu:0.""" @@ -389,7 +458,7 @@ def gpu_replicas(self): """ # device_str is autodist device string, e.g. 192.168.0.1:CPU:0 device_strs = [k for k, _ in self._resource_spec.gpu_devices] - return self._device_resolver.resolve_to_device_str(device_strs) + return self.device_resolver.resolve_to_device_str(device_strs) @property def cpu_replicas(self): @@ -400,7 +469,7 @@ def cpu_replicas(self): List(string) """ device_strs = [k for k, _ in self._resource_spec.cpu_devices] - return self._device_resolver.resolve_to_device_str(device_strs) + return self.device_resolver.resolve_to_device_str(device_strs) @property def total_num_gpu_replica(self): @@ -434,7 +503,7 @@ def max_num_local_gpu_replica(self): def p2p_bandwidth(self): """Calculates P2P network bandwidth between nodes in the cluster. - Note that this is NOT a sysmetric + Note that this is NOT a symmetric matrix. 
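+        For a cross-host pair, bw[d_i][d_j] is derived from the network bandwidth of d_i's host and
+        bw[d_j][d_i] from d_j's host, so the two entries may differ; devices on the same host are
+        assigned the constant GPU_TO_CPU_BANDWIDTH in both directions.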
""" bw = {} # key: (device1, device2) devices = [device for device, _ in self._resource_spec.devices] @@ -451,8 +520,8 @@ def p2p_bandwidth(self): if d_j not in bw: bw[d_j] = {} if ip_i != ip_j: - bw[d_i][d_j] = GIGABITS * self._resource_spec[ip_i].bandwidth[ip_i] - bw[d_j][d_i] = GIGABITS * self._resource_spec[ip_j].bandwidth[ip_j] + bw[d_i][d_j] = GIGABITS * self._resource_spec.network_bandwidth[ip_i] + bw[d_j][d_i] = GIGABITS * self._resource_spec.network_bandwidth[ip_j] else: bw[d_i][d_j] = GIGABITS * GPU_TO_CPU_BANDWIDTH bw[d_j][d_i] = GIGABITS * GPU_TO_CPU_BANDWIDTH From 0f9755d0f06879020f0ee74cc5945bd680d5f40c Mon Sep 17 00:00:00 2001 From: Hao Zhang Date: Tue, 4 Aug 2020 00:22:29 -0400 Subject: [PATCH 09/11] predefined simulator refactoring done --- autodist/simulator/predefined_simulator.py | 495 ++++++++------------- autodist/strategy/auto/item.py | 31 +- 2 files changed, 207 insertions(+), 319 deletions(-) diff --git a/autodist/simulator/predefined_simulator.py b/autodist/simulator/predefined_simulator.py index f6f37c6..0e3f60b 100644 --- a/autodist/simulator/predefined_simulator.py +++ b/autodist/simulator/predefined_simulator.py @@ -14,18 +14,14 @@ """Predefined simulator with linear model.""" -import pickle as pkl from collections import OrderedDict import tensorflow as tf -from tensorflow.python.eager import context from autodist.proto.synchronizers_pb2 import PSSynchronizer, AllReduceSynchronizer -from autodist.resource_spec import ResourceSpec from autodist.simulator.base import SimulatorBase from autodist.simulator.utils import on_same_host, \ - get_dense_var_bits, get_sparse_var_bits -from autodist.strategy.base import Strategy + get_dtype_bits from autodist.utils import logging @@ -35,28 +31,39 @@ class PredefinedSimulator(SimulatorBase): See this paper TODO(Hao): put the paper link. """ + def __init__(self, graph_item=None, resource_spec=None, batch_size=1, - seq_len=1): + seq_len=1, + mode='sum'): """ Construct a predefined simulator. - The reason we need the per-replica batch size and the length of the inputsequence is to estimate - the communication load of variables that are sparsely access (e.g. embeddings). For dense variables, - these two arguments have no influence on estimation. + We need the per-replica batch size and the length of the input sequence to estimate the communication load of + variables that are sparsely accessed (e.g. embeddings). For dense variables, these two arguments have no + influence on estimation. + Note that graph_item and resource_spec are not required to instantiate a simulator object as we allow + transferring a trained simulator on a graph_item (or resource_spec) to a different graph_item (or different + resource_spec). This can be done by passing graph_item or resource_spec Args: graph_item: a GraphItem object, or a path to a serialized GraphItem object. resource_spec: a ResourceSpec object, or a path to a resource file. batch_size: the per-replica batch size used to train this model, if there are sparse variables. seq_len: the average length of input sequences (if there is any). + mode: use the `sum` or `max` of all variable sync time as the cost. """ super(PredefinedSimulator, self).__init__(graph_item, resource_spec) logging.debug('A PredefinedSimualtor is instantiated: batch_size_per_gpu is {}'.format(batch_size)) self._batch_size_per_gpu = batch_size self._seq_len = seq_len + self._mode = mode + + # Constants for predefined modeling. 
+ self._network_overhead = 0.0 + self._gpu_kernel_memory_latency = 0.0 def simulate(self, strategy, @@ -64,38 +71,83 @@ def simulate(self, resource_spec=None, *args, **kwargs): - """Return simulated runtime cost given (strategy, graph_item, resource_spec) tuple.""" - inputs = self.create_features(strategy, resource_spec) - with context.eager_mode(): - cost = self.inference(inputs, checkpoint) - return cost - - def inference(self, inputs, checkpoint=None): - if checkpoint is not None: - weights = self.load_checkpoint(checkpoint) - elif self._weights is not None: - weights = self._weights - else: - raise ValueError("No checkpoint provided in either initialization or inference.") - - if not isinstance(inputs, tf.Tensor): - inputs = tf.reshape(tf.convert_to_tensor(inputs), [1, len(inputs)]) - - if len(weights) == 4: - W0, b0, W, b = weights - inputs = tf.nn.elu(tf.matmul(inputs, W0) + b0) - cost = tf.matmul(inputs, W) + b - elif len(weights) == 2: - W, b = weights - cost = tf.matmul(inputs, W) + b + """ + Return simulated runtime cost given (strategy, graph_item, resource_spec) tuple. + + Args: + strategy: the strategy to simulate + graph_item: the graph_item this strategy is generated on. + resource_spec: the resource_spec this strategy is on. + + Returns: + float: the estimated runtime (lower is better). + """ + var_name_to_items, resource_item, var_name_to_sync_time = \ + self.extract_prefeature(strategy, graph_item, resource_spec) + + # Now use the estimated per-variable sync time to calculate the overall sync time. + ps_server_sync_time = {} + cc_group_sync_time = {} + + for var_name, var_item in var_name_to_items.items(): + sync_time = var_name_to_sync_time[var_name] + + # we use a simple formula: + # time = transmission + network_overhead * participating_workers + gpu_memory_latency * max(#gpus) + if isinstance(var_item.synchronizer, PSSynchronizer): + server = var_item.device + if server not in ps_server_sync_time: + ps_server_sync_time[server] = 0.0 + send_time = sync_time[0]['transmission'] + \ + sync_time[0]['network_overhead'] * self._network_overhead + \ + sync_time[0]['gpu_kernel_memory_latency'] * self._gpu_kernel_memory_latency + recv_time = sync_time[1]['transmission'] + \ + sync_time[1]['network_overhead'] * self._network_overhead + \ + sync_time[1]['gpu_kernel_memory_latency'] * self._gpu_kernel_memory_latency + # Then accumulate the time for each variable on this PS. Note this is not necessarily accurate as + # there might exist parallel communication of variables even on one server. + ps_server_sync_time[server] += send_time + ps_server_sync_time[server] += recv_time + elif isinstance(var_item.synchronizer, AllReduceSynchronizer): + group = var_item.group + if group not in cc_group_sync_time: + # Each group of variables are fused as one message to pass, so we accumulate the + # overhead and latency for only ONCE. + cc_group_sync_time[group] += sync_time['network_overhead'] * self._network_overhead + \ + sync_time['gpu_kernel_memory_latency'] * self._gpu_kernel_memory_latency + cc_group_sync_time[group] += sync_time['transmission'] + else: + raise ValueError('Unrecognized type of synchronizer: {}'.format(type(var_item.synchronizer))) + + sync_time = [v for v in ps_server_sync_time.values()] + [v for v in cc_group_sync_time.values()] + if self._mode == 'max': + # In `max` mode, we assume all PS and collective groups communicate in parallel, and the PS/group that + # takes the longest time to sync would bound the overall per-iter time. 
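+            # e.g. ps_server_sync_time = {'ps0': 3.0, 'ps1': 5.0} and cc_group_sync_time = {0: 2.0}
+            # give sync_time == [3.0, 5.0, 2.0]: `max` mode yields 5.0, while `sum` mode yields 10.0.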
+ per_iter_time = max(sync_time) + elif self._mode == 'sum': + # In `sum` mode, we assume all PS and collective groups synchronize sequentially, and the overall per-iter + # time is the summation of the sync time of all serviers and collective groups. + # !!Note: both modes have over-simplified assumptions than a real system. + per_iter_time = sum(sync_time) else: - raise ValueError - return cost + raise ValueError('Unrecognized simulation mode: {}'.format(self._mode)) + return per_iter_time - def estimate_sync_time(self, + def extract_prefeature(self, strategy, graph_item=None, resource_spec=None): + """ + Extract impacting factors of the communication time for each variable. + + Args: + strategy: the strategy to simulate. + graph_item: the graph_item this strategy is generated for. + resource_spec: the resource_spec this strategy is on. + + Returns: + Dict: A dict of variable name (str) to impacting factors (dict). + """ if not strategy: raise ValueError('strategy is None.') if not graph_item: @@ -108,7 +160,7 @@ def estimate_sync_time(self, raise ValueError('No resource spec provided.') else: resource_spec = self._resource_spec - + # TODO(Hao): need to make sure the (strategy, graph_item, resource_spec) match each other. # construct the meta objects name_to_items, resource_item = self.preprocess(strategy, graph_item, resource_spec) @@ -118,138 +170,83 @@ def estimate_sync_time(self, if isinstance(var_item.synchronizer, PSSynchronizer): var_sync_time[var_name] = self.var_ps_time(var_item, resource_item) elif isinstance(var_item.synchronizer, AllReduceSynchronizer): - var_sync_time = self.var_ar_time(var_item, resource_item) + var_sync_time[var_name] = self.var_ar_time(var_item, resource_item) else: raise ValueError('{}'.format(type(var_item.synchronizer))) return var_sync_time - - - - - - def create_features(self, - strategy, - resource_spec): - # var_sync_time, vars, resource = self.predefined_sync_time(strategy, resource_spec) - - vars, resource = self.preprocess(strategy=strategy, resource_spec=resource_spec) - - feature_keys = ['transmission', 'network_overhead', 'gpu_kernel_memory_latency'] - device_ps_sync_time = {} - group_ar_sync_time = {} - - for var_name, var in vars.items(): - if isinstance(var.synchronizer, PSSynchronizer): - sync_time = self.var_ps_time(var, resource) - device = vars[var_name].device - if device not in device_ps_sync_time: - device_ps_sync_time[device] = {key: 0.0 for key in feature_keys} - for key in feature_keys: - device_ps_sync_time[device][key] += sync_time[0][key] + sync_time[1][key] - elif isinstance(var.synchronizer, AllReduceSynchronizer): - sync_time = self.var_ar_time(var, resource) - var_group = sync_time['group'] - if var_group not in group_ar_sync_time: - group_ar_sync_time[var_group] = {key: 0.0 for key in feature_keys} - for key in feature_keys: - group_ar_sync_time[var_group][key] += sync_time[key] - else: - raise ValueError('{}'.format(type(var.synchronizer))) - - max_device_ps_sync_time = {key: 0.0 for key in feature_keys} - sum_device_ps_sync_time = {key: 0.0 for key in feature_keys} - max_group_ar_sync_time = {key: 0.0 for key in feature_keys} - sum_group_ar_sync_time = {key: 0.0 for key in feature_keys} - for key in feature_keys: - max_device_ps_sync_time[key] = max([d[key] for d in device_ps_sync_time.values()] or [0.0]) - sum_device_ps_sync_time[key] = sum([d[key] for d in device_ps_sync_time.values()] or [0.0]) - max_group_ar_sync_time[key] = max([d[key] for d in group_ar_sync_time.values()] or [0.0]) - sum_group_ar_sync_time[key] = 
sum([d[key] for d in group_ar_sync_time.values()] or [0.0]) - - feat = [max_device_ps_sync_time[key] for key in feature_keys] \ - + [sum_device_ps_sync_time[key] for key in feature_keys] \ - + [max_group_ar_sync_time[key] for key in feature_keys] \ - + [sum_group_ar_sync_time[key] for key in feature_keys] - - return feat - - - - - # def predefined_sync_time(self, strategy, resource_spec): - # """ graph_item: transformed graph item """ - # vars, resource = self.preprocess(strategy=strategy, resource_spec=resource_spec) - # # Compute synchronization time for every var - # var_sync_time = {} - # for var_name, var in vars.items(): - # if isinstance(var.synchronizer, PSSynchronizer): - # var_sync_time[var_name] = self.var_ps_time(var, resource) - # elif isinstance(var.synchronizer, AllReduceSynchronizer): - # var_sync_time[var_name] = self.var_ar_time(var, resource) - # else: - # raise ValueError('{}'.format(type(var.synchronizer))) - # return var_sync_time, vars, resource - - def var_ps_time(self, var_item, - resource_item, - network_overhead=0.0, - gpu_kernel_memory_latency=0.0): + resource_item): """ - Estimate the synchronization time of a variable using PS synchronizer. + Estimate the synchronization time of a variable that uses PS synchronizer. Args: - var_item: - resource_item: - network_overhead: - gpu_kernel_memory_latency: + var_item: the variable meta information. + resource_item: the resource meta information. Returns: - tuple(dict) + tuple(Dict): a dict of potential impacting factors for send and recv time, respectively. """ bits_to_transfer = var_item.bits_to_transfer(self._batch_size_per_gpu, self._seq_len) - + placement = var_item.device + p2p_bandwidth = resource_item.p2p_bandwidth + max_num_local_gpu_replica = resource_item.max_num_local_gpu_replica num_local_replica_on_each_worker = [resource_item.num_local_gpu_replica_on(host) for host in resource_item.cpu_replicas] if var_item.is_sparse: - send_time = self._estimate_ps_time(var_item, - resource_item, + send_time = self._estimate_ps_time(bits_to_transfer, + placement, + p2p_bandwidth, + max_num_local_gpu_replica, resource_item.cpu_replicas, num_local_replica_on_each_worker) - recv_time = self._estimate_ps_time(var_item, - resource_item, + recv_time = self._estimate_ps_time(bits_to_transfer, + placement, + p2p_bandwidth, + max_num_local_gpu_replica, resource_item.gpu_replicas, [1.0] * len(resource_item.gpu_replicas)) else: - send_time = self._estimate_ps_time(var_item, - resource_item, + # In AutoDist, the gradients are always locally accumulated then SENT to parameter server. + send_time = self._estimate_ps_time(bits_to_transfer, + placement, + p2p_bandwidth, + max_num_local_gpu_replica, resource_item.cpu_replicas, [1.0] * len(resource_item.cpu_replicas)) + # The communication overhead of receiving parameters from PS depends on `local_replication`. 
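+            # With local replication, each host keeps a CPU-side proxy of the variable, so updated
+            # parameters are pulled from the PS once per host (modeled over cpu_replicas); otherwise
+            # every GPU replica fetches from the PS directly (modeled over gpu_replicas).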
if var_item.local_replication: - recv_time = self._estimate_ps_time(var_item, - resource_item, + recv_time = self._estimate_ps_time(bits_to_transfer, + placement, + p2p_bandwidth, + max_num_local_gpu_replica, resource_item.cpu_replicas, [1.0] * len(resource_item.cpu_replicas)) else: - recv_time = self._estimate_ps_time(var_item, - resource_item, + recv_time = self._estimate_ps_time(bits_to_transfer, + placement, + p2p_bandwidth, + max_num_local_gpu_replica, resource_item.gpu_replicas, [1.0] * len(resource_item.gpu_replicas)) return send_time, recv_time - def _estimate_ps_time(self, - var_item, - resource_item, + @staticmethod + def _estimate_ps_time(bits_to_transfer, + placement, + p2p_bandwidth, + max_num_local_gpu_replica, virtual_worker_list, virtual_num_local_replica): """ Estimate the send or receive time of a ps and return multiple impacting factors. Args: - var_item: the variable whose communication time will be estimated. - resource_item: + bits_to_transfer: the variable whose communication time will be estimated. + placement: the placement of the variable. + p2p_bandwidth: point-to-point bandwidth between divices of the cluster. + max_num_local_gpu_replica: the maximum number of on a single node across the cluster. virtual_worker_list: A list of virtual workers (could be actual gpu workers, or virtual cpu worker). virtual_num_local_replica: A list of integers indicating the number of local replica on each virtual worker. @@ -264,201 +261,63 @@ def _estimate_ps_time(self, # values of arguments. # TODO(Hao): didn't consider any parallelization between variables or partitions. for k, worker in enumerate(virtual_worker_list): - if not on_same_host(var_item.device, worker): - bits_on_this_worker = var_item.bits_to_transfer(self._batch_size_per_gpu, self._seq_len) * \ - virtual_num_local_replica[k] - bandwidth = min(resource_item.p2p_bandwidth[var_item.device][worker], - resource_item.p2p_bandwidth[worker][var_item.device]) + if not on_same_host(placement, worker): + bits_on_this_worker = bits_to_transfer * virtual_num_local_replica[k] + bandwidth = min(p2p_bandwidth[placement][worker], p2p_bandwidth[worker][placement]) transmission_time += bits_on_this_worker / bandwidth - - factors = { 'transmission': transmission_time, 'network_overhead': len(virtual_worker_list), - 'gpu_kernel_memory_latency': resource_item.max_num_local_gpu_replica, + 'gpu_kernel_memory_latency': max_num_local_gpu_replica, # TODO(Hao): Is this correct? 'constant': 1.0 } return factors - def var_ar_time(self, var, resource, network_overhead=0.0, gpu_kernel_memory_latency=0.0, get_coef=False): - """Compute synchronization time of a variable in AR strategy.""" - worker_list = resource.cpu_worker_list - num_workers = len(worker_list) - min_bandwidth = None - for i in range(num_workers): - for j in range(i, num_workers): - if min_bandwidth is None: - min_bandwidth = resource.network_bandwidth[worker_list[j]][worker_list[i]] - else: - min_bandwidth = min(min_bandwidth, resource.network_bandwidth[worker_list[j]][worker_list[i]]) - - # Compressor - if var.compressor == "PowerSGDCompressor" or var.compressor == 3: - rank = 10 # currently using default value. So hardcode here. # todo: confirm - # assume var must be a dense variable. 
-            og_shape = var.shape
-            ndims = len(og_shape)
-            if ndims <= 1:  # no compress
-                size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu,
-                                                        seq_len=self._seq_len)
-            else:
-                if ndims > 2:
-                    n = og_shape[0]
-                    m = 1
-                    for s in og_shape[1:]:
-                        m *= s  # tensor's shape (n, m)
-                else:
-                    n, m = og_shape[0], og_shape[1]
-                size_to_transfer = n * rank + m * rank
-            dtype = tf.float32
-        elif var.compressor == "HorovodCompressorEF" or var.compressor == "HorovodCompressor" \
-                or var.compressor == 2 or var.compressor == 1:
-            size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu,
-                                                    seq_len=self._seq_len)
-            dtype = tf.float32
-        elif var.compressor == "NoneCompressor" or var.compressor == 0:
-            size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu,
-                                                    seq_len=self._seq_len)
-            dtype = var.dtype
-        else:
-            raise ValueError('Compressor does not exist: {}'.format(var.compressor))
-
-        # todo: chunk_size
-        # AllReduce communication time
-        # time = 2 * (num_workers - 1) * get_dense_var_bits(size_to_transfer, dtype) / (min_bandwidth * num_workers)
-        time = get_dense_var_bits(size_to_transfer, dtype) / min_bandwidth
-
-        if get_coef:
-            return {
-                'transmission': time,
-                'network_overhead': 1,  # len(worker_list),
-                'gpu_kernel_memory_latency': resource.max_num_local_replica,
-                'constant': 1.0,
-                # possible affecting factors.
-                'var_name': var.name,
-                'group': var.synchronizer.group,
-                'strategy': 'allreduce',
-                'is_sparse': False,
-                # 'chunk_size': chunk_size,
-                'spec': 'NCCL',  # default
-                'compressor': var.compressor,
-                'worker_list': worker_list,
-                'num_workers': num_workers,
-                'size_to_transfer': size_to_transfer,
-                'dtype': str(dtype),
-                'min_bandwidth': min_bandwidth,
-                'max_num_local_replica': resource.max_num_local_replica,
-                'is_ps': False,
-            }
-        else:
-            return time + network_overhead * len(worker_list) \
-                   + gpu_kernel_memory_latency * resource.max_num_local_replica
+    def var_ar_time(self,
+                    var_item,
+                    resource_item,
+                    powersgd_rank=10):
+        """
+        Estimate the synchronization time of a variable that uses a collective (AllReduce) synchronizer.
+
+        Due to current limitations, we only consider dense variables for now.
+
+        Args:
+            var_item: the variable meta information.
+            resource_item: the resource meta information.
+            powersgd_rank: the rank used by the PowerSGD compressor (ignored for other compressors).
+
+        Returns:
+            Dict: a dictionary of impacting factors.
+        """
+        # Address cases for different types of compressors
+        if var_item.compressor not in ['PowerSGDCompressor', 'HorovodCompressorEF', 'HorovodCompressor',
+                                       'NoneCompressor', 0, 1, 2, 3]:
+            raise ValueError('Compressor type not recognized: {}'.format(var_item.compressor))
+        size_to_transfer = var_item.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu,
+                                                     seq_len=self._seq_len)
+        dtype = var_item.dtype
 
-# @staticmethod
-# def var_ps_time(var_name, is_sparse, local_proxy, server_list, cpu_worker_list, gpu_worker_list,
-#                 max_num_local_replica, worker_num_replicas, network_bandwidth, get_coef,
-#                 network_overhead=0.0, gpu_kernel_memory_latency=0.0):
-#     """Compute synchrinzation time of a variable in PS strategy."""
-#
-#     def _helper(worker_list, worker_num_replicas=None):
-#         if worker_num_replicas is None:
-#             worker_num_replicas = [1.0] * len(worker_list)
-#         # Compute the slowest server
-#         slowest_server_time = 0
-#         for j, server in enumerate(server_list):
-#             if server.size_to_transfer == 0:
-#                 continue
-#             # network transfer: sum up all workers time. equals to the time cost of this server.
-#             this_server_time = 0
-#             for k, worker in enumerate(worker_list):
-#                 if _resolved_devices_on_diff_machine(server.device, worker):
-#                     if is_sparse:
-#                         this_worker_size = get_sparse_var_bits(server.size_to_transfer) * worker_num_replicas[k]
-#                     else:
-#                         this_worker_size = get_dense_var_bits(server.size_to_transfer, server.dtype)
-#                     this_server_time += this_worker_size / network_bandwidth[server.device][worker]
-#             slowest_server_time = max(slowest_server_time, this_server_time)
-#
-#         if get_coef:
-#             return {
-#                 'transmission': slowest_server_time,
-#                 'network_overhead': len(worker_list),
-#                 'gpu_kernel_memory_latency': max_num_local_replica,
-#                 'constant': 1.0,
-#                 # possible affecting factors.
-#                 'var_name': var_name,
-#                 'strategy': 'ps',
-#                 'local_proxy': local_proxy,
-#                 'is_sparse': is_sparse,
-#                 'server_list': [partition.to_dict() for partition in server_list],
-#                 'worker_list': worker_list,
-#                 'cpu_worker_list': cpu_worker_list,
-#                 'gpu_worker_list': gpu_worker_list,
-#                 'worker_num_replicas': worker_num_replicas,
-#                 'max_num_local_replica': max_num_local_replica,
-#             }
-#         else:
-#             return slowest_server_time + len(worker_list) * network_overhead + \
-#                    gpu_kernel_memory_latency * max_num_local_replica
-#
-#     if is_sparse:
-#         send_time = _helper(cpu_worker_list, worker_num_replicas=worker_num_replicas)
-#         receive_time = _helper(gpu_worker_list)
-#     else:
-#         send_time = _helper(cpu_worker_list)
-#         if local_proxy:
-#             receive_time = _helper(cpu_worker_list)
-#         else:
-#             receive_time = _helper(gpu_worker_list)
-#
-#     if get_coef:
-#         # return {key: send_time[key]+receive_time[key] for key in send_time.keys()}
-#         return send_time, receive_time
-#     else:
-#         return send_time, receive_time
-
-
-
-
-    # def _helper(worker_list, worker_num_replicas=None):
-    #     if worker_num_replicas is None:
-    #         worker_num_replicas = [1.0] * len(worker_list)
-    #
-    #     this_server_time = 0
-    #     # network transfer: sum up all workers time. equals to the time cost of this server.
-    #     # TODO(Hao): didn't consider any parallelization among partitions
-    #     for k, worker in enumerate(worker_list):
-    #         if _resolved_devices_on_diff_machine(var.device, worker):
-    #             if var.is_sparse:
-    #                 this_worker_size = get_sparse_var_bits(var_size_to_transfer) * worker_num_replicas[k]
-    #             else:
-    #                 this_worker_size = get_dense_var_bits(var_size_to_transfer, var.dtype)
-    #             this_server_time += this_worker_size / resource.network_bandwidth[var.device][worker]
-    #
-    #     if get_coef:
-    #         return {
-    #             'transmission': this_server_time,
-    #             'network_overhead': len(worker_list),
-    #             'gpu_kernel_memory_latency': resource.max_num_local_replica,
-    #             'constant': 1.0,
-    #             # possible affecting factors.
-    #             'var_name': var.name,
-    #             'strategy': 'ps',
-    #             'local_proxy': var.synchronizer.local_replication,
-    #             'is_sparse': var.is_sparse,
-    #             'size_to_transfer': var_size_to_transfer,
-    #             'dtype': str(var.dtype),
-    #             # 'server_list': [partition.to_dict() for partition in server_list],
-    #             'worker_list': worker_list,
-    #             'cpu_worker_list': resource.cpu_worker_list,
-    #             'gpu_worker_list': resource.gpu_worker_list,
-    #             'worker_num_replicas': worker_num_replicas,
-    #             'max_num_local_replica': resource.max_num_local_replica,
-    #             'is_ps': True,
-    #         }
-    #     else:
-    #         return this_server_time + len(worker_list) * network_overhead + \
-    #                gpu_kernel_memory_latency * resource.max_num_local_replica
\ No newline at end of file
+        if var_item.compressor in ['PowerSGDCompressor', 3, "HorovodCompressorEF", "HorovodCompressor", 1, 2]:
+            # These compressors always use float32 to communicate.
+            dtype = tf.float32
+        if var_item.compressor in ["PowerSGDCompressor", 3]:
+            # For PowerSGDCompressor, the rank defaults to 10 (the powersgd_rank argument); it always
+            # communicates in float32.
+            if len(var_item.shape) > 1:
+                n = var_item.shape[0]
+                m = 1
+                for d in var_item.shape[1:]:
+                    m *= d
+                size_to_transfer = (m + n) * powersgd_rank
+
+        # We assume ring allreduce, and multiple rings will be constructed and executed serially to synchronize grads.
+        # In one ring, each worker exchanges grads with its next worker in parallel. Hence, the time a single ring
+        # completes is bounded by the slowest pair of workers; the total time spent for all workers to synchronize
+        # grads is bounded by the time all rings finish on the slowest pair of workers.
+        transmission_time = size_to_transfer * get_dtype_bits(dtype) / resource_item.min_bandwidth
+        factors = {
+            'transmission': transmission_time,
+            'network_overhead': 1,  # TODO(Hao): is this correct?
+            'gpu_kernel_memory_latency': resource_item.max_num_local_gpu_replica,
+            'constant': 1.0
+        }
+        return factors
diff --git a/autodist/strategy/auto/item.py b/autodist/strategy/auto/item.py
index 263ebba..b73d22c 100644
--- a/autodist/strategy/auto/item.py
+++ b/autodist/strategy/auto/item.py
@@ -186,7 +186,7 @@ def bits_to_transfer(self, batch_size_per_gpu=1, seq_len=1):
                    batch_size_per_gpu * seq_len * self.size / self.original_size * get_dtype_bits(tf.int64) + \
                    2 * get_dtype_bits(tf.int64)
             return bits
-        else: # Tensor
+        else:  # Tensor
             return s * get_dtype_bits(self.dtype)
 
     @property
@@ -247,6 +247,21 @@ def synchronizer(self):
             return None
         return getattr(self._node_config, self._node_config.WhichOneOf('synchronizer'))
 
+    @property
+    def group(self):
+        """
+        Return the group in the node config of this variable.
+
+        Returns:
+            int: group
+        """
+        if not self._node_config:
+            raise ValueError('Node config is unset.')
+        if self._node_config.partitioner:
+            logging.warning('This variable will be partitioned')
+            return None
+        return getattr(self.synchronizer, 'group', 0)
+
     @property
     def compressor(self):
         """
@@ -379,6 +394,20 @@ def synchronizer(self):
             raise ValueError('Partitioner field is empty for a variable partition.')
         return getattr(self._node_config, self._node_config.WhichOneOf('synchronizer'))
 
+    @property
+    def group(self):
+        """
+        Return the group in the node config of this variable.
+
+        Returns:
+            int: group
+        """
+        if not self._node_config:
+            raise ValueError('Node config is unset.')
+        if not self._node_config.partitioner:
+            raise ValueError('Partitioner field is empty for a variable partition.')
+        return getattr(self.synchronizer, 'group', 0)
+
     @property
     def compressor(self):
         """

From ef66bf7ab87e21303ffa25b908aada2e3efb33b4 Mon Sep 17 00:00:00 2001
From: Hao Zhang
Date: Sun, 9 Aug 2020 18:54:24 -0400
Subject: [PATCH 10/11] linear simulator done.
Move simulation/search code under autosync scope --- autodist/{search => autosync}/__init__.py | 0 .../search}/__init__.py | 0 .../{ => autosync}/search/random_search.py | 0 autodist/autosync/simulator/__init__.py | 0 autodist/{ => autosync}/simulator/base.py | 36 +- .../autosync/simulator/linear_simulator.py | 193 +++++++++ .../simulator/predefined_simulator.py | 36 +- .../simulator/rankrnn_simulator.py | 0 .../{ => autosync}/simulator/train_linear.py | 0 .../simulator/train_predefined_simulator.py | 0 autodist/{ => autosync}/simulator/utils.py | 0 autodist/simulator/linear_simulator.py | 387 ------------------ autodist/strategy/auto/item.py | 2 +- autodist/strategy/auto_strategy.py | 2 +- test.py | 53 ++- 15 files changed, 267 insertions(+), 442 deletions(-) rename autodist/{search => autosync}/__init__.py (100%) rename autodist/{simulator => autosync/search}/__init__.py (100%) rename autodist/{ => autosync}/search/random_search.py (100%) create mode 100644 autodist/autosync/simulator/__init__.py rename autodist/{ => autosync}/simulator/base.py (86%) create mode 100644 autodist/autosync/simulator/linear_simulator.py rename autodist/{ => autosync}/simulator/predefined_simulator.py (98%) rename autodist/{ => autosync}/simulator/rankrnn_simulator.py (100%) rename autodist/{ => autosync}/simulator/train_linear.py (100%) rename autodist/{ => autosync}/simulator/train_predefined_simulator.py (100%) rename autodist/{ => autosync}/simulator/utils.py (100%) delete mode 100644 autodist/simulator/linear_simulator.py diff --git a/autodist/search/__init__.py b/autodist/autosync/__init__.py similarity index 100% rename from autodist/search/__init__.py rename to autodist/autosync/__init__.py diff --git a/autodist/simulator/__init__.py b/autodist/autosync/search/__init__.py similarity index 100% rename from autodist/simulator/__init__.py rename to autodist/autosync/search/__init__.py diff --git a/autodist/search/random_search.py b/autodist/autosync/search/random_search.py similarity index 100% rename from autodist/search/random_search.py rename to autodist/autosync/search/random_search.py diff --git a/autodist/autosync/simulator/__init__.py b/autodist/autosync/simulator/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/autodist/simulator/base.py b/autodist/autosync/simulator/base.py similarity index 86% rename from autodist/simulator/base.py rename to autodist/autosync/simulator/base.py index 19b965d..5ac04b7 100644 --- a/autodist/simulator/base.py +++ b/autodist/autosync/simulator/base.py @@ -70,22 +70,10 @@ def simulate(self, resource_spec=None, *args, **kwargs): - """ - Return simulated runtime cost given (strategy, graph_item, resource_spec) tuple. - - Args: - strategy: - graph_item: - resource_spec: - checkpoint: - - Returns: - float - """ + """Return simulated runtime cost given (strategy, graph_item, resource_spec) tuple.""" raise NotImplementedError() - def inference(self, - features): + def inference(self, *args, **kwargs): """ Abstract method for simulator inference. @@ -98,7 +86,7 @@ def inference(self, """ raise NotImplementedError() - def load_checkpoint(self, checkpoint=None): + def load_checkpoint(self, checkpoint): """ Load a checkpoint file as weights of the simulator. @@ -107,15 +95,15 @@ def load_checkpoint(self, checkpoint=None): """ raise NotImplementedError() - def save_checkpoint(self, model, checkpoint): - """ - Save a trained weight as a checkpoint file. - - Args: - model: trained model. - checkpoint: path where to save the checkpoint. 
-        """
-        raise NotImplementedError()
+    # def save_checkpoint(self, model, checkpoint):
+    #     """
+    #     Save a trained weight as a checkpoint file.
+    #
+    #     Args:
+    #         model: trained model.
+    #         checkpoint: path where to save the checkpoint.
+    #     """
+    #     raise NotImplementedError()
 
     def preprocess(self,
                    strategy,
diff --git a/autodist/autosync/simulator/linear_simulator.py b/autodist/autosync/simulator/linear_simulator.py
new file mode 100644
index 0000000..d0ee310
--- /dev/null
+++ b/autodist/autosync/simulator/linear_simulator.py
@@ -0,0 +1,193 @@
+# Copyright 2020 Petuum Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Predefined simulator with linear model."""
+import os
+import pickle as pkl
+
+import tensorflow as tf
+import numpy as np
+
+from autodist.autosync.simulator.predefined_simulator import PredefinedSimulator
+from autodist.proto.synchronizers_pb2 import PSSynchronizer, AllReduceSynchronizer
+from autodist.utils import logging
+
+
+class LinearSimulator(PredefinedSimulator):
+    """Simulates strategies for a given graph and resource spec."""
+
+    def __init__(self,
+                 graph_item=None,
+                 resource_spec=None,
+                 batch_size=1,
+                 seq_len=1,
+                 checkpoint=None):
+        super(PredefinedSimulator, self).__init__(graph_item, resource_spec)
+        logging.debug('A LinearSimulator is instantiated: batch_size_per_gpu is {}'.format(batch_size))
+
+        self._batch_size_per_gpu = batch_size
+        self._seq_len = seq_len
+
+        # For loading weights of the linear model.
+        self._checkpoint = checkpoint
+        self._weights = None
+        if self._checkpoint:
+            try:
+                self._weights = self.load_checkpoint(checkpoint)
+            except ValueError:
+                logging.warning('Failed to load checkpoint: {}'.format(checkpoint))
+
+        # TODO(Hao): add the default weights here.
+        self._default_weights = None
+
+    def simulate(self,
+                 strategy,
+                 graph_item=None,
+                 resource_spec=None,
+                 checkpoint=None,
+                 *args,
+                 **kwargs):
+        """Return simulated runtime cost given (strategy, graph_item, resource_spec) tuple.
+
+        Args:
+            strategy: the strategy to simulate.
+            graph_item: the graph_item this strategy is generated on.
+            resource_spec: the resource_spec this strategy is on.
+            checkpoint: the checkpoint to perform inference (in place of the default weights).
+
+        Returns:
+            float: the estimated cost (lower is better).
+        """
+        if not strategy:
+            raise ValueError('strategy is None.')
+        if not graph_item:
+            if not self._graph_item:
+                raise ValueError('No graph item provided.')
+            else:
+                graph_item = self._graph_item
+        if not resource_spec:
+            if not self._resource_spec:
+                raise ValueError('No resource spec provided.')
+            else:
+                resource_spec = self._resource_spec
+
+        x = self._extract_feature(strategy, graph_item, resource_spec)
+
+        # The checkpoint lookup priority is:
+        # simulate(checkpoint) > self._weights > self._default_weights
+        if checkpoint:
+            weights = self.load_checkpoint(checkpoint)
+        elif self._weights:
+            weights = self._weights
+        else:
+            weights = self._default_weights
+        if weights is None:
+            raise ValueError('No checkpoint provided in either initialization or inference.')
+
+        cost = self.inference(np.array(x), weights)
+        return cost
+
+    def inference(self, x, weights):
+        """
+        Predict the simulated cost of a strategy from its extracted feature vector.
+
+        Args:
+            x: features extracted from a (strategy, graph_item, resource_spec) tuple.
+            weights: trained linear model weights [W, b].
+
+        Returns:
+            float: ranking score.
+        """
+        assert len(weights) == 2
+        W, b = weights
+        # Linear model: the predicted cost is the inner product of the weights and the features, plus a bias.
+        cost = float(np.dot(np.ravel(x), np.ravel(W)) + np.ravel(b)[0])
+        return cost
+
+    def load_checkpoint(self, checkpoint):
+        """
+        Load a trained weight from a checkpoint.
+
+        Args:
+            checkpoint: the file path to a npz, or a list/array of weights.
+
+        Returns:
+            list: the loaded weights [W, b].
+        """
+        logging.info('Loading checkpoint: {}'.format(checkpoint))
+        if isinstance(checkpoint, list):
+            assert len(checkpoint) in (2, 13)
+            if len(checkpoint) == 13:
+                # 12 feature weights followed by a single bias term.
+                checkpoint = checkpoint[:12], checkpoint[12]
+            return checkpoint
+        elif isinstance(checkpoint, str) and os.path.isfile(checkpoint):
+            weights = np.load(checkpoint)
+            return weights['W'], weights['b']
+        else:
+            raise ValueError('Unable to load the checkpoint: {}'.format(checkpoint))
+
+    def _extract_feature(self,
+                         strategy,
+                         graph_item,
+                         resource_spec):
+        """Get the feature vector as input to the linear model."""
+        var_name_to_items, resource_item, var_name_to_sync_time = \
+            self.extract_prefeature(strategy, graph_item, resource_spec)
+
+        feature_keys = ['transmission', 'network_overhead', 'gpu_kernel_memory_latency']
+        ps_server_sync_time = {}
+        cc_group_sync_time = {}
+
+        for var_name, var_item in var_name_to_items.items():
+            sync_time = var_name_to_sync_time[var_name]
+
+            # Extract per-server and per-group sync time.
+            if isinstance(var_item.synchronizer, PSSynchronizer):
+                server = var_item.device
+                if server not in ps_server_sync_time:
+                    ps_server_sync_time[server] = {key: 0.0 for key in feature_keys}
+                for key in feature_keys:
+                    ps_server_sync_time[server][key] += sync_time[0][key] + sync_time[1][key]
+            elif isinstance(var_item.synchronizer, AllReduceSynchronizer):
+                group = var_item.group
+                if group not in cc_group_sync_time:
+                    cc_group_sync_time[group] = {key: 0.0 for key in feature_keys}
+                for key in feature_keys:
+                    cc_group_sync_time[group][key] += sync_time[key]
+            else:
+                raise ValueError('Unrecognized type of synchronizer: {}'.format(type(var_item.synchronizer)))
+
+        # Different from predefined modeling, we transform these into feature vectors in this simulator.
+        # We care about the sum time of all servers/groups, or the slowest (max) server/group.
+ max_ps_server_sync_time = {key: 0.0 for key in feature_keys} + sum_ps_server_sync_time = {key: 0.0 for key in feature_keys} + max_cc_group_sync_time = {key: 0.0 for key in feature_keys} + sum_cc_group_sync_time = {key: 0.0 for key in feature_keys} + + for key in feature_keys: + max_ps_server_sync_time[key] = \ + max([sync_time[key] for sync_time in ps_server_sync_time.values()] or [0.0]) + sum_ps_server_sync_time[key] = \ + sum([sync_time[key] for sync_time in ps_server_sync_time.values()] or [0.0]) + max_cc_group_sync_time[key] = \ + max([sync_time[key] for sync_time in cc_group_sync_time.values()] or [0.0]) + sum_cc_group_sync_time[key] = \ + sum([sync_time[key] for sync_time in cc_group_sync_time.values()] or [0.0]) + + # concat them to get the feature. + x = [max_ps_server_sync_time[key] for key in feature_keys] + \ + [sum_ps_server_sync_time[key] for key in feature_keys] + \ + [max_cc_group_sync_time[key] for key in feature_keys] + \ + [sum_cc_group_sync_time[key] for key in feature_keys] + return x diff --git a/autodist/simulator/predefined_simulator.py b/autodist/autosync/simulator/predefined_simulator.py similarity index 98% rename from autodist/simulator/predefined_simulator.py rename to autodist/autosync/simulator/predefined_simulator.py index 0e3f60b..b05b50d 100644 --- a/autodist/simulator/predefined_simulator.py +++ b/autodist/autosync/simulator/predefined_simulator.py @@ -19,9 +19,8 @@ import tensorflow as tf from autodist.proto.synchronizers_pb2 import PSSynchronizer, AllReduceSynchronizer -from autodist.simulator.base import SimulatorBase -from autodist.simulator.utils import on_same_host, \ - get_dtype_bits +from autodist.autosync.simulator.base import SimulatorBase +from autodist.autosync.simulator.utils import on_same_host, get_dtype_bits from autodist.utils import logging @@ -82,6 +81,19 @@ def simulate(self, Returns: float: the estimated runtime (lower is better). """ + if not strategy: + raise ValueError('strategy is None.') + if not graph_item: + if not self._graph_item: + raise ValueError('No graph item provided.') + else: + graph_item = self._graph_item + if not resource_spec: + if not self._resource_spec: + raise ValueError('No resource spec provided.') + else: + resource_spec = self._resource_spec + var_name_to_items, resource_item, var_name_to_sync_time = \ self.extract_prefeature(strategy, graph_item, resource_spec) @@ -135,8 +147,8 @@ def simulate(self, def extract_prefeature(self, strategy, - graph_item=None, - resource_spec=None): + graph_item, + resource_spec): """ Extract impacting factors of the communication time for each variable. @@ -148,18 +160,6 @@ def extract_prefeature(self, Returns: Dict: A dict of variable name (str) to impacting factors (dict). """ - if not strategy: - raise ValueError('strategy is None.') - if not graph_item: - if not self._graph_item: - raise ValueError('No graph item provided.') - else: - graph_item = self._graph_item - if not resource_spec: - if not self._resource_spec: - raise ValueError('No resource spec provided.') - else: - resource_spec = self._resource_spec # TODO(Hao): need to make sure the (strategy, graph_item, resource_spec) match each other. 
# construct the meta objects name_to_items, resource_item = self.preprocess(strategy, graph_item, resource_spec) @@ -173,7 +173,7 @@ def extract_prefeature(self, var_sync_time[var_name] = self.var_ar_time(var_item, resource_item) else: raise ValueError('{}'.format(type(var_item.synchronizer))) - return var_sync_time + return name_to_items, resource_item, var_sync_time def var_ps_time(self, var_item, diff --git a/autodist/simulator/rankrnn_simulator.py b/autodist/autosync/simulator/rankrnn_simulator.py similarity index 100% rename from autodist/simulator/rankrnn_simulator.py rename to autodist/autosync/simulator/rankrnn_simulator.py diff --git a/autodist/simulator/train_linear.py b/autodist/autosync/simulator/train_linear.py similarity index 100% rename from autodist/simulator/train_linear.py rename to autodist/autosync/simulator/train_linear.py diff --git a/autodist/simulator/train_predefined_simulator.py b/autodist/autosync/simulator/train_predefined_simulator.py similarity index 100% rename from autodist/simulator/train_predefined_simulator.py rename to autodist/autosync/simulator/train_predefined_simulator.py diff --git a/autodist/simulator/utils.py b/autodist/autosync/simulator/utils.py similarity index 100% rename from autodist/simulator/utils.py rename to autodist/autosync/simulator/utils.py diff --git a/autodist/simulator/linear_simulator.py b/autodist/simulator/linear_simulator.py deleted file mode 100644 index 5dc2e6b..0000000 --- a/autodist/simulator/linear_simulator.py +++ /dev/null @@ -1,387 +0,0 @@ -# Copyright 2020 Petuum Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Predefined simulator with linear model.""" - -import pickle as pkl - -import tensorflow as tf -from tensorflow.python.eager import context - -from autodist.proto.synchronizers_pb2 import PSSynchronizer, AllReduceSynchronizer -from autodist.resource_spec import ResourceSpec -from autodist.simulator.base import SimulatorBase -from autodist.simulator.utils import _resolved_devices_on_diff_machine, \ - get_dense_var_bits, get_sparse_var_bits -from autodist.strategy.base import Strategy - - -class LinearSimulator(SimulatorBase): - """Simulates strategies for a given graph and resource spec.""" - - def __init__(self, - graph_item=None, - resource_spec=None, - batch_size=1, - seq_len=1, - get_coef=True, - checkpoint=None): - - super(PredefinedSimulator, self).__init__(original_graph_item_path=original_graph_item_path) - - print("It's using predefined simulator. 
batch_size_per_gpu is {}".format(batch_size)) - self._fetches = fetches - self._batch_size_per_gpu = batch_size - self._seq_len = seq_len - self._get_coef = get_coef - self._checkpoint = checkpoint - self._weights = None - with context.eager_mode(): - if self._checkpoint: - self._weights = self.load_checkpoint(self._checkpoint) - - def simulate(self, strategy: Strategy, resource_spec: ResourceSpec, checkpoint=None): - """Return simulated runtime value.""" - inputs = self.create_features(strategy, resource_spec) - with context.eager_mode(): - cost = self.inference(inputs, checkpoint) - return cost - - def inference(self, inputs, checkpoint=None): - if checkpoint is not None: - weights = self.load_checkpoint(checkpoint) - elif self._weights is not None: - weights = self._weights - else: - raise ValueError("No checkpoint provided in either initialization or inference.") - - if not isinstance(inputs, tf.Tensor): - inputs = tf.reshape(tf.convert_to_tensor(inputs), [1, len(inputs)]) - - if len(weights) == 4: - W0, b0, W, b = weights - inputs = tf.nn.elu(tf.matmul(inputs, W0) + b0) - cost = tf.matmul(inputs, W) + b - elif len(weights) == 2: - W, b = weights - cost = tf.matmul(inputs, W) + b - else: - raise ValueError - return cost - - def load_checkpoint(self, checkpoint=None): - if checkpoint is None: - if self._checkpoint is not None: - checkpoint = self._checkpoint - else: - raise ValueError("checkpoint is None: {}".format(checkpoint)) - self._weights = pkl.load(open(checkpoint, 'rb')) - # self._weights = json.load(open(checkpoint, 'r')) - print("Load checkpoint: ") - print(self._weights) - return self._weights - - def save_checkpoint(self, model, checkpoint): - pkl.dump(model, open(checkpoint, 'wb')) - self._checkpoint = checkpoint - self._weights = model - - def create_features_v0(self, strategy: Strategy, resource_spec: ResourceSpec): - var_sync_time, vars, resource = self.predefined_sync_time(strategy, resource_spec) - - # Add up sync time per device to find the slowest server time. 
- feature_keys = ['transmission', 'network_overhead', 'gpu_kernel_memory_latency'] - device_ps_sync_time = {} - var_ar_sync_time = {} - for var_name, sync_time in var_sync_time.items(): - if isinstance(vars[var_name].synchronizer, PSSynchronizer): - device = vars[var_name].device - if device not in device_ps_sync_time: - device_ps_sync_time[device] = {key: 0.0 for key in feature_keys} - for key in feature_keys: - device_ps_sync_time[device][key] += sync_time[0][key] + sync_time[1][key] - - else: # AllReduce - if var_name not in var_ar_sync_time: - var_ar_sync_time[var_name] = {key: 0.0 for key in feature_keys} - for key in feature_keys: - var_ar_sync_time[var_name][key] += sync_time[key] - - max_device_ps_sync_time = {key: 0.0 for key in feature_keys} - sum_device_ps_sync_time = {key: 0.0 for key in feature_keys} - sum_var_ar_sync_time = {key: 0.0 for key in feature_keys} - for key in feature_keys: - max_device_ps_sync_time[key] = max([d[key] for d in device_ps_sync_time.values()] or [0.0]) - sum_device_ps_sync_time[key] = sum([d[key] for d in device_ps_sync_time.values()] or [0.0]) - sum_var_ar_sync_time[key] = sum([d[key] for d in var_ar_sync_time.values()] or [0.0]) - - feat = [max_device_ps_sync_time[key] for key in feature_keys] \ - + [sum_device_ps_sync_time[key] for key in feature_keys] \ - + [sum_var_ar_sync_time[key] for key in feature_keys] - - return feat - - def create_features(self, strategy: Strategy, resource_spec: ResourceSpec): - # var_sync_time, vars, resource = self.predefined_sync_time(strategy, resource_spec) - - vars, resource = self.preprocess(strategy=strategy, resource_spec=resource_spec) - - feature_keys = ['transmission', 'network_overhead', 'gpu_kernel_memory_latency'] - device_ps_sync_time = {} - group_ar_sync_time = {} - - for var_name, var in vars.items(): - if isinstance(var.synchronizer, PSSynchronizer): - sync_time = self.var_ps_time(var, resource) - device = vars[var_name].device - if device not in device_ps_sync_time: - device_ps_sync_time[device] = {key: 0.0 for key in feature_keys} - for key in feature_keys: - device_ps_sync_time[device][key] += sync_time[0][key] + sync_time[1][key] - elif isinstance(var.synchronizer, AllReduceSynchronizer): - sync_time = self.var_ar_time(var, resource) - var_group = sync_time['group'] - if var_group not in group_ar_sync_time: - group_ar_sync_time[var_group] = {key: 0.0 for key in feature_keys} - for key in feature_keys: - group_ar_sync_time[var_group][key] += sync_time[key] - else: - raise ValueError('{}'.format(type(var.synchronizer))) - - max_device_ps_sync_time = {key: 0.0 for key in feature_keys} - sum_device_ps_sync_time = {key: 0.0 for key in feature_keys} - max_group_ar_sync_time = {key: 0.0 for key in feature_keys} - sum_group_ar_sync_time = {key: 0.0 for key in feature_keys} - for key in feature_keys: - max_device_ps_sync_time[key] = max([d[key] for d in device_ps_sync_time.values()] or [0.0]) - sum_device_ps_sync_time[key] = sum([d[key] for d in device_ps_sync_time.values()] or [0.0]) - max_group_ar_sync_time[key] = max([d[key] for d in group_ar_sync_time.values()] or [0.0]) - sum_group_ar_sync_time[key] = sum([d[key] for d in group_ar_sync_time.values()] or [0.0]) - - feat = [max_device_ps_sync_time[key] for key in feature_keys] \ - + [sum_device_ps_sync_time[key] for key in feature_keys] \ - + [max_group_ar_sync_time[key] for key in feature_keys] \ - + [sum_group_ar_sync_time[key] for key in feature_keys] - - return feat - - def predefined_sync_time(self, strategy, resource_spec): - """ graph_item: 
transformed graph item """ - vars, resource = self.preprocess(strategy=strategy, resource_spec=resource_spec) - # Compute synchronization time for every var - var_sync_time = {} - for var_name, var in vars.items(): - if isinstance(var.synchronizer, PSSynchronizer): - var_sync_time[var_name] = self.var_ps_time(var, resource) - elif isinstance(var.synchronizer, AllReduceSynchronizer): - var_sync_time[var_name] = self.var_ar_time(var, resource) - else: - raise ValueError('{}'.format(type(var.synchronizer))) - return var_sync_time, vars, resource - - def var_ps_time(self, var, resource, network_overhead=0.0, gpu_kernel_memory_latency=0.0): - """Compute synchronization time of a variable in PS strategy.""" - def _helper(worker_list, worker_num_replicas=None): - if worker_num_replicas is None: - worker_num_replicas = [1.0] * len(worker_list) - - this_server_time = 0 - # network transfer: sum up all workers time. equals to the time cost of this server. - # TODO(Hao): didn't consider any parallelization among partitions - for k, worker in enumerate(worker_list): - if _resolved_devices_on_diff_machine(var.device, worker): - if var.is_sparse: - this_worker_size = get_sparse_var_bits(var_size_to_transfer) * worker_num_replicas[k] - else: - this_worker_size = get_dense_var_bits(var_size_to_transfer, var.dtype) - this_server_time += this_worker_size / resource.network_bandwidth[var.device][worker] - - if self._get_coef: - return { - 'transmission': this_server_time, - 'network_overhead': len(worker_list), - 'gpu_kernel_memory_latency': resource.max_num_local_replica, - 'constant': 1.0, - # possible affecting factors. - 'var_name': var.name, - 'strategy': 'ps', - 'local_proxy': var.synchronizer.local_replication, - 'is_sparse': var.is_sparse, - 'size_to_transfer': var_size_to_transfer, - 'dtype': str(var.dtype), - # 'server_list': [partition.to_dict() for partition in server_list], - 'worker_list': worker_list, - 'cpu_worker_list': resource.cpu_worker_list, - 'gpu_worker_list': resource.gpu_worker_list, - 'worker_num_replicas': worker_num_replicas, - 'max_num_local_replica': resource.max_num_local_replica, - 'is_ps': True, - } - else: - return this_server_time + len(worker_list) * network_overhead + \ - gpu_kernel_memory_latency * resource.max_num_local_replica - - var_size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, - seq_len=self._seq_len) - - if var.is_sparse: - send_time = _helper(resource.cpu_worker_list, worker_num_replicas=resource.worker_num_replicas) - receive_time = _helper(resource.gpu_worker_list) - else: - send_time = _helper(resource.cpu_worker_list) - if var.synchronizer.local_replication: - receive_time = _helper(resource.cpu_worker_list) - else: - receive_time = _helper(resource.gpu_worker_list) - - return send_time, receive_time - - def var_ar_time(self, var, resource, network_overhead=0.0, gpu_kernel_memory_latency=0.0): - """Compute synchronization time of a variable in AR strategy.""" - worker_list = resource.cpu_worker_list - num_workers = len(worker_list) - min_bandwidth = None - for i in range(num_workers): - for j in range(i, num_workers): - if min_bandwidth is None: - min_bandwidth = resource.network_bandwidth[worker_list[j]][worker_list[i]] - else: - min_bandwidth = min(min_bandwidth, resource.network_bandwidth[worker_list[j]][worker_list[i]]) - - # Compressor - if var.compressor == "PowerSGDCompressor" or var.compressor == 3: - rank = 10 # currently using default value. So hardcode here. 
# todo: confirm - # assume var must be a dense variable. - og_shape = var.shape - ndims = len(og_shape) - if ndims <= 1: # no compress - size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, - seq_len=self._seq_len) - else: - if ndims > 2: - n = og_shape[0] - m = 1 - for s in og_shape[1:]: - m *= s # tensor's shape (n, m) - else: - n, m = og_shape[0], og_shape[1] - size_to_transfer = n * rank + m * rank - dtype = tf.float32 - elif var.compressor == "HorovodCompressorEF" or var.compressor == "HorovodCompressor" \ - or var.compressor == 2 or var.compressor == 1: - size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, - seq_len=self._seq_len) - dtype = tf.float32 - elif var.compressor == "NoneCompressor" or var.compressor == 0: - size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, - seq_len=self._seq_len) - dtype = var.dtype - else: - raise ValueError('Compressor does not exist: {}'.format(var.compressor)) - - # todo: chunk_size - # AllReduce communication time - # time = 2 * (num_workers - 1) * get_dense_var_bits(size_to_transfer, dtype) / (min_bandwidth * num_workers) - time = get_dense_var_bits(size_to_transfer, dtype) / min_bandwidth - - if self._get_coef: - return { - 'transmission': time, - 'network_overhead': 1, # len(worker_list), - 'gpu_kernel_memory_latency': resource.max_num_local_replica, - 'constant': 1.0, - # possible affecting factors. - 'var_name': var.name, - 'group': var.synchronizer.group, - 'strategy': 'allreduce', - 'is_sparse': False, - # 'chunk_size': chunk_size, - 'spec': 'NCCL', # default - 'compressor': var.compressor, - 'worker_list': worker_list, - 'num_workers': num_workers, - 'size_to_transfer': size_to_transfer, - 'dtype': str(dtype), - 'min_bandwidth': min_bandwidth, - 'max_num_local_replica': resource.max_num_local_replica, - 'is_ps': False, - } - else: - return time + network_overhead * len(worker_list) \ - + gpu_kernel_memory_latency * resource.max_num_local_replica - - - - # @staticmethod - # def var_ps_time(var_name, is_sparse, local_proxy, server_list, cpu_worker_list, gpu_worker_list, - # max_num_local_replica, worker_num_replicas, network_bandwidth, get_coef, - # network_overhead=0.0, gpu_kernel_memory_latency=0.0): - # """Compute synchrinzation time of a variable in PS strategy.""" - # - # def _helper(worker_list, worker_num_replicas=None): - # if worker_num_replicas is None: - # worker_num_replicas = [1.0] * len(worker_list) - # # Compute the slowest server - # slowest_server_time = 0 - # for j, server in enumerate(server_list): - # if server.size_to_transfer == 0: - # continue - # # network transfer: sum up all workers time. equals to the time cost of this server. - # this_server_time = 0 - # for k, worker in enumerate(worker_list): - # if _resolved_devices_on_diff_machine(server.device, worker): - # if is_sparse: - # this_worker_size = get_sparse_var_bits(server.size_to_transfer) * worker_num_replicas[k] - # else: - # this_worker_size = get_dense_var_bits(server.size_to_transfer, server.dtype) - # this_server_time += this_worker_size / network_bandwidth[server.device][worker] - # slowest_server_time = max(slowest_server_time, this_server_time) - # - # if get_coef: - # return { - # 'transmission': slowest_server_time, - # 'network_overhead': len(worker_list), - # 'gpu_kernel_memory_latency': max_num_local_replica, - # 'constant': 1.0, - # # possible affecting factors. 
- # 'var_name': var_name, - # 'strategy': 'ps', - # 'local_proxy': local_proxy, - # 'is_sparse': is_sparse, - # 'server_list': [partition.to_dict() for partition in server_list], - # 'worker_list': worker_list, - # 'cpu_worker_list': cpu_worker_list, - # 'gpu_worker_list': gpu_worker_list, - # 'worker_num_replicas': worker_num_replicas, - # 'max_num_local_replica': max_num_local_replica, - # } - # else: - # return slowest_server_time + len(worker_list) * network_overhead + \ - # gpu_kernel_memory_latency * max_num_local_replica - # - # if is_sparse: - # send_time = _helper(cpu_worker_list, worker_num_replicas=worker_num_replicas) - # receive_time = _helper(gpu_worker_list) - # else: - # send_time = _helper(cpu_worker_list) - # if local_proxy: - # receive_time = _helper(cpu_worker_list) - # else: - # receive_time = _helper(gpu_worker_list) - # - # if get_coef: - # # return {key: send_time[key]+receive_time[key] for key in send_time.keys()} - # return send_time, receive_time - # else: - # return send_time, receive_time diff --git a/autodist/strategy/auto/item.py b/autodist/strategy/auto/item.py index b73d22c..94c316b 100644 --- a/autodist/strategy/auto/item.py +++ b/autodist/strategy/auto/item.py @@ -25,7 +25,7 @@ from autodist.strategy.base import byte_size_load_fn from autodist.utils import logging from autodist.cluster import SSHCluster -from autodist.simulator.utils import GPU_TO_CPU_BANDWIDTH, GIGABITS, get_dtype_bits +from autodist.autosync.simulator.utils import GPU_TO_CPU_BANDWIDTH, GIGABITS, get_dtype_bits class VarType(Enum): diff --git a/autodist/strategy/auto_strategy.py b/autodist/strategy/auto_strategy.py index 354d62d..3b215ee 100644 --- a/autodist/strategy/auto_strategy.py +++ b/autodist/strategy/auto_strategy.py @@ -15,7 +15,7 @@ """An AutoStrategy using a trained linear simulator.""" from autodist.strategy.auto.base import AutoStrategyBase -from autodist.simulator.linear_simulator import LinearSimulator +from autodist.autosync.simulator.linear_simulator import LinearSimulator class AutoStrategy(AutoStrategyBase): """ diff --git a/test.py b/test.py index b481208..7c6be13 100644 --- a/test.py +++ b/test.py @@ -1,17 +1,48 @@ -from arion.simulator.simulator import Simulator -from arion.strategy import base -from arion.graph_item import GraphItem +import tensorflow as tf +import autodist -resource_spec_file = '/home/hao.zhang/project/pycharm/ncf-trial/official/recommendation/trial/trialrun_resource_specs/resource_spec_2.yml' -strategy_path = '/home/hao.zhang/oceanus_simulator/ncf_3/strategies/20200505T174311M650364' -original_graph_item_path = '/home/hao.zhang/oceanus_simulator/ncf/strategies/original_graph_item' +with tf.Graph().as_default(), autodist.scope(): +########################################################################## -s = base.Strategy.deserialize(strategy_path) + train_dataset = tf.data.Dataset.from_tensor_slices( + (train_images, train_labels)).repeat(EPOCHS).shuffle(len(train_images)//2).batch(BATCH_SIZE) + train_iterator = tf.compat.v1.data.make_one_shot_iterator(train_dataset).get_next() -simulator = Simulator(resource_file=resource_spec_file, - original_graph_item_path=original_graph_item_path) + model = tf.keras.Sequential([ + tf.keras.layers.Conv2D(32, 3, activation='relu'), + tf.keras.layers.MaxPooling2D(), + tf.keras.layers.Flatten(), + tf.keras.layers.Dropout(0.1), + tf.keras.layers.Dense(10, activation='softmax') + ]) + loss_fn = tf.keras.losses.SparseCategoricalCrossentropy() + optimizer = tf.keras.optimizers.SGD() -ret = simulator.simulate(s) + 
def train_step(inputs): + x, y = inputs + y_hat = model(x, training=True) + loss = loss_fn(y, y_hat) + all_vars = [] + for v in model.trainable_variables: + all_vars.append(v) + grads = tf.gradients(loss, all_vars) + update = optimizer.apply_gradients(zip(grads, all_vars)) -print('finished') + return loss, update + + fetches = train_step(train_iterator) + ##################################################################### + # Change 3: Create distributed session. + # Instead of using the original TensorFlow session for graph execution, + # let's use AutoDist's distributed session, in which a computational + # graph for distributed training is constructed. + # + # [original line] + # >>> sess = tf.compat.v1.Session() + # + sess = autodist.create_distributed_session() + ##################################################################### + for _ in range(min(10, len(train_images) // BATCH_SIZE * EPOCHS)): + loss, _ = sess.run(fetches) + print(f"train_loss: {loss}") \ No newline at end of file From 0737cb59a4bb29fb61d807223ac61be9069789d9 Mon Sep 17 00:00:00 2001 From: Hao Zhang Date: Mon, 10 Aug 2020 02:25:55 -0400 Subject: [PATCH 11/11] remove search code for now --- autodist/autosync/search/__init__.py | 0 autodist/autosync/search/random_search.py | 336 ---------------------- autodist/autosync/simulator/base.py | 295 +++++++++---------- tests/test_simulator.py | 26 +- 4 files changed, 146 insertions(+), 511 deletions(-) delete mode 100644 autodist/autosync/search/__init__.py delete mode 100644 autodist/autosync/search/random_search.py diff --git a/autodist/autosync/search/__init__.py b/autodist/autosync/search/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/autodist/autosync/search/random_search.py b/autodist/autosync/search/random_search.py deleted file mode 100644 index 38fcd67..0000000 --- a/autodist/autosync/search/random_search.py +++ /dev/null @@ -1,336 +0,0 @@ -import json -import time -from multiprocessing import Process, Queue - -import copy -import numpy as np -import os - -from arion.const import DEFAULT_RANDOM_SEARCH_DIR -from arion.graph_item import GraphItem -from arion.resource_spec import ResourceSpec -from arion.strategy import RandomStrategy, AllReduce -from arion.utils import logging - - -def build_worker(queue, builder, gi, rs): - np.random.seed() - ret = builder.build(gi, rs) - queue.put(ret) - -def get_resource_specs(trial_resource_spec_dir): - resource_specs = [] - if os.path.isdir(trial_resource_spec_dir): - for file_name in os.listdir(trial_resource_spec_dir): - file_path = os.path.join(trial_resource_spec_dir, file_name) - if os.path.isfile(file_path) and file_path.endswith('.yml'): - resource_specs.append(file_path) - elif os.path.isfile(trial_resource_spec_dir): - resource_specs.append(trial_resource_spec_dir) - else: - raise ValueError("Cannot find valid files in {}".format(trial_resource_spec_dir)) - return resource_specs - - -def get_strategies(strategies_dir): - strategies = [] - if os.path.isdir(strategies_dir): - for file_name in os.listdir(strategies_dir): - file_path = os.path.join(strategies_dir, file_name) - if os.path.isfile(file_path) and file_path.split('/')[-1].startswith('2020'): - strategies.append(file_path) - elif os.path.isfile(strategies_dir): - strategies.append(strategies_dir) - else: - raise ValueError("Cannot find valid files in {}".format(strategies_dir)) - return strategies - - -class RandomSearch: - def __init__(self, - space, - heuristics, - search_params, - original_graph_item_path, - resource_file, - 
simulator=None, - trial_run_fn=None): - - self.space = space - self.heuristics = heuristics - self.search_params = search_params - - self.original_graph_item_path = original_graph_item_path - self.resource_file = resource_file - - self.simulator = simulator - self.trial_run_fn = trial_run_fn - - self._resource_spec = ResourceSpec(self.resource_file) - self._original_graph_item = GraphItem.deserialize(original_graph_item_path) - - def search(self): - # candidates, scores, features = self.propose(self.search_params['num_candidate_explore']) - candidates, scores, features = self.batch_propose(self.search_params['num_candidate_explore']) - n_pick = self.search_params['num_candidate_per_trial'] - - # cast them to be np arrays - if self.search_params['diversity_metric'] == 'embedding': - picked_candidates = self.submodular_pick_by_embedding(np.array(scores), - candidates, - np.stack(features), - n_pick, - self.search_params['simulation_weight'], - self.search_params['diversity_weight']) - elif self.search_params['diversity_metric'] == 'expression': - picked_candidates = self.submodular_pick_by_expression(np.array(scores), - candidates, - n_pick, - self.search_params['simulation_weight'], - self.search_params['diversity_weight']) - else: - raise ValueError('Unrecognized diversity metric...') - if self.trial_run_fn: - self.trial_run(picked_candidates, search_iteration=0) - - def propose(self, num_proposal, use_simulator=True): - builder = RandomStrategy(self.space, self.heuristics) - candidates = [] - features = [] - scores = [] - # np.random.seed(1) - idx = 0 - - while len(candidates) < num_proposal: - logging.info('Sampling strategy {}'.format(idx)) - start_time = time.time() - expr = builder.build(self._original_graph_item, self._resource_spec) - elapsed = time.time() - start_time - logging.info('Sampling strategy takes {}'.format(elapsed)) - builder.reset() - idx += 1 - logging.info('Progress {}/{}'.format(len(candidates), num_proposal)) - if self.simulator and use_simulator: - start_time = time.time() - score, feature = self.simulator.simulate(expr, self._resource_spec) - elapsed = time.time() - start_time - logging.info('Inference strategy takes {}'.format(elapsed)) - if score > self.search_params['rejection_score']: - logging.info('strategy {} has score {} > {}, ' - 'rejected..'.format(idx, score, self.search_params['rejection_score'])) - continue - else: - candidates.append(expr) - features.append(feature) - scores.append(score[0]) - else: - candidates.append(expr) - features.append([]) - scores.append(0) - logging.info('rejection ratio: {}'.format(1 - num_proposal / float(idx))) - return candidates, scores, features - - def batch_propose(self, num_proposal, batch_size=32, use_simulator=True): - - builders = [RandomStrategy(self.space, self.heuristics) for _ in range(batch_size)] - graph_items = [self._original_graph_item for _ in range(batch_size)] - rss = [ResourceSpec(self.resource_file) for _ in range(batch_size)] - candidates = [] - features = [] - scores = [] - # np.random.seed(1) - idx = 0 - - while len(candidates) < num_proposal: - logging.info('Sampling strategy {}'.format(idx)) - start_time = time.time() - - q = Queue() - exprs = [] - prs = [] - for obj, arg1, arg2 in zip(builders, graph_items, rss): - prs.append(Process(target=build_worker, args=(q, obj, arg1, arg2))) - prs[-1].start() - for pr in prs: - expr = q.get() # will block - exprs.append(expr) - for pr in prs: - pr.join() - - elapsed = time.time() - start_time - logging.info('Sampling strategy takes {}'.format(elapsed)) 
- for builder in builders: builder.reset() - logging.info('Progress {}/{}'.format(len(candidates), num_proposal)) - if self.simulator and use_simulator: - start_time = time.time() - batch_score, batch_feature = self.simulator.simulate(exprs, rss) - elapsed = time.time() - start_time - logging.info('Inference strategy takes {}'.format(elapsed)) - for ite, expr in enumerate(exprs): - # print(batch_score[ite], batch_feature[ite].shape) - if batch_score[ite] > self.search_params['rejection_score']: - logging.info('strategy {} has score {} > {}, ' - 'rejected..'.format(idx+ite, batch_score[ite], self.search_params['rejection_score'])) - else: - candidates.append(expr) - features.append(batch_feature[ite]) - scores.append(batch_score[ite]) - else: - for ite, expr in enumerate(exprs): - candidates.append(expr) - features.append([]) - scores.append(0) - idx += batch_size - logging.info('rejection ratio: {}'.format(1 - num_proposal / float(idx))) - return candidates[:num_proposal], scores[:num_proposal], features[:num_proposal] - - def submodular_pick_by_embedding(self, - scores, - candidates, - candidate_features, - n_pick, - beta=1.0, - alpha=1.0): - n = len(scores) - assert n == len(candidate_features) - - ret = [] - sim = np.dot(candidate_features, candidate_features.T) - remain = list(range(len(scores))) - - for _ in range(n_pick): - tmp_delta = -scores[remain] * beta - if len(ret) > 0: - tmp_delta -= alpha * (sim[remain, :][:, ret]).mean(1) - max_x = tmp_delta.argmax() - max_x = remain[max_x] - - ret.append(max_x) - remain.remove(max_x) - - return [candidates[i] for i in ret] - - def submodular_pick_by_expression(self, - scores, - candidates, - n_pick, - beta=1.0, - alpha=1.0): - - def remove_group_or_reduction_destination(strategy): - tmp_strategy = copy.deepcopy(strategy) - for node in tmp_strategy.node_config: - if node.partitioner: - for part in node.part_config: - synchronizer = getattr(part, part.WhichOneof('synchronizer')) - if hasattr(synchronizer, 'reduction_destination'): - synchronizer.reduction_destination = '' - else: - synchronizer.group = 0 - else: - synchronizer = getattr(node, node.WhichOneof('synchronizer')) - if hasattr(synchronizer, 'reduction_destination'): - synchronizer.reduction_destination = '' - else: - synchronizer.group = 0 - return tmp_strategy - - def estimate_difference(strategy, node_config_set): - score = 0 - for i, node in enumerate(strategy.node_config): - if_seen = False - for seen_node in node_config_set[i]: - if seen_node == node: - if_seen = True - break - if not if_seen: - score += 1 - return score - - assert len(scores) == len(candidates) - - node_config_set = [list() for _ in candidates[0].node_config] - remain = list(range(len(scores))) - ret = [] - for _ in range(n_pick): - max_x = -1 - max_delta = -1e9 - max_strategy_copy = None - - for x in remain: - tmp_strategy = remove_group_or_reduction_destination(candidates[x]) - diff_score = estimate_difference(tmp_strategy, node_config_set) - assert(diff_score <= len(tmp_strategy.node_config)) - # print('diff score {}..'.format(diff_score)) - tmp_delta = - scores[x] * beta + diff_score * alpha - if tmp_delta > max_delta: - max_delta, max_x, max_strategy_copy = tmp_delta, x, tmp_strategy - max_diff_score = diff_score *alpha - max_simulation_score= -scores[x] - - print('Add one candidate with max score: {}, {}, {}'.format(max_simulation_score, max_diff_score, max_delta)) - ret.append(max_x) - remain.remove(max_x) - - # update the node config set - for i, node in enumerate(max_strategy_copy.node_config): - 
if_seen = False - for seen_node in node_config_set[i]: - if seen_node == node: - if_seen = True - break - if not if_seen: - node_config_set[i].append(node) - - return [candidates[i] for i in ret] - - def trial_run(self, - candidate_strategies=None, - search_iteration=0): - # serialize all candidates to folder - target_dir = os.path.join(DEFAULT_RANDOM_SEARCH_DIR, str(search_iteration)) - os.makedirs(target_dir, exist_ok=False) - self._serialize_candidate_strategies(candidate_strategies, target_dir) - self._save_hyperparams(target_dir) - - # launch trial run - self._launch_trial_run(target_dir) - - @staticmethod - def _serialize_candidate_strategies(candidate_strategies, target_dir): - for strategy in candidate_strategies: - path = os.path.join(target_dir, strategy.id) - strategy.serialize(path) - - def _launch_trial_run(self, strategies_dir): - strategies = get_strategies(strategies_dir) - - # this will launch distributed processes and take very long - self.trial_run_fn([self.resource_file], strategies) - - def _save_hyperparams(self, target_dir): - # copy the constraint file as well - space_file = os.path.join(target_dir, 'space.json') - with open(space_file, 'w') as f: - json.dump(self.space, f) - heuristics_file = os.path.join(target_dir, 'heuristics.json') - with open(heuristics_file, 'w') as f: - json.dump(self.heuristics, f) - search_params_file = os.path.join(target_dir, 'search_params.json') - with open(search_params_file, 'w') as f: - json.dump(self.search_params, f) - - def check_if_visited(self): - raise NotImplementedError() - - def check_if_trial_run(self): - raise NotImplementedError() - - # Don't use, only for debug. - def _single_run(self): - # builder = BalancedPartitionedPS() - # builder = PartitionedAR(chunk_size=1) - builder = AllReduce() - expr = builder.build(self._original_graph_item, self._resource_spec) - logging.info(expr) - self.trial_run([expr], search_iteration=0) diff --git a/autodist/autosync/simulator/base.py b/autodist/autosync/simulator/base.py index 5ac04b7..e670cfa 100644 --- a/autodist/autosync/simulator/base.py +++ b/autodist/autosync/simulator/base.py @@ -1,152 +1,143 @@ -# Copyright 2020 Petuum. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Simulator base class.""" -import os -from collections import OrderedDict - -from autodist.graph_item import GraphItem -from autodist.kernel.partitioner import PartitionerConfig -from autodist.resource_spec import ResourceSpec -from autodist.strategy.base import Strategy -from autodist.strategy.auto.item import VariableItem, PartItem, ResourceItem - - -class SimulatorBase: - """Simulates strategies for a given graph and resource spec.""" - - def __init__(self, - graph_item=None, - resource_spec=None): - """ - Constructor for simulator base class - Args: - graph_item: a GraphItem object, or a path to a serialized GraphItem object. - resource_spec: a ResourceSpec object, or a path to a resource file. 
- """ - # check if it is a path - self._graph_item = None - if isinstance(graph_item, GraphItem): - self._graph_item = graph_item - elif isinstance(graph_item, str) and os.path.exists(graph_item): - self._graph_item = GraphItem.deserialize(graph_item) - else: - raise ValueError("Invalid graph_item: {}".format(graph_item)) - - self._resource_spec = None - if isinstance(resource_spec, ResourceSpec): - self._resource_spec = resource_spec - elif isinstance(resource_spec, str) and os.path.exists(resource_spec): - self._resource_spec = ResourceSpec(resource_spec) - else: - raise ValueError("Invalid resource_spec: {}".format(resource_spec)) - - def update_graph_item(self, graph_item): - """Change the default graph_item with this simulator.""" - if not graph_item: - raise ValueError('Empty graph item.') - self._graph_item = graph_item - - def update_resource_spec(self, resource_spec): - """Change the default resource_spec with this simulator.""" - if not resource_spec: - raise ValueError('Empty resource spec.') - self._resource_spec = resource_spec - - def simulate(self, - strategy, - graph_item=None, - resource_spec=None, - *args, - **kwargs): - """Return simulated runtime cost given (strategy, graph_item, resource_spec) tuple.""" - raise NotImplementedError() - - def inference(self, *args, **kwargs): - """ - Abstract method for simulator inference. - - Args: - features: feature input extracted from (GraphItem, ResourceSpec, Strategy) tuple. - checkpoint: optional simulator weight. - - Returns: - float - """ - raise NotImplementedError() - - def load_checkpoint(self, checkpoint): - """ - Load a checkpoint file as weights of the simulator. - - Args: - checkpoint: path to a checkpoint file. - """ - raise NotImplementedError() - - # def save_checkpoint(self, model, checkpoint): - # """ - # Save a trained weight as a checkpoint file. - # - # Args: - # model: trained model. - # checkpoint: path where to save the checkpoint. - # """ - # raise NotImplementedError() - - def preprocess(self, - strategy, - graph_item=None, - resource_spec=None): - """ - Preprocess a (strategy, graph_item, resource_spec) tuple into pre-features. - - Args: - strategy: a distribution strategy - graph_item: optional graph_item, if not provided, the default one bundled with simulator will be used. - resource_spec: optional resource_spec, if not provided, the default one bundled with simulator will be used. - - Returns: - OrderedDict(): variable/part name to variable/part items. - ResourceItem: - """ - if not graph_item: - if not self._graph_item: - raise ValueError('No graph item provided.') - else: - graph_item = self._graph_item - if not resource_spec: - if not self._resource_spec: - raise ValueError('No resource spec provided.') - else: - resource_spec = self._resource_spec - if not strategy: - raise ValueError('No strategy provided.') - - resource_item = ResourceItem(resource_spec) - name_to_var = {var.name: var for var_op, var in graph_item.trainable_var_op_to_var.items()} - - name_to_items = OrderedDict() - for node in strategy.node_config: - var_name = node.var_name - var = name_to_var[var_name] - if node.partitioner: - pc = PartitionerConfig(partition_str=node.partitioner) - for i, part in enumerate(node.part_config): - part_item = PartItem(var, graph_item, i, pc, part) - name_to_items[part_item.name] = part_item - else: - var_item = VariableItem(var, graph_item, node) - name_to_items[var_item.name] = var_item - return name_to_items, resource_item +# Copyright 2020 Petuum. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Simulator base class.""" +from collections import OrderedDict + +import os + +from autodist.graph_item import GraphItem +from autodist.kernel.partitioner import PartitionerConfig +from autodist.resource_spec import ResourceSpec +from autodist.strategy.auto.item import VariableItem, PartItem, ResourceItem + + +class SimulatorBase: + """Simulates strategies for a given graph and resource spec.""" + + def __init__(self, + graph_item=None, + resource_spec=None): + """ + Constructor for simulator base class + Args: + graph_item: a GraphItem object, or a path to a serialized GraphItem object. + resource_spec: a ResourceSpec object, or a path to a resource file. + """ + # check if it is a path + self._graph_item = None + if isinstance(graph_item, GraphItem): + self._graph_item = graph_item + elif isinstance(graph_item, str) and os.path.exists(graph_item): + self._graph_item = GraphItem.deserialize(graph_item) + else: + raise ValueError("Invalid graph_item: {}".format(graph_item)) + + self._resource_spec = None + if isinstance(resource_spec, ResourceSpec): + self._resource_spec = resource_spec + elif isinstance(resource_spec, str) and os.path.exists(resource_spec): + self._resource_spec = ResourceSpec(resource_spec) + else: + raise ValueError("Invalid resource_spec: {}".format(resource_spec)) + + def update_graph_item(self, graph_item): + """Change the default graph_item with this simulator.""" + if not graph_item: + raise ValueError('Empty graph item.') + self._graph_item = graph_item + + def update_resource_spec(self, resource_spec): + """Change the default resource_spec with this simulator.""" + if not resource_spec: + raise ValueError('Empty resource spec.') + self._resource_spec = resource_spec + + def simulate(self, + strategy, + graph_item=None, + resource_spec=None, + *args, + **kwargs): + """Return simulated runtime cost given (strategy, graph_item, resource_spec) tuple.""" + raise NotImplementedError() + + def inference(self, *args, **kwargs): + """Abstract method for simulator inference.""" + raise NotImplementedError() + + def load_checkpoint(self, checkpoint): + """ + Load a checkpoint file as weights of the simulator. + + Args: + checkpoint: path to a checkpoint file. + """ + raise NotImplementedError() + + # def save_checkpoint(self, model, checkpoint): + # """ + # Save a trained weight as a checkpoint file. + # + # Args: + # model: trained model. + # checkpoint: path where to save the checkpoint. + # """ + # raise NotImplementedError() + + def preprocess(self, + strategy, + graph_item=None, + resource_spec=None): + """ + Preprocess a (strategy, graph_item, resource_spec) tuple into pre-features. + + Args: + strategy: a distribution strategy + graph_item: optional graph_item, if not provided, the default one bundled with simulator will be used. + resource_spec: optional resource_spec, if not provided, the default one bundled with simulator will be used. + + Returns: + OrderedDict(): variable/part name to variable/part items. 
+            ResourceItem: a ResourceItem built from the resource_spec.
+        """
+        if not graph_item:
+            if not self._graph_item:
+                raise ValueError('No graph item provided.')
+            else:
+                graph_item = self._graph_item
+        if not resource_spec:
+            if not self._resource_spec:
+                raise ValueError('No resource spec provided.')
+            else:
+                resource_spec = self._resource_spec
+        if not strategy:
+            raise ValueError('No strategy provided.')
+
+        resource_item = ResourceItem(resource_spec)
+        name_to_var = {var.name: var for var_op, var in graph_item.trainable_var_op_to_var.items()}
+
+        name_to_items = OrderedDict()
+        for node in strategy.node_config:
+            var_name = node.var_name
+            var = name_to_var[var_name]
+            if node.partitioner:
+                pc = PartitionerConfig(partition_str=node.partitioner)
+                for i, part in enumerate(node.part_config):
+                    part_item = PartItem(var, graph_item, i, pc, part)
+                    name_to_items[part_item.name] = part_item
+            else:
+                var_item = VariableItem(var, graph_item, node)
+                name_to_items[var_item.name] = var_item
+        return name_to_items, resource_item
diff --git a/tests/test_simulator.py b/tests/test_simulator.py
index f2aaeb1..7b3d7ed 100644
--- a/tests/test_simulator.py
+++ b/tests/test_simulator.py
@@ -1,27 +1,6 @@
-from autodist.simulator.utils import _resolve_device_address
 from autodist.resource_spec import ResourceSpec
-from autodist.cluster import SSHCluster
-from autodist.kernel.device.resolver import DeviceResolver
-from autodist.simulator.base import SimulatorBase
 from autodist.simulator.utils import _resolve_device_address
 
-# def test_resolve_device_address():
-#     resource_spec_file = '/home/hao.zhang/project/pycharm/autodist/examples/resource_spec.yml'
-#     rs = ResourceSpec(resource_spec_file)
-#     cluster = SSHCluster(rs)
-#     resolver = DeviceResolver(cluster)
-#     return True
-
-def test_resolve():
-    resource_spec_file = '/home/hao.zhang/project/pycharm/autodist/examples/resource_spec.yml'
-    rs = ResourceSpec(resource_spec_file)
-    cluster = SSHCluster(rs)
-    resolver = DeviceResolver(cluster)
-    SimulatorBase.network_bandwidth(rs, resolver)
-    devices = [device for device, _ in rs.devices]
-
-    resolved_devices_1 = [_resolve_device_address(device, resolver) for device, _ in rs.devices]
-    devices = resolver.resolve_to_device_str(devices)
+from autodist.cluster import SSHCluster
+from autodist.kernel.device.resolver import DeviceResolver
 
-    for d1, d2 in zip(resolved_devices_1, devices):
-        assert d1 == d2
\ No newline at end of file
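
The base-class API above is intentionally thin: preprocess() turns a (strategy, graph_item, resource_spec)
triple into VariableItem/PartItem objects plus a ResourceItem, and concrete simulators implement
simulate()/inference() on top of those items. A minimal usage sketch follows; the subclass name
PredefinedSimulator, the Strategy.deserialize() helper, and all paths are assumptions for illustration
only and are not introduced by this patch:

    # Sketch only -- PredefinedSimulator, Strategy.deserialize(), and the paths
    # below are assumptions, not APIs added in these hunks.
    from autodist.strategy.base import Strategy                                       # assumed to expose deserialization
    from autodist.autosync.simulator.predefined_simulator import PredefinedSimulator  # assumed concrete subclass

    sim = PredefinedSimulator(
        graph_item='/tmp/original_graph_item',       # serialized GraphItem (placeholder path)
        resource_spec='/tmp/resource_spec.yml')      # resource spec file (placeholder path)

    strategy = Strategy.deserialize('/tmp/candidate_strategy')                        # assumed helper
    # name_to_items maps each variable/partition name to a VariableItem/PartItem;
    # resource_item wraps the ResourceSpec for feature extraction by the subclass.
    name_to_items, resource_item = sim.preprocess(strategy)
    cost = sim.simulate(strategy)                    # simulate() is implemented by the subclass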