From 245a9d8a11459c6e1a9e3c430272b5592f4b8db5 Mon Sep 17 00:00:00 2001 From: Hao Zhang Date: Wed, 15 Jul 2020 16:37:15 -0400 Subject: [PATCH 01/11] initial commit of necessary files --- autodist/search/__init__.py | 0 autodist/search/random_search.py | 336 ++++++ autodist/simulator/__init__.py | 0 autodist/simulator/models/__init__.py | 0 autodist/simulator/models/base.py | 406 +++++++ .../simulator/models/rankrnn_simulator.py | 634 ++++++++++ .../models/rankrnn_simulator_penalty.py | 729 ++++++++++++ .../models/rankrnn_simulator_penalty_fast.py | 1027 +++++++++++++++++ autodist/simulator/test.py | 17 + autodist/simulator/train_linear.py | 123 ++ .../simulator/train_predefined_simulator.py | 343 ++++++ autodist/simulator/utils.py | 342 ++++++ autodist/strategy/auto/ar_group_assigner.py | 57 + autodist/strategy/auto/auto_strategy.py | 0 autodist/strategy/auto/ps_load_balancer.py | 67 ++ autodist/strategy/auto/random_strategy.py | 443 +++++++ 16 files changed, 4524 insertions(+) create mode 100644 autodist/search/__init__.py create mode 100644 autodist/search/random_search.py create mode 100644 autodist/simulator/__init__.py create mode 100644 autodist/simulator/models/__init__.py create mode 100644 autodist/simulator/models/base.py create mode 100644 autodist/simulator/models/rankrnn_simulator.py create mode 100644 autodist/simulator/models/rankrnn_simulator_penalty.py create mode 100644 autodist/simulator/models/rankrnn_simulator_penalty_fast.py create mode 100644 autodist/simulator/test.py create mode 100644 autodist/simulator/train_linear.py create mode 100644 autodist/simulator/train_predefined_simulator.py create mode 100644 autodist/simulator/utils.py create mode 100644 autodist/strategy/auto/ar_group_assigner.py create mode 100644 autodist/strategy/auto/auto_strategy.py create mode 100644 autodist/strategy/auto/ps_load_balancer.py create mode 100644 autodist/strategy/auto/random_strategy.py diff --git a/autodist/search/__init__.py b/autodist/search/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/autodist/search/random_search.py b/autodist/search/random_search.py new file mode 100644 index 0000000..38fcd67 --- /dev/null +++ b/autodist/search/random_search.py @@ -0,0 +1,336 @@ +import json +import time +from multiprocessing import Process, Queue + +import copy +import numpy as np +import os + +from arion.const import DEFAULT_RANDOM_SEARCH_DIR +from arion.graph_item import GraphItem +from arion.resource_spec import ResourceSpec +from arion.strategy import RandomStrategy, AllReduce +from arion.utils import logging + + +def build_worker(queue, builder, gi, rs): + np.random.seed() + ret = builder.build(gi, rs) + queue.put(ret) + +def get_resource_specs(trial_resource_spec_dir): + resource_specs = [] + if os.path.isdir(trial_resource_spec_dir): + for file_name in os.listdir(trial_resource_spec_dir): + file_path = os.path.join(trial_resource_spec_dir, file_name) + if os.path.isfile(file_path) and file_path.endswith('.yml'): + resource_specs.append(file_path) + elif os.path.isfile(trial_resource_spec_dir): + resource_specs.append(trial_resource_spec_dir) + else: + raise ValueError("Cannot find valid files in {}".format(trial_resource_spec_dir)) + return resource_specs + + +def get_strategies(strategies_dir): + strategies = [] + if os.path.isdir(strategies_dir): + for file_name in os.listdir(strategies_dir): + file_path = os.path.join(strategies_dir, file_name) + if os.path.isfile(file_path) and file_path.split('/')[-1].startswith('2020'): + strategies.append(file_path) + 
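+            # Note: this filter assumes serialized strategy files are named by a
+            # timestamp-derived strategy id (see _serialize_candidate_strategies
+            # below, which uses strategy.id as the file name).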
elif os.path.isfile(strategies_dir): + strategies.append(strategies_dir) + else: + raise ValueError("Cannot find valid files in {}".format(strategies_dir)) + return strategies + + +class RandomSearch: + def __init__(self, + space, + heuristics, + search_params, + original_graph_item_path, + resource_file, + simulator=None, + trial_run_fn=None): + + self.space = space + self.heuristics = heuristics + self.search_params = search_params + + self.original_graph_item_path = original_graph_item_path + self.resource_file = resource_file + + self.simulator = simulator + self.trial_run_fn = trial_run_fn + + self._resource_spec = ResourceSpec(self.resource_file) + self._original_graph_item = GraphItem.deserialize(original_graph_item_path) + + def search(self): + # candidates, scores, features = self.propose(self.search_params['num_candidate_explore']) + candidates, scores, features = self.batch_propose(self.search_params['num_candidate_explore']) + n_pick = self.search_params['num_candidate_per_trial'] + + # cast them to be np arrays + if self.search_params['diversity_metric'] == 'embedding': + picked_candidates = self.submodular_pick_by_embedding(np.array(scores), + candidates, + np.stack(features), + n_pick, + self.search_params['simulation_weight'], + self.search_params['diversity_weight']) + elif self.search_params['diversity_metric'] == 'expression': + picked_candidates = self.submodular_pick_by_expression(np.array(scores), + candidates, + n_pick, + self.search_params['simulation_weight'], + self.search_params['diversity_weight']) + else: + raise ValueError('Unrecognized diversity metric...') + if self.trial_run_fn: + self.trial_run(picked_candidates, search_iteration=0) + + def propose(self, num_proposal, use_simulator=True): + builder = RandomStrategy(self.space, self.heuristics) + candidates = [] + features = [] + scores = [] + # np.random.seed(1) + idx = 0 + + while len(candidates) < num_proposal: + logging.info('Sampling strategy {}'.format(idx)) + start_time = time.time() + expr = builder.build(self._original_graph_item, self._resource_spec) + elapsed = time.time() - start_time + logging.info('Sampling strategy takes {}'.format(elapsed)) + builder.reset() + idx += 1 + logging.info('Progress {}/{}'.format(len(candidates), num_proposal)) + if self.simulator and use_simulator: + start_time = time.time() + score, feature = self.simulator.simulate(expr, self._resource_spec) + elapsed = time.time() - start_time + logging.info('Inference strategy takes {}'.format(elapsed)) + if score > self.search_params['rejection_score']: + logging.info('strategy {} has score {} > {}, ' + 'rejected..'.format(idx, score, self.search_params['rejection_score'])) + continue + else: + candidates.append(expr) + features.append(feature) + scores.append(score[0]) + else: + candidates.append(expr) + features.append([]) + scores.append(0) + logging.info('rejection ratio: {}'.format(1 - num_proposal / float(idx))) + return candidates, scores, features + + def batch_propose(self, num_proposal, batch_size=32, use_simulator=True): + + builders = [RandomStrategy(self.space, self.heuristics) for _ in range(batch_size)] + graph_items = [self._original_graph_item for _ in range(batch_size)] + rss = [ResourceSpec(self.resource_file) for _ in range(batch_size)] + candidates = [] + features = [] + scores = [] + # np.random.seed(1) + idx = 0 + + while len(candidates) < num_proposal: + logging.info('Sampling strategy {}'.format(idx)) + start_time = time.time() + + q = Queue() + exprs = [] + prs = [] + for obj, arg1, arg2 in 
zip(builders, graph_items, rss): + prs.append(Process(target=build_worker, args=(q, obj, arg1, arg2))) + prs[-1].start() + for pr in prs: + expr = q.get() # will block + exprs.append(expr) + for pr in prs: + pr.join() + + elapsed = time.time() - start_time + logging.info('Sampling strategy takes {}'.format(elapsed)) + for builder in builders: builder.reset() + logging.info('Progress {}/{}'.format(len(candidates), num_proposal)) + if self.simulator and use_simulator: + start_time = time.time() + batch_score, batch_feature = self.simulator.simulate(exprs, rss) + elapsed = time.time() - start_time + logging.info('Inference strategy takes {}'.format(elapsed)) + for ite, expr in enumerate(exprs): + # print(batch_score[ite], batch_feature[ite].shape) + if batch_score[ite] > self.search_params['rejection_score']: + logging.info('strategy {} has score {} > {}, ' + 'rejected..'.format(idx+ite, batch_score[ite], self.search_params['rejection_score'])) + else: + candidates.append(expr) + features.append(batch_feature[ite]) + scores.append(batch_score[ite]) + else: + for ite, expr in enumerate(exprs): + candidates.append(expr) + features.append([]) + scores.append(0) + idx += batch_size + logging.info('rejection ratio: {}'.format(1 - num_proposal / float(idx))) + return candidates[:num_proposal], scores[:num_proposal], features[:num_proposal] + + def submodular_pick_by_embedding(self, + scores, + candidates, + candidate_features, + n_pick, + beta=1.0, + alpha=1.0): + n = len(scores) + assert n == len(candidate_features) + + ret = [] + sim = np.dot(candidate_features, candidate_features.T) + remain = list(range(len(scores))) + + for _ in range(n_pick): + tmp_delta = -scores[remain] * beta + if len(ret) > 0: + tmp_delta -= alpha * (sim[remain, :][:, ret]).mean(1) + max_x = tmp_delta.argmax() + max_x = remain[max_x] + + ret.append(max_x) + remain.remove(max_x) + + return [candidates[i] for i in ret] + + def submodular_pick_by_expression(self, + scores, + candidates, + n_pick, + beta=1.0, + alpha=1.0): + + def remove_group_or_reduction_destination(strategy): + tmp_strategy = copy.deepcopy(strategy) + for node in tmp_strategy.node_config: + if node.partitioner: + for part in node.part_config: + synchronizer = getattr(part, part.WhichOneof('synchronizer')) + if hasattr(synchronizer, 'reduction_destination'): + synchronizer.reduction_destination = '' + else: + synchronizer.group = 0 + else: + synchronizer = getattr(node, node.WhichOneof('synchronizer')) + if hasattr(synchronizer, 'reduction_destination'): + synchronizer.reduction_destination = '' + else: + synchronizer.group = 0 + return tmp_strategy + + def estimate_difference(strategy, node_config_set): + score = 0 + for i, node in enumerate(strategy.node_config): + if_seen = False + for seen_node in node_config_set[i]: + if seen_node == node: + if_seen = True + break + if not if_seen: + score += 1 + return score + + assert len(scores) == len(candidates) + + node_config_set = [list() for _ in candidates[0].node_config] + remain = list(range(len(scores))) + ret = [] + for _ in range(n_pick): + max_x = -1 + max_delta = -1e9 + max_strategy_copy = None + + for x in remain: + tmp_strategy = remove_group_or_reduction_destination(candidates[x]) + diff_score = estimate_difference(tmp_strategy, node_config_set) + assert(diff_score <= len(tmp_strategy.node_config)) + # print('diff score {}..'.format(diff_score)) + tmp_delta = - scores[x] * beta + diff_score * alpha + if tmp_delta > max_delta: + max_delta, max_x, max_strategy_copy = tmp_delta, x, tmp_strategy + 
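+                    # The picked candidate maximizes
+                    #   tmp_delta = alpha * diff_score - beta * score
+                    # i.e. it trades off the simulator's predicted runtime against
+                    # diversity from the already-selected node configs; the two
+                    # terms are tracked separately below only for the progress printout.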
max_diff_score = diff_score *alpha + max_simulation_score= -scores[x] + + print('Add one candidate with max score: {}, {}, {}'.format(max_simulation_score, max_diff_score, max_delta)) + ret.append(max_x) + remain.remove(max_x) + + # update the node config set + for i, node in enumerate(max_strategy_copy.node_config): + if_seen = False + for seen_node in node_config_set[i]: + if seen_node == node: + if_seen = True + break + if not if_seen: + node_config_set[i].append(node) + + return [candidates[i] for i in ret] + + def trial_run(self, + candidate_strategies=None, + search_iteration=0): + # serialize all candidates to folder + target_dir = os.path.join(DEFAULT_RANDOM_SEARCH_DIR, str(search_iteration)) + os.makedirs(target_dir, exist_ok=False) + self._serialize_candidate_strategies(candidate_strategies, target_dir) + self._save_hyperparams(target_dir) + + # launch trial run + self._launch_trial_run(target_dir) + + @staticmethod + def _serialize_candidate_strategies(candidate_strategies, target_dir): + for strategy in candidate_strategies: + path = os.path.join(target_dir, strategy.id) + strategy.serialize(path) + + def _launch_trial_run(self, strategies_dir): + strategies = get_strategies(strategies_dir) + + # this will launch distributed processes and take very long + self.trial_run_fn([self.resource_file], strategies) + + def _save_hyperparams(self, target_dir): + # copy the constraint file as well + space_file = os.path.join(target_dir, 'space.json') + with open(space_file, 'w') as f: + json.dump(self.space, f) + heuristics_file = os.path.join(target_dir, 'heuristics.json') + with open(heuristics_file, 'w') as f: + json.dump(self.heuristics, f) + search_params_file = os.path.join(target_dir, 'search_params.json') + with open(search_params_file, 'w') as f: + json.dump(self.search_params, f) + + def check_if_visited(self): + raise NotImplementedError() + + def check_if_trial_run(self): + raise NotImplementedError() + + # Don't use, only for debug. 
+ def _single_run(self): + # builder = BalancedPartitionedPS() + # builder = PartitionedAR(chunk_size=1) + builder = AllReduce() + expr = builder.build(self._original_graph_item, self._resource_spec) + logging.info(expr) + self.trial_run([expr], search_iteration=0) diff --git a/autodist/simulator/__init__.py b/autodist/simulator/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/autodist/simulator/models/__init__.py b/autodist/simulator/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/autodist/simulator/models/base.py b/autodist/simulator/models/base.py new file mode 100644 index 0000000..a12c147 --- /dev/null +++ b/autodist/simulator/models/base.py @@ -0,0 +1,406 @@ +"""Strategy Simulator.""" +import time +from collections import defaultdict +import numpy as np + +import tensorflow as tf +from tensorflow.python.client import timeline + +from arion.simulator.utils import NUM_RUNS +from arion.cluster import SSHCluster +from arion.graph_item import GraphItem +from arion.kernel.device.resolver import DeviceResolver +from arion.kernel.partitioner import PartitionerConfig +from arion.proto.synchronizers_pb2 import AllReduceSynchronizer +from arion.resource_spec import ResourceSpec +from arion.strategy.base import Strategy +from arion.simulator.utils import _resolve_device_address, GIGABITS, _max_num_local_replica, _num_local_replica +from arion.strategy.random_sample_strategy import VariableHelper, PartHelper +from arion.simulator.utils import INFINITY + +# tf.compat.v1.disable_eager_execution() + +class Var: + def __init__(self, + name=None, + is_sparse=False, + synchronizer=None, + shape=None, + dtype=None, + device=None, + compressor=None): + self.name = name + self.is_sparse = is_sparse + self.synchronizer = synchronizer + self.shape = shape + self.dtype = dtype + self.device = device + self.compressor = compressor + self.device = device + self.is_partition = False + + self.original_shape = self.shape + + @property + def var_size(self): + size = 1 + if self.shape: + for s in self.shape: + size *= s + return size + + @property + def original_var_size(self): + size = 1 + if self.original_shape: + for s in self.original_shape: + size *= s + return size + + def size_to_transfer(self, batch_size_per_gpu=1, seq_len=1): + if not self.is_sparse: + return self.var_size + else: + if not self.shape: # scalar + return 1 + + emb_size = 1 + if len(self.shape) > 1: + for i in range(1, len(self.original_shape)): + emb_size = emb_size * self.original_shape[i] + + sparse_data_size = batch_size_per_gpu * seq_len * emb_size + + # estimate the embedding of this partition simply using a proportional formula + ret = sparse_data_size * self.var_size / self.original_var_size + return ret + +class Partition(Var): + def __init__(self, + name=None, + is_sparse=False, + synchronizer=None, + shape=None, + dtype=None, + device=None, + compressor=None, + part_id=0, + original_shape=None, + partition_str=None, + num_shards=1): + super(Partition, self).__init__(name, is_sparse, synchronizer, shape, dtype, device, compressor) + self.is_partition = True + self.part_id = part_id + self.partition_str = partition_str + self.original_shape = original_shape + self.num_shards = num_shards + +class Resource: + def __init__(self, cluster, device_resolver, graph_replicas, network_bandwidth, cpu_worker_list, + gpu_worker_list, max_num_local_replica, total_num_local_replica, worker_num_replicas): + self.cluster=cluster + self.device_resolver=device_resolver + 
self.graph_replicas=graph_replicas + self.network_bandwidth=network_bandwidth + self.cpu_worker_list=cpu_worker_list + self.gpu_worker_list=gpu_worker_list + self.max_num_local_replica=max_num_local_replica + self.total_num_local_replica=total_num_local_replica + self.worker_num_replicas=worker_num_replicas + +class SimulatorBase: + """Simulates strategies for a given graph and resource spec.""" + + def __init__(self, original_graph_item_path): + self._original_graph_item_path = original_graph_item_path + self._original_graph_item = GraphItem.deserialize(original_graph_item_path) + # self._resource_file = resource_file + # self._resource_spec = ResourceSpec(resource_file) + # self._cluster = SSHCluster(self._resource_spec) + # self._device_resolver = DeviceResolver(self._cluster) + # + # self._graph_replicas = [_resolve_device_address(k, self._device_resolver) + # for k, v in self._resource_spec.gpu_devices] + # + # # bandwidth + # self._network_bandwidth = self.network_bandwidth(self._resource_spec, self._device_resolver) + # # Other information + # self._cpu_worker_list = [_resolve_device_address(device, self._device_resolver) + # for device, _ in self._resource_spec.cpu_devices] + # self._gpu_worker_list = [_resolve_device_address(device, self._device_resolver) + # for device, _ in self._resource_spec.gpu_devices] + # self._max_num_local_replica = _max_num_local_replica(self._graph_replicas, self._cluster) + # self._total_num_local_replica = len(self._graph_replicas) + # self._worker_num_replicas = [_num_local_replica(cpu_worker, self._graph_replicas, self._cluster) + # for cpu_worker in self._cpu_worker_list] + + def simulate(self, strategy: Strategy, resource_spec: ResourceSpec, checkpoint: str): + """Return simulated runtime value by feeding features to the cost model.""" + raise NotImplementedError() + + def inference(self, inputs, checkpoint): + raise NotImplementedError() + + def load_checkpoint(self, checkpoint): + raise NotImplementedError() + + def save_checkpoint(self, model, checkpoint): + raise NotImplementedError() + + def create_features(self, strategy: Strategy, resource_spec: ResourceSpec): + raise NotImplementedError() + + def extract_pre_feature(self, strategy: Strategy, resource_spec: ResourceSpec): + resource = self.setup_resource(resource_spec) + + name2var = {var.name: var for var_op, var in self._original_graph_item.trainable_var_op_to_var.items()} + + meta = defaultdict() + for node in strategy.node_config: + var_name = node.var_name + # for var_op, var in self._original_graph_item.trainable_var_op_to_var.items(): + # if var.name == var_name: + # break + var = name2var[var_name] + var_helper = VariableHelper(var, self._original_graph_item) + + if node.partitioner: + pc = PartitionerConfig(partition_str=node.partitioner) + for i, part in enumerate(node.part_config): + part_helper = PartHelper(i, var, pc) + synchronizer = getattr(part, part.WhichOneof('synchronizer')) + compressor = getattr(synchronizer, 'compressor', None) + reduction_destination = getattr(synchronizer, 'reduction_destination', None) + device = _resolve_device_address(reduction_destination if reduction_destination else var.device, + resource.device_resolver) + + part_meta = Partition(name=part.var_name, + is_sparse=var_helper.is_sparse, + shape=part_helper.shape, + dtype=var_helper.dtype, + synchronizer=synchronizer, + part_id=i, + num_shards=pc.num_shards, + partition_str=pc.partition_str, + original_shape=var_helper.shape, + compressor=compressor, + device=device) + meta[part_meta.name] = 
part_meta + else: + synchronizer = getattr(node, node.WhichOneof('synchronizer')) + compressor = getattr(synchronizer, 'compressor', None) + reduction_destination = getattr(synchronizer, 'reduction_destination', None) + device = _resolve_device_address(reduction_destination if reduction_destination else var.device, + resource.device_resolver) + + var_meta = Var(name=var_name, + is_sparse=var_helper.is_sparse, + shape=var_helper.shape, + dtype=var_helper.dtype, + synchronizer=synchronizer, + compressor=compressor, + device=device) + meta[var_meta.name] = var_meta + return meta, resource + + def extract_pre_feature_legacy(self, strategy): + """Don't use now!!!""" + meta = defaultdict() + for node in strategy.node_config: + var_name = node.var_name + for var_op, var in self._original_graph_item.trainable_var_op_to_var.items(): + if var.name == var_name: + break + var_op_name = var_op.name + var_helper = VariableHelper(var, self._original_graph_item) + synchronizer = getattr(node, node.WhichOneof('synchronizer')) + compressor = getattr(synchronizer, 'compressor', None) + if compressor is not None: + compressor = AllReduceSynchronizer.Compressor.Name(compressor) + reduction_destinations = getattr(synchronizer, 'reduction_destinations', None) + if not reduction_destinations or len(reduction_destinations) <= 1: + # this variable is not partitioned + device = reduction_destinations[0] if reduction_destinations else var.device + var_meta = Var(name=var_name, + is_sparse=var_helper.is_sparse, + shape=var_helper.shape, + dtype=var_helper.dtype, + synchronizer=synchronizer, + compressor=compressor, + device=device) + meta[var_meta.name] = var_meta + else: + # this variable is partitioned + num_partitions = len(reduction_destinations) + partition_list = [1] * len(var_helper.shape) + partition_list[0] = num_partitions + pc = PartitionerConfig(partition_list=partition_list) + for i, device in enumerate(reduction_destinations): + part_helper = PartHelper(i, var, pc) + part_meta = Partition(name='{}/part_{}:0'.format(var_op_name, i), + is_sparse=var_helper.is_sparse, + shape=part_helper.shape, + dtype=var_helper.dtype, + synchronizer=synchronizer, + part_id=i, + partition_str=pc.partition_str, + original_shape=var_helper.shape, + compressor=compressor, + device=device) + meta[part_meta.name] = part_meta + return meta + + def setup_resource(self, resource_spec: ResourceSpec): + cluster = SSHCluster(resource_spec) + device_resolver = DeviceResolver(cluster) + graph_replicas = [_resolve_device_address(k, device_resolver) for k, v in resource_spec.gpu_devices] + # bandwidth + network_bandwidth = self.network_bandwidth(resource_spec, device_resolver) + # Other information + cpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.cpu_devices] + gpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.gpu_devices] + max_num_local_replica = _max_num_local_replica(graph_replicas, cluster) + total_num_local_replica = len(graph_replicas) + worker_num_replicas = [_num_local_replica(cpu_worker, graph_replicas, cluster) for cpu_worker in cpu_worker_list] + resource = Resource(cluster=cluster, + device_resolver=device_resolver, + graph_replicas=graph_replicas, + network_bandwidth=network_bandwidth, + cpu_worker_list=cpu_worker_list, + gpu_worker_list=gpu_worker_list, + max_num_local_replica=max_num_local_replica, + total_num_local_replica=total_num_local_replica, + worker_num_replicas=worker_num_replicas) + return resource + + 
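+    # Illustrative usage of the helpers above (a sketch only; `MySimulator` is a
+    # hypothetical subclass that implements `simulate`/`create_features`):
+    #
+    #   sim = MySimulator(original_graph_item_path='path/to/original_graph_item')
+    #   meta, resource = sim.extract_pre_feature(strategy, ResourceSpec(resource_file))
+    #   for name, v in meta.items():
+    #       # rough number of elements this variable/partition transmits per step
+    #       n_elems = v.size_to_transfer(batch_size_per_gpu=64, seq_len=1)
+    #       # combine with resource.network_bandwidth to estimate sync cost
+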
@staticmethod + def network_bandwidth(resource_spec: ResourceSpec, device_resolver: DeviceResolver): + """Calculates all P2P network bandwidths between nodes in the cluster.""" + devices = [device for device, _ in resource_spec.devices] + resolved_devices = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.devices] + gpu_cpu_bw = 10000. # hardcode for now + network_bandwidth = {} # key: + for i in range(len(devices)): + if resolved_devices[i] not in network_bandwidth: + network_bandwidth[resolved_devices[i]] = {} + for j in range(i, len(devices)): + if resolved_devices[j] not in network_bandwidth: + network_bandwidth[resolved_devices[j]] = {} + ip_i = devices[i].split(':')[0] + ip_j = devices[j].split(':')[0] + if ip_i != ip_j: + network_bandwidth[resolved_devices[i]][resolved_devices[j]] \ + = GIGABITS * resource_spec.network_bandwidth[ip_i] + network_bandwidth[resolved_devices[j]][resolved_devices[i]] \ + = GIGABITS * resource_spec.network_bandwidth[ip_j] + else: + network_bandwidth[resolved_devices[i]][resolved_devices[j]] = GIGABITS * gpu_cpu_bw + network_bandwidth[resolved_devices[j]][resolved_devices[i]] = GIGABITS * gpu_cpu_bw + + return network_bandwidth + + @staticmethod + def min_bandwitdh(worker_list, bandwidth): + min_bandwidth = INFINITY + num_workers = len(worker_list) + for i in range(num_workers): + for j in range(i, num_workers): + min_bandwidth = min(min_bandwidth, bandwidth[worker_list[j]][worker_list[i]]) + + @property + def original_graph_item_path(self): + return self._original_graph_item_path + + # @property + # def resource_file(self): + # return self._resource_file + + @staticmethod + def calculate_op_timings(fetches): + # Simple implementation. Calculate averaged run time of certain steps. + init_op = tf.compat.v1.initialize_all_variables() + outside_times = [] + + with tf.compat.v1.Session() as sess: + sess.run(init_op) + for i in range(NUM_RUNS): + start = time.time() + sess.run(fetches) + end = time.time() + outside_times.append(end - start) + comp_time_in_sec = np.mean(np.array(outside_times[1:])) + return comp_time_in_sec + + @staticmethod + def profile_on_single_machine(fetches): + # calculate computation time of every op + init_op = tf.compat.v1.initialize_all_variables() + op_name2runtime = defaultdict(list) + outside_times = [] + all_times = [] + + options = tf.compat.v1.RunOptions(trace_level=tf.compat.v1.RunOptions.FULL_TRACE) + run_metadata = tf.compat.v1.RunMetadata() + with tf.compat.v1.Session() as sess: + sess.run(init_op) + for i in range(NUM_RUNS): + start = time.time() * 1000 + sess.run(fetches) + end = time.time() * 1000 + outside_times.append(end - start) + + sess.run(fetches, options=options, run_metadata=run_metadata) + + fetched_timeline = timeline.Timeline(run_metadata.step_stats) + chrome_trace = fetched_timeline.generate_chrome_trace_format() # necessary + for event in fetched_timeline._chrome_trace._events: + # print('\n') + # print(list(event.keys())) + # for key in list(event.keys()): + # print(key, event[key]) + if 'dur' in event: + op_name2runtime[event['args']['name']].append(event['dur']) + # todo: to be more accurate, add tid (thread/lanes id) + + mean_outside_time = np.mean(np.array(outside_times[1:])) + print('mean outside_times: ', mean_outside_time) + print(outside_times) + # print('average all_times: ', np.mean(np.array(all_times))) + + op_name2meanruntime = {} + for op_name, runtimes in op_name2runtime.items(): + runtimes = np.array(runtimes) + if len(runtimes) > 1: # Do not compute operations 
that only run once for all steps. + mean = np.mean(np.array(runtimes[1:])) + op_name2meanruntime[op_name] = mean + print(op_name, mean) + # print(op_name2runtime[op_name]) + + total_op_time = sum([mean_runtime for op_name, mean_runtime in op_name2meanruntime.items()]) + print('total_op_time', total_op_time / 1000.) + # total_op_time = [sum([runtime[i] for op_name, runtime in op_name2runtime.items()]) + # for i in range(self.num_runs)] + # print('total_op_time', np.mean(np.array(total_op_time)), total_op_time) + + return mean_outside_time + + # @staticmethod + # def _calculate_op_timings(graph_item: GraphItem): + # """ + # Given a graph, calculates an expected running time for each (op, input_size) pair. + # + # Args: + # graph_item (GraphItem): The input graph. + # + # Returns: + # Dict mapping (op, input_size) to time. + # """ + # all_ops = {} + # for op in graph_item.graph.get_operations(): + # input_shapes = tuple((tuple(inp.shape.dims) for inp in op.inputs)) + # op_type = op.type + # all_ops[(op_type, input_shapes)] = ops.Graph() + # + # for ((op, shape), graph) in all_ops.items(): + # with graph.as_default(): + # getattr(tensorflow.raw_ops, op) diff --git a/autodist/simulator/models/rankrnn_simulator.py b/autodist/simulator/models/rankrnn_simulator.py new file mode 100644 index 0000000..4459515 --- /dev/null +++ b/autodist/simulator/models/rankrnn_simulator.py @@ -0,0 +1,634 @@ +"""Strategy RankNetSimulator.""" +import glob +import json +import sys +from datetime import datetime +from pathlib import Path +from string import digits + +import numpy as np +import os +import tensorflow as tf +tf.compat.v1.disable_eager_execution() + +import arion +from arion.graph_item import GraphItem +from arion.proto.synchronizers_pb2 import PSSynchronizer, AllReduceSynchronizer +from arion.simulator.models.base import SimulatorBase +from arion.simulator.utils import get_dense_var_bits, get_sparse_var_bits, GIGABITS +from arion.simulator.utils import _resolve_device_address, _max_num_local_replica, _num_local_replica +from arion.strategy.random_sample_strategy import VariableHelper, PartHelper +from arion.strategy.base import Strategy +from arion.resource_spec import ResourceSpec +from arion.cluster import SSHCluster +from arion.kernel.device.resolver import DeviceResolver +from arion.kernel.partitioner import PartitionerConfig +from arion.simulator.models.predefined_simulator import PredefinedSimulator + +import torch +import torch.nn as nn + +TORCH_DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + +# feature settings +MAX_NUM_WORKERS = 16 +MAX_NUM_GROUPS = 600 +MAX_NUM_VARS = 500 +MAX_NUM_PARS = 1500 + +# model size +FEATURE_SIZE = MAX_NUM_WORKERS+MAX_NUM_GROUPS+15 +PARTITION_MLP_HIDDEN = 128 +PARTITION_MLP_OUT = 32 +STEM_RNN_HIDDEN = 128 +BIDIECTIONAL = True +NUM_RNN_LAYERS = 3 + +# trainer setting +BATCH_SIZE = 64 +LR = 3e-4 +WD = 3e-4 + +GRAPH_ITEM_PATHS = {'ncf':'/users/hzhang2/projects/pycharm/zhijie/5-5-2020/original_graph_item', + 'densenet121': '/users/hzhang2/projects/pycharm/zhijie/graph_items/densenet121_original_graph_item', + 'inceptionv3': '/users/hzhang2/projects/pycharm/zhijie/graph_items/inceptionv3_original_graph_item', + 'resnet101': '/users/hzhang2/projects/pycharm/zhijie/graph_items/resnet101_original_graph_item', + 'resnet50': '/users/hzhang2/projects/pycharm/zhijie/graph_items/resnet50_original_graph_item', + 'vgg16': '/users/hzhang2/projects/pycharm/zhijie/graph_items/vgg16_original_graph_item', + 'bert_12l': 
'/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_12l', + 'bert_6l': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_6l', + 'bert_3l': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_3l', + 'bert_large': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_large'} + +def get_model(path_): + if 'densenet121' in path_: + return 'densenet121' + elif 'ncf' in path_: + return 'ncf' + elif 'inceptionv3' in path_: + return 'inceptionv3' + elif 'resnet101' in path_: + return 'resnet101' + elif 'resnet50' in path_: + return 'resnet50' + elif 'vgg16' in path_: + return 'vgg16' + elif 'bert' in path_ and '12l' in path_: + return 'bert_12l' + elif 'bert' in path_ and '6l' in path_: + return 'bert_6l' + elif 'bert' in path_ and '3l' in path_: + return 'bert_3l' + elif 'bert' in path_ and 'large' in path_: + return 'bert_large' + else: + return None + +class RankRNN(nn.Module): + def __init__(self, input_size=FEATURE_SIZE, + partition_mlp_hidden=PARTITION_MLP_HIDDEN, + partition_mlp_out=PARTITION_MLP_OUT, + stem_rnn_hidden=STEM_RNN_HIDDEN, + num_rnn_layers=NUM_RNN_LAYERS, + bidirectional=BIDIECTIONAL): + super(RankRNN, self).__init__() + self.partition_mlp_out = partition_mlp_out + # self.num_rnn_layers = num_rnn_layers + self.stem_rnn_hidden = stem_rnn_hidden + self.partition_mlp = nn.Sequential(nn.Linear(input_size, partition_mlp_hidden), + nn.ReLU(), + # nn.Linear(partition_mlp_hidden, partition_mlp_hidden), + # nn.ReLU(), + nn.Linear(partition_mlp_hidden, partition_mlp_out), + ) + + self.stem_rnn = nn.LSTM(partition_mlp_out, stem_rnn_hidden, num_rnn_layers, batch_first=True, bidirectional=bidirectional) + self.final_fc = nn.Linear(stem_rnn_hidden*num_rnn_layers*(1+int(bidirectional)), 1) + + self.relu = nn.ReLU() + + def forward(self, features, par_indices, var_nums): + + x = features.float() + # x = torch.cat([features[:, :, :MAX_NUM_WORKERS], features[:, :, MAX_NUM_WORKERS+MAX_NUM_GROUPS:]], 2).float() + x = self.partition_mlp(x) + + x1 = torch.zeros(features.shape[0], MAX_NUM_VARS, self.partition_mlp_out, device=TORCH_DEVICE, dtype=x.dtype) + x1.scatter_add_(1, par_indices.long()[:, :, None].expand(par_indices.shape[0], par_indices.shape[1], self.partition_mlp_out), x) + + # Set initial hidden and cell states + # h0 = torch.zeros(self.num_rnn_layers, x.size(0), self.stem_rnn_hidden).to(TORCH_DEVICE) + # c0 = torch.zeros(self.num_rnn_layers, x.size(0), self.stem_rnn_hidden).to(TORCH_DEVICE) + + # Forward propagate LSTM + x1 = torch.nn.utils.rnn.pack_padded_sequence(x1, var_nums.long(), batch_first=True, enforce_sorted=False) + out, (ht, ct) = self.stem_rnn(x1) # out: tensor of shape (batch_size, seq_length, hidden_size) + + # out = torch.nn.utils.rnn.pad_packed_sequence(out, batch_first=True)[0].sum(1) / var_nums[:, None] + out = ht.permute(1, 0, 2).reshape(x.shape[0], -1) + # print(out[0, var_nums[0] -1, [3]], out[0, var_nums[0], [3]]) + # print(ht.permute(1, 0, 2).shape, x.shape) + out = self.final_fc(out) + return out + +class TrainTensorDataset(torch.utils.data.Dataset): + """TensorDataset with support of transforms. 
+ """ + def __init__(self, tensors): + assert all(tensors[0].size(0) == tensor.size(0) for tensor in tensors) + self.tensors = tensors + + def __getitem__(self, index): + x = self.tensors[0][index] + x = self.perturbe_device_and_group(x) + x1 = self.tensors[1][index] + x2 = self.tensors[2][index] + + y = self.tensors[3][index] + + return x, x1, x2, y + + def __len__(self): + return self.tensors[0].size(0) + + def perturbe_device_and_group(self, x): + # perturbed_device_ids = np.random.permutation(MAX_NUM_WORKERS).astype(np.int32) + # perturbed_group_ids = np.random.permutation(MAX_NUM_GROUPS).astype(np.int32) + # mat_device = torch.eye(MAX_NUM_WORKERS, device=x.device, dtype=x.dtype)[perturbed_device_ids] + # mat_group = torch.eye(MAX_NUM_GROUPS, device=x.device, dtype=x.dtype)[perturbed_group_ids] + # x = torch.cat([torch.matmul(x[:, :MAX_NUM_WORKERS], mat_device), torch.matmul(x[:, MAX_NUM_WORKERS:MAX_NUM_WORKERS+MAX_NUM_GROUPS], mat_group), x[:, MAX_NUM_WORKERS+MAX_NUM_GROUPS:]], 1) + return x + + +def to_numpy(synchronizer, device, size_ratio, is_sparse, bd, num_replicas): + ret = [np.zeros(MAX_NUM_WORKERS), np.zeros(MAX_NUM_GROUPS), np.zeros(3), np.zeros(5), np.zeros(3)] + + if device is not None: + ret[0][device] = 1 + + group = getattr(synchronizer, 'group', None) + if group is not None: + assert group < MAX_NUM_GROUPS, group + ret[1][group] = 1 + + compressor = getattr(synchronizer, 'compressor', None) + if compressor is not None: + if compressor in ["PowerSGDCompressor", 3]: + ret[2][2] = 1 + elif compressor in ["HorovodCompressorEF", "HorovodCompressor", 2, 1]: + ret[2][1] = 1 + elif compressor in ["NoneCompressor", 0]: + ret[2][0] = 1 + else: + raise ValueError('Compressor does not exist: {}'.format(compressor)) + + local_replication = getattr(synchronizer, 'local_replication', None) + if isinstance(synchronizer, PSSynchronizer): + synchronizer = 0 + if int(local_replication) == 0: + if int(is_sparse) == 0: + ret[3][0] = 1 + else: + ret[3][1] = 1 + else: + if int(is_sparse) == 0: + ret[3][2] = 1 + else: + ret[3][3] = 1 + else: + ret[3][4] = 1 + ret[4] = np.array([size_ratio, bd, num_replicas]) + + return np.concatenate(ret) + +def connvert_feature(strategy, resource_spec, graph_item): + + cluster = SSHCluster(resource_spec) + device_resolver = DeviceResolver(cluster) + graph_replicas = [_resolve_device_address(k, device_resolver) for k, v in resource_spec.gpu_devices] + # bandwidth + network_bandwidth = np.array([resource_spec.network_bandwidth[device.split(':')[0]] for device, _ in resource_spec.cpu_devices]) + network_bandwidth = network_bandwidth + min_network_bandwidth = network_bandwidth.min() + # Other information + cpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.cpu_devices] + gpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.gpu_devices] + max_num_local_replica = _max_num_local_replica(graph_replicas, cluster) + total_num_local_replica = len(graph_replicas) + worker_num_replicas = [_num_local_replica(cpu_worker, graph_replicas, cluster) for cpu_worker in cpu_worker_list] + + num_vars = 0 + total_size_vars = 0 + for var_op, var in graph_item.trainable_var_op_to_var.items(): + num_vars += 1 + if var.initial_value.shape.ndims: + var_helper = VariableHelper(var, graph_item) + if var_helper.is_sparse: + total_size_vars += get_sparse_var_bits(np.prod(var_helper.shape)) + else: + total_size_vars += get_dense_var_bits(np.prod(var_helper.shape), var.dtype) + assert num_vars < 
MAX_NUM_VARS, num_vars + var_partition_features = np.zeros((MAX_NUM_PARS, FEATURE_SIZE-4)).astype(np.float32) + partition_indice = np.ones(MAX_NUM_PARS).astype(np.float32) * (MAX_NUM_VARS - 1) + + cnt = 0 + for node_id, node in enumerate(strategy.node_config): + var_name = node.var_name + for var_op, var in graph_item.trainable_var_op_to_var.items(): + if var.name == var_name: + break + var_helper = VariableHelper(var, graph_item) + + if node.partitioner: + pc = PartitionerConfig(partition_str=node.partitioner) + for i, part in enumerate(node.part_config): + part_helper = PartHelper(i, var, pc) + synchronizer = getattr(part, part.WhichOneof('synchronizer')) + reduction_destination = getattr(synchronizer, 'reduction_destination', None) + device = _resolve_device_address(reduction_destination if reduction_destination else var.device, + device_resolver) + if device == '': + assert(isinstance(synchronizer, AllReduceSynchronizer)) + device = None + bd = min_network_bandwidth + num_replicas = 0 + else: + device = cpu_worker_list.index(device) + bd = network_bandwidth[device] + num_replicas = worker_num_replicas[device] + + if var_helper.is_sparse: + size_ratio = get_sparse_var_bits(np.prod(part_helper.shape))/total_size_vars + else: + size_ratio = get_dense_var_bits(np.prod(part_helper.shape), var_helper.dtype)/total_size_vars + var_partition_features[cnt] = to_numpy(synchronizer, device, size_ratio, var_helper.is_sparse, bd, num_replicas) + partition_indice[cnt] = node_id + cnt += 1 + else: + synchronizer = getattr(node, node.WhichOneof('synchronizer')) + reduction_destination = getattr(synchronizer, 'reduction_destination', None) + device = _resolve_device_address(reduction_destination if reduction_destination else var.device, + device_resolver) + if device == '': + assert(isinstance(synchronizer, AllReduceSynchronizer)) + device = None + bd = min_network_bandwidth + num_replicas = 0 + else: + device = cpu_worker_list.index(device) + bd = network_bandwidth[device] + num_replicas = worker_num_replicas[device] + + if var_helper.is_sparse: + size_ratio = get_sparse_var_bits(np.prod(var_helper.shape))/total_size_vars + else: + size_ratio = get_dense_var_bits(np.prod(var_helper.shape), var_helper.dtype)/total_size_vars + var_partition_features[cnt] = to_numpy(synchronizer, device, size_ratio, var_helper.is_sparse, bd, num_replicas) + partition_indice[cnt] = node_id + cnt += 1 + return var_partition_features, partition_indice, np.array(node_id+1) + +def create_predefined_features(strategy, resource_spec, predefined_simulator): + + var_sync_time, vars, resource = predefined_simulator.predefined_sync_time(strategy, resource_spec) + + features = [] + for var_name, sync_time in var_sync_time.items(): + if isinstance(sync_time, list) or isinstance(sync_time, tuple): # send_time, receive_time in PS strategies. 
+ transmission = sync_time[0]['transmission'] + sync_time[1]['transmission'] + sync_time = sync_time[0] + is_ps = True + else: # AR + transmission = sync_time['transmission'] + is_ps = False + + network_overhead = sync_time['network_overhead'] + gpu_kernel_memory_latency = sync_time['gpu_kernel_memory_latency'] + + feat = [transmission, network_overhead, gpu_kernel_memory_latency, float(is_ps)] + features.append(feat) + features = np.array(features, dtype=np.float) + return features + +class RankRNNSimulator(SimulatorBase): + """Simulates strategies for a given graph and resource spec.""" + + def __init__(self, + original_graph_item_path, + fetches=None, + batch_size=1, + seq_len=1, + checkpoint=None): + + super(RankRNNSimulator, self).__init__(original_graph_item_path=original_graph_item_path) + print("It's using RankNet simulator.") + self._fetches = fetches + self._batch_size_per_gpu = batch_size + self._seq_len = seq_len + self._checkpoint = checkpoint + self._predefined_simulator=PredefinedSimulator(original_graph_item_path=original_graph_item_path, + batch_size=self._batch_size_per_gpu, + seq_len=self._seq_len) + if self._checkpoint: + self._model = RankRNN().to(TORCH_DEVICE) + self._model.load_state_dict(torch.load(self._checkpoint, map_location=torch.device('cpu'))) + + def simulate(self, strategy, resource_spec, strategy_path=None, checkpoint=None): + cost = self.predict(strategy, resource_spec, strategy_path, checkpoint) + return cost + + def predict(self, + strategy, + resource_spec, + strategy_path=None, + checkpoint=None): + if checkpoint is None: + if self._checkpoint is None: + raise ValueError("checkpoint is None: {}".format(checkpoint)) + else: + model = self._model + else: + model = RankRNN().to(TORCH_DEVICE) + model.load_state_dict(torch.load(checkpoint)) + if strategy_path and os.path.isfile((strategy_path+'.npz').replace('strategies', 'npz')): + loaded = np.load((strategy_path+'.npz').replace('strategies', 'npz')) + var_partition_features, partition_indice, var_num, _ = \ + loaded['x1'], loaded['x2'], loaded['x3'], loaded['y'] + else: + var_partition_features, partition_indice, var_num = \ + connvert_feature(strategy, resource_spec, self._original_graph_item) + + if strategy_path and os.path.isfile((strategy_path+'_pdf.npz').replace('strategies', 'npz')): + loaded = np.load((strategy_path+'_pdf.npz').replace('strategies', 'npz')) + predefined_features = loaded['x4'] + else: + predefined_features = create_predefined_features(strategy, resource_spec, self._predefined_simulator) + + var_partition_features = np.concatenate([var_partition_features, np.concatenate([predefined_features, np.zeros((MAX_NUM_PARS-predefined_features.shape[0], predefined_features.shape[1]))], 0)], 1) + + var_partition_features = torch.from_numpy(var_partition_features).unsqueeze(0).to(TORCH_DEVICE) + partition_indice = torch.from_numpy(partition_indice).unsqueeze(0).to(TORCH_DEVICE) + var_num = torch.from_numpy(var_num).unsqueeze(0).to(TORCH_DEVICE) + + return model(var_partition_features, partition_indice, var_num).view(-1).data.cpu().numpy() + +class RankNetTrainer(): + + def __init__(self, + checkpoint=None, + batch_size_per_gpu=256, + seq_len=1, + seed=1): + self._batch_size_per_gpu = batch_size_per_gpu + self._seq_len = seq_len + self.graph_items = {k:GraphItem.deserialize(v) for k, v in GRAPH_ITEM_PATHS.items()} + self.predefined_simulators = {k: PredefinedSimulator(original_graph_item_path=v, + batch_size=self._batch_size_per_gpu, + seq_len=self._seq_len) for k, v in 
GRAPH_ITEM_PATHS.items()} + self.model = RankRNN().to(TORCH_DEVICE) + if checkpoint: + self.model.load_state_dict(torch.load(checkpoint)) + self.optimizer = torch.optim.Adam(self.model.parameters(), lr=LR, weight_decay=WD) + print("It's using RankNet trainer.") + + def train(self, path_list, train_patterns=[('ncf', 0)], valid_patterns='same', num_epochs=200): + + features = {k: [[[], [], [], []], [[], [], [], []]] for k, _ in GRAPH_ITEM_PATHS.items()} + for training_path in path_list: + for path in Path(training_path).rglob('strategies'): + strategy_paths = glob.glob(os.path.join(path, '*')) + for strategy_path in strategy_paths: + if 'json' in strategy_path or \ + 'bert_large_batch_8_orca_16_group_2/' in strategy_path: + continue + model = get_model(strategy_path) + if model is None: + if not ('densenets169' in strategy_path or 'densenets201' in strategy_path): + assert False, strategy_path + continue + rs_path = strategy_path.replace('strategies', 'resource_specs') + runtime_path = strategy_path.replace('strategies', 'runtimes') + npz_path = (strategy_path+'.npz').replace('strategies', 'npz') + if not os.path.isfile(rs_path): + rs_path += '.yml' + if not (os.path.isfile(rs_path) and os.path.isfile(runtime_path)): + continue + if not os.path.exists(os.path.dirname(npz_path)): + os.makedirs(os.path.dirname(npz_path)) + + if not os.path.isfile(npz_path): + strategy = Strategy.deserialize(path=strategy_path) + rs = ResourceSpec(resource_file=rs_path) + var_partition_features, partition_indice, var_num = \ + connvert_feature(strategy, rs, self.graph_items[model]) + label = np.array(json.load(open(runtime_path))['average']) + np.savez_compressed(npz_path, x1=var_partition_features, x2=partition_indice, x3=var_num, y=label) + else: + loaded = np.load(npz_path) + var_partition_features, partition_indice, var_num, label = \ + loaded['x1'], loaded['x2'], loaded['x3'], loaded['y'] + + if not os.path.isfile(npz_path.replace('.npz', '_pdf.npz')): + predefined_features = create_predefined_features(Strategy.deserialize(path=strategy_path), ResourceSpec(resource_file=rs_path), self.predefined_simulators[model]) + np.savez_compressed(npz_path.replace('.npz', '_pdf.npz'), x4=predefined_features) + else: + loaded = np.load(npz_path.replace('.npz', '_pdf.npz')) + predefined_features = loaded['x4'] + var_partition_features = np.concatenate([var_partition_features, np.concatenate([predefined_features, np.zeros((MAX_NUM_PARS-predefined_features.shape[0], predefined_features.shape[1]))], 0)], 1) + + is_aws = int('g3' in strategy_path or 'g4' in strategy_path or 'aws' in strategy_path or 'vgg_random_orca_11' in strategy_path) # comment here + print(model, 'orca' if is_aws == 0 else 'aws', strategy_path.split('/')[-3]) + features[model][is_aws][0].append(var_partition_features) + features[model][is_aws][1].append(partition_indice) + features[model][is_aws][2].append(var_num) + features[model][is_aws][3].append(label) + + for k, _ in GRAPH_ITEM_PATHS.items(): + for i1 in range(2): + for i2 in range(4): + if len(features[k][i1][i2]) > 1: + features[k][i1][i2] = np.stack(features[k][i1][i2]).astype(np.float16) + print(k, 'orca' if i1 == 0 else 'aws', features[k][i1][i2].shape) + else: + features[k][i1][i2] = None + + train_features = np.concatenate([features[model_][is_aws_][0] for model_, is_aws_ in train_patterns if features[model_][is_aws_][0] is not None], 0) + train_par_indices = np.concatenate([features[model_][is_aws_][1] for model_, is_aws_ in train_patterns if features[model_][is_aws_][1] is not None], 0) + 
train_var_nums = np.concatenate([features[model_][is_aws_][2] for model_, is_aws_ in train_patterns if features[model_][is_aws_][2] is not None], 0) + train_labels = np.concatenate([features[model_][is_aws_][3] for model_, is_aws_ in train_patterns if features[model_][is_aws_][3] is not None], 0) + + if type(valid_patterns) == str and valid_patterns == 'same': + permt = np.random.permutation(train_features.shape[0]) + split = int(len(permt) * 0.8) + val_features, val_par_indices, val_var_nums, val_labels = train_features[permt[split:]], train_par_indices[permt[split:]], train_var_nums[permt[split:]], train_labels[permt[split:]] + train_features, train_par_indices, train_var_nums, train_labels = train_features[permt[:split]], train_par_indices[permt[:split]], train_var_nums[permt[:split]], train_labels[permt[:split]] + else: + val_features = np.concatenate([features[model_][is_aws_][0] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][0] is not None], 0) + val_par_indices = np.concatenate([features[model_][is_aws_][1] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][1] is not None], 0) + val_var_nums = np.concatenate([features[model_][is_aws_][2] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][2] is not None], 0) + val_labels = np.concatenate([features[model_][is_aws_][3] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][3] is not None], 0) + + # comment here + permt = np.random.permutation(val_features.shape[0]) + split = int(len(permt) * 0.7) + train_features, train_par_indices, train_var_nums, train_labels = np.concatenate([train_features, val_features[permt[:split]]], 0), np.concatenate([train_par_indices, val_par_indices[permt[:split]]], 0), np.concatenate([train_var_nums, val_var_nums[permt[:split]]], 0), np.concatenate([train_labels, val_labels[permt[:split]]], 0) + + val_features, val_par_indices, val_var_nums, val_labels = val_features[permt[split:]], val_par_indices[permt[split:]], val_var_nums[permt[split:]], val_labels[permt[split:]] + + print(train_features.shape, val_features.shape, train_features.max(), train_features.min(), val_features.max(), val_features.min()) + + ## train the model + trainset = TrainTensorDataset((torch.from_numpy(train_features).half().to(TORCH_DEVICE), torch.from_numpy(train_par_indices).half().to(TORCH_DEVICE), torch.from_numpy(train_var_nums).half().to(TORCH_DEVICE), torch.from_numpy(train_labels).half().to(TORCH_DEVICE))) + testset = torch.utils.data.TensorDataset(torch.from_numpy(val_features).half().to(TORCH_DEVICE), torch.from_numpy(val_par_indices).half().to(TORCH_DEVICE), torch.from_numpy(val_var_nums).half().to(TORCH_DEVICE), torch.from_numpy(val_labels).half().to(TORCH_DEVICE)) + trainloader = torch.utils.data.DataLoader(dataset=trainset, + batch_size=BATCH_SIZE, + shuffle=True) + testloader = torch.utils.data.DataLoader(dataset=testset, + batch_size=32, + shuffle=False) + best_val_acc = 0. + checkpoint_path = 'model_train_on_{}-{}_new.ckpt'.format(train_patterns[0][0], 'orca' if train_patterns[0][1] == 0 else 'aws') + for epoch in range(num_epochs): + if epoch == int(num_epochs*2./5. - 1): + for param_group in self.optimizer.param_groups: param_group['lr'] = 3e-4 + if epoch == int(num_epochs*4./5. 
- 1): + for param_group in self.optimizer.param_groups: param_group['lr'] = 1e-4 + + labels = [] + outputs = [] + for i, (features_b, par_indices_b, var_nums_b, labels_b) in enumerate(trainloader): + + # Forward pass + outputs_b = self.model(features_b, par_indices_b, var_nums_b).squeeze() + + true_comp = (labels_b[:, None] > labels_b[None, :]).float() * 2 - 1 + pred_comp = outputs_b[:, None] - outputs_b[None, :] + loss = (1 - true_comp) * pred_comp / 2 + torch.nn.functional.softplus(-pred_comp) + loss = loss.tril(-1).mean() + + # Backward and optimize + self.optimizer.zero_grad() + loss.backward() + torch.nn.utils.clip_grad_norm_(self.model.stem_rnn.parameters(), 0.25) + self.optimizer.step() + + outputs.append(outputs_b) + labels.append(labels_b) + + labels = torch.cat(labels) + outputs = torch.cat(outputs) + true_comp = (labels[:, None] > labels[None, :]) + pred_comp = (outputs[:, None] > outputs[None, :]) + equal = (true_comp == pred_comp).int() + train_acc = equal.tril(-1).sum() * 2. /float(equal.shape[0])/(float(equal.shape[0]) - 1) + + with torch.no_grad(): + labels = [] + outputs = [] + for features_b, par_indices_b, var_nums_b, labels_b in testloader: + + # Forward pass + outputs_b = self.model(features_b, par_indices_b, var_nums_b).squeeze() + outputs.append(outputs_b) + labels.append(labels_b) + + labels = torch.cat(labels) + outputs = torch.cat(outputs) + true_comp = (labels[:, None] > labels[None, :]) + pred_comp = (outputs[:, None] > outputs[None, :]) + equal = (true_comp == pred_comp).int() + acc = equal.tril(-1).sum() * 2. /float(equal.shape[0])/(float(equal.shape[0]) - 1) + if acc.item() > best_val_acc: + best_val_acc = acc.item() + torch.save(self.model.state_dict(), checkpoint_path) + print('Saved model to {}'.format(checkpoint_path)) + print('Epoch: {}, training acc: {:.4f}, test acc: {:.4f}, best acc: {:.4f}'.format(epoch, train_acc.item(), acc.item(), best_val_acc)) + return checkpoint_path + + +if __name__ == '__main__': + + trainer = RankNetTrainer() + checkpoint_path = trainer.train( + [ + # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-11-20', + # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-9-20', + # '/users/hzhang2/oceanus_cost_model_training_data/ncf', + # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-13-20', + # '/users/hzhang2/oceanus_cost_model_training_data/densenet', + # '/users/hzhang2/oceanus_cost_model_training_data/inceptionv3', + # '/users/hzhang2/oceanus_cost_model_training_data/resnet101', + # '/users/hzhang2/oceanus_cost_model_training_data/resnet50', + '/users/hzhang2/oceanus_cost_model_training_data/vgg16', + # '/users/hzhang2/oceanus_cost_model_training_data/bert', + ], + [ + # ('ncf', 0), #('ncf', 1), + # ('densenet121', 0), ('densenet121', 1), + # ('inceptionv3', 0), ('inceptionv3', 1), + # ('resnet101', 0), ('resnet101', 1), + # ('resnet50', 0), ('resnet50', 1), + # ('bert_12l', 0), ('bert_12l', 1), + # ('bert_6l', 0), ('bert_6l', 1), + # ('bert_3l', 0), ('bert_3l', 1), + # ('bert_large', 0), ('bert_large', 1), + ('vgg16', 0), #('vgg16', 1), + ], + [('vgg16', 1)], + num_epochs=200) + # checkpoint_path = 'model_train_on_vgg16-orca.ckpt' + test_list = [ + '/users/hzhang2/oceanus_cost_model_training_data/vgg16/vgg16_orca_15', + '/users/hzhang2/oceanus_cost_model_training_data/vgg16/vgg_random_orca_11', #TARGET: 0.9 + # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-13-20/ncf_large_adam_random_search_aws_4', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_12_orca_16', + # 
'/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_3', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_4', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_5', + ] + + for data_folder in test_list: + simulator = RankRNNSimulator(GRAPH_ITEM_PATHS[get_model(data_folder)], + batch_size=256, + seq_len=1, + checkpoint=checkpoint_path) + + runtimes_folder = os.path.join(data_folder, 'runtimes') + results = {} + averages= [] + scores = [] + for name in os.listdir(runtimes_folder): + strategy_path = os.path.join(data_folder, 'strategies', name) + rs_path = os.path.join(data_folder, 'resource_specs', name ) + if not os.path.isfile(rs_path): + rs_path += '.yml' + runtime_path = os.path.join(runtimes_folder, name) + + with open(runtime_path, 'r') as f: + runtimes = json.load(f) + average = np.array(runtimes['average']) + + s = Strategy.deserialize(strategy_path) + rs = ResourceSpec(resource_file=rs_path) + score = simulator.simulate(s, rs, strategy_path) + + results[name] = (average, score) + averages.append(average) + scores.append(score) + + # sorted_by_runtime = {k: v for k, v in sorted(results.items(), key=lambda item: item[1][0])} + # # sorted_by_scores = {k: v for k, v in sorted(res.items(), key=lambda item: item[1][1])} + # # sorted_by_latency = {k: v for k, v in sorted(res.items(), key=lambda item: item[1][2])} + # print('Sorted by runtime.......................') + # for _, (rt, prediction) in sorted_by_runtime.items(): + # print('runtime {} prediction {}'.format(rt, prediction)) + + y_train = np.array(averages) + test_score = np.array(scores) + true_comp = (y_train.ravel()[:, None] > y_train.ravel()[None, :]) + pred_comp = (test_score.ravel()[:, None] > test_score.ravel()[None, :]) + equal = (true_comp == pred_comp).astype(np.int) + test_acc = np.tril(equal, -1).sum() * 2. 
/ float(equal.shape[0]) / (float(equal.shape[0]) - 1) + + print('Test {} on {}, acc {:.4f}'.format(checkpoint_path, data_folder.split('/')[-1], test_acc)) diff --git a/autodist/simulator/models/rankrnn_simulator_penalty.py b/autodist/simulator/models/rankrnn_simulator_penalty.py new file mode 100644 index 0000000..380fa10 --- /dev/null +++ b/autodist/simulator/models/rankrnn_simulator_penalty.py @@ -0,0 +1,729 @@ +"""Strategy RankNetSimulator.""" +import glob +import json +import sys +from datetime import datetime +from pathlib import Path +from string import digits + +import numpy as np +import os +import tensorflow as tf +tf.compat.v1.disable_eager_execution() + +import arion +from arion.graph_item import GraphItem +from arion.proto.synchronizers_pb2 import PSSynchronizer, AllReduceSynchronizer +from arion.simulator.models.base import SimulatorBase +from arion.simulator.utils import get_dense_var_bits, get_sparse_var_bits, GIGABITS +from arion.simulator.utils import _resolve_device_address, _max_num_local_replica, _num_local_replica +from arion.strategy.random_sample_strategy import VariableHelper, PartHelper +from arion.strategy.base import Strategy +from arion.resource_spec import ResourceSpec +from arion.cluster import SSHCluster +from arion.kernel.device.resolver import DeviceResolver +from arion.kernel.partitioner import PartitionerConfig +from arion.simulator.models.predefined_simulator import PredefinedSimulator + +import torch +import torch.nn as nn + +TORCH_DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + +# feature settings +MAX_NUM_WORKERS = 16 +MAX_NUM_GROUPS = 600 +MAX_NUM_VARS = 500 +MAX_NUM_PARS = 1500 +FEATURE_SIZE = MAX_NUM_WORKERS+MAX_NUM_GROUPS+15 + +# model size +PARTITION_MLP_HIDDEN = 128 +PARTITION_MLP_OUT = 32 +STEM_RNN_HIDDEN = 128 +BIDIECTIONAL = True +BATCH_SIZE = 96 + +NUM_RNN_LAYERS = 3 +SCORE_TH = 0.005 +LR = 2e-3 +WD = 3e-4 +DATA_AUG = False +IN_LAYERS = 2 +OUT_LAYERS = 1 + +# ncf used: +# ~/projects/pycharm/zhijie/5-9-2020/oceanus-zhijie/arion/simulator/models/model_train_on_ncf-orca_new.ckpt 0.9020 +# noaug +# PARTITION_MLP_HIDDEN = 128 +# PARTITION_MLP_OUT = 32 +# STEM_RNN_HIDDEN = 128 +# BIDIECTIONAL = True +# NUM_RNN_LAYERS = 4 +# BATCH_SIZE = 64 +# LR = 1e-3 +# WD = 4e-4 + +# vgg used: +# ~/projects/pycharm/zhijie/5-9-2020/oceanus-zhijie/arion/simulator/models/model_train_on_vgg16-orca_new_new_new.ckpt 0.8374 +# noaug +# PARTITION_MLP_HIDDEN = 128 +# PARTITION_MLP_OUT = 32 +# STEM_RNN_HIDDEN = 128 +# BIDIECTIONAL = True +# NUM_RNN_LAYERS = 3 +# BATCH_SIZE = 64 +# LR = 1e-3 +# WD = 3e-4 + +GRAPH_ITEM_PATHS = {'ncf':'/users/hzhang2/projects/pycharm/zhijie/5-5-2020/original_graph_item', + 'densenet121': '/users/hzhang2/projects/pycharm/zhijie/graph_items/densenet121_original_graph_item', + 'inceptionv3': '/users/hzhang2/projects/pycharm/zhijie/graph_items/inceptionv3_original_graph_item', + 'resnet101': '/users/hzhang2/projects/pycharm/zhijie/graph_items/resnet101_original_graph_item', + 'resnet50': '/users/hzhang2/projects/pycharm/zhijie/graph_items/resnet50_original_graph_item', + 'vgg16': '/users/hzhang2/projects/pycharm/zhijie/graph_items/vgg16_original_graph_item', + 'bert_12l': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_12l', + 'bert_6l': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_6l', + 'bert_3l': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_3l', + 'bert_large': 
'/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_large'} + +def get_model(path_): + if 'densenet121' in path_: + return 'densenet121' + elif 'ncf' in path_: + return 'ncf' + elif 'inceptionv3' in path_: + return 'inceptionv3' + elif 'resnet101' in path_: + return 'resnet101' + elif 'resnet50' in path_: + return 'resnet50' + elif 'vgg16' in path_: + return 'vgg16' + elif 'bert' in path_ and '12l' in path_: + return 'bert_12l' + elif 'bert' in path_ and '6l' in path_: + return 'bert_6l' + elif 'bert' in path_ and '3l' in path_: + return 'bert_3l' + elif 'bert' in path_ and 'large' in path_: + return 'bert_large' + else: + return None + +class RankRNN(nn.Module): + def __init__(self, input_size=FEATURE_SIZE, + partition_mlp_hidden=PARTITION_MLP_HIDDEN, + partition_mlp_out=PARTITION_MLP_OUT, + stem_rnn_hidden=STEM_RNN_HIDDEN, + num_rnn_layers=NUM_RNN_LAYERS, + in_layers=IN_LAYERS, + out_layers=OUT_LAYERS, + bidirectional=BIDIECTIONAL): + super(RankRNN, self).__init__() + self.partition_mlp_out = partition_mlp_out + # self.num_rnn_layers = num_rnn_layers + self.stem_rnn_hidden = stem_rnn_hidden + tmp = [nn.Linear(input_size, partition_mlp_hidden)] + for _ in range(in_layers-2): + tmp.append(nn.ReLU()) + tmp.append(nn.Linear(partition_mlp_hidden, partition_mlp_hidden)) + tmp.append(nn.ReLU()) + tmp.append(nn.Linear(partition_mlp_hidden, partition_mlp_out)) + + self.partition_mlp = nn.Sequential(*tmp) + + self.stem_rnn = nn.LSTM(partition_mlp_out, stem_rnn_hidden, num_rnn_layers, batch_first=True, bidirectional=bidirectional) + + if out_layers == 1: + self.final_fc = nn.Linear(stem_rnn_hidden*num_rnn_layers*(1+int(bidirectional)), 1) + elif out_layers == 2: + self.final_fc = nn.Sequential(nn.Linear(stem_rnn_hidden*num_rnn_layers*(1+int(bidirectional)), 128), + nn.ReLU(), + nn.Linear(128, 1)) + + self.relu = nn.ReLU() + + def forward(self, features, par_indices, var_nums, return_feature=False): + + x = features.float() + # x = torch.cat([features[:, :, :MAX_NUM_WORKERS], features[:, :, MAX_NUM_WORKERS+MAX_NUM_GROUPS:]], 2).float() + x = self.partition_mlp(x) + + x1 = torch.zeros(features.shape[0], MAX_NUM_VARS, self.partition_mlp_out, device=TORCH_DEVICE, dtype=x.dtype) + x1.scatter_add_(1, par_indices.long()[:, :, None].expand(par_indices.shape[0], par_indices.shape[1], self.partition_mlp_out), x) + + # Set initial hidden and cell states + # h0 = torch.zeros(self.num_rnn_layers, x.size(0), self.stem_rnn_hidden).to(TORCH_DEVICE) + # c0 = torch.zeros(self.num_rnn_layers, x.size(0), self.stem_rnn_hidden).to(TORCH_DEVICE) + + # Forward propagate LSTM + x1 = torch.nn.utils.rnn.pack_padded_sequence(x1, var_nums.long(), batch_first=True, enforce_sorted=False) + out, (ht, ct) = self.stem_rnn(x1) # out: tensor of shape (batch_size, seq_length, hidden_size) + + # out = torch.nn.utils.rnn.pad_packed_sequence(out, batch_first=True)[0].sum(1) / var_nums[:, None] + out = ht.permute(1, 0, 2).reshape(x.shape[0], -1) + # print(out[0, var_nums[0] -1, [3]], out[0, var_nums[0], [3]]) + # print(ht.permute(1, 0, 2).shape, x.shape) + if return_feature: + return self.final_fc(out), out.div((out**2).sum(1, keepdim=True).sqrt()) + else: + return self.final_fc(out) + +class TrainTensorDataset(torch.utils.data.Dataset): + """TensorDataset with support of transforms. 
+ """ + def __init__(self, tensors): + assert all(tensors[0].size(0) == tensor.size(0) for tensor in tensors) + self.tensors = tensors + + def __getitem__(self, index): + x = self.tensors[0][index] + x = self.perturbe_device_and_group(x) + x1 = self.tensors[1][index] + x2 = self.tensors[2][index] + + y = self.tensors[3][index] + + return x, x1, x2, y + + def __len__(self): + return self.tensors[0].size(0) + + def perturbe_device_and_group(self, x): + if DATA_AUG: + perturbed_device_ids = np.random.permutation(MAX_NUM_WORKERS).astype(np.int32) + perturbed_group_ids = np.random.permutation(MAX_NUM_GROUPS).astype(np.int32) + mat_device = torch.eye(MAX_NUM_WORKERS, device=x.device, dtype=x.dtype)[perturbed_device_ids] + mat_group = torch.eye(MAX_NUM_GROUPS, device=x.device, dtype=x.dtype)[perturbed_group_ids] + x = torch.cat([torch.matmul(x[:, :MAX_NUM_WORKERS], mat_device), torch.matmul(x[:, MAX_NUM_WORKERS:MAX_NUM_WORKERS+MAX_NUM_GROUPS], mat_group), x[:, MAX_NUM_WORKERS+MAX_NUM_GROUPS:]], 1) + return x + + +def to_numpy(synchronizer, device, size_ratio, is_sparse, bd, num_replicas): + ret = [np.zeros(MAX_NUM_WORKERS), np.zeros(MAX_NUM_GROUPS), np.zeros(3), np.zeros(5), np.zeros(3)] + + if device is not None: + ret[0][device] = 1 + + group = getattr(synchronizer, 'group', None) + if group is not None: + assert group < MAX_NUM_GROUPS, group + ret[1][group] = 1 + + compressor = getattr(synchronizer, 'compressor', None) + if compressor is not None: + if compressor in ["PowerSGDCompressor", 3]: + ret[2][2] = 1 + elif compressor in ["HorovodCompressorEF", "HorovodCompressor", 2, 1]: + ret[2][1] = 1 + elif compressor in ["NoneCompressor", 0]: + ret[2][0] = 1 + else: + raise ValueError('Compressor does not exist: {}'.format(compressor)) + + local_replication = getattr(synchronizer, 'local_replication', None) + if isinstance(synchronizer, PSSynchronizer): + synchronizer = 0 + if int(local_replication) == 0: + if int(is_sparse) == 0: + ret[3][0] = 1 + else: + ret[3][1] = 1 + else: + if int(is_sparse) == 0: + ret[3][2] = 1 + else: + ret[3][3] = 1 + else: + ret[3][4] = 1 + ret[4] = np.array([size_ratio, bd, num_replicas]) + + return np.concatenate(ret) + +def connvert_feature(strategy, resource_spec, graph_item): + + cluster = SSHCluster(resource_spec) + device_resolver = DeviceResolver(cluster) + graph_replicas = [_resolve_device_address(k, device_resolver) for k, v in resource_spec.gpu_devices] + # bandwidth + network_bandwidth = np.array([resource_spec.network_bandwidth[device.split(':')[0]] for device, _ in resource_spec.cpu_devices]) + network_bandwidth = network_bandwidth + min_network_bandwidth = network_bandwidth.min() + # Other information + cpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.cpu_devices] + gpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.gpu_devices] + max_num_local_replica = _max_num_local_replica(graph_replicas, cluster) + total_num_local_replica = len(graph_replicas) + worker_num_replicas = [_num_local_replica(cpu_worker, graph_replicas, cluster) for cpu_worker in cpu_worker_list] + + num_vars = 0 + total_size_vars = 0 + for var_op, var in graph_item.trainable_var_op_to_var.items(): + num_vars += 1 + if var.initial_value.shape.ndims: + var_helper = VariableHelper(var, graph_item) + if var_helper.is_sparse: + total_size_vars += get_sparse_var_bits(np.prod(var_helper.shape)) + else: + total_size_vars += get_dense_var_bits(np.prod(var_helper.shape), var.dtype) + assert num_vars < 
MAX_NUM_VARS, num_vars + var_partition_features = np.zeros((MAX_NUM_PARS, FEATURE_SIZE-4)).astype(np.float32) + partition_indice = np.ones(MAX_NUM_PARS).astype(np.float32) * (MAX_NUM_VARS - 1) + + cnt = 0 + for node_id, node in enumerate(strategy.node_config): + var_name = node.var_name + for var_op, var in graph_item.trainable_var_op_to_var.items(): + if var.name == var_name: + break + var_helper = VariableHelper(var, graph_item) + + if node.partitioner: + pc = PartitionerConfig(partition_str=node.partitioner) + for i, part in enumerate(node.part_config): + part_helper = PartHelper(i, var, pc) + synchronizer = getattr(part, part.WhichOneof('synchronizer')) + reduction_destination = getattr(synchronizer, 'reduction_destination', None) + device = _resolve_device_address(reduction_destination if reduction_destination else var.device, + device_resolver) + if device == '': + assert(isinstance(synchronizer, AllReduceSynchronizer)) + device = None + bd = min_network_bandwidth + num_replicas = 0 + else: + device = cpu_worker_list.index(device) + bd = network_bandwidth[device] + num_replicas = worker_num_replicas[device] + + if var_helper.is_sparse: + size_ratio = get_sparse_var_bits(np.prod(part_helper.shape))/total_size_vars + else: + size_ratio = get_dense_var_bits(np.prod(part_helper.shape), var_helper.dtype)/total_size_vars + var_partition_features[cnt] = to_numpy(synchronizer, device, size_ratio, var_helper.is_sparse, bd, num_replicas) + partition_indice[cnt] = node_id + cnt += 1 + else: + synchronizer = getattr(node, node.WhichOneof('synchronizer')) + reduction_destination = getattr(synchronizer, 'reduction_destination', None) + device = _resolve_device_address(reduction_destination if reduction_destination else var.device, + device_resolver) + if device == '': + assert(isinstance(synchronizer, AllReduceSynchronizer)) + device = None + bd = min_network_bandwidth + num_replicas = 0 + else: + device = cpu_worker_list.index(device) + bd = network_bandwidth[device] + num_replicas = worker_num_replicas[device] + + if var_helper.is_sparse: + size_ratio = get_sparse_var_bits(np.prod(var_helper.shape))/total_size_vars + else: + size_ratio = get_dense_var_bits(np.prod(var_helper.shape), var_helper.dtype)/total_size_vars + var_partition_features[cnt] = to_numpy(synchronizer, device, size_ratio, var_helper.is_sparse, bd, num_replicas) + partition_indice[cnt] = node_id + cnt += 1 + return var_partition_features, partition_indice, np.array(node_id+1) + +def create_predefined_features(strategy, resource_spec, predefined_simulator): + + var_sync_time, vars, resource = predefined_simulator.predefined_sync_time(strategy, resource_spec) + + features = [] + for var_name, sync_time in var_sync_time.items(): + if isinstance(sync_time, list) or isinstance(sync_time, tuple): # send_time, receive_time in PS strategies. 
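+ # Descriptive note (added): for PS-synchronized variables the predefined simulator returns a
+ # (send_time, receive_time) pair, so total transmission below sums both legs, while the
+ # overhead terms are read from the send-time dict; AR variables return a single dict.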
+ transmission = sync_time[0]['transmission'] + sync_time[1]['transmission'] + sync_time = sync_time[0] + is_ps = True + else: # AR + transmission = sync_time['transmission'] + is_ps = False + + network_overhead = sync_time['network_overhead'] + gpu_kernel_memory_latency = sync_time['gpu_kernel_memory_latency'] + + feat = [transmission, network_overhead, gpu_kernel_memory_latency, float(is_ps)] + features.append(feat) + features = np.array(features, dtype=np.float) + return features + +class RankRNNSimulatorPenalty(SimulatorBase): + """Simulates strategies for a given graph and resource spec.""" + + def __init__(self, + original_graph_item_path, + num_rnn_layers, + in_layers, + out_layers, + fetches=None, + batch_size=1, + seq_len=1, + checkpoint=None): + + super(RankRNNSimulatorPenalty, self).__init__(original_graph_item_path=original_graph_item_path) + print("It's using RankNet simulator.") + self._fetches = fetches + self._batch_size_per_gpu = batch_size + self._seq_len = seq_len + self._checkpoint = checkpoint + self._predefined_simulator=PredefinedSimulator(original_graph_item_path=original_graph_item_path, + batch_size=self._batch_size_per_gpu, + seq_len=self._seq_len) + if self._checkpoint: + self._model = RankRNN(num_rnn_layers=num_rnn_layers, in_layers=in_layers, out_layers=out_layers).to(TORCH_DEVICE) + self._model.load_state_dict(torch.load(self._checkpoint)) + + def simulate(self, strategy, resource_spec, strategy_path=None, checkpoint=None): + score, feature = self.predict(strategy, resource_spec, strategy_path, checkpoint) + return score.view(-1).data.cpu().numpy(), feature.view(-1).data.cpu().numpy() + + + def predict(self, + strategy, + resource_spec, + strategy_path=None, + checkpoint=None): + if checkpoint is None: + if self._checkpoint is None: + raise ValueError("checkpoint is None: {}".format(checkpoint)) + else: + model = self._model + else: + model = RankRNN().to(TORCH_DEVICE) + model.load_state_dict(torch.load(checkpoint)) + if strategy_path and os.path.isfile((strategy_path+'.npz').replace('strategies', 'npz')): + loaded = np.load((strategy_path+'.npz').replace('strategies', 'npz')) + var_partition_features, partition_indice, var_num, _ = \ + loaded['x1'], loaded['x2'], loaded['x3'], loaded['y'] + else: + var_partition_features, partition_indice, var_num = \ + connvert_feature(strategy, resource_spec, self._original_graph_item) + + if strategy_path and os.path.isfile((strategy_path+'_pdf.npz').replace('strategies', 'npz')): + loaded = np.load((strategy_path+'_pdf.npz').replace('strategies', 'npz')) + predefined_features = loaded['x4'] + else: + predefined_features = create_predefined_features(strategy, resource_spec, self._predefined_simulator) + + var_partition_features = np.concatenate([var_partition_features, np.concatenate([predefined_features, np.zeros((MAX_NUM_PARS-predefined_features.shape[0], predefined_features.shape[1]))], 0)], 1) + + var_partition_features = torch.from_numpy(var_partition_features).unsqueeze(0).to(TORCH_DEVICE) + partition_indice = torch.from_numpy(partition_indice).unsqueeze(0).to(TORCH_DEVICE) + var_num = torch.from_numpy(var_num).unsqueeze(0).to(TORCH_DEVICE) + + return model(var_partition_features, partition_indice, var_num, True) + +class RankNetTrainer(): + + def __init__(self, + batch_size_per_gpu=256, + seq_len=1, + seed=1): + self._batch_size_per_gpu = batch_size_per_gpu + self._seq_len = seq_len + self.graph_items = {k:GraphItem.deserialize(v) for k, v in GRAPH_ITEM_PATHS.items()} + self.predefined_simulators = {k: 
PredefinedSimulator(original_graph_item_path=v, + batch_size=self._batch_size_per_gpu, + seq_len=self._seq_len) for k, v in GRAPH_ITEM_PATHS.items()} + self.best_acc = 0. + print("It's using RankNet trainer.") + + def load_data(self, path_list, train_patterns=[('ncf', 0)], valid_patterns='same'): + features = {k: [[[], [], [], []], [[], [], [], []]] for k, _ in GRAPH_ITEM_PATHS.items()} + for training_path in path_list: + for path in Path(training_path).rglob('strategies'): + strategy_paths = glob.glob(os.path.join(path, '*')) + # strategy_paths = np.random.permutation(list(strategy_paths)) + for strategy_path in strategy_paths: + if 'json' in strategy_path or \ + 'bert_large_batch_8_orca_16_group_2/' in strategy_path: + continue + model = get_model(strategy_path) + if model is None: + if not ('densenets169' in strategy_path or 'densenets201' in strategy_path): + assert False, strategy_path + continue + rs_path = strategy_path.replace('strategies', 'resource_specs') + runtime_path = strategy_path.replace('strategies', 'runtimes') + npz_path = (strategy_path+'.npz').replace('strategies', 'npz') + if not os.path.isfile(rs_path): + rs_path += '.yml' + if not (os.path.isfile(rs_path) and os.path.isfile(runtime_path)): + continue + if not os.path.exists(os.path.dirname(npz_path)): + os.makedirs(os.path.dirname(npz_path)) + + if not os.path.isfile(npz_path): + strategy = Strategy.deserialize(path=strategy_path) + rs = ResourceSpec(resource_file=rs_path) + var_partition_features, partition_indice, var_num = \ + connvert_feature(strategy, rs, self.graph_items[model]) + label = np.array(json.load(open(runtime_path))['average']) + np.savez_compressed(npz_path, x1=var_partition_features, x2=partition_indice, x3=var_num, y=label) + else: + loaded = np.load(npz_path) + var_partition_features, partition_indice, var_num, label = \ + loaded['x1'], loaded['x2'], loaded['x3'], loaded['y'] + + if not os.path.isfile(npz_path.replace('.npz', '_pdf.npz')): + predefined_features = create_predefined_features(Strategy.deserialize(path=strategy_path), ResourceSpec(resource_file=rs_path), self.predefined_simulators[model]) + np.savez_compressed(npz_path.replace('.npz', '_pdf.npz'), x4=predefined_features) + else: + loaded = np.load(npz_path.replace('.npz', '_pdf.npz')) + predefined_features = loaded['x4'] + var_partition_features = np.concatenate([var_partition_features, np.concatenate([predefined_features, np.zeros((MAX_NUM_PARS-predefined_features.shape[0], predefined_features.shape[1]))], 0)], 1) + + # is_aws = int('g3' in strategy_path or 'g4' in strategy_path or 'aws' in strategy_path) # comment here + is_aws = int('vgg16_orca_11_random_rejection-4_trial-100-_expolre-2000_0.83-model_embedding_sim-weight-1_max-par-40/' in strategy_path) + # print(model, 'orca' if is_aws == 0 else 'aws', strategy_path.split('/')[-3]) + features[model][is_aws][0].append(var_partition_features) + features[model][is_aws][1].append(partition_indice) + features[model][is_aws][2].append(var_num) + features[model][is_aws][3].append(label) + + for k, _ in GRAPH_ITEM_PATHS.items(): + for i1 in range(2): + for i2 in range(4): + if len(features[k][i1][i2]) > 1: + features[k][i1][i2] = np.stack(features[k][i1][i2]).astype(np.float16) + print(k, 'orca' if i1 == 0 else 'aws', features[k][i1][i2].shape) + else: + features[k][i1][i2] = None + + train_features = np.concatenate([features[model_][is_aws_][0] for model_, is_aws_ in train_patterns if features[model_][is_aws_][0] is not None], 0) + train_par_indices = 
np.concatenate([features[model_][is_aws_][1] for model_, is_aws_ in train_patterns if features[model_][is_aws_][1] is not None], 0) + train_var_nums = np.concatenate([features[model_][is_aws_][2] for model_, is_aws_ in train_patterns if features[model_][is_aws_][2] is not None], 0) + train_labels = np.concatenate([features[model_][is_aws_][3] for model_, is_aws_ in train_patterns if features[model_][is_aws_][3] is not None], 0) + + if type(valid_patterns[0]) == str and valid_patterns[0] == 'same': + rng = np.random.RandomState(1) + permt = rng.permutation(train_features.shape[0]) + split = int(len(permt) * 0.7) + val_features, val_par_indices, val_var_nums, val_labels = train_features[permt[split:]], train_par_indices[permt[split:]], train_var_nums[permt[split:]], train_labels[permt[split:]] + train_features, train_par_indices, train_var_nums, train_labels = train_features[permt[:split]], train_par_indices[permt[:split]], train_var_nums[permt[:split]], train_labels[permt[:split]] + else: + val_features = np.concatenate([features[model_][is_aws_][0] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][0] is not None], 0) + val_par_indices = np.concatenate([features[model_][is_aws_][1] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][1] is not None], 0) + val_var_nums = np.concatenate([features[model_][is_aws_][2] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][2] is not None], 0) + val_labels = np.concatenate([features[model_][is_aws_][3] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][3] is not None], 0) + + # comment here + rng = np.random.RandomState(1) + permt = rng.permutation(val_features.shape[0]) + split = int(len(permt) * 0.7) + train_features, train_par_indices, train_var_nums, train_labels = np.concatenate([train_features, val_features[permt[:split]]], 0), np.concatenate([train_par_indices, val_par_indices[permt[:split]]], 0), np.concatenate([train_var_nums, val_var_nums[permt[:split]]], 0), np.concatenate([train_labels, val_labels[permt[:split]]], 0) + + val_features, val_par_indices, val_var_nums, val_labels = val_features[permt[split:]], val_par_indices[permt[split:]], val_var_nums[permt[split:]], val_labels[permt[split:]] + label_max = max(train_labels.max(), val_labels.max()) + label_min = min(train_labels.min(), val_labels.min()) + train_labels = (train_labels-label_min)/(label_max-label_min) + val_labels = (val_labels-label_min)/(label_max-label_min) + print(train_features.shape, val_features.shape, train_features.max(), train_features.min(), val_features.max(), val_features.min(), train_labels.max(), val_labels.min()) + + ## train the model + trainset = TrainTensorDataset((torch.from_numpy(train_features).half().to(TORCH_DEVICE), torch.from_numpy(train_par_indices).half().to(TORCH_DEVICE), torch.from_numpy(train_var_nums).half().to(TORCH_DEVICE), torch.from_numpy(train_labels).half().to(TORCH_DEVICE))) + testset = torch.utils.data.TensorDataset(torch.from_numpy(val_features).half().to(TORCH_DEVICE), torch.from_numpy(val_par_indices).half().to(TORCH_DEVICE), torch.from_numpy(val_var_nums).half().to(TORCH_DEVICE), torch.from_numpy(val_labels).half().to(TORCH_DEVICE)) + self.trainloader = torch.utils.data.DataLoader(dataset=trainset, + batch_size=BATCH_SIZE, + shuffle=True) + self.testloader = torch.utils.data.DataLoader(dataset=testset, + batch_size=32, + shuffle=False) + + def train(self, name='', num_epochs=200, checkpoint=None): + + checkpoint_path = 'model_on_{}.ckpt'.format(name) + print('LSTM 
layers: ', NUM_RNN_LAYERS, 'score th: ', SCORE_TH, 'lr: ', LR, 'wd: ', WD,'use data aug: ', DATA_AUG, 'OUT_LAYERS: ', OUT_LAYERS, 'IN_LAYERS: ',IN_LAYERS) + + np.random.seed(1) + torch.manual_seed(1) + torch.cuda.manual_seed_all(1) + model = RankRNN(num_rnn_layers=NUM_RNN_LAYERS, out_layers=OUT_LAYERS, in_layers=IN_LAYERS).to(TORCH_DEVICE) + if checkpoint: + model.load_state_dict(torch.load(checkpoint)) + optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=WD) + + best_val_acc = 0. + for epoch in range(num_epochs): + if epoch == int(num_epochs*2./5. - 1): + for param_group in optimizer.param_groups: param_group['lr'] = 3e-4 + if epoch == int(num_epochs*4./5. - 1): + for param_group in optimizer.param_groups: param_group['lr'] = 1e-4 + + labels = [] + outputs = [] + for i, (features_b, par_indices_b, var_nums_b, labels_b) in enumerate(self.trainloader): + + # Forward pass + outputs_b = model(features_b, par_indices_b, var_nums_b).squeeze() + + par_cnt = (par_indices_b.int() != MAX_NUM_VARS - 1).int().sum(1) + + true_comp = ( + (labels_b[:, None]+SCORE_TH>labels_b[None,:]).int()*(par_cnt[:, None] > par_cnt[None, :]).int() + + (labels_b[:, None]-SCORE_TH>labels_b[None,:]).int()*(par_cnt[:, None] < par_cnt[None, :]).int() + + (labels_b[:, None] > labels_b[None,:]).int() * (par_cnt[:, None] == par_cnt[None, :]).int() + ) > 0 + true_comp = true_comp.float() * 2 - 1 + pred_comp = outputs_b[:, None] - outputs_b[None, :] + loss = (1 - true_comp) * pred_comp / 2 + torch.nn.functional.softplus(-pred_comp) + loss = loss.tril(-1).mean() + + # Backward and optimize + optimizer.zero_grad() + loss.backward() + torch.nn.utils.clip_grad_norm_(model.stem_rnn.parameters(), 0.25) + optimizer.step() + + outputs.append(outputs_b) + labels.append(labels_b) + + labels = torch.cat(labels) + outputs = torch.cat(outputs) + true_comp = (labels[:, None] > labels[None, :]) + pred_comp = (outputs[:, None] > outputs[None, :]) + equal = (true_comp == pred_comp).int() + train_acc = equal.tril(-1).sum() * 2. /float(equal.shape[0])/(float(equal.shape[0]) - 1) + + with torch.no_grad(): + labels = [] + outputs = [] + for features_b, par_indices_b, var_nums_b, labels_b in self.testloader: + + # Forward pass + outputs_b = model(features_b, par_indices_b, var_nums_b).squeeze() + outputs.append(outputs_b) + labels.append(labels_b) + + labels = torch.cat(labels) + outputs = torch.cat(outputs) + true_comp = (labels[:, None] > labels[None, :]) + pred_comp = (outputs[:, None] > outputs[None, :]) + equal = (true_comp == pred_comp).int() + acc = equal.tril(-1).sum() * 2. 
/float(equal.shape[0])/(float(equal.shape[0]) - 1) + if acc.item() > best_val_acc: + best_val_acc = acc.item() + if best_val_acc > self.best_acc: + print('Saved model @ acc', best_val_acc) + torch.save(model.state_dict(), checkpoint_path) + self.best_acc = best_val_acc + # print('Saved model to {}'.format(checkpoint_path)) + if epoch == num_epochs - 1: + print('Epoch: {}, training acc: {:.4f}, test acc: {:.4f}, best acc: {:.4f}, overall best acc: {:.4f}'.format(epoch, train_acc.item(), acc.item(), best_val_acc, self.best_acc)) + return checkpoint_path + + +if __name__ == '__main__': + + if True: + trainer = RankNetTrainer() + trainer.load_data([ + '/users/hzhang2/oceanus_cost_model_training_data/vgg16', + # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-11-20', + # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-9-20', + # '/users/hzhang2/oceanus_cost_model_training_data/ncf', + # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-13-20', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_12_orca_16', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_3', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_4', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_5', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_random_orca_11', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert-aws/bert-large-aws4g4', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert-aws/bert_large_random_search_aws_4_ps_only', + # '/users/hzhang2/oceanus_cost_model_training_data/densenet', + # '/users/hzhang2/oceanus_cost_model_training_data/inceptionv3', + # '/users/hzhang2/oceanus_cost_model_training_data/resnet101', + # '/users/hzhang2/oceanus_cost_model_training_data/resnet50', + ], + [ + ('vgg16', 0), #('vgg16', 1), + # ('ncf', 0), #('ncf', 1), + # ('bert_large', 1), #('bert_large', 1), + # not used: + # ('densenet121', 0), ('densenet121', 1), + # ('inceptionv3', 0), ('inceptionv3', 1), + # ('resnet101', 0), ('resnet101', 1), + # ('resnet50', 0), ('resnet50', 1), + # ('bert_12l', 0), ('bert_12l', 1), + # ('bert_6l', 0), ('bert_6l', 1), + # ('bert_3l', 0), ('bert_3l', 1), + ], + [ + ('vgg16', 1), + # ('ncf', 1), + # ('bert_large', 1), + # 'same', + ], + ) + + for p2 in [0.01, 0.03]: + for p3 in [1e-3, 3e-3, 1e-4, 3e-4, 5e-3]: + for p4 in [1e-3, 2e-3, 1e-4, 3e-4, 5e-4, 5e-5]: + for p1 in [3, 4, 2]: + for p5 in [2, 3]: + for p6 in [1, 2]: + NUM_RNN_LAYERS, SCORE_TH, LR, WD, IN_LAYERS, OUT_LAYERS = p1, p2, p3, p4, p5, p6 + checkpoint_path = trainer.train(name='vgg-orca-validon-0.83-sim1', num_epochs=200) + exit() + else: + checkpoint_path = '/users/hzhang2/projects/pycharm/zhijie/5-9-2020/oceanus-zhijie/arion/simulator/models/model_on_vgg-orca.ckpt' + test_list = [ + # '/users/hzhang2/oceanus_cost_model_training_data/vgg16/vgg16_orca_15', + # '/users/hzhang2/oceanus_cost_model_training_data/vgg16/vgg_random_orca_11', #TARGET: 0.9 + '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-13-20/ncf_large_adam_random_search_aws_4', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_12_orca_16', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_3', + # 
'/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_4', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_5', + ] + + for data_folder in test_list: + simulator = RankRNNSimulator(GRAPH_ITEM_PATHS[get_model(data_folder)], + num_rnn_layers=3, + batch_size=256, + seq_len=1, + checkpoint=checkpoint_path) + + runtimes_folder = os.path.join(data_folder, 'runtimes') + results = {} + averages= [] + scores = [] + for name in os.listdir(runtimes_folder): + strategy_path = os.path.join(data_folder, 'strategies', name) + rs_path = os.path.join(data_folder, 'resource_specs', name ) + if not os.path.isfile(rs_path): + rs_path += '.yml' + runtime_path = os.path.join(runtimes_folder, name) + + with open(runtime_path, 'r') as f: + runtimes = json.load(f) + average = np.array(runtimes['average']) + + s = Strategy.deserialize(strategy_path) + rs = ResourceSpec(resource_file=rs_path) + score = simulator.simulate(s, rs, strategy_path) + + results[name] = (average, score) + averages.append(average) + scores.append(score) + + # sorted_by_runtime = {k: v for k, v in sorted(results.items(), key=lambda item: item[1][0])} + # # sorted_by_scores = {k: v for k, v in sorted(res.items(), key=lambda item: item[1][1])} + # # sorted_by_latency = {k: v for k, v in sorted(res.items(), key=lambda item: item[1][2])} + # print('Sorted by runtime.......................') + # for _, (rt, prediction) in sorted_by_runtime.items(): + # print('runtime {} prediction {}'.format(rt, prediction)) + + y_train = np.array(averages) + test_score = np.array(scores) + true_comp = (y_train.ravel()[:, None] > y_train.ravel()[None, :]) + pred_comp = (test_score.ravel()[:, None] > test_score.ravel()[None, :]) + equal = (true_comp == pred_comp).astype(np.int) + test_acc = np.tril(equal, -1).sum() * 2. 
/ float(equal.shape[0]) / (float(equal.shape[0]) - 1) + + print('Test {} on {}, acc {:.4f}'.format(checkpoint_path, data_folder.split('/')[-1], test_acc)) diff --git a/autodist/simulator/models/rankrnn_simulator_penalty_fast.py b/autodist/simulator/models/rankrnn_simulator_penalty_fast.py new file mode 100644 index 0000000..5e08bbd --- /dev/null +++ b/autodist/simulator/models/rankrnn_simulator_penalty_fast.py @@ -0,0 +1,1027 @@ +"""Strategy RankNetSimulator.""" +import glob +import json +import sys +from datetime import datetime +from pathlib import Path +from string import digits +import time + +import numpy as np +import os +import tensorflow as tf +tf.compat.v1.disable_eager_execution() + +import arion +from arion.graph_item import GraphItem +from arion.proto.synchronizers_pb2 import PSSynchronizer, AllReduceSynchronizer +from arion.simulator.models.base import SimulatorBase +from arion.simulator.utils import get_dense_var_bits, get_sparse_var_bits, GIGABITS +from arion.simulator.utils import _resolve_device_address, _max_num_local_replica, _num_local_replica, _resolved_devices_on_diff_machine +from arion.strategy.random_sample_strategy import VariableHelper, PartHelper +from arion.strategy.base import Strategy +from arion.resource_spec import ResourceSpec +from arion.cluster import SSHCluster +from arion.kernel.device.resolver import DeviceResolver +from arion.kernel.partitioner import PartitionerConfig +from arion.simulator.models.predefined_simulator import PredefinedSimulator + +import torch +import torch.nn as nn + +import multiprocessing +from multiprocessing import Process, Queue + +TORCH_DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + +# feature settings +MAX_NUM_WORKERS = 16 +MAX_NUM_GROUPS = 600 +MAX_NUM_VARS = 500 +MAX_NUM_PARS = 1500 +FEATURE_SIZE = MAX_NUM_WORKERS+MAX_NUM_GROUPS+15 + +# model size +PARTITION_MLP_HIDDEN = 128 +PARTITION_MLP_OUT = 32 +STEM_RNN_HIDDEN = 128 +BIDIECTIONAL = True +BATCH_SIZE = 96 + +NUM_RNN_LAYERS = 3 +SCORE_TH = 0.005 +LR = 2e-3 +WD = 3e-4 +DATA_AUG = False +IN_LAYERS = 2 +OUT_LAYERS = 1 + +# ncf used: +# ~/projects/pycharm/zhijie/5-9-2020/oceanus-zhijie/arion/simulator/models/model_train_on_ncf-orca_new.ckpt 0.9020 +# noaug +# PARTITION_MLP_HIDDEN = 128 +# PARTITION_MLP_OUT = 32 +# STEM_RNN_HIDDEN = 128 +# BIDIECTIONAL = True +# NUM_RNN_LAYERS = 4 +# BATCH_SIZE = 64 +# LR = 1e-3 +# WD = 4e-4 + +# vgg used: +# ~/projects/pycharm/zhijie/5-9-2020/oceanus-zhijie/arion/simulator/models/model_train_on_vgg16-orca_new_new_new.ckpt 0.8374 +# noaug +# PARTITION_MLP_HIDDEN = 128 +# PARTITION_MLP_OUT = 32 +# STEM_RNN_HIDDEN = 128 +# BIDIECTIONAL = True +# NUM_RNN_LAYERS = 3 +# BATCH_SIZE = 64 +# LR = 1e-3 +# WD = 3e-4 + +GRAPH_ITEM_PATHS = {'ncf':'/users/hzhang2/projects/pycharm/zhijie/5-5-2020/original_graph_item', + 'densenet121': '/users/hzhang2/projects/pycharm/zhijie/graph_items/densenet121_original_graph_item', + 'inceptionv3': '/users/hzhang2/projects/pycharm/zhijie/graph_items/inceptionv3_original_graph_item', + 'resnet101': '/users/hzhang2/projects/pycharm/zhijie/graph_items/resnet101_original_graph_item', + 'resnet50': '/users/hzhang2/projects/pycharm/zhijie/graph_items/resnet50_original_graph_item', + 'vgg16': '/users/hzhang2/projects/pycharm/zhijie/graph_items/vgg16_original_graph_item', + 'bert_12l': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_12l', + 'bert_6l': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_6l', + 'bert_3l': 
'/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_3l', + 'bert_large': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_large'} + +def get_model(path_): + if 'densenet121' in path_: + return 'densenet121' + elif 'ncf' in path_: + return 'ncf' + elif 'inceptionv3' in path_: + return 'inceptionv3' + elif 'resnet101' in path_: + return 'resnet101' + elif 'resnet50' in path_: + return 'resnet50' + elif 'vgg16' in path_: + return 'vgg16' + elif 'bert' in path_ and '12l' in path_: + return 'bert_12l' + elif 'bert' in path_ and '6l' in path_: + return 'bert_6l' + elif 'bert' in path_ and '3l' in path_: + return 'bert_3l' + elif 'bert' in path_ and 'large' in path_: + return 'bert_large' + else: + return None + +class RankRNN(nn.Module): + def __init__(self, input_size=FEATURE_SIZE, + partition_mlp_hidden=PARTITION_MLP_HIDDEN, + partition_mlp_out=PARTITION_MLP_OUT, + stem_rnn_hidden=STEM_RNN_HIDDEN, + num_rnn_layers=NUM_RNN_LAYERS, + in_layers=IN_LAYERS, + out_layers=OUT_LAYERS, + bidirectional=BIDIECTIONAL): + super(RankRNN, self).__init__() + self.partition_mlp_out = partition_mlp_out + # self.num_rnn_layers = num_rnn_layers + self.stem_rnn_hidden = stem_rnn_hidden + tmp = [nn.Linear(input_size, partition_mlp_hidden)] + for _ in range(in_layers-2): + tmp.append(nn.ReLU()) + tmp.append(nn.Linear(partition_mlp_hidden, partition_mlp_hidden)) + tmp.append(nn.ReLU()) + tmp.append(nn.Linear(partition_mlp_hidden, partition_mlp_out)) + + self.partition_mlp = nn.Sequential(*tmp) + + self.stem_rnn = nn.LSTM(partition_mlp_out, stem_rnn_hidden, num_rnn_layers, batch_first=True, bidirectional=bidirectional) + + if out_layers == 1: + self.final_fc = nn.Linear(stem_rnn_hidden*num_rnn_layers*(1+int(bidirectional)), 1) + elif out_layers == 2: + self.final_fc = nn.Sequential(nn.Linear(stem_rnn_hidden*num_rnn_layers*(1+int(bidirectional)), 128), + nn.ReLU(), + nn.Linear(128, 1)) + + self.relu = nn.ReLU() + + def forward(self, features, par_indices, var_nums, return_feature=False): + # print(features.shape, par_indices.shape, var_nums.shape) + x = features.float() + # x = torch.cat([features[:, :, :MAX_NUM_WORKERS], features[:, :, MAX_NUM_WORKERS+MAX_NUM_GROUPS:]], 2).float() + x = self.partition_mlp(x) + + x1 = torch.zeros(features.shape[0], MAX_NUM_VARS, self.partition_mlp_out, device=TORCH_DEVICE, dtype=x.dtype) + x1.scatter_add_(1, par_indices.long()[:, :, None].expand(par_indices.shape[0], par_indices.shape[1], self.partition_mlp_out), x) + + # Set initial hidden and cell states + # h0 = torch.zeros(self.num_rnn_layers, x.size(0), self.stem_rnn_hidden).to(TORCH_DEVICE) + # c0 = torch.zeros(self.num_rnn_layers, x.size(0), self.stem_rnn_hidden).to(TORCH_DEVICE) + + # Forward propagate LSTM + x1 = torch.nn.utils.rnn.pack_padded_sequence(x1, var_nums.long(), batch_first=True, enforce_sorted=False) + out, (ht, ct) = self.stem_rnn(x1) # out: tensor of shape (batch_size, seq_length, hidden_size) + + # out = torch.nn.utils.rnn.pad_packed_sequence(out, batch_first=True)[0].sum(1) / var_nums[:, None] + out = ht.permute(1, 0, 2).reshape(x.shape[0], -1) + # print(out[0, var_nums[0] -1, [3]], out[0, var_nums[0], [3]]) + # print(ht.permute(1, 0, 2).shape, x.shape) + if return_feature: + return self.final_fc(out), out.div((out**2).sum(1, keepdim=True).sqrt()) + else: + return self.final_fc(out) + +class TrainTensorDataset(torch.utils.data.Dataset): + """TensorDataset with support of transforms. 
+ """ + def __init__(self, tensors): + assert all(tensors[0].size(0) == tensor.size(0) for tensor in tensors) + self.tensors = tensors + + def __getitem__(self, index): + x = self.tensors[0][index] + x = self.perturbe_device_and_group(x) + x1 = self.tensors[1][index] + x2 = self.tensors[2][index] + + y = self.tensors[3][index] + + return x, x1, x2, y + + def __len__(self): + return self.tensors[0].size(0) + + def perturbe_device_and_group(self, x): + if DATA_AUG: + perturbed_device_ids = np.random.permutation(MAX_NUM_WORKERS).astype(np.int32) + perturbed_group_ids = np.random.permutation(MAX_NUM_GROUPS).astype(np.int32) + mat_device = torch.eye(MAX_NUM_WORKERS, device=x.device, dtype=x.dtype)[perturbed_device_ids] + mat_group = torch.eye(MAX_NUM_GROUPS, device=x.device, dtype=x.dtype)[perturbed_group_ids] + x = torch.cat([torch.matmul(x[:, :MAX_NUM_WORKERS], mat_device), torch.matmul(x[:, MAX_NUM_WORKERS:MAX_NUM_WORKERS+MAX_NUM_GROUPS], mat_group), x[:, MAX_NUM_WORKERS+MAX_NUM_GROUPS:]], 1) + return x + + +def to_numpy(synchronizer, device, size_ratio, is_sparse, bd, num_replicas): + ret = [np.zeros(MAX_NUM_WORKERS), np.zeros(MAX_NUM_GROUPS), np.zeros(3), np.zeros(5), np.zeros(3)] + + if device is not None: + ret[0][device] = 1 + + group = getattr(synchronizer, 'group', None) + if group is not None: + assert group < MAX_NUM_GROUPS, group + ret[1][group] = 1 + + compressor = getattr(synchronizer, 'compressor', None) + if compressor is not None: + if compressor in ["PowerSGDCompressor", 3]: + ret[2][2] = 1 + elif compressor in ["HorovodCompressorEF", "HorovodCompressor", 2, 1]: + ret[2][1] = 1 + elif compressor in ["NoneCompressor", 0]: + ret[2][0] = 1 + else: + raise ValueError('Compressor does not exist: {}'.format(compressor)) + + local_replication = getattr(synchronizer, 'local_replication', None) + if isinstance(synchronizer, PSSynchronizer): + synchronizer = 0 + if int(local_replication) == 0: + if int(is_sparse) == 0: + ret[3][0] = 1 + else: + ret[3][1] = 1 + else: + if int(is_sparse) == 0: + ret[3][2] = 1 + else: + ret[3][3] = 1 + else: + ret[3][4] = 1 + ret[4] = np.array([size_ratio, bd, num_replicas]) + + return np.concatenate(ret) + +def connvert_feature(strategy, resource_spec, graph_item): + + cluster = SSHCluster(resource_spec) + device_resolver = DeviceResolver(cluster) + graph_replicas = [_resolve_device_address(k, device_resolver) for k, v in resource_spec.gpu_devices] + # bandwidth + network_bandwidth = np.array([resource_spec.network_bandwidth[device.split(':')[0]] for device, _ in resource_spec.cpu_devices]) + network_bandwidth = network_bandwidth + min_network_bandwidth = network_bandwidth.min() + # Other information + cpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.cpu_devices] + gpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.gpu_devices] + max_num_local_replica = _max_num_local_replica(graph_replicas, cluster) + total_num_local_replica = len(graph_replicas) + worker_num_replicas = [_num_local_replica(cpu_worker, graph_replicas, cluster) for cpu_worker in cpu_worker_list] + + num_vars = 0 + total_size_vars = 0 + for var_op, var in graph_item.trainable_var_op_to_var.items(): + num_vars += 1 + if var.initial_value.shape.ndims: + var_helper = VariableHelper(var, graph_item) + if var_helper.is_sparse: + total_size_vars += get_sparse_var_bits(np.prod(var_helper.shape)) + else: + total_size_vars += get_dense_var_bits(np.prod(var_helper.shape), var.dtype) + assert num_vars < 
MAX_NUM_VARS, num_vars + var_partition_features = np.zeros((MAX_NUM_PARS, FEATURE_SIZE-4)).astype(np.float32) + partition_indice = np.ones(MAX_NUM_PARS).astype(np.float32) * (MAX_NUM_VARS - 1) + + cnt = 0 + for node_id, node in enumerate(strategy.node_config): + var_name = node.var_name + for var_op, var in graph_item.trainable_var_op_to_var.items(): + if var.name == var_name: + break + var_helper = VariableHelper(var, graph_item) + + if node.partitioner: + pc = PartitionerConfig(partition_str=node.partitioner) + for i, part in enumerate(node.part_config): + part_helper = PartHelper(i, var, pc) + synchronizer = getattr(part, part.WhichOneof('synchronizer')) + reduction_destination = getattr(synchronizer, 'reduction_destination', None) + device = _resolve_device_address(reduction_destination if reduction_destination else var.device, + device_resolver) + if device == '': + assert(isinstance(synchronizer, AllReduceSynchronizer)) + device = None + bd = min_network_bandwidth + num_replicas = 0 + else: + device = cpu_worker_list.index(device) + bd = network_bandwidth[device] + num_replicas = worker_num_replicas[device] + + if var_helper.is_sparse: + size_ratio = get_sparse_var_bits(np.prod(part_helper.shape))/total_size_vars + else: + size_ratio = get_dense_var_bits(np.prod(part_helper.shape), var_helper.dtype)/total_size_vars + var_partition_features[cnt] = to_numpy(synchronizer, device, size_ratio, var_helper.is_sparse, bd, num_replicas) + partition_indice[cnt] = node_id + cnt += 1 + else: + synchronizer = getattr(node, node.WhichOneof('synchronizer')) + reduction_destination = getattr(synchronizer, 'reduction_destination', None) + device = _resolve_device_address(reduction_destination if reduction_destination else var.device, + device_resolver) + if device == '': + assert(isinstance(synchronizer, AllReduceSynchronizer)) + device = None + bd = min_network_bandwidth + num_replicas = 0 + else: + device = cpu_worker_list.index(device) + bd = network_bandwidth[device] + num_replicas = worker_num_replicas[device] + + if var_helper.is_sparse: + size_ratio = get_sparse_var_bits(np.prod(var_helper.shape))/total_size_vars + else: + size_ratio = get_dense_var_bits(np.prod(var_helper.shape), var_helper.dtype)/total_size_vars + var_partition_features[cnt] = to_numpy(synchronizer, device, size_ratio, var_helper.is_sparse, bd, num_replicas) + partition_indice[cnt] = node_id + cnt += 1 + return var_partition_features, partition_indice, np.array(node_id+1) + +def create_predefined_features(strategy, resource_spec, predefined_simulator): + + var_sync_time, vars, resource = predefined_simulator.predefined_sync_time(strategy, resource_spec) + + features = [] + for var_name, sync_time in var_sync_time.items(): + if isinstance(sync_time, list) or isinstance(sync_time, tuple): # send_time, receive_time in PS strategies. 
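+ # Descriptive note (added): PS sync times arrive as a (send_time, receive_time) pair, AR as a
+ # single dict; the branch below sums both transmission legs for PS and flags is_ps accordingly.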
+ transmission = sync_time[0]['transmission'] + sync_time[1]['transmission'] + sync_time = sync_time[0] + is_ps = True + else: # AR + transmission = sync_time['transmission'] + is_ps = False + + network_overhead = sync_time['network_overhead'] + gpu_kernel_memory_latency = sync_time['gpu_kernel_memory_latency'] + + feat = [transmission, network_overhead, gpu_kernel_memory_latency, float(is_ps)] + features.append(feat) + features = np.array(features, dtype=np.float) + return features + +def extract_graph_item(graph_item): + total_size_vars = 0 + name2var = {} + name2var_helper = {} + for var_op, var in graph_item.trainable_var_op_to_var.items(): + name2var[var.name] = var + var_helper = VariableHelper(var, graph_item) + name2var_helper[var.name] = var_helper + if var.initial_value.shape.ndims: + if var_helper.is_sparse: + total_size_vars += get_sparse_var_bits(np.prod(var_helper.shape)) + else: + total_size_vars += get_dense_var_bits(np.prod(var_helper.shape), var.dtype) + + return total_size_vars, name2var, name2var_helper + +def wrap_fn(queue, idx, run_worker, rs, st): + ret = run_worker(rs, st) + queue.put((idx, ret)) + +def convert_feature_batch(strategys, resource_specs, total_size_vars, name2var, name2var_helper, _batch_size_per_gpu, _seq_len): + + def var_ps_time(var_size_to_transfer, is_sparse, device, dtype, local_replication, network_bandwidth_map, max_num_local_replica, cpu_worker_list, gpu_worker_list, network_overhead=0.0, gpu_kernel_memory_latency=0.0): + """Compute synchronization time of a variable in PS strategy.""" + def _helper(worker_list, worker_num_replicas=None): + if worker_num_replicas is None: + worker_num_replicas = [1.0] * len(worker_list) + + this_server_time = 0 + # network transfer: sum up all workers time. equals to the time cost of this server. + # TODO(Hao): didn't consider any parallelization among partitions + for k, worker in enumerate(worker_list): + if _resolved_devices_on_diff_machine(device, worker): + if is_sparse: + this_worker_size = get_sparse_var_bits(var_size_to_transfer) * worker_num_replicas[k] + else: + this_worker_size = get_dense_var_bits(var_size_to_transfer, dtype) + this_server_time += this_worker_size / network_bandwidth_map[device][worker] + + return { + 'transmission': this_server_time, + 'network_overhead': len(worker_list), + 'gpu_kernel_memory_latency': max_num_local_replica, + } + + send_time = _helper(cpu_worker_list) + if local_replication: + receive_time = _helper(cpu_worker_list) + else: + receive_time = _helper(gpu_worker_list) + + return send_time, receive_time + + def var_ar_time(var_size_to_transfer, og_shape, dtype, compressor, max_num_local_replica, cpu_worker_list, network_bandwidth_map, network_overhead=0.0, gpu_kernel_memory_latency=0.0): + """Compute synchronization time of a variable in AR strategy.""" + worker_list = cpu_worker_list + num_workers = len(worker_list) + min_bandwidth = None + for i in range(num_workers): + for j in range(i, num_workers): + if min_bandwidth is None: + min_bandwidth = network_bandwidth_map[worker_list[j]][worker_list[i]] + else: + min_bandwidth = min(min_bandwidth, network_bandwidth_map[worker_list[j]][worker_list[i]]) + + # Compressor + if compressor == "PowerSGDCompressor" or compressor == 3: + rank = 10 # currently using default value. So hardcode here. # todo: confirm + # assume var must be a dense variable. 
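+ # Descriptive note (added): PowerSGD transmits two low-rank factors instead of the full (n, m)
+ # gradient, so the size estimate below is n*rank + m*rank float32 values; tensors with more than
+ # two dims are first flattened to (n, m), and vectors/scalars are sent uncompressed.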
+ ndims = len(og_shape) + if ndims <= 1: # no compress + size_to_transfer = var_size_to_transfer + else: + if ndims > 2: + n = og_shape[0] + m = 1 + for s in og_shape[1:]: + m *= s # tensor's shape (n, m) + else: + n, m = og_shape[0], og_shape[1] + size_to_transfer = n * rank + m * rank + dtype = tf.float32 + elif compressor == "HorovodCompressorEF" or compressor == "HorovodCompressor" \ + or compressor == 2 or compressor == 1: + size_to_transfer = var_size_to_transfer + dtype = tf.float32 + elif compressor == "NoneCompressor" or compressor == 0: + size_to_transfer = var_size_to_transfer + dtype = dtype + else: + raise ValueError('Compressor does not exist: {}'.format(compressor)) + + time = get_dense_var_bits(size_to_transfer, dtype) / min_bandwidth + + return { + 'transmission': time, + 'network_overhead': 1, # len(worker_list), + 'gpu_kernel_memory_latency': max_num_local_replica, + } + + def network_bandwidth2(resource_spec: ResourceSpec, device_resolver: DeviceResolver): + """Calculates all P2P network bandwidths between nodes in the cluster.""" + devices = [device for device, _ in resource_spec.devices] + resolved_devices = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.devices] + gpu_cpu_bw = 10000. # hardcode for now + network_bandwidth = {} # key: + for i in range(len(devices)): + if resolved_devices[i] not in network_bandwidth: + network_bandwidth[resolved_devices[i]] = {} + for j in range(i, len(devices)): + if resolved_devices[j] not in network_bandwidth: + network_bandwidth[resolved_devices[j]] = {} + ip_i = devices[i].split(':')[0] + ip_j = devices[j].split(':')[0] + if ip_i != ip_j: + network_bandwidth[resolved_devices[i]][resolved_devices[j]] \ + = GIGABITS * resource_spec.network_bandwidth[ip_i] + network_bandwidth[resolved_devices[j]][resolved_devices[i]] \ + = GIGABITS * resource_spec.network_bandwidth[ip_j] + else: + network_bandwidth[resolved_devices[i]][resolved_devices[j]] = GIGABITS * gpu_cpu_bw + network_bandwidth[resolved_devices[j]][resolved_devices[i]] = GIGABITS * gpu_cpu_bw + return network_bandwidth + + def run_worker(resource_spec, strategy): + cluster = SSHCluster(resource_spec) + device_resolver = DeviceResolver(cluster) + graph_replicas = [_resolve_device_address(k, device_resolver) for k, v in resource_spec.gpu_devices] + # bandwidth + network_bandwidth = np.array([resource_spec.network_bandwidth[device.split(':')[0]] for device, _ in resource_spec.cpu_devices]) + min_network_bandwidth = network_bandwidth.min() + network_bandwidth_map = network_bandwidth2(resource_spec, device_resolver) + # Other information + cpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.cpu_devices] + gpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.gpu_devices] + max_num_local_replica = _max_num_local_replica(graph_replicas, cluster) + total_num_local_replica = len(graph_replicas) + worker_num_replicas = [_num_local_replica(cpu_worker, graph_replicas, cluster) for cpu_worker in cpu_worker_list] + + var_partition_features = np.zeros((MAX_NUM_PARS, FEATURE_SIZE)).astype(np.float32) + partition_indice = np.ones(MAX_NUM_PARS).astype(np.float32) * (MAX_NUM_VARS - 1) + cnt = 0 + for node_id, node in enumerate(strategy.node_config): + var_name = node.var_name + var = name2var[var_name] + var_helper = name2var_helper[var_name] + + if node.partitioner: + pc = PartitionerConfig(partition_str=node.partitioner) + for i, part in enumerate(node.part_config): + 
synchronizer = getattr(part, part.WhichOneof('synchronizer')) + reduction_destination = getattr(synchronizer, 'reduction_destination', None) + device = _resolve_device_address(reduction_destination if reduction_destination else var.device, + device_resolver) + if device == '': + assert(isinstance(synchronizer, AllReduceSynchronizer)) + device_id = None + bd = min_network_bandwidth + num_replicas = 0 + else: + device_id = cpu_worker_list.index(device) + bd = network_bandwidth[device_id] + num_replicas = worker_num_replicas[device_id] + + par_shape = var.initial_value.shape.as_list() + dim_size = par_shape[pc.axis] // pc.num_shards + extras = par_shape[pc.axis] % pc.num_shards + if i < extras: + dim_size += 1 + par_shape[pc.axis] = dim_size + + size_to_transfer =np.prod(par_shape) + if var_helper.is_sparse: + raise Error + size_ratio = get_sparse_var_bits(size_to_transfer)/total_size_vars + else: + size_ratio = get_dense_var_bits(size_to_transfer, var_helper.dtype)/total_size_vars + + if isinstance(synchronizer, AllReduceSynchronizer): + sync_time = var_ar_time(size_to_transfer, par_shape, var_helper.dtype, getattr(synchronizer, 'compressor', None), max_num_local_replica, cpu_worker_list, network_bandwidth_map) + transmission = sync_time['transmission'] + is_ps = False + else: + sync_time = var_ps_time(size_to_transfer, var_helper.is_sparse, device, var_helper.dtype, getattr(synchronizer, 'local_replication', None), network_bandwidth_map, max_num_local_replica, cpu_worker_list, gpu_worker_list) + transmission = sync_time[0]['transmission'] + sync_time[1]['transmission'] + sync_time = sync_time[0] + is_ps = True + network_overhead = sync_time['network_overhead'] + gpu_kernel_memory_latency = sync_time['gpu_kernel_memory_latency'] + + var_partition_features[cnt] = np.concatenate([to_numpy(synchronizer, device_id, size_ratio, var_helper.is_sparse, bd, num_replicas), np.array([transmission, network_overhead, gpu_kernel_memory_latency, float(is_ps)])]) + partition_indice[cnt] = node_id + cnt += 1 + else: + synchronizer = getattr(node, node.WhichOneof('synchronizer')) + reduction_destination = getattr(synchronizer, 'reduction_destination', None) + device = _resolve_device_address(reduction_destination if reduction_destination else var.device, + device_resolver) + if device == '': + assert(isinstance(synchronizer, AllReduceSynchronizer)) + device_id = None + bd = min_network_bandwidth + num_replicas = 0 + else: + device_id = cpu_worker_list.index(device) + bd = network_bandwidth[device_id] + num_replicas = worker_num_replicas[device_id] + + size_to_transfer =np.prod(var_helper.shape) + if var_helper.is_sparse: + raise Error + size_ratio = get_sparse_var_bits(size_to_transfer)/total_size_vars + else: + size_ratio = get_dense_var_bits(size_to_transfer, var_helper.dtype)/total_size_vars + + if isinstance(synchronizer, AllReduceSynchronizer): + sync_time = var_ar_time(size_to_transfer, var.initial_value.shape.as_list(), var_helper.dtype, getattr(synchronizer, 'compressor', None), max_num_local_replica, cpu_worker_list, network_bandwidth_map) + transmission = sync_time['transmission'] + is_ps = False + else: + sync_time = var_ps_time(size_to_transfer, var_helper.is_sparse, device, var_helper.dtype, getattr(synchronizer, 'local_replication', None), network_bandwidth_map, max_num_local_replica, cpu_worker_list, gpu_worker_list) + transmission = sync_time[0]['transmission'] + sync_time[1]['transmission'] + sync_time = sync_time[0] + is_ps = True + network_overhead = sync_time['network_overhead'] + 
gpu_kernel_memory_latency = sync_time['gpu_kernel_memory_latency'] + + var_partition_features[cnt] = np.concatenate([to_numpy(synchronizer, device_id, size_ratio, var_helper.is_sparse, bd, num_replicas), np.array([transmission, network_overhead, gpu_kernel_memory_latency, float(is_ps)])]) + partition_indice[cnt] = node_id + cnt += 1 + return (var_partition_features, partition_indice, np.array(node_id+1)) + + # t1 =time.time() + # with multiprocessing.Pool(processes=32) as pool: + # results = pool.starmap(run_worker, zip(resource_specs, strategys)) + # ret1, ret2, ret3 = [], [], [] + # for tmp in results: + # ret1.append(tmp[0]); ret2.append(tmp[1]); ret3.append(tmp[2]) + + q = Queue() + rets = [] + prs = [] + for idx, (arg1, arg2) in enumerate(zip(resource_specs, strategys)): + prs.append(Process(target=wrap_fn, args=(q, idx, run_worker, arg1, arg2))) + prs[-1].start() + for pr in prs: + ret = q.get() # will block + rets.append(ret) + for pr in prs: + pr.join() + + ret1, ret2, ret3 = [], [], [] + for tmp in sorted(rets, key=lambda x: x[0]): + ret1.append(tmp[1][0]); ret2.append(tmp[1][1]); ret3.append(tmp[1][2]) + # print(time.time() - t1) + + # t1 =time.time() + # ret1, ret2, ret3 = [], [], [] + # for rs, st in zip(resource_specs, strategys): + # tmp = run_worker(rs, st) + # ret1.append(tmp[0]); ret2.append(tmp[1]); ret3.append(tmp[2]) + # print(time.time() - t1) + return np.stack(ret1), np.stack(ret2), np.stack(ret3) + + +class RankRNNSimulatorPenalty(SimulatorBase): + """Simulates strategies for a given graph and resource spec.""" + + def __init__(self, + original_graph_item_path, + num_rnn_layers, + in_layers, + out_layers, + fetches=None, + batch_size=1, + seq_len=1, + checkpoint=None): + + super(RankRNNSimulatorPenalty, self).__init__(original_graph_item_path=original_graph_item_path) + print("It's using RankNet simulator.") + self._fetches = fetches + self._batch_size_per_gpu = batch_size + self._seq_len = seq_len + self._checkpoint = checkpoint + self._predefined_simulator=PredefinedSimulator(original_graph_item_path=original_graph_item_path, + batch_size=self._batch_size_per_gpu, + seq_len=self._seq_len) + if self._checkpoint: + self._model = RankRNN(num_rnn_layers=num_rnn_layers, in_layers=in_layers, out_layers=out_layers).to(TORCH_DEVICE) + self._model.load_state_dict(torch.load(self._checkpoint, map_location=torch.device('cpu'))) + + total_size_vars, name2var, name2var_helper = extract_graph_item(self._original_graph_item) + self.total_size_vars = total_size_vars + self.name2var = name2var + self.name2var_helper = name2var_helper + + def simulate(self, strategy, resource_spec, strategy_path=None, checkpoint=None): + score, feature = self.predict(strategy, resource_spec, strategy_path, checkpoint) + return score.view(-1).data.cpu().numpy(), feature.data.cpu().numpy() + + def predict(self, + strategy, + resource_spec, + strategy_path=None, + checkpoint=None): + if checkpoint is None: + if self._checkpoint is None: + raise ValueError("checkpoint is None: {}".format(checkpoint)) + else: + model = self._model + else: + model = RankRNN().to(TORCH_DEVICE) + model.load_state_dict(torch.load(checkpoint)) + if type(strategy) == list and type(resource_spec) == list: + + var_partition_features, partition_indice, var_num = convert_feature_batch(strategy, resource_spec, self.total_size_vars, self.name2var, self.name2var_helper, self._batch_size_per_gpu, self._seq_len) + + var_partition_features = torch.from_numpy(var_partition_features).to(TORCH_DEVICE) + partition_indice = 
torch.from_numpy(partition_indice).to(TORCH_DEVICE) + var_num = torch.from_numpy(var_num).to(TORCH_DEVICE) + + return model(var_partition_features, partition_indice, var_num, True) + else: + if strategy_path and os.path.isfile((strategy_path+'.npz').replace('strategies', 'npz')): + loaded = np.load((strategy_path+'.npz').replace('strategies', 'npz')) + var_partition_features, partition_indice, var_num, _ = \ + loaded['x1'], loaded['x2'], loaded['x3'], loaded['y'] + else: + var_partition_features, partition_indice, var_num = \ + connvert_feature(strategy, resource_spec, self._original_graph_item) + + if strategy_path and os.path.isfile((strategy_path+'_pdf.npz').replace('strategies', 'npz')): + loaded = np.load((strategy_path+'_pdf.npz').replace('strategies', 'npz')) + predefined_features = loaded['x4'] + else: + predefined_features = create_predefined_features(strategy, resource_spec, self._predefined_simulator) + + var_partition_features = np.concatenate([var_partition_features, np.concatenate([predefined_features, np.zeros((MAX_NUM_PARS-predefined_features.shape[0], predefined_features.shape[1]))], 0)], 1) + + var_partition_features = torch.from_numpy(var_partition_features).unsqueeze(0).to(TORCH_DEVICE) + partition_indice = torch.from_numpy(partition_indice).unsqueeze(0).to(TORCH_DEVICE) + var_num = torch.from_numpy(var_num).unsqueeze(0).to(TORCH_DEVICE) + + return model(var_partition_features, partition_indice, var_num, True) + +class RankNetTrainer(): + + def __init__(self, + batch_size_per_gpu=256, + seq_len=1, + seed=1): + self._batch_size_per_gpu = batch_size_per_gpu + self._seq_len = seq_len + self.graph_items = {k:GraphItem.deserialize(v) for k, v in GRAPH_ITEM_PATHS.items()} + self.predefined_simulators = {k: PredefinedSimulator(original_graph_item_path=v, + batch_size=self._batch_size_per_gpu, + seq_len=self._seq_len) for k, v in GRAPH_ITEM_PATHS.items()} + self.best_acc = 0. 
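+ # Descriptive note (added): best_acc tracks the best validation accuracy seen across all train()
+ # calls in a hyper-parameter sweep; train() only writes a checkpoint when this value improves.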
+ print("It's using RankNet trainer.") + + def load_data(self, path_list, train_patterns=[('ncf', 0)], valid_patterns='same'): + features = {k: [[[], [], [], []], [[], [], [], []]] for k, _ in GRAPH_ITEM_PATHS.items()} + for training_path in path_list: + for path in Path(training_path).rglob('strategies'): + strategy_paths = glob.glob(os.path.join(path, '*')) + # strategy_paths = np.random.permutation(list(strategy_paths)) + for strategy_path in strategy_paths: + if 'json' in strategy_path or \ + 'bert_large_batch_8_orca_16_group_2/' in strategy_path: + continue + model = get_model(strategy_path) + if model is None: + if not ('densenets169' in strategy_path or 'densenets201' in strategy_path): + assert False, strategy_path + continue + rs_path = strategy_path.replace('strategies', 'resource_specs') + runtime_path = strategy_path.replace('strategies', 'runtimes') + npz_path = (strategy_path+'.npz').replace('strategies', 'npz') + if not os.path.isfile(rs_path): + rs_path += '.yml' + if not (os.path.isfile(rs_path) and os.path.isfile(runtime_path)): + continue + if not os.path.exists(os.path.dirname(npz_path)): + os.makedirs(os.path.dirname(npz_path)) + + if not os.path.isfile(npz_path): + strategy = Strategy.deserialize(path=strategy_path) + rs = ResourceSpec(resource_file=rs_path) + var_partition_features, partition_indice, var_num = \ + connvert_feature(strategy, rs, self.graph_items[model]) + label = np.array(json.load(open(runtime_path))['average']) + np.savez_compressed(npz_path, x1=var_partition_features, x2=partition_indice, x3=var_num, y=label) + else: + loaded = np.load(npz_path) + var_partition_features, partition_indice, var_num, label = \ + loaded['x1'], loaded['x2'], loaded['x3'], loaded['y'] + + if not os.path.isfile(npz_path.replace('.npz', '_pdf.npz')): + predefined_features = create_predefined_features(Strategy.deserialize(path=strategy_path), ResourceSpec(resource_file=rs_path), self.predefined_simulators[model]) + np.savez_compressed(npz_path.replace('.npz', '_pdf.npz'), x4=predefined_features) + else: + loaded = np.load(npz_path.replace('.npz', '_pdf.npz')) + predefined_features = loaded['x4'] + var_partition_features = np.concatenate([var_partition_features, np.concatenate([predefined_features, np.zeros((MAX_NUM_PARS-predefined_features.shape[0], predefined_features.shape[1]))], 0)], 1) + + is_aws = int('g3' in strategy_path or 'g4' in strategy_path or 'aws' in strategy_path) # comment here + # is_aws = int('vgg16_orca_11_random_rejection-4_trial-100-_expolre-2000_0.83-model_embedding_sim-weight-1_max-par-40/' in strategy_path) + # print(model, 'orca' if is_aws == 0 else 'aws', strategy_path.split('/')[-3]) + features[model][is_aws][0].append(var_partition_features) + features[model][is_aws][1].append(partition_indice) + features[model][is_aws][2].append(var_num) + features[model][is_aws][3].append(label) + + for k, _ in GRAPH_ITEM_PATHS.items(): + for i1 in range(2): + for i2 in range(4): + if len(features[k][i1][i2]) > 1: + features[k][i1][i2] = np.stack(features[k][i1][i2]).astype(np.float16) + print(k, 'orca' if i1 == 0 else 'aws', features[k][i1][i2].shape) + else: + features[k][i1][i2] = None + + train_features = np.concatenate([features[model_][is_aws_][0] for model_, is_aws_ in train_patterns if features[model_][is_aws_][0] is not None], 0) + train_par_indices = np.concatenate([features[model_][is_aws_][1] for model_, is_aws_ in train_patterns if features[model_][is_aws_][1] is not None], 0) + train_var_nums = np.concatenate([features[model_][is_aws_][2] for 
model_, is_aws_ in train_patterns if features[model_][is_aws_][2] is not None], 0) + train_labels = np.concatenate([features[model_][is_aws_][3] for model_, is_aws_ in train_patterns if features[model_][is_aws_][3] is not None], 0) + + if type(valid_patterns[0]) == str and valid_patterns[0] == 'same': + rng = np.random.RandomState(1) + permt = rng.permutation(train_features.shape[0]) + split = int(len(permt) * 0.7) + val_features, val_par_indices, val_var_nums, val_labels = train_features[permt[split:]], train_par_indices[permt[split:]], train_var_nums[permt[split:]], train_labels[permt[split:]] + train_features, train_par_indices, train_var_nums, train_labels = train_features[permt[:split]], train_par_indices[permt[:split]], train_var_nums[permt[:split]], train_labels[permt[:split]] + else: + val_features = np.concatenate([features[model_][is_aws_][0] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][0] is not None], 0) + val_par_indices = np.concatenate([features[model_][is_aws_][1] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][1] is not None], 0) + val_var_nums = np.concatenate([features[model_][is_aws_][2] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][2] is not None], 0) + val_labels = np.concatenate([features[model_][is_aws_][3] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][3] is not None], 0) + + # comment here + rng = np.random.RandomState(1) + permt = rng.permutation(val_features.shape[0]) + split = int(len(permt) * 0.7) + train_features, train_par_indices, train_var_nums, train_labels = np.concatenate([train_features, val_features[permt[:split]]], 0), np.concatenate([train_par_indices, val_par_indices[permt[:split]]], 0), np.concatenate([train_var_nums, val_var_nums[permt[:split]]], 0), np.concatenate([train_labels, val_labels[permt[:split]]], 0) + + val_features, val_par_indices, val_var_nums, val_labels = val_features[permt[split:]], val_par_indices[permt[split:]], val_var_nums[permt[split:]], val_labels[permt[split:]] + label_max = max(train_labels.max(), val_labels.max()) + label_min = min(train_labels.min(), val_labels.min()) + train_labels = (train_labels-label_min)/(label_max-label_min) + val_labels = (val_labels-label_min)/(label_max-label_min) + print(train_features.shape, val_features.shape, train_features.max(), train_features.min(), val_features.max(), val_features.min(), train_labels.max(), val_labels.min()) + + ## train the model + trainset = TrainTensorDataset((torch.from_numpy(train_features).half().to(TORCH_DEVICE), torch.from_numpy(train_par_indices).half().to(TORCH_DEVICE), torch.from_numpy(train_var_nums).half().to(TORCH_DEVICE), torch.from_numpy(train_labels).half().to(TORCH_DEVICE))) + testset = torch.utils.data.TensorDataset(torch.from_numpy(val_features).half().to(TORCH_DEVICE), torch.from_numpy(val_par_indices).half().to(TORCH_DEVICE), torch.from_numpy(val_var_nums).half().to(TORCH_DEVICE), torch.from_numpy(val_labels).half().to(TORCH_DEVICE)) + self.trainloader = torch.utils.data.DataLoader(dataset=trainset, + batch_size=BATCH_SIZE, + shuffle=True) + self.testloader = torch.utils.data.DataLoader(dataset=testset, + batch_size=32, + shuffle=False) + + def train(self, name='', num_epochs=200, checkpoint=None): + + checkpoint_path = 'model_on_{}.ckpt'.format(name) + print('LSTM layers: ', NUM_RNN_LAYERS, 'score th: ', SCORE_TH, 'lr: ', LR, 'wd: ', WD,'use data aug: ', DATA_AUG, 'OUT_LAYERS: ', OUT_LAYERS, 'IN_LAYERS: ',IN_LAYERS) + + np.random.seed(1) + torch.manual_seed(1) + 
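+        # The loop below optimizes a RankNet-style pairwise objective: within each mini-batch every
+        # pair of strategies is compared, and softplus(-(s_i - s_j)) penalizes pairs whose predicted
+        # ordering disagrees with the measured runtimes. When two runtimes are within SCORE_TH of
+        # each other, the strategy with more partitions is treated as the slower one -- the
+        # "penalty" that discourages excessive partitioning.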
torch.cuda.manual_seed_all(1) + model = RankRNN(num_rnn_layers=NUM_RNN_LAYERS, out_layers=OUT_LAYERS, in_layers=IN_LAYERS).to(TORCH_DEVICE) + if checkpoint: + model.load_state_dict(torch.load(checkpoint)) + optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=WD) + + best_val_acc = 0. + for epoch in range(num_epochs): + if epoch == int(num_epochs*2./5. - 1): + for param_group in optimizer.param_groups: param_group['lr'] = 3e-4 + if epoch == int(num_epochs*4./5. - 1): + for param_group in optimizer.param_groups: param_group['lr'] = 1e-4 + + labels = [] + outputs = [] + for i, (features_b, par_indices_b, var_nums_b, labels_b) in enumerate(self.trainloader): + + # Forward pass + outputs_b = model(features_b, par_indices_b, var_nums_b).squeeze() + + par_cnt = (par_indices_b.int() != MAX_NUM_VARS - 1).int().sum(1) + + true_comp = ( + (labels_b[:, None]+SCORE_TH>labels_b[None,:]).int()*(par_cnt[:, None] > par_cnt[None, :]).int() + + (labels_b[:, None]-SCORE_TH>labels_b[None,:]).int()*(par_cnt[:, None] < par_cnt[None, :]).int() + + (labels_b[:, None] > labels_b[None,:]).int() * (par_cnt[:, None] == par_cnt[None, :]).int() + ) > 0 + true_comp = true_comp.float() * 2 - 1 + pred_comp = outputs_b[:, None] - outputs_b[None, :] + loss = (1 - true_comp) * pred_comp / 2 + torch.nn.functional.softplus(-pred_comp) + loss = loss.tril(-1).mean() + + # Backward and optimize + optimizer.zero_grad() + loss.backward() + torch.nn.utils.clip_grad_norm_(model.stem_rnn.parameters(), 0.25) + optimizer.step() + + outputs.append(outputs_b) + labels.append(labels_b) + + labels = torch.cat(labels) + outputs = torch.cat(outputs) + true_comp = (labels[:, None] > labels[None, :]) + pred_comp = (outputs[:, None] > outputs[None, :]) + equal = (true_comp == pred_comp).int() + train_acc = equal.tril(-1).sum() * 2. /float(equal.shape[0])/(float(equal.shape[0]) - 1) + + with torch.no_grad(): + labels = [] + outputs = [] + for features_b, par_indices_b, var_nums_b, labels_b in self.testloader: + + # Forward pass + outputs_b = model(features_b, par_indices_b, var_nums_b).squeeze() + outputs.append(outputs_b) + labels.append(labels_b) + + labels = torch.cat(labels) + outputs = torch.cat(outputs) + true_comp = (labels[:, None] > labels[None, :]) + pred_comp = (outputs[:, None] > outputs[None, :]) + equal = (true_comp == pred_comp).int() + acc = equal.tril(-1).sum() * 2. 
/float(equal.shape[0])/(float(equal.shape[0]) - 1) + if acc.item() > best_val_acc: + best_val_acc = acc.item() + if best_val_acc > self.best_acc: + print('Saved model @ acc', best_val_acc) + torch.save(model.state_dict(), checkpoint_path) + self.best_acc = best_val_acc + # print('Saved model to {}'.format(checkpoint_path)) + if epoch == num_epochs - 1: + print('Epoch: {}, training acc: {:.4f}, test acc: {:.4f}, best acc: {:.4f}, overall best acc: {:.4f}'.format(epoch, train_acc.item(), acc.item(), best_val_acc, self.best_acc)) + return checkpoint_path + + +if __name__ == '__main__': + + if False: + trainer = RankNetTrainer() + trainer.load_data([ + '/users/hzhang2/oceanus_cost_model_training_data/vgg16', + # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-11-20', + # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-9-20', + # '/users/hzhang2/oceanus_cost_model_training_data/ncf', + # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-13-20', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_12_orca_16', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_3', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_4', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_5', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_random_orca_11', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert-aws/bert-large-aws4g4', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert-aws/bert_large_random_search_aws_4_ps_only', + # '/users/hzhang2/oceanus_cost_model_training_data/densenet', + # '/users/hzhang2/oceanus_cost_model_training_data/inceptionv3', + # '/users/hzhang2/oceanus_cost_model_training_data/resnet101', + # '/users/hzhang2/oceanus_cost_model_training_data/resnet50', + ], + [ + ('vgg16', 1), #('vgg16', 1), + # ('ncf', 0), #('ncf', 1), + # ('bert_large', 1), #('bert_large', 1), + # not used: + # ('densenet121', 0), ('densenet121', 1), + # ('inceptionv3', 0), ('inceptionv3', 1), + # ('resnet101', 0), ('resnet101', 1), + # ('resnet50', 0), ('resnet50', 1), + # ('bert_12l', 0), ('bert_12l', 1), + # ('bert_6l', 0), ('bert_6l', 1), + # ('bert_3l', 0), ('bert_3l', 1), + ], + [ + # ('vgg16', 1), + # ('ncf', 1), + # ('bert_large', 1), + 'same', + ], + ) + + for p2 in [0.01, 0.03]: + for p3 in [1e-3, 3e-3, 1e-4, 3e-4, 5e-3]: + for p4 in [1e-3, 1e-4, 3e-4, 5e-4, 5e-5, 2e-3, ]: + for p1 in [3, 4, 2]: + for p5 in [2, 3]: + for p6 in [1, 2]: + NUM_RNN_LAYERS, SCORE_TH, LR, WD, IN_LAYERS, OUT_LAYERS = p1, p2, p3, p4, p5, p6 + checkpoint_path = trainer.train(name='vgg-aws-new-2', num_epochs=200) + exit() + else: + checkpoint_path = '/users/hzhang2/projects/pycharm/zhijie/5-9-2020/oceanus-zhijie/arion/simulator/models/model_on_bert-aws-only.ckpt' + test_list = [ + '/users/hzhang2/oceanus_cost_model_training_data/bert/bert-aws/bert_large_random_search_aws_4_ps_only', + # '/users/hzhang2/oceanus_cost_model_training_data/vgg16/vgg16_orca_15', + # '/users/hzhang2/oceanus_cost_model_training_data/vgg16/vgg_random_orca_11', #TARGET: 0.9 + # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-13-20/ncf_large_adam_random_search_aws_4', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_12_orca_16', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16', + # 
'/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_3', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_4', + # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_5', + ] + + for data_folder in test_list: + simulator = RankRNNSimulatorPenalty3(GRAPH_ITEM_PATHS[get_model(data_folder)], + 4, + 2, + 1, + batch_size=256, + seq_len=1, + checkpoint=checkpoint_path) + + runtimes_folder = os.path.join(data_folder, 'runtimes') + results = {} + averages= [] + scores = [] + strategys = [] + rss = [] + strategy_paths = [] + for name in os.listdir(runtimes_folder): + strategy_path = os.path.join(data_folder, 'strategies', name) + rs_path = os.path.join(data_folder, 'resource_specs', name ) + + if not os.path.isfile(rs_path): + rs_path += '.yml' + runtime_path = os.path.join(runtimes_folder, name) + + strategy_paths.append(strategy_path) + + with open(runtime_path, 'r') as f: + runtimes = json.load(f) + average = np.array(runtimes['average']) + + s = Strategy.deserialize(strategy_path) + rs = ResourceSpec(resource_file=rs_path) + strategys.append(s) + rss.append(rs) + + averages.append(average) + + # for tmp1, tmp2, tmp3 in zip(strategys, rss, strategy_paths): + # scores.append(simulator.simulate(tmp1, tmp2, tmp3)[0]) + # print(np.stack(scores).reshape(-1)) + + scores = simulator.simulate(strategys, rss)[0] + print(scores) + + # sorted_by_runtime = {k: v for k, v in sorted(results.items(), key=lambda item: item[1][0])} + # # sorted_by_scores = {k: v for k, v in sorted(res.items(), key=lambda item: item[1][1])} + # # sorted_by_latency = {k: v for k, v in sorted(res.items(), key=lambda item: item[1][2])} + # print('Sorted by runtime.......................') + # for _, (rt, prediction) in sorted_by_runtime.items(): + # print('runtime {} prediction {}'.format(rt, prediction)) + + y_train = np.array(averages) + test_score = np.array(scores) + true_comp = (y_train.ravel()[:, None] > y_train.ravel()[None, :]) + pred_comp = (test_score.ravel()[:, None] > test_score.ravel()[None, :]) + equal = (true_comp == pred_comp).astype(np.int) + test_acc = np.tril(equal, -1).sum() * 2. 
/ float(equal.shape[0]) / (float(equal.shape[0]) - 1) + + print('Test {} on {}, acc {:.4f}'.format(checkpoint_path, data_folder.split('/')[-1], test_acc)) diff --git a/autodist/simulator/test.py b/autodist/simulator/test.py new file mode 100644 index 0000000..b481208 --- /dev/null +++ b/autodist/simulator/test.py @@ -0,0 +1,17 @@ +from arion.simulator.simulator import Simulator +from arion.strategy import base +from arion.graph_item import GraphItem + +resource_spec_file = '/home/hao.zhang/project/pycharm/ncf-trial/official/recommendation/trial/trialrun_resource_specs/resource_spec_2.yml' +strategy_path = '/home/hao.zhang/oceanus_simulator/ncf_3/strategies/20200505T174311M650364' +original_graph_item_path = '/home/hao.zhang/oceanus_simulator/ncf/strategies/original_graph_item' + +s = base.Strategy.deserialize(strategy_path) + + +simulator = Simulator(resource_file=resource_spec_file, + original_graph_item_path=original_graph_item_path) + +ret = simulator.simulate(s) + +print('finished') diff --git a/autodist/simulator/train_linear.py b/autodist/simulator/train_linear.py new file mode 100644 index 0000000..c7e9438 --- /dev/null +++ b/autodist/simulator/train_linear.py @@ -0,0 +1,123 @@ +import os +import glob +import json +import numpy as np +from collections import OrderedDict +from os.path import expanduser +from sklearn import linear_model +from sklearn.linear_model import Ridge +from arion.simulator.utils import split_dataset + +def create_features(simulation): + runtime_coefficients = simulation['runtime_coefficients'] + var_sync_time = simulation['var_sync_time'] # dict: + + res = OrderedDict({ + 'network_overhead': 0.0, + 'gpu_kenrel_memory_latency': 0.0, + 'constant_factor': 0.0, + 'allreduce_factor': 0.0, + }) + for var_name, sim_time in var_sync_time.items(): + if isinstance(sim_time, list): + # PS strategies + send_time, receive_time = sim_time + res['constant_factor'] += send_time['transmission'] + receive_time['transmission'] + res['network_overhead'] += send_time['network_overhead'] + receive_time['network_overhead'] + res['gpu_kenrel_memory_latency'] += send_time['gpu_kenrel_memory_latency'] + receive_time['gpu_kenrel_memory_latency'] + elif isinstance(sim_time, dict): + # Allreduce strategy + res['allreduce_factor'] += sim_time['transmission'] + res['network_overhead'] += sim_time['network_overhead'] + res['gpu_kenrel_memory_latency'] += sim_time['gpu_kenrel_memory_latency'] + else: + raise ValueError + + # runtime_coefficients = { + # 'transmission': slowest_server_time, + # 'network_overhead': len(worker_list), + # 'gpu_kenrel_memory_latency': max_num_local_replica, + # 'constant': 1.0, + # # possible affecting factors. 
+ # 'var_name': var_name, + # 'strategy': 'ps', + # 'local_proxy': local_proxy, + # 'is_sparse': is_sparse, + # 'server_list': [partition.to_dict() for partition in server_list], + # 'worker_list': worker_list, + # 'cpu_worker_list': cpu_worker_list, + # 'gpu_worker_list': gpu_worker_list, + # 'worker_num_replicas': worker_num_replicas, + # 'max_num_local_replica': max_num_local_replica, + # } + # runtime_coefficients = [ + # runtime_coefficients['transmission'], + # runtime_coefficients['network_overhead'], + # runtime_coefficients['gpu_kenrel_memory_latency'], + # ] + return list(res.values()) + +def load_trial_run_data(data_dir): + runtimes_folders = glob.glob("{}/**/runtimes".format(data_dir), recursive=True) + X = [] + Y = [] + for runtimes_folder in runtimes_folders: + print(runtimes_folder) + runtimes_files = glob.glob(os.path.join(runtimes_folder, '*')) + for runtimes_file in runtimes_files: + # Target + runtime = json.load(open(runtimes_file, 'r')) + y = runtime['average'] + # Features + simulation_file = '/'.join(runtimes_file.split('/')[:-2]) + '/simulations/' + runtimes_file.split('/')[-1] + assert os.path.isfile(simulation_file), 'simulation_file {} does not exist'.format(simulation_file) + simulation = json.load(open(simulation_file, 'r')) + x = create_features(simulation) + X.append(x) + Y.append(y) + return X, Y + +data_dir = os.path.join(expanduser('~'), 'oceanus_simulator/lm1b-patchon') +X, Y = load_trial_run_data(data_dir) +X_train, Y_train, X_valid, Y_valid = split_dataset(X, Y) +print('X_train', X_train.shape, 'Y_train', Y_train.shape, 'X_valid', X_valid.shape, 'Y_valid', Y_valid.shape) + +# Linear regression +lm = linear_model.LinearRegression() +model = lm.fit(X_train, Y_train) +predictions = lm.predict(X_valid) +print('predictions, targets: ') +pt = zip(predictions, Y_valid) +pt = sorted(pt, key=lambda x: x[1]) +for p, t in pt: + print(p, t) +train_score = lm.score(X_train, Y_train) +valid_score = lm.score(X_valid, Y_valid) +print('Linear train_score', train_score) +print('Linear valid_score', valid_score) + +# Ridge regression +ridge = Ridge(alpha=1.0) +ridge.fit(X_train, Y_train) +predictions = ridge.predict(X_valid) +train_score = ridge.score(X_train, Y_train) +valid_score = ridge.score(X_valid, Y_valid) +print('Ridge train_score', train_score) +print('Ridge valid_score', valid_score) + + +# Lasso +lasso = linear_model.Lasso(alpha=0.1) +lasso.fit(X_train, Y_train) +train_score = lasso.score(X_train, Y_train) +valid_score = lasso.score(X_valid, Y_valid) +print('Lasso train_score', train_score) +print('Lasso valid_score', valid_score) + +# ElasticNet +elastic = linear_model.ElasticNet(random_state=0) +elastic.fit(X_train, Y_train) +train_score = elastic.score(X_train, Y_train) +valid_score = elastic.score(X_valid, Y_valid) +print('ElasticNet train_score', train_score) +print('ElasticNet valid_score', valid_score) diff --git a/autodist/simulator/train_predefined_simulator.py b/autodist/simulator/train_predefined_simulator.py new file mode 100644 index 0000000..43bb08b --- /dev/null +++ b/autodist/simulator/train_predefined_simulator.py @@ -0,0 +1,343 @@ +import sys +import os +import numpy as np +import tensorflow as tf +from os.path import expanduser +import tqdm + +from tensorflow.python.eager import context +import tensorflow_ranking as tfr + +from arion.strategy.base import Strategy +from arion.resource_spec import ResourceSpec +from arion.simulator import utils +from arion.simulator.models.predefined_simulator import PredefinedSimulator +from 
arion.simulator.utils import RankingLossKeys + +class TFRIterator: + def __init__(self, X, Y, list_size, batch_size, split, baseline=0.0, scale=1.0): + assert len(X) > 0, 'data: {}'.format(len(X)) + self.X = X + self.Y = Y + self.list_size = list_size + self.baseline = baseline + self.scale = scale + self.batch_size = batch_size + self.split = split + self.n = len(X) + self.num_examples = self.get_num_examples() + print('Split: {},\tnumber of samples: {},\tnumber of examples: {},\tmin of y: {}'.format( + split, len(X), self.num_examples, self.get_min_y())) + + def get_min_y(self): + return np.min(self.Y) + + def get_num_examples(self): + n_examples = 1 + for i in range(self.list_size): + n_examples *= (len(self.X) -1) + return n_examples + + def get_next(self): + xs = [[] for _ in range(self.list_size)] + ys = [] + for i in range(self.batch_size): + y =[] + for j in range(self.list_size): + ri = np.random.randint(self.n) + rx = self.X[ri] + ry = self.Y[ri] + xs[j].append(np.array(rx, dtype=np.float32)) + y.append(ry) + assert ry * self.scale - self.baseline > 0, '{}, {}, {}'.format(ry, self.scale, self.baseline) + ys.append(y) + xs = [np.array(xx, dtype=np.float32) for xx in xs] + ys = np.array(ys, dtype=np.float32) + if self.split == 'train': # normalize y as its used for loss weights. + ys = (ys * self.scale - self.baseline) + + return xs + [ys] + +model_params = { + 'ncf_large_adam_dense': { + 'model_batch_size': 256, + 'model_seq_len': 1, + 'data_dir': [ + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_orca_16_random_search_ar_only', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf-5-11-20/ncf_large_adam_dense_ar_only_by_chunk', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf-5-11-20/ncf_large_adam_dense_ar_only_christy', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf-5-11-20/ncf_large_adam_dense_ar_only_ordered_balanced', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf-5-11-20/ncf_large_adam_dense_ar_only_ordered_balanced_12_12', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf-5-11-20/ncf_large_adam_dense_ar_only_ordered_balanced_20_50', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf-5-11-20/ncf_large_adam_dense_sorted_christy_ordered_balanced_30_50', + '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf-5-11-20/ncf_large_adam_dense_sorted_christy_ordered_balanced_30_50_2', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_orca_16', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_orca_16_random_search_christy_lb', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_orca_16_random_search_christy_lb_ps_only', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_orca_16_real_random', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_orca_8', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_random_search_orca_4', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_random_search_orca_16', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_random_search_linear_cost_model', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_random_search_linear_cost_model_2', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_random_search_linear_cost_model_2', + # 
'/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf-5-9-20/ncf_large_adam_dense_orca_16_christy_lb_if_partition_lb_linear_cost_ps_only', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf-5-9-20/ncf_large_adam_dense_orca_16_christy_lb_if_partition_lb_num_partition_2_32_linear_cost_ps_only', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf-5-9-20/ncf_large_adam_dense_random_search_christy_lb_ps_only_if_partition_lb_ranknet_simulator_2', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf-5-9-20/ncf_large_adam_dense_random_search_christy_lb_ps_only_ranknet_simulator', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_g3.4.25.1_g3.4.25.2', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_g3.4.25.6_g3.4.25.7_g3.4.25.8_g3.4.25.9', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_g3.4.25.1_g3.4.25.2_g3.4.25.3_g3.4.25.4_3.4.25.6_g3.4.25.7_g3.4.25.8_g3.4.25.9', + # '/home/hao.zhang/oceanus_cost_model_training_data/ncf/ncf_large_adam_dense_g3.4.25.1', + ], + 'original_graph_item_path': '/home/christy.li/oceanus_cost_model_training_data/ncf/original_graph_item', + 'save_dir': os.path.join(expanduser('~'), 'oceanus_cost_model_training_data/ncf/predefined_checkpoints'), + 'save_prefix': 'ckpV1_ncf_large_adam_dense_orca_all', + # 'save_prefix': 'ckpV2_ncf_large_adam_dense_orca', + 'baseline': 0.15, + # 'baseline': 0.0, + 'scale': 0.5, + 'learning_rate': 0.01, + 'list_size': 2, + 'batch_size': 100, + 'ranking_loss_key': 'pairwise_logistic_loss', + 'model_version': 'v1', + # 'model_version': 'v2', + 'do_train': False, + 'do_test': True, + 'checkpoint': '/home/christy.li/oceanus_cost_model_training_data/ncf/predefined_checkpoints/ckpV1_ncf_large_adam_dense_orca_all_600_0.83249_0.84517', + }, + 'bert': { + 'model_batch_size': 32, + 'model_seq_len': 128, + 'data_dir': [ + '/home/christy.li/oceanus_cost_model_training_data/bert/bert_3l_orca_16', + # '/home/christy.li/oceanus_cost_model_training_data/bert/bert_6l_orca_15', + # '/home/christy.li/oceanus_cost_model_training_data/bert/bert_12l_orca_15', + # '/home/christy.li/oceanus_cost_model_training_data/bert/bert.12l_g4.4.50.1_g4.4.50.2', + # '/home/christy.li/oceanus_cost_model_training_data/bert/bert.6l_g4.4.50.1_g4.4.50.2', + ], + 'original_graph_item_path': '/home/hao.zhang/oceanus_cost_model_training_data/bert/bert_original_graph_item_3l', + 'save_dir': '/home/christy.li/oceanus_cost_model_training_data/bert/predefined_checkpoints', + 'save_prefix': 'ckpV1_bert_orca', + 'baseline': 0.04, + 'scale': 0.5, + 'learning_rate': 0.01, + 'list_size': 2, + 'batch_size': 100, + 'ranking_loss_key': 'pairwise_logistic_loss', + 'do_train': False, + 'do_test': True, + 'model_version': 'v1', + # 'model_version': 'v2', + # 'checkpoint': '/home/christy.li/oceanus_cost_model_training_data/ncf/predefined_checkpoints/checkpoint_500', + # 'checkpoint': '/home/christy.li/oceanus_cost_model_training_data/ncf/predefined_checkpoints/ckpV1_ncf_large_adam_dense_orca_16_300_0.90684_0.91947', + # 'checkpoint': '/home/christy.li/oceanus_cost_model_training_data/ncf/predefined_checkpoints/ckpV1_ncf_large_adam_dense_orca_16_600_0.87000_0.71000', + # 'checkpoint': '/home/christy.li/oceanus_cost_model_training_data/ncf/predefined_checkpoints/ckpV1_ncf_large_adam_dense_all_200_0.80568_0.81116', + # 'checkpoint': '/home/christy.li/oceanus_cost_model_training_data/ncf/predefined_checkpoints/ckpV1_ncf_large_adam_dense_orca_200_0.81503_0.82009', + # 'checkpoint': 
'/home/christy.li/oceanus_cost_model_training_data/ncf/predefined_checkpoints/ckpV2_ncf_large_adam_dense_orca_16_600_0.89737_0.92842', + # 'checkpoint': '/home/christy.li/oceanus_cost_model_training_data/ncf/predefined_checkpoints/ckpV2_ncf_large_adam_dense_all_500_0.87666_0.85391', + 'checkpoint': '/home/christy.li/oceanus_cost_model_training_data/bert/predefined_checkpoints/ckpV1_bert_orca_400_0.93600_0.93889', + }, + 'resnet101': { + 'model_batch_size': 32, + 'model_seq_len': 1, + 'baseline': 0.5, + 'scale': 0.5, + 'data_dir': '', + 'learning_rate': 0.01, + 'list_size': 2, + 'batch_size': 100, + 'ranking_loss_key': 'pairwise_logistic_loss', + }, +} + +def main(_): + np.random.seed(110) + + # Hyperparameters + # model_to_simulate = 'bert' + model_to_simulate = 'ncf_large_adam_dense' + data_dir = model_params[model_to_simulate]['data_dir'] + original_graph_item_path = model_params[model_to_simulate]['original_graph_item_path'] + batch_size = model_params[model_to_simulate]['batch_size'] + ranking_loss_key = model_params[model_to_simulate]['ranking_loss_key'] + learning_rate = model_params[model_to_simulate]['learning_rate'] + list_size = model_params[model_to_simulate]['list_size'] + baseline = model_params[model_to_simulate]['baseline'] + scale = model_params[model_to_simulate]['scale'] + save_dir = model_params[model_to_simulate]['save_dir'] + save_prefix = model_params[model_to_simulate]['save_prefix'] + do_train = model_params[model_to_simulate]['do_train'] + do_test = model_params[model_to_simulate]['do_test'] + checkpoint = model_params[model_to_simulate]['checkpoint'] + model_version = model_params[model_to_simulate]['model_version'] + + # Create simulator + simulator = PredefinedSimulator(original_graph_item_path, + batch_size=model_params[model_to_simulate]['model_batch_size'], + seq_len=model_params[model_to_simulate]['model_seq_len']) + + # Create features + strategy_resource_files, Y = utils.laod_from_folders(data_dir) + print("Createing features...") + X = [] + with context.graph_mode(): + for strategy_file, resource_file in tqdm.tqdm(strategy_resource_files): + x = simulator.create_features(Strategy.deserialize(strategy_file), ResourceSpec(resource_file)) + X.append(x) + X = np.array(X, dtype=np.float) + print("Finished createing features.") + + # Create model + hidden_dim = 12 + W = tf.Variable(tf.random.uniform([hidden_dim, 1]), name='W', dtype=tf.float32) + b = tf.Variable(0.0, name='b', dtype=tf.float32) + if model_version == 'v2': + W0 = tf.Variable(tf.random.uniform([hidden_dim, hidden_dim]), name='W0', dtype=tf.float32) + b0 = tf.Variable(0.0, name='b0', dtype=tf.float32) + loss_fn = tfr.losses.make_loss_fn(RankingLossKeys[ranking_loss_key]) + major_version, _, _ = tf.version.VERSION.split('.') + if major_version == '1': + optimizer = tf.train.GradientDescentOptimizer(learning_rate) + else: + optimizer = tf.optimizers.Adam(learning_rate) + + def forward(xs): + rs = [] + for x in xs: + if model_version == 'v2': + x = tf.nn.elu(tf.matmul(x, W0) + b0) + r = tf.matmul(x, W) + b + rs.append(r) + r = tf.concat(rs, axis=1, name='logits') + return r + + @tf.function + def train_steps(inputs_iterator, total_steps): + + def train_step(input): + with tf.GradientTape() as tape: + logits = forward(input[:-1]) + loss = loss_fn(labels=input[-1], logits=logits, features={}) + vs = [W0, b0, W, b] if model_version == 'v2' else [W, b] + gradients = tape.gradient(loss, vs) + train_op = optimizer.apply_gradients(zip(gradients, vs)) + pred = tf.squeeze(tf.argmax(logits, axis=1)) + labels = 
tf.squeeze(tf.argmax(input[-1], axis=1)) + acc = tf.equal(pred, labels) + return loss, acc + + losses = [] + accs = [] + for step in range(total_steps): + l, a = train_step(inputs_iterator.get_next()) + losses.append(l) + accs.append(a) + return losses, accs + + @tf.function + def eval_step(input): + logits = forward(input[:-1]) + preds = tf.squeeze(tf.argmax(logits, axis=1)) + labels = tf.squeeze(tf.argmax(input[-1], axis=1)) + acc = tf.equal(preds, labels) + return acc, labels, preds, input[-1], logits + + def eval_steps(iterator, total_test_steps): + test_acc = [] + test_preds = [] + test_labels = [] + test_logits = [] + test_scores = [] + for step in range(total_test_steps): + acc, labels, preds, scores, logits = eval_step(iterator.get_next()) + test_acc.append(acc) + test_labels.append(labels) + test_preds.append(preds) + test_scores.append(scores) + test_logits.append(logits) + test_acc = tf.concat(test_acc, axis=0) + test_acc = tf.cast(test_acc, tf.float32) + avg_test_acc = tf.math.reduce_mean(test_acc) + test_labels = tf.concat(test_labels, axis=0) + test_preds = tf.concat(test_preds, axis=0) + test_scores = tf.concat(test_scores, axis=0) + test_logits = tf.concat(test_logits, axis=0) + return avg_test_acc, test_acc, test_labels, test_preds, test_scores, test_logits + + if do_train: + train_set, valid_set, test_set = utils.split_dataset([X, Y], shuffle=True, train_ratio=0.7, test_ratio=0.15) + X_train, Y_train = train_set + X_valid, Y_valid = valid_set + X_test, Y_test = test_set + inputs_iterator = TFRIterator(X=X_train, Y=Y_train, list_size=list_size, batch_size=batch_size, split='train', + baseline=baseline, scale=scale) + valid_iterator = TFRIterator(X=X_valid, Y=Y_valid, list_size=list_size, batch_size=batch_size, split='valid') + test_iterator = TFRIterator(X=X_test, Y=Y_test, list_size=list_size, batch_size=batch_size, split='test') + total_train_steps = max(1, min(inputs_iterator.get_num_examples() // batch_size, 100)) + total_valid_steps = max(1, valid_iterator.get_num_examples() // batch_size) + total_test_steps = max(1, test_iterator.get_num_examples() // batch_size) + print("Total train steps per epoch: {}".format(total_train_steps)) + print("Total valid steps per epoch: {}".format(total_valid_steps)) + print("Total test steps: {}".format(total_test_steps)) + EPOCHS = 2000 + eval_every_epochs = 100 + save_every_epochs = 100 + + print("\nTrain model...") + losses = [] + for epoch in range(EPOCHS): + loss, acc = train_steps(inputs_iterator, total_train_steps) + losses.extend(loss) + avgloss = sum(losses) / float(len(losses)) + print('Step: {}, avgloss: {:.5f}'.format(epoch, avgloss)) + if (epoch+1) % eval_every_epochs == 0: + print("\nEvaluate on valid set...") + avg_valid_acc, *_= eval_steps(valid_iterator, total_valid_steps) + print('avg_valid_acc: {}'.format(avg_valid_acc.numpy())) + print("Evaluate on test set...") + avg_test_acc, *_= eval_steps(test_iterator, total_test_steps) + print('avg_test_acc: {}\n'.format(avg_test_acc.numpy())) + print('W', W.numpy()) + print('b', b.numpy()) + + if (epoch+1) % save_every_epochs == 0: + if not os.path.exists(save_dir): + os.mkdir(save_dir) + checkpoint = '{}/{}_{}_{:.5f}_{:.5f}'.format(save_dir, save_prefix, epoch+1, + avg_valid_acc, avg_test_acc) + print("Save to {}".format(checkpoint)) + simulator.save_checkpoint([W0, b0, W, b] if model_version == 'v2' else [W, b], checkpoint) + + elif do_test: + print("Load from {}".format(checkpoint)) + weights = simulator.load_checkpoint(checkpoint) + if model_version == 'v2' and 
len(weights) == 4: + W0, b0, W, b = weights + elif model_version == 'v1' and len(weights) == 2: + W, b = weights + else: + raise ValueError + + test_iterator = TFRIterator(X=X, Y=Y, list_size=list_size, batch_size=batch_size, split='test') + total_test_steps = max(1, test_iterator.get_num_examples() // batch_size) + print("\nEvaluate on test set...") + avg_test_acc, test_acc, test_labels, test_preds, test_scores, test_logits = eval_steps(test_iterator, total_test_steps) + for i, labels, preds, scores, logits in zip(range(100), test_labels, test_preds, test_scores, test_logits): + print('labels', labels.numpy(), 'preds', preds.numpy(), 'scores', scores.numpy(), 'logits', logits.numpy()) + print('avg_test_acc', avg_test_acc.numpy()) + + test_iterator_single = TFRIterator(X=X, Y=Y, list_size=1, batch_size=len(X), split='test') + print("\nEvaluate each example in test set...") + avg_test_acc, test_acc, test_labels, test_preds, test_scores, test_logits = eval_steps(test_iterator_single, 1) + for i, labels, preds, scores, logits in zip(range(100), test_labels, test_preds, test_scores, test_logits): + print('labels', labels.numpy(), 'preds', preds.numpy(), 'scores', scores.numpy(), 'logits', logits.numpy()) + test_logits = sorted(list(test_logits.numpy())) + top_10_persent = test_logits[:int(len(test_logits)*0.1)] + print('top_10_persent', top_10_persent) + print('top_10_persent threshold', top_10_persent[-1]) + print('test_logits', test_logits) + + +main(sys.argv) diff --git a/autodist/simulator/utils.py b/autodist/simulator/utils.py new file mode 100644 index 0000000..a668e75 --- /dev/null +++ b/autodist/simulator/utils.py @@ -0,0 +1,342 @@ +import glob +import json +import os +import numpy as np + +import tensorflow_ranking as tfr +import tensorflow as tf +from tensorflow.python.framework import device_spec + +from arion.utils import logging +from arion.resource_spec import ResourceSpec +from arion.strategy.base import Strategy +from arion.const import DEFAULT_RUNTIME_SERIALIZATION_DIR, DEFAULT_SERIALIZATION_DIR, \ + DEFAULT_STRATEGY_JSON_SERIALIZATION_DIR, DEFAULT_RESOURCE_SERIALIZATION_DIR +from arion.kernel.device.resolver import DeviceResolver + + +RankingLossKeys = { + # Names for the ranking based loss functions. 
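+    # Each entry maps a readable name to the corresponding tensorflow_ranking RankingLossKey, so a
+    # config can select a loss via e.g. tfr.losses.make_loss_fn(RankingLossKeys['pairwise_logistic_loss']).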
+ 'pairwise_hinge_loss': tfr.losses.RankingLossKey.PAIRWISE_HINGE_LOSS, + 'pairwise_logistic_loss': tfr.losses.RankingLossKey.PAIRWISE_LOGISTIC_LOSS, + 'pairwise_soft_zero_one_loss': tfr.losses.RankingLossKey.PAIRWISE_SOFT_ZERO_ONE_LOSS, + 'softmax_loss': tfr.losses.RankingLossKey.SOFTMAX_LOSS, + 'sigmoid_cross_entropy_loss': tfr.losses.RankingLossKey.SIGMOID_CROSS_ENTROPY_LOSS, + 'mean_squared_loss': tfr.losses.RankingLossKey.MEAN_SQUARED_LOSS, + 'list_mle_loss': tfr.losses.RankingLossKey.LIST_MLE_LOSS, + 'approx_ndcg_loss': tfr.losses.RankingLossKey.APPROX_NDCG_LOSS, +} + +######### +# Online +######### + +def laod_from_one_folder(data_folder): + strategy_folder = '{}/strategies'.format(data_folder) + strategy_files = glob.glob(os.path.join(strategy_folder, '*')) + X = [] + Y = [] + for strategy_file in strategy_files: + # Target + runtime_file = '/'.join(strategy_file.split('/')[:-2]) + '/runtimes/' + strategy_file.split('/')[-1] + if not os.path.exists(runtime_file) or not os.path.isfile(runtime_file): + print('runtime_file does not exist: {}.'.format(runtime_file)) + continue + runtime = json.load(open(runtime_file, 'r')) + y = runtime['average'] + resource_file = strategy_file.replace('strategies', 'resource_specs') + if not os.path.exists(resource_file): + resource_file += '.yml' + if not os.path.exists(resource_file): + resource_file = os.path.join(data_folder, 'resource_spec_files/resource_spec.yml') + if not os.path.exists(resource_file): + continue + Y.append(y) + X.append([strategy_file, resource_file]) + print('Data points:{}, data_folder: {}'.format(len(X), data_folder)) + return X, Y + + +def laod_from_folders(data_dir): + if isinstance(data_dir, str): + data_folders = glob.glob("{}/*".format(data_dir), recursive=True) + elif isinstance(data_dir, list): + data_folders = data_dir + else: + raise ValueError + print('data_folders', data_folders) + X = [] + Y = [] + for data_folder in data_folders: + x, y = laod_from_one_folder(data_folder) + if len(x) == 0: + print('strategy_folder does not have files: {}, skipping it.'.format(data_folder)) + continue + Y.extend(y) + X.extend(x) + # Y = np.concatenate(Y, axis=0) + if len(Y) > 0: + Y = np.array(Y, dtype=np.float) + miny = np.min(Y) + print('min of all Y values: {}'.format(miny)) + else: + print("no files loaded.") + return X, Y + + +########## +# Offline +########## + +def laod_from_one_folder_offline(simulation_folder): + simulation_files = glob.glob(os.path.join(simulation_folder, '*'), recursive=True) + X = [] + Y = [] + for simulation_file in simulation_files: + # Features + try: + simulation = json.load(open(simulation_file, 'r')) + except: + print("Can not read simulation_file: ", simulation_file) + continue + x = simulation_file + # Target + runtime_file = '/'.join(simulation_file.split('/')[:-2]) + '/runtimes/' + simulation_file.split('/')[-1] + if not os.path.exists(runtime_file) or not os.path.isfile(runtime_file): + print('runtime_file does not exist: {}.'.format(runtime_file)) + continue + runtime = json.load(open(runtime_file, 'r')) + y = runtime['average'] + Y.append(y) + X.append(x) + Y = np.array(Y, dtype=np.float) + print('Data points:{}, simulation_folder: {}'.format(len(X), simulation_folder)) + return X, Y + + +def laod_from_folders_offline(data_dir): + simulation_folders = glob.glob("{}/*/simulations".format(data_dir), recursive=True) + print('simulation_folders', simulation_folders) + X = [] + Y = [] + for simulation_folder in simulation_folders: + x, y = laod_from_one_folder_offline(simulation_folder) + if 
len(x) == 0: + print('simulation folder does not have files: {}, skipping it.'.format(simulation_folder)) + continue + Y.append(y) + X.append(x) + Y = np.concatenate(Y, axis=0) + miny = np.min(Y) + print('min of Y values: {}'.format(miny)) + return X, Y + + +def split_dataset(inputs, shuffle=True, train_ratio=0.7, test_ratio=0.15): + assert isinstance(inputs, list) + nb_elements = len(inputs) + nb_samples = len(inputs[0]) + n_train = int(nb_samples * train_ratio) + n_test = int(nb_samples * test_ratio) + shuffled = [] + train = [] + valid = [] + test = [] + + if shuffle: + random_indices = np.random.permutation(list(range(nb_samples))) + for i in range(nb_elements): + shuffled_i = [inputs[i][j] for j in random_indices] + train.append(shuffled_i[:n_train]) + valid.append(shuffled_i[n_train:-n_test]) + test.append(shuffled_i[-n_test:]) + else: + for i in range(nb_elements): + train.append(inputs[i][:n_train]) + valid.append(inputs[i][n_train:-n_test]) + test.append(inputs[i][-n_test:]) + + return train, valid, test + +def read_trial_runs(): + runtime_files = glob.glob(os.path.join(DEFAULT_RUNTIME_SERIALIZATION_DIR, '*')) + strategy_files = glob.glob(os.path.join(DEFAULT_SERIALIZATION_DIR, '*')) + strategy_json_files = glob.glob(os.path.join(DEFAULT_STRATEGY_JSON_SERIALIZATION_DIR, '*')) + resource_files = glob.glob(os.path.join(DEFAULT_RESOURCE_SERIALIZATION_DIR, '*')) + logging.info(len(runtime_files), len(strategy_files), len(strategy_json_files), len(resource_files)) + + trialruns = {} + for runtime_file in runtime_files: + strategy_id = runtime_file.split('/')[-1] + strategy_file = os.path.join(DEFAULT_SERIALIZATION_DIR, strategy_id) + strategy_json_file = os.path.join(DEFAULT_STRATEGY_JSON_SERIALIZATION_DIR, strategy_id) + resource_file = os.path.join(DEFAULT_RESOURCE_SERIALIZATION_DIR, strategy_id) + if not os.path.exists(strategy_file): + logging.info("strategy_file not found, skip it: {}".format(strategy_file)) + continue + if not os.path.exists(strategy_json_file): + logging.info("strategy_json_file not found, skip it: {}".format(strategy_json_file)) + continue + if not os.path.exists(resource_file): + logging.info("resource_file not found, skip it: {}".format(resource_file)) + continue + + trialruns[strategy_id] = { + 'runtime': json.load(open(runtime_file, 'r')), + 'strategy': Strategy.deserialize(strategy_id), + 'strategy_json': json.load(open(strategy_json_file, 'r')), + 'resource_spec': ResourceSpec(resource_file=resource_file), + } + + logging.info("Total number of trials: {}".format(len(trialruns))) + return trialruns + + +DTYPE2BITS = { + tf.float16: 16, + "tf.float16": 16, + "": 16, + tf.float32: 32, + 'tf.float32': 32, + "": 32, + "": 32, + tf.float64: 64, + 'tf.float64': 64, + "": 64, + tf.bfloat16: 16, + 'tf.bfloat16': 16, + "": 16, + tf.complex64: 64, + 'tf.complex64': 64, + "": 64, + tf.complex128: 128, + 'tf.complex128': 128, + "": 128, + tf.int8: 8, + 'tf.int8': 8, + "": 8, + tf.uint8: 8, + 'tf.uint8': 8, + "": 8, + tf.uint16: 16, + 'tf.uint16': 16, + "": 16, + tf.uint32: 32, + 'tf.uint32': 32, + "": 32, + tf.uint64: 64, + 'tf.uint64': 64, + "": 64, + tf.int16: 16, + 'tf.int16': 16, + "": 16, + tf.int32: 32, + 'tf.int32': 32, + "": 32, + tf.int64: 64, + 'tf.int64': 64, + "": 64, + tf.bool: 1, + 'tf.bool': 1, + "": 1, + tf.string: 1, # todo: confirm + 'tf.string': 1, # todo: confirm + "": 1, # todo: confirm + tf.qint8: 8, + 'tf.qint8': 8, + "": 8, + tf.quint8: 8, + 'tf.quint8': 8, + "": 8, + tf.qint16: 16, + 'tf.qint16': 16, + "": 16, + tf.quint16: 16, + 'tf.quint16': 
16, + "": 16, + tf.qint32: 32, + 'tf.qint32': 32, + "": 32, + tf.resource: 0, # its tensor shape is either [] or [None] todo: confirm + 'tf.resource': 0, # its tensor shape is either [] or [None] todo: confirm + "": 0, # its tensor shape is either [] or [None] todo: confirm +} + +GIGABITS = np.float(1e+9) +INFINITY = 1e+9 +NUM_RUNS = 500 + + +def pad_list(l, max_len): + return l + [0.0] * (max_len - len(l)) + + +def get_dtype_bits(dtype): + return DTYPE2BITS[dtype] if dtype in DTYPE2BITS else DTYPE2BITS[str(dtype)] + + +def get_dense_var_bits(size, dtype): + return size * get_dtype_bits(dtype) + + +def get_sparse_var_bits(size): + # same size of values, indices, dense_shape + return size * (get_dtype_bits(tf.float32) + 2 * get_dtype_bits(tf.int64)) \ + + 2 * get_dtype_bits(tf.int64) + + +def _resolved_devices_on_diff_machine(device1, device2): + # e.g., '/job:worker/task:1/device:CPU:0', '/job:worker/task:1/GPU:0' + node1 = ':'.join(device1.split('/')[:-1]) + node2 = ':'.join(device2.split('/')[:-1]) + return node1 != node2 + + +def _resolve_device_address(device: str, device_resolver: DeviceResolver): + # change real ip address to /job:worker/task:0 + if not device: + return device + parts = device.split(':') + if parts and parts[0] in device_resolver._address_to_tasks: + resolved_device = device_resolver._address_to_tasks[parts[0]][0] + resolved = '/job:{}/task:{}/device:'.format(resolved_device['job'], resolved_device['task']) + resolved = resolved + ':'.join(parts[-2:]) + return resolved + else: + raise ValueError("cannot resolve device: {} using device_resolver: {}".format( + device, device_resolver._address_to_tasks)) + + +def _num_local_replica(host, replicas, cluster): + # host: e.g., '/job:worker/task:0/device:CPU:0' + replica_devices = {device_spec.DeviceSpecV2.from_string(r) for r in replicas} + host_device = device_spec.DeviceSpecV2.from_string(host) + num_local_replica = sum(1 for d in replica_devices + if cluster.get_address_from_task(d.job, d.task) == + cluster.get_address_from_task(host_device.job, host_device.task)) + return num_local_replica + + +def _max_num_local_replica(replicas, cluster): + replica_devices = {device_spec.DeviceSpecV2.from_string(r) for r in replicas} + replica_hosts = {cluster.get_address_from_task(d.job, d.task) for d in replica_devices} + max_num_local_replica = 0 + for host in replica_hosts: + num_local_replica = sum(1 for d in replica_devices + if cluster.get_address_from_task(d.job, d.task) == host) + max_num_local_replica = max(max_num_local_replica, num_local_replica) + return max_num_local_replica + + +def _strip_var_name(name): + # strip prefix + if not name: + return name + name = name.split('/') + if 'Replica' in name[0]: # remove prefix + name = name[1:] + if name and 'part' in name[-1]: # remove '/part_1' if using partitioned ps + name = name[:-1] + name = '/'.join(name) + name = name.split(':')[0] # remove ':0'. 
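+    # e.g. 'Replica-1/dense/kernel/part_0:0' -> 'dense/kernel'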
+ return name diff --git a/autodist/strategy/auto/ar_group_assigner.py b/autodist/strategy/auto/ar_group_assigner.py new file mode 100644 index 0000000..c2d59b6 --- /dev/null +++ b/autodist/strategy/auto/ar_group_assigner.py @@ -0,0 +1,57 @@ +from collections import OrderedDict + +import numpy as np + + +def chunk_group_assigner(ar_shards, chunk_size=1): + assignments = {} + for i, shard_name in enumerate(ar_shards): + assignments[shard_name] = i // chunk_size + assert(len(ar_shards)) == len(assignments) + return assignments + + +def christy_group_assigner(ar_shards, var_helpers, num_group): + """A probabilistic assigner that tries to put each ring with balanced message size""" + assignments = {} + + sorted_ar_shards = OrderedDict(sorted(ar_shards.items(), key=lambda x: var_helpers[x[0]].byte_size, reverse=True)) + cur_loads = [0.0 for i in range(num_group)] + for shard_name in sorted_ar_shards: + total_loads = sum(cur_loads) + balanced_loads = [total_loads / num_group for _ in range(num_group)] + space = np.array([balanced_load - cur_load for balanced_load, cur_load in zip(balanced_loads, cur_loads)]) + + e_x = np.exp(space-np.max(space)) + accept_prob = e_x / e_x.sum() + + des = np.random.choice(range(0, num_group), 1, p=accept_prob)[0] + assignments[shard_name] = des + cur_loads[des] += var_helpers[shard_name].byte_size + assert(len(ar_shards)) == len(assignments) + # entropy = calcuate_entropy(cur_loads) + # best_entropy = calcuate_entropy(balanced_loads) + # print('entropy {} vs. max entropy {}'.format(entropy, best_entropy)) + return assignments + +def ordered_balanced_group_assigner(ar_shards, var_helpers, num_group): + """Greedy assigner that create balanced loads following a given var order.""" + assignments = {} + + # get total size + total_loads = 0.0 + for shard_name in ar_shards: + total_loads += var_helpers[shard_name].byte_size + + avg_load = total_loads / num_group + + cur_bucket = 0 + loads = [0 for _ in range(num_group)] + for shard_name in ar_shards: + if loads[cur_bucket] >= avg_load: + cur_bucket += 1 + if loads[cur_bucket] < avg_load: + assignments[shard_name] = cur_bucket + loads[cur_bucket] += var_helpers[shard_name].byte_size + assert(len(ar_shards) == len(assignments)) + return assignments \ No newline at end of file diff --git a/autodist/strategy/auto/auto_strategy.py b/autodist/strategy/auto/auto_strategy.py new file mode 100644 index 0000000..e69de29 diff --git a/autodist/strategy/auto/ps_load_balancer.py b/autodist/strategy/auto/ps_load_balancer.py new file mode 100644 index 0000000..dc770d8 --- /dev/null +++ b/autodist/strategy/auto/ps_load_balancer.py @@ -0,0 +1,67 @@ +from collections import OrderedDict + +import numpy as np + + +def calcuate_entropy(loads): + distribution = loads / np.sum(loads) + distribution = distribution + 1e-4 + entropy = - np.sum(distribution * np.log2(distribution)) + return entropy + +def greedy_load_balancer(ps_shards, resource_spec, var_helpers, sort_by_size=False): + # no randomness + assignments = {} + reduction_device_names = [k for k, _ in resource_spec.cpu_devices] + loads = {ps: 0.0 for ps in reduction_device_names} + + sorted_ps_shards = ps_shards + if sort_by_size: + sorted_ps_shards = OrderedDict(sorted(ps_shards.items(), + key=lambda x: var_helpers[x[0]].byte_size, reverse=True)) + + for shard_name in sorted_ps_shards: + sorted_ps = sorted(loads, key=loads.get) + destination = sorted_ps[0] + assignments[shard_name] = destination + loads[destination] += var_helpers[shard_name].byte_size + return assignments + +def 
christy_load_balancer(ps_shards, resource_spec, var_helpers, sort_by_size=False): + # Sample destination based on a distributed calculated based on loads and available bandwidth + reduction_device_names = [k for k, _ in resource_spec.cpu_devices] + loads = {ps: 0.0 for ps in reduction_device_names} + assignments = {} + + loads = sorted(list(loads.items()), key=lambda x: x[0]) + ps = [load[0] for load in loads] + bandwidth = [resource_spec.network_bandwidth[p.split(':')[0]] for p in ps] + total_bandwidth = sum(bandwidth) + cur_loads = [float(load[1]) for load in loads] + + sorted_ps_shards = ps_shards + if sort_by_size: + sorted_ps_shards = OrderedDict(sorted(ps_shards.items(), + key=lambda x: var_helpers[x[0]].byte_size, reverse=True)) + + for shard_name in sorted_ps_shards: + total_load = sum(cur_loads) # + var_load + balanced_loads = [total_load * b / total_bandwidth for b in bandwidth] + space = np.array([balanced_load - cur_load for balanced_load, cur_load in zip(balanced_loads, cur_loads)]) + + # softmax + e_x = np.exp(space - np.max(space)) + accept_prob = e_x / e_x.sum() + + # sample according to current load + des = np.random.choice(ps, 1, p=accept_prob)[0] + assignments[shard_name] = des + + cur_loads[ps.index(des)] += var_helpers[shard_name].byte_size + assert (len(ps_shards) == len(assignments)) + + # entropy = calcuate_entropy(cur_loads) + # best_entropy = calcuate_entropy(balanced_loads) + # print('entropy {} vs. max entropy {}'.format(entropy, best_entropy)) + return assignments + diff --git a/autodist/strategy/auto/random_strategy.py b/autodist/strategy/auto/random_strategy.py new file mode 100644 index 0000000..24150dd --- /dev/null +++ b/autodist/strategy/auto/random_strategy.py @@ -0,0 +1,443 @@ +# Copyright 2020 Petuum. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""AllReduce StrategyBuilder.""" +from collections import OrderedDict + +from enum import Enum +from tensorflow.python.framework import ops + +from arion.kernel.common.utils import get_op_name, get_consumers +from arion.kernel.partitioner import PartitionerConfig +from arion.proto import strategy_pb2, synchronizers_pb2 +from arion.search import sample_util +from arion.strategy.base import Strategy, StrategyBuilder +from arion.strategy.base import byte_size_load_fn +from arion.strategy.component.ps_load_balancer import greedy_load_balancer, christy_load_balancer +from arion.strategy.component.ar_group_assigner import chunk_group_assigner, \ + christy_group_assigner, ordered_balanced_group_assigner + +class VarType(Enum): + SPARSE = 0 + DENSE = 1 + + +class VariableHelper: + def __init__(self, var, graph_item): + self.var = var + self.graph_item = graph_item + self._var_op_name = get_op_name(var.name) + self._grad = graph_item.var_op_name_to_grad_info[self._var_op_name][0] + + @property + def var_type(self): + return VarType.DENSE if isinstance(self._grad, ops.Tensor) else VarType.SPARSE + + @property + def is_sparse(self): + return True if self.var_type == VarType.SPARSE else False + + @property + def is_embedding(self): + for op in get_consumers(self.var.op): + if op.type == "ResourceGather": + return True + # op = new_graph_item.graph.get_operation_by_name( + # ops.prepend_name_scope(op.name, ARION_TO_DELETE_SCOPE) + # ) + return False + + @property + def shape(self): + if self.var.initial_value.shape.ndims: + return self.var.initial_value.shape.as_list() + else: + return None + + @property + def partitionable_axis(self): + valid_axis = [] + if not self.shape: + return valid_axis + # Sparse variable can only be partition along the 0th axis + # only sample axis for dense variables + if self.is_sparse or self.is_embedding: + valid_axis = [0] + return valid_axis + for idx, dim in enumerate(self.shape): + if dim > 1: + valid_axis.append(idx) + return valid_axis + + @property + def byte_size(self): + return float(byte_size_load_fn(self.var)) + + @property + def dtype(self): + return self.var.dtype + + +class PartHelper: + def __init__(self, part_idx, var, pc): + self.var = var + self.part_idx = part_idx + self.pc = pc + + @property + def shape(self): + shape = self.var.initial_value.shape.as_list() + dim_size = shape[self.pc.axis] // self.pc.num_shards + extras = shape[self.pc.axis] % self.pc.num_shards + if self.part_idx < extras: + dim_size += 1 + shape[self.pc.axis] = dim_size + return shape + + @property + def var_shape(self): + return self.var.initial_value.shape.as_list() + + @property + def byte_size(self): + return float(byte_size_load_fn(self.var)) \ + * float(self.shape[self.pc.axis]) / float(self.var_shape[self.pc.axis]) + + +class RandomStrategy(StrategyBuilder): + def __init__(self, space, heuristics): + """ + + Args: + self: + enable_ps_load_balancer: + enable_chunk: + + Returns: + + """ + self.space = space + self.heuristics = heuristics + self.helpers = {} + + def reset(self): + self.helpers = {} + + def build(self, graph_item, resource_spec): + expr = Strategy() + + # number of graph replica is equal to number of GPU devices + expr.graph_config.replicas.extend([k for k, v in resource_spec.gpu_devices]) + variables = graph_item.trainable_var_op_to_var.values() + + # A fully MCMC process to generate node configs + node_config = [] + for var in variables: + var_helper = VariableHelper(var, graph_item) + self.helpers[var_helper.var.name] = var_helper + + node = 
strategy_pb2.Strategy.Node() + node.var_name = var_helper.var.name + + # Step 1: determine whether or not to partition + # TODO(Hao): other factor not considered -- number of reduction_device_names + maybe_partition = sample_if_partition(var_helper, resource_spec, self.space, self.heuristics) + + # Step 2.1: if not partition, sample a synchronizer type for it + if not maybe_partition: # no partition + sample_var_synchronizer(node, var_helper, resource_spec, self.space) + else: # Step 2.2: if partition + # Step 2.2.1: sample a partitioner config + pc = sample_partition_config(var_helper, resource_spec, self.space, self.heuristics) + node.partitioner = pc.partition_str + + # step 2.2.2: sample a synchornizer type for each partition + parts = [] + for i in range(pc.num_shards): + part = strategy_pb2.Strategy.Node() + part.var_name = '{}/part_{}:0'.format(get_op_name(var.name), i) + self.helpers[part.var_name] = PartHelper(i, var, pc) + parts.append(part) + sample_parts_synchronizers(parts, var_helper, resource_spec, self.space, self.heuristics) + node.part_config.extend(parts) + node_config.append(node) + + sample_group_and_reduction_destinations(node_config, resource_spec, self.helpers, self.heuristics) + # Mark each variable to be synchronized with a Parameter Server + expr.node_config.extend(node_config) + return expr + + +def sample_if_partition(var_helper, resource_spec, space, heuristics): + reduction_device_names = [k for k, _ in resource_spec.cpu_devices] + if len(space['maybe_partition']) == 1: + return space['maybe_partition'] + if heuristics['enable_single_node_no_partition'] and len(reduction_device_names) <= 1: + return False + + # intersection of variable's partitonable axis and global constraints + if var_helper.partitionable_axis: + if space['partitionable_axis']: + a = set(var_helper.partitionable_axis) & set(space['partitionable_axis']) + if len(a) < 1: + return False + else: + return False + + # lower bound for abandoning partitioning + lb = heuristics['maybe_partition_bounds'][0] + ub = heuristics['maybe_partition_bounds'][1] + if var_helper.byte_size <= lb: + return False + if var_helper.byte_size >= ub: + return True + assert (len(space['maybe_partition']) == 2) + + if heuristics['maybe_partition_by_size']: + # By variable size -- a large variable has a higher chance to be partitioned + # TODO (Hao): MAX_INT32 is too large, reconsider later... 
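RandomStrategy is driven entirely by the space and heuristics dictionaries. The keys below are the ones the sampling functions in this file dereference; the concrete values are only illustrative and are not AutoDist defaults (those would be supplied by the caller, e.g. an auto-strategy builder):

# Keys taken from the sampling code in this file; values are hypothetical examples.
space = {
    'synchronizer_types': ['PS', 'AR'],              # anything other than 'PS' maps to AllReduce
    'maybe_partition': [True, False],                # a single-element list pins the partition decision
    'compressor': ['NoneCompressor', 'HorovodCompressor'],
    'local_replication': [False, True],
    'partitionable_axis': [],                        # empty list = no global axis constraint
}

heuristics = {
    'ps_load_balancer': 'christy',                   # None | 'greedy' | 'christy' | 'sorted_christy' | 'sorted_greedy'
    'merge_scheme': 'by_chunk',                      # None | 'random' | 'by_chunk' | 'christy' | 'ordered_balanced'
    'chunk_size': 64,                                # only read when merge_scheme == 'by_chunk'
    'num_group_bounds': [1, 40],
    'enable_single_node_no_partition': True,
    'same_synchronizer_for_parts': False,
    'maybe_partition_by_size': True,
    'maybe_partition_bounds': [2 ** 10, 2 ** 28],    # bytes: never partition below lb, always above ub
    'num_partition_bounds': [2, 'num_nodes'],
}

# Assuming the module is importable from its location in this patch:
# from autodist.strategy.auto.random_strategy import RandomStrategy
# builder = RandomStrategy(space, heuristics)
# strategy = builder.build(graph_item, resource_spec)  # GraphItem and ResourceSpec supplied by AutoDist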
+ chance = float(var_helper.byte_size - lb) / float(ub - lb) + return sample_util.binary_sample(boundary=chance) + else: + return sample_util.uniform_sample_by_choices(space['maybe_partition']) + + +def sample_var_synchronizer(node, var_helper, resource_spec, space): + # sample a single synchornizer for an unpartitioned variable, + # will eave merge_group of reduction_destination as empty + + # We ALWAYS use PS for sparse variables + synchronizer_type = 'PS' if var_helper.var_type == VarType.SPARSE \ + else sample_util.uniform_sample_by_choices(space['synchronizer_types']) + if synchronizer_type == 'PS': + node.PSSynchronizer.sync = True # we don't consider async at this moment + node.PSSynchronizer.staleness = 0 + node.PSSynchronizer.local_replication = sample_if_local_replication(space['local_replication'], + resource_spec) + else: + # no other option for spec + node.AllReduceSynchronizer.spec = synchronizers_pb2.AllReduceSynchronizer.Spec.Value('AUTO') + node.AllReduceSynchronizer.compressor = \ + synchronizers_pb2.AllReduceSynchronizer.Compressor.Value( + sample_ar_compressor(space['compressor'])) + + +def sample_parts_synchronizers(parts, var_helper, resource_spec, space, heuristics): + # sample synchornizer for a group of variable partitions + + if var_helper.var_type == VarType.SPARSE: + synchronizer_types = ['PS'] * len(parts) + else: + if heuristics['same_synchronizer_for_parts']: + type = sample_util.uniform_sample_by_choices(space['synchronizer_types']) + synchronizer_types = [type] * len(parts) + else: + synchronizer_types = [sample_util.uniform_sample_by_choices(space['synchronizer_types']) + for part in parts] + for i, part in enumerate(parts): + if synchronizer_types[i] == 'PS': + part.PSSynchronizer.sync = True # we don't consider async at this moment + part.PSSynchronizer.staleness = 0 + part.PSSynchronizer.local_replication = sample_if_local_replication(space['local_replication'], + resource_spec) + else: + # no other option for spec + part.AllReduceSynchronizer.spec = synchronizers_pb2.AllReduceSynchronizer.Spec.Value('AUTO') + part.AllReduceSynchronizer.compressor = \ + synchronizers_pb2.AllReduceSynchronizer.Compressor.Value( + sample_ar_compressor(space['compressor'])) + + +def sample_partition_config(var_helper, resource_spec, space, heuristics): + # Since Arion only support parttion along one axis, + # we first sample a partition axis, then sammple #partition along that axis, we obtain the partition config. + assert len(var_helper.partitionable_axis) > 0, 'No partition axis available' + # sample partition axis + # TODO(Hao): some heursitics here available? 
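When maybe_partition_by_size is set, the decision above is a Bernoulli draw whose success probability grows linearly with the variable's byte size between the two bounds; once a variable is partitioned, each shard's shape follows the PartHelper arithmetic, with the remainder spread over the first shards. A short self-contained sketch with hypothetical numbers:

import numpy as np

# Size-interpolated partition probability (mirrors the `chance` computation above).
lb, ub = 2 ** 10, 2 ** 28                     # hypothetical maybe_partition_bounds, in bytes
byte_size = 64 * 2 ** 20                      # a 64 MiB variable
chance = float(byte_size - lb) / float(ub - lb)        # roughly 0.25 for these numbers
partition = np.random.uniform() < chance               # same rule as sample_util.binary_sample(chance)
print('partition probability {:.3f}, sampled: {}'.format(chance, partition))

# Shard shapes produced by partitioning shape [10, 300] into 3 shards along axis 0
# (same arithmetic as PartHelper.shape: the extras go to the first shards).
shape, axis, num_shards = [10, 300], 0, 3
dim, extras = shape[axis] // num_shards, shape[axis] % num_shards
shard_shapes = [[dim + (1 if i < extras else 0) if d == axis else s
                 for d, s in enumerate(shape)]
                for i in range(num_shards)]
print(shard_shapes)                                    # [[4, 300], [3, 300], [3, 300]]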
+ valid_axis = var_helper.partitionable_axis + if space['partitionable_axis']: + valid_axis = list(set(valid_axis) & set(space['partitionable_axis'])) + partition_axis = sample_util.uniform_sample_by_choices(valid_axis) + + # sample how many partition to go + num_nodes = resource_spec.num_cpus + dim_size = var_helper.shape[partition_axis] + if heuristics['num_partition_bounds'][1] == 'num_nodes': + max_shards = min(dim_size, num_nodes) + elif isinstance(heuristics['num_partition_bounds'][1], int): + max_shards = min(dim_size, heuristics['num_partition_bounds'][1]) + else: + raise ValueError('unseen num_partition_bounds config') + + min_shards = 2 + if isinstance(heuristics['num_partition_bounds'][0], int): + min_shards = max(min_shards, heuristics['num_partition_bounds'][0]) + elif heuristics['num_partition_bounds'][0] == 'num_nodes': + min_shards = max(min_shards, heuristics['num_partition_bounds'][0]) + else: + raise ValueError('unseen num_partition_bounds config') + + # sample from [min_shards, max_shards] + num_shards = sample_util.uniform_sample_by_choices(range(min_shards, max_shards + 1)) + + # construct a PartitionerConfig (pc) + partition_list = [1] * len(var_helper.shape) + partition_list[partition_axis] = num_shards + pc = PartitionerConfig(partition_list=partition_list) + return pc + + +def sample_if_local_replication(local_replication_space, resource_spec): + # Local replication is a PS-specific semantic; it represents whether to use hierarchical PS + if resource_spec.num_gpus <= resource_spec.num_cpus: + # meaning every machine has at most 1 GPU + return False + return sample_util.uniform_sample_by_choices(local_replication_space) + + +def sample_ar_compressor(compressor_space): + # [NoneCompressor, HorovodCompressor, HorovodCompressorEF, PowerSGDCompressor] + # [ HorovodCompressorEF, PowerSGDCompressor] will change gradient value + # so only two choices here + # TODO(Hao): try to use all four options + return sample_util.uniform_sample_by_choices(compressor_space) + + +def sample_group_and_reduction_destinations(node_config, resource_spec, helpers, heuristics): + ps_shards = OrderedDict() + ar_shards = OrderedDict() + idx = 0 + for node in node_config: + if node.partitioner: + for part in node.part_config: + synchronizer = getattr(part, part.WhichOneof('synchronizer')) + if hasattr(synchronizer, 'compressor'): + ar_shards[part.var_name] = (idx,) + else: + ps_shards[part.var_name] = (idx,) + idx += 1 + else: + synchronizer = getattr(node, node.WhichOneof('synchronizer')) + if hasattr(synchronizer, 'compressor'): + ar_shards[node.var_name] = (idx,) + else: + ps_shards[node.var_name] = (idx,) + idx += 1 + + if len(ps_shards) > 0: + sample_ps_reduction_destinations(node_config, ps_shards, resource_spec, helpers, heuristics) + + # step 4: assign ar merge groups globally + if len(ar_shards) > 0: + sample_ar_groups(node_config, ar_shards, helpers, heuristics) + + +def sample_ps_reduction_destinations(node_config, ps_shards, resource_spec, helpers, heuristics): + load_balancer = heuristics['ps_load_balancer'] + reduction_device_names = [k for k, _ in resource_spec.cpu_devices] + if not load_balancer: + destinations = {} + for shard_name in ps_shards: + destinations[shard_name] = sample_util.uniform_sample_by_choices(reduction_device_names) + elif load_balancer == 'greedy': + destinations = greedy_load_balancer(ps_shards, resource_spec, helpers) + elif load_balancer == 'christy': + # copy Christy's partitionedPS + destinations = christy_load_balancer(ps_shards, resource_spec, 
helpers) + elif load_balancer == 'sorted_christy': + destinations = christy_load_balancer(ps_shards, resource_spec, helpers, sort_by_size=True) + elif load_balancer == 'sorted_greedy': + destinations = greedy_load_balancer(ps_shards, resource_spec, helpers, sort_by_size=True) + else: + raise ValueError('Cannot recognize load balancer') + + for shard_name, (idx, ) in ps_shards.items(): + ps_shards[shard_name] = (idx, destinations[shard_name]) + + assign_ps_reduction_destinations(node_config, ps_shards) + + +def assign_ps_reduction_destinations(node_config, ps_shards): + for node in node_config: + if node.partitioner: + for part in node.part_config: + synchronizer = getattr(part, part.WhichOneof('synchronizer')) + if hasattr(synchronizer, 'reduction_destination'): + synchronizer.reduction_destination = ps_shards[part.var_name][1] + else: + synchronizer = getattr(node, node.WhichOneof('synchronizer')) + if hasattr(synchronizer, 'reduction_destination'): + synchronizer.reduction_destination = ps_shards[node.var_name][1] + + +def sample_ar_groups(node_config, ar_shards, helpers, heuristics): + merge_scheme = heuristics['merge_scheme'] + if merge_scheme == 'by_chunk': + if 'chunk_size' in heuristics and heuristics['chunk_size'] > 0: + chunk_size_or_num_group = heuristics['chunk_size'] + else: + chunk_size_or_num_group = sample_chunk_size(len(ar_shards)) + else: + chunk_size_or_num_group = sample_num_ar_groups(ar_shards, + heuristics['num_group_bounds'][0], + heuristics['num_group_bounds'][1]) + assert chunk_size_or_num_group > 0, "chunk_size or num_groups need to > 1..." + + if merge_scheme in ['random', None]: + tmp_assignments = sample_util.sample_merge_group(chunk_size_or_num_group, len(ar_shards)) + group_assignments = OrderedDict() + for i, shard_name in enumerate(ar_shards): + group_assignments[shard_name] = tmp_assignments[i] + elif merge_scheme == 'by_chunk': + # sample chunk_size + group_assignments = chunk_group_assigner(ar_shards, chunk_size_or_num_group) + elif merge_scheme == 'christy': + group_assignments = christy_group_assigner(ar_shards, + helpers, + chunk_size_or_num_group) + elif merge_scheme == 'ordered_balanced': + group_assignments = ordered_balanced_group_assigner(ar_shards, + helpers, + chunk_size_or_num_group) + else: + raise ValueError('unseen merge scheme..') + + for shard_name, (idx,) in ar_shards.items(): + ar_shards[shard_name] = (idx, group_assignments[shard_name]) + assign_ar_group(node_config, ar_shards) + + +def sample_num_ar_groups(ar_shards, lb, ub): + min_num_group = max(1, lb) + max_num_group = min(len(ar_shards), ub) + num_group = sample_util.uniform_sample_by_choices(range(min_num_group, max_num_group + 1)) + return num_group + + +def sample_chunk_size(num_ar_shards): + chunk_size = sample_util.uniform_sample_by_choices(range(1, num_ar_shards + 1)) + return chunk_size + + +def assign_ar_group(node_config, ar_shards): + for node in node_config: + if node.partitioner: + for part in node.part_config: + synchronizer = getattr(part, part.WhichOneof('synchronizer')) + if hasattr(synchronizer, 'compressor'): + synchronizer.group = ar_shards[part.var_name][1] + else: + synchronizer = getattr(node, node.WhichOneof('synchronizer')) + if hasattr(synchronizer, 'compressor'): + synchronizer.group = ar_shards[node.var_name][1] From fd2128dac36a2f647c7ac34af0d778ad3968dfb9 Mon Sep 17 00:00:00 2001 From: Hao Zhang Date: Thu, 16 Jul 2020 00:47:42 -0400 Subject: [PATCH 02/11] add docstrings for strategy sampler and minor improvements --- 
autodist/strategy/auto/sample_util.py | 61 ++++ ...random_strategy.py => strategy_sampler.py} | 307 +++++++++++++++--- autodist/strategy/base.py | 36 ++ autodist/strategy/partitioned_ps_strategy.py | 37 +-- 4 files changed, 356 insertions(+), 85 deletions(-) create mode 100644 autodist/strategy/auto/sample_util.py rename autodist/strategy/auto/{random_strategy.py => strategy_sampler.py} (67%) diff --git a/autodist/strategy/auto/sample_util.py b/autodist/strategy/auto/sample_util.py new file mode 100644 index 0000000..2547304 --- /dev/null +++ b/autodist/strategy/auto/sample_util.py @@ -0,0 +1,61 @@ +# Copyright 2020 Petuum. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Sample utility functions.""" + +import numpy as np + + +def uniform_sample_by_choices(choices): + """ + Uniformly sample an option from a list of options. + + Args: + choices (list): a list of values to be sampled from. + + Returns: + choice: the sampled value. + + """ + assert choices + p = np.random.uniform() + t = 1.0 / len(choices) + sample = choices[0] + for i, c in enumerate(choices): + if p < t * (i+1): + sample = c + break + return sample + + +def binary_sample(boundary=0.5): + p = np.random.uniform() + if p < boundary: + return True + else: + return False + + +def sample_merge_group(num_group, num_candidates): + + def is_valid(assignment): + unique_assignment = np.unique(assignment) + if unique_assignment.shape[0] == num_group: + return True + return False + + assignment = np.random.randint(1, num_group+1, [num_candidates]) + while not is_valid(assignment): + assignment = np.random.randint(1, num_group+1, [num_candidates]) + return assignment diff --git a/autodist/strategy/auto/random_strategy.py b/autodist/strategy/auto/strategy_sampler.py similarity index 67% rename from autodist/strategy/auto/random_strategy.py rename to autodist/strategy/auto/strategy_sampler.py index 24150dd..3281c4b 100644 --- a/autodist/strategy/auto/random_strategy.py +++ b/autodist/strategy/auto/strategy_sampler.py @@ -12,21 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. 
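The three helpers in sample_util.py cover everything the sampler draws on: a uniform pick from a finite choice list, a biased coin flip, and a rejection-sampled assignment of candidates to exactly num_group non-empty groups. A quick usage sketch; outputs vary from run to run, and the seed is only there to make the example repeatable:

import numpy as np
from autodist.strategy.auto import sample_util

np.random.seed(0)
print(sample_util.uniform_sample_by_choices(['PS', 'AR']))   # e.g. 'AR'
print(sample_util.binary_sample(boundary=0.9))               # True with probability 0.9
print(sample_util.sample_merge_group(num_group=3, num_candidates=8))
# e.g. [2 1 3 1 2 3 3 1] -- every group id in 1..3 appears at least once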
-"""AllReduce StrategyBuilder.""" +"""Strategy sampler that generates random strategies given model and resource spec.""" + from collections import OrderedDict from enum import Enum from tensorflow.python.framework import ops -from arion.kernel.common.utils import get_op_name, get_consumers -from arion.kernel.partitioner import PartitionerConfig -from arion.proto import strategy_pb2, synchronizers_pb2 -from arion.search import sample_util -from arion.strategy.base import Strategy, StrategyBuilder -from arion.strategy.base import byte_size_load_fn -from arion.strategy.component.ps_load_balancer import greedy_load_balancer, christy_load_balancer -from arion.strategy.component.ar_group_assigner import chunk_group_assigner, \ - christy_group_assigner, ordered_balanced_group_assigner +from autodist.kernel.common.utils import get_op_name, get_consumers +from autodist.kernel.partitioner import PartitionerConfig +from autodist.proto import strategy_pb2, synchronizers_pb2 +from autodist.strategy.base import Strategy, StrategyBuilder, byte_size_load_fn +from autodist.strategy.auto.ps_load_balancer import greedy_load_balancer, christy_load_balancer +from autodist.strategy.auto.ar_group_assigner import chunk_group_assigner, christy_group_assigner, \ + ordered_balanced_group_assigner +from autodist.strategy.auto import sample_util + class VarType(Enum): SPARSE = 0 @@ -34,6 +35,7 @@ class VarType(Enum): class VariableHelper: + """Helper class to include meta information about a variable.""" def __init__(self, var, graph_item): self.var = var self.graph_item = graph_item @@ -42,54 +44,97 @@ def __init__(self, var, graph_item): @property def var_type(self): + """ + Return the type of the variable (VarType.SPARSE or VarType.DENSE). + + Returns: + VarType + """ return VarType.DENSE if isinstance(self._grad, ops.Tensor) else VarType.SPARSE @property def is_sparse(self): + """ + Return whether the variable is sparse. + + Returns: + Bool + """ return True if self.var_type == VarType.SPARSE else False @property def is_embedding(self): + """ + Return whether the variable corresponds to an embedding. + + Returns: + Bool + """ + # TODO (Hao): better way to determine is_embedding? for op in get_consumers(self.var.op): if op.type == "ResourceGather": return True - # op = new_graph_item.graph.get_operation_by_name( - # ops.prepend_name_scope(op.name, ARION_TO_DELETE_SCOPE) - # ) return False @property def shape(self): + """ + Return the shape of the variable, or None if it does not emit a tensor (e.g. scalar). + + Returns: + List(int) + """ if self.var.initial_value.shape.ndims: return self.var.initial_value.shape.as_list() else: return None @property - def partitionable_axis(self): - valid_axis = [] + def partitionable_axes(self): + """ + Return the list of available axes that are legitimate to partition along. + + Returns: + List(int) + """ + valid_axes = [] + + # scalar if not self.shape: - return valid_axis - # Sparse variable can only be partition along the 0th axis - # only sample axis for dense variables + return valid_axes + + # Sparse variable can only be partition along the 0th axis in current implementation. if self.is_sparse or self.is_embedding: - valid_axis = [0] - return valid_axis + valid_axes = [0] + return valid_axes for idx, dim in enumerate(self.shape): if dim > 1: - valid_axis.append(idx) - return valid_axis + valid_axes.append(idx) + return valid_axes @property def byte_size(self): + """ + Return the byte size of the variable. 
+ + Returns: + float + """ return float(byte_size_load_fn(self.var)) @property def dtype(self): + """ + Return the dtype of the variable. + + Returns: + dtype + """ return self.var.dtype class PartHelper: + """Helper class to include meta information about a variable partition.""" def __init__(self, part_idx, var, pc): self.var = var self.part_idx = part_idx @@ -97,6 +142,13 @@ def __init__(self, part_idx, var, pc): @property def shape(self): + """ + Return the shape of this partition. + + Returns: + List(int) + + """ shape = self.var.initial_value.shape.as_list() dim_size = shape[self.pc.axis] // self.pc.num_shards extras = shape[self.pc.axis] % self.pc.num_shards @@ -107,41 +159,62 @@ def shape(self): @property def var_shape(self): + """ + Return the shape of the original value this part belonged to. + + Returns: + List(int) + """ return self.var.initial_value.shape.as_list() @property def byte_size(self): + """ + Return the byte size of this partition. + + Returns: + float + """ return float(byte_size_load_fn(self.var)) \ * float(self.shape[self.pc.axis]) / float(self.var_shape[self.pc.axis]) -class RandomStrategy(StrategyBuilder): +class RandomStrategySampler(StrategyBuilder): + """ + Random Strategy Sampler. + + This StrategyBuilder samples a strategy given graph_item and resource_spec. The sampling process is + constrained by `space`, and guided by `heuristics`, both as required arguments of its constructor. + """ def __init__(self, space, heuristics): """ Args: - self: - enable_ps_load_balancer: - enable_chunk: - - Returns: - + space (dict): the strategy space that the random strategy should be drawn from. An example of the space + can be found at TODO(Hao). + heuristics (dict): heuristics used to guide the random sampling process. """ + if not space: + raise ValueError('Space to perform strategy sampling is not provided.') + if not heuristics: + raise ValueError('Heuristic to guide strategy sampling is not provided.') self.space = space self.heuristics = heuristics self.helpers = {} def reset(self): + """Reset the helpers every time a strategy is sampled.""" self.helpers = {} def build(self, graph_item, resource_spec): + """Generate a randomized strategy given model and resource spec.""" expr = Strategy() # number of graph replica is equal to number of GPU devices expr.graph_config.replicas.extend([k for k, v in resource_spec.gpu_devices]) variables = graph_item.trainable_var_op_to_var.values() - # A fully MCMC process to generate node configs + # Perform MCMC to generate each node configs node_config = [] for var in variables: var_helper = VariableHelper(var, graph_item) @@ -151,18 +224,18 @@ def build(self, graph_item, resource_spec): node.var_name = var_helper.var.name # Step 1: determine whether or not to partition - # TODO(Hao): other factor not considered -- number of reduction_device_names + # TODO(Hao): some factor is not considered, e.g. 
number of reduction_device_names maybe_partition = sample_if_partition(var_helper, resource_spec, self.space, self.heuristics) # Step 2.1: if not partition, sample a synchronizer type for it if not maybe_partition: # no partition sample_var_synchronizer(node, var_helper, resource_spec, self.space) - else: # Step 2.2: if partition + else: # Step 2.2: else partition # Step 2.2.1: sample a partitioner config pc = sample_partition_config(var_helper, resource_spec, self.space, self.heuristics) node.partitioner = pc.partition_str - # step 2.2.2: sample a synchornizer type for each partition + # step 2.2.2: sample a synchronizer type for each partition parts = [] for i in range(pc.num_shards): part = strategy_pb2.Strategy.Node() @@ -173,13 +246,26 @@ def build(self, graph_item, resource_spec): node.part_config.extend(parts) node_config.append(node) + # Step 3: Post-assign group or placement. sample_group_and_reduction_destinations(node_config, resource_spec, self.helpers, self.heuristics) - # Mark each variable to be synchronized with a Parameter Server + expr.node_config.extend(node_config) return expr def sample_if_partition(var_helper, resource_spec, space, heuristics): + """ + Sample a bool value determining whether to partition a variable or not. + + Args: + var_helper: the variable helper corresponded to the variable of interest. + resource_spec: the target cluster spec. + space: the space argument controlling where to sample from. + heuristics: the heuristics argument guiding the sampling process. + + Returns: + Bool + """ reduction_device_names = [k for k, _ in resource_spec.cpu_devices] if len(space['maybe_partition']) == 1: return space['maybe_partition'] @@ -214,9 +300,16 @@ def sample_if_partition(var_helper, resource_spec, space, heuristics): def sample_var_synchronizer(node, var_helper, resource_spec, space): - # sample a single synchornizer for an unpartitioned variable, - # will eave merge_group of reduction_destination as empty - + """ + Sample a synchronizer (and all associated aspects) for an unpartitioned variable, + leaving merge_group or reduction_destination as empty. + + Args: + node (strategy_pb2.Strategy.Node): the corresponded node_config to be rewritten. + var_helper (VariableHelper): the variable helper corresponded to the variable. + resource_spec (ResourceSpec): the target cluster spec + space (dict): space. + """ # We ALWAYS use PS for sparse variables synchronizer_type = 'PS' if var_helper.var_type == VarType.SPARSE \ else sample_util.uniform_sample_by_choices(space['synchronizer_types']) @@ -234,8 +327,18 @@ def sample_var_synchronizer(node, var_helper, resource_spec, space): def sample_parts_synchronizers(parts, var_helper, resource_spec, space, heuristics): - # sample synchornizer for a group of variable partitions - + """ + Sample synchronizers for all the partitions of a variable. + + Args: + parts: + var_helper: + resource_spec: + space: + heuristics: + + Returns: + """ if var_helper.var_type == VarType.SPARSE: synchronizer_types = ['PS'] * len(parts) else: @@ -260,8 +363,19 @@ def sample_parts_synchronizers(parts, var_helper, resource_spec, space, heuristi def sample_partition_config(var_helper, resource_spec, space, heuristics): - # Since Arion only support parttion along one axis, - # we first sample a partition axis, then sammple #partition along that axis, we obtain the partition config. + """ + Sample the PartitionerConfig of a variable (that is to be partitioned). 
+ + Args: + var_helper: + resource_spec: + space: + heuristics: + + Returns: + """ + # Arion only support partitioning along one axis -- we first sample a partition axis, + # then sample the number of partitions along that axis, and obtain the partition config. assert len(var_helper.partitionable_axis) > 0, 'No partition axis available' # sample partition axis # TODO(Hao): some heursitics here available? @@ -299,7 +413,19 @@ def sample_partition_config(var_helper, resource_spec, space, heuristics): def sample_if_local_replication(local_replication_space, resource_spec): - # Local replication is a PS-specific semantic; it represents whether to use hierarchical PS + """ + Sample whether to perform local replication. + + Local replication is a PS-specific semantic; it represents whether to transfer parameters or updates + via a transfer device. + + Args: + local_replication_space: + resource_spec: + + Returns: + + """ if resource_spec.num_gpus <= resource_spec.num_cpus: # meaning every machine has at most 1 GPU return False @@ -307,14 +433,34 @@ def sample_if_local_replication(local_replication_space, resource_spec): def sample_ar_compressor(compressor_space): - # [NoneCompressor, HorovodCompressor, HorovodCompressorEF, PowerSGDCompressor] - # [ HorovodCompressorEF, PowerSGDCompressor] will change gradient value - # so only two choices here + """ + Sample the type of the compressor being applied with collective ops. + + Available options include `NoneCompressor`, `HorovodCompressor`, `HorovodCompressorEF`, + `PowerSGDCompressor`, but `HorovodCompressorEF`, `PowerSGDCompressor` will change gradient value. + Args: + compressor_space: + + Returns: + """ # TODO(Hao): try to use all four options return sample_util.uniform_sample_by_choices(compressor_space) def sample_group_and_reduction_destinations(node_config, resource_spec, helpers, heuristics): + """ + Sample the merge group or parameter placement (a.k.a. reduction_destination) after all other semantics + have been determined. + + Args: + node_config: + resource_spec: + helpers: + heuristics: + + Returns: + + """ ps_shards = OrderedDict() ar_shards = OrderedDict() idx = 0 @@ -337,13 +483,24 @@ def sample_group_and_reduction_destinations(node_config, resource_spec, helpers, if len(ps_shards) > 0: sample_ps_reduction_destinations(node_config, ps_shards, resource_spec, helpers, heuristics) - - # step 4: assign ar merge groups globally if len(ar_shards) > 0: sample_ar_groups(node_config, ar_shards, helpers, heuristics) def sample_ps_reduction_destinations(node_config, ps_shards, resource_spec, helpers, heuristics): + """ + Sample the placement of shared parameter variables (a.k.a. reduction destinations). + + Args: + node_config: + ps_shards: + resource_spec: + helpers: + heuristics: + + Returns: + + """ load_balancer = heuristics['ps_load_balancer'] reduction_device_names = [k for k, _ in resource_spec.cpu_devices] if not load_balancer: @@ -369,6 +526,16 @@ def sample_ps_reduction_destinations(node_config, ps_shards, resource_spec, help def assign_ps_reduction_destinations(node_config, ps_shards): + """ + Assign the sampled reduction destinations to node_config. + + Args: + node_config: + ps_shards: + + Returns: + + """ for node in node_config: if node.partitioner: for part in node.part_config: @@ -382,6 +549,18 @@ def assign_ps_reduction_destinations(node_config, ps_shards): def sample_ar_groups(node_config, ar_shards, helpers, heuristics): + """ + Sample the group of collective operations. 
+ + Args: + node_config: + ar_shards: + helpers: + heuristics: + + Returns: + + """ merge_scheme = heuristics['merge_scheme'] if merge_scheme == 'by_chunk': if 'chunk_size' in heuristics and heuristics['chunk_size'] > 0: @@ -419,18 +598,48 @@ def sample_ar_groups(node_config, ar_shards, helpers, heuristics): def sample_num_ar_groups(ar_shards, lb, ub): + """ + Sample the number of collective groups. + + Args: + ar_shards: + lb: + ub: + + Returns: + + """ min_num_group = max(1, lb) max_num_group = min(len(ar_shards), ub) - num_group = sample_util.uniform_sample_by_choices(range(min_num_group, max_num_group + 1)) + num_group = sample_util.uniform_sample_by_choices(list(range(min_num_group, max_num_group + 1))) return num_group def sample_chunk_size(num_ar_shards): - chunk_size = sample_util.uniform_sample_by_choices(range(1, num_ar_shards + 1)) + """ + Sample the chunk_size if following a chunk-based merge scheme. + + Args: + num_ar_shards: + + Returns: + + """ + chunk_size = sample_util.uniform_sample_by_choices(list(range(1, num_ar_shards + 1))) return chunk_size def assign_ar_group(node_config, ar_shards): + """ + Assign the sampled group values to node configs. + + Args: + node_config: + ar_shards: + + Returns: + + """ for node in node_config: if node.partitioner: for part in node.part_config: diff --git a/autodist/strategy/base.py b/autodist/strategy/base.py index 965e1ff..df562da 100644 --- a/autodist/strategy/base.py +++ b/autodist/strategy/base.py @@ -18,6 +18,8 @@ from abc import ABC, abstractmethod from datetime import datetime +from tensorflow.python.framework import tensor_shape + from autodist.const import DEFAULT_SERIALIZATION_DIR from autodist.graph_item import GraphItem from autodist.kernel.common.utils import get_op_name @@ -166,3 +168,37 @@ def compile(self, strategy): if self._device_resolver: strategy = self._resolve_devices(strategy) return strategy + + +def byte_size_load_fn(op): + """ + Load function that computes the byte size of a single-output `Operation`. + + Copied (with modifications) from tensorflow.contrib.training.python.training.device_setter. + + This is intended to be used with `"Variable"` ops, which have a single + `Tensor` output with the contents of the variable. However, it can also be + used for calculating the size of any op that has a single output. + + Intended to be used with `GreedyLoadBalancingStrategy`. + + Args: + op: An `Operation` with a single output, typically a "Variable" op. + + Returns: + The number of bytes in the output `Tensor`. + + Raises: + ValueError: if `op` does not have a single output, or if the shape of the + single output is not fully-defined. + """ + elem_size = op.dtype.size + shape = op.get_shape() + if not shape.is_fully_defined(): + # Due to legacy behavior, scalar "Variable" ops have output Tensors that + # have unknown shape when the op is created (and hence passed to this + # load function for placement), even though the scalar shape is set + # explicitly immediately afterward. 
+ shape = tensor_shape.TensorShape(op.get_attr("shape")) + shape.assert_is_fully_defined() + return shape.num_elements() * elem_size diff --git a/autodist/strategy/partitioned_ps_strategy.py b/autodist/strategy/partitioned_ps_strategy.py index b1259a6..ecca253 100644 --- a/autodist/strategy/partitioned_ps_strategy.py +++ b/autodist/strategy/partitioned_ps_strategy.py @@ -15,13 +15,12 @@ """Partitioned PS StrategyBuilder with Greedy Load Balancer.""" from math import ceil -from tensorflow.python.framework import tensor_shape from autodist.const import ENV from autodist.kernel.common.op_info import CONTROL_FLOW_OPS from autodist.kernel.common.utils import get_consumers, get_op_name from autodist.kernel.partitioner import PartitionerConfig -from autodist.strategy.base import Strategy, StrategyBuilder +from autodist.strategy.base import Strategy, StrategyBuilder, byte_size_load_fn from autodist.proto import strategy_pb2 @@ -133,37 +132,3 @@ def get_num_shards(var): if n % i == 0: return i return n - - -def byte_size_load_fn(op): - """ - Load function that computes the byte size of a single-output `Operation`. - - Copied (with modifications) from tensorflow.contrib.training.python.training.device_setter. - - This is intended to be used with `"Variable"` ops, which have a single - `Tensor` output with the contents of the variable. However, it can also be - used for calculating the size of any op that has a single output. - - Intended to be used with `GreedyLoadBalancingStrategy`. - - Args: - op: An `Operation` with a single output, typically a "Variable" op. - - Returns: - The number of bytes in the output `Tensor`. - - Raises: - ValueError: if `op` does not have a single output, or if the shape of the - single output is not fully-defined. - """ - elem_size = op.dtype.size - shape = op.get_shape() - if not shape.is_fully_defined(): - # Due to legacy behavior, scalar "Variable" ops have output Tensors that - # have unknown shape when the op is created (and hence passed to this - # load function for placement), even though the scalar shape is set - # explicitly immediately afterward. 
- shape = tensor_shape.TensorShape(op.get_attr("shape")) - shape.assert_is_fully_defined() - return shape.num_elements() * elem_size From 59fb2a371dda68ea406abb5f09b72a39b7adf57c Mon Sep 17 00:00:00 2001 From: Hao Zhang Date: Thu, 16 Jul 2020 02:15:16 -0400 Subject: [PATCH 03/11] add a few more comments and predefined simulator --- autodist/simulator/{models => }/base.py | 92 +-- autodist/simulator/models/__init__.py | 0 .../simulator/models/rankrnn_simulator.py | 634 --------------- .../models/rankrnn_simulator_penalty.py | 729 ------------------ autodist/simulator/predefined_simulator.py | 374 +++++++++ ...r_penalty_fast.py => rankrnn_simulator.py} | 0 autodist/strategy/auto/ar_group_assigner.py | 28 +- autodist/strategy/auto/auto_strategy.py | 249 ++++++ autodist/strategy/auto/ps_load_balancer.py | 44 +- 9 files changed, 739 insertions(+), 1411 deletions(-) rename autodist/simulator/{models => }/base.py (83%) delete mode 100644 autodist/simulator/models/__init__.py delete mode 100644 autodist/simulator/models/rankrnn_simulator.py delete mode 100644 autodist/simulator/models/rankrnn_simulator_penalty.py create mode 100644 autodist/simulator/predefined_simulator.py rename autodist/simulator/{models/rankrnn_simulator_penalty_fast.py => rankrnn_simulator.py} (100%) diff --git a/autodist/simulator/models/base.py b/autodist/simulator/base.py similarity index 83% rename from autodist/simulator/models/base.py rename to autodist/simulator/base.py index a12c147..964302b 100644 --- a/autodist/simulator/models/base.py +++ b/autodist/simulator/base.py @@ -203,52 +203,52 @@ def extract_pre_feature(self, strategy: Strategy, resource_spec: ResourceSpec): meta[var_meta.name] = var_meta return meta, resource - def extract_pre_feature_legacy(self, strategy): - """Don't use now!!!""" - meta = defaultdict() - for node in strategy.node_config: - var_name = node.var_name - for var_op, var in self._original_graph_item.trainable_var_op_to_var.items(): - if var.name == var_name: - break - var_op_name = var_op.name - var_helper = VariableHelper(var, self._original_graph_item) - synchronizer = getattr(node, node.WhichOneof('synchronizer')) - compressor = getattr(synchronizer, 'compressor', None) - if compressor is not None: - compressor = AllReduceSynchronizer.Compressor.Name(compressor) - reduction_destinations = getattr(synchronizer, 'reduction_destinations', None) - if not reduction_destinations or len(reduction_destinations) <= 1: - # this variable is not partitioned - device = reduction_destinations[0] if reduction_destinations else var.device - var_meta = Var(name=var_name, - is_sparse=var_helper.is_sparse, - shape=var_helper.shape, - dtype=var_helper.dtype, - synchronizer=synchronizer, - compressor=compressor, - device=device) - meta[var_meta.name] = var_meta - else: - # this variable is partitioned - num_partitions = len(reduction_destinations) - partition_list = [1] * len(var_helper.shape) - partition_list[0] = num_partitions - pc = PartitionerConfig(partition_list=partition_list) - for i, device in enumerate(reduction_destinations): - part_helper = PartHelper(i, var, pc) - part_meta = Partition(name='{}/part_{}:0'.format(var_op_name, i), - is_sparse=var_helper.is_sparse, - shape=part_helper.shape, - dtype=var_helper.dtype, - synchronizer=synchronizer, - part_id=i, - partition_str=pc.partition_str, - original_shape=var_helper.shape, - compressor=compressor, - device=device) - meta[part_meta.name] = part_meta - return meta + # def extract_pre_feature_legacy(self, strategy): + # """Don't use now!!!""" + # 
meta = defaultdict() + # for node in strategy.node_config: + # var_name = node.var_name + # for var_op, var in self._original_graph_item.trainable_var_op_to_var.items(): + # if var.name == var_name: + # break + # var_op_name = var_op.name + # var_helper = VariableHelper(var, self._original_graph_item) + # synchronizer = getattr(node, node.WhichOneof('synchronizer')) + # compressor = getattr(synchronizer, 'compressor', None) + # if compressor is not None: + # compressor = AllReduceSynchronizer.Compressor.Name(compressor) + # reduction_destinations = getattr(synchronizer, 'reduction_destinations', None) + # if not reduction_destinations or len(reduction_destinations) <= 1: + # # this variable is not partitioned + # device = reduction_destinations[0] if reduction_destinations else var.device + # var_meta = Var(name=var_name, + # is_sparse=var_helper.is_sparse, + # shape=var_helper.shape, + # dtype=var_helper.dtype, + # synchronizer=synchronizer, + # compressor=compressor, + # device=device) + # meta[var_meta.name] = var_meta + # else: + # # this variable is partitioned + # num_partitions = len(reduction_destinations) + # partition_list = [1] * len(var_helper.shape) + # partition_list[0] = num_partitions + # pc = PartitionerConfig(partition_list=partition_list) + # for i, device in enumerate(reduction_destinations): + # part_helper = PartHelper(i, var, pc) + # part_meta = Partition(name='{}/part_{}:0'.format(var_op_name, i), + # is_sparse=var_helper.is_sparse, + # shape=part_helper.shape, + # dtype=var_helper.dtype, + # synchronizer=synchronizer, + # part_id=i, + # partition_str=pc.partition_str, + # original_shape=var_helper.shape, + # compressor=compressor, + # device=device) + # meta[part_meta.name] = part_meta + # return meta def setup_resource(self, resource_spec: ResourceSpec): cluster = SSHCluster(resource_spec) diff --git a/autodist/simulator/models/__init__.py b/autodist/simulator/models/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/autodist/simulator/models/rankrnn_simulator.py b/autodist/simulator/models/rankrnn_simulator.py deleted file mode 100644 index 4459515..0000000 --- a/autodist/simulator/models/rankrnn_simulator.py +++ /dev/null @@ -1,634 +0,0 @@ -"""Strategy RankNetSimulator.""" -import glob -import json -import sys -from datetime import datetime -from pathlib import Path -from string import digits - -import numpy as np -import os -import tensorflow as tf -tf.compat.v1.disable_eager_execution() - -import arion -from arion.graph_item import GraphItem -from arion.proto.synchronizers_pb2 import PSSynchronizer, AllReduceSynchronizer -from arion.simulator.models.base import SimulatorBase -from arion.simulator.utils import get_dense_var_bits, get_sparse_var_bits, GIGABITS -from arion.simulator.utils import _resolve_device_address, _max_num_local_replica, _num_local_replica -from arion.strategy.random_sample_strategy import VariableHelper, PartHelper -from arion.strategy.base import Strategy -from arion.resource_spec import ResourceSpec -from arion.cluster import SSHCluster -from arion.kernel.device.resolver import DeviceResolver -from arion.kernel.partitioner import PartitionerConfig -from arion.simulator.models.predefined_simulator import PredefinedSimulator - -import torch -import torch.nn as nn - -TORCH_DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - -# feature settings -MAX_NUM_WORKERS = 16 -MAX_NUM_GROUPS = 600 -MAX_NUM_VARS = 500 -MAX_NUM_PARS = 1500 - -# model size -FEATURE_SIZE = MAX_NUM_WORKERS+MAX_NUM_GROUPS+15 
-PARTITION_MLP_HIDDEN = 128 -PARTITION_MLP_OUT = 32 -STEM_RNN_HIDDEN = 128 -BIDIECTIONAL = True -NUM_RNN_LAYERS = 3 - -# trainer setting -BATCH_SIZE = 64 -LR = 3e-4 -WD = 3e-4 - -GRAPH_ITEM_PATHS = {'ncf':'/users/hzhang2/projects/pycharm/zhijie/5-5-2020/original_graph_item', - 'densenet121': '/users/hzhang2/projects/pycharm/zhijie/graph_items/densenet121_original_graph_item', - 'inceptionv3': '/users/hzhang2/projects/pycharm/zhijie/graph_items/inceptionv3_original_graph_item', - 'resnet101': '/users/hzhang2/projects/pycharm/zhijie/graph_items/resnet101_original_graph_item', - 'resnet50': '/users/hzhang2/projects/pycharm/zhijie/graph_items/resnet50_original_graph_item', - 'vgg16': '/users/hzhang2/projects/pycharm/zhijie/graph_items/vgg16_original_graph_item', - 'bert_12l': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_12l', - 'bert_6l': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_6l', - 'bert_3l': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_3l', - 'bert_large': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_large'} - -def get_model(path_): - if 'densenet121' in path_: - return 'densenet121' - elif 'ncf' in path_: - return 'ncf' - elif 'inceptionv3' in path_: - return 'inceptionv3' - elif 'resnet101' in path_: - return 'resnet101' - elif 'resnet50' in path_: - return 'resnet50' - elif 'vgg16' in path_: - return 'vgg16' - elif 'bert' in path_ and '12l' in path_: - return 'bert_12l' - elif 'bert' in path_ and '6l' in path_: - return 'bert_6l' - elif 'bert' in path_ and '3l' in path_: - return 'bert_3l' - elif 'bert' in path_ and 'large' in path_: - return 'bert_large' - else: - return None - -class RankRNN(nn.Module): - def __init__(self, input_size=FEATURE_SIZE, - partition_mlp_hidden=PARTITION_MLP_HIDDEN, - partition_mlp_out=PARTITION_MLP_OUT, - stem_rnn_hidden=STEM_RNN_HIDDEN, - num_rnn_layers=NUM_RNN_LAYERS, - bidirectional=BIDIECTIONAL): - super(RankRNN, self).__init__() - self.partition_mlp_out = partition_mlp_out - # self.num_rnn_layers = num_rnn_layers - self.stem_rnn_hidden = stem_rnn_hidden - self.partition_mlp = nn.Sequential(nn.Linear(input_size, partition_mlp_hidden), - nn.ReLU(), - # nn.Linear(partition_mlp_hidden, partition_mlp_hidden), - # nn.ReLU(), - nn.Linear(partition_mlp_hidden, partition_mlp_out), - ) - - self.stem_rnn = nn.LSTM(partition_mlp_out, stem_rnn_hidden, num_rnn_layers, batch_first=True, bidirectional=bidirectional) - self.final_fc = nn.Linear(stem_rnn_hidden*num_rnn_layers*(1+int(bidirectional)), 1) - - self.relu = nn.ReLU() - - def forward(self, features, par_indices, var_nums): - - x = features.float() - # x = torch.cat([features[:, :, :MAX_NUM_WORKERS], features[:, :, MAX_NUM_WORKERS+MAX_NUM_GROUPS:]], 2).float() - x = self.partition_mlp(x) - - x1 = torch.zeros(features.shape[0], MAX_NUM_VARS, self.partition_mlp_out, device=TORCH_DEVICE, dtype=x.dtype) - x1.scatter_add_(1, par_indices.long()[:, :, None].expand(par_indices.shape[0], par_indices.shape[1], self.partition_mlp_out), x) - - # Set initial hidden and cell states - # h0 = torch.zeros(self.num_rnn_layers, x.size(0), self.stem_rnn_hidden).to(TORCH_DEVICE) - # c0 = torch.zeros(self.num_rnn_layers, x.size(0), self.stem_rnn_hidden).to(TORCH_DEVICE) - - # Forward propagate LSTM - x1 = torch.nn.utils.rnn.pack_padded_sequence(x1, var_nums.long(), batch_first=True, enforce_sorted=False) - out, (ht, ct) = self.stem_rnn(x1) # out: tensor of shape (batch_size, seq_length, hidden_size) - 
- # out = torch.nn.utils.rnn.pad_packed_sequence(out, batch_first=True)[0].sum(1) / var_nums[:, None] - out = ht.permute(1, 0, 2).reshape(x.shape[0], -1) - # print(out[0, var_nums[0] -1, [3]], out[0, var_nums[0], [3]]) - # print(ht.permute(1, 0, 2).shape, x.shape) - out = self.final_fc(out) - return out - -class TrainTensorDataset(torch.utils.data.Dataset): - """TensorDataset with support of transforms. - """ - def __init__(self, tensors): - assert all(tensors[0].size(0) == tensor.size(0) for tensor in tensors) - self.tensors = tensors - - def __getitem__(self, index): - x = self.tensors[0][index] - x = self.perturbe_device_and_group(x) - x1 = self.tensors[1][index] - x2 = self.tensors[2][index] - - y = self.tensors[3][index] - - return x, x1, x2, y - - def __len__(self): - return self.tensors[0].size(0) - - def perturbe_device_and_group(self, x): - # perturbed_device_ids = np.random.permutation(MAX_NUM_WORKERS).astype(np.int32) - # perturbed_group_ids = np.random.permutation(MAX_NUM_GROUPS).astype(np.int32) - # mat_device = torch.eye(MAX_NUM_WORKERS, device=x.device, dtype=x.dtype)[perturbed_device_ids] - # mat_group = torch.eye(MAX_NUM_GROUPS, device=x.device, dtype=x.dtype)[perturbed_group_ids] - # x = torch.cat([torch.matmul(x[:, :MAX_NUM_WORKERS], mat_device), torch.matmul(x[:, MAX_NUM_WORKERS:MAX_NUM_WORKERS+MAX_NUM_GROUPS], mat_group), x[:, MAX_NUM_WORKERS+MAX_NUM_GROUPS:]], 1) - return x - - -def to_numpy(synchronizer, device, size_ratio, is_sparse, bd, num_replicas): - ret = [np.zeros(MAX_NUM_WORKERS), np.zeros(MAX_NUM_GROUPS), np.zeros(3), np.zeros(5), np.zeros(3)] - - if device is not None: - ret[0][device] = 1 - - group = getattr(synchronizer, 'group', None) - if group is not None: - assert group < MAX_NUM_GROUPS, group - ret[1][group] = 1 - - compressor = getattr(synchronizer, 'compressor', None) - if compressor is not None: - if compressor in ["PowerSGDCompressor", 3]: - ret[2][2] = 1 - elif compressor in ["HorovodCompressorEF", "HorovodCompressor", 2, 1]: - ret[2][1] = 1 - elif compressor in ["NoneCompressor", 0]: - ret[2][0] = 1 - else: - raise ValueError('Compressor does not exist: {}'.format(compressor)) - - local_replication = getattr(synchronizer, 'local_replication', None) - if isinstance(synchronizer, PSSynchronizer): - synchronizer = 0 - if int(local_replication) == 0: - if int(is_sparse) == 0: - ret[3][0] = 1 - else: - ret[3][1] = 1 - else: - if int(is_sparse) == 0: - ret[3][2] = 1 - else: - ret[3][3] = 1 - else: - ret[3][4] = 1 - ret[4] = np.array([size_ratio, bd, num_replicas]) - - return np.concatenate(ret) - -def connvert_feature(strategy, resource_spec, graph_item): - - cluster = SSHCluster(resource_spec) - device_resolver = DeviceResolver(cluster) - graph_replicas = [_resolve_device_address(k, device_resolver) for k, v in resource_spec.gpu_devices] - # bandwidth - network_bandwidth = np.array([resource_spec.network_bandwidth[device.split(':')[0]] for device, _ in resource_spec.cpu_devices]) - network_bandwidth = network_bandwidth - min_network_bandwidth = network_bandwidth.min() - # Other information - cpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.cpu_devices] - gpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.gpu_devices] - max_num_local_replica = _max_num_local_replica(graph_replicas, cluster) - total_num_local_replica = len(graph_replicas) - worker_num_replicas = [_num_local_replica(cpu_worker, graph_replicas, cluster) for cpu_worker in cpu_worker_list] - 
- num_vars = 0 - total_size_vars = 0 - for var_op, var in graph_item.trainable_var_op_to_var.items(): - num_vars += 1 - if var.initial_value.shape.ndims: - var_helper = VariableHelper(var, graph_item) - if var_helper.is_sparse: - total_size_vars += get_sparse_var_bits(np.prod(var_helper.shape)) - else: - total_size_vars += get_dense_var_bits(np.prod(var_helper.shape), var.dtype) - assert num_vars < MAX_NUM_VARS, num_vars - var_partition_features = np.zeros((MAX_NUM_PARS, FEATURE_SIZE-4)).astype(np.float32) - partition_indice = np.ones(MAX_NUM_PARS).astype(np.float32) * (MAX_NUM_VARS - 1) - - cnt = 0 - for node_id, node in enumerate(strategy.node_config): - var_name = node.var_name - for var_op, var in graph_item.trainable_var_op_to_var.items(): - if var.name == var_name: - break - var_helper = VariableHelper(var, graph_item) - - if node.partitioner: - pc = PartitionerConfig(partition_str=node.partitioner) - for i, part in enumerate(node.part_config): - part_helper = PartHelper(i, var, pc) - synchronizer = getattr(part, part.WhichOneof('synchronizer')) - reduction_destination = getattr(synchronizer, 'reduction_destination', None) - device = _resolve_device_address(reduction_destination if reduction_destination else var.device, - device_resolver) - if device == '': - assert(isinstance(synchronizer, AllReduceSynchronizer)) - device = None - bd = min_network_bandwidth - num_replicas = 0 - else: - device = cpu_worker_list.index(device) - bd = network_bandwidth[device] - num_replicas = worker_num_replicas[device] - - if var_helper.is_sparse: - size_ratio = get_sparse_var_bits(np.prod(part_helper.shape))/total_size_vars - else: - size_ratio = get_dense_var_bits(np.prod(part_helper.shape), var_helper.dtype)/total_size_vars - var_partition_features[cnt] = to_numpy(synchronizer, device, size_ratio, var_helper.is_sparse, bd, num_replicas) - partition_indice[cnt] = node_id - cnt += 1 - else: - synchronizer = getattr(node, node.WhichOneof('synchronizer')) - reduction_destination = getattr(synchronizer, 'reduction_destination', None) - device = _resolve_device_address(reduction_destination if reduction_destination else var.device, - device_resolver) - if device == '': - assert(isinstance(synchronizer, AllReduceSynchronizer)) - device = None - bd = min_network_bandwidth - num_replicas = 0 - else: - device = cpu_worker_list.index(device) - bd = network_bandwidth[device] - num_replicas = worker_num_replicas[device] - - if var_helper.is_sparse: - size_ratio = get_sparse_var_bits(np.prod(var_helper.shape))/total_size_vars - else: - size_ratio = get_dense_var_bits(np.prod(var_helper.shape), var_helper.dtype)/total_size_vars - var_partition_features[cnt] = to_numpy(synchronizer, device, size_ratio, var_helper.is_sparse, bd, num_replicas) - partition_indice[cnt] = node_id - cnt += 1 - return var_partition_features, partition_indice, np.array(node_id+1) - -def create_predefined_features(strategy, resource_spec, predefined_simulator): - - var_sync_time, vars, resource = predefined_simulator.predefined_sync_time(strategy, resource_spec) - - features = [] - for var_name, sync_time in var_sync_time.items(): - if isinstance(sync_time, list) or isinstance(sync_time, tuple): # send_time, receive_time in PS strategies. 
- transmission = sync_time[0]['transmission'] + sync_time[1]['transmission'] - sync_time = sync_time[0] - is_ps = True - else: # AR - transmission = sync_time['transmission'] - is_ps = False - - network_overhead = sync_time['network_overhead'] - gpu_kernel_memory_latency = sync_time['gpu_kernel_memory_latency'] - - feat = [transmission, network_overhead, gpu_kernel_memory_latency, float(is_ps)] - features.append(feat) - features = np.array(features, dtype=np.float) - return features - -class RankRNNSimulator(SimulatorBase): - """Simulates strategies for a given graph and resource spec.""" - - def __init__(self, - original_graph_item_path, - fetches=None, - batch_size=1, - seq_len=1, - checkpoint=None): - - super(RankRNNSimulator, self).__init__(original_graph_item_path=original_graph_item_path) - print("It's using RankNet simulator.") - self._fetches = fetches - self._batch_size_per_gpu = batch_size - self._seq_len = seq_len - self._checkpoint = checkpoint - self._predefined_simulator=PredefinedSimulator(original_graph_item_path=original_graph_item_path, - batch_size=self._batch_size_per_gpu, - seq_len=self._seq_len) - if self._checkpoint: - self._model = RankRNN().to(TORCH_DEVICE) - self._model.load_state_dict(torch.load(self._checkpoint, map_location=torch.device('cpu'))) - - def simulate(self, strategy, resource_spec, strategy_path=None, checkpoint=None): - cost = self.predict(strategy, resource_spec, strategy_path, checkpoint) - return cost - - def predict(self, - strategy, - resource_spec, - strategy_path=None, - checkpoint=None): - if checkpoint is None: - if self._checkpoint is None: - raise ValueError("checkpoint is None: {}".format(checkpoint)) - else: - model = self._model - else: - model = RankRNN().to(TORCH_DEVICE) - model.load_state_dict(torch.load(checkpoint)) - if strategy_path and os.path.isfile((strategy_path+'.npz').replace('strategies', 'npz')): - loaded = np.load((strategy_path+'.npz').replace('strategies', 'npz')) - var_partition_features, partition_indice, var_num, _ = \ - loaded['x1'], loaded['x2'], loaded['x3'], loaded['y'] - else: - var_partition_features, partition_indice, var_num = \ - connvert_feature(strategy, resource_spec, self._original_graph_item) - - if strategy_path and os.path.isfile((strategy_path+'_pdf.npz').replace('strategies', 'npz')): - loaded = np.load((strategy_path+'_pdf.npz').replace('strategies', 'npz')) - predefined_features = loaded['x4'] - else: - predefined_features = create_predefined_features(strategy, resource_spec, self._predefined_simulator) - - var_partition_features = np.concatenate([var_partition_features, np.concatenate([predefined_features, np.zeros((MAX_NUM_PARS-predefined_features.shape[0], predefined_features.shape[1]))], 0)], 1) - - var_partition_features = torch.from_numpy(var_partition_features).unsqueeze(0).to(TORCH_DEVICE) - partition_indice = torch.from_numpy(partition_indice).unsqueeze(0).to(TORCH_DEVICE) - var_num = torch.from_numpy(var_num).unsqueeze(0).to(TORCH_DEVICE) - - return model(var_partition_features, partition_indice, var_num).view(-1).data.cpu().numpy() - -class RankNetTrainer(): - - def __init__(self, - checkpoint=None, - batch_size_per_gpu=256, - seq_len=1, - seed=1): - self._batch_size_per_gpu = batch_size_per_gpu - self._seq_len = seq_len - self.graph_items = {k:GraphItem.deserialize(v) for k, v in GRAPH_ITEM_PATHS.items()} - self.predefined_simulators = {k: PredefinedSimulator(original_graph_item_path=v, - batch_size=self._batch_size_per_gpu, - seq_len=self._seq_len) for k, v in 
GRAPH_ITEM_PATHS.items()} - self.model = RankRNN().to(TORCH_DEVICE) - if checkpoint: - self.model.load_state_dict(torch.load(checkpoint)) - self.optimizer = torch.optim.Adam(self.model.parameters(), lr=LR, weight_decay=WD) - print("It's using RankNet trainer.") - - def train(self, path_list, train_patterns=[('ncf', 0)], valid_patterns='same', num_epochs=200): - - features = {k: [[[], [], [], []], [[], [], [], []]] for k, _ in GRAPH_ITEM_PATHS.items()} - for training_path in path_list: - for path in Path(training_path).rglob('strategies'): - strategy_paths = glob.glob(os.path.join(path, '*')) - for strategy_path in strategy_paths: - if 'json' in strategy_path or \ - 'bert_large_batch_8_orca_16_group_2/' in strategy_path: - continue - model = get_model(strategy_path) - if model is None: - if not ('densenets169' in strategy_path or 'densenets201' in strategy_path): - assert False, strategy_path - continue - rs_path = strategy_path.replace('strategies', 'resource_specs') - runtime_path = strategy_path.replace('strategies', 'runtimes') - npz_path = (strategy_path+'.npz').replace('strategies', 'npz') - if not os.path.isfile(rs_path): - rs_path += '.yml' - if not (os.path.isfile(rs_path) and os.path.isfile(runtime_path)): - continue - if not os.path.exists(os.path.dirname(npz_path)): - os.makedirs(os.path.dirname(npz_path)) - - if not os.path.isfile(npz_path): - strategy = Strategy.deserialize(path=strategy_path) - rs = ResourceSpec(resource_file=rs_path) - var_partition_features, partition_indice, var_num = \ - connvert_feature(strategy, rs, self.graph_items[model]) - label = np.array(json.load(open(runtime_path))['average']) - np.savez_compressed(npz_path, x1=var_partition_features, x2=partition_indice, x3=var_num, y=label) - else: - loaded = np.load(npz_path) - var_partition_features, partition_indice, var_num, label = \ - loaded['x1'], loaded['x2'], loaded['x3'], loaded['y'] - - if not os.path.isfile(npz_path.replace('.npz', '_pdf.npz')): - predefined_features = create_predefined_features(Strategy.deserialize(path=strategy_path), ResourceSpec(resource_file=rs_path), self.predefined_simulators[model]) - np.savez_compressed(npz_path.replace('.npz', '_pdf.npz'), x4=predefined_features) - else: - loaded = np.load(npz_path.replace('.npz', '_pdf.npz')) - predefined_features = loaded['x4'] - var_partition_features = np.concatenate([var_partition_features, np.concatenate([predefined_features, np.zeros((MAX_NUM_PARS-predefined_features.shape[0], predefined_features.shape[1]))], 0)], 1) - - is_aws = int('g3' in strategy_path or 'g4' in strategy_path or 'aws' in strategy_path or 'vgg_random_orca_11' in strategy_path) # comment here - print(model, 'orca' if is_aws == 0 else 'aws', strategy_path.split('/')[-3]) - features[model][is_aws][0].append(var_partition_features) - features[model][is_aws][1].append(partition_indice) - features[model][is_aws][2].append(var_num) - features[model][is_aws][3].append(label) - - for k, _ in GRAPH_ITEM_PATHS.items(): - for i1 in range(2): - for i2 in range(4): - if len(features[k][i1][i2]) > 1: - features[k][i1][i2] = np.stack(features[k][i1][i2]).astype(np.float16) - print(k, 'orca' if i1 == 0 else 'aws', features[k][i1][i2].shape) - else: - features[k][i1][i2] = None - - train_features = np.concatenate([features[model_][is_aws_][0] for model_, is_aws_ in train_patterns if features[model_][is_aws_][0] is not None], 0) - train_par_indices = np.concatenate([features[model_][is_aws_][1] for model_, is_aws_ in train_patterns if features[model_][is_aws_][1] is not None], 0) - 
train_var_nums = np.concatenate([features[model_][is_aws_][2] for model_, is_aws_ in train_patterns if features[model_][is_aws_][2] is not None], 0) - train_labels = np.concatenate([features[model_][is_aws_][3] for model_, is_aws_ in train_patterns if features[model_][is_aws_][3] is not None], 0) - - if type(valid_patterns) == str and valid_patterns == 'same': - permt = np.random.permutation(train_features.shape[0]) - split = int(len(permt) * 0.8) - val_features, val_par_indices, val_var_nums, val_labels = train_features[permt[split:]], train_par_indices[permt[split:]], train_var_nums[permt[split:]], train_labels[permt[split:]] - train_features, train_par_indices, train_var_nums, train_labels = train_features[permt[:split]], train_par_indices[permt[:split]], train_var_nums[permt[:split]], train_labels[permt[:split]] - else: - val_features = np.concatenate([features[model_][is_aws_][0] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][0] is not None], 0) - val_par_indices = np.concatenate([features[model_][is_aws_][1] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][1] is not None], 0) - val_var_nums = np.concatenate([features[model_][is_aws_][2] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][2] is not None], 0) - val_labels = np.concatenate([features[model_][is_aws_][3] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][3] is not None], 0) - - # comment here - permt = np.random.permutation(val_features.shape[0]) - split = int(len(permt) * 0.7) - train_features, train_par_indices, train_var_nums, train_labels = np.concatenate([train_features, val_features[permt[:split]]], 0), np.concatenate([train_par_indices, val_par_indices[permt[:split]]], 0), np.concatenate([train_var_nums, val_var_nums[permt[:split]]], 0), np.concatenate([train_labels, val_labels[permt[:split]]], 0) - - val_features, val_par_indices, val_var_nums, val_labels = val_features[permt[split:]], val_par_indices[permt[split:]], val_var_nums[permt[split:]], val_labels[permt[split:]] - - print(train_features.shape, val_features.shape, train_features.max(), train_features.min(), val_features.max(), val_features.min()) - - ## train the model - trainset = TrainTensorDataset((torch.from_numpy(train_features).half().to(TORCH_DEVICE), torch.from_numpy(train_par_indices).half().to(TORCH_DEVICE), torch.from_numpy(train_var_nums).half().to(TORCH_DEVICE), torch.from_numpy(train_labels).half().to(TORCH_DEVICE))) - testset = torch.utils.data.TensorDataset(torch.from_numpy(val_features).half().to(TORCH_DEVICE), torch.from_numpy(val_par_indices).half().to(TORCH_DEVICE), torch.from_numpy(val_var_nums).half().to(TORCH_DEVICE), torch.from_numpy(val_labels).half().to(TORCH_DEVICE)) - trainloader = torch.utils.data.DataLoader(dataset=trainset, - batch_size=BATCH_SIZE, - shuffle=True) - testloader = torch.utils.data.DataLoader(dataset=testset, - batch_size=32, - shuffle=False) - best_val_acc = 0. - checkpoint_path = 'model_train_on_{}-{}_new.ckpt'.format(train_patterns[0][0], 'orca' if train_patterns[0][1] == 0 else 'aws') - for epoch in range(num_epochs): - if epoch == int(num_epochs*2./5. - 1): - for param_group in self.optimizer.param_groups: param_group['lr'] = 3e-4 - if epoch == int(num_epochs*4./5. 
- 1): - for param_group in self.optimizer.param_groups: param_group['lr'] = 1e-4 - - labels = [] - outputs = [] - for i, (features_b, par_indices_b, var_nums_b, labels_b) in enumerate(trainloader): - - # Forward pass - outputs_b = self.model(features_b, par_indices_b, var_nums_b).squeeze() - - true_comp = (labels_b[:, None] > labels_b[None, :]).float() * 2 - 1 - pred_comp = outputs_b[:, None] - outputs_b[None, :] - loss = (1 - true_comp) * pred_comp / 2 + torch.nn.functional.softplus(-pred_comp) - loss = loss.tril(-1).mean() - - # Backward and optimize - self.optimizer.zero_grad() - loss.backward() - torch.nn.utils.clip_grad_norm_(self.model.stem_rnn.parameters(), 0.25) - self.optimizer.step() - - outputs.append(outputs_b) - labels.append(labels_b) - - labels = torch.cat(labels) - outputs = torch.cat(outputs) - true_comp = (labels[:, None] > labels[None, :]) - pred_comp = (outputs[:, None] > outputs[None, :]) - equal = (true_comp == pred_comp).int() - train_acc = equal.tril(-1).sum() * 2. /float(equal.shape[0])/(float(equal.shape[0]) - 1) - - with torch.no_grad(): - labels = [] - outputs = [] - for features_b, par_indices_b, var_nums_b, labels_b in testloader: - - # Forward pass - outputs_b = self.model(features_b, par_indices_b, var_nums_b).squeeze() - outputs.append(outputs_b) - labels.append(labels_b) - - labels = torch.cat(labels) - outputs = torch.cat(outputs) - true_comp = (labels[:, None] > labels[None, :]) - pred_comp = (outputs[:, None] > outputs[None, :]) - equal = (true_comp == pred_comp).int() - acc = equal.tril(-1).sum() * 2. /float(equal.shape[0])/(float(equal.shape[0]) - 1) - if acc.item() > best_val_acc: - best_val_acc = acc.item() - torch.save(self.model.state_dict(), checkpoint_path) - print('Saved model to {}'.format(checkpoint_path)) - print('Epoch: {}, training acc: {:.4f}, test acc: {:.4f}, best acc: {:.4f}'.format(epoch, train_acc.item(), acc.item(), best_val_acc)) - return checkpoint_path - - -if __name__ == '__main__': - - trainer = RankNetTrainer() - checkpoint_path = trainer.train( - [ - # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-11-20', - # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-9-20', - # '/users/hzhang2/oceanus_cost_model_training_data/ncf', - # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-13-20', - # '/users/hzhang2/oceanus_cost_model_training_data/densenet', - # '/users/hzhang2/oceanus_cost_model_training_data/inceptionv3', - # '/users/hzhang2/oceanus_cost_model_training_data/resnet101', - # '/users/hzhang2/oceanus_cost_model_training_data/resnet50', - '/users/hzhang2/oceanus_cost_model_training_data/vgg16', - # '/users/hzhang2/oceanus_cost_model_training_data/bert', - ], - [ - # ('ncf', 0), #('ncf', 1), - # ('densenet121', 0), ('densenet121', 1), - # ('inceptionv3', 0), ('inceptionv3', 1), - # ('resnet101', 0), ('resnet101', 1), - # ('resnet50', 0), ('resnet50', 1), - # ('bert_12l', 0), ('bert_12l', 1), - # ('bert_6l', 0), ('bert_6l', 1), - # ('bert_3l', 0), ('bert_3l', 1), - # ('bert_large', 0), ('bert_large', 1), - ('vgg16', 0), #('vgg16', 1), - ], - [('vgg16', 1)], - num_epochs=200) - # checkpoint_path = 'model_train_on_vgg16-orca.ckpt' - test_list = [ - '/users/hzhang2/oceanus_cost_model_training_data/vgg16/vgg16_orca_15', - '/users/hzhang2/oceanus_cost_model_training_data/vgg16/vgg_random_orca_11', #TARGET: 0.9 - # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-13-20/ncf_large_adam_random_search_aws_4', - # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_12_orca_16', - # 
'/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16', - # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_3', - # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_4', - # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_5', - ] - - for data_folder in test_list: - simulator = RankRNNSimulator(GRAPH_ITEM_PATHS[get_model(data_folder)], - batch_size=256, - seq_len=1, - checkpoint=checkpoint_path) - - runtimes_folder = os.path.join(data_folder, 'runtimes') - results = {} - averages= [] - scores = [] - for name in os.listdir(runtimes_folder): - strategy_path = os.path.join(data_folder, 'strategies', name) - rs_path = os.path.join(data_folder, 'resource_specs', name ) - if not os.path.isfile(rs_path): - rs_path += '.yml' - runtime_path = os.path.join(runtimes_folder, name) - - with open(runtime_path, 'r') as f: - runtimes = json.load(f) - average = np.array(runtimes['average']) - - s = Strategy.deserialize(strategy_path) - rs = ResourceSpec(resource_file=rs_path) - score = simulator.simulate(s, rs, strategy_path) - - results[name] = (average, score) - averages.append(average) - scores.append(score) - - # sorted_by_runtime = {k: v for k, v in sorted(results.items(), key=lambda item: item[1][0])} - # # sorted_by_scores = {k: v for k, v in sorted(res.items(), key=lambda item: item[1][1])} - # # sorted_by_latency = {k: v for k, v in sorted(res.items(), key=lambda item: item[1][2])} - # print('Sorted by runtime.......................') - # for _, (rt, prediction) in sorted_by_runtime.items(): - # print('runtime {} prediction {}'.format(rt, prediction)) - - y_train = np.array(averages) - test_score = np.array(scores) - true_comp = (y_train.ravel()[:, None] > y_train.ravel()[None, :]) - pred_comp = (test_score.ravel()[:, None] > test_score.ravel()[None, :]) - equal = (true_comp == pred_comp).astype(np.int) - test_acc = np.tril(equal, -1).sum() * 2. 
/ float(equal.shape[0]) / (float(equal.shape[0]) - 1) - - print('Test {} on {}, acc {:.4f}'.format(checkpoint_path, data_folder.split('/')[-1], test_acc)) diff --git a/autodist/simulator/models/rankrnn_simulator_penalty.py b/autodist/simulator/models/rankrnn_simulator_penalty.py deleted file mode 100644 index 380fa10..0000000 --- a/autodist/simulator/models/rankrnn_simulator_penalty.py +++ /dev/null @@ -1,729 +0,0 @@ -"""Strategy RankNetSimulator.""" -import glob -import json -import sys -from datetime import datetime -from pathlib import Path -from string import digits - -import numpy as np -import os -import tensorflow as tf -tf.compat.v1.disable_eager_execution() - -import arion -from arion.graph_item import GraphItem -from arion.proto.synchronizers_pb2 import PSSynchronizer, AllReduceSynchronizer -from arion.simulator.models.base import SimulatorBase -from arion.simulator.utils import get_dense_var_bits, get_sparse_var_bits, GIGABITS -from arion.simulator.utils import _resolve_device_address, _max_num_local_replica, _num_local_replica -from arion.strategy.random_sample_strategy import VariableHelper, PartHelper -from arion.strategy.base import Strategy -from arion.resource_spec import ResourceSpec -from arion.cluster import SSHCluster -from arion.kernel.device.resolver import DeviceResolver -from arion.kernel.partitioner import PartitionerConfig -from arion.simulator.models.predefined_simulator import PredefinedSimulator - -import torch -import torch.nn as nn - -TORCH_DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - -# feature settings -MAX_NUM_WORKERS = 16 -MAX_NUM_GROUPS = 600 -MAX_NUM_VARS = 500 -MAX_NUM_PARS = 1500 -FEATURE_SIZE = MAX_NUM_WORKERS+MAX_NUM_GROUPS+15 - -# model size -PARTITION_MLP_HIDDEN = 128 -PARTITION_MLP_OUT = 32 -STEM_RNN_HIDDEN = 128 -BIDIECTIONAL = True -BATCH_SIZE = 96 - -NUM_RNN_LAYERS = 3 -SCORE_TH = 0.005 -LR = 2e-3 -WD = 3e-4 -DATA_AUG = False -IN_LAYERS = 2 -OUT_LAYERS = 1 - -# ncf used: -# ~/projects/pycharm/zhijie/5-9-2020/oceanus-zhijie/arion/simulator/models/model_train_on_ncf-orca_new.ckpt 0.9020 -# noaug -# PARTITION_MLP_HIDDEN = 128 -# PARTITION_MLP_OUT = 32 -# STEM_RNN_HIDDEN = 128 -# BIDIECTIONAL = True -# NUM_RNN_LAYERS = 4 -# BATCH_SIZE = 64 -# LR = 1e-3 -# WD = 4e-4 - -# vgg used: -# ~/projects/pycharm/zhijie/5-9-2020/oceanus-zhijie/arion/simulator/models/model_train_on_vgg16-orca_new_new_new.ckpt 0.8374 -# noaug -# PARTITION_MLP_HIDDEN = 128 -# PARTITION_MLP_OUT = 32 -# STEM_RNN_HIDDEN = 128 -# BIDIECTIONAL = True -# NUM_RNN_LAYERS = 3 -# BATCH_SIZE = 64 -# LR = 1e-3 -# WD = 3e-4 - -GRAPH_ITEM_PATHS = {'ncf':'/users/hzhang2/projects/pycharm/zhijie/5-5-2020/original_graph_item', - 'densenet121': '/users/hzhang2/projects/pycharm/zhijie/graph_items/densenet121_original_graph_item', - 'inceptionv3': '/users/hzhang2/projects/pycharm/zhijie/graph_items/inceptionv3_original_graph_item', - 'resnet101': '/users/hzhang2/projects/pycharm/zhijie/graph_items/resnet101_original_graph_item', - 'resnet50': '/users/hzhang2/projects/pycharm/zhijie/graph_items/resnet50_original_graph_item', - 'vgg16': '/users/hzhang2/projects/pycharm/zhijie/graph_items/vgg16_original_graph_item', - 'bert_12l': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_12l', - 'bert_6l': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_6l', - 'bert_3l': '/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_3l', - 'bert_large': 
'/users/hzhang2/projects/pycharm/zhijie/graph_items/bert_original_graph_item_large'} - -def get_model(path_): - if 'densenet121' in path_: - return 'densenet121' - elif 'ncf' in path_: - return 'ncf' - elif 'inceptionv3' in path_: - return 'inceptionv3' - elif 'resnet101' in path_: - return 'resnet101' - elif 'resnet50' in path_: - return 'resnet50' - elif 'vgg16' in path_: - return 'vgg16' - elif 'bert' in path_ and '12l' in path_: - return 'bert_12l' - elif 'bert' in path_ and '6l' in path_: - return 'bert_6l' - elif 'bert' in path_ and '3l' in path_: - return 'bert_3l' - elif 'bert' in path_ and 'large' in path_: - return 'bert_large' - else: - return None - -class RankRNN(nn.Module): - def __init__(self, input_size=FEATURE_SIZE, - partition_mlp_hidden=PARTITION_MLP_HIDDEN, - partition_mlp_out=PARTITION_MLP_OUT, - stem_rnn_hidden=STEM_RNN_HIDDEN, - num_rnn_layers=NUM_RNN_LAYERS, - in_layers=IN_LAYERS, - out_layers=OUT_LAYERS, - bidirectional=BIDIECTIONAL): - super(RankRNN, self).__init__() - self.partition_mlp_out = partition_mlp_out - # self.num_rnn_layers = num_rnn_layers - self.stem_rnn_hidden = stem_rnn_hidden - tmp = [nn.Linear(input_size, partition_mlp_hidden)] - for _ in range(in_layers-2): - tmp.append(nn.ReLU()) - tmp.append(nn.Linear(partition_mlp_hidden, partition_mlp_hidden)) - tmp.append(nn.ReLU()) - tmp.append(nn.Linear(partition_mlp_hidden, partition_mlp_out)) - - self.partition_mlp = nn.Sequential(*tmp) - - self.stem_rnn = nn.LSTM(partition_mlp_out, stem_rnn_hidden, num_rnn_layers, batch_first=True, bidirectional=bidirectional) - - if out_layers == 1: - self.final_fc = nn.Linear(stem_rnn_hidden*num_rnn_layers*(1+int(bidirectional)), 1) - elif out_layers == 2: - self.final_fc = nn.Sequential(nn.Linear(stem_rnn_hidden*num_rnn_layers*(1+int(bidirectional)), 128), - nn.ReLU(), - nn.Linear(128, 1)) - - self.relu = nn.ReLU() - - def forward(self, features, par_indices, var_nums, return_feature=False): - - x = features.float() - # x = torch.cat([features[:, :, :MAX_NUM_WORKERS], features[:, :, MAX_NUM_WORKERS+MAX_NUM_GROUPS:]], 2).float() - x = self.partition_mlp(x) - - x1 = torch.zeros(features.shape[0], MAX_NUM_VARS, self.partition_mlp_out, device=TORCH_DEVICE, dtype=x.dtype) - x1.scatter_add_(1, par_indices.long()[:, :, None].expand(par_indices.shape[0], par_indices.shape[1], self.partition_mlp_out), x) - - # Set initial hidden and cell states - # h0 = torch.zeros(self.num_rnn_layers, x.size(0), self.stem_rnn_hidden).to(TORCH_DEVICE) - # c0 = torch.zeros(self.num_rnn_layers, x.size(0), self.stem_rnn_hidden).to(TORCH_DEVICE) - - # Forward propagate LSTM - x1 = torch.nn.utils.rnn.pack_padded_sequence(x1, var_nums.long(), batch_first=True, enforce_sorted=False) - out, (ht, ct) = self.stem_rnn(x1) # out: tensor of shape (batch_size, seq_length, hidden_size) - - # out = torch.nn.utils.rnn.pad_packed_sequence(out, batch_first=True)[0].sum(1) / var_nums[:, None] - out = ht.permute(1, 0, 2).reshape(x.shape[0], -1) - # print(out[0, var_nums[0] -1, [3]], out[0, var_nums[0], [3]]) - # print(ht.permute(1, 0, 2).shape, x.shape) - if return_feature: - return self.final_fc(out), out.div((out**2).sum(1, keepdim=True).sqrt()) - else: - return self.final_fc(out) - -class TrainTensorDataset(torch.utils.data.Dataset): - """TensorDataset with support of transforms. 
- """ - def __init__(self, tensors): - assert all(tensors[0].size(0) == tensor.size(0) for tensor in tensors) - self.tensors = tensors - - def __getitem__(self, index): - x = self.tensors[0][index] - x = self.perturbe_device_and_group(x) - x1 = self.tensors[1][index] - x2 = self.tensors[2][index] - - y = self.tensors[3][index] - - return x, x1, x2, y - - def __len__(self): - return self.tensors[0].size(0) - - def perturbe_device_and_group(self, x): - if DATA_AUG: - perturbed_device_ids = np.random.permutation(MAX_NUM_WORKERS).astype(np.int32) - perturbed_group_ids = np.random.permutation(MAX_NUM_GROUPS).astype(np.int32) - mat_device = torch.eye(MAX_NUM_WORKERS, device=x.device, dtype=x.dtype)[perturbed_device_ids] - mat_group = torch.eye(MAX_NUM_GROUPS, device=x.device, dtype=x.dtype)[perturbed_group_ids] - x = torch.cat([torch.matmul(x[:, :MAX_NUM_WORKERS], mat_device), torch.matmul(x[:, MAX_NUM_WORKERS:MAX_NUM_WORKERS+MAX_NUM_GROUPS], mat_group), x[:, MAX_NUM_WORKERS+MAX_NUM_GROUPS:]], 1) - return x - - -def to_numpy(synchronizer, device, size_ratio, is_sparse, bd, num_replicas): - ret = [np.zeros(MAX_NUM_WORKERS), np.zeros(MAX_NUM_GROUPS), np.zeros(3), np.zeros(5), np.zeros(3)] - - if device is not None: - ret[0][device] = 1 - - group = getattr(synchronizer, 'group', None) - if group is not None: - assert group < MAX_NUM_GROUPS, group - ret[1][group] = 1 - - compressor = getattr(synchronizer, 'compressor', None) - if compressor is not None: - if compressor in ["PowerSGDCompressor", 3]: - ret[2][2] = 1 - elif compressor in ["HorovodCompressorEF", "HorovodCompressor", 2, 1]: - ret[2][1] = 1 - elif compressor in ["NoneCompressor", 0]: - ret[2][0] = 1 - else: - raise ValueError('Compressor does not exist: {}'.format(compressor)) - - local_replication = getattr(synchronizer, 'local_replication', None) - if isinstance(synchronizer, PSSynchronizer): - synchronizer = 0 - if int(local_replication) == 0: - if int(is_sparse) == 0: - ret[3][0] = 1 - else: - ret[3][1] = 1 - else: - if int(is_sparse) == 0: - ret[3][2] = 1 - else: - ret[3][3] = 1 - else: - ret[3][4] = 1 - ret[4] = np.array([size_ratio, bd, num_replicas]) - - return np.concatenate(ret) - -def connvert_feature(strategy, resource_spec, graph_item): - - cluster = SSHCluster(resource_spec) - device_resolver = DeviceResolver(cluster) - graph_replicas = [_resolve_device_address(k, device_resolver) for k, v in resource_spec.gpu_devices] - # bandwidth - network_bandwidth = np.array([resource_spec.network_bandwidth[device.split(':')[0]] for device, _ in resource_spec.cpu_devices]) - network_bandwidth = network_bandwidth - min_network_bandwidth = network_bandwidth.min() - # Other information - cpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.cpu_devices] - gpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.gpu_devices] - max_num_local_replica = _max_num_local_replica(graph_replicas, cluster) - total_num_local_replica = len(graph_replicas) - worker_num_replicas = [_num_local_replica(cpu_worker, graph_replicas, cluster) for cpu_worker in cpu_worker_list] - - num_vars = 0 - total_size_vars = 0 - for var_op, var in graph_item.trainable_var_op_to_var.items(): - num_vars += 1 - if var.initial_value.shape.ndims: - var_helper = VariableHelper(var, graph_item) - if var_helper.is_sparse: - total_size_vars += get_sparse_var_bits(np.prod(var_helper.shape)) - else: - total_size_vars += get_dense_var_bits(np.prod(var_helper.shape), var.dtype) - assert num_vars < 
MAX_NUM_VARS, num_vars - var_partition_features = np.zeros((MAX_NUM_PARS, FEATURE_SIZE-4)).astype(np.float32) - partition_indice = np.ones(MAX_NUM_PARS).astype(np.float32) * (MAX_NUM_VARS - 1) - - cnt = 0 - for node_id, node in enumerate(strategy.node_config): - var_name = node.var_name - for var_op, var in graph_item.trainable_var_op_to_var.items(): - if var.name == var_name: - break - var_helper = VariableHelper(var, graph_item) - - if node.partitioner: - pc = PartitionerConfig(partition_str=node.partitioner) - for i, part in enumerate(node.part_config): - part_helper = PartHelper(i, var, pc) - synchronizer = getattr(part, part.WhichOneof('synchronizer')) - reduction_destination = getattr(synchronizer, 'reduction_destination', None) - device = _resolve_device_address(reduction_destination if reduction_destination else var.device, - device_resolver) - if device == '': - assert(isinstance(synchronizer, AllReduceSynchronizer)) - device = None - bd = min_network_bandwidth - num_replicas = 0 - else: - device = cpu_worker_list.index(device) - bd = network_bandwidth[device] - num_replicas = worker_num_replicas[device] - - if var_helper.is_sparse: - size_ratio = get_sparse_var_bits(np.prod(part_helper.shape))/total_size_vars - else: - size_ratio = get_dense_var_bits(np.prod(part_helper.shape), var_helper.dtype)/total_size_vars - var_partition_features[cnt] = to_numpy(synchronizer, device, size_ratio, var_helper.is_sparse, bd, num_replicas) - partition_indice[cnt] = node_id - cnt += 1 - else: - synchronizer = getattr(node, node.WhichOneof('synchronizer')) - reduction_destination = getattr(synchronizer, 'reduction_destination', None) - device = _resolve_device_address(reduction_destination if reduction_destination else var.device, - device_resolver) - if device == '': - assert(isinstance(synchronizer, AllReduceSynchronizer)) - device = None - bd = min_network_bandwidth - num_replicas = 0 - else: - device = cpu_worker_list.index(device) - bd = network_bandwidth[device] - num_replicas = worker_num_replicas[device] - - if var_helper.is_sparse: - size_ratio = get_sparse_var_bits(np.prod(var_helper.shape))/total_size_vars - else: - size_ratio = get_dense_var_bits(np.prod(var_helper.shape), var_helper.dtype)/total_size_vars - var_partition_features[cnt] = to_numpy(synchronizer, device, size_ratio, var_helper.is_sparse, bd, num_replicas) - partition_indice[cnt] = node_id - cnt += 1 - return var_partition_features, partition_indice, np.array(node_id+1) - -def create_predefined_features(strategy, resource_spec, predefined_simulator): - - var_sync_time, vars, resource = predefined_simulator.predefined_sync_time(strategy, resource_spec) - - features = [] - for var_name, sync_time in var_sync_time.items(): - if isinstance(sync_time, list) or isinstance(sync_time, tuple): # send_time, receive_time in PS strategies. 
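# A minimal illustrative sketch (toy values; the helper name _to_feature_row is hypothetical)
# of the flattening performed below: a PS entry arrives as a (send_time, receive_time) pair of
# dicts whose 'transmission' terms are summed, an AllReduce entry is a single dict, and each
# becomes one row [transmission, network_overhead, gpu_kernel_memory_latency, is_ps].
#
#     def _to_feature_row(entry):
#         if isinstance(entry, (list, tuple)):   # PS: sum send + receive transmission
#             transmission = entry[0]['transmission'] + entry[1]['transmission']
#             head, is_ps = entry[0], True
#         else:                                  # AllReduce: single dict
#             transmission, head, is_ps = entry['transmission'], entry, False
#         return [transmission, head['network_overhead'],
#                 head['gpu_kernel_memory_latency'], float(is_ps)]
#
#     ps_entry = ({'transmission': 0.8, 'network_overhead': 4, 'gpu_kernel_memory_latency': 2},
#                 {'transmission': 0.5, 'network_overhead': 4, 'gpu_kernel_memory_latency': 2})
#     ar_entry = {'transmission': 0.3, 'network_overhead': 1, 'gpu_kernel_memory_latency': 2}
#     _to_feature_row(ps_entry)   # -> [1.3, 4, 2, 1.0]
#     _to_feature_row(ar_entry)   # -> [0.3, 1, 2, 0.0]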
- transmission = sync_time[0]['transmission'] + sync_time[1]['transmission'] - sync_time = sync_time[0] - is_ps = True - else: # AR - transmission = sync_time['transmission'] - is_ps = False - - network_overhead = sync_time['network_overhead'] - gpu_kernel_memory_latency = sync_time['gpu_kernel_memory_latency'] - - feat = [transmission, network_overhead, gpu_kernel_memory_latency, float(is_ps)] - features.append(feat) - features = np.array(features, dtype=np.float) - return features - -class RankRNNSimulatorPenalty(SimulatorBase): - """Simulates strategies for a given graph and resource spec.""" - - def __init__(self, - original_graph_item_path, - num_rnn_layers, - in_layers, - out_layers, - fetches=None, - batch_size=1, - seq_len=1, - checkpoint=None): - - super(RankRNNSimulatorPenalty, self).__init__(original_graph_item_path=original_graph_item_path) - print("It's using RankNet simulator.") - self._fetches = fetches - self._batch_size_per_gpu = batch_size - self._seq_len = seq_len - self._checkpoint = checkpoint - self._predefined_simulator=PredefinedSimulator(original_graph_item_path=original_graph_item_path, - batch_size=self._batch_size_per_gpu, - seq_len=self._seq_len) - if self._checkpoint: - self._model = RankRNN(num_rnn_layers=num_rnn_layers, in_layers=in_layers, out_layers=out_layers).to(TORCH_DEVICE) - self._model.load_state_dict(torch.load(self._checkpoint)) - - def simulate(self, strategy, resource_spec, strategy_path=None, checkpoint=None): - score, feature = self.predict(strategy, resource_spec, strategy_path, checkpoint) - return score.view(-1).data.cpu().numpy(), feature.view(-1).data.cpu().numpy() - - - def predict(self, - strategy, - resource_spec, - strategy_path=None, - checkpoint=None): - if checkpoint is None: - if self._checkpoint is None: - raise ValueError("checkpoint is None: {}".format(checkpoint)) - else: - model = self._model - else: - model = RankRNN().to(TORCH_DEVICE) - model.load_state_dict(torch.load(checkpoint)) - if strategy_path and os.path.isfile((strategy_path+'.npz').replace('strategies', 'npz')): - loaded = np.load((strategy_path+'.npz').replace('strategies', 'npz')) - var_partition_features, partition_indice, var_num, _ = \ - loaded['x1'], loaded['x2'], loaded['x3'], loaded['y'] - else: - var_partition_features, partition_indice, var_num = \ - connvert_feature(strategy, resource_spec, self._original_graph_item) - - if strategy_path and os.path.isfile((strategy_path+'_pdf.npz').replace('strategies', 'npz')): - loaded = np.load((strategy_path+'_pdf.npz').replace('strategies', 'npz')) - predefined_features = loaded['x4'] - else: - predefined_features = create_predefined_features(strategy, resource_spec, self._predefined_simulator) - - var_partition_features = np.concatenate([var_partition_features, np.concatenate([predefined_features, np.zeros((MAX_NUM_PARS-predefined_features.shape[0], predefined_features.shape[1]))], 0)], 1) - - var_partition_features = torch.from_numpy(var_partition_features).unsqueeze(0).to(TORCH_DEVICE) - partition_indice = torch.from_numpy(partition_indice).unsqueeze(0).to(TORCH_DEVICE) - var_num = torch.from_numpy(var_num).unsqueeze(0).to(TORCH_DEVICE) - - return model(var_partition_features, partition_indice, var_num, True) - -class RankNetTrainer(): - - def __init__(self, - batch_size_per_gpu=256, - seq_len=1, - seed=1): - self._batch_size_per_gpu = batch_size_per_gpu - self._seq_len = seq_len - self.graph_items = {k:GraphItem.deserialize(v) for k, v in GRAPH_ITEM_PATHS.items()} - self.predefined_simulators = {k: 
PredefinedSimulator(original_graph_item_path=v, - batch_size=self._batch_size_per_gpu, - seq_len=self._seq_len) for k, v in GRAPH_ITEM_PATHS.items()} - self.best_acc = 0. - print("It's using RankNet trainer.") - - def load_data(self, path_list, train_patterns=[('ncf', 0)], valid_patterns='same'): - features = {k: [[[], [], [], []], [[], [], [], []]] for k, _ in GRAPH_ITEM_PATHS.items()} - for training_path in path_list: - for path in Path(training_path).rglob('strategies'): - strategy_paths = glob.glob(os.path.join(path, '*')) - # strategy_paths = np.random.permutation(list(strategy_paths)) - for strategy_path in strategy_paths: - if 'json' in strategy_path or \ - 'bert_large_batch_8_orca_16_group_2/' in strategy_path: - continue - model = get_model(strategy_path) - if model is None: - if not ('densenets169' in strategy_path or 'densenets201' in strategy_path): - assert False, strategy_path - continue - rs_path = strategy_path.replace('strategies', 'resource_specs') - runtime_path = strategy_path.replace('strategies', 'runtimes') - npz_path = (strategy_path+'.npz').replace('strategies', 'npz') - if not os.path.isfile(rs_path): - rs_path += '.yml' - if not (os.path.isfile(rs_path) and os.path.isfile(runtime_path)): - continue - if not os.path.exists(os.path.dirname(npz_path)): - os.makedirs(os.path.dirname(npz_path)) - - if not os.path.isfile(npz_path): - strategy = Strategy.deserialize(path=strategy_path) - rs = ResourceSpec(resource_file=rs_path) - var_partition_features, partition_indice, var_num = \ - connvert_feature(strategy, rs, self.graph_items[model]) - label = np.array(json.load(open(runtime_path))['average']) - np.savez_compressed(npz_path, x1=var_partition_features, x2=partition_indice, x3=var_num, y=label) - else: - loaded = np.load(npz_path) - var_partition_features, partition_indice, var_num, label = \ - loaded['x1'], loaded['x2'], loaded['x3'], loaded['y'] - - if not os.path.isfile(npz_path.replace('.npz', '_pdf.npz')): - predefined_features = create_predefined_features(Strategy.deserialize(path=strategy_path), ResourceSpec(resource_file=rs_path), self.predefined_simulators[model]) - np.savez_compressed(npz_path.replace('.npz', '_pdf.npz'), x4=predefined_features) - else: - loaded = np.load(npz_path.replace('.npz', '_pdf.npz')) - predefined_features = loaded['x4'] - var_partition_features = np.concatenate([var_partition_features, np.concatenate([predefined_features, np.zeros((MAX_NUM_PARS-predefined_features.shape[0], predefined_features.shape[1]))], 0)], 1) - - # is_aws = int('g3' in strategy_path or 'g4' in strategy_path or 'aws' in strategy_path) # comment here - is_aws = int('vgg16_orca_11_random_rejection-4_trial-100-_expolre-2000_0.83-model_embedding_sim-weight-1_max-par-40/' in strategy_path) - # print(model, 'orca' if is_aws == 0 else 'aws', strategy_path.split('/')[-3]) - features[model][is_aws][0].append(var_partition_features) - features[model][is_aws][1].append(partition_indice) - features[model][is_aws][2].append(var_num) - features[model][is_aws][3].append(label) - - for k, _ in GRAPH_ITEM_PATHS.items(): - for i1 in range(2): - for i2 in range(4): - if len(features[k][i1][i2]) > 1: - features[k][i1][i2] = np.stack(features[k][i1][i2]).astype(np.float16) - print(k, 'orca' if i1 == 0 else 'aws', features[k][i1][i2].shape) - else: - features[k][i1][i2] = None - - train_features = np.concatenate([features[model_][is_aws_][0] for model_, is_aws_ in train_patterns if features[model_][is_aws_][0] is not None], 0) - train_par_indices = 
np.concatenate([features[model_][is_aws_][1] for model_, is_aws_ in train_patterns if features[model_][is_aws_][1] is not None], 0) - train_var_nums = np.concatenate([features[model_][is_aws_][2] for model_, is_aws_ in train_patterns if features[model_][is_aws_][2] is not None], 0) - train_labels = np.concatenate([features[model_][is_aws_][3] for model_, is_aws_ in train_patterns if features[model_][is_aws_][3] is not None], 0) - - if type(valid_patterns[0]) == str and valid_patterns[0] == 'same': - rng = np.random.RandomState(1) - permt = rng.permutation(train_features.shape[0]) - split = int(len(permt) * 0.7) - val_features, val_par_indices, val_var_nums, val_labels = train_features[permt[split:]], train_par_indices[permt[split:]], train_var_nums[permt[split:]], train_labels[permt[split:]] - train_features, train_par_indices, train_var_nums, train_labels = train_features[permt[:split]], train_par_indices[permt[:split]], train_var_nums[permt[:split]], train_labels[permt[:split]] - else: - val_features = np.concatenate([features[model_][is_aws_][0] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][0] is not None], 0) - val_par_indices = np.concatenate([features[model_][is_aws_][1] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][1] is not None], 0) - val_var_nums = np.concatenate([features[model_][is_aws_][2] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][2] is not None], 0) - val_labels = np.concatenate([features[model_][is_aws_][3] for model_, is_aws_ in valid_patterns if features[model_][is_aws_][3] is not None], 0) - - # comment here - rng = np.random.RandomState(1) - permt = rng.permutation(val_features.shape[0]) - split = int(len(permt) * 0.7) - train_features, train_par_indices, train_var_nums, train_labels = np.concatenate([train_features, val_features[permt[:split]]], 0), np.concatenate([train_par_indices, val_par_indices[permt[:split]]], 0), np.concatenate([train_var_nums, val_var_nums[permt[:split]]], 0), np.concatenate([train_labels, val_labels[permt[:split]]], 0) - - val_features, val_par_indices, val_var_nums, val_labels = val_features[permt[split:]], val_par_indices[permt[split:]], val_var_nums[permt[split:]], val_labels[permt[split:]] - label_max = max(train_labels.max(), val_labels.max()) - label_min = min(train_labels.min(), val_labels.min()) - train_labels = (train_labels-label_min)/(label_max-label_min) - val_labels = (val_labels-label_min)/(label_max-label_min) - print(train_features.shape, val_features.shape, train_features.max(), train_features.min(), val_features.max(), val_features.min(), train_labels.max(), val_labels.min()) - - ## train the model - trainset = TrainTensorDataset((torch.from_numpy(train_features).half().to(TORCH_DEVICE), torch.from_numpy(train_par_indices).half().to(TORCH_DEVICE), torch.from_numpy(train_var_nums).half().to(TORCH_DEVICE), torch.from_numpy(train_labels).half().to(TORCH_DEVICE))) - testset = torch.utils.data.TensorDataset(torch.from_numpy(val_features).half().to(TORCH_DEVICE), torch.from_numpy(val_par_indices).half().to(TORCH_DEVICE), torch.from_numpy(val_var_nums).half().to(TORCH_DEVICE), torch.from_numpy(val_labels).half().to(TORCH_DEVICE)) - self.trainloader = torch.utils.data.DataLoader(dataset=trainset, - batch_size=BATCH_SIZE, - shuffle=True) - self.testloader = torch.utils.data.DataLoader(dataset=testset, - batch_size=32, - shuffle=False) - - def train(self, name='', num_epochs=200, checkpoint=None): - - checkpoint_path = 'model_on_{}.ckpt'.format(name) - print('LSTM 
layers: ', NUM_RNN_LAYERS, 'score th: ', SCORE_TH, 'lr: ', LR, 'wd: ', WD,'use data aug: ', DATA_AUG, 'OUT_LAYERS: ', OUT_LAYERS, 'IN_LAYERS: ',IN_LAYERS) - - np.random.seed(1) - torch.manual_seed(1) - torch.cuda.manual_seed_all(1) - model = RankRNN(num_rnn_layers=NUM_RNN_LAYERS, out_layers=OUT_LAYERS, in_layers=IN_LAYERS).to(TORCH_DEVICE) - if checkpoint: - model.load_state_dict(torch.load(checkpoint)) - optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=WD) - - best_val_acc = 0. - for epoch in range(num_epochs): - if epoch == int(num_epochs*2./5. - 1): - for param_group in optimizer.param_groups: param_group['lr'] = 3e-4 - if epoch == int(num_epochs*4./5. - 1): - for param_group in optimizer.param_groups: param_group['lr'] = 1e-4 - - labels = [] - outputs = [] - for i, (features_b, par_indices_b, var_nums_b, labels_b) in enumerate(self.trainloader): - - # Forward pass - outputs_b = model(features_b, par_indices_b, var_nums_b).squeeze() - - par_cnt = (par_indices_b.int() != MAX_NUM_VARS - 1).int().sum(1) - - true_comp = ( - (labels_b[:, None]+SCORE_TH>labels_b[None,:]).int()*(par_cnt[:, None] > par_cnt[None, :]).int() - + (labels_b[:, None]-SCORE_TH>labels_b[None,:]).int()*(par_cnt[:, None] < par_cnt[None, :]).int() - + (labels_b[:, None] > labels_b[None,:]).int() * (par_cnt[:, None] == par_cnt[None, :]).int() - ) > 0 - true_comp = true_comp.float() * 2 - 1 - pred_comp = outputs_b[:, None] - outputs_b[None, :] - loss = (1 - true_comp) * pred_comp / 2 + torch.nn.functional.softplus(-pred_comp) - loss = loss.tril(-1).mean() - - # Backward and optimize - optimizer.zero_grad() - loss.backward() - torch.nn.utils.clip_grad_norm_(model.stem_rnn.parameters(), 0.25) - optimizer.step() - - outputs.append(outputs_b) - labels.append(labels_b) - - labels = torch.cat(labels) - outputs = torch.cat(outputs) - true_comp = (labels[:, None] > labels[None, :]) - pred_comp = (outputs[:, None] > outputs[None, :]) - equal = (true_comp == pred_comp).int() - train_acc = equal.tril(-1).sum() * 2. /float(equal.shape[0])/(float(equal.shape[0]) - 1) - - with torch.no_grad(): - labels = [] - outputs = [] - for features_b, par_indices_b, var_nums_b, labels_b in self.testloader: - - # Forward pass - outputs_b = model(features_b, par_indices_b, var_nums_b).squeeze() - outputs.append(outputs_b) - labels.append(labels_b) - - labels = torch.cat(labels) - outputs = torch.cat(outputs) - true_comp = (labels[:, None] > labels[None, :]) - pred_comp = (outputs[:, None] > outputs[None, :]) - equal = (true_comp == pred_comp).int() - acc = equal.tril(-1).sum() * 2. 
/float(equal.shape[0])/(float(equal.shape[0]) - 1) - if acc.item() > best_val_acc: - best_val_acc = acc.item() - if best_val_acc > self.best_acc: - print('Saved model @ acc', best_val_acc) - torch.save(model.state_dict(), checkpoint_path) - self.best_acc = best_val_acc - # print('Saved model to {}'.format(checkpoint_path)) - if epoch == num_epochs - 1: - print('Epoch: {}, training acc: {:.4f}, test acc: {:.4f}, best acc: {:.4f}, overall best acc: {:.4f}'.format(epoch, train_acc.item(), acc.item(), best_val_acc, self.best_acc)) - return checkpoint_path - - -if __name__ == '__main__': - - if True: - trainer = RankNetTrainer() - trainer.load_data([ - '/users/hzhang2/oceanus_cost_model_training_data/vgg16', - # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-11-20', - # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-9-20', - # '/users/hzhang2/oceanus_cost_model_training_data/ncf', - # '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-13-20', - # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_12_orca_16', - # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16', - # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_3', - # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_4', - # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_5', - # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_random_orca_11', - # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert-aws/bert-large-aws4g4', - # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert-aws/bert_large_random_search_aws_4_ps_only', - # '/users/hzhang2/oceanus_cost_model_training_data/densenet', - # '/users/hzhang2/oceanus_cost_model_training_data/inceptionv3', - # '/users/hzhang2/oceanus_cost_model_training_data/resnet101', - # '/users/hzhang2/oceanus_cost_model_training_data/resnet50', - ], - [ - ('vgg16', 0), #('vgg16', 1), - # ('ncf', 0), #('ncf', 1), - # ('bert_large', 1), #('bert_large', 1), - # not used: - # ('densenet121', 0), ('densenet121', 1), - # ('inceptionv3', 0), ('inceptionv3', 1), - # ('resnet101', 0), ('resnet101', 1), - # ('resnet50', 0), ('resnet50', 1), - # ('bert_12l', 0), ('bert_12l', 1), - # ('bert_6l', 0), ('bert_6l', 1), - # ('bert_3l', 0), ('bert_3l', 1), - ], - [ - ('vgg16', 1), - # ('ncf', 1), - # ('bert_large', 1), - # 'same', - ], - ) - - for p2 in [0.01, 0.03]: - for p3 in [1e-3, 3e-3, 1e-4, 3e-4, 5e-3]: - for p4 in [1e-3, 2e-3, 1e-4, 3e-4, 5e-4, 5e-5]: - for p1 in [3, 4, 2]: - for p5 in [2, 3]: - for p6 in [1, 2]: - NUM_RNN_LAYERS, SCORE_TH, LR, WD, IN_LAYERS, OUT_LAYERS = p1, p2, p3, p4, p5, p6 - checkpoint_path = trainer.train(name='vgg-orca-validon-0.83-sim1', num_epochs=200) - exit() - else: - checkpoint_path = '/users/hzhang2/projects/pycharm/zhijie/5-9-2020/oceanus-zhijie/arion/simulator/models/model_on_vgg-orca.ckpt' - test_list = [ - # '/users/hzhang2/oceanus_cost_model_training_data/vgg16/vgg16_orca_15', - # '/users/hzhang2/oceanus_cost_model_training_data/vgg16/vgg_random_orca_11', #TARGET: 0.9 - '/users/hzhang2/oceanus_cost_model_training_data/ncf-5-13-20/ncf_large_adam_random_search_aws_4', - # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_12_orca_16', - # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16', - # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_3', - # 
'/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_4', - # '/users/hzhang2/oceanus_cost_model_training_data/bert/bert_large_batch_8_orca_16_group_5', - ] - - for data_folder in test_list: - simulator = RankRNNSimulator(GRAPH_ITEM_PATHS[get_model(data_folder)], - num_rnn_layers=3, - batch_size=256, - seq_len=1, - checkpoint=checkpoint_path) - - runtimes_folder = os.path.join(data_folder, 'runtimes') - results = {} - averages= [] - scores = [] - for name in os.listdir(runtimes_folder): - strategy_path = os.path.join(data_folder, 'strategies', name) - rs_path = os.path.join(data_folder, 'resource_specs', name ) - if not os.path.isfile(rs_path): - rs_path += '.yml' - runtime_path = os.path.join(runtimes_folder, name) - - with open(runtime_path, 'r') as f: - runtimes = json.load(f) - average = np.array(runtimes['average']) - - s = Strategy.deserialize(strategy_path) - rs = ResourceSpec(resource_file=rs_path) - score = simulator.simulate(s, rs, strategy_path) - - results[name] = (average, score) - averages.append(average) - scores.append(score) - - # sorted_by_runtime = {k: v for k, v in sorted(results.items(), key=lambda item: item[1][0])} - # # sorted_by_scores = {k: v for k, v in sorted(res.items(), key=lambda item: item[1][1])} - # # sorted_by_latency = {k: v for k, v in sorted(res.items(), key=lambda item: item[1][2])} - # print('Sorted by runtime.......................') - # for _, (rt, prediction) in sorted_by_runtime.items(): - # print('runtime {} prediction {}'.format(rt, prediction)) - - y_train = np.array(averages) - test_score = np.array(scores) - true_comp = (y_train.ravel()[:, None] > y_train.ravel()[None, :]) - pred_comp = (test_score.ravel()[:, None] > test_score.ravel()[None, :]) - equal = (true_comp == pred_comp).astype(np.int) - test_acc = np.tril(equal, -1).sum() * 2. / float(equal.shape[0]) / (float(equal.shape[0]) - 1) - - print('Test {} on {}, acc {:.4f}'.format(checkpoint_path, data_folder.split('/')[-1], test_acc)) diff --git a/autodist/simulator/predefined_simulator.py b/autodist/simulator/predefined_simulator.py new file mode 100644 index 0000000..91519c6 --- /dev/null +++ b/autodist/simulator/predefined_simulator.py @@ -0,0 +1,374 @@ +"""Strategy Simulator.""" + +import numpy as np +import json +import pickle as pkl + +import tensorflow as tf +from tensorflow.python.eager import context + +from arion.strategy.base import Strategy +from arion.resource_spec import ResourceSpec +from arion.proto.synchronizers_pb2 import PSSynchronizer, AllReduceSynchronizer +from arion.simulator.models.base import SimulatorBase +from arion.simulator.utils import _resolve_device_address, _resolved_devices_on_diff_machine, \ + get_dense_var_bits, get_sparse_var_bits + +class PredefinedSimulator(SimulatorBase): + """Simulates strategies for a given graph and resource spec.""" + + def __init__(self, + original_graph_item_path, + fetches=None, + batch_size=1, + seq_len=1, + get_coef=True, + checkpoint=None): + + super(PredefinedSimulator, self).__init__(original_graph_item_path=original_graph_item_path) + + print("It's using predefined simulator. 
batch_size_per_gpu is {}".format(batch_size)) + self._fetches = fetches + self._batch_size_per_gpu = batch_size + self._seq_len = seq_len + self._get_coef = get_coef + self._checkpoint = checkpoint + self._weights = None + with context.eager_mode(): + if self._checkpoint: + self._weights = self.load_checkpoint(self._checkpoint) + + def simulate(self, strategy: Strategy, resource_spec: ResourceSpec, checkpoint=None): + """Return simulated runtime value.""" + inputs = self.create_features(strategy, resource_spec) + with context.eager_mode(): + cost = self.inference(inputs, checkpoint) + return cost + + def inference(self, inputs, checkpoint=None): + if checkpoint is not None: + weights = self.load_checkpoint(checkpoint) + elif self._weights is not None: + weights = self._weights + else: + raise ValueError("No checkpoint provided in either initialization or inference.") + + if not isinstance(inputs, tf.Tensor): + inputs = tf.reshape(tf.convert_to_tensor(inputs), [1, len(inputs)]) + + if len(weights) == 4: + W0, b0, W, b = weights + inputs = tf.nn.elu(tf.matmul(inputs, W0) + b0) + cost = tf.matmul(inputs, W) + b + elif len(weights) == 2: + W, b = weights + cost = tf.matmul(inputs, W) + b + else: + raise ValueError + return cost + + def load_checkpoint(self, checkpoint=None): + if checkpoint is None: + if self._checkpoint is not None: + checkpoint = self._checkpoint + else: + raise ValueError("checkpoint is None: {}".format(checkpoint)) + self._weights = pkl.load(open(checkpoint, 'rb')) + # self._weights = json.load(open(checkpoint, 'r')) + print("Load checkpoint: ") + print(self._weights) + return self._weights + + def save_checkpoint(self, model, checkpoint): + pkl.dump(model, open(checkpoint, 'wb')) + self._checkpoint = checkpoint + self._weights = model + + def create_features_v0(self, strategy: Strategy, resource_spec: ResourceSpec): + var_sync_time, vars, resource = self.predefined_sync_time(strategy, resource_spec) + + # Add up sync time per device to find the slowest server time. 
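# Worked toy example (made-up numbers) of the aggregation below: suppose two PS servers end up
# with accumulated {transmission, network_overhead, gpu_kernel_memory_latency} totals of
# (1.2, 8, 4) and (0.6, 8, 4), and the AllReduce variables sum to (0.9, 3, 12). Then
#     max over PS servers -> [1.2, 8, 4]
#     sum over PS servers -> [1.8, 16, 8]
#     sum over AR vars    -> [0.9, 3, 12]
# and the returned feature vector is [1.2, 8, 4, 1.8, 16, 8, 0.9, 3, 12], which the linear or
# small-MLP weights loaded in inference() then map to a scalar runtime estimate.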
+ feature_keys = ['transmission', 'network_overhead', 'gpu_kernel_memory_latency'] + device_ps_sync_time = {} + var_ar_sync_time = {} + for var_name, sync_time in var_sync_time.items(): + if isinstance(vars[var_name].synchronizer, PSSynchronizer): + device = vars[var_name].device + if device not in device_ps_sync_time: + device_ps_sync_time[device] = {key: 0.0 for key in feature_keys} + for key in feature_keys: + device_ps_sync_time[device][key] += sync_time[0][key] + sync_time[1][key] + + else: # AllReduce + if var_name not in var_ar_sync_time: + var_ar_sync_time[var_name] = {key: 0.0 for key in feature_keys} + for key in feature_keys: + var_ar_sync_time[var_name][key] += sync_time[key] + + max_device_ps_sync_time = {key: 0.0 for key in feature_keys} + sum_device_ps_sync_time = {key: 0.0 for key in feature_keys} + sum_var_ar_sync_time = {key: 0.0 for key in feature_keys} + for key in feature_keys: + max_device_ps_sync_time[key] = max([d[key] for d in device_ps_sync_time.values()] or [0.0]) + sum_device_ps_sync_time[key] = sum([d[key] for d in device_ps_sync_time.values()] or [0.0]) + sum_var_ar_sync_time[key] = sum([d[key] for d in var_ar_sync_time.values()] or [0.0]) + + feat = [max_device_ps_sync_time[key] for key in feature_keys] \ + + [sum_device_ps_sync_time[key] for key in feature_keys] \ + + [sum_var_ar_sync_time[key] for key in feature_keys] + + return feat + + def create_features(self, strategy: Strategy, resource_spec: ResourceSpec): + # var_sync_time, vars, resource = self.predefined_sync_time(strategy, resource_spec) + + vars, resource = self.extract_pre_feature(strategy=strategy, resource_spec=resource_spec) + + feature_keys = ['transmission', 'network_overhead', 'gpu_kernel_memory_latency'] + device_ps_sync_time = {} + group_ar_sync_time = {} + + for var_name, var in vars.items(): + if isinstance(var.synchronizer, PSSynchronizer): + sync_time = self.var_ps_time(var, resource) + device = vars[var_name].device + if device not in device_ps_sync_time: + device_ps_sync_time[device] = {key: 0.0 for key in feature_keys} + for key in feature_keys: + device_ps_sync_time[device][key] += sync_time[0][key] + sync_time[1][key] + elif isinstance(var.synchronizer, AllReduceSynchronizer): + sync_time = self.var_ar_time(var, resource) + var_group = sync_time['group'] + if var_group not in group_ar_sync_time: + group_ar_sync_time[var_group] = {key: 0.0 for key in feature_keys} + for key in feature_keys: + group_ar_sync_time[var_group][key] += sync_time[key] + else: + raise ValueError('{}'.format(type(var.synchronizer))) + + max_device_ps_sync_time = {key: 0.0 for key in feature_keys} + sum_device_ps_sync_time = {key: 0.0 for key in feature_keys} + max_group_ar_sync_time = {key: 0.0 for key in feature_keys} + sum_group_ar_sync_time = {key: 0.0 for key in feature_keys} + for key in feature_keys: + max_device_ps_sync_time[key] = max([d[key] for d in device_ps_sync_time.values()] or [0.0]) + sum_device_ps_sync_time[key] = sum([d[key] for d in device_ps_sync_time.values()] or [0.0]) + max_group_ar_sync_time[key] = max([d[key] for d in group_ar_sync_time.values()] or [0.0]) + sum_group_ar_sync_time[key] = sum([d[key] for d in group_ar_sync_time.values()] or [0.0]) + + feat = [max_device_ps_sync_time[key] for key in feature_keys] \ + + [sum_device_ps_sync_time[key] for key in feature_keys] \ + + [max_group_ar_sync_time[key] for key in feature_keys] \ + + [sum_group_ar_sync_time[key] for key in feature_keys] + + return feat + + def predefined_sync_time(self, strategy, resource_spec): + """ 
graph_item: transformed graph item """ + vars, resource = self.extract_pre_feature(strategy=strategy, resource_spec=resource_spec) + # Compute synchronization time for every var + var_sync_time = {} + for var_name, var in vars.items(): + if isinstance(var.synchronizer, PSSynchronizer): + var_sync_time[var_name] = self.var_ps_time(var, resource) + elif isinstance(var.synchronizer, AllReduceSynchronizer): + var_sync_time[var_name] = self.var_ar_time(var, resource) + else: + raise ValueError('{}'.format(type(var.synchronizer))) + return var_sync_time, vars, resource + + def var_ps_time(self, var, resource, network_overhead=0.0, gpu_kernel_memory_latency=0.0): + """Compute synchronization time of a variable in PS strategy.""" + def _helper(worker_list, worker_num_replicas=None): + if worker_num_replicas is None: + worker_num_replicas = [1.0] * len(worker_list) + + this_server_time = 0 + # network transfer: sum up all workers time. equals to the time cost of this server. + # TODO(Hao): didn't consider any parallelization among partitions + for k, worker in enumerate(worker_list): + if _resolved_devices_on_diff_machine(var.device, worker): + if var.is_sparse: + this_worker_size = get_sparse_var_bits(var_size_to_transfer) * worker_num_replicas[k] + else: + this_worker_size = get_dense_var_bits(var_size_to_transfer, var.dtype) + this_server_time += this_worker_size / resource.network_bandwidth[var.device][worker] + + if self._get_coef: + return { + 'transmission': this_server_time, + 'network_overhead': len(worker_list), + 'gpu_kernel_memory_latency': resource.max_num_local_replica, + 'constant': 1.0, + # possible affecting factors. + 'var_name': var.name, + 'strategy': 'ps', + 'local_proxy': var.synchronizer.local_replication, + 'is_sparse': var.is_sparse, + 'size_to_transfer': var_size_to_transfer, + 'dtype': str(var.dtype), + # 'server_list': [partition.to_dict() for partition in server_list], + 'worker_list': worker_list, + 'cpu_worker_list': resource.cpu_worker_list, + 'gpu_worker_list': resource.gpu_worker_list, + 'worker_num_replicas': worker_num_replicas, + 'max_num_local_replica': resource.max_num_local_replica, + 'is_ps': True, + } + else: + return this_server_time + len(worker_list) * network_overhead + \ + gpu_kernel_memory_latency * resource.max_num_local_replica + + var_size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, + seq_len=self._seq_len) + + if var.is_sparse: + send_time = _helper(resource.cpu_worker_list, worker_num_replicas=resource.worker_num_replicas) + receive_time = _helper(resource.gpu_worker_list) + else: + send_time = _helper(resource.cpu_worker_list) + if var.synchronizer.local_replication: + receive_time = _helper(resource.cpu_worker_list) + else: + receive_time = _helper(resource.gpu_worker_list) + + return send_time, receive_time + + def var_ar_time(self, var, resource, network_overhead=0.0, gpu_kernel_memory_latency=0.0): + """Compute synchronization time of a variable in AR strategy.""" + worker_list = resource.cpu_worker_list + num_workers = len(worker_list) + min_bandwidth = None + for i in range(num_workers): + for j in range(i, num_workers): + if min_bandwidth is None: + min_bandwidth = resource.network_bandwidth[worker_list[j]][worker_list[i]] + else: + min_bandwidth = min(min_bandwidth, resource.network_bandwidth[worker_list[j]][worker_list[i]]) + + # Compressor + if var.compressor == "PowerSGDCompressor" or var.compressor == 3: + rank = 10 # currently using default value. So hardcode here. 
# todo: confirm + # assume var must be a dense variable. + og_shape = var.shape + ndims = len(og_shape) + if ndims <= 1: # no compress + size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, + seq_len=self._seq_len) + else: + if ndims > 2: + n = og_shape[0] + m = 1 + for s in og_shape[1:]: + m *= s # tensor's shape (n, m) + else: + n, m = og_shape[0], og_shape[1] + size_to_transfer = n * rank + m * rank + dtype = tf.float32 + elif var.compressor == "HorovodCompressorEF" or var.compressor == "HorovodCompressor" \ + or var.compressor == 2 or var.compressor == 1: + size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, + seq_len=self._seq_len) + dtype = tf.float32 + elif var.compressor == "NoneCompressor" or var.compressor == 0: + size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, + seq_len=self._seq_len) + dtype = var.dtype + else: + raise ValueError('Compressor does not exist: {}'.format(var.compressor)) + + # todo: chunk_size + # AllReduce communication time + # time = 2 * (num_workers - 1) * get_dense_var_bits(size_to_transfer, dtype) / (min_bandwidth * num_workers) + time = get_dense_var_bits(size_to_transfer, dtype) / min_bandwidth + + if self._get_coef: + return { + 'transmission': time, + 'network_overhead': 1, # len(worker_list), + 'gpu_kernel_memory_latency': resource.max_num_local_replica, + 'constant': 1.0, + # possible affecting factors. + 'var_name': var.name, + 'group': var.synchronizer.group, + 'strategy': 'allreduce', + 'is_sparse': False, + # 'chunk_size': chunk_size, + 'spec': 'NCCL', # default + 'compressor': var.compressor, + 'worker_list': worker_list, + 'num_workers': num_workers, + 'size_to_transfer': size_to_transfer, + 'dtype': str(dtype), + 'min_bandwidth': min_bandwidth, + 'max_num_local_replica': resource.max_num_local_replica, + 'is_ps': False, + } + else: + return time + network_overhead * len(worker_list) \ + + gpu_kernel_memory_latency * resource.max_num_local_replica + + + + # @staticmethod + # def var_ps_time(var_name, is_sparse, local_proxy, server_list, cpu_worker_list, gpu_worker_list, + # max_num_local_replica, worker_num_replicas, network_bandwidth, get_coef, + # network_overhead=0.0, gpu_kernel_memory_latency=0.0): + # """Compute synchrinzation time of a variable in PS strategy.""" + # + # def _helper(worker_list, worker_num_replicas=None): + # if worker_num_replicas is None: + # worker_num_replicas = [1.0] * len(worker_list) + # # Compute the slowest server + # slowest_server_time = 0 + # for j, server in enumerate(server_list): + # if server.size_to_transfer == 0: + # continue + # # network transfer: sum up all workers time. equals to the time cost of this server. + # this_server_time = 0 + # for k, worker in enumerate(worker_list): + # if _resolved_devices_on_diff_machine(server.device, worker): + # if is_sparse: + # this_worker_size = get_sparse_var_bits(server.size_to_transfer) * worker_num_replicas[k] + # else: + # this_worker_size = get_dense_var_bits(server.size_to_transfer, server.dtype) + # this_server_time += this_worker_size / network_bandwidth[server.device][worker] + # slowest_server_time = max(slowest_server_time, this_server_time) + # + # if get_coef: + # return { + # 'transmission': slowest_server_time, + # 'network_overhead': len(worker_list), + # 'gpu_kernel_memory_latency': max_num_local_replica, + # 'constant': 1.0, + # # possible affecting factors. 
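# Worked example (hypothetical shapes) for the PowerSGDCompressor branch of var_ar_time above:
# a dense (1024, 4096) variable with the hard-coded rank of 10 is approximated by factors of
# shapes (1024, 10) and (4096, 10), so size_to_transfer = 1024*10 + 4096*10 = 51200 float32
# elements instead of 1024*4096 = 4194304, roughly an 80x reduction. The transmission term is
# then get_dense_var_bits(51200, tf.float32) / min_bandwidth; the commented-out ring-allreduce
# variant would additionally scale it by 2 * (num_workers - 1) / num_workers, e.g. 1.5 for 4 workers.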
+ # 'var_name': var_name, + # 'strategy': 'ps', + # 'local_proxy': local_proxy, + # 'is_sparse': is_sparse, + # 'server_list': [partition.to_dict() for partition in server_list], + # 'worker_list': worker_list, + # 'cpu_worker_list': cpu_worker_list, + # 'gpu_worker_list': gpu_worker_list, + # 'worker_num_replicas': worker_num_replicas, + # 'max_num_local_replica': max_num_local_replica, + # } + # else: + # return slowest_server_time + len(worker_list) * network_overhead + \ + # gpu_kernel_memory_latency * max_num_local_replica + # + # if is_sparse: + # send_time = _helper(cpu_worker_list, worker_num_replicas=worker_num_replicas) + # receive_time = _helper(gpu_worker_list) + # else: + # send_time = _helper(cpu_worker_list) + # if local_proxy: + # receive_time = _helper(cpu_worker_list) + # else: + # receive_time = _helper(gpu_worker_list) + # + # if get_coef: + # # return {key: send_time[key]+receive_time[key] for key in send_time.keys()} + # return send_time, receive_time + # else: + # return send_time, receive_time diff --git a/autodist/simulator/models/rankrnn_simulator_penalty_fast.py b/autodist/simulator/rankrnn_simulator.py similarity index 100% rename from autodist/simulator/models/rankrnn_simulator_penalty_fast.py rename to autodist/simulator/rankrnn_simulator.py diff --git a/autodist/strategy/auto/ar_group_assigner.py b/autodist/strategy/auto/ar_group_assigner.py index c2d59b6..7a529d3 100644 --- a/autodist/strategy/auto/ar_group_assigner.py +++ b/autodist/strategy/auto/ar_group_assigner.py @@ -1,9 +1,35 @@ +# Copyright 2020 Petuum. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Collective group assigners.""" + from collections import OrderedDict import numpy as np def chunk_group_assigner(ar_shards, chunk_size=1): + """ + Assigner that determines the group following a chunk parameter. + + Args: + ar_shards: + chunk_size: + + Returns: + + """ assignments = {} for i, shard_name in enumerate(ar_shards): assignments[shard_name] = i // chunk_size @@ -54,4 +80,4 @@ def ordered_balanced_group_assigner(ar_shards, var_helpers, num_group): assignments[shard_name] = cur_bucket loads[cur_bucket] += var_helpers[shard_name].byte_size assert(len(ar_shards) == len(assignments)) - return assignments \ No newline at end of file + return assignments diff --git a/autodist/strategy/auto/auto_strategy.py b/autodist/strategy/auto/auto_strategy.py index e69de29..260c7be 100644 --- a/autodist/strategy/auto/auto_strategy.py +++ b/autodist/strategy/auto/auto_strategy.py @@ -0,0 +1,249 @@ +# Copyright 2020 Petuum. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""PS StrategyBuilder.""" + +from autodist.strategy.base import Strategy, StrategyBuilder +from autodist.proto import strategy_pb2 +from autodist.strategy.auto.strategy_sampler import RandomStrategySampler + + +class AutoStrategy(StrategyBuilder): + """ + AutoStrategy Builder. + + It generates a suitable Strategy based on graph_item and resource_spec following the AutoSync framework. + """ + + def __init__(self): + return + + def build(self, graph_item, resource_spec): + # TODO: merge the code in search and propose here. + return + + def search(self): + # candidates, scores, features = self.propose(self.search_params['num_candidate_explore']) + candidates, scores, features = self.batch_propose(self.search_params['num_candidate_explore']) + n_pick = self.search_params['num_candidate_per_trial'] + + # cast them to be np arrays + if self.search_params['diversity_metric'] == 'embedding': + picked_candidates = self.submodular_pick_by_embedding(np.array(scores), + candidates, + np.stack(features), + n_pick, + self.search_params['simulation_weight'], + self.search_params['diversity_weight']) + elif self.search_params['diversity_metric'] == 'expression': + picked_candidates = self.submodular_pick_by_expression(np.array(scores), + candidates, + n_pick, + self.search_params['simulation_weight'], + self.search_params['diversity_weight']) + else: + raise ValueError('Unrecognized diversity metric...') + if self.trial_run_fn: + self.trial_run(picked_candidates, search_iteration=0) + + def propose(self, num_proposal, use_simulator=True): + builder = RandomStrategy(self.space, self.heuristics) + candidates = [] + features = [] + scores = [] + # np.random.seed(1) + idx = 0 + + while len(candidates) < num_proposal: + logging.info('Sampling strategy {}'.format(idx)) + start_time = time.time() + expr = builder.build(self._original_graph_item, self._resource_spec) + elapsed = time.time() - start_time + logging.info('Sampling strategy takes {}'.format(elapsed)) + builder.reset() + idx += 1 + logging.info('Progress {}/{}'.format(len(candidates), num_proposal)) + if self.simulator and use_simulator: + start_time = time.time() + score, feature = self.simulator.simulate(expr, self._resource_spec) + elapsed = time.time() - start_time + logging.info('Inference strategy takes {}'.format(elapsed)) + if score > self.search_params['rejection_score']: + logging.info('strategy {} has score {} > {}, ' + 'rejected..'.format(idx, score, self.search_params['rejection_score'])) + continue + else: + candidates.append(expr) + features.append(feature) + scores.append(score[0]) + else: + candidates.append(expr) + features.append([]) + scores.append(0) + logging.info('rejection ratio: {}'.format(1 - num_proposal / float(idx))) + return candidates, scores, features + + def batch_propose(self, num_proposal, batch_size=32, use_simulator=True): + + builders = [RandomStrategy(self.space, self.heuristics) for _ in range(batch_size)] + graph_items = [self._original_graph_item for _ in range(batch_size)] + rss = [ResourceSpec(self.resource_file) for _ in range(batch_size)] + candidates = [] + features = [] + scores = [] + # np.random.seed(1) + idx = 0 + + while len(candidates) < num_proposal: + logging.info('Sampling strategy {}'.format(idx)) + start_time = time.time() + + q = Queue() + exprs = [] + prs = [] + for obj, arg1, arg2 in zip(builders, graph_items, rss): + prs.append(Process(target=build_worker, args=(q, obj, arg1, arg2))) 
+ prs[-1].start() + for pr in prs: + expr = q.get() # will block + exprs.append(expr) + for pr in prs: + pr.join() + + elapsed = time.time() - start_time + logging.info('Sampling strategy takes {}'.format(elapsed)) + for builder in builders: builder.reset() + logging.info('Progress {}/{}'.format(len(candidates), num_proposal)) + if self.simulator and use_simulator: + start_time = time.time() + batch_score, batch_feature = self.simulator.simulate(exprs, rss) + elapsed = time.time() - start_time + logging.info('Inference strategy takes {}'.format(elapsed)) + for ite, expr in enumerate(exprs): + # print(batch_score[ite], batch_feature[ite].shape) + if batch_score[ite] > self.search_params['rejection_score']: + logging.info('strategy {} has score {} > {}, ' + 'rejected..'.format(idx+ite, batch_score[ite], self.search_params['rejection_score'])) + else: + candidates.append(expr) + features.append(batch_feature[ite]) + scores.append(batch_score[ite]) + else: + for ite, expr in enumerate(exprs): + candidates.append(expr) + features.append([]) + scores.append(0) + idx += batch_size + logging.info('rejection ratio: {}'.format(1 - num_proposal / float(idx))) + return candidates[:num_proposal], scores[:num_proposal], features[:num_proposal] + + def submodular_pick_by_embedding(self, + scores, + candidates, + candidate_features, + n_pick, + beta=1.0, + alpha=1.0): + n = len(scores) + assert n == len(candidate_features) + + ret = [] + sim = np.dot(candidate_features, candidate_features.T) + remain = list(range(len(scores))) + + for _ in range(n_pick): + tmp_delta = -scores[remain] * beta + if len(ret) > 0: + tmp_delta -= alpha * (sim[remain, :][:, ret]).mean(1) + max_x = tmp_delta.argmax() + max_x = remain[max_x] + + ret.append(max_x) + remain.remove(max_x) + + return [candidates[i] for i in ret] + + def submodular_pick_by_expression(self, + scores, + candidates, + n_pick, + beta=1.0, + alpha=1.0): + + def remove_group_or_reduction_destination(strategy): + tmp_strategy = copy.deepcopy(strategy) + for node in tmp_strategy.node_config: + if node.partitioner: + for part in node.part_config: + synchronizer = getattr(part, part.WhichOneof('synchronizer')) + if hasattr(synchronizer, 'reduction_destination'): + synchronizer.reduction_destination = '' + else: + synchronizer.group = 0 + else: + synchronizer = getattr(node, node.WhichOneof('synchronizer')) + if hasattr(synchronizer, 'reduction_destination'): + synchronizer.reduction_destination = '' + else: + synchronizer.group = 0 + return tmp_strategy + + def estimate_difference(strategy, node_config_set): + score = 0 + for i, node in enumerate(strategy.node_config): + if_seen = False + for seen_node in node_config_set[i]: + if seen_node == node: + if_seen = True + break + if not if_seen: + score += 1 + return score + + assert len(scores) == len(candidates) + + node_config_set = [list() for _ in candidates[0].node_config] + remain = list(range(len(scores))) + ret = [] + for _ in range(n_pick): + max_x = -1 + max_delta = -1e9 + max_strategy_copy = None + + for x in remain: + tmp_strategy = remove_group_or_reduction_destination(candidates[x]) + diff_score = estimate_difference(tmp_strategy, node_config_set) + assert(diff_score <= len(tmp_strategy.node_config)) + # print('diff score {}..'.format(diff_score)) + tmp_delta = - scores[x] * beta + diff_score * alpha + if tmp_delta > max_delta: + max_delta, max_x, max_strategy_copy = tmp_delta, x, tmp_strategy + max_diff_score = diff_score *alpha + max_simulation_score= -scores[x] + + print('Add one candidate with max 
score: {}, {}, {}'.format(max_simulation_score, max_diff_score, max_delta)) + ret.append(max_x) + remain.remove(max_x) + + # update the node config set + for i, node in enumerate(max_strategy_copy.node_config): + if_seen = False + for seen_node in node_config_set[i]: + if seen_node == node: + if_seen = True + break + if not if_seen: + node_config_set[i].append(node) + + return [candidates[i] for i in ret] diff --git a/autodist/strategy/auto/ps_load_balancer.py b/autodist/strategy/auto/ps_load_balancer.py index dc770d8..55a3d6e 100644 --- a/autodist/strategy/auto/ps_load_balancer.py +++ b/autodist/strategy/auto/ps_load_balancer.py @@ -1,3 +1,19 @@ +# Copyright 2020 Petuum. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""PS load balancers.""" + from collections import OrderedDict import numpy as np @@ -9,7 +25,19 @@ def calcuate_entropy(loads): entropy = - np.sum(distribution * np.log2(distribution)) return entropy + def greedy_load_balancer(ps_shards, resource_spec, var_helpers, sort_by_size=False): + """ + A greedy load balancer that places the next largest load on the least loaded server. + Args: + ps_shards: + resource_spec: + var_helpers: + sort_by_size: + + Returns: + + """ # no randomness assignments = {} reduction_device_names = [k for k, _ in resource_spec.cpu_devices] @@ -27,7 +55,22 @@ def greedy_load_balancer(ps_shards, resource_spec, var_helpers, sort_by_size=Fal loads[destination] += var_helpers[shard_name].byte_size return assignments + def christy_load_balancer(ps_shards, resource_spec, var_helpers, sort_by_size=False): + """ + A randomized greedy load balancer. It places the variable by sampling from a multinomial distribution + correlated with their current load status -- node with least loads will have highest probability being + sampled. + + Args: + ps_shards: + resource_spec: + var_helpers: + sort_by_size: + + Returns: + + """ # Sample destination based on a distributed calculated based on loads and available bandwidth reduction_device_names = [k for k, _ in resource_spec.cpu_devices] loads = {ps: 0.0 for ps in reduction_device_names} @@ -64,4 +107,3 @@ def christy_load_balancer(ps_shards, resource_spec, var_helpers, sort_by_size=Fa # best_entropy = calcuate_entropy(balanced_loads) # print('entropy {} vs. max entropy {}'.format(entropy, best_entropy)) return assignments - From f74e650d8797afd1ade627fb1660e521ea9a2cf8 Mon Sep 17 00:00:00 2001 From: Hao Zhang Date: Thu, 16 Jul 2020 20:08:46 -0400 Subject: [PATCH 04/11] temporally remove op profiling code --- autodist/simulator/base.py | 135 +++++++------------------------------ 1 file changed, 24 insertions(+), 111 deletions(-) diff --git a/autodist/simulator/base.py b/autodist/simulator/base.py index 964302b..bac33d5 100644 --- a/autodist/simulator/base.py +++ b/autodist/simulator/base.py @@ -1,24 +1,31 @@ +# Copyright 2020 Petuum. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Strategy Simulator.""" -import time -from collections import defaultdict -import numpy as np -import tensorflow as tf -from tensorflow.python.client import timeline +from collections import defaultdict -from arion.simulator.utils import NUM_RUNS -from arion.cluster import SSHCluster -from arion.graph_item import GraphItem -from arion.kernel.device.resolver import DeviceResolver -from arion.kernel.partitioner import PartitionerConfig -from arion.proto.synchronizers_pb2 import AllReduceSynchronizer -from arion.resource_spec import ResourceSpec -from arion.strategy.base import Strategy -from arion.simulator.utils import _resolve_device_address, GIGABITS, _max_num_local_replica, _num_local_replica -from arion.strategy.random_sample_strategy import VariableHelper, PartHelper -from arion.simulator.utils import INFINITY +from autodist.cluster import SSHCluster +from autodist.graph_item import GraphItem +from autodist.kernel.device.resolver import DeviceResolver +from autodist.kernel.partitioner import PartitionerConfig +from autodist.resource_spec import ResourceSpec +from autodist.strategy.base import Strategy +from autodist.simulator.utils import _resolve_device_address, GIGABITS, _max_num_local_replica, _num_local_replica +from autodist.strategy.auto.strategy_sampler import VariableHelper, PartHelper +from autodist.simulator.utils import INFINITY -# tf.compat.v1.disable_eager_execution() class Var: def __init__(self, @@ -310,97 +317,3 @@ def min_bandwitdh(worker_list, bandwidth): @property def original_graph_item_path(self): return self._original_graph_item_path - - # @property - # def resource_file(self): - # return self._resource_file - - @staticmethod - def calculate_op_timings(fetches): - # Simple implementation. Calculate averaged run time of certain steps. 
- init_op = tf.compat.v1.initialize_all_variables() - outside_times = [] - - with tf.compat.v1.Session() as sess: - sess.run(init_op) - for i in range(NUM_RUNS): - start = time.time() - sess.run(fetches) - end = time.time() - outside_times.append(end - start) - comp_time_in_sec = np.mean(np.array(outside_times[1:])) - return comp_time_in_sec - - @staticmethod - def profile_on_single_machine(fetches): - # calculate computation time of every op - init_op = tf.compat.v1.initialize_all_variables() - op_name2runtime = defaultdict(list) - outside_times = [] - all_times = [] - - options = tf.compat.v1.RunOptions(trace_level=tf.compat.v1.RunOptions.FULL_TRACE) - run_metadata = tf.compat.v1.RunMetadata() - with tf.compat.v1.Session() as sess: - sess.run(init_op) - for i in range(NUM_RUNS): - start = time.time() * 1000 - sess.run(fetches) - end = time.time() * 1000 - outside_times.append(end - start) - - sess.run(fetches, options=options, run_metadata=run_metadata) - - fetched_timeline = timeline.Timeline(run_metadata.step_stats) - chrome_trace = fetched_timeline.generate_chrome_trace_format() # necessary - for event in fetched_timeline._chrome_trace._events: - # print('\n') - # print(list(event.keys())) - # for key in list(event.keys()): - # print(key, event[key]) - if 'dur' in event: - op_name2runtime[event['args']['name']].append(event['dur']) - # todo: to be more accurate, add tid (thread/lanes id) - - mean_outside_time = np.mean(np.array(outside_times[1:])) - print('mean outside_times: ', mean_outside_time) - print(outside_times) - # print('average all_times: ', np.mean(np.array(all_times))) - - op_name2meanruntime = {} - for op_name, runtimes in op_name2runtime.items(): - runtimes = np.array(runtimes) - if len(runtimes) > 1: # Do not compute operations that only run once for all steps. - mean = np.mean(np.array(runtimes[1:])) - op_name2meanruntime[op_name] = mean - print(op_name, mean) - # print(op_name2runtime[op_name]) - - total_op_time = sum([mean_runtime for op_name, mean_runtime in op_name2meanruntime.items()]) - print('total_op_time', total_op_time / 1000.) - # total_op_time = [sum([runtime[i] for op_name, runtime in op_name2runtime.items()]) - # for i in range(self.num_runs)] - # print('total_op_time', np.mean(np.array(total_op_time)), total_op_time) - - return mean_outside_time - - # @staticmethod - # def _calculate_op_timings(graph_item: GraphItem): - # """ - # Given a graph, calculates an expected running time for each (op, input_size) pair. - # - # Args: - # graph_item (GraphItem): The input graph. - # - # Returns: - # Dict mapping (op, input_size) to time. 
- # """ - # all_ops = {} - # for op in graph_item.graph.get_operations(): - # input_shapes = tuple((tuple(inp.shape.dims) for inp in op.inputs)) - # op_type = op.type - # all_ops[(op_type, input_shapes)] = ops.Graph() - # - # for ((op, shape), graph) in all_ops.items(): - # with graph.as_default(): - # getattr(tensorflow.raw_ops, op) From 98ec2260b4dae1cb02b3581675f8ed54804a96c8 Mon Sep 17 00:00:00 2001 From: Hao Zhang Date: Sat, 18 Jul 2020 02:00:00 -0400 Subject: [PATCH 05/11] some refactoring on AutoStrategy interface --- autodist/simulator/linear_simulator.py | 21 ++ autodist/simulator/predefined_simulator.py | 29 ++- autodist/simulator/utils.py | 30 ++- autodist/strategy/auto/auto_strategy.py | 249 ------------------- autodist/strategy/auto/base.py | 112 +++++++++ autodist/strategy/auto/default_constraint.py | 0 autodist/strategy/auto/strategy_sampler.py | 34 ++- autodist/strategy/auto_strategy.py | 55 ++++ autodist/simulator/test.py => test.py | 0 9 files changed, 261 insertions(+), 269 deletions(-) create mode 100644 autodist/simulator/linear_simulator.py delete mode 100644 autodist/strategy/auto/auto_strategy.py create mode 100644 autodist/strategy/auto/base.py create mode 100644 autodist/strategy/auto/default_constraint.py create mode 100644 autodist/strategy/auto_strategy.py rename autodist/simulator/test.py => test.py (100%) diff --git a/autodist/simulator/linear_simulator.py b/autodist/simulator/linear_simulator.py new file mode 100644 index 0000000..527d923 --- /dev/null +++ b/autodist/simulator/linear_simulator.py @@ -0,0 +1,21 @@ +# Copyright 2020 Petuum. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Linear simulator.""" + +from autodist.simulator.base import SimulatorBase + +class LinearSimulator(SimulatorBase): + def __init__(self): + super(LinearSimulator, self).__init__() diff --git a/autodist/simulator/predefined_simulator.py b/autodist/simulator/predefined_simulator.py index 91519c6..973fbef 100644 --- a/autodist/simulator/predefined_simulator.py +++ b/autodist/simulator/predefined_simulator.py @@ -1,18 +1,31 @@ -"""Strategy Simulator.""" +# Copyright 2020 Petuum. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Predefined simulator with linear model.""" -import numpy as np -import json import pickle as pkl import tensorflow as tf from tensorflow.python.eager import context -from arion.strategy.base import Strategy -from arion.resource_spec import ResourceSpec -from arion.proto.synchronizers_pb2 import PSSynchronizer, AllReduceSynchronizer -from arion.simulator.models.base import SimulatorBase -from arion.simulator.utils import _resolve_device_address, _resolved_devices_on_diff_machine, \ +from autodist.proto.synchronizers_pb2 import PSSynchronizer, AllReduceSynchronizer +from autodist.resource_spec import ResourceSpec +from autodist.simulator.base import SimulatorBase +from autodist.simulator.utils import _resolved_devices_on_diff_machine, \ get_dense_var_bits, get_sparse_var_bits +from autodist.strategy.base import Strategy + class PredefinedSimulator(SimulatorBase): """Simulates strategies for a given graph and resource spec.""" diff --git a/autodist/simulator/utils.py b/autodist/simulator/utils.py index a668e75..2febd63 100644 --- a/autodist/simulator/utils.py +++ b/autodist/simulator/utils.py @@ -1,18 +1,34 @@ +# Copyright 2020 Petuum. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Simulator-related utility functions.""" + import glob import json import os import numpy as np -import tensorflow_ranking as tfr import tensorflow as tf from tensorflow.python.framework import device_spec +import tensorflow_ranking as tfr -from arion.utils import logging -from arion.resource_spec import ResourceSpec -from arion.strategy.base import Strategy -from arion.const import DEFAULT_RUNTIME_SERIALIZATION_DIR, DEFAULT_SERIALIZATION_DIR, \ +from autodist.utils import logging +from autodist.resource_spec import ResourceSpec +from autodist.strategy.base import Strategy +from autodist.const import DEFAULT_RUNTIME_SERIALIZATION_DIR, DEFAULT_SERIALIZATION_DIR, \ DEFAULT_STRATEGY_JSON_SERIALIZATION_DIR, DEFAULT_RESOURCE_SERIALIZATION_DIR -from arion.kernel.device.resolver import DeviceResolver +from autodist.kernel.device.resolver import DeviceResolver RankingLossKeys = { @@ -268,7 +284,7 @@ def read_trial_runs(): def pad_list(l, max_len): - return l + [0.0] * (max_len - len(l)) + return l + [0.0] * (max_len - len(l)) def get_dtype_bits(dtype): diff --git a/autodist/strategy/auto/auto_strategy.py b/autodist/strategy/auto/auto_strategy.py deleted file mode 100644 index 260c7be..0000000 --- a/autodist/strategy/auto/auto_strategy.py +++ /dev/null @@ -1,249 +0,0 @@ -# Copyright 2020 Petuum. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""PS StrategyBuilder.""" - -from autodist.strategy.base import Strategy, StrategyBuilder -from autodist.proto import strategy_pb2 -from autodist.strategy.auto.strategy_sampler import RandomStrategySampler - - -class AutoStrategy(StrategyBuilder): - """ - AutoStrategy Builder. - - It generates a suitable Strategy based on graph_item and resource_spec following the AutoSync framework. - """ - - def __init__(self): - return - - def build(self, graph_item, resource_spec): - # TODO: merge the code in search and propose here. - return - - def search(self): - # candidates, scores, features = self.propose(self.search_params['num_candidate_explore']) - candidates, scores, features = self.batch_propose(self.search_params['num_candidate_explore']) - n_pick = self.search_params['num_candidate_per_trial'] - - # cast them to be np arrays - if self.search_params['diversity_metric'] == 'embedding': - picked_candidates = self.submodular_pick_by_embedding(np.array(scores), - candidates, - np.stack(features), - n_pick, - self.search_params['simulation_weight'], - self.search_params['diversity_weight']) - elif self.search_params['diversity_metric'] == 'expression': - picked_candidates = self.submodular_pick_by_expression(np.array(scores), - candidates, - n_pick, - self.search_params['simulation_weight'], - self.search_params['diversity_weight']) - else: - raise ValueError('Unrecognized diversity metric...') - if self.trial_run_fn: - self.trial_run(picked_candidates, search_iteration=0) - - def propose(self, num_proposal, use_simulator=True): - builder = RandomStrategy(self.space, self.heuristics) - candidates = [] - features = [] - scores = [] - # np.random.seed(1) - idx = 0 - - while len(candidates) < num_proposal: - logging.info('Sampling strategy {}'.format(idx)) - start_time = time.time() - expr = builder.build(self._original_graph_item, self._resource_spec) - elapsed = time.time() - start_time - logging.info('Sampling strategy takes {}'.format(elapsed)) - builder.reset() - idx += 1 - logging.info('Progress {}/{}'.format(len(candidates), num_proposal)) - if self.simulator and use_simulator: - start_time = time.time() - score, feature = self.simulator.simulate(expr, self._resource_spec) - elapsed = time.time() - start_time - logging.info('Inference strategy takes {}'.format(elapsed)) - if score > self.search_params['rejection_score']: - logging.info('strategy {} has score {} > {}, ' - 'rejected..'.format(idx, score, self.search_params['rejection_score'])) - continue - else: - candidates.append(expr) - features.append(feature) - scores.append(score[0]) - else: - candidates.append(expr) - features.append([]) - scores.append(0) - logging.info('rejection ratio: {}'.format(1 - num_proposal / float(idx))) - return candidates, scores, features - - def batch_propose(self, num_proposal, batch_size=32, use_simulator=True): - - builders = [RandomStrategy(self.space, self.heuristics) for _ in range(batch_size)] - graph_items = [self._original_graph_item for _ in range(batch_size)] - rss = [ResourceSpec(self.resource_file) for _ in range(batch_size)] - candidates = [] - features = [] - scores = [] - # np.random.seed(1) - idx = 0 - - while len(candidates) < num_proposal: - logging.info('Sampling strategy {}'.format(idx)) - start_time = time.time() - - q = Queue() - exprs = [] - prs = [] - for obj, arg1, arg2 in zip(builders, graph_items, rss): - prs.append(Process(target=build_worker, args=(q, obj, arg1, arg2))) 
- prs[-1].start() - for pr in prs: - expr = q.get() # will block - exprs.append(expr) - for pr in prs: - pr.join() - - elapsed = time.time() - start_time - logging.info('Sampling strategy takes {}'.format(elapsed)) - for builder in builders: builder.reset() - logging.info('Progress {}/{}'.format(len(candidates), num_proposal)) - if self.simulator and use_simulator: - start_time = time.time() - batch_score, batch_feature = self.simulator.simulate(exprs, rss) - elapsed = time.time() - start_time - logging.info('Inference strategy takes {}'.format(elapsed)) - for ite, expr in enumerate(exprs): - # print(batch_score[ite], batch_feature[ite].shape) - if batch_score[ite] > self.search_params['rejection_score']: - logging.info('strategy {} has score {} > {}, ' - 'rejected..'.format(idx+ite, batch_score[ite], self.search_params['rejection_score'])) - else: - candidates.append(expr) - features.append(batch_feature[ite]) - scores.append(batch_score[ite]) - else: - for ite, expr in enumerate(exprs): - candidates.append(expr) - features.append([]) - scores.append(0) - idx += batch_size - logging.info('rejection ratio: {}'.format(1 - num_proposal / float(idx))) - return candidates[:num_proposal], scores[:num_proposal], features[:num_proposal] - - def submodular_pick_by_embedding(self, - scores, - candidates, - candidate_features, - n_pick, - beta=1.0, - alpha=1.0): - n = len(scores) - assert n == len(candidate_features) - - ret = [] - sim = np.dot(candidate_features, candidate_features.T) - remain = list(range(len(scores))) - - for _ in range(n_pick): - tmp_delta = -scores[remain] * beta - if len(ret) > 0: - tmp_delta -= alpha * (sim[remain, :][:, ret]).mean(1) - max_x = tmp_delta.argmax() - max_x = remain[max_x] - - ret.append(max_x) - remain.remove(max_x) - - return [candidates[i] for i in ret] - - def submodular_pick_by_expression(self, - scores, - candidates, - n_pick, - beta=1.0, - alpha=1.0): - - def remove_group_or_reduction_destination(strategy): - tmp_strategy = copy.deepcopy(strategy) - for node in tmp_strategy.node_config: - if node.partitioner: - for part in node.part_config: - synchronizer = getattr(part, part.WhichOneof('synchronizer')) - if hasattr(synchronizer, 'reduction_destination'): - synchronizer.reduction_destination = '' - else: - synchronizer.group = 0 - else: - synchronizer = getattr(node, node.WhichOneof('synchronizer')) - if hasattr(synchronizer, 'reduction_destination'): - synchronizer.reduction_destination = '' - else: - synchronizer.group = 0 - return tmp_strategy - - def estimate_difference(strategy, node_config_set): - score = 0 - for i, node in enumerate(strategy.node_config): - if_seen = False - for seen_node in node_config_set[i]: - if seen_node == node: - if_seen = True - break - if not if_seen: - score += 1 - return score - - assert len(scores) == len(candidates) - - node_config_set = [list() for _ in candidates[0].node_config] - remain = list(range(len(scores))) - ret = [] - for _ in range(n_pick): - max_x = -1 - max_delta = -1e9 - max_strategy_copy = None - - for x in remain: - tmp_strategy = remove_group_or_reduction_destination(candidates[x]) - diff_score = estimate_difference(tmp_strategy, node_config_set) - assert(diff_score <= len(tmp_strategy.node_config)) - # print('diff score {}..'.format(diff_score)) - tmp_delta = - scores[x] * beta + diff_score * alpha - if tmp_delta > max_delta: - max_delta, max_x, max_strategy_copy = tmp_delta, x, tmp_strategy - max_diff_score = diff_score *alpha - max_simulation_score= -scores[x] - - print('Add one candidate with max 
score: {}, {}, {}'.format(max_simulation_score, max_diff_score, max_delta)) - ret.append(max_x) - remain.remove(max_x) - - # update the node config set - for i, node in enumerate(max_strategy_copy.node_config): - if_seen = False - for seen_node in node_config_set[i]: - if seen_node == node: - if_seen = True - break - if not if_seen: - node_config_set[i].append(node) - - return [candidates[i] for i in ret] diff --git a/autodist/strategy/auto/base.py b/autodist/strategy/auto/base.py new file mode 100644 index 0000000..05f0be4 --- /dev/null +++ b/autodist/strategy/auto/base.py @@ -0,0 +1,112 @@ +# Copyright 2020 Petuum. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""A base class to implementating different auto strategies.""" + +from multiprocessing import Process, Queue + +import numpy as np + +from autodist.strategy.auto.strategy_sampler import RandomStrategySampler, \ + default_space, default_heuristics +from autodist.strategy.base import StrategyBuilder +from autodist.utils import logging + + +class AutoStrategyBase(StrategyBuilder): + """AutoStrategy Base class.""" + + def __init__(self, + space=None, + heuristics=None, + num_proposals=1000, + simulator=None, + train_simulator=False): + # space and heuristics params + if not space: + self._space = default_space + if not heuristics: + self._heuristics = default_heuristics + + # params + self._num_proposals = num_proposals + self._sampler = RandomStrategySampler(self._space, + self._heuristics) + if train_simulator: + raise NotImplementedError() + self._simulator = simulator + + def build(self, graph_item, resource_spec): + raise NotImplementedError() + + def propose_one(self, graph_item, resource_spec): + """ + Sequentially generate `self._num_proposals` strategies. + + Args: + graph_item: + resource_spec: + + Returns: + Strategy + """ + proposal = self._sampler.build(graph_item, resource_spec) + return proposal + + def propose_n(self, + graph_item, + resource_spec, + num_proposals, + num_threads=1): + """ + Proposal `num_proposals` strategies using multi-threading. 
+ + Args: + graph_item: + resource_spec: + num_proposals: + num_threads: + + Returns: + List(Strategy) + """ + if num_threads > 1: + def sampler_worker(q, sampler, graph_item, resource_spec): + np.random.seed() + expr = sampler.build(graph_item, resource_spec) + q.put(expr) + + proposals = [] + while len(proposals) < num_proposals: + # create thread-safe objects before multi-threading + samplers = [RandomStrategySampler(graph_item, resource_spec) for _ in range(num_threads)] + graph_items = [graph_item for _ in range(num_threads)] + resource_specs = [resource_spec for _ in range(num_threads)] + q = Queue() + threads = [] + try: + for sampler, gi, rs in zip(samplers, graph_items, resource_specs): + thread = Process(target=sampler_worker, args=(q,sampler, gi, rs)) + thread.start() + threads.append(thread) + batch = [q.get() for _ in threads] + proposals.extend(batch) + for thread in threads: + thread.join() + except: + logging.error('Error when proposing strategies with {} threads'.format(num_threads)) + raise + else: + proposals = [self.propose_one(graph_item, resource_spec) for i in range(num_proposals)] + return proposals diff --git a/autodist/strategy/auto/default_constraint.py b/autodist/strategy/auto/default_constraint.py new file mode 100644 index 0000000..e69de29 diff --git a/autodist/strategy/auto/strategy_sampler.py b/autodist/strategy/auto/strategy_sampler.py index 3281c4b..a317089 100644 --- a/autodist/strategy/auto/strategy_sampler.py +++ b/autodist/strategy/auto/strategy_sampler.py @@ -27,6 +27,7 @@ from autodist.strategy.auto.ar_group_assigner import chunk_group_assigner, christy_group_assigner, \ ordered_balanced_group_assigner from autodist.strategy.auto import sample_util +from autodist.const import MAX_INT32 class VarType(Enum): @@ -179,7 +180,7 @@ def byte_size(self): * float(self.shape[self.pc.axis]) / float(self.var_shape[self.pc.axis]) -class RandomStrategySampler(StrategyBuilder): +class RandomStrategySampler(): """ Random Strategy Sampler. 
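(For illustration only, not part of the patch: a minimal, standalone sketch of the process/queue fan-out that `propose_n` above relies on. `my_samplers`, `my_graph_item`, and `my_resource_spec` are hypothetical placeholders, and each sampler is assumed to expose the `build(graph_item, resource_spec)` method introduced in this series.)

from multiprocessing import Process, Queue
import numpy as np

def sampler_worker(q, sampler, graph_item, resource_spec):
    # Re-seed inside each worker process so the workers draw different random strategies.
    np.random.seed()
    q.put(sampler.build(graph_item, resource_spec))

def propose_batch(samplers, graph_item, resource_spec):
    q = Queue()
    procs = [Process(target=sampler_worker, args=(q, s, graph_item, resource_spec))
             for s in samplers]
    for p in procs:
        p.start()
    # Drain the queue before joining; q.get() blocks until each worker has put a result.
    batch = [q.get() for _ in procs]
    for p in procs:
        p.join()
    return batch

# Hypothetical usage: strategies = propose_batch(my_samplers, my_graph_item, my_resource_spec)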
@@ -202,10 +203,6 @@ def __init__(self, space, heuristics): self.heuristics = heuristics self.helpers = {} - def reset(self): - """Reset the helpers every time a strategy is sampled.""" - self.helpers = {} - def build(self, graph_item, resource_spec): """Generate a randomized strategy given model and resource spec.""" expr = Strategy() @@ -250,8 +247,13 @@ def build(self, graph_item, resource_spec): sample_group_and_reduction_destinations(node_config, resource_spec, self.helpers, self.heuristics) expr.node_config.extend(node_config) + self._reset() return expr + def _reset(self): + """Reset the helpers every time a strategy is sampled.""" + self.helpers = {} + def sample_if_partition(var_helper, resource_spec, space, heuristics): """ @@ -650,3 +652,25 @@ def assign_ar_group(node_config, ar_shards): synchronizer = getattr(node, node.WhichOneof('synchronizer')) if hasattr(synchronizer, 'compressor'): synchronizer.group = ar_shards[node.var_name][1] + + +default_space = { + 'synchronizer_types': ['PS', 'AR'], + 'maybe_partition': [True, False], + 'compressor': ['HorovodCompressor', 'NoneCompressor', 'HorovodCompressorEF'], + 'local_replication': [False], + 'partitionable_axis': [] +} + + +default_heuristics = { + 'ps_load_balancer': None, # None, 'christy', 'greedy', 'LP' + 'merge_scheme': None, # random, by_chunk, christy, ordered_balanced + 'chunk_size': -1, + 'num_group_bounds': [-1, MAX_INT32], + 'maybe_partition_bounds': [0, MAX_INT32], + 'maybe_partition_by_size': None, + 'num_partition_bounds': [2, MAX_INT32], + 'enable_single_node_no_partition': False, + 'same_synchronizer_for_parts': False, +} diff --git a/autodist/strategy/auto_strategy.py b/autodist/strategy/auto_strategy.py new file mode 100644 index 0000000..5d6b78f --- /dev/null +++ b/autodist/strategy/auto_strategy.py @@ -0,0 +1,55 @@ +# Copyright 2020 Petuum. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""An AutoStrategy using a trained linear simulator.""" + +from autodist.strategy.auto.base import AutoStrategyBase +from autodist.simulator.linear_simulator import LinearSimulator + +class AutoStrategy(AutoStrategyBase): + """ + AutoStrategy builder using a trained linear simulator + + It generates a suitable Strategy based on graph_item and resource_spec using a pretrained simulator weight. + This implementation currenlty provides a linear simulator weight trained on > 9000 data points. 
+ """ + + def __init__(self): + space = { + 'synchronizer_types': ['PS', 'AR'], + 'maybe_partition': [True, False], + 'compressor': ['HorovodCompressor', 'NoneCompressor'], + 'local_replication': [True, False], + 'partitionable_axis': [], + } + heuristics = { + 'ps_load_balancer': 'sorted_christy', # None, 'christy', 'greedy', 'LP' + 'merge_scheme': 'ordered_balanced', # random, by_chunk, christy, ordered_balanced + 'num_group_bounds': [-1, 20], + 'num_partition_bounds': [2, 40], + 'enable_single_node_no_partition': False, + 'same_synchronizer_for_parts': False, + } + + simulator = LinearSimulator() + + super(AutoStrategy, self).__init__( + space=space, + heuristics=heuristics, + num_proposals=2000, + simulator=simulator + ) + + def build(self, graph_item, resource_spec): + return \ No newline at end of file diff --git a/autodist/simulator/test.py b/test.py similarity index 100% rename from autodist/simulator/test.py rename to test.py From d404f7fa715bd6d5cfde888656e1ed731fb6032b Mon Sep 17 00:00:00 2001 From: Hao Zhang Date: Mon, 20 Jul 2020 23:31:57 -0400 Subject: [PATCH 06/11] refactor some simulator and autostrategy utilities --- autodist/kernel/device/resolver.py | 19 +- autodist/resource_spec.py | 14 + autodist/simulator/base.py | 394 +++++----------- autodist/simulator/predefined_simulator.py | 8 +- autodist/simulator/utils.py | 73 +-- autodist/strategy/__init__.py | 1 + autodist/strategy/auto/default_constraint.py | 0 autodist/strategy/auto/item.py | 463 +++++++++++++++++++ autodist/strategy/auto/sample_util.py | 61 --- autodist/strategy/auto/strategy_sampler.py | 278 ++++------- autodist/strategy/auto_strategy.py | 7 +- examples/linear_regression.py | 4 +- tests/test_simulator.py | 27 ++ 13 files changed, 780 insertions(+), 569 deletions(-) delete mode 100644 autodist/strategy/auto/default_constraint.py create mode 100644 autodist/strategy/auto/item.py delete mode 100644 autodist/strategy/auto/sample_util.py create mode 100644 tests/test_simulator.py diff --git a/autodist/kernel/device/resolver.py b/autodist/kernel/device/resolver.py index 609f471..8fcfecf 100644 --- a/autodist/kernel/device/resolver.py +++ b/autodist/kernel/device/resolver.py @@ -45,6 +45,15 @@ def _get_address_to_tasks(cluster): return d def resolve_to_device_spec(self, device): + """ + Resolve an AutoDist DeviceSpec or string to a TensorFlow DeviceSpec. + + Args: + device: (a container of) AutoDist DeviceSpec or DeviceSpec string. + + Returns: + device_spec, List(device_spec), or Set(device_spec) + """ """Resolve an AutoDist DeviceSpec or its string to a TensorFlow DeviceSpec.""" if isinstance(device, (list, set)): return type(device)(self.resolve_to_device_spec(d) for d in device) @@ -59,7 +68,15 @@ def resolve_to_device_spec(self, device): ) def resolve_to_device_str(self, device): - """Resolve an AutoDist DeviceSpec or its string to a TensorFlow device string.""" + """Resolve an AutoDist DeviceSpec or its string to a TensorFlow device string. + + E.g. 192.168.0.1:GPU:0 or localhost:CPU:1 -> job:worker/task:0/device:GPU:0 + Args: + device: (a container of) AutoDist DeviceSpec or DeviceSpec string. 
+ + Returns: + str, List(str), or Set(str) + """ if isinstance(device, (list, set)): return type(device)(self.resolve_to_device_spec(d).to_string() for d in device) elif isinstance(device, RepeatedScalarContainer): diff --git a/autodist/resource_spec.py b/autodist/resource_spec.py index 017faea..be1d570 100644 --- a/autodist/resource_spec.py +++ b/autodist/resource_spec.py @@ -72,6 +72,7 @@ def __init__(self, resource_file=None): self.__chief_address = None self.__ssh_config_map = dict() self.__ssh_group = dict() + self.__network_bandwidth = dict() # set self.__devices self._from_resource_info(resource_file) @@ -147,6 +148,11 @@ def ssh_group(self): """SSH Group for each node.""" return self.__ssh_group + @property + def network_bandwidth(self): + """Network bandwidth of each node.""" + return self.__network_bandwidth + def _add_device(self, device_spec): if device_spec.name_string() not in self.__devices: self.__devices[device_spec.name_string()] = device_spec @@ -200,6 +206,14 @@ def _parse_node(self, node, num_nodes): self.__ssh_group[host_address] = node.get('ssh_config') if self.__ssh_group[host_address] is None and self.__chief_address != host_address: raise ValueError("Need to define SSH groups for all non-chief nodes.") + # network bandwidth + if node.get('network_bandwidth'): + self.__network_bandwidth[host_address] = node.get('network_bandwidth') + else: + # TODO (Hao): we could also raise ValueError here. + logging.warning('Bandwidth for {} is undefined and set as default. ' + 'Caution when using AutoStrategy.'.format(host_address)) + self.__network_bandwidth[host_address] = 1 class DeviceSpec: diff --git a/autodist/simulator/base.py b/autodist/simulator/base.py index bac33d5..31e2d1a 100644 --- a/autodist/simulator/base.py +++ b/autodist/simulator/base.py @@ -12,308 +12,156 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-"""Strategy Simulator.""" +"""Simulator base class.""" +import os +from collections import OrderedDict -from collections import defaultdict - -from autodist.cluster import SSHCluster from autodist.graph_item import GraphItem -from autodist.kernel.device.resolver import DeviceResolver from autodist.kernel.partitioner import PartitionerConfig from autodist.resource_spec import ResourceSpec from autodist.strategy.base import Strategy -from autodist.simulator.utils import _resolve_device_address, GIGABITS, _max_num_local_replica, _num_local_replica -from autodist.strategy.auto.strategy_sampler import VariableHelper, PartHelper -from autodist.simulator.utils import INFINITY - - -class Var: - def __init__(self, - name=None, - is_sparse=False, - synchronizer=None, - shape=None, - dtype=None, - device=None, - compressor=None): - self.name = name - self.is_sparse = is_sparse - self.synchronizer = synchronizer - self.shape = shape - self.dtype = dtype - self.device = device - self.compressor = compressor - self.device = device - self.is_partition = False +from autodist.strategy.auto.item import VariableItem, PartItem, ResourceItem - self.original_shape = self.shape - - @property - def var_size(self): - size = 1 - if self.shape: - for s in self.shape: - size *= s - return size - - @property - def original_var_size(self): - size = 1 - if self.original_shape: - for s in self.original_shape: - size *= s - return size - - def size_to_transfer(self, batch_size_per_gpu=1, seq_len=1): - if not self.is_sparse: - return self.var_size - else: - if not self.shape: # scalar - return 1 - emb_size = 1 - if len(self.shape) > 1: - for i in range(1, len(self.original_shape)): - emb_size = emb_size * self.original_shape[i] - - sparse_data_size = batch_size_per_gpu * seq_len * emb_size - - # estimate the embedding of this partition simply using a proportional formula - ret = sparse_data_size * self.var_size / self.original_var_size - return ret +class SimulatorBase: + """Simulates strategies for a given graph and resource spec.""" -class Partition(Var): def __init__(self, - name=None, - is_sparse=False, - synchronizer=None, - shape=None, - dtype=None, - device=None, - compressor=None, - part_id=0, - original_shape=None, - partition_str=None, - num_shards=1): - super(Partition, self).__init__(name, is_sparse, synchronizer, shape, dtype, device, compressor) - self.is_partition = True - self.part_id = part_id - self.partition_str = partition_str - self.original_shape = original_shape - self.num_shards = num_shards + graph_item=None, + resource_spec=None): + """ + Constructor for simulator base class + Args: + graph_item: a GraphItem object, or a path to a serialized GraphItem object. + resource_spec: a ResourceSpec object, or a path to a resource file. 
+ """ + # check if it is a path + self._graph_item = None + if isinstance(graph_item, GraphItem): + self._graph_item = graph_item + elif isinstance(graph_item, str) and os.path.exists(graph_item): + self._graph_item = GraphItem.deserialize(graph_item) + else: + raise ValueError("Invalid graph_item: {}".format(graph_item)) -class Resource: - def __init__(self, cluster, device_resolver, graph_replicas, network_bandwidth, cpu_worker_list, - gpu_worker_list, max_num_local_replica, total_num_local_replica, worker_num_replicas): - self.cluster=cluster - self.device_resolver=device_resolver - self.graph_replicas=graph_replicas - self.network_bandwidth=network_bandwidth - self.cpu_worker_list=cpu_worker_list - self.gpu_worker_list=gpu_worker_list - self.max_num_local_replica=max_num_local_replica - self.total_num_local_replica=total_num_local_replica - self.worker_num_replicas=worker_num_replicas + self._resource_spec = None + if isinstance(resource_spec, ResourceSpec): + self._resource_spec = resource_spec + elif isinstance(resource_spec, str) and os.path.exists(resource_spec): + self._resource_spec = ResourceSpec(resource_spec) + else: + raise ValueError("Invalid resource_spec: {}".format(resource_spec)) + + def update_graph_item(self, graph_item): + """Change the default graph_item with this simulator.""" + if not graph_item: + raise ValueError('Empty graph item.') + self._graph_item = graph_item + + def update_resource_spec(self, resource_spec): + """Change the default resource_spec with this simulator.""" + if not resource_spec: + raise ValueError('Empty resource spec.') + self._resource_spec = resource_spec + + def simulate(self, + strategy, + graph_item=None, + resource_spec=None, + checkpoint=None): + """ + Return simulated runtime cost given (Strategy, GraphItem, ResourceSpec) tuple. + + Args: + strategy: + graph_item: + resource_spec: + checkpoint: + + Returns: + float + """ + raise NotImplementedError() -class SimulatorBase: - """Simulates strategies for a given graph and resource spec.""" + def inference(self, + features, + checkpoint=None): + """ + Abstract method for simulator inference. - def __init__(self, original_graph_item_path): - self._original_graph_item_path = original_graph_item_path - self._original_graph_item = GraphItem.deserialize(original_graph_item_path) - # self._resource_file = resource_file - # self._resource_spec = ResourceSpec(resource_file) - # self._cluster = SSHCluster(self._resource_spec) - # self._device_resolver = DeviceResolver(self._cluster) - # - # self._graph_replicas = [_resolve_device_address(k, self._device_resolver) - # for k, v in self._resource_spec.gpu_devices] - # - # # bandwidth - # self._network_bandwidth = self.network_bandwidth(self._resource_spec, self._device_resolver) - # # Other information - # self._cpu_worker_list = [_resolve_device_address(device, self._device_resolver) - # for device, _ in self._resource_spec.cpu_devices] - # self._gpu_worker_list = [_resolve_device_address(device, self._device_resolver) - # for device, _ in self._resource_spec.gpu_devices] - # self._max_num_local_replica = _max_num_local_replica(self._graph_replicas, self._cluster) - # self._total_num_local_replica = len(self._graph_replicas) - # self._worker_num_replicas = [_num_local_replica(cpu_worker, self._graph_replicas, self._cluster) - # for cpu_worker in self._cpu_worker_list] + Args: + features: feature input extracted from (GraphItem, ResourceSpec, Strategy) tuple. + checkpoint: optional simulator weight. 
- def simulate(self, strategy: Strategy, resource_spec: ResourceSpec, checkpoint: str): - """Return simulated runtime value by feeding features to the cost model.""" + Returns: + float + """ raise NotImplementedError() - def inference(self, inputs, checkpoint): - raise NotImplementedError() + def load_checkpoint(self, checkpoint=None): + """ + Load a checkpoint file as weights of the simulator. - def load_checkpoint(self, checkpoint): + Args: + checkpoint: path to a checkpoint file. + """ raise NotImplementedError() def save_checkpoint(self, model, checkpoint): + """ + Save a trained weight as a checkpoint file. + + Args: + model: trained model. + checkpoint: path where to save the checkpoint. + """ raise NotImplementedError() def create_features(self, strategy: Strategy, resource_spec: ResourceSpec): raise NotImplementedError() - def extract_pre_feature(self, strategy: Strategy, resource_spec: ResourceSpec): - resource = self.setup_resource(resource_spec) + def preprocess(self, + strategy, + graph_item=None, + resource_spec=None): + """ + Preprocess a (strategy, graph_item, resource_spec) tuple into pre-features. + + Args: + strategy: a distribution strategy + graph_item: optional graph_item, if not provided, the default one bundled with simulator will be used. + resource_spec: optional resource_spec, if not provided, the default one bundled with simulator will be used. + + Returns: + OrderedDict(): variable/part name to variable/part items. + ResourceItem: + """ + if not graph_item: + if not self._graph_item: + raise ValueError('No graph item provided.') + else: + graph_item = self._graph_item + if not resource_spec: + if not self._resource_spec: + raise ValueError('No resource spec provided.') + else: + resource_spec = self._resource_spec + if not strategy: + raise ValueError('No strategy provided.') - name2var = {var.name: var for var_op, var in self._original_graph_item.trainable_var_op_to_var.items()} + resource_item = ResourceItem(resource_spec) + name_to_var = {var.name: var for var_op, var in graph_item.trainable_var_op_to_var.items()} - meta = defaultdict() + name_to_items = OrderedDict() for node in strategy.node_config: var_name = node.var_name - # for var_op, var in self._original_graph_item.trainable_var_op_to_var.items(): - # if var.name == var_name: - # break - var = name2var[var_name] - var_helper = VariableHelper(var, self._original_graph_item) - + var = name_to_var[var_name] if node.partitioner: pc = PartitionerConfig(partition_str=node.partitioner) for i, part in enumerate(node.part_config): - part_helper = PartHelper(i, var, pc) - synchronizer = getattr(part, part.WhichOneof('synchronizer')) - compressor = getattr(synchronizer, 'compressor', None) - reduction_destination = getattr(synchronizer, 'reduction_destination', None) - device = _resolve_device_address(reduction_destination if reduction_destination else var.device, - resource.device_resolver) - - part_meta = Partition(name=part.var_name, - is_sparse=var_helper.is_sparse, - shape=part_helper.shape, - dtype=var_helper.dtype, - synchronizer=synchronizer, - part_id=i, - num_shards=pc.num_shards, - partition_str=pc.partition_str, - original_shape=var_helper.shape, - compressor=compressor, - device=device) - meta[part_meta.name] = part_meta + part_item = PartItem(var, graph_item, i, pc, part) + name_to_items[part_item.name] = part_item else: - synchronizer = getattr(node, node.WhichOneof('synchronizer')) - compressor = getattr(synchronizer, 'compressor', None) - reduction_destination = getattr(synchronizer, 
'reduction_destination', None) - device = _resolve_device_address(reduction_destination if reduction_destination else var.device, - resource.device_resolver) - - var_meta = Var(name=var_name, - is_sparse=var_helper.is_sparse, - shape=var_helper.shape, - dtype=var_helper.dtype, - synchronizer=synchronizer, - compressor=compressor, - device=device) - meta[var_meta.name] = var_meta - return meta, resource - - # def extract_pre_feature_legacy(self, strategy): - # """Don't use now!!!""" - # meta = defaultdict() - # for node in strategy.node_config: - # var_name = node.var_name - # for var_op, var in self._original_graph_item.trainable_var_op_to_var.items(): - # if var.name == var_name: - # break - # var_op_name = var_op.name - # var_helper = VariableHelper(var, self._original_graph_item) - # synchronizer = getattr(node, node.WhichOneof('synchronizer')) - # compressor = getattr(synchronizer, 'compressor', None) - # if compressor is not None: - # compressor = AllReduceSynchronizer.Compressor.Name(compressor) - # reduction_destinations = getattr(synchronizer, 'reduction_destinations', None) - # if not reduction_destinations or len(reduction_destinations) <= 1: - # # this variable is not partitioned - # device = reduction_destinations[0] if reduction_destinations else var.device - # var_meta = Var(name=var_name, - # is_sparse=var_helper.is_sparse, - # shape=var_helper.shape, - # dtype=var_helper.dtype, - # synchronizer=synchronizer, - # compressor=compressor, - # device=device) - # meta[var_meta.name] = var_meta - # else: - # # this variable is partitioned - # num_partitions = len(reduction_destinations) - # partition_list = [1] * len(var_helper.shape) - # partition_list[0] = num_partitions - # pc = PartitionerConfig(partition_list=partition_list) - # for i, device in enumerate(reduction_destinations): - # part_helper = PartHelper(i, var, pc) - # part_meta = Partition(name='{}/part_{}:0'.format(var_op_name, i), - # is_sparse=var_helper.is_sparse, - # shape=part_helper.shape, - # dtype=var_helper.dtype, - # synchronizer=synchronizer, - # part_id=i, - # partition_str=pc.partition_str, - # original_shape=var_helper.shape, - # compressor=compressor, - # device=device) - # meta[part_meta.name] = part_meta - # return meta - - def setup_resource(self, resource_spec: ResourceSpec): - cluster = SSHCluster(resource_spec) - device_resolver = DeviceResolver(cluster) - graph_replicas = [_resolve_device_address(k, device_resolver) for k, v in resource_spec.gpu_devices] - # bandwidth - network_bandwidth = self.network_bandwidth(resource_spec, device_resolver) - # Other information - cpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.cpu_devices] - gpu_worker_list = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.gpu_devices] - max_num_local_replica = _max_num_local_replica(graph_replicas, cluster) - total_num_local_replica = len(graph_replicas) - worker_num_replicas = [_num_local_replica(cpu_worker, graph_replicas, cluster) for cpu_worker in cpu_worker_list] - resource = Resource(cluster=cluster, - device_resolver=device_resolver, - graph_replicas=graph_replicas, - network_bandwidth=network_bandwidth, - cpu_worker_list=cpu_worker_list, - gpu_worker_list=gpu_worker_list, - max_num_local_replica=max_num_local_replica, - total_num_local_replica=total_num_local_replica, - worker_num_replicas=worker_num_replicas) - return resource - - @staticmethod - def network_bandwidth(resource_spec: ResourceSpec, device_resolver: DeviceResolver): - 
"""Calculates all P2P network bandwidths between nodes in the cluster.""" - devices = [device for device, _ in resource_spec.devices] - resolved_devices = [_resolve_device_address(device, device_resolver) for device, _ in resource_spec.devices] - gpu_cpu_bw = 10000. # hardcode for now - network_bandwidth = {} # key: - for i in range(len(devices)): - if resolved_devices[i] not in network_bandwidth: - network_bandwidth[resolved_devices[i]] = {} - for j in range(i, len(devices)): - if resolved_devices[j] not in network_bandwidth: - network_bandwidth[resolved_devices[j]] = {} - ip_i = devices[i].split(':')[0] - ip_j = devices[j].split(':')[0] - if ip_i != ip_j: - network_bandwidth[resolved_devices[i]][resolved_devices[j]] \ - = GIGABITS * resource_spec.network_bandwidth[ip_i] - network_bandwidth[resolved_devices[j]][resolved_devices[i]] \ - = GIGABITS * resource_spec.network_bandwidth[ip_j] - else: - network_bandwidth[resolved_devices[i]][resolved_devices[j]] = GIGABITS * gpu_cpu_bw - network_bandwidth[resolved_devices[j]][resolved_devices[i]] = GIGABITS * gpu_cpu_bw - - return network_bandwidth - - @staticmethod - def min_bandwitdh(worker_list, bandwidth): - min_bandwidth = INFINITY - num_workers = len(worker_list) - for i in range(num_workers): - for j in range(i, num_workers): - min_bandwidth = min(min_bandwidth, bandwidth[worker_list[j]][worker_list[i]]) - - @property - def original_graph_item_path(self): - return self._original_graph_item_path + var_item = VariableItem(var, graph_item, node) + name_to_items[var_item.name] = var_item + return name_to_items, resource_item diff --git a/autodist/simulator/predefined_simulator.py b/autodist/simulator/predefined_simulator.py index 973fbef..a419126 100644 --- a/autodist/simulator/predefined_simulator.py +++ b/autodist/simulator/predefined_simulator.py @@ -31,8 +31,8 @@ class PredefinedSimulator(SimulatorBase): """Simulates strategies for a given graph and resource spec.""" def __init__(self, - original_graph_item_path, - fetches=None, + graph_item=None, + resource_spec=None, batch_size=1, seq_len=1, get_coef=True, @@ -135,7 +135,7 @@ def create_features_v0(self, strategy: Strategy, resource_spec: ResourceSpec): def create_features(self, strategy: Strategy, resource_spec: ResourceSpec): # var_sync_time, vars, resource = self.predefined_sync_time(strategy, resource_spec) - vars, resource = self.extract_pre_feature(strategy=strategy, resource_spec=resource_spec) + vars, resource = self.preprocess(strategy=strategy, resource_spec=resource_spec) feature_keys = ['transmission', 'network_overhead', 'gpu_kernel_memory_latency'] device_ps_sync_time = {} @@ -178,7 +178,7 @@ def create_features(self, strategy: Strategy, resource_spec: ResourceSpec): def predefined_sync_time(self, strategy, resource_spec): """ graph_item: transformed graph item """ - vars, resource = self.extract_pre_feature(strategy=strategy, resource_spec=resource_spec) + vars, resource = self.preprocess(strategy=strategy, resource_spec=resource_spec) # Compute synchronization time for every var var_sync_time = {} for var_name, var in vars.items(): diff --git a/autodist/simulator/utils.py b/autodist/simulator/utils.py index 2febd63..b200007 100644 --- a/autodist/simulator/utils.py +++ b/autodist/simulator/utils.py @@ -26,8 +26,8 @@ from autodist.utils import logging from autodist.resource_spec import ResourceSpec from autodist.strategy.base import Strategy -from autodist.const import DEFAULT_RUNTIME_SERIALIZATION_DIR, DEFAULT_SERIALIZATION_DIR, \ - DEFAULT_STRATEGY_JSON_SERIALIZATION_DIR, 
DEFAULT_RESOURCE_SERIALIZATION_DIR +# from autodist.const import DEFAULT_RUNTIME_SERIALIZATION_DIR, DEFAULT_SERIALIZATION_DIR, \ +# DEFAULT_STRATEGY_JSON_SERIALIZATION_DIR, DEFAULT_RESOURCE_SERIALIZATION_DIR from autodist.kernel.device.resolver import DeviceResolver @@ -281,6 +281,7 @@ def read_trial_runs(): GIGABITS = np.float(1e+9) INFINITY = 1e+9 NUM_RUNS = 500 +GPU_TO_CPU_BANDWIDTH = 1000 # Gbps def pad_list(l, max_len): @@ -308,40 +309,40 @@ def _resolved_devices_on_diff_machine(device1, device2): return node1 != node2 -def _resolve_device_address(device: str, device_resolver: DeviceResolver): - # change real ip address to /job:worker/task:0 - if not device: - return device - parts = device.split(':') - if parts and parts[0] in device_resolver._address_to_tasks: - resolved_device = device_resolver._address_to_tasks[parts[0]][0] - resolved = '/job:{}/task:{}/device:'.format(resolved_device['job'], resolved_device['task']) - resolved = resolved + ':'.join(parts[-2:]) - return resolved - else: - raise ValueError("cannot resolve device: {} using device_resolver: {}".format( - device, device_resolver._address_to_tasks)) - - -def _num_local_replica(host, replicas, cluster): - # host: e.g., '/job:worker/task:0/device:CPU:0' - replica_devices = {device_spec.DeviceSpecV2.from_string(r) for r in replicas} - host_device = device_spec.DeviceSpecV2.from_string(host) - num_local_replica = sum(1 for d in replica_devices - if cluster.get_address_from_task(d.job, d.task) == - cluster.get_address_from_task(host_device.job, host_device.task)) - return num_local_replica - - -def _max_num_local_replica(replicas, cluster): - replica_devices = {device_spec.DeviceSpecV2.from_string(r) for r in replicas} - replica_hosts = {cluster.get_address_from_task(d.job, d.task) for d in replica_devices} - max_num_local_replica = 0 - for host in replica_hosts: - num_local_replica = sum(1 for d in replica_devices - if cluster.get_address_from_task(d.job, d.task) == host) - max_num_local_replica = max(max_num_local_replica, num_local_replica) - return max_num_local_replica +# def _resolve_device_address(device: str, device_resolver: DeviceResolver): +# # change real ip address to /job:worker/task:0 +# if not device: +# return device +# parts = device.split(':') +# if parts and parts[0] in device_resolver._address_to_tasks: +# resolved_device = device_resolver._address_to_tasks[parts[0]][0] +# resolved = '/job:{}/task:{}/device:'.format(resolved_device['job'], resolved_device['task']) +# resolved = resolved + ':'.join(parts[-2:]) +# return resolved +# else: +# raise ValueError("cannot resolve device: {} using device_resolver: {}".format( +# device, device_resolver._address_to_tasks)) + + +# def _num_local_replica(host, replicas, cluster): +# # host: e.g., '/job:worker/task:0/device:CPU:0' +# replica_devices = {device_spec.DeviceSpecV2.from_string(r) for r in replicas} +# host_device = device_spec.DeviceSpecV2.from_string(host) +# num_local_replica = sum(1 for d in replica_devices +# if cluster.get_address_from_task(d.job, d.task) == +# cluster.get_address_from_task(host_device.job, host_device.task)) +# return num_local_replica +# +# +# def _max_num_local_replica(replicas, cluster): +# replica_devices = {device_spec.DeviceSpecV2.from_string(r) for r in replicas} +# replica_hosts = {cluster.get_address_from_task(d.job, d.task) for d in replica_devices} +# max_num_local_replica = 0 +# for host in replica_hosts: +# num_local_replica = sum(1 for d in replica_devices +# if cluster.get_address_from_task(d.job, d.task) == host) +# 
max_num_local_replica = max(max_num_local_replica, num_local_replica) +# return max_num_local_replica def _strip_var_name(name): diff --git a/autodist/strategy/__init__.py b/autodist/strategy/__init__.py index 3be1c34..fe6a366 100644 --- a/autodist/strategy/__init__.py +++ b/autodist/strategy/__init__.py @@ -25,3 +25,4 @@ from .partitioned_all_reduce_strategy import PartitionedAR from .random_axis_partition_all_reduce_strategy import RandomAxisPartitionAR from .uneven_partition_ps_strategy import UnevenPartitionedPS +# from .auto_strategy import AutoStrategy diff --git a/autodist/strategy/auto/default_constraint.py b/autodist/strategy/auto/default_constraint.py deleted file mode 100644 index e69de29..0000000 diff --git a/autodist/strategy/auto/item.py b/autodist/strategy/auto/item.py new file mode 100644 index 0000000..d2377e6 --- /dev/null +++ b/autodist/strategy/auto/item.py @@ -0,0 +1,463 @@ +# Copyright 2020 Petuum. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Helper classes and functions for automatic strategy generation.""" + +from enum import Enum + +from tensorflow.python.framework import ops, device_spec + +from autodist.kernel.common.utils import get_op_name, get_consumers +from autodist.kernel.device.resolver import DeviceResolver +from autodist.graph_item import cached_property +from autodist.strategy.base import byte_size_load_fn +from autodist.utils import logging +from autodist.cluster import SSHCluster +from autodist.simulator.utils import GPU_TO_CPU_BANDWIDTH, GIGABITS + + +class VarType(Enum): + SPARSE = 0 + DENSE = 1 + + +class VariableItem: + """Helper class to include meta information about a variable.""" + def __init__(self, + var, + graph_item, + node_config=None): + self.var = var + self.graph_item = graph_item + self._var_op_name = get_op_name(var.name) + self._grad = graph_item.var_op_name_to_grad_info[self._var_op_name][0] + + self._config = None + if node_config: + self.update_config(node_config) + else: + logging.warning('Item with name {} has empty config.'.format(self.name)) + + def update_config(self, config): + """ + Update the nodeconfig of this variable. + + Args: + config: + """ + assert not config + self._node_config = config + + @property + def var_type(self): + """ + Return the type of the variable (VarType.SPARSE or VarType.DENSE). + + Returns: + VarType + """ + return VarType.DENSE if isinstance(self._grad, ops.Tensor) else VarType.SPARSE + + @property + def name(self): + """ + Return the name of the variable. + + Returns: + String + """ + return self.var.name + + @property + def is_sparse(self): + """ + Return whether the variable is sparse. + + Returns: + Bool + """ + return True if self.var_type == VarType.SPARSE else False + + @property + def is_embedding(self): + """ + Return whether the variable corresponds to an embedding. + + Returns: + Bool + """ + # TODO (Hao): better way to determine is_embedding? 
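+        # Current heuristic: a variable whose op is consumed by a ResourceGather op
+        # is read through an embedding lookup, so it is treated as an embedding.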
+ for op in get_consumers(self.var.op): + if op.type == "ResourceGather": + return True + return False + + @property + def shape(self): + """ + Return the shape of the variable, or None if it does not emit a tensor (e.g. scalar). + + Returns: + List(int) + """ + return self.original_shape + + @property + def original_shape(self): + if self.var.initial_value.shape.ndims: + return self.var.initial_value.shape.as_list() + else: + return None + + @property + def size(self): + size = 1 + if self.shape: + for s in self.shape: + size *= s + return size + + @property + def original_size(self): + size = 1 + if self.original_shape: + for s in self.original_shape: + size *= s + return size + + @property + def size_to_transfer(self, batch_size_per_gpu=1, seq_len=1): + if not self.is_sparse: + return self.size + else: + if not self.shape: # scalar + return 1 + + emb_size = 1 + if len(self.shape) > 1: + # infer the embedding size from original shape + for i in range(1, len(self.original_shape)): + emb_size *= self.original_shape[i] + + sparse_data_size = batch_size_per_gpu * seq_len * emb_size + + # estimate the embedding of this partition simply using a proportional formula + return sparse_data_size * self.size / self.original_size + + @property + def partitionable_axes(self): + """ + Return the list of available axes that are legitimate to partition along. + + Returns: + List(int) + """ + valid_axes = [] + + # scalar + if not self.shape: + return valid_axes + + # Sparse variable can only be partition along the 0th axis in current implementation. + if self.is_sparse or self.is_embedding: + valid_axes = [0] + return valid_axes + for idx, dim in enumerate(self.shape): + if dim > 1: + valid_axes.append(idx) + return valid_axes + + @property + def byte_size(self): + """ + Return the byte size of the variable. + + Returns: + float + """ + return float(byte_size_load_fn(self.var)) + + @property + def dtype(self): + """ + Return the dtype of the variable. + + Returns: + dtype + """ + return self.var.dtype + + @property + def synchronizer(self): + """ + Return the synchronizer protobuf in the config of this variable. + + Returns: + NodeConfig + """ + if not self._node_config: + raise ValueError('Node config is unset.') + if self._node_config.partitioner: + logging.warning('This variable will be partitioned') + return None + return getattr(self._node_config, self._node_config.WhichOneOf('synchronizer')) + + @property + def compressor(self): + """ + Return the compressor in the node config of this variable. + + Returns: + Compressor type. + """ + if not self._node_config: + raise ValueError('Node config is unset.') + if self._node_config.partitioner: + logging.warning('This variable will be partitioned') + return None + return getattr(self.synchronizer, 'compressor', None) + + @property + def reduction_destination(self): + """ + Return the reduction_destination in the node config of this variable. + + Returns: + Reduction destinaiton. 
+ """ + if not self._node_config: + raise ValueError('Node config is unset.') + if self._node_config.partitioner: + logging.warning('This variable will be partitioned') + return None + return getattr(self.synchronizer, 'reduction_destination', None) + + def device(self, resolver): + device_str = self.reduction_destination if self.reduction_destination else self.var.device + if device_str: + device_str = resolver.resolve_to_device_str(device_str) + return device_str + +class PartItem(VariableItem): + """Helper class to include meta information about a variable partition.""" + def __init__(self, + var, + graph_item, + part_idx, + pc, + part_config=None): + super(PartItem, self).__init__(var, graph_item, part_config) + + self.part_idx = part_idx + self.pc = pc + + @property + def name(self): + """ + Return the name of this partition. + + Returns: + String + """ + name = '{}/part_{}:0'.format(get_op_name(self.var.name), self.part_idx) + return name + + @property + def partition_str(self): + return self.pc.partition_str + + @property + def shape(self): + """ + Return the shape of this partition. + + Returns: + List(int) + + """ + shape = self.original_shape + if shape: + dim_size = shape[self.pc.axis] // self.pc.num_shards + extras = shape[self.pc.axis] % self.pc.num_shards + if self.part_idx < extras: + dim_size += 1 + shape[self.pc.axis] = dim_size + return shape + + @property + def partitionable_axes(self): + """ + Return the list of available axes that are legitimate to partition along. + + Returns: + None: because this is a partition (not allowed to be partitioned further). + """ + return [] + + @property + def byte_size(self): + """ + Return the byte size of this partition. + + Returns: + float + """ + return float(byte_size_load_fn(self.var)) \ + * float(self.shape[self.pc.axis]) / float(self.original_shape[self.pc.axis]) + + @property + def synchronizer(self): + """ + + Returns: + + """ + if not self._node_config: + raise ValueError('Node config is unset.') + if not self._node_config.partitioner: + raise ValueError('Partitioner field is empty for a variable partition.') + return getattr(self._node_config, self._node_config.WhichOneOf('synchronizer')) + + @property + def compressor(self): + """ + Return the compressor in the node config of this variable partition. + + Returns: + Compressor. + """ + if not self._node_config: + raise ValueError('Node config is unset.') + if not self._node_config.partitioner: + raise ValueError('Partitioner field is empty for a variable partition.') + return getattr(self.synchronizer, 'compressor', None) + + @property + def reduction_destination(self): + """ + Return the reduction_destination in the node config of this variable partition. + + Returns: + Reduction destination. + """ + if not self._node_config: + raise ValueError('Node config is unset.') + if not self._node_config.partitioner: + logging.warning('Partitioner field is empty for a variable partition.') + return None + return getattr(self.synchronizer, 'reduction_destination', None) + + +class ResourceItem: + """ResourceItem. + + Helper class that includes meta information about a resource spec. All addresses are resolved (in TF format). + + TODO(zhisbug): merge ResourceItem class with ResourceSpec. + """ + + def __init__(self, resource_spec): + self._resource_spec = resource_spec + self._cluster = SSHCluster(resource_spec) + self._device_resolver = DeviceResolver(self._cluster) + + @property + def replicas(self): + """Return the list of replicas in the format of TF device string, e.g. 
job:worker/task:0/device:gpu:0."""
+        device_strs = [k for k, _ in self._resource_spec.devices]
+        return self._device_resolver.resolve_to_device_str(device_strs)
+
+    @property
+    def gpu_replicas(self):
+        """
+        Return the list of GPU replicas in the format of TF device string, e.g. job:worker/task:0/device:gpu:0.
+
+        Returns:
+            List(string)
+        """
+        # device_str is autodist device string, e.g. 192.168.0.1:CPU:0
+        device_strs = [k for k, _ in self._resource_spec.gpu_devices]
+        return self._device_resolver.resolve_to_device_str(device_strs)
+
+    @property
+    def cpu_replicas(self):
+        """
+        Return the list of CPU replicas in the format of TF device string, e.g. job:worker/task:0/device:cpu:0.
+
+        Returns:
+            List(string)
+        """
+        device_strs = [k for k, _ in self._resource_spec.cpu_devices]
+        return self._device_resolver.resolve_to_device_str(device_strs)
+
+    @property
+    def total_num_gpu_replica(self):
+        return len(self.gpu_replicas)
+
+    def num_local_gpu_replica(self, host):
+        """
+        Return the number of GPU replicas on a TF host address, e.g. '/job:worker/task:0/device:CPU:0'.
+
+        Args:
+            host: TF host address, e.g. '/job:worker/task:0/device:CPU:0'
+
+        Returns:
+            int
+        """
+        gpu_device_specs = {device_spec.DeviceSpecV2.from_string(d) for d in self.gpu_replicas}
+        num = 0
+        host_device_spec = device_spec.DeviceSpecV2.from_string(host)
+        for d in gpu_device_specs:
+            if self._cluster.get_address_from_task(d.job, d.task) \
+                    == self._cluster.get_address_from_task(host_device_spec.job, host_device_spec.task):
+                num += 1
+        return num
+
+    @property
+    def max_num_local_gpu_replica(self):
+        """Return the max number of local gpu replicas on the cluster."""
+        return max([self.num_local_gpu_replica(host) for host in self.cpu_replicas])
+
+    @cached_property
+    def p2p_bandwidth(self):
+        """Calculates P2P network bandwidth between nodes in the cluster.
+
+        Note that the result is NOT necessarily symmetric: bw[d1][d2] may differ from bw[d2][d1].
+        """
+        bw = {}  # key: (device1, device2)
+        devices = [device for device, _ in self._resource_spec.devices]
+        resolved_devices = self.replicas
+
+        for i in range(len(self.replicas)):
+            ip_i = devices[i].split(':')[0]
+            d_i = resolved_devices[i]
+            if d_i not in bw:
+                bw[d_i] = {}
+            for j in range(i, len(self.replicas)):
+                ip_j = devices[j].split(':')[0]
+                d_j = resolved_devices[j]
+                if d_j not in bw:
+                    bw[d_j] = {}
+                if ip_i != ip_j:
+                    bw[d_i][d_j] = GIGABITS * self._resource_spec.network_bandwidth[ip_i]
+                    bw[d_j][d_i] = GIGABITS * self._resource_spec.network_bandwidth[ip_j]
+                else:
+                    bw[d_i][d_j] = GIGABITS * GPU_TO_CPU_BANDWIDTH
+                    bw[d_j][d_i] = GIGABITS * GPU_TO_CPU_BANDWIDTH
+        return bw
+
+    @cached_property
+    def min_bandwidth(self):
+        """Return the minimum bandwidth (bottleneck) of all p2p connections on this cluster."""
+        return min([min(v.values()) for v in self.p2p_bandwidth.values()])
diff --git a/autodist/strategy/auto/sample_util.py b/autodist/strategy/auto/sample_util.py
deleted file mode 100644
index 2547304..0000000
--- a/autodist/strategy/auto/sample_util.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Copyright 2020 Petuum. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Sample utility functions.""" - -import numpy as np - - -def uniform_sample_by_choices(choices): - """ - Uniformly sample an option from a list of options. - - Args: - choices (list): a list of values to be sampled from. - - Returns: - choice: the sampled value. - - """ - assert choices - p = np.random.uniform() - t = 1.0 / len(choices) - sample = choices[0] - for i, c in enumerate(choices): - if p < t * (i+1): - sample = c - break - return sample - - -def binary_sample(boundary=0.5): - p = np.random.uniform() - if p < boundary: - return True - else: - return False - - -def sample_merge_group(num_group, num_candidates): - - def is_valid(assignment): - unique_assignment = np.unique(assignment) - if unique_assignment.shape[0] == num_group: - return True - return False - - assignment = np.random.randint(1, num_group+1, [num_candidates]) - while not is_valid(assignment): - assignment = np.random.randint(1, num_group+1, [num_candidates]) - return assignment diff --git a/autodist/strategy/auto/strategy_sampler.py b/autodist/strategy/auto/strategy_sampler.py index a317089..1ebb76e 100644 --- a/autodist/strategy/auto/strategy_sampler.py +++ b/autodist/strategy/auto/strategy_sampler.py @@ -16,170 +16,19 @@ from collections import OrderedDict -from enum import Enum -from tensorflow.python.framework import ops +import numpy as np -from autodist.kernel.common.utils import get_op_name, get_consumers +from autodist.kernel.common.utils import get_op_name from autodist.kernel.partitioner import PartitionerConfig from autodist.proto import strategy_pb2, synchronizers_pb2 -from autodist.strategy.base import Strategy, StrategyBuilder, byte_size_load_fn +from autodist.strategy.base import Strategy +from autodist.strategy.auto.item import VariableItem, PartItem from autodist.strategy.auto.ps_load_balancer import greedy_load_balancer, christy_load_balancer from autodist.strategy.auto.ar_group_assigner import chunk_group_assigner, christy_group_assigner, \ ordered_balanced_group_assigner -from autodist.strategy.auto import sample_util from autodist.const import MAX_INT32 -class VarType(Enum): - SPARSE = 0 - DENSE = 1 - - -class VariableHelper: - """Helper class to include meta information about a variable.""" - def __init__(self, var, graph_item): - self.var = var - self.graph_item = graph_item - self._var_op_name = get_op_name(var.name) - self._grad = graph_item.var_op_name_to_grad_info[self._var_op_name][0] - - @property - def var_type(self): - """ - Return the type of the variable (VarType.SPARSE or VarType.DENSE). - - Returns: - VarType - """ - return VarType.DENSE if isinstance(self._grad, ops.Tensor) else VarType.SPARSE - - @property - def is_sparse(self): - """ - Return whether the variable is sparse. - - Returns: - Bool - """ - return True if self.var_type == VarType.SPARSE else False - - @property - def is_embedding(self): - """ - Return whether the variable corresponds to an embedding. - - Returns: - Bool - """ - # TODO (Hao): better way to determine is_embedding? - for op in get_consumers(self.var.op): - if op.type == "ResourceGather": - return True - return False - - @property - def shape(self): - """ - Return the shape of the variable, or None if it does not emit a tensor (e.g. scalar). 
- - Returns: - List(int) - """ - if self.var.initial_value.shape.ndims: - return self.var.initial_value.shape.as_list() - else: - return None - - @property - def partitionable_axes(self): - """ - Return the list of available axes that are legitimate to partition along. - - Returns: - List(int) - """ - valid_axes = [] - - # scalar - if not self.shape: - return valid_axes - - # Sparse variable can only be partition along the 0th axis in current implementation. - if self.is_sparse or self.is_embedding: - valid_axes = [0] - return valid_axes - for idx, dim in enumerate(self.shape): - if dim > 1: - valid_axes.append(idx) - return valid_axes - - @property - def byte_size(self): - """ - Return the byte size of the variable. - - Returns: - float - """ - return float(byte_size_load_fn(self.var)) - - @property - def dtype(self): - """ - Return the dtype of the variable. - - Returns: - dtype - """ - return self.var.dtype - - -class PartHelper: - """Helper class to include meta information about a variable partition.""" - def __init__(self, part_idx, var, pc): - self.var = var - self.part_idx = part_idx - self.pc = pc - - @property - def shape(self): - """ - Return the shape of this partition. - - Returns: - List(int) - - """ - shape = self.var.initial_value.shape.as_list() - dim_size = shape[self.pc.axis] // self.pc.num_shards - extras = shape[self.pc.axis] % self.pc.num_shards - if self.part_idx < extras: - dim_size += 1 - shape[self.pc.axis] = dim_size - return shape - - @property - def var_shape(self): - """ - Return the shape of the original value this part belonged to. - - Returns: - List(int) - """ - return self.var.initial_value.shape.as_list() - - @property - def byte_size(self): - """ - Return the byte size of this partition. - - Returns: - float - """ - return float(byte_size_load_fn(self.var)) \ - * float(self.shape[self.pc.axis]) / float(self.var_shape[self.pc.axis]) - - class RandomStrategySampler(): """ Random Strategy Sampler. @@ -192,7 +41,7 @@ def __init__(self, space, heuristics): Args: space (dict): the strategy space that the random strategy should be drawn from. An example of the space - can be found at TODO(Hao). + can be found at heuristics (dict): heuristics used to guide the random sampling process. """ if not space: @@ -201,7 +50,6 @@ def __init__(self, space, heuristics): raise ValueError('Heuristic to guide strategy sampling is not provided.') self.space = space self.heuristics = heuristics - self.helpers = {} def build(self, graph_item, resource_spec): """Generate a randomized strategy given model and resource spec.""" @@ -210,41 +58,43 @@ def build(self, graph_item, resource_spec): # number of graph replica is equal to number of GPU devices expr.graph_config.replicas.extend([k for k, v in resource_spec.gpu_devices]) variables = graph_item.trainable_var_op_to_var.values() + name_to_item = OrderedDict() # Perform MCMC to generate each node configs node_config = [] for var in variables: - var_helper = VariableHelper(var, graph_item) - self.helpers[var_helper.var.name] = var_helper + var_item = VariableItem(var, graph_item) + name_to_item[var_item.name] = var_item node = strategy_pb2.Strategy.Node() - node.var_name = var_helper.var.name + node.var_name = var_item.name # Step 1: determine whether or not to partition # TODO(Hao): some factor is not considered, e.g. 
number of reduction_device_names - maybe_partition = sample_if_partition(var_helper, resource_spec, self.space, self.heuristics) + maybe_partition = sample_if_partition(var_item, resource_spec, self.space, self.heuristics) # Step 2.1: if not partition, sample a synchronizer type for it if not maybe_partition: # no partition - sample_var_synchronizer(node, var_helper, resource_spec, self.space) + sample_var_synchronizer(node, var_item, resource_spec, self.space) else: # Step 2.2: else partition # Step 2.2.1: sample a partitioner config - pc = sample_partition_config(var_helper, resource_spec, self.space, self.heuristics) + pc = sample_partition_config(var_item, resource_spec, self.space, self.heuristics) node.partitioner = pc.partition_str # step 2.2.2: sample a synchronizer type for each partition parts = [] for i in range(pc.num_shards): part = strategy_pb2.Strategy.Node() + part_item = PartItem(var, graph_item, i, pc) part.var_name = '{}/part_{}:0'.format(get_op_name(var.name), i) - self.helpers[part.var_name] = PartHelper(i, var, pc) + name_to_item[part.var_name] = part_item parts.append(part) - sample_parts_synchronizers(parts, var_helper, resource_spec, self.space, self.heuristics) + sample_parts_synchronizers(parts, var_item, resource_spec, self.space, self.heuristics) node.part_config.extend(parts) node_config.append(node) # Step 3: Post-assign group or placement. - sample_group_and_reduction_destinations(node_config, resource_spec, self.helpers, self.heuristics) + sample_group_and_reduction_destinations(node_config, resource_spec, name_to_item, self.heuristics) expr.node_config.extend(node_config) self._reset() @@ -255,12 +105,12 @@ def _reset(self): self.helpers = {} -def sample_if_partition(var_helper, resource_spec, space, heuristics): +def sample_if_partition(var_item, resource_spec, space, heuristics): """ Sample a bool value determining whether to partition a variable or not. Args: - var_helper: the variable helper corresponded to the variable of interest. + var_item: the variable item. resource_spec: the target cluster spec. space: the space argument controlling where to sample from. heuristics: the heuristics argument guiding the sampling process. @@ -275,9 +125,9 @@ def sample_if_partition(var_helper, resource_spec, space, heuristics): return False # intersection of variable's partitonable axis and global constraints - if var_helper.partitionable_axis: - if space['partitionable_axis']: - a = set(var_helper.partitionable_axis) & set(space['partitionable_axis']) + if var_item.partitionable_axes: + if space['partitionable_axes']: + a = set(var_item.partitionable_axes) & set(space['partitionable_axes']) if len(a) < 1: return False else: @@ -286,19 +136,19 @@ def sample_if_partition(var_helper, resource_spec, space, heuristics): # lower bound for abandoning partitioning lb = heuristics['maybe_partition_bounds'][0] ub = heuristics['maybe_partition_bounds'][1] - if var_helper.byte_size <= lb: + if var_item.byte_size <= lb: return False - if var_helper.byte_size >= ub: + if var_item.byte_size >= ub: return True assert (len(space['maybe_partition']) == 2) if heuristics['maybe_partition_by_size']: # By variable size -- a large variable has a higher chance to be partitioned # TODO (Hao): MAX_INT32 is too large, reconsider later... 
- chance = float(var_helper.byte_size - lb) / float(ub - lb) - return sample_util.binary_sample(boundary=chance) + chance = float(var_item.byte_size - lb) / float(ub - lb) + return binary_sample(boundary=chance) else: - return sample_util.uniform_sample_by_choices(space['maybe_partition']) + return uniform_sample_by_choices(space['maybe_partition']) def sample_var_synchronizer(node, var_helper, resource_spec, space): @@ -314,7 +164,7 @@ def sample_var_synchronizer(node, var_helper, resource_spec, space): """ # We ALWAYS use PS for sparse variables synchronizer_type = 'PS' if var_helper.var_type == VarType.SPARSE \ - else sample_util.uniform_sample_by_choices(space['synchronizer_types']) + else uniform_sample_by_choices(space['synchronizer_types']) if synchronizer_type == 'PS': node.PSSynchronizer.sync = True # we don't consider async at this moment node.PSSynchronizer.staleness = 0 @@ -345,10 +195,10 @@ def sample_parts_synchronizers(parts, var_helper, resource_spec, space, heuristi synchronizer_types = ['PS'] * len(parts) else: if heuristics['same_synchronizer_for_parts']: - type = sample_util.uniform_sample_by_choices(space['synchronizer_types']) + type = uniform_sample_by_choices(space['synchronizer_types']) synchronizer_types = [type] * len(parts) else: - synchronizer_types = [sample_util.uniform_sample_by_choices(space['synchronizer_types']) + synchronizer_types = [uniform_sample_by_choices(space['synchronizer_types']) for part in parts] for i, part in enumerate(parts): if synchronizer_types[i] == 'PS': @@ -378,13 +228,13 @@ def sample_partition_config(var_helper, resource_spec, space, heuristics): """ # Arion only support partitioning along one axis -- we first sample a partition axis, # then sample the number of partitions along that axis, and obtain the partition config. - assert len(var_helper.partitionable_axis) > 0, 'No partition axis available' + assert len(var_helper.partitionable_axes) > 0, 'No partition axis available' # sample partition axis # TODO(Hao): some heursitics here available? 
- valid_axis = var_helper.partitionable_axis - if space['partitionable_axis']: - valid_axis = list(set(valid_axis) & set(space['partitionable_axis'])) - partition_axis = sample_util.uniform_sample_by_choices(valid_axis) + valid_axis = var_helper.partitionable_axes + if space['partitionable_axes']: + valid_axis = list(set(valid_axis) & set(space['partitionable_axes'])) + partition_axis = uniform_sample_by_choices(valid_axis) # sample how many partition to go num_nodes = resource_spec.num_cpus @@ -405,7 +255,7 @@ def sample_partition_config(var_helper, resource_spec, space, heuristics): raise ValueError('unseen num_partition_bounds config') # sample from [min_shards, max_shards] - num_shards = sample_util.uniform_sample_by_choices(range(min_shards, max_shards + 1)) + num_shards = uniform_sample_by_choices(list(range(min_shards, max_shards + 1))) # construct a PartitionerConfig (pc) partition_list = [1] * len(var_helper.shape) @@ -431,7 +281,7 @@ def sample_if_local_replication(local_replication_space, resource_spec): if resource_spec.num_gpus <= resource_spec.num_cpus: # meaning every machine has at most 1 GPU return False - return sample_util.uniform_sample_by_choices(local_replication_space) + return uniform_sample_by_choices(local_replication_space) def sample_ar_compressor(compressor_space): @@ -446,7 +296,7 @@ def sample_ar_compressor(compressor_space): Returns: """ # TODO(Hao): try to use all four options - return sample_util.uniform_sample_by_choices(compressor_space) + return uniform_sample_by_choices(compressor_space) def sample_group_and_reduction_destinations(node_config, resource_spec, helpers, heuristics): @@ -508,7 +358,7 @@ def sample_ps_reduction_destinations(node_config, ps_shards, resource_spec, help if not load_balancer: destinations = {} for shard_name in ps_shards: - destinations[shard_name] = sample_util.uniform_sample_by_choices(reduction_device_names) + destinations[shard_name] = uniform_sample_by_choices(reduction_device_names) elif load_balancer == 'greedy': destinations = greedy_load_balancer(ps_shards, resource_spec, helpers) elif load_balancer == 'christy': @@ -576,7 +426,7 @@ def sample_ar_groups(node_config, ar_shards, helpers, heuristics): assert chunk_size_or_num_group > 0, "chunk_size or num_groups need to > 1..." if merge_scheme in ['random', None]: - tmp_assignments = sample_util.sample_merge_group(chunk_size_or_num_group, len(ar_shards)) + tmp_assignments = sample_merge_group(chunk_size_or_num_group, len(ar_shards)) group_assignments = OrderedDict() for i, shard_name in enumerate(ar_shards): group_assignments[shard_name] = tmp_assignments[i] @@ -613,7 +463,7 @@ def sample_num_ar_groups(ar_shards, lb, ub): """ min_num_group = max(1, lb) max_num_group = min(len(ar_shards), ub) - num_group = sample_util.uniform_sample_by_choices(list(range(min_num_group, max_num_group + 1))) + num_group = uniform_sample_by_choices(list(range(min_num_group, max_num_group + 1))) return num_group @@ -627,7 +477,7 @@ def sample_chunk_size(num_ar_shards): Returns: """ - chunk_size = sample_util.uniform_sample_by_choices(list(range(1, num_ar_shards + 1))) + chunk_size = uniform_sample_by_choices(list(range(1, num_ar_shards + 1))) return chunk_size @@ -654,12 +504,56 @@ def assign_ar_group(node_config, ar_shards): synchronizer.group = ar_shards[node.var_name][1] +def uniform_sample_by_choices(choices): + """ + Uniformly sample an option from a list of options. + + Args: + choices (list): a list of values to be sampled from. + + Returns: + choice: the sampled value. 
+ + """ + assert choices + p = np.random.uniform() + t = 1.0 / len(choices) + sample = choices[0] + for i, c in enumerate(choices): + if p < t * (i+1): + sample = c + break + return sample + + +def binary_sample(boundary=0.5): + p = np.random.uniform() + if p < boundary: + return True + else: + return False + + +def sample_merge_group(num_group, num_candidates): + + def is_valid(assignment): + unique_assignment = np.unique(assignment) + if unique_assignment.shape[0] == num_group: + return True + return False + + assignment = np.random.randint(1, num_group+1, [num_candidates]) + while not is_valid(assignment): + assignment = np.random.randint(1, num_group+1, [num_candidates]) + return assignment + + default_space = { 'synchronizer_types': ['PS', 'AR'], 'maybe_partition': [True, False], 'compressor': ['HorovodCompressor', 'NoneCompressor', 'HorovodCompressorEF'], 'local_replication': [False], - 'partitionable_axis': [] + 'partitionable_axes': [] } diff --git a/autodist/strategy/auto_strategy.py b/autodist/strategy/auto_strategy.py index 5d6b78f..354d62d 100644 --- a/autodist/strategy/auto_strategy.py +++ b/autodist/strategy/auto_strategy.py @@ -52,4 +52,9 @@ def __init__(self): ) def build(self, graph_item, resource_spec): - return \ No newline at end of file + candidates = self.propose_n(graph_item, resource_spec, self._num_proposals) + + # Assess all candidates and simply pick the highest-scored one + features, scores = self._simulator.inference(candidates) + best_index = scores.index(min(scores)) + return candidates[best_index] diff --git a/examples/linear_regression.py b/examples/linear_regression.py index d14a3f8..4145626 100644 --- a/examples/linear_regression.py +++ b/examples/linear_regression.py @@ -7,12 +7,14 @@ from autodist import AutoDist from autodist.strategy import PS, PSLoadBalancing, PartitionedPS, AllReduce, Parallax +from autodist.strategy import AutoStrategy resource_spec_file = os.path.join(os.path.dirname(__file__), 'resource_spec.yml') def main(_): - autodist = AutoDist(resource_spec_file, AllReduce(128)) + # autodist = AutoDist(resource_spec_file, AllReduce(128)) + autodist = AutoDist(resource_spec_file, AutoStrategy()) TRUE_W = 3.0 TRUE_b = 2.0 diff --git a/tests/test_simulator.py b/tests/test_simulator.py new file mode 100644 index 0000000..f2aaeb1 --- /dev/null +++ b/tests/test_simulator.py @@ -0,0 +1,27 @@ +from autodist.simulator.utils import _resolve_device_address +from autodist.resource_spec import ResourceSpec +from autodist.cluster import SSHCluster +from autodist.kernel.device.resolver import DeviceResolver +from autodist.simulator.base import SimulatorBase +from autodist.simulator.utils import _resolve_device_address + +# def test_resolve_device_address(): +# resource_spec_file = '/home/hao.zhang/project/pycharm/autodist/examples/resource_spec.yml' +# rs = ResourceSpec(resource_spec_file) +# cluster = SSHCluster(rs) +# resolver = DeviceResolver(cluster) +# return True + +def test_resolve(): + resource_spec_file = '/home/hao.zhang/project/pycharm/autodist/examples/resource_spec.yml' + rs = ResourceSpec(resource_spec_file) + cluster = SSHCluster(rs) + resolver = DeviceResolver(cluster) + SimulatorBase.network_bandwidth(rs, resolver) + devices = [device for device, _ in rs.devices] + + resolved_devices_1 = [_resolve_device_address(device, resolver) for device, _ in rs.devices] + devices = resolver.resolve_to_device_str(devices) + + for d1, d2 in zip(resolved_devices_1, devices): + assert d1 == d2 \ No newline at end of file From 
15e491f1d1d5498cd00ddd82a15a1f63afae2ed0 Mon Sep 17 00:00:00 2001 From: Hao Zhang Date: Tue, 28 Jul 2020 02:45:26 -0400 Subject: [PATCH 07/11] update predefined simulator and linear simulator --- autodist/simulator/base.py | 11 +- autodist/simulator/linear_simulator.py | 374 +++++++++- autodist/simulator/predefined_simulator.py | 795 +++++++++++---------- autodist/strategy/auto/item.py | 7 +- 4 files changed, 814 insertions(+), 373 deletions(-) diff --git a/autodist/simulator/base.py b/autodist/simulator/base.py index 31e2d1a..19b965d 100644 --- a/autodist/simulator/base.py +++ b/autodist/simulator/base.py @@ -68,9 +68,10 @@ def simulate(self, strategy, graph_item=None, resource_spec=None, - checkpoint=None): + *args, + **kwargs): """ - Return simulated runtime cost given (Strategy, GraphItem, ResourceSpec) tuple. + Return simulated runtime cost given (strategy, graph_item, resource_spec) tuple. Args: strategy: @@ -84,8 +85,7 @@ def simulate(self, raise NotImplementedError() def inference(self, - features, - checkpoint=None): + features): """ Abstract method for simulator inference. @@ -117,9 +117,6 @@ def save_checkpoint(self, model, checkpoint): """ raise NotImplementedError() - def create_features(self, strategy: Strategy, resource_spec: ResourceSpec): - raise NotImplementedError() - def preprocess(self, strategy, graph_item=None, diff --git a/autodist/simulator/linear_simulator.py b/autodist/simulator/linear_simulator.py index 527d923..5dc2e6b 100644 --- a/autodist/simulator/linear_simulator.py +++ b/autodist/simulator/linear_simulator.py @@ -1,4 +1,4 @@ -# Copyright 2020 Petuum. All Rights Reserved. +# Copyright 2020 Petuum Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,10 +12,376 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Linear simulator.""" +"""Predefined simulator with linear model.""" +import pickle as pkl + +import tensorflow as tf +from tensorflow.python.eager import context + +from autodist.proto.synchronizers_pb2 import PSSynchronizer, AllReduceSynchronizer +from autodist.resource_spec import ResourceSpec from autodist.simulator.base import SimulatorBase +from autodist.simulator.utils import _resolved_devices_on_diff_machine, \ + get_dense_var_bits, get_sparse_var_bits +from autodist.strategy.base import Strategy + class LinearSimulator(SimulatorBase): - def __init__(self): - super(LinearSimulator, self).__init__() + """Simulates strategies for a given graph and resource spec.""" + + def __init__(self, + graph_item=None, + resource_spec=None, + batch_size=1, + seq_len=1, + get_coef=True, + checkpoint=None): + + super(PredefinedSimulator, self).__init__(original_graph_item_path=original_graph_item_path) + + print("It's using predefined simulator. 
batch_size_per_gpu is {}".format(batch_size))
+        self._fetches = None
+        self._batch_size_per_gpu = batch_size
+        self._seq_len = seq_len
+        self._get_coef = get_coef
+        self._checkpoint = checkpoint
+        self._weights = None
+        with context.eager_mode():
+            if self._checkpoint:
+                self._weights = self.load_checkpoint(self._checkpoint)
+
+    def simulate(self, strategy: Strategy, resource_spec: ResourceSpec, checkpoint=None):
+        """Return simulated runtime value."""
+        inputs = self.create_features(strategy, resource_spec)
+        with context.eager_mode():
+            cost = self.inference(inputs, checkpoint)
+        return cost
+
+    def inference(self, inputs, checkpoint=None):
+        if checkpoint is not None:
+            weights = self.load_checkpoint(checkpoint)
+        elif self._weights is not None:
+            weights = self._weights
+        else:
+            raise ValueError("No checkpoint provided in either initialization or inference.")
+
+        if not isinstance(inputs, tf.Tensor):
+            inputs = tf.reshape(tf.convert_to_tensor(inputs), [1, len(inputs)])
+
+        if len(weights) == 4:
+            W0, b0, W, b = weights
+            inputs = tf.nn.elu(tf.matmul(inputs, W0) + b0)
+            cost = tf.matmul(inputs, W) + b
+        elif len(weights) == 2:
+            W, b = weights
+            cost = tf.matmul(inputs, W) + b
+        else:
+            raise ValueError("Expected 2 or 4 weight arrays, got {}".format(len(weights)))
+        return cost
+
+    def load_checkpoint(self, checkpoint=None):
+        if checkpoint is None:
+            if self._checkpoint is not None:
+                checkpoint = self._checkpoint
+            else:
+                raise ValueError("checkpoint is None: {}".format(checkpoint))
+        self._weights = pkl.load(open(checkpoint, 'rb'))
+        # self._weights = json.load(open(checkpoint, 'r'))
+        print("Load checkpoint: ")
+        print(self._weights)
+        return self._weights
+
+    def save_checkpoint(self, model, checkpoint):
+        pkl.dump(model, open(checkpoint, 'wb'))
+        self._checkpoint = checkpoint
+        self._weights = model
+
+    def create_features_v0(self, strategy: Strategy, resource_spec: ResourceSpec):
+        var_sync_time, vars, resource = self.predefined_sync_time(strategy, resource_spec)
+
+        # Add up sync time per device to find the slowest server time. 
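+        # PS-synchronized variables are accumulated per server device (the busiest
+        # server dominates), while AllReduce-synchronized variables are kept per variable.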
+ feature_keys = ['transmission', 'network_overhead', 'gpu_kernel_memory_latency'] + device_ps_sync_time = {} + var_ar_sync_time = {} + for var_name, sync_time in var_sync_time.items(): + if isinstance(vars[var_name].synchronizer, PSSynchronizer): + device = vars[var_name].device + if device not in device_ps_sync_time: + device_ps_sync_time[device] = {key: 0.0 for key in feature_keys} + for key in feature_keys: + device_ps_sync_time[device][key] += sync_time[0][key] + sync_time[1][key] + + else: # AllReduce + if var_name not in var_ar_sync_time: + var_ar_sync_time[var_name] = {key: 0.0 for key in feature_keys} + for key in feature_keys: + var_ar_sync_time[var_name][key] += sync_time[key] + + max_device_ps_sync_time = {key: 0.0 for key in feature_keys} + sum_device_ps_sync_time = {key: 0.0 for key in feature_keys} + sum_var_ar_sync_time = {key: 0.0 for key in feature_keys} + for key in feature_keys: + max_device_ps_sync_time[key] = max([d[key] for d in device_ps_sync_time.values()] or [0.0]) + sum_device_ps_sync_time[key] = sum([d[key] for d in device_ps_sync_time.values()] or [0.0]) + sum_var_ar_sync_time[key] = sum([d[key] for d in var_ar_sync_time.values()] or [0.0]) + + feat = [max_device_ps_sync_time[key] for key in feature_keys] \ + + [sum_device_ps_sync_time[key] for key in feature_keys] \ + + [sum_var_ar_sync_time[key] for key in feature_keys] + + return feat + + def create_features(self, strategy: Strategy, resource_spec: ResourceSpec): + # var_sync_time, vars, resource = self.predefined_sync_time(strategy, resource_spec) + + vars, resource = self.preprocess(strategy=strategy, resource_spec=resource_spec) + + feature_keys = ['transmission', 'network_overhead', 'gpu_kernel_memory_latency'] + device_ps_sync_time = {} + group_ar_sync_time = {} + + for var_name, var in vars.items(): + if isinstance(var.synchronizer, PSSynchronizer): + sync_time = self.var_ps_time(var, resource) + device = vars[var_name].device + if device not in device_ps_sync_time: + device_ps_sync_time[device] = {key: 0.0 for key in feature_keys} + for key in feature_keys: + device_ps_sync_time[device][key] += sync_time[0][key] + sync_time[1][key] + elif isinstance(var.synchronizer, AllReduceSynchronizer): + sync_time = self.var_ar_time(var, resource) + var_group = sync_time['group'] + if var_group not in group_ar_sync_time: + group_ar_sync_time[var_group] = {key: 0.0 for key in feature_keys} + for key in feature_keys: + group_ar_sync_time[var_group][key] += sync_time[key] + else: + raise ValueError('{}'.format(type(var.synchronizer))) + + max_device_ps_sync_time = {key: 0.0 for key in feature_keys} + sum_device_ps_sync_time = {key: 0.0 for key in feature_keys} + max_group_ar_sync_time = {key: 0.0 for key in feature_keys} + sum_group_ar_sync_time = {key: 0.0 for key in feature_keys} + for key in feature_keys: + max_device_ps_sync_time[key] = max([d[key] for d in device_ps_sync_time.values()] or [0.0]) + sum_device_ps_sync_time[key] = sum([d[key] for d in device_ps_sync_time.values()] or [0.0]) + max_group_ar_sync_time[key] = max([d[key] for d in group_ar_sync_time.values()] or [0.0]) + sum_group_ar_sync_time[key] = sum([d[key] for d in group_ar_sync_time.values()] or [0.0]) + + feat = [max_device_ps_sync_time[key] for key in feature_keys] \ + + [sum_device_ps_sync_time[key] for key in feature_keys] \ + + [max_group_ar_sync_time[key] for key in feature_keys] \ + + [sum_group_ar_sync_time[key] for key in feature_keys] + + return feat + + def predefined_sync_time(self, strategy, resource_spec): + """ graph_item: 
transformed graph item """ + vars, resource = self.preprocess(strategy=strategy, resource_spec=resource_spec) + # Compute synchronization time for every var + var_sync_time = {} + for var_name, var in vars.items(): + if isinstance(var.synchronizer, PSSynchronizer): + var_sync_time[var_name] = self.var_ps_time(var, resource) + elif isinstance(var.synchronizer, AllReduceSynchronizer): + var_sync_time[var_name] = self.var_ar_time(var, resource) + else: + raise ValueError('{}'.format(type(var.synchronizer))) + return var_sync_time, vars, resource + + def var_ps_time(self, var, resource, network_overhead=0.0, gpu_kernel_memory_latency=0.0): + """Compute synchronization time of a variable in PS strategy.""" + def _helper(worker_list, worker_num_replicas=None): + if worker_num_replicas is None: + worker_num_replicas = [1.0] * len(worker_list) + + this_server_time = 0 + # network transfer: sum up all workers time. equals to the time cost of this server. + # TODO(Hao): didn't consider any parallelization among partitions + for k, worker in enumerate(worker_list): + if _resolved_devices_on_diff_machine(var.device, worker): + if var.is_sparse: + this_worker_size = get_sparse_var_bits(var_size_to_transfer) * worker_num_replicas[k] + else: + this_worker_size = get_dense_var_bits(var_size_to_transfer, var.dtype) + this_server_time += this_worker_size / resource.network_bandwidth[var.device][worker] + + if self._get_coef: + return { + 'transmission': this_server_time, + 'network_overhead': len(worker_list), + 'gpu_kernel_memory_latency': resource.max_num_local_replica, + 'constant': 1.0, + # possible affecting factors. + 'var_name': var.name, + 'strategy': 'ps', + 'local_proxy': var.synchronizer.local_replication, + 'is_sparse': var.is_sparse, + 'size_to_transfer': var_size_to_transfer, + 'dtype': str(var.dtype), + # 'server_list': [partition.to_dict() for partition in server_list], + 'worker_list': worker_list, + 'cpu_worker_list': resource.cpu_worker_list, + 'gpu_worker_list': resource.gpu_worker_list, + 'worker_num_replicas': worker_num_replicas, + 'max_num_local_replica': resource.max_num_local_replica, + 'is_ps': True, + } + else: + return this_server_time + len(worker_list) * network_overhead + \ + gpu_kernel_memory_latency * resource.max_num_local_replica + + var_size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, + seq_len=self._seq_len) + + if var.is_sparse: + send_time = _helper(resource.cpu_worker_list, worker_num_replicas=resource.worker_num_replicas) + receive_time = _helper(resource.gpu_worker_list) + else: + send_time = _helper(resource.cpu_worker_list) + if var.synchronizer.local_replication: + receive_time = _helper(resource.cpu_worker_list) + else: + receive_time = _helper(resource.gpu_worker_list) + + return send_time, receive_time + + def var_ar_time(self, var, resource, network_overhead=0.0, gpu_kernel_memory_latency=0.0): + """Compute synchronization time of a variable in AR strategy.""" + worker_list = resource.cpu_worker_list + num_workers = len(worker_list) + min_bandwidth = None + for i in range(num_workers): + for j in range(i, num_workers): + if min_bandwidth is None: + min_bandwidth = resource.network_bandwidth[worker_list[j]][worker_list[i]] + else: + min_bandwidth = min(min_bandwidth, resource.network_bandwidth[worker_list[j]][worker_list[i]]) + + # Compressor + if var.compressor == "PowerSGDCompressor" or var.compressor == 3: + rank = 10 # currently using default value. So hardcode here. 
# todo: confirm + # assume var must be a dense variable. + og_shape = var.shape + ndims = len(og_shape) + if ndims <= 1: # no compress + size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, + seq_len=self._seq_len) + else: + if ndims > 2: + n = og_shape[0] + m = 1 + for s in og_shape[1:]: + m *= s # tensor's shape (n, m) + else: + n, m = og_shape[0], og_shape[1] + size_to_transfer = n * rank + m * rank + dtype = tf.float32 + elif var.compressor == "HorovodCompressorEF" or var.compressor == "HorovodCompressor" \ + or var.compressor == 2 or var.compressor == 1: + size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, + seq_len=self._seq_len) + dtype = tf.float32 + elif var.compressor == "NoneCompressor" or var.compressor == 0: + size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, + seq_len=self._seq_len) + dtype = var.dtype + else: + raise ValueError('Compressor does not exist: {}'.format(var.compressor)) + + # todo: chunk_size + # AllReduce communication time + # time = 2 * (num_workers - 1) * get_dense_var_bits(size_to_transfer, dtype) / (min_bandwidth * num_workers) + time = get_dense_var_bits(size_to_transfer, dtype) / min_bandwidth + + if self._get_coef: + return { + 'transmission': time, + 'network_overhead': 1, # len(worker_list), + 'gpu_kernel_memory_latency': resource.max_num_local_replica, + 'constant': 1.0, + # possible affecting factors. + 'var_name': var.name, + 'group': var.synchronizer.group, + 'strategy': 'allreduce', + 'is_sparse': False, + # 'chunk_size': chunk_size, + 'spec': 'NCCL', # default + 'compressor': var.compressor, + 'worker_list': worker_list, + 'num_workers': num_workers, + 'size_to_transfer': size_to_transfer, + 'dtype': str(dtype), + 'min_bandwidth': min_bandwidth, + 'max_num_local_replica': resource.max_num_local_replica, + 'is_ps': False, + } + else: + return time + network_overhead * len(worker_list) \ + + gpu_kernel_memory_latency * resource.max_num_local_replica + + + + # @staticmethod + # def var_ps_time(var_name, is_sparse, local_proxy, server_list, cpu_worker_list, gpu_worker_list, + # max_num_local_replica, worker_num_replicas, network_bandwidth, get_coef, + # network_overhead=0.0, gpu_kernel_memory_latency=0.0): + # """Compute synchrinzation time of a variable in PS strategy.""" + # + # def _helper(worker_list, worker_num_replicas=None): + # if worker_num_replicas is None: + # worker_num_replicas = [1.0] * len(worker_list) + # # Compute the slowest server + # slowest_server_time = 0 + # for j, server in enumerate(server_list): + # if server.size_to_transfer == 0: + # continue + # # network transfer: sum up all workers time. equals to the time cost of this server. + # this_server_time = 0 + # for k, worker in enumerate(worker_list): + # if _resolved_devices_on_diff_machine(server.device, worker): + # if is_sparse: + # this_worker_size = get_sparse_var_bits(server.size_to_transfer) * worker_num_replicas[k] + # else: + # this_worker_size = get_dense_var_bits(server.size_to_transfer, server.dtype) + # this_server_time += this_worker_size / network_bandwidth[server.device][worker] + # slowest_server_time = max(slowest_server_time, this_server_time) + # + # if get_coef: + # return { + # 'transmission': slowest_server_time, + # 'network_overhead': len(worker_list), + # 'gpu_kernel_memory_latency': max_num_local_replica, + # 'constant': 1.0, + # # possible affecting factors. 
+ # 'var_name': var_name, + # 'strategy': 'ps', + # 'local_proxy': local_proxy, + # 'is_sparse': is_sparse, + # 'server_list': [partition.to_dict() for partition in server_list], + # 'worker_list': worker_list, + # 'cpu_worker_list': cpu_worker_list, + # 'gpu_worker_list': gpu_worker_list, + # 'worker_num_replicas': worker_num_replicas, + # 'max_num_local_replica': max_num_local_replica, + # } + # else: + # return slowest_server_time + len(worker_list) * network_overhead + \ + # gpu_kernel_memory_latency * max_num_local_replica + # + # if is_sparse: + # send_time = _helper(cpu_worker_list, worker_num_replicas=worker_num_replicas) + # receive_time = _helper(gpu_worker_list) + # else: + # send_time = _helper(cpu_worker_list) + # if local_proxy: + # receive_time = _helper(cpu_worker_list) + # else: + # receive_time = _helper(gpu_worker_list) + # + # if get_coef: + # # return {key: send_time[key]+receive_time[key] for key in send_time.keys()} + # return send_time, receive_time + # else: + # return send_time, receive_time diff --git a/autodist/simulator/predefined_simulator.py b/autodist/simulator/predefined_simulator.py index a419126..6b141c8 100644 --- a/autodist/simulator/predefined_simulator.py +++ b/autodist/simulator/predefined_simulator.py @@ -1,4 +1,4 @@ -# Copyright 2020 Petuum. All Rights Reserved. +# Copyright 2020 Petuum Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ """Predefined simulator with linear model.""" import pickle as pkl +from collections import OrderedDict import tensorflow as tf from tensorflow.python.eager import context @@ -23,365 +24,441 @@ from autodist.resource_spec import ResourceSpec from autodist.simulator.base import SimulatorBase from autodist.simulator.utils import _resolved_devices_on_diff_machine, \ - get_dense_var_bits, get_sparse_var_bits + get_dense_var_bits, get_sparse_var_bits from autodist.strategy.base import Strategy +from autodist.utils import logging class PredefinedSimulator(SimulatorBase): - """Simulates strategies for a given graph and resource spec.""" - - def __init__(self, - graph_item=None, - resource_spec=None, - batch_size=1, - seq_len=1, - get_coef=True, - checkpoint=None): - - super(PredefinedSimulator, self).__init__(original_graph_item_path=original_graph_item_path) - - print("It's using predefined simulator. 
batch_size_per_gpu is {}".format(batch_size)) - self._fetches = fetches - self._batch_size_per_gpu = batch_size - self._seq_len = seq_len - self._get_coef = get_coef - self._checkpoint = checkpoint - self._weights = None - with context.eager_mode(): - if self._checkpoint: - self._weights = self.load_checkpoint(self._checkpoint) - - def simulate(self, strategy: Strategy, resource_spec: ResourceSpec, checkpoint=None): - """Return simulated runtime value.""" - inputs = self.create_features(strategy, resource_spec) - with context.eager_mode(): - cost = self.inference(inputs, checkpoint) - return cost - - def inference(self, inputs, checkpoint=None): - if checkpoint is not None: - weights = self.load_checkpoint(checkpoint) - elif self._weights is not None: - weights = self._weights - else: - raise ValueError("No checkpoint provided in either initialization or inference.") - - if not isinstance(inputs, tf.Tensor): - inputs = tf.reshape(tf.convert_to_tensor(inputs), [1, len(inputs)]) - - if len(weights) == 4: - W0, b0, W, b = weights - inputs = tf.nn.elu(tf.matmul(inputs, W0) + b0) - cost = tf.matmul(inputs, W) + b - elif len(weights) == 2: - W, b = weights - cost = tf.matmul(inputs, W) + b - else: - raise ValueError - return cost - - def load_checkpoint(self, checkpoint=None): - if checkpoint is None: - if self._checkpoint is not None: - checkpoint = self._checkpoint - else: - raise ValueError("checkpoint is None: {}".format(checkpoint)) - self._weights = pkl.load(open(checkpoint, 'rb')) - # self._weights = json.load(open(checkpoint, 'r')) - print("Load checkpoint: ") - print(self._weights) - return self._weights - - def save_checkpoint(self, model, checkpoint): - pkl.dump(model, open(checkpoint, 'wb')) - self._checkpoint = checkpoint - self._weights = model - - def create_features_v0(self, strategy: Strategy, resource_spec: ResourceSpec): - var_sync_time, vars, resource = self.predefined_sync_time(strategy, resource_spec) - - # Add up sync time per device to find the slowest server time. 
- feature_keys = ['transmission', 'network_overhead', 'gpu_kernel_memory_latency'] - device_ps_sync_time = {} - var_ar_sync_time = {} - for var_name, sync_time in var_sync_time.items(): - if isinstance(vars[var_name].synchronizer, PSSynchronizer): - device = vars[var_name].device - if device not in device_ps_sync_time: - device_ps_sync_time[device] = {key: 0.0 for key in feature_keys} - for key in feature_keys: - device_ps_sync_time[device][key] += sync_time[0][key] + sync_time[1][key] - - else: # AllReduce - if var_name not in var_ar_sync_time: - var_ar_sync_time[var_name] = {key: 0.0 for key in feature_keys} - for key in feature_keys: - var_ar_sync_time[var_name][key] += sync_time[key] - - max_device_ps_sync_time = {key: 0.0 for key in feature_keys} - sum_device_ps_sync_time = {key: 0.0 for key in feature_keys} - sum_var_ar_sync_time = {key: 0.0 for key in feature_keys} - for key in feature_keys: - max_device_ps_sync_time[key] = max([d[key] for d in device_ps_sync_time.values()] or [0.0]) - sum_device_ps_sync_time[key] = sum([d[key] for d in device_ps_sync_time.values()] or [0.0]) - sum_var_ar_sync_time[key] = sum([d[key] for d in var_ar_sync_time.values()] or [0.0]) - - feat = [max_device_ps_sync_time[key] for key in feature_keys] \ - + [sum_device_ps_sync_time[key] for key in feature_keys] \ - + [sum_var_ar_sync_time[key] for key in feature_keys] - - return feat - - def create_features(self, strategy: Strategy, resource_spec: ResourceSpec): - # var_sync_time, vars, resource = self.predefined_sync_time(strategy, resource_spec) - - vars, resource = self.preprocess(strategy=strategy, resource_spec=resource_spec) - - feature_keys = ['transmission', 'network_overhead', 'gpu_kernel_memory_latency'] - device_ps_sync_time = {} - group_ar_sync_time = {} - - for var_name, var in vars.items(): - if isinstance(var.synchronizer, PSSynchronizer): - sync_time = self.var_ps_time(var, resource) - device = vars[var_name].device - if device not in device_ps_sync_time: - device_ps_sync_time[device] = {key: 0.0 for key in feature_keys} - for key in feature_keys: - device_ps_sync_time[device][key] += sync_time[0][key] + sync_time[1][key] - elif isinstance(var.synchronizer, AllReduceSynchronizer): - sync_time = self.var_ar_time(var, resource) - var_group = sync_time['group'] - if var_group not in group_ar_sync_time: - group_ar_sync_time[var_group] = {key: 0.0 for key in feature_keys} - for key in feature_keys: - group_ar_sync_time[var_group][key] += sync_time[key] - else: - raise ValueError('{}'.format(type(var.synchronizer))) - - max_device_ps_sync_time = {key: 0.0 for key in feature_keys} - sum_device_ps_sync_time = {key: 0.0 for key in feature_keys} - max_group_ar_sync_time = {key: 0.0 for key in feature_keys} - sum_group_ar_sync_time = {key: 0.0 for key in feature_keys} - for key in feature_keys: - max_device_ps_sync_time[key] = max([d[key] for d in device_ps_sync_time.values()] or [0.0]) - sum_device_ps_sync_time[key] = sum([d[key] for d in device_ps_sync_time.values()] or [0.0]) - max_group_ar_sync_time[key] = max([d[key] for d in group_ar_sync_time.values()] or [0.0]) - sum_group_ar_sync_time[key] = sum([d[key] for d in group_ar_sync_time.values()] or [0.0]) - - feat = [max_device_ps_sync_time[key] for key in feature_keys] \ - + [sum_device_ps_sync_time[key] for key in feature_keys] \ - + [max_group_ar_sync_time[key] for key in feature_keys] \ - + [sum_group_ar_sync_time[key] for key in feature_keys] - - return feat - - def predefined_sync_time(self, strategy, resource_spec): - """ graph_item: 
transformed graph item """ - vars, resource = self.preprocess(strategy=strategy, resource_spec=resource_spec) - # Compute synchronization time for every var - var_sync_time = {} - for var_name, var in vars.items(): - if isinstance(var.synchronizer, PSSynchronizer): - var_sync_time[var_name] = self.var_ps_time(var, resource) - elif isinstance(var.synchronizer, AllReduceSynchronizer): - var_sync_time[var_name] = self.var_ar_time(var, resource) - else: - raise ValueError('{}'.format(type(var.synchronizer))) - return var_sync_time, vars, resource - - def var_ps_time(self, var, resource, network_overhead=0.0, gpu_kernel_memory_latency=0.0): - """Compute synchronization time of a variable in PS strategy.""" - def _helper(worker_list, worker_num_replicas=None): - if worker_num_replicas is None: - worker_num_replicas = [1.0] * len(worker_list) - - this_server_time = 0 - # network transfer: sum up all workers time. equals to the time cost of this server. - # TODO(Hao): didn't consider any parallelization among partitions - for k, worker in enumerate(worker_list): - if _resolved_devices_on_diff_machine(var.device, worker): - if var.is_sparse: - this_worker_size = get_sparse_var_bits(var_size_to_transfer) * worker_num_replicas[k] - else: - this_worker_size = get_dense_var_bits(var_size_to_transfer, var.dtype) - this_server_time += this_worker_size / resource.network_bandwidth[var.device][worker] - - if self._get_coef: - return { - 'transmission': this_server_time, - 'network_overhead': len(worker_list), - 'gpu_kernel_memory_latency': resource.max_num_local_replica, - 'constant': 1.0, - # possible affecting factors. - 'var_name': var.name, - 'strategy': 'ps', - 'local_proxy': var.synchronizer.local_replication, - 'is_sparse': var.is_sparse, - 'size_to_transfer': var_size_to_transfer, - 'dtype': str(var.dtype), - # 'server_list': [partition.to_dict() for partition in server_list], - 'worker_list': worker_list, - 'cpu_worker_list': resource.cpu_worker_list, - 'gpu_worker_list': resource.gpu_worker_list, - 'worker_num_replicas': worker_num_replicas, - 'max_num_local_replica': resource.max_num_local_replica, - 'is_ps': True, - } - else: - return this_server_time + len(worker_list) * network_overhead + \ - gpu_kernel_memory_latency * resource.max_num_local_replica - - var_size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, - seq_len=self._seq_len) - - if var.is_sparse: - send_time = _helper(resource.cpu_worker_list, worker_num_replicas=resource.worker_num_replicas) - receive_time = _helper(resource.gpu_worker_list) - else: - send_time = _helper(resource.cpu_worker_list) - if var.synchronizer.local_replication: - receive_time = _helper(resource.cpu_worker_list) - else: - receive_time = _helper(resource.gpu_worker_list) - - return send_time, receive_time - - def var_ar_time(self, var, resource, network_overhead=0.0, gpu_kernel_memory_latency=0.0): - """Compute synchronization time of a variable in AR strategy.""" - worker_list = resource.cpu_worker_list - num_workers = len(worker_list) - min_bandwidth = None - for i in range(num_workers): - for j in range(i, num_workers): - if min_bandwidth is None: - min_bandwidth = resource.network_bandwidth[worker_list[j]][worker_list[i]] - else: - min_bandwidth = min(min_bandwidth, resource.network_bandwidth[worker_list[j]][worker_list[i]]) - - # Compressor - if var.compressor == "PowerSGDCompressor" or var.compressor == 3: - rank = 10 # currently using default value. So hardcode here. 
# todo: confirm - # assume var must be a dense variable. - og_shape = var.shape - ndims = len(og_shape) - if ndims <= 1: # no compress - size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, - seq_len=self._seq_len) - else: - if ndims > 2: - n = og_shape[0] - m = 1 - for s in og_shape[1:]: - m *= s # tensor's shape (n, m) - else: - n, m = og_shape[0], og_shape[1] - size_to_transfer = n * rank + m * rank - dtype = tf.float32 - elif var.compressor == "HorovodCompressorEF" or var.compressor == "HorovodCompressor" \ - or var.compressor == 2 or var.compressor == 1: - size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, - seq_len=self._seq_len) - dtype = tf.float32 - elif var.compressor == "NoneCompressor" or var.compressor == 0: - size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, - seq_len=self._seq_len) - dtype = var.dtype - else: - raise ValueError('Compressor does not exist: {}'.format(var.compressor)) - - # todo: chunk_size - # AllReduce communication time - # time = 2 * (num_workers - 1) * get_dense_var_bits(size_to_transfer, dtype) / (min_bandwidth * num_workers) - time = get_dense_var_bits(size_to_transfer, dtype) / min_bandwidth - - if self._get_coef: - return { - 'transmission': time, - 'network_overhead': 1, # len(worker_list), - 'gpu_kernel_memory_latency': resource.max_num_local_replica, - 'constant': 1.0, - # possible affecting factors. - 'var_name': var.name, - 'group': var.synchronizer.group, - 'strategy': 'allreduce', - 'is_sparse': False, - # 'chunk_size': chunk_size, - 'spec': 'NCCL', # default - 'compressor': var.compressor, - 'worker_list': worker_list, - 'num_workers': num_workers, - 'size_to_transfer': size_to_transfer, - 'dtype': str(dtype), - 'min_bandwidth': min_bandwidth, - 'max_num_local_replica': resource.max_num_local_replica, - 'is_ps': False, - } - else: - return time + network_overhead * len(worker_list) \ - + gpu_kernel_memory_latency * resource.max_num_local_replica - - - - # @staticmethod - # def var_ps_time(var_name, is_sparse, local_proxy, server_list, cpu_worker_list, gpu_worker_list, - # max_num_local_replica, worker_num_replicas, network_bandwidth, get_coef, - # network_overhead=0.0, gpu_kernel_memory_latency=0.0): - # """Compute synchrinzation time of a variable in PS strategy.""" - # - # def _helper(worker_list, worker_num_replicas=None): - # if worker_num_replicas is None: - # worker_num_replicas = [1.0] * len(worker_list) - # # Compute the slowest server - # slowest_server_time = 0 - # for j, server in enumerate(server_list): - # if server.size_to_transfer == 0: - # continue - # # network transfer: sum up all workers time. equals to the time cost of this server. - # this_server_time = 0 - # for k, worker in enumerate(worker_list): - # if _resolved_devices_on_diff_machine(server.device, worker): - # if is_sparse: - # this_worker_size = get_sparse_var_bits(server.size_to_transfer) * worker_num_replicas[k] - # else: - # this_worker_size = get_dense_var_bits(server.size_to_transfer, server.dtype) - # this_server_time += this_worker_size / network_bandwidth[server.device][worker] - # slowest_server_time = max(slowest_server_time, this_server_time) - # - # if get_coef: - # return { - # 'transmission': slowest_server_time, - # 'network_overhead': len(worker_list), - # 'gpu_kernel_memory_latency': max_num_local_replica, - # 'constant': 1.0, - # # possible affecting factors. 
- # 'var_name': var_name, - # 'strategy': 'ps', - # 'local_proxy': local_proxy, - # 'is_sparse': is_sparse, - # 'server_list': [partition.to_dict() for partition in server_list], - # 'worker_list': worker_list, - # 'cpu_worker_list': cpu_worker_list, - # 'gpu_worker_list': gpu_worker_list, - # 'worker_num_replicas': worker_num_replicas, - # 'max_num_local_replica': max_num_local_replica, - # } - # else: - # return slowest_server_time + len(worker_list) * network_overhead + \ - # gpu_kernel_memory_latency * max_num_local_replica - # - # if is_sparse: - # send_time = _helper(cpu_worker_list, worker_num_replicas=worker_num_replicas) - # receive_time = _helper(gpu_worker_list) - # else: - # send_time = _helper(cpu_worker_list) - # if local_proxy: - # receive_time = _helper(cpu_worker_list) - # else: - # receive_time = _helper(gpu_worker_list) - # - # if get_coef: - # # return {key: send_time[key]+receive_time[key] for key in send_time.keys()} - # return send_time, receive_time - # else: - # return send_time, receive_time + """ + Simulator that uses a predefined communication model to estimate the runtime of strategies. + + See this paper TODO(Hao): put the paper link. + """ + def __init__(self, + graph_item=None, + resource_spec=None, + batch_size=1, + seq_len=1): + """ + Construct a predefined simulator. + + The reason we need the per-replica batch size and the length of the inputsequence is to estimate + the communication load of variables that are sparsely access (e.g. embeddings). For dense variables, + these two arguments have no influence on estimation. + + Args: + graph_item: a GraphItem object, or a path to a serialized GraphItem object. + resource_spec: a ResourceSpec object, or a path to a resource file. + batch_size: the per-replica batch size used to train this model, if there are sparse variables. + seq_len: the average length of input sequences (if there is any). 
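+
+        A minimal usage sketch (the graph item, resource spec and strategy objects below are
+        illustrative and assumed to be constructed elsewhere):
+
+            simulator = PredefinedSimulator(graph_item=my_graph_item,
+                                            resource_spec=my_resource_spec,
+                                            batch_size=32,
+                                            seq_len=128)
+            cost = simulator.simulate(candidate_strategy, my_graph_item, my_resource_spec)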
+ """ + super(PredefinedSimulator, self).__init__(graph_item, resource_spec) + logging.debug('A PredefinedSimualtor is instantiated: batch_size_per_gpu is {}'.format(batch_size)) + self._batch_size_per_gpu = batch_size + self._seq_len = seq_len + + def simulate(self, + strategy, + graph_item=None, + resource_spec=None, + *args, + **kwargs): + """Return simulated runtime cost given (strategy, graph_item, resource_spec) tuple.""" + inputs = self.create_features(strategy, resource_spec) + with context.eager_mode(): + cost = self.inference(inputs, checkpoint) + return cost + + def inference(self, inputs, checkpoint=None): + if checkpoint is not None: + weights = self.load_checkpoint(checkpoint) + elif self._weights is not None: + weights = self._weights + else: + raise ValueError("No checkpoint provided in either initialization or inference.") + + if not isinstance(inputs, tf.Tensor): + inputs = tf.reshape(tf.convert_to_tensor(inputs), [1, len(inputs)]) + + if len(weights) == 4: + W0, b0, W, b = weights + inputs = tf.nn.elu(tf.matmul(inputs, W0) + b0) + cost = tf.matmul(inputs, W) + b + elif len(weights) == 2: + W, b = weights + cost = tf.matmul(inputs, W) + b + else: + raise ValueError + return cost + + def estimate_sync_time(self, + strategy, + graph_item=None, + resource_spec=None): + if not strategy: + raise ValueError('strategy is None.') + if not graph_item: + if not self._graph_item: + raise ValueError('No graph item provided.') + else: + graph_item = self._graph_item + if not resource_spec: + if not self._resource_spec: + raise ValueError('No resource spec provided.') + else: + resource_spec = self._resource_spec + + # construct the meta objects + name_to_items, resource_item = self.preprocess(strategy, graph_item, resource_spec) + + # Now estimate the per-variable sync time + var_sync_time = OrderedDict() + for var_name, var_item in name_to_items.items(): + if isinstance(var_item.synchronizer, PSSynchronizer): + var_sync_time[var_name] = self.var_ps_time(var_item, resource_item) + elif isinstance(var_item.synchronizer, AllReduceSynchronizer): + var_sync_time = self.var_ar_time(var_item, resource_item) + else: + raise ValueError('{}'.format(type(var_item.synchronizer))) + return var_sync_time + + + + + + + def create_features(self, + strategy, + resource_spec): + # var_sync_time, vars, resource = self.predefined_sync_time(strategy, resource_spec) + + vars, resource = self.preprocess(strategy=strategy, resource_spec=resource_spec) + + feature_keys = ['transmission', 'network_overhead', 'gpu_kernel_memory_latency'] + device_ps_sync_time = {} + group_ar_sync_time = {} + + for var_name, var in vars.items(): + if isinstance(var.synchronizer, PSSynchronizer): + sync_time = self.var_ps_time(var, resource) + device = vars[var_name].device + if device not in device_ps_sync_time: + device_ps_sync_time[device] = {key: 0.0 for key in feature_keys} + for key in feature_keys: + device_ps_sync_time[device][key] += sync_time[0][key] + sync_time[1][key] + elif isinstance(var.synchronizer, AllReduceSynchronizer): + sync_time = self.var_ar_time(var, resource) + var_group = sync_time['group'] + if var_group not in group_ar_sync_time: + group_ar_sync_time[var_group] = {key: 0.0 for key in feature_keys} + for key in feature_keys: + group_ar_sync_time[var_group][key] += sync_time[key] + else: + raise ValueError('{}'.format(type(var.synchronizer))) + + max_device_ps_sync_time = {key: 0.0 for key in feature_keys} + sum_device_ps_sync_time = {key: 0.0 for key in feature_keys} + max_group_ar_sync_time = {key: 
0.0 for key in feature_keys} + sum_group_ar_sync_time = {key: 0.0 for key in feature_keys} + for key in feature_keys: + max_device_ps_sync_time[key] = max([d[key] for d in device_ps_sync_time.values()] or [0.0]) + sum_device_ps_sync_time[key] = sum([d[key] for d in device_ps_sync_time.values()] or [0.0]) + max_group_ar_sync_time[key] = max([d[key] for d in group_ar_sync_time.values()] or [0.0]) + sum_group_ar_sync_time[key] = sum([d[key] for d in group_ar_sync_time.values()] or [0.0]) + + feat = [max_device_ps_sync_time[key] for key in feature_keys] \ + + [sum_device_ps_sync_time[key] for key in feature_keys] \ + + [max_group_ar_sync_time[key] for key in feature_keys] \ + + [sum_group_ar_sync_time[key] for key in feature_keys] + + return feat + + + + + # def predefined_sync_time(self, strategy, resource_spec): + # """ graph_item: transformed graph item """ + # vars, resource = self.preprocess(strategy=strategy, resource_spec=resource_spec) + # # Compute synchronization time for every var + # var_sync_time = {} + # for var_name, var in vars.items(): + # if isinstance(var.synchronizer, PSSynchronizer): + # var_sync_time[var_name] = self.var_ps_time(var, resource) + # elif isinstance(var.synchronizer, AllReduceSynchronizer): + # var_sync_time[var_name] = self.var_ar_time(var, resource) + # else: + # raise ValueError('{}'.format(type(var.synchronizer))) + # return var_sync_time, vars, resource + + + def var_ps_time(self, + var_item, + resource_item, + network_overhead=0.0, + gpu_kernel_memory_latency=0.0, + get_coef=False): + """ + Estimate the synchronization time of a variable with PS synchronizer. + + Args: + var_item: + resource_item: + network_overhead: + gpu_kernel_memory_latency: + get_coef: return the + + Returns: + + """ + + def _helper(worker_list, worker_num_replicas=None): + if worker_num_replicas is None: + worker_num_replicas = [1.0] * len(worker_list) + + this_server_time = 0 + # network transfer: sum up all workers time. equals to the time cost of this server. + # TODO(Hao): didn't consider any parallelization among partitions + for k, worker in enumerate(worker_list): + if _resolved_devices_on_diff_machine(var.device, worker): + if var.is_sparse: + this_worker_size = get_sparse_var_bits(var_size_to_transfer) * worker_num_replicas[k] + else: + this_worker_size = get_dense_var_bits(var_size_to_transfer, var.dtype) + this_server_time += this_worker_size / resource.network_bandwidth[var.device][worker] + + if get_coef: + return { + 'transmission': this_server_time, + 'network_overhead': len(worker_list), + 'gpu_kernel_memory_latency': resource.max_num_local_replica, + 'constant': 1.0, + # possible affecting factors. 
+ 'var_name': var.name, + 'strategy': 'ps', + 'local_proxy': var.synchronizer.local_replication, + 'is_sparse': var.is_sparse, + 'size_to_transfer': var_size_to_transfer, + 'dtype': str(var.dtype), + # 'server_list': [partition.to_dict() for partition in server_list], + 'worker_list': worker_list, + 'cpu_worker_list': resource.cpu_worker_list, + 'gpu_worker_list': resource.gpu_worker_list, + 'worker_num_replicas': worker_num_replicas, + 'max_num_local_replica': resource.max_num_local_replica, + 'is_ps': True, + } + else: + return this_server_time + len(worker_list) * network_overhead + \ + gpu_kernel_memory_latency * resource.max_num_local_replica + + var_size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, + seq_len=self._seq_len) + + if var.is_sparse: + send_time = _helper(resource.cpu_worker_list, worker_num_replicas=resource.worker_num_replicas) + receive_time = _helper(resource.gpu_worker_list) + else: + send_time = _helper(resource.cpu_worker_list) + if var.synchronizer.local_replication: + receive_time = _helper(resource.cpu_worker_list) + else: + receive_time = _helper(resource.gpu_worker_list) + + return send_time, receive_time + + @staticmethod + def _estimate_ps_send_receive_time(var_item, + resource_item, + hosts, + virtual_num_local_replica): + """ + Estimate the send and receive time of a ps and return multiple impacting factors. + + Args: + var_item: + resource_item: + hosts: + virtual_num_local_replica: + + Returns: + Dict: a dictionary of impacting factors. + """ + if worker_num_replicas is None: + worker_num_replicas = [1.0] * len(worker_list) + + this_server_time = 0 + # network transfer: sum up all workers time. equals to the time cost of this server. + # TODO(Hao): didn't consider any parallelization among partitions + for k, worker in enumerate(worker_list): + if _resolved_devices_on_diff_machine(var.device, worker): + if var.is_sparse: + this_worker_size = get_sparse_var_bits(var_size_to_transfer) * worker_num_replicas[k] + else: + this_worker_size = get_dense_var_bits(var_size_to_transfer, var.dtype) + this_server_time += this_worker_size / resource.network_bandwidth[var.device][worker] + + if get_coef: + return { + 'transmission': this_server_time, + 'network_overhead': len(worker_list), + 'gpu_kernel_memory_latency': resource.max_num_local_replica, + 'constant': 1.0, + # possible affecting factors. 
+ 'var_name': var.name, + 'strategy': 'ps', + 'local_proxy': var.synchronizer.local_replication, + 'is_sparse': var.is_sparse, + 'size_to_transfer': var_size_to_transfer, + 'dtype': str(var.dtype), + # 'server_list': [partition.to_dict() for partition in server_list], + 'worker_list': worker_list, + 'cpu_worker_list': resource.cpu_worker_list, + 'gpu_worker_list': resource.gpu_worker_list, + 'worker_num_replicas': worker_num_replicas, + 'max_num_local_replica': resource.max_num_local_replica, + 'is_ps': True, + } + else: + return this_server_time + len(worker_list) * network_overhead + \ + gpu_kernel_memory_latency * resource.max_num_local_replica + + + def var_ar_time(self, var, resource, network_overhead=0.0, gpu_kernel_memory_latency=0.0, get_coef=False): + """Compute synchronization time of a variable in AR strategy.""" + worker_list = resource.cpu_worker_list + num_workers = len(worker_list) + min_bandwidth = None + for i in range(num_workers): + for j in range(i, num_workers): + if min_bandwidth is None: + min_bandwidth = resource.network_bandwidth[worker_list[j]][worker_list[i]] + else: + min_bandwidth = min(min_bandwidth, resource.network_bandwidth[worker_list[j]][worker_list[i]]) + + # Compressor + if var.compressor == "PowerSGDCompressor" or var.compressor == 3: + rank = 10 # currently using default value. So hardcode here. # todo: confirm + # assume var must be a dense variable. + og_shape = var.shape + ndims = len(og_shape) + if ndims <= 1: # no compress + size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, + seq_len=self._seq_len) + else: + if ndims > 2: + n = og_shape[0] + m = 1 + for s in og_shape[1:]: + m *= s # tensor's shape (n, m) + else: + n, m = og_shape[0], og_shape[1] + size_to_transfer = n * rank + m * rank + dtype = tf.float32 + elif var.compressor == "HorovodCompressorEF" or var.compressor == "HorovodCompressor" \ + or var.compressor == 2 or var.compressor == 1: + size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, + seq_len=self._seq_len) + dtype = tf.float32 + elif var.compressor == "NoneCompressor" or var.compressor == 0: + size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, + seq_len=self._seq_len) + dtype = var.dtype + else: + raise ValueError('Compressor does not exist: {}'.format(var.compressor)) + + # todo: chunk_size + # AllReduce communication time + # time = 2 * (num_workers - 1) * get_dense_var_bits(size_to_transfer, dtype) / (min_bandwidth * num_workers) + time = get_dense_var_bits(size_to_transfer, dtype) / min_bandwidth + + if get_coef: + return { + 'transmission': time, + 'network_overhead': 1, # len(worker_list), + 'gpu_kernel_memory_latency': resource.max_num_local_replica, + 'constant': 1.0, + # possible affecting factors. 
+ 'var_name': var.name, + 'group': var.synchronizer.group, + 'strategy': 'allreduce', + 'is_sparse': False, + # 'chunk_size': chunk_size, + 'spec': 'NCCL', # default + 'compressor': var.compressor, + 'worker_list': worker_list, + 'num_workers': num_workers, + 'size_to_transfer': size_to_transfer, + 'dtype': str(dtype), + 'min_bandwidth': min_bandwidth, + 'max_num_local_replica': resource.max_num_local_replica, + 'is_ps': False, + } + else: + return time + network_overhead * len(worker_list) \ + + gpu_kernel_memory_latency * resource.max_num_local_replica + + + +# @staticmethod +# def var_ps_time(var_name, is_sparse, local_proxy, server_list, cpu_worker_list, gpu_worker_list, +# max_num_local_replica, worker_num_replicas, network_bandwidth, get_coef, +# network_overhead=0.0, gpu_kernel_memory_latency=0.0): +# """Compute synchrinzation time of a variable in PS strategy.""" +# +# def _helper(worker_list, worker_num_replicas=None): +# if worker_num_replicas is None: +# worker_num_replicas = [1.0] * len(worker_list) +# # Compute the slowest server +# slowest_server_time = 0 +# for j, server in enumerate(server_list): +# if server.size_to_transfer == 0: +# continue +# # network transfer: sum up all workers time. equals to the time cost of this server. +# this_server_time = 0 +# for k, worker in enumerate(worker_list): +# if _resolved_devices_on_diff_machine(server.device, worker): +# if is_sparse: +# this_worker_size = get_sparse_var_bits(server.size_to_transfer) * worker_num_replicas[k] +# else: +# this_worker_size = get_dense_var_bits(server.size_to_transfer, server.dtype) +# this_server_time += this_worker_size / network_bandwidth[server.device][worker] +# slowest_server_time = max(slowest_server_time, this_server_time) +# +# if get_coef: +# return { +# 'transmission': slowest_server_time, +# 'network_overhead': len(worker_list), +# 'gpu_kernel_memory_latency': max_num_local_replica, +# 'constant': 1.0, +# # possible affecting factors. +# 'var_name': var_name, +# 'strategy': 'ps', +# 'local_proxy': local_proxy, +# 'is_sparse': is_sparse, +# 'server_list': [partition.to_dict() for partition in server_list], +# 'worker_list': worker_list, +# 'cpu_worker_list': cpu_worker_list, +# 'gpu_worker_list': gpu_worker_list, +# 'worker_num_replicas': worker_num_replicas, +# 'max_num_local_replica': max_num_local_replica, +# } +# else: +# return slowest_server_time + len(worker_list) * network_overhead + \ +# gpu_kernel_memory_latency * max_num_local_replica +# +# if is_sparse: +# send_time = _helper(cpu_worker_list, worker_num_replicas=worker_num_replicas) +# receive_time = _helper(gpu_worker_list) +# else: +# send_time = _helper(cpu_worker_list) +# if local_proxy: +# receive_time = _helper(cpu_worker_list) +# else: +# receive_time = _helper(gpu_worker_list) +# +# if get_coef: +# # return {key: send_time[key]+receive_time[key] for key in send_time.keys()} +# return send_time, receive_time +# else: +# return send_time, receive_time diff --git a/autodist/strategy/auto/item.py b/autodist/strategy/auto/item.py index d2377e6..6d2d871 100644 --- a/autodist/strategy/auto/item.py +++ b/autodist/strategy/auto/item.py @@ -249,6 +249,7 @@ def device(self, resolver): device_str = resolver.resolve_to_device_str(device_str) return device_str + class PartItem(VariableItem): """Helper class to include meta information about a variable partition.""" def __init__(self, @@ -364,7 +365,7 @@ class ResourceItem: Helper class that includes meta information about a resource spec. All addresses are resolved (in TF format). 
- TODO(zhisbug): merge ResourceItem class with ResourceSpec. + TODO(Hao): merge ResourceItem class with ResourceSpec. """ def __init__(self, resource_spec): @@ -405,7 +406,7 @@ def cpu_replicas(self): def total_num_gpu_replica(self): return len(self.gpu_replicas) - def num_local_gpu_replica(self, host): + def num_local_gpu_replica_on(self, host): """ Return the number of gpu replica on a TF host address, e.g. '/job:worker/task:0/device:CPU:0'. @@ -427,7 +428,7 @@ def num_local_gpu_replica(self, host): @property def max_num_local_gpu_replica(self): """Return the max number of local gpu replicas on the cluster.""" - return max([self.num_local_gpu_replica(host) for host in self.cpu_replicas]) + return max([self.num_local_gpu_replica_on(host) for host in self.cpu_replicas]) @cached_property def p2p_bandwidth(self): From 668127a2c59570bb8f9931e913a857595a177251 Mon Sep 17 00:00:00 2001 From: Hao Zhang Date: Wed, 29 Jul 2020 02:50:46 -0400 Subject: [PATCH 08/11] improve the estimator for ps syncers, some minor changes --- autodist/simulator/predefined_simulator.py | 216 ++++++++++----------- autodist/simulator/utils.py | 29 ++- autodist/strategy/auto/item.py | 87 ++++++++- 3 files changed, 208 insertions(+), 124 deletions(-) diff --git a/autodist/simulator/predefined_simulator.py b/autodist/simulator/predefined_simulator.py index 6b141c8..f6f37c6 100644 --- a/autodist/simulator/predefined_simulator.py +++ b/autodist/simulator/predefined_simulator.py @@ -23,7 +23,7 @@ from autodist.proto.synchronizers_pb2 import PSSynchronizer, AllReduceSynchronizer from autodist.resource_spec import ResourceSpec from autodist.simulator.base import SimulatorBase -from autodist.simulator.utils import _resolved_devices_on_diff_machine, \ +from autodist.simulator.utils import on_same_host, \ get_dense_var_bits, get_sparse_var_bits from autodist.strategy.base import Strategy from autodist.utils import logging @@ -196,133 +196,89 @@ def var_ps_time(self, var_item, resource_item, network_overhead=0.0, - gpu_kernel_memory_latency=0.0, - get_coef=False): + gpu_kernel_memory_latency=0.0): """ - Estimate the synchronization time of a variable with PS synchronizer. + Estimate the synchronization time of a variable using PS synchronizer. Args: var_item: resource_item: network_overhead: gpu_kernel_memory_latency: - get_coef: return the Returns: - + tuple(dict) """ - - def _helper(worker_list, worker_num_replicas=None): - if worker_num_replicas is None: - worker_num_replicas = [1.0] * len(worker_list) - - this_server_time = 0 - # network transfer: sum up all workers time. equals to the time cost of this server. - # TODO(Hao): didn't consider any parallelization among partitions - for k, worker in enumerate(worker_list): - if _resolved_devices_on_diff_machine(var.device, worker): - if var.is_sparse: - this_worker_size = get_sparse_var_bits(var_size_to_transfer) * worker_num_replicas[k] - else: - this_worker_size = get_dense_var_bits(var_size_to_transfer, var.dtype) - this_server_time += this_worker_size / resource.network_bandwidth[var.device][worker] - - if get_coef: - return { - 'transmission': this_server_time, - 'network_overhead': len(worker_list), - 'gpu_kernel_memory_latency': resource.max_num_local_replica, - 'constant': 1.0, - # possible affecting factors. 
- 'var_name': var.name, - 'strategy': 'ps', - 'local_proxy': var.synchronizer.local_replication, - 'is_sparse': var.is_sparse, - 'size_to_transfer': var_size_to_transfer, - 'dtype': str(var.dtype), - # 'server_list': [partition.to_dict() for partition in server_list], - 'worker_list': worker_list, - 'cpu_worker_list': resource.cpu_worker_list, - 'gpu_worker_list': resource.gpu_worker_list, - 'worker_num_replicas': worker_num_replicas, - 'max_num_local_replica': resource.max_num_local_replica, - 'is_ps': True, - } - else: - return this_server_time + len(worker_list) * network_overhead + \ - gpu_kernel_memory_latency * resource.max_num_local_replica - - var_size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, - seq_len=self._seq_len) - - if var.is_sparse: - send_time = _helper(resource.cpu_worker_list, worker_num_replicas=resource.worker_num_replicas) - receive_time = _helper(resource.gpu_worker_list) + bits_to_transfer = var_item.bits_to_transfer(self._batch_size_per_gpu, self._seq_len) + + num_local_replica_on_each_worker = [resource_item.num_local_gpu_replica_on(host) + for host in resource_item.cpu_replicas] + if var_item.is_sparse: + send_time = self._estimate_ps_time(var_item, + resource_item, + resource_item.cpu_replicas, + num_local_replica_on_each_worker) + recv_time = self._estimate_ps_time(var_item, + resource_item, + resource_item.gpu_replicas, + [1.0] * len(resource_item.gpu_replicas)) else: - send_time = _helper(resource.cpu_worker_list) - if var.synchronizer.local_replication: - receive_time = _helper(resource.cpu_worker_list) + send_time = self._estimate_ps_time(var_item, + resource_item, + resource_item.cpu_replicas, + [1.0] * len(resource_item.cpu_replicas)) + if var_item.local_replication: + recv_time = self._estimate_ps_time(var_item, + resource_item, + resource_item.cpu_replicas, + [1.0] * len(resource_item.cpu_replicas)) else: - receive_time = _helper(resource.gpu_worker_list) - - return send_time, receive_time - - @staticmethod - def _estimate_ps_send_receive_time(var_item, - resource_item, - hosts, - virtual_num_local_replica): + recv_time = self._estimate_ps_time(var_item, + resource_item, + resource_item.gpu_replicas, + [1.0] * len(resource_item.gpu_replicas)) + return send_time, recv_time + + def _estimate_ps_time(self, + var_item, + resource_item, + virtual_worker_list, + virtual_num_local_replica): """ - Estimate the send and receive time of a ps and return multiple impacting factors. + Estimate the send or receive time of a ps and return multiple impacting factors. Args: - var_item: + var_item: the variable whose communication time will be estimated. resource_item: - hosts: - virtual_num_local_replica: + virtual_worker_list: A list of virtual workers (could be actual gpu workers, or virtual cpu worker). + virtual_num_local_replica: A list of integers indicating the number of local replica on each virtual worker. Returns: Dict: a dictionary of impacting factors. """ - if worker_num_replicas is None: - worker_num_replicas = [1.0] * len(worker_list) - - this_server_time = 0 - # network transfer: sum up all workers time. equals to the time cost of this server. 
- # TODO(Hao): didn't consider any parallelization among partitions - for k, worker in enumerate(worker_list): - if _resolved_devices_on_diff_machine(var.device, worker): - if var.is_sparse: - this_worker_size = get_sparse_var_bits(var_size_to_transfer) * worker_num_replicas[k] - else: - this_worker_size = get_dense_var_bits(var_size_to_transfer, var.dtype) - this_server_time += this_worker_size / resource.network_bandwidth[var.device][worker] - - if get_coef: - return { - 'transmission': this_server_time, - 'network_overhead': len(worker_list), - 'gpu_kernel_memory_latency': resource.max_num_local_replica, - 'constant': 1.0, - # possible affecting factors. - 'var_name': var.name, - 'strategy': 'ps', - 'local_proxy': var.synchronizer.local_replication, - 'is_sparse': var.is_sparse, - 'size_to_transfer': var_size_to_transfer, - 'dtype': str(var.dtype), - # 'server_list': [partition.to_dict() for partition in server_list], - 'worker_list': worker_list, - 'cpu_worker_list': resource.cpu_worker_list, - 'gpu_worker_list': resource.gpu_worker_list, - 'worker_num_replicas': worker_num_replicas, - 'max_num_local_replica': resource.max_num_local_replica, - 'is_ps': True, - } - else: - return this_server_time + len(worker_list) * network_overhead + \ - gpu_kernel_memory_latency * resource.max_num_local_replica - + transmission_time = 0.0 + + # To estimate network transmission time for the given variable var_item on PS, we simply sum up the time of + # transmitting (or say, synchronizing) this variable across all workers. + # The time is separately estimated as send_time and recv_time by calling this function twice with different + # values of arguments. + # TODO(Hao): didn't consider any parallelization between variables or partitions. + for k, worker in enumerate(virtual_worker_list): + if not on_same_host(var_item.device, worker): + bits_on_this_worker = var_item.bits_to_transfer(self._batch_size_per_gpu, self._seq_len) * \ + virtual_num_local_replica[k] + bandwidth = min(resource_item.p2p_bandwidth[var_item.device][worker], + resource_item.p2p_bandwidth[worker][var_item.device]) + transmission_time += bits_on_this_worker / bandwidth + + + factors = { + 'transmission': transmission_time, + 'network_overhead': len(virtual_worker_list), + 'gpu_kernel_memory_latency': resource_item.max_num_local_gpu_replica, + 'constant': 1.0 + } + return factors def var_ar_time(self, var, resource, network_overhead=0.0, gpu_kernel_memory_latency=0.0, get_coef=False): """Compute synchronization time of a variable in AR strategy.""" @@ -400,6 +356,7 @@ def var_ar_time(self, var, resource, network_overhead=0.0, gpu_kernel_memory_lat + # @staticmethod # def var_ps_time(var_name, is_sparse, local_proxy, server_list, cpu_worker_list, gpu_worker_list, # max_num_local_replica, worker_num_replicas, network_bandwidth, get_coef, @@ -462,3 +419,46 @@ def var_ar_time(self, var, resource, network_overhead=0.0, gpu_kernel_memory_lat # return send_time, receive_time # else: # return send_time, receive_time + + + + + # def _helper(worker_list, worker_num_replicas=None): + # if worker_num_replicas is None: + # worker_num_replicas = [1.0] * len(worker_list) + # + # this_server_time = 0 + # # network transfer: sum up all workers time. equals to the time cost of this server. 
+ # # TODO(Hao): didn't consider any parallelization among partitions + # for k, worker in enumerate(worker_list): + # if _resolved_devices_on_diff_machine(var.device, worker): + # if var.is_sparse: + # this_worker_size = get_sparse_var_bits(var_size_to_transfer) * worker_num_replicas[k] + # else: + # this_worker_size = get_dense_var_bits(var_size_to_transfer, var.dtype) + # this_server_time += this_worker_size / resource.network_bandwidth[var.device][worker] + # + # if get_coef: + # return { + # 'transmission': this_server_time, + # 'network_overhead': len(worker_list), + # 'gpu_kernel_memory_latency': resource.max_num_local_replica, + # 'constant': 1.0, + # # possible affecting factors. + # 'var_name': var.name, + # 'strategy': 'ps', + # 'local_proxy': var.synchronizer.local_replication, + # 'is_sparse': var.is_sparse, + # 'size_to_transfer': var_size_to_transfer, + # 'dtype': str(var.dtype), + # # 'server_list': [partition.to_dict() for partition in server_list], + # 'worker_list': worker_list, + # 'cpu_worker_list': resource.cpu_worker_list, + # 'gpu_worker_list': resource.gpu_worker_list, + # 'worker_num_replicas': worker_num_replicas, + # 'max_num_local_replica': resource.max_num_local_replica, + # 'is_ps': True, + # } + # else: + # return this_server_time + len(worker_list) * network_overhead + \ + # gpu_kernel_memory_latency * resource.max_num_local_replica \ No newline at end of file diff --git a/autodist/simulator/utils.py b/autodist/simulator/utils.py index b200007..d0c6436 100644 --- a/autodist/simulator/utils.py +++ b/autodist/simulator/utils.py @@ -258,11 +258,11 @@ def read_trial_runs(): tf.string: 1, # todo: confirm 'tf.string': 1, # todo: confirm "": 1, # todo: confirm + tf.quint8: 8, + 'tf.quint8': 8, tf.qint8: 8, 'tf.qint8': 8, "": 8, - tf.quint8: 8, - 'tf.quint8': 8, "": 8, tf.qint16: 16, 'tf.qint16': 16, @@ -302,11 +302,26 @@ def get_sparse_var_bits(size): + 2 * get_dtype_bits(tf.int64) -def _resolved_devices_on_diff_machine(device1, device2): - # e.g., '/job:worker/task:1/device:CPU:0', '/job:worker/task:1/GPU:0' - node1 = ':'.join(device1.split('/')[:-1]) - node2 = ':'.join(device2.split('/')[:-1]) - return node1 != node2 +def on_same_host(device_str1, device_str2): + """ + Return True if d1 and d2 are on the same host. + + Args: + device_str1 (string): the first device as a TF device string, e.g. /job:worker/task:0/device:CPU:0. + device_str2 (string): the first device as a TF device string, e.g. /job:worker/task:0/device:GPU:0. + + Returns: + Bool: True if they are on the same host, otherwise False. 
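+
+    For example (device strings are illustrative):
+        on_same_host('/job:worker/task:0/device:CPU:0', '/job:worker/task:0/device:GPU:0')  # -> True
+        on_same_host('/job:worker/task:0/device:CPU:0', '/job:worker/task:1/device:CPU:0')  # -> False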
+ """ + host1 = '/'.join(device_str1.split('/')[:-1]) + host2 = '/'.join(device_str2.split('/')[:-1]) + return host1 == host2 + +# def _resolved_devices_on_diff_machine(device1, device2): +# # e.g., '/job:worker/task:1/device:CPU:0', '/job:worker/task:1/GPU:0' +# node1 = ':'.join(device1.split('/')[:-1]) +# node2 = ':'.join(device2.split('/')[:-1]) +# return node1 != node2 # def _resolve_device_address(device: str, device_resolver: DeviceResolver): diff --git a/autodist/strategy/auto/item.py b/autodist/strategy/auto/item.py index 6d2d871..263ebba 100644 --- a/autodist/strategy/auto/item.py +++ b/autodist/strategy/auto/item.py @@ -16,6 +16,7 @@ from enum import Enum +import tensorflow as tf from tensorflow.python.framework import ops, device_spec from autodist.kernel.common.utils import get_op_name, get_consumers @@ -24,7 +25,7 @@ from autodist.strategy.base import byte_size_load_fn from autodist.utils import logging from autodist.cluster import SSHCluster -from autodist.simulator.utils import GPU_TO_CPU_BANDWIDTH, GIGABITS +from autodist.simulator.utils import GPU_TO_CPU_BANDWIDTH, GIGABITS, get_dtype_bits class VarType(Enum): @@ -136,8 +137,18 @@ def original_size(self): size *= s return size - @property def size_to_transfer(self, batch_size_per_gpu=1, seq_len=1): + """ + Return the number of elements (e.g. float, integer) to transfer for this variable per iteration. + + To estimate the size to transfer for sparse variables, batch_size_per_gpu and seq_len are required. + Args: + batch_size_per_gpu: batch size used on each GPU replica. + seq_len: the length of the sequence of each input example. + + Returns: + integer + """ if not self.is_sparse: return self.size else: @@ -153,7 +164,30 @@ def size_to_transfer(self, batch_size_per_gpu=1, seq_len=1): sparse_data_size = batch_size_per_gpu * seq_len * emb_size # estimate the embedding of this partition simply using a proportional formula - return sparse_data_size * self.size / self.original_size + return sparse_data_size * float(self.size) / float(self.original_size) + + @property + def bits_to_transfer(self, batch_size_per_gpu=1, seq_len=1): + """ + Estimate the bits to transfer across the network per iteration. + + For sparse variables, this is an over-estimation as we think all columns corresponded to this batch + is unique. + Args: + batch_size_per_gpu: + seq_len: + + Returns: + integer + """ + s = self.size_to_transfer(batch_size_per_gpu, seq_len) + if self.is_sparse: # IndexSlices: values, indices, dense_shape + bits = s * get_dtype_bits(self.dtype) + \ + batch_size_per_gpu * seq_len * self.size / self.original_size * get_dtype_bits(tf.int64) + \ + 2 * get_dtype_bits(tf.int64) + return bits + else: # Tensor + return s * get_dtype_bits(self.dtype) @property def partitionable_axes(self): @@ -234,7 +268,7 @@ def reduction_destination(self): Return the reduction_destination in the node config of this variable. Returns: - Reduction destinaiton. + str. """ if not self._node_config: raise ValueError('Node config is unset.') @@ -248,6 +282,21 @@ def device(self, resolver): if device_str: device_str = resolver.resolve_to_device_str(device_str) return device_str + + @property + def local_replication(self): + """ + Return the local_replication in the node config of this variable. 
+ + Returns: + bool + """ + if not self._node_config: + raise ValueError('Node config is unset.') + if self._node_config.partitioner: + logging.warning('This variable will be partitioned') + return None + return getattr(self.synchronizer, 'local_replication', False) class PartItem(VariableItem): @@ -359,6 +408,21 @@ def reduction_destination(self): return None return getattr(self.synchronizer, 'reduction_destination', None) + @property + def local_replication(self): + """ + Return the local_replication in the node config of this variable partition. + + Returns: + bool + """ + if not self._node_config: + raise ValueError('Node config is unset.') + if not self._node_config.partitioner: + logging.warning('Partitioner field is empty for a variable partition.') + return None + return getattr(self.synchronizer, 'local_replication', False) + class ResourceItem: """ResourceItem. @@ -373,6 +437,11 @@ def __init__(self, resource_spec): self._cluster = SSHCluster(resource_spec) self._device_resolver = DeviceResolver(self._cluster) + @property + def device_resolver(self): + """Resolver of this resource_spec that resolves an AutoDist device to TF device.""" + return self._device_resolver + @property def replicas(self): """Return the list of replicas in the format of TF device string, e.g. job:worker/task:0/device:gpu:0.""" @@ -389,7 +458,7 @@ def gpu_replicas(self): """ # device_str is autodist device string, e.g. 192.168.0.1:CPU:0 device_strs = [k for k, _ in self._resource_spec.gpu_devices] - return self._device_resolver.resolve_to_device_str(device_strs) + return self.device_resolver.resolve_to_device_str(device_strs) @property def cpu_replicas(self): @@ -400,7 +469,7 @@ def cpu_replicas(self): List(string) """ device_strs = [k for k, _ in self._resource_spec.cpu_devices] - return self._device_resolver.resolve_to_device_str(device_strs) + return self.device_resolver.resolve_to_device_str(device_strs) @property def total_num_gpu_replica(self): @@ -434,7 +503,7 @@ def max_num_local_gpu_replica(self): def p2p_bandwidth(self): """Calculates P2P network bandwidth between nodes in the cluster. - Note that this is NOT a sysmetric + Note that this is NOT a symmetric matrix. 
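+        For a cross-host pair, bw[d_i][d_j] is derived from the network bandwidth of d_i's host and
+        bw[d_j][d_i] from d_j's host, so the two entries may differ; devices on the same host are
+        assigned the constant GPU_TO_CPU_BANDWIDTH in both directions.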
""" bw = {} # key: (device1, device2) devices = [device for device, _ in self._resource_spec.devices] @@ -451,8 +520,8 @@ def p2p_bandwidth(self): if d_j not in bw: bw[d_j] = {} if ip_i != ip_j: - bw[d_i][d_j] = GIGABITS * self._resource_spec[ip_i].bandwidth[ip_i] - bw[d_j][d_i] = GIGABITS * self._resource_spec[ip_j].bandwidth[ip_j] + bw[d_i][d_j] = GIGABITS * self._resource_spec.network_bandwidth[ip_i] + bw[d_j][d_i] = GIGABITS * self._resource_spec.network_bandwidth[ip_j] else: bw[d_i][d_j] = GIGABITS * GPU_TO_CPU_BANDWIDTH bw[d_j][d_i] = GIGABITS * GPU_TO_CPU_BANDWIDTH From 0f9755d0f06879020f0ee74cc5945bd680d5f40c Mon Sep 17 00:00:00 2001 From: Hao Zhang Date: Tue, 4 Aug 2020 00:22:29 -0400 Subject: [PATCH 09/11] predefined simulator refactoring done --- autodist/simulator/predefined_simulator.py | 495 ++++++++------------- autodist/strategy/auto/item.py | 31 +- 2 files changed, 207 insertions(+), 319 deletions(-) diff --git a/autodist/simulator/predefined_simulator.py b/autodist/simulator/predefined_simulator.py index f6f37c6..0e3f60b 100644 --- a/autodist/simulator/predefined_simulator.py +++ b/autodist/simulator/predefined_simulator.py @@ -14,18 +14,14 @@ """Predefined simulator with linear model.""" -import pickle as pkl from collections import OrderedDict import tensorflow as tf -from tensorflow.python.eager import context from autodist.proto.synchronizers_pb2 import PSSynchronizer, AllReduceSynchronizer -from autodist.resource_spec import ResourceSpec from autodist.simulator.base import SimulatorBase from autodist.simulator.utils import on_same_host, \ - get_dense_var_bits, get_sparse_var_bits -from autodist.strategy.base import Strategy + get_dtype_bits from autodist.utils import logging @@ -35,28 +31,39 @@ class PredefinedSimulator(SimulatorBase): See this paper TODO(Hao): put the paper link. """ + def __init__(self, graph_item=None, resource_spec=None, batch_size=1, - seq_len=1): + seq_len=1, + mode='sum'): """ Construct a predefined simulator. - The reason we need the per-replica batch size and the length of the inputsequence is to estimate - the communication load of variables that are sparsely access (e.g. embeddings). For dense variables, - these two arguments have no influence on estimation. + We need the per-replica batch size and the length of the input sequence to estimate the communication load of + variables that are sparsely accessed (e.g. embeddings). For dense variables, these two arguments have no + influence on estimation. + Note that graph_item and resource_spec are not required to instantiate a simulator object as we allow + transferring a trained simulator on a graph_item (or resource_spec) to a different graph_item (or different + resource_spec). This can be done by passing graph_item or resource_spec Args: graph_item: a GraphItem object, or a path to a serialized GraphItem object. resource_spec: a ResourceSpec object, or a path to a resource file. batch_size: the per-replica batch size used to train this model, if there are sparse variables. seq_len: the average length of input sequences (if there is any). + mode: use the `sum` or `max` of all variable sync time as the cost. """ super(PredefinedSimulator, self).__init__(graph_item, resource_spec) logging.debug('A PredefinedSimualtor is instantiated: batch_size_per_gpu is {}'.format(batch_size)) self._batch_size_per_gpu = batch_size self._seq_len = seq_len + self._mode = mode + + # Constants for predefined modeling. 
+ self._network_overhead = 0.0 + self._gpu_kernel_memory_latency = 0.0 def simulate(self, strategy, @@ -64,38 +71,83 @@ def simulate(self, resource_spec=None, *args, **kwargs): - """Return simulated runtime cost given (strategy, graph_item, resource_spec) tuple.""" - inputs = self.create_features(strategy, resource_spec) - with context.eager_mode(): - cost = self.inference(inputs, checkpoint) - return cost - - def inference(self, inputs, checkpoint=None): - if checkpoint is not None: - weights = self.load_checkpoint(checkpoint) - elif self._weights is not None: - weights = self._weights - else: - raise ValueError("No checkpoint provided in either initialization or inference.") - - if not isinstance(inputs, tf.Tensor): - inputs = tf.reshape(tf.convert_to_tensor(inputs), [1, len(inputs)]) - - if len(weights) == 4: - W0, b0, W, b = weights - inputs = tf.nn.elu(tf.matmul(inputs, W0) + b0) - cost = tf.matmul(inputs, W) + b - elif len(weights) == 2: - W, b = weights - cost = tf.matmul(inputs, W) + b + """ + Return simulated runtime cost given (strategy, graph_item, resource_spec) tuple. + + Args: + strategy: the strategy to simulate + graph_item: the graph_item this strategy is generated on. + resource_spec: the resource_spec this strategy is on. + + Returns: + float: the estimated runtime (lower is better). + """ + var_name_to_items, resource_item, var_name_to_sync_time = \ + self.extract_prefeature(strategy, graph_item, resource_spec) + + # Now use the estimated per-variable sync time to calculate the overall sync time. + ps_server_sync_time = {} + cc_group_sync_time = {} + + for var_name, var_item in var_name_to_items.items(): + sync_time = var_name_to_sync_time[var_name] + + # we use a simple formula: + # time = transmission + network_overhead * participating_workers + gpu_memory_latency * max(#gpus) + if isinstance(var_item.synchronizer, PSSynchronizer): + server = var_item.device + if server not in ps_server_sync_time: + ps_server_sync_time[server] = 0.0 + send_time = sync_time[0]['transmission'] + \ + sync_time[0]['network_overhead'] * self._network_overhead + \ + sync_time[0]['gpu_kernel_memory_latency'] * self._gpu_kernel_memory_latency + recv_time = sync_time[1]['transmission'] + \ + sync_time[1]['network_overhead'] * self._network_overhead + \ + sync_time[1]['gpu_kernel_memory_latency'] * self._gpu_kernel_memory_latency + # Then accumulate the time for each variable on this PS. Note this is not necessarily accurate as + # there might exist parallel communication of variables even on one server. + ps_server_sync_time[server] += send_time + ps_server_sync_time[server] += recv_time + elif isinstance(var_item.synchronizer, AllReduceSynchronizer): + group = var_item.group + if group not in cc_group_sync_time: + # Each group of variables are fused as one message to pass, so we accumulate the + # overhead and latency for only ONCE. + cc_group_sync_time[group] += sync_time['network_overhead'] * self._network_overhead + \ + sync_time['gpu_kernel_memory_latency'] * self._gpu_kernel_memory_latency + cc_group_sync_time[group] += sync_time['transmission'] + else: + raise ValueError('Unrecognized type of synchronizer: {}'.format(type(var_item.synchronizer))) + + sync_time = [v for v in ps_server_sync_time.values()] + [v for v in cc_group_sync_time.values()] + if self._mode == 'max': + # In `max` mode, we assume all PS and collective groups communicate in parallel, and the PS/group that + # takes the longest time to sync would bound the overall per-iter time. 
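+            # e.g. ps_server_sync_time = {'ps0': 3.0, 'ps1': 5.0} and cc_group_sync_time = {0: 2.0}
+            # give sync_time == [3.0, 5.0, 2.0]: `max` mode yields 5.0, while `sum` mode yields 10.0.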
+ per_iter_time = max(sync_time) + elif self._mode == 'sum': + # In `sum` mode, we assume all PS and collective groups synchronize sequentially, and the overall per-iter + # time is the summation of the sync time of all serviers and collective groups. + # !!Note: both modes have over-simplified assumptions than a real system. + per_iter_time = sum(sync_time) else: - raise ValueError - return cost + raise ValueError('Unrecognized simulation mode: {}'.format(self._mode)) + return per_iter_time - def estimate_sync_time(self, + def extract_prefeature(self, strategy, graph_item=None, resource_spec=None): + """ + Extract impacting factors of the communication time for each variable. + + Args: + strategy: the strategy to simulate. + graph_item: the graph_item this strategy is generated for. + resource_spec: the resource_spec this strategy is on. + + Returns: + Dict: A dict of variable name (str) to impacting factors (dict). + """ if not strategy: raise ValueError('strategy is None.') if not graph_item: @@ -108,7 +160,7 @@ def estimate_sync_time(self, raise ValueError('No resource spec provided.') else: resource_spec = self._resource_spec - + # TODO(Hao): need to make sure the (strategy, graph_item, resource_spec) match each other. # construct the meta objects name_to_items, resource_item = self.preprocess(strategy, graph_item, resource_spec) @@ -118,138 +170,83 @@ def estimate_sync_time(self, if isinstance(var_item.synchronizer, PSSynchronizer): var_sync_time[var_name] = self.var_ps_time(var_item, resource_item) elif isinstance(var_item.synchronizer, AllReduceSynchronizer): - var_sync_time = self.var_ar_time(var_item, resource_item) + var_sync_time[var_name] = self.var_ar_time(var_item, resource_item) else: raise ValueError('{}'.format(type(var_item.synchronizer))) return var_sync_time - - - - - - def create_features(self, - strategy, - resource_spec): - # var_sync_time, vars, resource = self.predefined_sync_time(strategy, resource_spec) - - vars, resource = self.preprocess(strategy=strategy, resource_spec=resource_spec) - - feature_keys = ['transmission', 'network_overhead', 'gpu_kernel_memory_latency'] - device_ps_sync_time = {} - group_ar_sync_time = {} - - for var_name, var in vars.items(): - if isinstance(var.synchronizer, PSSynchronizer): - sync_time = self.var_ps_time(var, resource) - device = vars[var_name].device - if device not in device_ps_sync_time: - device_ps_sync_time[device] = {key: 0.0 for key in feature_keys} - for key in feature_keys: - device_ps_sync_time[device][key] += sync_time[0][key] + sync_time[1][key] - elif isinstance(var.synchronizer, AllReduceSynchronizer): - sync_time = self.var_ar_time(var, resource) - var_group = sync_time['group'] - if var_group not in group_ar_sync_time: - group_ar_sync_time[var_group] = {key: 0.0 for key in feature_keys} - for key in feature_keys: - group_ar_sync_time[var_group][key] += sync_time[key] - else: - raise ValueError('{}'.format(type(var.synchronizer))) - - max_device_ps_sync_time = {key: 0.0 for key in feature_keys} - sum_device_ps_sync_time = {key: 0.0 for key in feature_keys} - max_group_ar_sync_time = {key: 0.0 for key in feature_keys} - sum_group_ar_sync_time = {key: 0.0 for key in feature_keys} - for key in feature_keys: - max_device_ps_sync_time[key] = max([d[key] for d in device_ps_sync_time.values()] or [0.0]) - sum_device_ps_sync_time[key] = sum([d[key] for d in device_ps_sync_time.values()] or [0.0]) - max_group_ar_sync_time[key] = max([d[key] for d in group_ar_sync_time.values()] or [0.0]) - sum_group_ar_sync_time[key] = 
sum([d[key] for d in group_ar_sync_time.values()] or [0.0]) - - feat = [max_device_ps_sync_time[key] for key in feature_keys] \ - + [sum_device_ps_sync_time[key] for key in feature_keys] \ - + [max_group_ar_sync_time[key] for key in feature_keys] \ - + [sum_group_ar_sync_time[key] for key in feature_keys] - - return feat - - - - - # def predefined_sync_time(self, strategy, resource_spec): - # """ graph_item: transformed graph item """ - # vars, resource = self.preprocess(strategy=strategy, resource_spec=resource_spec) - # # Compute synchronization time for every var - # var_sync_time = {} - # for var_name, var in vars.items(): - # if isinstance(var.synchronizer, PSSynchronizer): - # var_sync_time[var_name] = self.var_ps_time(var, resource) - # elif isinstance(var.synchronizer, AllReduceSynchronizer): - # var_sync_time[var_name] = self.var_ar_time(var, resource) - # else: - # raise ValueError('{}'.format(type(var.synchronizer))) - # return var_sync_time, vars, resource - - def var_ps_time(self, var_item, - resource_item, - network_overhead=0.0, - gpu_kernel_memory_latency=0.0): + resource_item): """ - Estimate the synchronization time of a variable using PS synchronizer. + Estimate the synchronization time of a variable that uses PS synchronizer. Args: - var_item: - resource_item: - network_overhead: - gpu_kernel_memory_latency: + var_item: the variable meta information. + resource_item: the resource meta information. Returns: - tuple(dict) + tuple(Dict): a dict of potential impacting factors for send and recv time, respectively. """ bits_to_transfer = var_item.bits_to_transfer(self._batch_size_per_gpu, self._seq_len) - + placement = var_item.device + p2p_bandwidth = resource_item.p2p_bandwidth + max_num_local_gpu_replica = resource_item.max_num_local_gpu_replica num_local_replica_on_each_worker = [resource_item.num_local_gpu_replica_on(host) for host in resource_item.cpu_replicas] if var_item.is_sparse: - send_time = self._estimate_ps_time(var_item, - resource_item, + send_time = self._estimate_ps_time(bits_to_transfer, + placement, + p2p_bandwidth, + max_num_local_gpu_replica, resource_item.cpu_replicas, num_local_replica_on_each_worker) - recv_time = self._estimate_ps_time(var_item, - resource_item, + recv_time = self._estimate_ps_time(bits_to_transfer, + placement, + p2p_bandwidth, + max_num_local_gpu_replica, resource_item.gpu_replicas, [1.0] * len(resource_item.gpu_replicas)) else: - send_time = self._estimate_ps_time(var_item, - resource_item, + # In AutoDist, the gradients are always locally accumulated then SENT to parameter server. + send_time = self._estimate_ps_time(bits_to_transfer, + placement, + p2p_bandwidth, + max_num_local_gpu_replica, resource_item.cpu_replicas, [1.0] * len(resource_item.cpu_replicas)) + # The communication overhead of receiving parameters from PS depends on `local_replication`. 
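+            # With local replication, each host keeps a CPU-side proxy of the variable, so updated
+            # parameters are pulled from the PS once per host (modeled over cpu_replicas); otherwise
+            # every GPU replica fetches from the PS directly (modeled over gpu_replicas).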
if var_item.local_replication: - recv_time = self._estimate_ps_time(var_item, - resource_item, + recv_time = self._estimate_ps_time(bits_to_transfer, + placement, + p2p_bandwidth, + max_num_local_gpu_replica, resource_item.cpu_replicas, [1.0] * len(resource_item.cpu_replicas)) else: - recv_time = self._estimate_ps_time(var_item, - resource_item, + recv_time = self._estimate_ps_time(bits_to_transfer, + placement, + p2p_bandwidth, + max_num_local_gpu_replica, resource_item.gpu_replicas, [1.0] * len(resource_item.gpu_replicas)) return send_time, recv_time - def _estimate_ps_time(self, - var_item, - resource_item, + @staticmethod + def _estimate_ps_time(bits_to_transfer, + placement, + p2p_bandwidth, + max_num_local_gpu_replica, virtual_worker_list, virtual_num_local_replica): """ Estimate the send or receive time of a ps and return multiple impacting factors. Args: - var_item: the variable whose communication time will be estimated. - resource_item: + bits_to_transfer: the variable whose communication time will be estimated. + placement: the placement of the variable. + p2p_bandwidth: point-to-point bandwidth between divices of the cluster. + max_num_local_gpu_replica: the maximum number of on a single node across the cluster. virtual_worker_list: A list of virtual workers (could be actual gpu workers, or virtual cpu worker). virtual_num_local_replica: A list of integers indicating the number of local replica on each virtual worker. @@ -264,201 +261,63 @@ def _estimate_ps_time(self, # values of arguments. # TODO(Hao): didn't consider any parallelization between variables or partitions. for k, worker in enumerate(virtual_worker_list): - if not on_same_host(var_item.device, worker): - bits_on_this_worker = var_item.bits_to_transfer(self._batch_size_per_gpu, self._seq_len) * \ - virtual_num_local_replica[k] - bandwidth = min(resource_item.p2p_bandwidth[var_item.device][worker], - resource_item.p2p_bandwidth[worker][var_item.device]) + if not on_same_host(placement, worker): + bits_on_this_worker = bits_to_transfer * virtual_num_local_replica[k] + bandwidth = min(p2p_bandwidth[placement][worker], p2p_bandwidth[worker][placement]) transmission_time += bits_on_this_worker / bandwidth - - factors = { 'transmission': transmission_time, 'network_overhead': len(virtual_worker_list), - 'gpu_kernel_memory_latency': resource_item.max_num_local_gpu_replica, + 'gpu_kernel_memory_latency': max_num_local_gpu_replica, # TODO(Hao): Is this correct? 'constant': 1.0 } return factors - def var_ar_time(self, var, resource, network_overhead=0.0, gpu_kernel_memory_latency=0.0, get_coef=False): - """Compute synchronization time of a variable in AR strategy.""" - worker_list = resource.cpu_worker_list - num_workers = len(worker_list) - min_bandwidth = None - for i in range(num_workers): - for j in range(i, num_workers): - if min_bandwidth is None: - min_bandwidth = resource.network_bandwidth[worker_list[j]][worker_list[i]] - else: - min_bandwidth = min(min_bandwidth, resource.network_bandwidth[worker_list[j]][worker_list[i]]) - - # Compressor - if var.compressor == "PowerSGDCompressor" or var.compressor == 3: - rank = 10 # currently using default value. So hardcode here. # todo: confirm - # assume var must be a dense variable. 
-            og_shape = var.shape
-            ndims = len(og_shape)
-            if ndims <= 1:  # no compress
-                size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu,
-                                                        seq_len=self._seq_len)
-            else:
-                if ndims > 2:
-                    n = og_shape[0]
-                    m = 1
-                    for s in og_shape[1:]:
-                        m *= s  # tensor's shape (n, m)
-                else:
-                    n, m = og_shape[0], og_shape[1]
-                size_to_transfer = n * rank + m * rank
-            dtype = tf.float32
-        elif var.compressor == "HorovodCompressorEF" or var.compressor == "HorovodCompressor" \
-                or var.compressor == 2 or var.compressor == 1:
-            size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu,
-                                                    seq_len=self._seq_len)
-            dtype = tf.float32
-        elif var.compressor == "NoneCompressor" or var.compressor == 0:
-            size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu,
-                                                    seq_len=self._seq_len)
-            dtype = var.dtype
-        else:
-            raise ValueError('Compressor does not exist: {}'.format(var.compressor))
-
-        # todo: chunk_size
-        # AllReduce communication time
-        # time = 2 * (num_workers - 1) * get_dense_var_bits(size_to_transfer, dtype) / (min_bandwidth * num_workers)
-        time = get_dense_var_bits(size_to_transfer, dtype) / min_bandwidth
-
-        if get_coef:
-            return {
-                'transmission': time,
-                'network_overhead': 1,  # len(worker_list),
-                'gpu_kernel_memory_latency': resource.max_num_local_replica,
-                'constant': 1.0,
-                # possible affecting factors.
-                'var_name': var.name,
-                'group': var.synchronizer.group,
-                'strategy': 'allreduce',
-                'is_sparse': False,
-                # 'chunk_size': chunk_size,
-                'spec': 'NCCL',  # default
-                'compressor': var.compressor,
-                'worker_list': worker_list,
-                'num_workers': num_workers,
-                'size_to_transfer': size_to_transfer,
-                'dtype': str(dtype),
-                'min_bandwidth': min_bandwidth,
-                'max_num_local_replica': resource.max_num_local_replica,
-                'is_ps': False,
-            }
-        else:
-            return time + network_overhead * len(worker_list) \
-                   + gpu_kernel_memory_latency * resource.max_num_local_replica
+    def var_ar_time(self,
+                    var_item,
+                    resource_item,
+                    powersgd_rank=10):
+        """
+        Estimate the synchronization time of a variable that uses a collective (AllReduce) synchronizer.
+
+        Due to current limitations, we only consider dense variables for now.
+
+        Args:
+            var_item: the variable meta information.
+            resource_item: the resource meta information.
+            powersgd_rank: the rank used by the PowerSGD compressor (ignored for other compressors).
+
+        Returns:
+            Dict: a dictionary of impacting factors.
+        """
+        # Address cases for different types of compressors
+        if var_item.compressor not in ['PowerSGDCompressor', 'HorovodCompressorEF', 'HorovodCompressor',
+                                       'NoneCompressor', 0, 1, 2, 3]:
+            raise ValueError('Compressor type not recognized: {}'.format(var_item.compressor))
+        size_to_transfer = var_item.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu,
+                                                     seq_len=self._seq_len)
+        dtype = var_item.dtype
 
-# @staticmethod
-# def var_ps_time(var_name, is_sparse, local_proxy, server_list, cpu_worker_list, gpu_worker_list,
-#                 max_num_local_replica, worker_num_replicas, network_bandwidth, get_coef,
-#                 network_overhead=0.0, gpu_kernel_memory_latency=0.0):
-#     """Compute synchrinzation time of a variable in PS strategy."""
-#
-#     def _helper(worker_list, worker_num_replicas=None):
-#         if worker_num_replicas is None:
-#             worker_num_replicas = [1.0] * len(worker_list)
-#         # Compute the slowest server
-#         slowest_server_time = 0
-#         for j, server in enumerate(server_list):
-#             if server.size_to_transfer == 0:
-#                 continue
-#             # network transfer: sum up all workers time. equals to the time cost of this server.
-#             this_server_time = 0
-#             for k, worker in enumerate(worker_list):
-#                 if _resolved_devices_on_diff_machine(server.device, worker):
-#                     if is_sparse:
-#                         this_worker_size = get_sparse_var_bits(server.size_to_transfer) * worker_num_replicas[k]
-#                     else:
-#                         this_worker_size = get_dense_var_bits(server.size_to_transfer, server.dtype)
-#                     this_server_time += this_worker_size / network_bandwidth[server.device][worker]
-#             slowest_server_time = max(slowest_server_time, this_server_time)
-#
-#         if get_coef:
-#             return {
-#                 'transmission': slowest_server_time,
-#                 'network_overhead': len(worker_list),
-#                 'gpu_kernel_memory_latency': max_num_local_replica,
-#                 'constant': 1.0,
-#                 # possible affecting factors.
-#                 'var_name': var_name,
-#                 'strategy': 'ps',
-#                 'local_proxy': local_proxy,
-#                 'is_sparse': is_sparse,
-#                 'server_list': [partition.to_dict() for partition in server_list],
-#                 'worker_list': worker_list,
-#                 'cpu_worker_list': cpu_worker_list,
-#                 'gpu_worker_list': gpu_worker_list,
-#                 'worker_num_replicas': worker_num_replicas,
-#                 'max_num_local_replica': max_num_local_replica,
-#             }
-#         else:
-#             return slowest_server_time + len(worker_list) * network_overhead + \
-#                    gpu_kernel_memory_latency * max_num_local_replica
-#
-#     if is_sparse:
-#         send_time = _helper(cpu_worker_list, worker_num_replicas=worker_num_replicas)
-#         receive_time = _helper(gpu_worker_list)
-#     else:
-#         send_time = _helper(cpu_worker_list)
-#         if local_proxy:
-#             receive_time = _helper(cpu_worker_list)
-#         else:
-#             receive_time = _helper(gpu_worker_list)
-#
-#     if get_coef:
-#         # return {key: send_time[key]+receive_time[key] for key in send_time.keys()}
-#         return send_time, receive_time
-#     else:
-#         return send_time, receive_time
-
-
-
-
-    # def _helper(worker_list, worker_num_replicas=None):
-    #     if worker_num_replicas is None:
-    #         worker_num_replicas = [1.0] * len(worker_list)
-    #
-    #     this_server_time = 0
-    #     # network transfer: sum up all workers time. equals to the time cost of this server.
-    #     # TODO(Hao): didn't consider any parallelization among partitions
-    #     for k, worker in enumerate(worker_list):
-    #         if _resolved_devices_on_diff_machine(var.device, worker):
-    #             if var.is_sparse:
-    #                 this_worker_size = get_sparse_var_bits(var_size_to_transfer) * worker_num_replicas[k]
-    #             else:
-    #                 this_worker_size = get_dense_var_bits(var_size_to_transfer, var.dtype)
-    #             this_server_time += this_worker_size / resource.network_bandwidth[var.device][worker]
-    #
-    #     if get_coef:
-    #         return {
-    #             'transmission': this_server_time,
-    #             'network_overhead': len(worker_list),
-    #             'gpu_kernel_memory_latency': resource.max_num_local_replica,
-    #             'constant': 1.0,
-    #             # possible affecting factors.
-    #             'var_name': var.name,
-    #             'strategy': 'ps',
-    #             'local_proxy': var.synchronizer.local_replication,
-    #             'is_sparse': var.is_sparse,
-    #             'size_to_transfer': var_size_to_transfer,
-    #             'dtype': str(var.dtype),
-    #             # 'server_list': [partition.to_dict() for partition in server_list],
-    #             'worker_list': worker_list,
-    #             'cpu_worker_list': resource.cpu_worker_list,
-    #             'gpu_worker_list': resource.gpu_worker_list,
-    #             'worker_num_replicas': worker_num_replicas,
-    #             'max_num_local_replica': resource.max_num_local_replica,
-    #             'is_ps': True,
-    #         }
-    #     else:
-    #         return this_server_time + len(worker_list) * network_overhead + \
-    #                gpu_kernel_memory_latency * resource.max_num_local_replica
\ No newline at end of file
+        if var_item.compressor in ['PowerSGDCompressor', 3, "HorovodCompressorEF", "HorovodCompressor", 1, 2]:
+            # These compressors always use float32 to communicate.
+            dtype = tf.float32
+        if var_item.compressor in ["PowerSGDCompressor", 3]:
+            # For PowerSGDCompressor, the rank defaults to 10 (the powersgd_rank argument); it always
+            # communicates in float32.
+            if len(var_item.shape) > 1:
+                n = var_item.shape[0]
+                m = 1
+                for d in var_item.shape[1:]:
+                    m *= d
+                size_to_transfer = (m + n) * powersgd_rank
+
+        # We assume ring allreduce, and multiple rings will be constructed and executed serially to synchronize grads.
+        # In one ring, each worker exchanges grads with its next worker in parallel. Hence, the time a single ring
+        # completes is bounded by the slowest pair of workers; the total time spent for all workers to synchronize
+        # grads is bounded by the time all rings finish on the slowest pair of workers.
+        transmission_time = size_to_transfer * get_dtype_bits(dtype) / resource_item.min_bandwidth
+        factors = {
+            'transmission': transmission_time,
+            'network_overhead': 1,  # TODO(Hao): is this correct?
+            'gpu_kernel_memory_latency': resource_item.max_num_local_gpu_replica,
+            'constant': 1.0
+        }
+        return factors
diff --git a/autodist/strategy/auto/item.py b/autodist/strategy/auto/item.py
index 263ebba..b73d22c 100644
--- a/autodist/strategy/auto/item.py
+++ b/autodist/strategy/auto/item.py
@@ -186,7 +186,7 @@ def bits_to_transfer(self, batch_size_per_gpu=1, seq_len=1):
                    batch_size_per_gpu * seq_len * self.size / self.original_size * get_dtype_bits(tf.int64) + \
                    2 * get_dtype_bits(tf.int64)
             return bits
-        else: # Tensor
+        else:  # Tensor
             return s * get_dtype_bits(self.dtype)
 
     @property
@@ -247,6 +247,21 @@ def synchronizer(self):
             return None
         return getattr(self._node_config, self._node_config.WhichOneOf('synchronizer'))
 
+    @property
+    def group(self):
+        """
+        Return the group in the node config of this variable.
+
+        Returns:
+            int: group
+        """
+        if not self._node_config:
+            raise ValueError('Node config is unset.')
+        if self._node_config.partitioner:
+            logging.warning('This variable will be partitioned')
+            return None
+        return getattr(self.synchronizer, 'group', 0)
+
     @property
     def compressor(self):
         """
@@ -379,6 +394,20 @@ def synchronizer(self):
             raise ValueError('Partitioner field is empty for a variable partition.')
         return getattr(self._node_config, self._node_config.WhichOneOf('synchronizer'))
 
+    @property
+    def group(self):
+        """
+        Return the group in the node config of this variable.
+
+        Returns:
+            int: group
+        """
+        if not self._node_config:
+            raise ValueError('Node config is unset.')
+        if not self._node_config.partitioner:
+            raise ValueError('Partitioner field is empty for a variable partition.')
+        return getattr(self.synchronizer, 'group', 0)
+
     @property
     def compressor(self):
         """

From ef66bf7ab87e21303ffa25b908aada2e3efb33b4 Mon Sep 17 00:00:00 2001
From: Hao Zhang
Date: Sun, 9 Aug 2020 18:54:24 -0400
Subject: [PATCH 10/11] linear simulator done.
Move simulation/search code under autosync scope --- autodist/{search => autosync}/__init__.py | 0 .../search}/__init__.py | 0 .../{ => autosync}/search/random_search.py | 0 autodist/autosync/simulator/__init__.py | 0 autodist/{ => autosync}/simulator/base.py | 36 +- .../autosync/simulator/linear_simulator.py | 193 +++++++++ .../simulator/predefined_simulator.py | 36 +- .../simulator/rankrnn_simulator.py | 0 .../{ => autosync}/simulator/train_linear.py | 0 .../simulator/train_predefined_simulator.py | 0 autodist/{ => autosync}/simulator/utils.py | 0 autodist/simulator/linear_simulator.py | 387 ------------------ autodist/strategy/auto/item.py | 2 +- autodist/strategy/auto_strategy.py | 2 +- test.py | 53 ++- 15 files changed, 267 insertions(+), 442 deletions(-) rename autodist/{search => autosync}/__init__.py (100%) rename autodist/{simulator => autosync/search}/__init__.py (100%) rename autodist/{ => autosync}/search/random_search.py (100%) create mode 100644 autodist/autosync/simulator/__init__.py rename autodist/{ => autosync}/simulator/base.py (86%) create mode 100644 autodist/autosync/simulator/linear_simulator.py rename autodist/{ => autosync}/simulator/predefined_simulator.py (98%) rename autodist/{ => autosync}/simulator/rankrnn_simulator.py (100%) rename autodist/{ => autosync}/simulator/train_linear.py (100%) rename autodist/{ => autosync}/simulator/train_predefined_simulator.py (100%) rename autodist/{ => autosync}/simulator/utils.py (100%) delete mode 100644 autodist/simulator/linear_simulator.py diff --git a/autodist/search/__init__.py b/autodist/autosync/__init__.py similarity index 100% rename from autodist/search/__init__.py rename to autodist/autosync/__init__.py diff --git a/autodist/simulator/__init__.py b/autodist/autosync/search/__init__.py similarity index 100% rename from autodist/simulator/__init__.py rename to autodist/autosync/search/__init__.py diff --git a/autodist/search/random_search.py b/autodist/autosync/search/random_search.py similarity index 100% rename from autodist/search/random_search.py rename to autodist/autosync/search/random_search.py diff --git a/autodist/autosync/simulator/__init__.py b/autodist/autosync/simulator/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/autodist/simulator/base.py b/autodist/autosync/simulator/base.py similarity index 86% rename from autodist/simulator/base.py rename to autodist/autosync/simulator/base.py index 19b965d..5ac04b7 100644 --- a/autodist/simulator/base.py +++ b/autodist/autosync/simulator/base.py @@ -70,22 +70,10 @@ def simulate(self, resource_spec=None, *args, **kwargs): - """ - Return simulated runtime cost given (strategy, graph_item, resource_spec) tuple. - - Args: - strategy: - graph_item: - resource_spec: - checkpoint: - - Returns: - float - """ + """Return simulated runtime cost given (strategy, graph_item, resource_spec) tuple.""" raise NotImplementedError() - def inference(self, - features): + def inference(self, *args, **kwargs): """ Abstract method for simulator inference. @@ -98,7 +86,7 @@ def inference(self, """ raise NotImplementedError() - def load_checkpoint(self, checkpoint=None): + def load_checkpoint(self, checkpoint): """ Load a checkpoint file as weights of the simulator. @@ -107,15 +95,15 @@ def load_checkpoint(self, checkpoint=None): """ raise NotImplementedError() - def save_checkpoint(self, model, checkpoint): - """ - Save a trained weight as a checkpoint file. - - Args: - model: trained model. - checkpoint: path where to save the checkpoint. 
-        """
-        raise NotImplementedError()
+    # def save_checkpoint(self, model, checkpoint):
+    #     """
+    #     Save a trained weight as a checkpoint file.
+    #
+    #     Args:
+    #         model: trained model.
+    #         checkpoint: path where to save the checkpoint.
+    #     """
+    #     raise NotImplementedError()
 
     def preprocess(self,
                    strategy,
diff --git a/autodist/autosync/simulator/linear_simulator.py b/autodist/autosync/simulator/linear_simulator.py
new file mode 100644
index 0000000..d0ee310
--- /dev/null
+++ b/autodist/autosync/simulator/linear_simulator.py
@@ -0,0 +1,193 @@
+# Copyright 2020 Petuum Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Predefined simulator with linear model."""
+import os
+import pickle as pkl
+
+import tensorflow as tf
+import numpy as np
+
+from autodist.autosync.simulator.predefined_simulator import PredefinedSimulator
+from autodist.proto.synchronizers_pb2 import PSSynchronizer, AllReduceSynchronizer
+from autodist.utils import logging
+
+
+class LinearSimulator(PredefinedSimulator):
+    """Simulates strategies for a given graph and resource spec."""
+
+    def __init__(self,
+                 graph_item=None,
+                 resource_spec=None,
+                 batch_size=1,
+                 seq_len=1,
+                 checkpoint=None):
+        super(PredefinedSimulator, self).__init__(graph_item, resource_spec)
+        logging.debug('A LinearSimulator is instantiated: batch_size_per_gpu is {}'.format(batch_size))
+
+        self._batch_size_per_gpu = batch_size
+        self._seq_len = seq_len
+
+        # For loading weights of the linear model.
+        self._checkpoint = checkpoint
+        self._weights = None
+        if self._checkpoint:
+            try:
+                self._weights = self.load_checkpoint(checkpoint)
+            except ValueError:
+                logging.warning('Failed to load checkpoint: {}'.format(checkpoint))
+
+        # TODO(Hao): add the default weights here.
+        self._default_weights = None
+
+    def simulate(self,
+                 strategy,
+                 graph_item=None,
+                 resource_spec=None,
+                 checkpoint=None,
+                 *args,
+                 **kwargs):
+        """Return simulated runtime cost given (strategy, graph_item, resource_spec) tuple.
+
+        Args:
+            strategy: the strategy to simulate.
+            graph_item: the graph_item this strategy is generated on.
+            resource_spec: the resource_spec this strategy is on.
+            checkpoint: the checkpoint to perform inference (in place of the default weights).
+
+        Returns:
+            float: the estimated cost (lower is better).
+        """
+        if not strategy:
+            raise ValueError('strategy is None.')
+        if not graph_item:
+            if not self._graph_item:
+                raise ValueError('No graph item provided.')
+            else:
+                graph_item = self._graph_item
+        if not resource_spec:
+            if not self._resource_spec:
+                raise ValueError('No resource spec provided.')
+            else:
+                resource_spec = self._resource_spec
+
+        x = self._extract_feature(strategy, graph_item, resource_spec)
+
+        # The checkpoint lookup priority is:
+        # simulate(checkpoint) > self._weights > self._default_weights
+        if checkpoint:
+            weights = self.load_checkpoint(checkpoint)
+        elif self._weights:
+            weights = self._weights
+        else:
+            weights = self._default_weights
+        if weights is None:
+            raise ValueError('No checkpoint provided in either initialization or inference.')
+
+        cost = self.inference(np.array(x), weights)
+        return cost
+
+    def inference(self, x, weights):
+        """
+        Predict the simulated cost of a strategy from its extracted feature vector.
+
+        Args:
+            x: features extracted from a (strategy, graph_item, resource_spec) tuple.
+            weights: trained linear model weights [W, b].
+
+        Returns:
+            float: ranking score.
+        """
+        assert len(weights) == 2
+        W, b = weights
+        # Linear model: the predicted cost is the inner product of the weights and the features, plus a bias.
+        cost = float(np.dot(np.ravel(x), np.ravel(W)) + np.ravel(b)[0])
+        return cost
+
+    def load_checkpoint(self, checkpoint):
+        """
+        Load a trained weight from a checkpoint.
+
+        Args:
+            checkpoint: the file path to a npz, or a list/array of weights.
+
+        Returns:
+            list: the loaded weights [W, b].
+        """
+        logging.info('Loading checkpoint: {}'.format(checkpoint))
+        if isinstance(checkpoint, list):
+            assert len(checkpoint) in (2, 13)
+            if len(checkpoint) == 13:
+                # 12 feature weights followed by a single bias term.
+                checkpoint = checkpoint[:12], checkpoint[12]
+            return checkpoint
+        elif isinstance(checkpoint, str) and os.path.isfile(checkpoint):
+            weights = np.load(checkpoint)
+            return weights['W'], weights['b']
+        else:
+            raise ValueError('Unable to load the checkpoint: {}'.format(checkpoint))
+
+    def _extract_feature(self,
+                         strategy,
+                         graph_item,
+                         resource_spec):
+        """Get the feature vector as input to the linear model."""
+        var_name_to_items, resource_item, var_name_to_sync_time = \
+            self.extract_prefeature(strategy, graph_item, resource_spec)
+
+        feature_keys = ['transmission', 'network_overhead', 'gpu_kernel_memory_latency']
+        ps_server_sync_time = {}
+        cc_group_sync_time = {}
+
+        for var_name, var_item in var_name_to_items.items():
+            sync_time = var_name_to_sync_time[var_name]
+
+            # Extract per-server and per-group sync time.
+            if isinstance(var_item.synchronizer, PSSynchronizer):
+                server = var_item.device
+                if server not in ps_server_sync_time:
+                    ps_server_sync_time[server] = {key: 0.0 for key in feature_keys}
+                for key in feature_keys:
+                    ps_server_sync_time[server][key] += sync_time[0][key] + sync_time[1][key]
+            elif isinstance(var_item.synchronizer, AllReduceSynchronizer):
+                group = var_item.group
+                if group not in cc_group_sync_time:
+                    cc_group_sync_time[group] = {key: 0.0 for key in feature_keys}
+                for key in feature_keys:
+                    cc_group_sync_time[group][key] += sync_time[key]
+            else:
+                raise ValueError('Unrecognized type of synchronizer: {}'.format(type(var_item.synchronizer)))
+
+        # Different from predefined modeling, we transform these into feature vectors in this simulator.
+        # We care about the sum time of all servers/groups, or the slowest (max) server/group.
+ max_ps_server_sync_time = {key: 0.0 for key in feature_keys} + sum_ps_server_sync_time = {key: 0.0 for key in feature_keys} + max_cc_group_sync_time = {key: 0.0 for key in feature_keys} + sum_cc_group_sync_time = {key: 0.0 for key in feature_keys} + + for key in feature_keys: + max_ps_server_sync_time[key] = \ + max([sync_time[key] for sync_time in ps_server_sync_time.values()] or [0.0]) + sum_ps_server_sync_time[key] = \ + sum([sync_time[key] for sync_time in ps_server_sync_time.values()] or [0.0]) + max_cc_group_sync_time[key] = \ + max([sync_time[key] for sync_time in cc_group_sync_time.values()] or [0.0]) + sum_cc_group_sync_time[key] = \ + sum([sync_time[key] for sync_time in cc_group_sync_time.values()] or [0.0]) + + # concat them to get the feature. + x = [max_ps_server_sync_time[key] for key in feature_keys] + \ + [sum_ps_server_sync_time[key] for key in feature_keys] + \ + [max_cc_group_sync_time[key] for key in feature_keys] + \ + [sum_cc_group_sync_time[key] for key in feature_keys] + return x diff --git a/autodist/simulator/predefined_simulator.py b/autodist/autosync/simulator/predefined_simulator.py similarity index 98% rename from autodist/simulator/predefined_simulator.py rename to autodist/autosync/simulator/predefined_simulator.py index 0e3f60b..b05b50d 100644 --- a/autodist/simulator/predefined_simulator.py +++ b/autodist/autosync/simulator/predefined_simulator.py @@ -19,9 +19,8 @@ import tensorflow as tf from autodist.proto.synchronizers_pb2 import PSSynchronizer, AllReduceSynchronizer -from autodist.simulator.base import SimulatorBase -from autodist.simulator.utils import on_same_host, \ - get_dtype_bits +from autodist.autosync.simulator.base import SimulatorBase +from autodist.autosync.simulator.utils import on_same_host, get_dtype_bits from autodist.utils import logging @@ -82,6 +81,19 @@ def simulate(self, Returns: float: the estimated runtime (lower is better). """ + if not strategy: + raise ValueError('strategy is None.') + if not graph_item: + if not self._graph_item: + raise ValueError('No graph item provided.') + else: + graph_item = self._graph_item + if not resource_spec: + if not self._resource_spec: + raise ValueError('No resource spec provided.') + else: + resource_spec = self._resource_spec + var_name_to_items, resource_item, var_name_to_sync_time = \ self.extract_prefeature(strategy, graph_item, resource_spec) @@ -135,8 +147,8 @@ def simulate(self, def extract_prefeature(self, strategy, - graph_item=None, - resource_spec=None): + graph_item, + resource_spec): """ Extract impacting factors of the communication time for each variable. @@ -148,18 +160,6 @@ def extract_prefeature(self, Returns: Dict: A dict of variable name (str) to impacting factors (dict). """ - if not strategy: - raise ValueError('strategy is None.') - if not graph_item: - if not self._graph_item: - raise ValueError('No graph item provided.') - else: - graph_item = self._graph_item - if not resource_spec: - if not self._resource_spec: - raise ValueError('No resource spec provided.') - else: - resource_spec = self._resource_spec # TODO(Hao): need to make sure the (strategy, graph_item, resource_spec) match each other. 
# construct the meta objects name_to_items, resource_item = self.preprocess(strategy, graph_item, resource_spec) @@ -173,7 +173,7 @@ def extract_prefeature(self, var_sync_time[var_name] = self.var_ar_time(var_item, resource_item) else: raise ValueError('{}'.format(type(var_item.synchronizer))) - return var_sync_time + return name_to_items, resource_item, var_sync_time def var_ps_time(self, var_item, diff --git a/autodist/simulator/rankrnn_simulator.py b/autodist/autosync/simulator/rankrnn_simulator.py similarity index 100% rename from autodist/simulator/rankrnn_simulator.py rename to autodist/autosync/simulator/rankrnn_simulator.py diff --git a/autodist/simulator/train_linear.py b/autodist/autosync/simulator/train_linear.py similarity index 100% rename from autodist/simulator/train_linear.py rename to autodist/autosync/simulator/train_linear.py diff --git a/autodist/simulator/train_predefined_simulator.py b/autodist/autosync/simulator/train_predefined_simulator.py similarity index 100% rename from autodist/simulator/train_predefined_simulator.py rename to autodist/autosync/simulator/train_predefined_simulator.py diff --git a/autodist/simulator/utils.py b/autodist/autosync/simulator/utils.py similarity index 100% rename from autodist/simulator/utils.py rename to autodist/autosync/simulator/utils.py diff --git a/autodist/simulator/linear_simulator.py b/autodist/simulator/linear_simulator.py deleted file mode 100644 index 5dc2e6b..0000000 --- a/autodist/simulator/linear_simulator.py +++ /dev/null @@ -1,387 +0,0 @@ -# Copyright 2020 Petuum Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Predefined simulator with linear model.""" - -import pickle as pkl - -import tensorflow as tf -from tensorflow.python.eager import context - -from autodist.proto.synchronizers_pb2 import PSSynchronizer, AllReduceSynchronizer -from autodist.resource_spec import ResourceSpec -from autodist.simulator.base import SimulatorBase -from autodist.simulator.utils import _resolved_devices_on_diff_machine, \ - get_dense_var_bits, get_sparse_var_bits -from autodist.strategy.base import Strategy - - -class LinearSimulator(SimulatorBase): - """Simulates strategies for a given graph and resource spec.""" - - def __init__(self, - graph_item=None, - resource_spec=None, - batch_size=1, - seq_len=1, - get_coef=True, - checkpoint=None): - - super(PredefinedSimulator, self).__init__(original_graph_item_path=original_graph_item_path) - - print("It's using predefined simulator. 
batch_size_per_gpu is {}".format(batch_size)) - self._fetches = fetches - self._batch_size_per_gpu = batch_size - self._seq_len = seq_len - self._get_coef = get_coef - self._checkpoint = checkpoint - self._weights = None - with context.eager_mode(): - if self._checkpoint: - self._weights = self.load_checkpoint(self._checkpoint) - - def simulate(self, strategy: Strategy, resource_spec: ResourceSpec, checkpoint=None): - """Return simulated runtime value.""" - inputs = self.create_features(strategy, resource_spec) - with context.eager_mode(): - cost = self.inference(inputs, checkpoint) - return cost - - def inference(self, inputs, checkpoint=None): - if checkpoint is not None: - weights = self.load_checkpoint(checkpoint) - elif self._weights is not None: - weights = self._weights - else: - raise ValueError("No checkpoint provided in either initialization or inference.") - - if not isinstance(inputs, tf.Tensor): - inputs = tf.reshape(tf.convert_to_tensor(inputs), [1, len(inputs)]) - - if len(weights) == 4: - W0, b0, W, b = weights - inputs = tf.nn.elu(tf.matmul(inputs, W0) + b0) - cost = tf.matmul(inputs, W) + b - elif len(weights) == 2: - W, b = weights - cost = tf.matmul(inputs, W) + b - else: - raise ValueError - return cost - - def load_checkpoint(self, checkpoint=None): - if checkpoint is None: - if self._checkpoint is not None: - checkpoint = self._checkpoint - else: - raise ValueError("checkpoint is None: {}".format(checkpoint)) - self._weights = pkl.load(open(checkpoint, 'rb')) - # self._weights = json.load(open(checkpoint, 'r')) - print("Load checkpoint: ") - print(self._weights) - return self._weights - - def save_checkpoint(self, model, checkpoint): - pkl.dump(model, open(checkpoint, 'wb')) - self._checkpoint = checkpoint - self._weights = model - - def create_features_v0(self, strategy: Strategy, resource_spec: ResourceSpec): - var_sync_time, vars, resource = self.predefined_sync_time(strategy, resource_spec) - - # Add up sync time per device to find the slowest server time. 
- feature_keys = ['transmission', 'network_overhead', 'gpu_kernel_memory_latency'] - device_ps_sync_time = {} - var_ar_sync_time = {} - for var_name, sync_time in var_sync_time.items(): - if isinstance(vars[var_name].synchronizer, PSSynchronizer): - device = vars[var_name].device - if device not in device_ps_sync_time: - device_ps_sync_time[device] = {key: 0.0 for key in feature_keys} - for key in feature_keys: - device_ps_sync_time[device][key] += sync_time[0][key] + sync_time[1][key] - - else: # AllReduce - if var_name not in var_ar_sync_time: - var_ar_sync_time[var_name] = {key: 0.0 for key in feature_keys} - for key in feature_keys: - var_ar_sync_time[var_name][key] += sync_time[key] - - max_device_ps_sync_time = {key: 0.0 for key in feature_keys} - sum_device_ps_sync_time = {key: 0.0 for key in feature_keys} - sum_var_ar_sync_time = {key: 0.0 for key in feature_keys} - for key in feature_keys: - max_device_ps_sync_time[key] = max([d[key] for d in device_ps_sync_time.values()] or [0.0]) - sum_device_ps_sync_time[key] = sum([d[key] for d in device_ps_sync_time.values()] or [0.0]) - sum_var_ar_sync_time[key] = sum([d[key] for d in var_ar_sync_time.values()] or [0.0]) - - feat = [max_device_ps_sync_time[key] for key in feature_keys] \ - + [sum_device_ps_sync_time[key] for key in feature_keys] \ - + [sum_var_ar_sync_time[key] for key in feature_keys] - - return feat - - def create_features(self, strategy: Strategy, resource_spec: ResourceSpec): - # var_sync_time, vars, resource = self.predefined_sync_time(strategy, resource_spec) - - vars, resource = self.preprocess(strategy=strategy, resource_spec=resource_spec) - - feature_keys = ['transmission', 'network_overhead', 'gpu_kernel_memory_latency'] - device_ps_sync_time = {} - group_ar_sync_time = {} - - for var_name, var in vars.items(): - if isinstance(var.synchronizer, PSSynchronizer): - sync_time = self.var_ps_time(var, resource) - device = vars[var_name].device - if device not in device_ps_sync_time: - device_ps_sync_time[device] = {key: 0.0 for key in feature_keys} - for key in feature_keys: - device_ps_sync_time[device][key] += sync_time[0][key] + sync_time[1][key] - elif isinstance(var.synchronizer, AllReduceSynchronizer): - sync_time = self.var_ar_time(var, resource) - var_group = sync_time['group'] - if var_group not in group_ar_sync_time: - group_ar_sync_time[var_group] = {key: 0.0 for key in feature_keys} - for key in feature_keys: - group_ar_sync_time[var_group][key] += sync_time[key] - else: - raise ValueError('{}'.format(type(var.synchronizer))) - - max_device_ps_sync_time = {key: 0.0 for key in feature_keys} - sum_device_ps_sync_time = {key: 0.0 for key in feature_keys} - max_group_ar_sync_time = {key: 0.0 for key in feature_keys} - sum_group_ar_sync_time = {key: 0.0 for key in feature_keys} - for key in feature_keys: - max_device_ps_sync_time[key] = max([d[key] for d in device_ps_sync_time.values()] or [0.0]) - sum_device_ps_sync_time[key] = sum([d[key] for d in device_ps_sync_time.values()] or [0.0]) - max_group_ar_sync_time[key] = max([d[key] for d in group_ar_sync_time.values()] or [0.0]) - sum_group_ar_sync_time[key] = sum([d[key] for d in group_ar_sync_time.values()] or [0.0]) - - feat = [max_device_ps_sync_time[key] for key in feature_keys] \ - + [sum_device_ps_sync_time[key] for key in feature_keys] \ - + [max_group_ar_sync_time[key] for key in feature_keys] \ - + [sum_group_ar_sync_time[key] for key in feature_keys] - - return feat - - def predefined_sync_time(self, strategy, resource_spec): - """ graph_item: 
transformed graph item """ - vars, resource = self.preprocess(strategy=strategy, resource_spec=resource_spec) - # Compute synchronization time for every var - var_sync_time = {} - for var_name, var in vars.items(): - if isinstance(var.synchronizer, PSSynchronizer): - var_sync_time[var_name] = self.var_ps_time(var, resource) - elif isinstance(var.synchronizer, AllReduceSynchronizer): - var_sync_time[var_name] = self.var_ar_time(var, resource) - else: - raise ValueError('{}'.format(type(var.synchronizer))) - return var_sync_time, vars, resource - - def var_ps_time(self, var, resource, network_overhead=0.0, gpu_kernel_memory_latency=0.0): - """Compute synchronization time of a variable in PS strategy.""" - def _helper(worker_list, worker_num_replicas=None): - if worker_num_replicas is None: - worker_num_replicas = [1.0] * len(worker_list) - - this_server_time = 0 - # network transfer: sum up all workers time. equals to the time cost of this server. - # TODO(Hao): didn't consider any parallelization among partitions - for k, worker in enumerate(worker_list): - if _resolved_devices_on_diff_machine(var.device, worker): - if var.is_sparse: - this_worker_size = get_sparse_var_bits(var_size_to_transfer) * worker_num_replicas[k] - else: - this_worker_size = get_dense_var_bits(var_size_to_transfer, var.dtype) - this_server_time += this_worker_size / resource.network_bandwidth[var.device][worker] - - if self._get_coef: - return { - 'transmission': this_server_time, - 'network_overhead': len(worker_list), - 'gpu_kernel_memory_latency': resource.max_num_local_replica, - 'constant': 1.0, - # possible affecting factors. - 'var_name': var.name, - 'strategy': 'ps', - 'local_proxy': var.synchronizer.local_replication, - 'is_sparse': var.is_sparse, - 'size_to_transfer': var_size_to_transfer, - 'dtype': str(var.dtype), - # 'server_list': [partition.to_dict() for partition in server_list], - 'worker_list': worker_list, - 'cpu_worker_list': resource.cpu_worker_list, - 'gpu_worker_list': resource.gpu_worker_list, - 'worker_num_replicas': worker_num_replicas, - 'max_num_local_replica': resource.max_num_local_replica, - 'is_ps': True, - } - else: - return this_server_time + len(worker_list) * network_overhead + \ - gpu_kernel_memory_latency * resource.max_num_local_replica - - var_size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, - seq_len=self._seq_len) - - if var.is_sparse: - send_time = _helper(resource.cpu_worker_list, worker_num_replicas=resource.worker_num_replicas) - receive_time = _helper(resource.gpu_worker_list) - else: - send_time = _helper(resource.cpu_worker_list) - if var.synchronizer.local_replication: - receive_time = _helper(resource.cpu_worker_list) - else: - receive_time = _helper(resource.gpu_worker_list) - - return send_time, receive_time - - def var_ar_time(self, var, resource, network_overhead=0.0, gpu_kernel_memory_latency=0.0): - """Compute synchronization time of a variable in AR strategy.""" - worker_list = resource.cpu_worker_list - num_workers = len(worker_list) - min_bandwidth = None - for i in range(num_workers): - for j in range(i, num_workers): - if min_bandwidth is None: - min_bandwidth = resource.network_bandwidth[worker_list[j]][worker_list[i]] - else: - min_bandwidth = min(min_bandwidth, resource.network_bandwidth[worker_list[j]][worker_list[i]]) - - # Compressor - if var.compressor == "PowerSGDCompressor" or var.compressor == 3: - rank = 10 # currently using default value. So hardcode here. 
# todo: confirm - # assume var must be a dense variable. - og_shape = var.shape - ndims = len(og_shape) - if ndims <= 1: # no compress - size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, - seq_len=self._seq_len) - else: - if ndims > 2: - n = og_shape[0] - m = 1 - for s in og_shape[1:]: - m *= s # tensor's shape (n, m) - else: - n, m = og_shape[0], og_shape[1] - size_to_transfer = n * rank + m * rank - dtype = tf.float32 - elif var.compressor == "HorovodCompressorEF" or var.compressor == "HorovodCompressor" \ - or var.compressor == 2 or var.compressor == 1: - size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, - seq_len=self._seq_len) - dtype = tf.float32 - elif var.compressor == "NoneCompressor" or var.compressor == 0: - size_to_transfer = var.size_to_transfer(batch_size_per_gpu=self._batch_size_per_gpu, - seq_len=self._seq_len) - dtype = var.dtype - else: - raise ValueError('Compressor does not exist: {}'.format(var.compressor)) - - # todo: chunk_size - # AllReduce communication time - # time = 2 * (num_workers - 1) * get_dense_var_bits(size_to_transfer, dtype) / (min_bandwidth * num_workers) - time = get_dense_var_bits(size_to_transfer, dtype) / min_bandwidth - - if self._get_coef: - return { - 'transmission': time, - 'network_overhead': 1, # len(worker_list), - 'gpu_kernel_memory_latency': resource.max_num_local_replica, - 'constant': 1.0, - # possible affecting factors. - 'var_name': var.name, - 'group': var.synchronizer.group, - 'strategy': 'allreduce', - 'is_sparse': False, - # 'chunk_size': chunk_size, - 'spec': 'NCCL', # default - 'compressor': var.compressor, - 'worker_list': worker_list, - 'num_workers': num_workers, - 'size_to_transfer': size_to_transfer, - 'dtype': str(dtype), - 'min_bandwidth': min_bandwidth, - 'max_num_local_replica': resource.max_num_local_replica, - 'is_ps': False, - } - else: - return time + network_overhead * len(worker_list) \ - + gpu_kernel_memory_latency * resource.max_num_local_replica - - - - # @staticmethod - # def var_ps_time(var_name, is_sparse, local_proxy, server_list, cpu_worker_list, gpu_worker_list, - # max_num_local_replica, worker_num_replicas, network_bandwidth, get_coef, - # network_overhead=0.0, gpu_kernel_memory_latency=0.0): - # """Compute synchrinzation time of a variable in PS strategy.""" - # - # def _helper(worker_list, worker_num_replicas=None): - # if worker_num_replicas is None: - # worker_num_replicas = [1.0] * len(worker_list) - # # Compute the slowest server - # slowest_server_time = 0 - # for j, server in enumerate(server_list): - # if server.size_to_transfer == 0: - # continue - # # network transfer: sum up all workers time. equals to the time cost of this server. - # this_server_time = 0 - # for k, worker in enumerate(worker_list): - # if _resolved_devices_on_diff_machine(server.device, worker): - # if is_sparse: - # this_worker_size = get_sparse_var_bits(server.size_to_transfer) * worker_num_replicas[k] - # else: - # this_worker_size = get_dense_var_bits(server.size_to_transfer, server.dtype) - # this_server_time += this_worker_size / network_bandwidth[server.device][worker] - # slowest_server_time = max(slowest_server_time, this_server_time) - # - # if get_coef: - # return { - # 'transmission': slowest_server_time, - # 'network_overhead': len(worker_list), - # 'gpu_kernel_memory_latency': max_num_local_replica, - # 'constant': 1.0, - # # possible affecting factors. 
- # 'var_name': var_name, - # 'strategy': 'ps', - # 'local_proxy': local_proxy, - # 'is_sparse': is_sparse, - # 'server_list': [partition.to_dict() for partition in server_list], - # 'worker_list': worker_list, - # 'cpu_worker_list': cpu_worker_list, - # 'gpu_worker_list': gpu_worker_list, - # 'worker_num_replicas': worker_num_replicas, - # 'max_num_local_replica': max_num_local_replica, - # } - # else: - # return slowest_server_time + len(worker_list) * network_overhead + \ - # gpu_kernel_memory_latency * max_num_local_replica - # - # if is_sparse: - # send_time = _helper(cpu_worker_list, worker_num_replicas=worker_num_replicas) - # receive_time = _helper(gpu_worker_list) - # else: - # send_time = _helper(cpu_worker_list) - # if local_proxy: - # receive_time = _helper(cpu_worker_list) - # else: - # receive_time = _helper(gpu_worker_list) - # - # if get_coef: - # # return {key: send_time[key]+receive_time[key] for key in send_time.keys()} - # return send_time, receive_time - # else: - # return send_time, receive_time diff --git a/autodist/strategy/auto/item.py b/autodist/strategy/auto/item.py index b73d22c..94c316b 100644 --- a/autodist/strategy/auto/item.py +++ b/autodist/strategy/auto/item.py @@ -25,7 +25,7 @@ from autodist.strategy.base import byte_size_load_fn from autodist.utils import logging from autodist.cluster import SSHCluster -from autodist.simulator.utils import GPU_TO_CPU_BANDWIDTH, GIGABITS, get_dtype_bits +from autodist.autosync.simulator.utils import GPU_TO_CPU_BANDWIDTH, GIGABITS, get_dtype_bits class VarType(Enum): diff --git a/autodist/strategy/auto_strategy.py b/autodist/strategy/auto_strategy.py index 354d62d..3b215ee 100644 --- a/autodist/strategy/auto_strategy.py +++ b/autodist/strategy/auto_strategy.py @@ -15,7 +15,7 @@ """An AutoStrategy using a trained linear simulator.""" from autodist.strategy.auto.base import AutoStrategyBase -from autodist.simulator.linear_simulator import LinearSimulator +from autodist.autosync.simulator.linear_simulator import LinearSimulator class AutoStrategy(AutoStrategyBase): """ diff --git a/test.py b/test.py index b481208..7c6be13 100644 --- a/test.py +++ b/test.py @@ -1,17 +1,48 @@ -from arion.simulator.simulator import Simulator -from arion.strategy import base -from arion.graph_item import GraphItem +import tensorflow as tf +import autodist -resource_spec_file = '/home/hao.zhang/project/pycharm/ncf-trial/official/recommendation/trial/trialrun_resource_specs/resource_spec_2.yml' -strategy_path = '/home/hao.zhang/oceanus_simulator/ncf_3/strategies/20200505T174311M650364' -original_graph_item_path = '/home/hao.zhang/oceanus_simulator/ncf/strategies/original_graph_item' +with tf.Graph().as_default(), autodist.scope(): +########################################################################## -s = base.Strategy.deserialize(strategy_path) + train_dataset = tf.data.Dataset.from_tensor_slices( + (train_images, train_labels)).repeat(EPOCHS).shuffle(len(train_images)//2).batch(BATCH_SIZE) + train_iterator = tf.compat.v1.data.make_one_shot_iterator(train_dataset).get_next() -simulator = Simulator(resource_file=resource_spec_file, - original_graph_item_path=original_graph_item_path) + model = tf.keras.Sequential([ + tf.keras.layers.Conv2D(32, 3, activation='relu'), + tf.keras.layers.MaxPooling2D(), + tf.keras.layers.Flatten(), + tf.keras.layers.Dropout(0.1), + tf.keras.layers.Dense(10, activation='softmax') + ]) + loss_fn = tf.keras.losses.SparseCategoricalCrossentropy() + optimizer = tf.keras.optimizers.SGD() -ret = simulator.simulate(s) + 
def train_step(inputs): + x, y = inputs + y_hat = model(x, training=True) + loss = loss_fn(y, y_hat) + all_vars = [] + for v in model.trainable_variables: + all_vars.append(v) + grads = tf.gradients(loss, all_vars) + update = optimizer.apply_gradients(zip(grads, all_vars)) -print('finished') + return loss, update + + fetches = train_step(train_iterator) + ##################################################################### + # Change 3: Create distributed session. + # Instead of using the original TensorFlow session for graph execution, + # let's use AutoDist's distributed session, in which a computational + # graph for distributed training is constructed. + # + # [original line] + # >>> sess = tf.compat.v1.Session() + # + sess = autodist.create_distributed_session() + ##################################################################### + for _ in range(min(10, len(train_images) // BATCH_SIZE * EPOCHS)): + loss, _ = sess.run(fetches) + print(f"train_loss: {loss}") \ No newline at end of file From 0737cb59a4bb29fb61d807223ac61be9069789d9 Mon Sep 17 00:00:00 2001 From: Hao Zhang Date: Mon, 10 Aug 2020 02:25:55 -0400 Subject: [PATCH 11/11] remove search code for now --- autodist/autosync/search/__init__.py | 0 autodist/autosync/search/random_search.py | 336 ---------------------- autodist/autosync/simulator/base.py | 295 +++++++++---------- tests/test_simulator.py | 26 +- 4 files changed, 146 insertions(+), 511 deletions(-) delete mode 100644 autodist/autosync/search/__init__.py delete mode 100644 autodist/autosync/search/random_search.py diff --git a/autodist/autosync/search/__init__.py b/autodist/autosync/search/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/autodist/autosync/search/random_search.py b/autodist/autosync/search/random_search.py deleted file mode 100644 index 38fcd67..0000000 --- a/autodist/autosync/search/random_search.py +++ /dev/null @@ -1,336 +0,0 @@ -import json -import time -from multiprocessing import Process, Queue - -import copy -import numpy as np -import os - -from arion.const import DEFAULT_RANDOM_SEARCH_DIR -from arion.graph_item import GraphItem -from arion.resource_spec import ResourceSpec -from arion.strategy import RandomStrategy, AllReduce -from arion.utils import logging - - -def build_worker(queue, builder, gi, rs): - np.random.seed() - ret = builder.build(gi, rs) - queue.put(ret) - -def get_resource_specs(trial_resource_spec_dir): - resource_specs = [] - if os.path.isdir(trial_resource_spec_dir): - for file_name in os.listdir(trial_resource_spec_dir): - file_path = os.path.join(trial_resource_spec_dir, file_name) - if os.path.isfile(file_path) and file_path.endswith('.yml'): - resource_specs.append(file_path) - elif os.path.isfile(trial_resource_spec_dir): - resource_specs.append(trial_resource_spec_dir) - else: - raise ValueError("Cannot find valid files in {}".format(trial_resource_spec_dir)) - return resource_specs - - -def get_strategies(strategies_dir): - strategies = [] - if os.path.isdir(strategies_dir): - for file_name in os.listdir(strategies_dir): - file_path = os.path.join(strategies_dir, file_name) - if os.path.isfile(file_path) and file_path.split('/')[-1].startswith('2020'): - strategies.append(file_path) - elif os.path.isfile(strategies_dir): - strategies.append(strategies_dir) - else: - raise ValueError("Cannot find valid files in {}".format(strategies_dir)) - return strategies - - -class RandomSearch: - def __init__(self, - space, - heuristics, - search_params, - original_graph_item_path, - resource_file, - 
simulator=None, - trial_run_fn=None): - - self.space = space - self.heuristics = heuristics - self.search_params = search_params - - self.original_graph_item_path = original_graph_item_path - self.resource_file = resource_file - - self.simulator = simulator - self.trial_run_fn = trial_run_fn - - self._resource_spec = ResourceSpec(self.resource_file) - self._original_graph_item = GraphItem.deserialize(original_graph_item_path) - - def search(self): - # candidates, scores, features = self.propose(self.search_params['num_candidate_explore']) - candidates, scores, features = self.batch_propose(self.search_params['num_candidate_explore']) - n_pick = self.search_params['num_candidate_per_trial'] - - # cast them to be np arrays - if self.search_params['diversity_metric'] == 'embedding': - picked_candidates = self.submodular_pick_by_embedding(np.array(scores), - candidates, - np.stack(features), - n_pick, - self.search_params['simulation_weight'], - self.search_params['diversity_weight']) - elif self.search_params['diversity_metric'] == 'expression': - picked_candidates = self.submodular_pick_by_expression(np.array(scores), - candidates, - n_pick, - self.search_params['simulation_weight'], - self.search_params['diversity_weight']) - else: - raise ValueError('Unrecognized diversity metric...') - if self.trial_run_fn: - self.trial_run(picked_candidates, search_iteration=0) - - def propose(self, num_proposal, use_simulator=True): - builder = RandomStrategy(self.space, self.heuristics) - candidates = [] - features = [] - scores = [] - # np.random.seed(1) - idx = 0 - - while len(candidates) < num_proposal: - logging.info('Sampling strategy {}'.format(idx)) - start_time = time.time() - expr = builder.build(self._original_graph_item, self._resource_spec) - elapsed = time.time() - start_time - logging.info('Sampling strategy takes {}'.format(elapsed)) - builder.reset() - idx += 1 - logging.info('Progress {}/{}'.format(len(candidates), num_proposal)) - if self.simulator and use_simulator: - start_time = time.time() - score, feature = self.simulator.simulate(expr, self._resource_spec) - elapsed = time.time() - start_time - logging.info('Inference strategy takes {}'.format(elapsed)) - if score > self.search_params['rejection_score']: - logging.info('strategy {} has score {} > {}, ' - 'rejected..'.format(idx, score, self.search_params['rejection_score'])) - continue - else: - candidates.append(expr) - features.append(feature) - scores.append(score[0]) - else: - candidates.append(expr) - features.append([]) - scores.append(0) - logging.info('rejection ratio: {}'.format(1 - num_proposal / float(idx))) - return candidates, scores, features - - def batch_propose(self, num_proposal, batch_size=32, use_simulator=True): - - builders = [RandomStrategy(self.space, self.heuristics) for _ in range(batch_size)] - graph_items = [self._original_graph_item for _ in range(batch_size)] - rss = [ResourceSpec(self.resource_file) for _ in range(batch_size)] - candidates = [] - features = [] - scores = [] - # np.random.seed(1) - idx = 0 - - while len(candidates) < num_proposal: - logging.info('Sampling strategy {}'.format(idx)) - start_time = time.time() - - q = Queue() - exprs = [] - prs = [] - for obj, arg1, arg2 in zip(builders, graph_items, rss): - prs.append(Process(target=build_worker, args=(q, obj, arg1, arg2))) - prs[-1].start() - for pr in prs: - expr = q.get() # will block - exprs.append(expr) - for pr in prs: - pr.join() - - elapsed = time.time() - start_time - logging.info('Sampling strategy takes {}'.format(elapsed)) 
- for builder in builders: builder.reset() - logging.info('Progress {}/{}'.format(len(candidates), num_proposal)) - if self.simulator and use_simulator: - start_time = time.time() - batch_score, batch_feature = self.simulator.simulate(exprs, rss) - elapsed = time.time() - start_time - logging.info('Inference strategy takes {}'.format(elapsed)) - for ite, expr in enumerate(exprs): - # print(batch_score[ite], batch_feature[ite].shape) - if batch_score[ite] > self.search_params['rejection_score']: - logging.info('strategy {} has score {} > {}, ' - 'rejected..'.format(idx+ite, batch_score[ite], self.search_params['rejection_score'])) - else: - candidates.append(expr) - features.append(batch_feature[ite]) - scores.append(batch_score[ite]) - else: - for ite, expr in enumerate(exprs): - candidates.append(expr) - features.append([]) - scores.append(0) - idx += batch_size - logging.info('rejection ratio: {}'.format(1 - num_proposal / float(idx))) - return candidates[:num_proposal], scores[:num_proposal], features[:num_proposal] - - def submodular_pick_by_embedding(self, - scores, - candidates, - candidate_features, - n_pick, - beta=1.0, - alpha=1.0): - n = len(scores) - assert n == len(candidate_features) - - ret = [] - sim = np.dot(candidate_features, candidate_features.T) - remain = list(range(len(scores))) - - for _ in range(n_pick): - tmp_delta = -scores[remain] * beta - if len(ret) > 0: - tmp_delta -= alpha * (sim[remain, :][:, ret]).mean(1) - max_x = tmp_delta.argmax() - max_x = remain[max_x] - - ret.append(max_x) - remain.remove(max_x) - - return [candidates[i] for i in ret] - - def submodular_pick_by_expression(self, - scores, - candidates, - n_pick, - beta=1.0, - alpha=1.0): - - def remove_group_or_reduction_destination(strategy): - tmp_strategy = copy.deepcopy(strategy) - for node in tmp_strategy.node_config: - if node.partitioner: - for part in node.part_config: - synchronizer = getattr(part, part.WhichOneof('synchronizer')) - if hasattr(synchronizer, 'reduction_destination'): - synchronizer.reduction_destination = '' - else: - synchronizer.group = 0 - else: - synchronizer = getattr(node, node.WhichOneof('synchronizer')) - if hasattr(synchronizer, 'reduction_destination'): - synchronizer.reduction_destination = '' - else: - synchronizer.group = 0 - return tmp_strategy - - def estimate_difference(strategy, node_config_set): - score = 0 - for i, node in enumerate(strategy.node_config): - if_seen = False - for seen_node in node_config_set[i]: - if seen_node == node: - if_seen = True - break - if not if_seen: - score += 1 - return score - - assert len(scores) == len(candidates) - - node_config_set = [list() for _ in candidates[0].node_config] - remain = list(range(len(scores))) - ret = [] - for _ in range(n_pick): - max_x = -1 - max_delta = -1e9 - max_strategy_copy = None - - for x in remain: - tmp_strategy = remove_group_or_reduction_destination(candidates[x]) - diff_score = estimate_difference(tmp_strategy, node_config_set) - assert(diff_score <= len(tmp_strategy.node_config)) - # print('diff score {}..'.format(diff_score)) - tmp_delta = - scores[x] * beta + diff_score * alpha - if tmp_delta > max_delta: - max_delta, max_x, max_strategy_copy = tmp_delta, x, tmp_strategy - max_diff_score = diff_score *alpha - max_simulation_score= -scores[x] - - print('Add one candidate with max score: {}, {}, {}'.format(max_simulation_score, max_diff_score, max_delta)) - ret.append(max_x) - remain.remove(max_x) - - # update the node config set - for i, node in enumerate(max_strategy_copy.node_config): - 
if_seen = False - for seen_node in node_config_set[i]: - if seen_node == node: - if_seen = True - break - if not if_seen: - node_config_set[i].append(node) - - return [candidates[i] for i in ret] - - def trial_run(self, - candidate_strategies=None, - search_iteration=0): - # serialize all candidates to folder - target_dir = os.path.join(DEFAULT_RANDOM_SEARCH_DIR, str(search_iteration)) - os.makedirs(target_dir, exist_ok=False) - self._serialize_candidate_strategies(candidate_strategies, target_dir) - self._save_hyperparams(target_dir) - - # launch trial run - self._launch_trial_run(target_dir) - - @staticmethod - def _serialize_candidate_strategies(candidate_strategies, target_dir): - for strategy in candidate_strategies: - path = os.path.join(target_dir, strategy.id) - strategy.serialize(path) - - def _launch_trial_run(self, strategies_dir): - strategies = get_strategies(strategies_dir) - - # this will launch distributed processes and take very long - self.trial_run_fn([self.resource_file], strategies) - - def _save_hyperparams(self, target_dir): - # copy the constraint file as well - space_file = os.path.join(target_dir, 'space.json') - with open(space_file, 'w') as f: - json.dump(self.space, f) - heuristics_file = os.path.join(target_dir, 'heuristics.json') - with open(heuristics_file, 'w') as f: - json.dump(self.heuristics, f) - search_params_file = os.path.join(target_dir, 'search_params.json') - with open(search_params_file, 'w') as f: - json.dump(self.search_params, f) - - def check_if_visited(self): - raise NotImplementedError() - - def check_if_trial_run(self): - raise NotImplementedError() - - # Don't use, only for debug. - def _single_run(self): - # builder = BalancedPartitionedPS() - # builder = PartitionedAR(chunk_size=1) - builder = AllReduce() - expr = builder.build(self._original_graph_item, self._resource_spec) - logging.info(expr) - self.trial_run([expr], search_iteration=0) diff --git a/autodist/autosync/simulator/base.py b/autodist/autosync/simulator/base.py index 5ac04b7..e670cfa 100644 --- a/autodist/autosync/simulator/base.py +++ b/autodist/autosync/simulator/base.py @@ -1,152 +1,143 @@ -# Copyright 2020 Petuum. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Simulator base class.""" -import os -from collections import OrderedDict - -from autodist.graph_item import GraphItem -from autodist.kernel.partitioner import PartitionerConfig -from autodist.resource_spec import ResourceSpec -from autodist.strategy.base import Strategy -from autodist.strategy.auto.item import VariableItem, PartItem, ResourceItem - - -class SimulatorBase: - """Simulates strategies for a given graph and resource spec.""" - - def __init__(self, - graph_item=None, - resource_spec=None): - """ - Constructor for simulator base class - Args: - graph_item: a GraphItem object, or a path to a serialized GraphItem object. - resource_spec: a ResourceSpec object, or a path to a resource file. 
- """ - # check if it is a path - self._graph_item = None - if isinstance(graph_item, GraphItem): - self._graph_item = graph_item - elif isinstance(graph_item, str) and os.path.exists(graph_item): - self._graph_item = GraphItem.deserialize(graph_item) - else: - raise ValueError("Invalid graph_item: {}".format(graph_item)) - - self._resource_spec = None - if isinstance(resource_spec, ResourceSpec): - self._resource_spec = resource_spec - elif isinstance(resource_spec, str) and os.path.exists(resource_spec): - self._resource_spec = ResourceSpec(resource_spec) - else: - raise ValueError("Invalid resource_spec: {}".format(resource_spec)) - - def update_graph_item(self, graph_item): - """Change the default graph_item with this simulator.""" - if not graph_item: - raise ValueError('Empty graph item.') - self._graph_item = graph_item - - def update_resource_spec(self, resource_spec): - """Change the default resource_spec with this simulator.""" - if not resource_spec: - raise ValueError('Empty resource spec.') - self._resource_spec = resource_spec - - def simulate(self, - strategy, - graph_item=None, - resource_spec=None, - *args, - **kwargs): - """Return simulated runtime cost given (strategy, graph_item, resource_spec) tuple.""" - raise NotImplementedError() - - def inference(self, *args, **kwargs): - """ - Abstract method for simulator inference. - - Args: - features: feature input extracted from (GraphItem, ResourceSpec, Strategy) tuple. - checkpoint: optional simulator weight. - - Returns: - float - """ - raise NotImplementedError() - - def load_checkpoint(self, checkpoint): - """ - Load a checkpoint file as weights of the simulator. - - Args: - checkpoint: path to a checkpoint file. - """ - raise NotImplementedError() - - # def save_checkpoint(self, model, checkpoint): - # """ - # Save a trained weight as a checkpoint file. - # - # Args: - # model: trained model. - # checkpoint: path where to save the checkpoint. - # """ - # raise NotImplementedError() - - def preprocess(self, - strategy, - graph_item=None, - resource_spec=None): - """ - Preprocess a (strategy, graph_item, resource_spec) tuple into pre-features. - - Args: - strategy: a distribution strategy - graph_item: optional graph_item, if not provided, the default one bundled with simulator will be used. - resource_spec: optional resource_spec, if not provided, the default one bundled with simulator will be used. - - Returns: - OrderedDict(): variable/part name to variable/part items. - ResourceItem: - """ - if not graph_item: - if not self._graph_item: - raise ValueError('No graph item provided.') - else: - graph_item = self._graph_item - if not resource_spec: - if not self._resource_spec: - raise ValueError('No resource spec provided.') - else: - resource_spec = self._resource_spec - if not strategy: - raise ValueError('No strategy provided.') - - resource_item = ResourceItem(resource_spec) - name_to_var = {var.name: var for var_op, var in graph_item.trainable_var_op_to_var.items()} - - name_to_items = OrderedDict() - for node in strategy.node_config: - var_name = node.var_name - var = name_to_var[var_name] - if node.partitioner: - pc = PartitionerConfig(partition_str=node.partitioner) - for i, part in enumerate(node.part_config): - part_item = PartItem(var, graph_item, i, pc, part) - name_to_items[part_item.name] = part_item - else: - var_item = VariableItem(var, graph_item, node) - name_to_items[var_item.name] = var_item - return name_to_items, resource_item +# Copyright 2020 Petuum. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Simulator base class.""" +from collections import OrderedDict + +import os + +from autodist.graph_item import GraphItem +from autodist.kernel.partitioner import PartitionerConfig +from autodist.resource_spec import ResourceSpec +from autodist.strategy.auto.item import VariableItem, PartItem, ResourceItem + + +class SimulatorBase: + """Simulates strategies for a given graph and resource spec.""" + + def __init__(self, + graph_item=None, + resource_spec=None): + """ + Constructor for simulator base class + Args: + graph_item: a GraphItem object, or a path to a serialized GraphItem object. + resource_spec: a ResourceSpec object, or a path to a resource file. + """ + # check if it is a path + self._graph_item = None + if isinstance(graph_item, GraphItem): + self._graph_item = graph_item + elif isinstance(graph_item, str) and os.path.exists(graph_item): + self._graph_item = GraphItem.deserialize(graph_item) + else: + raise ValueError("Invalid graph_item: {}".format(graph_item)) + + self._resource_spec = None + if isinstance(resource_spec, ResourceSpec): + self._resource_spec = resource_spec + elif isinstance(resource_spec, str) and os.path.exists(resource_spec): + self._resource_spec = ResourceSpec(resource_spec) + else: + raise ValueError("Invalid resource_spec: {}".format(resource_spec)) + + def update_graph_item(self, graph_item): + """Change the default graph_item with this simulator.""" + if not graph_item: + raise ValueError('Empty graph item.') + self._graph_item = graph_item + + def update_resource_spec(self, resource_spec): + """Change the default resource_spec with this simulator.""" + if not resource_spec: + raise ValueError('Empty resource spec.') + self._resource_spec = resource_spec + + def simulate(self, + strategy, + graph_item=None, + resource_spec=None, + *args, + **kwargs): + """Return simulated runtime cost given (strategy, graph_item, resource_spec) tuple.""" + raise NotImplementedError() + + def inference(self, *args, **kwargs): + """Abstract method for simulator inference.""" + raise NotImplementedError() + + def load_checkpoint(self, checkpoint): + """ + Load a checkpoint file as weights of the simulator. + + Args: + checkpoint: path to a checkpoint file. + """ + raise NotImplementedError() + + # def save_checkpoint(self, model, checkpoint): + # """ + # Save a trained weight as a checkpoint file. + # + # Args: + # model: trained model. + # checkpoint: path where to save the checkpoint. + # """ + # raise NotImplementedError() + + def preprocess(self, + strategy, + graph_item=None, + resource_spec=None): + """ + Preprocess a (strategy, graph_item, resource_spec) tuple into pre-features. + + Args: + strategy: a distribution strategy + graph_item: optional graph_item, if not provided, the default one bundled with simulator will be used. + resource_spec: optional resource_spec, if not provided, the default one bundled with simulator will be used. + + Returns: + OrderedDict(): variable/part name to variable/part items. 
+            ResourceItem: a ResourceItem built from the resource_spec.
+        """
+        if not graph_item:
+            if not self._graph_item:
+                raise ValueError('No graph item provided.')
+            else:
+                graph_item = self._graph_item
+        if not resource_spec:
+            if not self._resource_spec:
+                raise ValueError('No resource spec provided.')
+            else:
+                resource_spec = self._resource_spec
+        if not strategy:
+            raise ValueError('No strategy provided.')
+
+        resource_item = ResourceItem(resource_spec)
+        name_to_var = {var.name: var for var_op, var in graph_item.trainable_var_op_to_var.items()}
+
+        name_to_items = OrderedDict()
+        for node in strategy.node_config:
+            var_name = node.var_name
+            var = name_to_var[var_name]
+            if node.partitioner:
+                pc = PartitionerConfig(partition_str=node.partitioner)
+                for i, part in enumerate(node.part_config):
+                    part_item = PartItem(var, graph_item, i, pc, part)
+                    name_to_items[part_item.name] = part_item
+            else:
+                var_item = VariableItem(var, graph_item, node)
+                name_to_items[var_item.name] = var_item
+        return name_to_items, resource_item
diff --git a/tests/test_simulator.py b/tests/test_simulator.py
index f2aaeb1..7b3d7ed 100644
--- a/tests/test_simulator.py
+++ b/tests/test_simulator.py
@@ -1,27 +1,6 @@
-from autodist.simulator.utils import _resolve_device_address
 from autodist.resource_spec import ResourceSpec
-from autodist.cluster import SSHCluster
-from autodist.kernel.device.resolver import DeviceResolver
-from autodist.simulator.base import SimulatorBase
 from autodist.simulator.utils import _resolve_device_address
 
-# def test_resolve_device_address():
-#     resource_spec_file = '/home/hao.zhang/project/pycharm/autodist/examples/resource_spec.yml'
-#     rs = ResourceSpec(resource_spec_file)
-#     cluster = SSHCluster(rs)
-#     resolver = DeviceResolver(cluster)
-#     return True
-
-def test_resolve():
-    resource_spec_file = '/home/hao.zhang/project/pycharm/autodist/examples/resource_spec.yml'
-    rs = ResourceSpec(resource_spec_file)
-    cluster = SSHCluster(rs)
-    resolver = DeviceResolver(cluster)
-    SimulatorBase.network_bandwidth(rs, resolver)
-    devices = [device for device, _ in rs.devices]
-
-    resolved_devices_1 = [_resolve_device_address(device, resolver) for device, _ in rs.devices]
-    devices = resolver.resolve_to_device_str(devices)
+from autodist.cluster import SSHCluster
+from autodist.kernel.device.resolver import DeviceResolver
 
-    for d1, d2 in zip(resolved_devices_1, devices):
-        assert d1 == d2
\ No newline at end of file
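
The base-class API above is intentionally thin: preprocess() turns a (strategy, graph_item, resource_spec)
triple into VariableItem/PartItem objects plus a ResourceItem, and concrete simulators implement
simulate()/inference() on top of those items. A minimal usage sketch follows; the subclass name
PredefinedSimulator, the Strategy.deserialize() helper, and all paths are assumptions for illustration
only and are not introduced by this patch:

    # Sketch only -- PredefinedSimulator, Strategy.deserialize(), and the paths
    # below are assumptions, not APIs added in these hunks.
    from autodist.strategy.base import Strategy                                       # assumed to expose deserialization
    from autodist.autosync.simulator.predefined_simulator import PredefinedSimulator  # assumed concrete subclass

    sim = PredefinedSimulator(
        graph_item='/tmp/original_graph_item',       # serialized GraphItem (placeholder path)
        resource_spec='/tmp/resource_spec.yml')      # resource spec file (placeholder path)

    strategy = Strategy.deserialize('/tmp/candidate_strategy')                        # assumed helper
    # name_to_items maps each variable/partition name to a VariableItem/PartItem;
    # resource_item wraps the ResourceSpec for feature extraction by the subclass.
    name_to_items, resource_item = sim.preprocess(strategy)
    cost = sim.simulate(strategy)                    # simulate() is implemented by the subclass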