Add layerwise quant error profiling and delete useless code #27

Open · wants to merge 4 commits into base: main
37 changes: 28 additions & 9 deletions dipoorlet/__main__.py
@@ -13,11 +13,12 @@
from .deploy import to_deploy
from .dist_helper import init_from_mpi, init_from_slurm
 from .profiling import (quantize_profiling_multipass, quantize_profiling_transformer,
-                        show_model_profiling_res, show_model_ranges, weight_need_perchannel)
+                        quantize_profiling_layerwise, show_model_profiling_res,
+                        show_model_ranges, weight_need_perchannel)
 from .tensor_cali import tensor_calibration
 from .utils import (ONNXGraph, load_clip_val, logger, reduce_clip_val,
                     reduce_profiling_res, save_clip_val, save_profiling_res,
-                    setup_logger, deploy_QOperator)
+                    setup_logger, restore_data)
from .weight_transform import weight_calibration

parser = argparse.ArgumentParser()
@@ -51,9 +52,19 @@
parser.add_argument("--pattern", help="Sparse pattern", choices=["unstruction", "nv24"], default="unstruction")
parser.add_argument("--optim_transformer", help="Transformer model optimization", default=False, action='store_true')
parser.add_argument("--model_type", help="Transformer model type", choices=["unet"], default=None)
parser.add_argument("--quant_format", default="QDQ", type=str, choices=["QOP", "QDQ"])
parser.add_argument("--onnx_sim", help="Whether use onnxsim to simplify model", action='store_true')
parser.add_argument("--qnode_version", help="The quant node opset version", type=int, choices=[13], default=13)
parser.add_argument("--layerwise_error_prof", help='Profiling per-layer quantitative error', action="store_true")
parser.add_argument("--prof_num", type=int, default=32)
parser.add_argument("--batch_data_dir", type=str, default="./batch_data/")
parser.add_argument("--criterion", help='The evaluation criterion of profiling quantitative error', type=str,
choices=['cosine', 'max_abs_gap'], default="cosine")
parser.add_argument("--sensitive_layer_num", type=int, default=10)
args = parser.parse_args()

if args.layerwise_error_prof:
assert args.prof_num <= args.data_num

if args.slurm:
    init_from_slurm()
elif args.mpirun:
@@ -96,10 +107,11 @@
    model = onnx.load(args.optimzed_model_dir)
else:
    model = onnx.load(args.model)
-if model.opset_import[0].version < 13:
-    model = onnx.version_converter.convert_version(model, 13)
-model, check = simplify(model)
-assert check, "Simplified ONNX model could not be validated"
+if model.opset_import[0].version < args.qnode_version:
+    model = onnx.version_converter.convert_version(model, args.qnode_version)
+if args.onnx_sim:
+    model, check = simplify(model, mutable_initializer=True)
+    assert check, "Simplified ONNX model could not be validated"
onnx_graph = ONNXGraph(model, args.output_dir, args.deploy, args.model_type)

if dist.get_rank() == 0 and not args.optim_transformer:
@@ -151,11 +163,18 @@
show_model_ranges(graph, act_clip_val, weight_clip_val, args)
weight_need_perchannel(graph, args)

+# Profile layerwise quantization error (rank 0 runs it, the other ranks wait at the barrier).
+if args.layerwise_error_prof:
+    if dist.get_rank() == 0:
+        if not os.path.exists(args.batch_data_dir):
+            restore_data(args, graph.network_inputs, args.prof_num)
+        logger.info("Profiling layerwise quantization error ...")
+        quantize_profiling_layerwise(graph, graph_ori, act_clip_val, weight_clip_val, args)
+    dist.barrier()

# Deploy
if dist.get_rank() == 0:
    logger.info("Deploy to " + args.deploy + '...')
    to_deploy(graph, act_clip_val, weight_clip_val, args)
-    if args.quant_format == 'QOP' and args.model_type is None:
-        deploy_QOperator(graph.model, tensor_range, args)
end = time.time()
logger.info("Total time cost: {} seconds.".format(int(end - start)))
15 changes: 15 additions & 0 deletions dipoorlet/forward_net.py
@@ -483,3 +483,18 @@ def forward_get_tensor(graph, net, index, args):
    ort_outputs = ort_session.run(outputs, ort_inputs)
    ort_outs = OrderedDict(zip(outputs, ort_outputs))
    return copy.deepcopy(ort_outs)


def forward_get_output(graph, net, index, args):
    rank = dist.get_rank()
    device = rank % torch.cuda.device_count()
    providers = [("CUDAExecutionProvider", {'device_id': device})]
    ort_session = ort.InferenceSession(net.SerializeToString(), providers=providers)
    ort_inputs = {}
    for data in input_data_generator(args.batch_data_dir, graph.network_inputs, index, index + 1):
        for name in graph.network_inputs:
            ort_inputs[name] = data[name][:].reshape(graph.get_tensor_shape(name))
    outputs = [output.name for output in ort_session.get_outputs()]
    ort_outputs = ort_session.run(outputs, ort_inputs)
    ort_outs = OrderedDict(zip(outputs, ort_outputs))
    return copy.deepcopy(ort_outs)
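
A hedged usage sketch of the new helper, assuming graph_ori and args already exist as in __main__.py above: it runs one packed batch file from args.batch_data_dir through onnxruntime and returns every network output.

# Sketch only: fetch FP32 reference outputs for packed batch 0.
fp_outputs = forward_get_output(graph_ori, graph_ori.model, 0, args)
for name, tensor in fp_outputs.items():
    print(name, tensor.shape)   # one entry per network output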
73 changes: 70 additions & 3 deletions dipoorlet/profiling.py
@@ -7,10 +7,10 @@
from onnx import numpy_helper
from tqdm import tqdm

-from .forward_net import forward_get_tensor, ActivationCache
+from .forward_net import forward_get_tensor, forward_get_output, ActivationCache
 from .platform_settings import platform_setting_table
-from .quantize import DQTENSORSUFFIX, QUANT_NODE_NAME_LIST, quant_graph
-from .utils import cos_similarity, logger
+from .quantize import DQTENSORSUFFIX, QUANT_NODE_NAME_LIST, quant_graph, insert_fake_quant_node, delete_fake_quant_node
+from .utils import cos_similarity, max_abs_gap, logger


def update_node_quant_profiling(graph_q, node, fp_cache, q_cache, layer_cosine_dict, args):
@@ -262,3 +262,70 @@ def show_model_profiling_res(graph_after_wt, layer_cosine_dict, model_cosine_dic
                                                         model_cosine_dict[name][1]))
        else:
            logger.info("{:40} tolcos : {:<.5f}".format(name, model_cosine_dict[name][0]))


def quantize_profiling_layerwise(graph_after_wt, graph_ori, act_clip_val, weight_clip_val, args):
    clip_val = act_clip_val.copy()
    clip_val.update(weight_clip_val)
    graph_q, quant_node_list = quant_graph(graph_after_wt, clip_val, args)

    node_gen = tqdm(range(0, len(quant_node_list)))

    graph_q.update_model_dim(args.prof_num)
    graph_ori.update_model_dim(args.prof_num)

    fp_net = graph_ori.model
    q_net = graph_q.model

    fp_tensors = forward_get_output(graph_ori, fp_net, 0, args)
    q_tensors = forward_get_output(graph_q, q_net, 0, args)

    base_error = 0.0
    for tensor_name in graph_q.network_outputs:
        q_tensor_name = tensor_name
        if tensor_name + DQTENSORSUFFIX in graph_q.network_outputs:
            q_tensor_name = tensor_name + DQTENSORSUFFIX

        if args.criterion == "cosine":
            base_error += 1 - cos_similarity(fp_tensors[tensor_name], q_tensors[q_tensor_name])
        elif args.criterion == "max_abs_gap":
            base_error += max_abs_gap(fp_tensors[tensor_name], q_tensors[q_tensor_name])

    base_error /= len(graph_q.network_outputs)
    res = {}
    act_quant = []
    for node_id in node_gen:
        node = quant_node_list[node_id]
        delete_fake_quant_node(graph_q, node)
        graph_q.update_model()

        q_net = graph_q.model
        q_tensors = forward_get_output(graph_q, q_net, 0, args)

        for tensor_name in graph_q.network_outputs:
            q_tensor_name = tensor_name
            if tensor_name + DQTENSORSUFFIX in graph_q.network_outputs:
                q_tensor_name = tensor_name + DQTENSORSUFFIX

            if args.criterion == "cosine":
                error = 1 - cos_similarity(fp_tensors[tensor_name], q_tensors[q_tensor_name])
            elif args.criterion == "max_abs_gap":
                error = max_abs_gap(fp_tensors[tensor_name], q_tensors[q_tensor_name])

            if node.name not in res:
                res[node.name] = error
            else:
                res[node.name] += error

        res[node.name] /= len(graph_q.network_outputs)
        res[node.name] = base_error - res[node.name]
        logger.info(res)

        insert_fake_quant_node(graph_q, node, act_quant, clip_val, args)
        graph_q.update_model()

    sorted_res = dict(sorted(res.items(), key=lambda x: x[1], reverse=True))
    logger.info(f"All layers sorted by quantization sensitivity: {sorted_res}")

    sensitive_layer = list(sorted_res.keys())
    logger.info(f"The top-{args.sensitive_layer_num} sensitive layers are {sensitive_layer[:args.sensitive_layer_num]}")
22 changes: 22 additions & 0 deletions dipoorlet/quantize.py
@@ -108,6 +108,28 @@ def insert_fake_quant_node_output(graph, clip_val, args):
    return


def delete_fake_quant_node(graph, node):

    def get_input_idx(node, input):
        for i, inp in enumerate(node.input):
            if inp == input:
                return i

    inputs = copy.deepcopy(node.input)
    for input in inputs:
        if not input.endswith(DQTENSORSUFFIX):
            continue
        dequant_node = graph.get_tensor_producer(input)
        quant_node = graph.get_tensor_producer(dequant_node.input[0])
        quant_consumers = graph.get_tensor_consumer(dequant_node.output[0])
        for consumer in quant_consumers:
            consumer.input.insert(get_input_idx(consumer, quant_node.input[0] + DQTENSORSUFFIX), quant_node.input[0])
            if dequant_node.output[0] in consumer.input:
                consumer.input.remove(dequant_node.output[0])
        graph.graph.node.remove(dequant_node)
        graph.graph.node.remove(quant_node)
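
The input rewiring is easiest to see on a toy consumer; a self-contained sketch with plain lists standing in for onnx node inputs (the suffix value is illustrative, not the real constant):

# Toy model of the rewiring: a Conv reads "x_dq", the output of DQ(Q(x));
# after deletion it reads the original tensor "x" at the same input slot.
DQ = "_dq"
conv_inputs = ["x" + DQ, "w"]
idx = conv_inputs.index("x" + DQ)
conv_inputs.insert(idx, "x")     # splice the FP tensor in at the DQ position
conv_inputs.remove("x" + DQ)     # then drop the dequantized tensor
assert conv_inputs == ["x", "w"]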


def get_qnode_by_param(param, in_tensor_name, tensor_shape, range, need_transpose=False):
    bit_width = param['bit_width']
    zero_point = [0]
67 changes: 46 additions & 21 deletions dipoorlet/utils.py
@@ -10,8 +10,6 @@
import torch.distributed as dist
from onnx import TensorProto, numpy_helper
from onnx.external_data_helper import convert_model_to_external_data
-from onnxruntime.quantization.onnx_quantizer import ONNXQuantizer
-from onnxruntime.quantization.quant_utils import QuantizationMode, QuantType
from termcolor import colored

from .platform_settings import platform_setting_table
@@ -232,6 +230,22 @@ def update_model(self):
                          opset_imports=self.model.opset_import)
    self.prepare_initializer()

def update_model_dim(self, dim=32):
    del self.graph.value_info[:]
    for input in self.graph.input:
        if input.name in self.network_inputs:
            dim1 = input.type.tensor_type.shape.dim[0]
            dim1.dim_value = dim

    for output in self.graph.output:
        if output.name in self.network_outputs:
            dim1 = output.type.tensor_type.shape.dim[0]
            dim1.dim_value = dim

    self.update_model()
    self.model = onnx.shape_inference.infer_shapes(self.model)
    self.get_shape_type()
def copy_from(self, source_graph):
    self.model = copy.deepcopy(source_graph.model)
    self.graph = copy.deepcopy(source_graph.graph)
@@ -278,6 +292,11 @@ def cos_similarity(ta, tb):
        / np.sqrt(np.square(tb).sum())


def max_abs_gap(ta, tb):
    assert ta.shape == tb.shape
    return np.max(np.abs(ta - tb))
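
The two criteria reduce to cosine distance and L-infinity (Chebyshev) distance respectively; a tiny standalone self-check of both:

import numpy as np

a = np.array([1.0, 0.0], dtype=np.float32)
b = np.array([1.0, 1.0], dtype=np.float32)
cos = (a * b).sum() / np.sqrt(np.square(a).sum()) / np.sqrt(np.square(b).sum())
assert abs((1 - cos) - 0.2929) < 1e-3     # the cosine error reported by profiling
assert np.max(np.abs(a - b)) == 1.0       # what max_abs_gap(a, b) returns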


def dispatch_functool(func):
    registry = {}

@@ -412,24 +431,30 @@ def reduce_profiling_res(rank_size, args, layer_res_fname='layer_res.json', mode
    return layer_cosine_dict, model_cosine_dict


-def deploy_QOperator(model, tensor_range, args):
-    mode = QuantizationMode.QLinearOps
-    per_channel = platform_setting_table[args.deploy]['qw_params']['per_channel']
-    op_types_to_quantize = platform_setting_table[args.deploy]['quant_nodes']
-
-    if platform_setting_table[args.deploy]['qw_params']['symmetric']:
-        weight_type = QuantType.QInt8
-    else:
-        weight_type = QuantType.QUInt8
-
-    if platform_setting_table[args.deploy]['qi_params']['symmetric']:
-        activation_type = QuantType.QInt8
-    else:
-        activation_type = QuantType.QUInt8
-
-    quantizer = ONNXQuantizer(model, per_channel, False, mode, True,
-                              weight_type, activation_type, tensor_range,
-                              None, args.skip_layers, op_types_to_quantize)
-    quantizer.quantize_model()
-    model_output = os.path.join(args.output_dir, 'qop_model.onnx')
-    quantizer.model.save_model_to_file(model_output)
+def input_data_generator(input_dir, input_name_list, data_st_idx, data_ed_idx):
+    for idx in range(data_st_idx, data_ed_idx):
+        data = {}
+        for i in input_name_list:
+            data[i] = np.fromfile(f'{input_dir}/{i}/{idx}.bin', 'float32')
+        yield data
+
+
+def restore_data(args, input_name_list, batch_size=32):
+    if os.path.exists(args.batch_data_dir):
+        logger.info("Removing stale batch data dir {}".format(args.batch_data_dir))
+        os.system(f"rm -rf {args.batch_data_dir}")
+
+    os.mkdir(args.batch_data_dir)
+
+    for name in input_name_list:
+        os.mkdir(args.batch_data_dir + '/' + name)
+        batch_data = []
+        for idx in range(0, args.data_num):
+            data = np.fromfile(f'{args.input_dir}/{name}/{idx}.bin', 'float32').reshape(1, -1)
+            batch_data.append(data)
+            if (idx + 1) % batch_size == 0:
+                batch_id = idx // batch_size
+                batch_data = np.vstack(batch_data)
+                batch_data.tofile(f'{args.batch_data_dir}/{name}/{batch_id}.bin')
+                batch_data = []
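
Taken together: restore_data packs the per-sample .bin files into per-batch files, and input_data_generator streams them back. An illustrative layout and read-back, assuming a single input named "input", data_num=64 and batch_size=32:

# ./batch_data/input/0.bin  <- samples 0..31, np.vstack'ed then tofile()
# ./batch_data/input/1.bin  <- samples 32..63
for data in input_data_generator("./batch_data", ["input"], 0, 1):
    flat = data["input"]   # flat float32 array; callers reshape it to the model's input shape
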
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,5 +1,5 @@
torch
-onnx>=1.10.0
+onnx
onnxsim
onnxruntime-gpu
numpy