From ef439aba0d8e82b58113e69111fe74c3a4820549 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Tue, 19 Nov 2024 10:47:31 +0100 Subject: [PATCH 1/2] Update the quantization script --- .../distillation/model_quantization.py | 197 ++++++++++++------ 1 file changed, 131 insertions(+), 66 deletions(-) diff --git a/examples/training/distillation/model_quantization.py b/examples/training/distillation/model_quantization.py index f0d621fc7..08427d03a 100644 --- a/examples/training/distillation/model_quantization.py +++ b/examples/training/distillation/model_quantization.py @@ -1,93 +1,158 @@ """ A quantized model executes some or all of the operations with integers rather than floating point values. This allows for a more compact models and the use of high performance vectorized operations on many hardware platforms. -As a result, you get about 40% smaller and faster models. The speed-up depends on your CPU and how PyTorch was build and can be anywhere between 10% speed-up and 300% speed-up. +As a result, you get much smaller and faster models. The speed-up depends on your CPU, but you can expect a speed-up of 2x to 4x for most CPUs. The model size is also reduced by 2x. -Note: Quantized models are only available for CPUs. Use a GPU, if available, for optimal performance. +Note: Quantized models are only recommended for CPUs. If available, use a GPU for optimal performance. 
-For more details: -https://pytorch.org/docs/stable/quantization.html +See docs for more information on quantization, optimization, benchmarks, etc.: https://sbert.net/docs/sentence_transformer/usage/efficiency.html """ -import csv -import gzip import logging -import os import time -import torch -from torch.nn import Embedding, Linear -from torch.quantization import quantize_dynamic +from datasets import load_dataset -from sentence_transformers import InputExample, LoggingHandler, SentenceTransformer, util -from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator - -#### Just some code to print debug information to stdout -logging.basicConfig( - format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()] +from sentence_transformers import ( + SentenceTransformer, + export_dynamic_quantized_onnx_model, + export_static_quantized_openvino_model, ) -#### /print debug information to stdout - - -# Check if dataset exists. If not, download and extract it -sts_dataset_path = "datasets/stsbenchmark.tsv.gz" - -if not os.path.exists(sts_dataset_path): - util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path) - -# Limit torch to 4 threads -torch.set_num_threads(4) +from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator #### Just some code to print debug information to stdout logging.basicConfig(format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO) -### /print debug information to stdout -model_name = "all-distilroberta-v1" - -# Load a named sentence model (based on BERT). This will download the model from our server. 
-# Alternatively, you can also pass a filepath to SentenceTransformer() -model = SentenceTransformer(model_name, device="cpu") -q_model = quantize_dynamic(model, {Linear, Embedding}) +# Load some sentences from the STSbenchmark dataset +train_dataset = load_dataset("sentence-transformers/stsb", split="train") +sentences = train_dataset["sentence1"] + train_dataset["sentence2"] +sentences = sentences[:10_000] +test_dataset = load_dataset("sentence-transformers/stsb", split="test") +model_name = "all-mpnet-base-v2" -# Convert the dataset to a DataLoader ready for training -logging.info("Read STSbenchmark dataset") -test_samples = [] -sentences = [] +# 1. Load a baseline model with just fp32 torch +model = SentenceTransformer(model_name, device="cpu") -with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn: - reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE) - for row in reader: - score = float(row["score"]) / 5.0 # Normalize score to range 0 ... 1 - inp_example = InputExample(texts=[row["sentence1"], row["sentence2"]], label=score) +# 2. Load an ONNX model to quantize +onnx_model = SentenceTransformer( + model_name, + backend="onnx", + device="cpu", + model_kwargs={"provider": "CPUExecutionProvider"}, +) - sentences.append(row["sentence1"]) - sentences.append(row["sentence2"]) +# 3. 
Quantize the ONNX model +quantized_onnx_model_path = f"{model_name.replace('/', '-')}-onnx-quantized" +onnx_model.save_pretrained(quantized_onnx_model_path) +export_dynamic_quantized_onnx_model( + onnx_model, + quantization_config="avx512_vnni", + model_name_or_path=quantized_onnx_model_path, +) +quantized_onnx_model = SentenceTransformer( + quantized_onnx_model_path, + backend="onnx", + device="cpu", + model_kwargs={ + "file_name": "model_qint8_avx512_vnni.onnx", + "provider": "CPUExecutionProvider", + }, +) +# Alternatively, you can load the pre-quantized model: +# quantized_onnx_model = SentenceTransformer( +# model_name, +# backend="onnx", +# device="cpu", +# model_kwargs={ +# "file_name": "model_qint8_avx512_vnni.onnx", +# "provider": "CPUExecutionProvider", +# }, +# ) + +# To make sure that `onnx_model` itself didn't get quantized, we reload it +onnx_model = SentenceTransformer( + model_name, + backend="onnx", + device="cpu", + model_kwargs={"provider": "CPUExecutionProvider"}, +) - if row["split"] == "test": - test_samples.append(inp_example) +# 4. Load an OpenVINO model to quantize +openvino_model = SentenceTransformer(model_name, backend="openvino", device="cpu") -sentences = sentences[0:10000] +# 5. 
Quantize the OpenVINO model +quantized_ov_model_path = f"{model_name.replace('/', '-')}-ov-quantized" +openvino_model.save_pretrained(quantized_ov_model_path) +export_static_quantized_openvino_model( + openvino_model, + quantization_config=None, + model_name_or_path=quantized_ov_model_path, +) +quantized_ov_model = SentenceTransformer( + quantized_ov_model_path, + backend="openvino", + device="cpu", + model_kwargs={"file_name": "openvino_model_qint8_quantized.xml"}, +) +# Alternatively, you can load the pre-quantized model: +# quantized_ov_model = SentenceTransformer( +# model_name, +# backend="openvino", +# device="cpu", +# model_kwargs={"file_name": "openvino_model_qint8_quantized.xml"}, +# ) + +# To make sure that `openvino_model` itself didn't get quantized, we reload it +openvino_model = SentenceTransformer(model_name, backend="openvino", device="cpu") + + +# Create a function to evaluate the models +def evaluate(model: SentenceTransformer, name: str) -> None: + logging.info(f"Evaluating {name}") + start_time = time.time() + model.encode(sentences) + diff = time.time() - start_time + logging.info(f"Done after {diff:.2f} sec. {len(sentences) / diff:.2f} sentences / sec") + + evaluator = EmbeddingSimilarityEvaluator( + sentences1=test_dataset["sentence1"], + sentences2=test_dataset["sentence2"], + scores=test_dataset["score"], + name="sts-test", + ) + results = evaluator(model) + logging.info(f"STS Benchmark, {evaluator.primary_metric}: {results[evaluator.primary_metric]}") + + +# Evaluate the models +for model, name in [ + (model, "Baseline"), + (onnx_model, "ONNX"), + (quantized_onnx_model, "Quantized ONNX"), + (openvino_model, "OpenVINO"), + (quantized_ov_model, "Quantized OpenVINO"), +]: + evaluate(model, name) -logging.info("Evaluating speed of unquantized model") -start_time = time.time() -emb = model.encode(sentences, show_progress_bar=True) -diff_normal = time.time() - start_time -logging.info(f"Done after {diff_normal:.2f} sec. 
{len(sentences) / diff_normal:.2f} sentences / sec") +""" +Evaluating Baseline +Done after 48.79 sec. 204.97 sentences / sec +STS Benchmark, sts-test_spearman_cosine: 0.834221557992808 -logging.info("Evaluating speed of quantized model") -start_time = time.time() -emb = q_model.encode(sentences, show_progress_bar=True) -diff_quantized = time.time() - start_time -logging.info(f"Done after {diff_quantized:.2f} sec. {len(sentences) / diff_quantized:.2f} sentences / sec") -logging.info(f"Speed-up: {diff_normal / diff_quantized:.2f}") -######### +Evaluating ONNX +Done after 36.79 sec. 271.84 sentences / sec +STS Benchmark, sts-test_spearman_cosine: 0.8342216139244768 -evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name="sts-test") +Evaluating Quantized ONNX +Done after 17.84 sec. 560.60 sentences / sec +STS Benchmark, sts-test_spearman_cosine: 0.8256725903061843 -logging.info("Evaluate regular model") -model.evaluate(evaluator) +Evaluating OpenVINO +Done after 36.43 sec. 274.49 sentences / sec +STS Benchmark, sts-test_spearman_cosine: 0.834221557992808 -print("\n\n") -logging.info("Evaluate quantized model") -q_model.evaluate(evaluator) +Evaluating Quantized OpenVINO +Done after 12.94 sec. 
772.83 sentences / sec +STS Benchmark, sts-test_spearman_cosine: 0.8315710087348848 +""" From 090327fd352eb28a7b321a02b8295b842dfeba04 Mon Sep 17 00:00:00 2001 From: Tom Aarsen Date: Tue, 19 Nov 2024 10:47:42 +0100 Subject: [PATCH 2/2] Remove some dead code from distillation script --- examples/training/distillation/model_distillation.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/examples/training/distillation/model_distillation.py b/examples/training/distillation/model_distillation.py index b3b6f69dd..9f89af3d2 100644 --- a/examples/training/distillation/model_distillation.py +++ b/examples/training/distillation/model_distillation.py @@ -56,16 +56,6 @@ inference_batch_size = 64 train_batch_size = 64 -# We use AllNLI as a source of sentences for the distillation -nli_dataset_path = "datasets/AllNLI.tsv.gz" - -# Further, we use sentences extracted from the English Wikipedia to train the distillation -wikipedia_dataset_path = "datasets/wikipedia-en-sentences.txt.gz" - -# We use the STS benchmark dataset to see how much performance we loose -sts_dataset_path = "datasets/stsbenchmark.tsv.gz" - - logging.info("Load the AllNLI dataset") # Load the AllNLI dataset: https://huggingface.co/datasets/sentence-transformers/all-nli nli_train_dataset = load_dataset("sentence-transformers/all-nli", "pair-score", split="train")