Merge remote-tracking branch 'upstream/main' into longjie/add_automatic_model_parallel_via_fx
zhenglongjiepheonix committed Jul 8, 2024
2 parents 0876f5d + 171020c commit 87e66fb
Showing 25 changed files with 397 additions and 134 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test_onnx.yml
@@ -27,7 +27,7 @@ jobs:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
       run: |
-        pip install .[tests,onnxruntime,exporters-tf]
+        pip install .[tests,exporters]
     - name: Test with unittest
       working-directory: tests
       run: |
7 changes: 3 additions & 4 deletions README.md
@@ -79,8 +79,7 @@ It is possible to export 🤗 Transformers and Diffusers models to the OpenVINO
optimum-cli export openvino --model distilbert-base-uncased-finetuned-sst-2-english distilbert_sst2_ov
```

-If you add `--weight-format int8`, the weights will be quantized to `int8`, check out our [documentation](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov#weight-only-quantization) for more detail on weight only quantization. To apply quantization on both weights and activations, you can find more information [here](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov#static-quantization).
+If you add `--weight-format int8`, the weights will be quantized to `int8`, check out our [documentation](https://huggingface.co/docs/optimum/main/intel/openvino/export) for more detail. To apply quantization on both weights and activations, you can find more information [here](https://huggingface.co/docs/optimum/main/intel/openvino/optimization#static-quantization).

To load a model and run inference with OpenVINO Runtime, you can just replace your `AutoModelForXxx` class with the corresponding `OVModelForXxx` class. To load a PyTorch checkpoint and convert it to the OpenVINO format on-the-fly, you can set `export=True` when loading your model.

@@ -92,13 +91,13 @@
model_id = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_id)
- model = AutoModelForSequenceClassification.from_pretrained(model_id)
+ model = OVModelForSequenceClassification.from_pretrained("distilbert_sst2_ov")
+ model = OVModelForSequenceClassification.from_pretrained(model_id, export=True)

classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
results = classifier("He's a dreadful magician.")
```

-You can find more examples in the [documentation](https://huggingface.co/docs/optimum/intel/inference) and in the [examples](https://github.com/huggingface/optimum-intel/tree/main/examples/openvino).
+You can find more examples in the [documentation](https://huggingface.co/docs/optimum/main/intel/openvino/inference) and in the [examples](https://github.com/huggingface/optimum-intel/tree/main/examples/openvino).
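
As a Python-side counterpart to the `--weight-format int8` CLI flag mentioned above, here is a minimal sketch of weight-only quantization at load time; the `load_in_8bit` flag and the save directory name are assumptions for illustration, not part of this diff:

```python
from transformers import AutoTokenizer, pipeline

from optimum.intel import OVModelForSequenceClassification

model_id = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# export=True converts the PyTorch checkpoint to OpenVINO on the fly;
# load_in_8bit=True (assumed available here) quantizes the weights to int8
model = OVModelForSequenceClassification.from_pretrained(model_id, export=True, load_in_8bit=True)
model.save_pretrained("distilbert_sst2_ov_int8")

classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
print(classifier("He's a dreadful magician."))
```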

### Neural Compressor

19 changes: 11 additions & 8 deletions examples/onnxruntime/optimization/multiple-choice/run_swag.py
@@ -37,7 +37,7 @@

from optimum.onnxruntime import ORTModelForMultipleChoice, ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig
-from optimum.onnxruntime.model import ORTModel
+from optimum.onnxruntime.utils import evaluation_loop


# Will error if the minimal version of Transformers is not installed. The version of transformers must be >= 4.19.0
@@ -236,7 +236,6 @@ def main():
)

os.makedirs(training_args.output_dir, exist_ok=True)
-optimized_model_path = os.path.join(training_args.output_dir, "model_optimized.onnx")

tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name or model_args.model_name_or_path)

@@ -254,13 +253,18 @@ def main():
optimizer = ORTOptimizer.from_pretrained(model)

# Optimize the model
-optimizer.optimize(
+optimized_model_path = optimizer.optimize(
     optimization_config=optimization_config,
     save_dir=training_args.output_dir,
     use_external_data_format=onnx_export_args.use_external_data_format,
     one_external_file=onnx_export_args.one_external_file,
 )

+model = ORTModelForMultipleChoice.from_pretrained(
+    optimized_model_path,
+    provider=optim_args.execution_provider,
+)

if training_args.do_eval:
# Prepare the dataset downloading, preprocessing and metric creation to perform the evaluation and / or the
# prediction step(s)
@@ -339,13 +343,12 @@ def compute_metrics(eval_predictions):
# Evaluation
logger.info("*** Evaluate ***")

-ort_model = ORTModel(
-    optimized_model_path,
-    execution_provider=optim_args.execution_provider,
-    compute_metrics=compute_metrics,
+outputs = evaluation_loop(
+    model=model,
+    dataset=eval_dataset,
+    label_names=["label"],
+    compute_metrics=compute_metrics,
 )
-outputs = ort_model.evaluation_loop(eval_dataset)

# Save evaluation metrics
with open(os.path.join(training_args.output_dir, "eval_results.json"), "w") as f:
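The change above is the pattern repeated across the remaining example scripts: `optimize()` now returns the output directory, the optimized model is reloaded through its `ORTModelForXxx` class, and the removed `optimum.onnxruntime.model.ORTModel` wrapper gives way to the `evaluation_loop` helper. A condensed sketch of the new flow, assuming `evaluation_loop` accepts the keyword arguments shown in this diff (the checkpoint name, save directory, `eval_dataset`, and `compute_metrics` are placeholders):

```python
from optimum.onnxruntime import ORTModelForMultipleChoice, ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig
from optimum.onnxruntime.utils import evaluation_loop

# Export a fine-tuned checkpoint to ONNX (placeholder model name)
model = ORTModelForMultipleChoice.from_pretrained("my-swag-checkpoint", export=True)

# optimize() returns the directory containing the optimized model,
# replacing the previously hard-coded model_optimized.onnx path
optimizer = ORTOptimizer.from_pretrained(model)
optimized_model_path = optimizer.optimize(
    optimization_config=OptimizationConfig(optimization_level=1),
    save_dir="swag_optimized",
)

# Reload the optimized model on the desired execution provider
model = ORTModelForMultipleChoice.from_pretrained(
    optimized_model_path, provider="CPUExecutionProvider"
)

# eval_dataset and compute_metrics are prepared as in the script above
outputs = evaluation_loop(
    model=model,
    dataset=eval_dataset,
    label_names=["label"],
    compute_metrics=compute_metrics,
)
print(outputs.metrics)
```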
24 changes: 12 additions & 12 deletions examples/onnxruntime/optimization/question-answering/run_qa.py
@@ -37,7 +37,7 @@

from optimum.onnxruntime import ORTModelForQuestionAnswering, ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig
-from optimum.onnxruntime.model import ORTModel
+from optimum.onnxruntime.utils import evaluation_loop


# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
@@ -305,7 +305,6 @@ def main():
)

os.makedirs(training_args.output_dir, exist_ok=True)
-optimized_model_path = os.path.join(training_args.output_dir, "model_optimized.onnx")

tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name or model_args.model_name_or_path)

@@ -323,13 +322,15 @@ def main():
optimizer = ORTOptimizer.from_pretrained(model)

# Optimize the model
-optimizer.optimize(
+optimized_model_path = optimizer.optimize(
     optimization_config=optimization_config,
     save_dir=training_args.output_dir,
     use_external_data_format=onnx_export_args.use_external_data_format,
     one_external_file=onnx_export_args.one_external_file,
 )

+model = ORTModelForQuestionAnswering.from_pretrained(optimized_model_path, provider=optim_args.execution_provider)

# Prepare the dataset downloading, preprocessing and metric creation to perform the evaluation and / or the
# prediction step(s)
if training_args.do_eval or training_args.do_predict:
@@ -478,13 +479,12 @@ def compute_metrics(p: EvalPrediction):
# During Feature creation dataset samples might increase, we will select required samples again
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))

-ort_model = ORTModel(
-    optimized_model_path,
-    execution_provider=optim_args.execution_provider,
-    compute_metrics=compute_metrics,
+outputs = evaluation_loop(
+    model=model,
+    dataset=eval_dataset,
+    label_names=["start_positions", "end_positions"],
+    compute_metrics=compute_metrics,
 )
-outputs = ort_model.evaluation_loop(eval_dataset)
predictions = post_processing_function(eval_examples, eval_dataset, outputs.predictions)
metrics = compute_metrics(predictions)

@@ -514,12 +514,12 @@ def compute_metrics(p: EvalPrediction):
# During Feature creation dataset samples might increase, we will select required samples again
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))

-ort_model = ORTModel(
-    optimized_model_path,
-    execution_provider=optim_args.execution_provider,
+outputs = evaluation_loop(
+    model=model,
+    dataset=predict_dataset,
+    label_names=["start_positions", "end_positions"],
+    compute_metrics=compute_metrics,
 )
-outputs = ort_model.evaluation_loop(predict_dataset)
predictions = post_processing_function(predict_examples, predict_dataset, outputs.predictions)
metrics = compute_metrics(predictions)

@@ -14,13 +14,11 @@ See the License for the specific language governing permissions and
limitations under the License.
-->

# Text classification

## GLUE tasks

-The script [`run_glue.py`](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/optimization/text-classification/run_glue.py)
-allows us to apply graph optimizations and fusion using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for sequence classification tasks such as
-the ones from the [GLUE benchmark](https://gluebenchmark.com/).
+The script [`run_glue.py`](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/optimization/text-classification/run_glue.py) allows us to apply graph optimizations and fusion using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for sequence classification tasks such as the ones from the [GLUE benchmark](https://gluebenchmark.com/).

The following example applies graph optimization on a DistilBERT model fine-tuned on the SST-2 task. Here the optimization level is set to 1, enabling basic optimizations such as redundant node elimination and constant folding. A higher optimization level will result in a hardware-dependent optimized graph.

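The example command itself is collapsed in this view; as a minimal Python sketch of level-1 optimization (the model name and output directory are illustrative, the API calls follow the scripts in this commit):

```python
from optimum.onnxruntime import ORTModelForSequenceClassification, ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig

# Export the fine-tuned model to ONNX
model = ORTModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english", export=True
)
optimizer = ORTOptimizer.from_pretrained(model)

# Level 1 applies hardware-independent optimizations such as
# constant folding and redundant node elimination
optimizer.optimize(
    optimization_config=OptimizationConfig(optimization_level=1),
    save_dir="distilbert_sst2_optimized",
)
```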
25 changes: 15 additions & 10 deletions examples/onnxruntime/optimization/text-classification/run_glue.py
@@ -42,7 +42,7 @@

from optimum.onnxruntime import ORTModelForSequenceClassification, ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig
-from optimum.onnxruntime.model import ORTModel
+from optimum.onnxruntime.utils import evaluation_loop


# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
@@ -250,7 +250,6 @@ def main():
)

os.makedirs(training_args.output_dir, exist_ok=True)
-optimized_model_path = os.path.join(training_args.output_dir, "model_optimized.onnx")

tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)

@@ -268,13 +267,17 @@ def main():
optimizer = ORTOptimizer.from_pretrained(model)

# Optimize the model
-optimizer.optimize(
+optimized_model_path = optimizer.optimize(
     optimization_config=optimization_config,
     save_dir=training_args.output_dir,
     use_external_data_format=onnx_export_args.use_external_data_format,
     one_external_file=onnx_export_args.one_external_file,
 )

+model = ORTModelForSequenceClassification.from_pretrained(
+    optimized_model_path, provider=optim_args.execution_provider
+)

# Prepare the dataset downloading, preprocessing and metric creation to perform the evaluation and / or the
# prediction step(s)
if training_args.do_eval or training_args.do_predict:
@@ -408,13 +411,13 @@ def compute_metrics(p: EvalPrediction):
desc="Running tokenizer on the evaluation dataset",
)

-ort_model = ORTModel(
-    optimized_model_path,
-    execution_provider=optim_args.execution_provider,
+outputs = evaluation_loop(
+    model=model,
+    dataset=eval_dataset,
+    compute_metrics=compute_metrics,
+    label_names=["label"],
 )
-outputs = ort_model.evaluation_loop(eval_dataset)

# Save metrics
with open(os.path.join(training_args.output_dir, "eval_results.json"), "w") as f:
json.dump(outputs.metrics, f, indent=4, sort_keys=True)
@@ -436,10 +439,12 @@ def compute_metrics(p: EvalPrediction):
desc="Running tokenizer on the test dataset",
)

-ort_model = ORTModel(
-    optimized_model_path, execution_provider=optim_args.execution_provider, label_names=["label"]
+outputs = evaluation_loop(
+    model=model,
+    dataset=predict_dataset,
+    compute_metrics=compute_metrics,
+    label_names=["label"],
 )
-outputs = ort_model.evaluation_loop(predict_dataset)
predictions = np.squeeze(outputs.predictions) if is_regression else np.argmax(outputs.predictions, axis=1)

# Save predictions
17 changes: 7 additions & 10 deletions examples/onnxruntime/optimization/token-classification/run_ner.py
@@ -38,7 +38,7 @@

from optimum.onnxruntime import ORTModelForTokenClassification, ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig
-from optimum.onnxruntime.model import ORTModel
+from optimum.onnxruntime.utils import evaluation_loop


# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
@@ -276,7 +276,6 @@ def main():
)

os.makedirs(training_args.output_dir, exist_ok=True)
-optimized_model_path = os.path.join(training_args.output_dir, "model_optimized.onnx")

tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name or model_args.model_name_or_path)

Expand Down Expand Up @@ -480,12 +479,11 @@ def compute_metrics(p):
desc="Running tokenizer on the validation dataset",
)

-ort_model = ORTModel(
-    optimized_model_path,
-    execution_provider=optim_args.execution_provider,
+outputs = evaluation_loop(
+    model=model,
+    dataset=eval_dataset,
+    compute_metrics=compute_metrics,
 )
-outputs = ort_model.evaluation_loop(eval_dataset)

# Save evaluation metrics
with open(os.path.join(training_args.output_dir, "eval_results.json"), "w") as f:
@@ -509,12 +507,11 @@ def compute_metrics(p):
desc="Running tokenizer on the prediction dataset",
)

-ort_model = ORTModel(
-    optimized_model_path,
-    execution_provider=optim_args.execution_provider,
+outputs = evaluation_loop(
+    model=model,
+    dataset=predict_dataset,
+    compute_metrics=compute_metrics,
 )
-outputs = ort_model.evaluation_loop(predict_dataset)
predictions = np.argmax(outputs.predictions, axis=2)

# Remove ignored index (special tokens)
@@ -22,7 +22,6 @@
import sys
from dataclasses import dataclass, field
from functools import partial
-from pathlib import Path
from typing import Optional

import datasets
@@ -38,7 +37,6 @@

from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoCalibrationConfig, QuantizationConfig
-from optimum.onnxruntime.model import ORTModel
from optimum.onnxruntime.modeling_ort import ORTModelForImageClassification
from optimum.onnxruntime.preprocessors import QuantizationPreprocessor
from optimum.onnxruntime.preprocessors.passes import (
@@ -47,6 +45,7 @@
ExcludeNodeAfter,
ExcludeNodeFollowedBy,
)
+from optimum.onnxruntime.utils import evaluation_loop


logger = logging.getLogger(__name__)
@@ -378,13 +377,16 @@ def compute_metrics(p: EvalPrediction):
quantization_preprocessor.register_pass(ExcludeNodeFollowedBy("Add", "Softmax"))

# Apply quantization on the model
-quantizer.quantize(
+quantized_model_path = quantizer.quantize(
     save_dir=training_args.output_dir,
     calibration_tensors_range=ranges,
     quantization_config=qconfig,
     preprocessor=quantization_preprocessor,
     use_external_data_format=onnx_export_args.use_external_data_format,
 )
+model = ORTModelForImageClassification.from_pretrained(
+    quantized_model_path, provider=optim_args.execution_provider
+)

# Evaluation
if training_args.do_eval:
@@ -409,13 +411,12 @@ def compute_metrics(p: EvalPrediction):
# Set the validation transforms
eval_dataset = eval_dataset.with_transform(preprocess_function)

-ort_model = ORTModel(
-    Path(training_args.output_dir) / "model_quantized.onnx",
-    execution_provider=optim_args.execution_provider,
-    compute_metrics=compute_metrics,
+outputs = evaluation_loop(
+    model=model,
+    dataset=eval_dataset,
+    label_names=[labels_column],
+    compute_metrics=compute_metrics,
 )
-outputs = ort_model.evaluation_loop(eval_dataset)
# Save metrics
with open(os.path.join(training_args.output_dir, "eval_results.json"), "w") as f:
json.dump(outputs.metrics, f, indent=4, sort_keys=True)
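The quantization script follows the same pattern as the optimization ones: `quantize()` now returns the save directory, which is passed straight back to `from_pretrained`. A condensed sketch using a dynamic-quantization config for brevity (the script above instead computes `calibration_tensors_range` and a `QuantizationPreprocessor` for static quantization; the model name and save directory here are illustrative):

```python
from optimum.onnxruntime import ORTModelForImageClassification, ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

# Export an image-classification checkpoint to ONNX (illustrative model name)
model = ORTModelForImageClassification.from_pretrained(
    "google/vit-base-patch16-224", export=True
)
quantizer = ORTQuantizer.from_pretrained(model)

# Dynamic quantization needs no calibration dataset
qconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)
quantized_model_path = quantizer.quantize(
    save_dir="vit_quantized",
    quantization_config=qconfig,
)

# quantize() returns the directory containing the quantized model
model = ORTModelForImageClassification.from_pretrained(quantized_model_path)
```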
