Merge branch 'main' into clang-tidy
airMeng authored Jan 5, 2024
2 parents 5d73527 + 9e20bd1 commit 96834d6
Showing 9 changed files with 270 additions and 20 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/scripts/install_binary.sh
@@ -1,7 +1,7 @@
#!/bin/bash
-source /intel-extension-for-transformers/.github/workflows/scripts/change_color.sh
+source /neural-speed/.github/workflows/scripts/change_color.sh

-cd /intel-extension-for-transformers
+cd /neural-speed
$BOLD_YELLOW && echo "---------------- git submodule update --init --recursive -------------" && $RESET
git config --global --add safe.directory "*"
git submodule update --init --recursive
@@ -12,5 +12,5 @@ python setup.py bdist_wheel


$BOLD_YELLOW && echo "---------------- pip install binary -------------" && $RESET
-pip install dist/intel_extension_for_transformers*.whl
+pip install dist/neural_speed*.whl
pip list
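
install_binary.sh builds the wheel inside the container and installs it from dist/. A minimal sketch of a post-install smoke check (the neural_speed import path matches the test file added below; the exact distribution name is an assumption):

    # hypothetical quick check that the wheel installed correctly
    pip show neural-speed || pip list | grep -i neural
    python -c "from neural_speed import Model; print('neural_speed importable')"
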
87 changes: 87 additions & 0 deletions .github/workflows/unit-test-llmruntime.yml
@@ -0,0 +1,87 @@
name: Python Unit Test

on:
  pull_request:
    branches: [main]
    paths:
      - neural_speed/**
      - tests/**
      - .github/workflows/unit-test-llmruntime.yml
      - .github/workflows/unitTest/**
  workflow_dispatch:

# If there is a new commit, the previous jobs will be canceled
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

env:
  DOCKER_CONFIG_NAME: "commonDockerConfig"
  REPO_NAME: "neural-speed"
  REPO_TAG: "py39"
  DOCKER_FILE_NAME: "devel"
  CONTAINER_NAME: "utTest"

jobs:
  unit-test:
    runs-on: [self-hosted, linux, X64, llmruntime-node]
    steps:
      - name: Load environment variables
        run: cat ~/actions-runner3/.env >> $GITHUB_ENV

      - name: Docker Clean Up
        run: |
          docker ps -a
          if [[ $(docker ps -a | grep -i '${{ env.CONTAINER_NAME }}-${{ runner.name }}'$) ]]; then
            docker start ${{ env.CONTAINER_NAME }}-${{ runner.name }}
            echo "remove left files through container ..."
            docker exec ${{ env.CONTAINER_NAME }}-${{ runner.name }} bash -c "ls -a /neural-speed && rm -fr /neural-speed/* && rm -fr /neural-speed/.* || true"
          fi
      - name: Checkout Repo
        uses: actions/checkout@v3
        with:
          submodules: "recursive"
          fetch-tags: true

      - name: Docker Build
        run: |
          docker build -f ${{ github.workspace }}/.github/workflows/docker/${{ env.DOCKER_FILE_NAME }}.dockerfile --build-arg http_proxy="${{ env.HTTP_PROXY }}" --build-arg https_proxy="${{ env.HTTPS_PROXY }}" -t ${{ env.REPO_NAME }}:${{ env.REPO_TAG }} .
      - name: Docker Run
        run: |
          if [[ $(docker ps -a | grep -i '${{ env.CONTAINER_NAME }}-${{ runner.name }}'$) ]]; then
            docker stop ${{ env.CONTAINER_NAME }}-${{ runner.name }}
            docker rm -vf ${{ env.CONTAINER_NAME }}-${{ runner.name }} || true
          fi
          docker run -dit --disable-content-trust --privileged --name=${{ env.CONTAINER_NAME }}-${{ runner.name }} -v /dev/shm:/dev/shm \
            -e http_proxy="${{ env.HTTP_PROXY }}" \
            -e https_proxy="${{ env.HTTPS_PROXY }}" \
            -v ${{ github.workspace }}:/neural-speed \
            -v /tf_dataset2:/tf_dataset2 \
            -v ~/.cache/oneAPI:/cache \
            ${{ env.REPO_NAME }}:${{ env.REPO_TAG }}
      - name: Env build
        run: |
          docker exec ${{ env.CONTAINER_NAME }}-${{ runner.name }} \
            bash /neural-speed/.github/workflows/scripts/prepare_env.sh
      - name: Binary build
        run: |
          docker exec ${{ env.CONTAINER_NAME }}-${{ runner.name }} \
            bash -c "cd /neural-speed/.github/workflows/scripts \
            && bash install_binary.sh"
      - name: Run UT
        run: |
          docker exec ${{ env.CONTAINER_NAME }}-${{ runner.name }} \
            bash -c "cd /neural-speed/.github/workflows/unitTest \
            && bash unittest_llmruntime.sh"
      - name: Publish pipeline artifact
        uses: actions/upload-artifact@v3
        if: ${{ !cancelled() }}
        with:
          name: Python Unit Test
          path: ${{ github.workspace }}/log_dir/unit_test*.*
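
Since the workflow also declares a workflow_dispatch trigger, it can be launched by hand. A sketch using the GitHub CLI (assuming gh is installed and authenticated against this repository; run IDs will differ):

    # kick off the unit-test workflow manually against main
    gh workflow run unit-test-llmruntime.yml --ref main
    # check the status of the most recent run
    gh run list --workflow=unit-test-llmruntime.yml --limit 1
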
15 changes: 15 additions & 0 deletions .github/workflows/unitTest/env_setup.sh
@@ -0,0 +1,15 @@
pip list

# Install test requirements
echo "Install Tests Requirements"
cd $1 || exit 1
pwd
if [ -f "requirements.txt" ]; then
    python -m pip install --default-timeout=100 -r requirements.txt
    pip list
else
    echo "requirements.txt file not found."
fi

pip install coverage
pip install pytest
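
env_setup.sh expects the test directory as its first argument ($1) and installs that directory's requirements.txt plus coverage and pytest. It is invoked by main() in unittest_llmruntime.sh (next file) like this:

    # as invoked by the UT driver below
    bash /neural-speed/.github/workflows/unitTest/env_setup.sh /neural-speed/tests
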
38 changes: 38 additions & 0 deletions .github/workflows/unitTest/unittest_llmruntime.sh
@@ -0,0 +1,38 @@
#!/bin/bash
source /neural-speed/.github/workflows/scripts/change_color.sh
test_install_backend="true"
LOG_DIR=/neural-speed/log_dir
mkdir -p ${LOG_DIR}
WORKING_DIR="/neural-speed/tests"

# -------------------LLM Runtime Test-------------------
function llmruntime_test() {
    cd ${WORKING_DIR}
    local ut_log_name=${LOG_DIR}/unit_test_llm_runtime.log
    find . -name "test*.py" | sed 's,\.\/,python ,g' | sed 's/$/ --verbose/' >run.sh
    # run UT
    $BOLD_YELLOW && echo "cat run.sh..." && $RESET
    cat run.sh | tee ${ut_log_name}
    $BOLD_YELLOW && echo "------UT start-------" && $RESET
    bash run.sh 2>&1 | tee -a ${ut_log_name}
    $BOLD_YELLOW && echo "------UT end -------" && $RESET

    if [ $(grep -c "FAILED" ${ut_log_name}) != 0 ] ||
       [ $(grep -c "OK" ${ut_log_name}) == 0 ] ||
       [ $(grep -c "Segmentation fault" ${ut_log_name}) != 0 ] ||
       [ $(grep -c "core dumped" ${ut_log_name}) != 0 ] ||
       [ $(grep -c "==ERROR:" ${ut_log_name}) != 0 ] ||
       [ $(grep -c "ModuleNotFoundError:" ${ut_log_name}) != 0 ]; then
        $BOLD_RED && echo "Found errors in engine test, please check the output..." && $RESET
        exit 1
    else
        $BOLD_GREEN && echo "engine test finished successfully!" && $RESET
    fi
}

function main() {
    bash /neural-speed/.github/workflows/unitTest/env_setup.sh "${WORKING_DIR}"
    llmruntime_test
}

main
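
The find | sed pipeline turns each discovered test file into one python invocation in run.sh. For the single test file added in this PR, the generated script reduces to one line (a sketch of the expansion):

    # find . -name "test*.py"   -> ./test_python_api.py
    # sed 's,\.\/,python ,g'    -> python test_python_api.py
    # sed 's/$/ --verbose/'     -> python test_python_api.py --verbose
    python test_python_api.py --verbose
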
4 changes: 2 additions & 2 deletions bestla/CMakeLists.txt
@@ -5,7 +5,7 @@ file(GLOB headers ${PROJECT_NAME}/*.h ${PROJECT_NAME}/*.hpp)
file(GLOB xbyak_headers ${PROJECT_NAME}/xbyak/*.h ${PROJECT_NAME}/xbyak/*.hpp)

option(BTLA_UT_ALL "Enable all unit tests" OFF)
-option(BTLA_UT_DEBUG "Enable debug unit tests" ON)
+option(BTLA_UT_DEBUG "Enable debug unit tests" OFF)
option(BTLA_UT_EPILOGUE "Enable unit test for epilogue" OFF)
option(BTLA_UT_PROLOGUE_A "Enable unit test for activation prologue" OFF)
option(BTLA_UT_PROLOGUE_B "Enable unit test for weight prologue" OFF)
@@ -110,7 +110,7 @@ if(UT_BUILD)
add_executable(${PROJECT_NAME}_ut ${srcs} ${headers} ${ut_headers})
if(BTLA_UT_OPENMP)
include(FindOpenMP)
-target_link_libraries(${PROJECT_NAME}_ut PRIVATE OpenMP::OpenMP_CXX OpenMP::OpenMP_C)
+target_link_libraries(${PROJECT_NAME}_ut PRIVATE OpenMP::OpenMP_CXX)
endif()
if(NOT WIN32)
if(NOT BTLA_UT_NOASAN)
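
With BTLA_UT_DEBUG now defaulting to OFF, no unit tests are compiled unless an option is switched on explicitly. A configure/build sketch (assuming bestla/ is the CMake source root, that enabling any BTLA_UT_* option turns on UT_BUILD, and that the test binary name comes from ${PROJECT_NAME}_ut):

    cmake -S bestla -B build -DBTLA_UT_DEBUG=ON -DBTLA_UT_OPENMP=ON
    cmake --build build -j
    # hypothetical binary name, derived from ${PROJECT_NAME}_ut
    ./build/bestla_ut
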
22 changes: 11 additions & 11 deletions bestla/CMakePresets.json
@@ -1,4 +1,4 @@
{
"version": 3,
"configurePresets": [
{
@@ -9,7 +9,9 @@
"binaryDir": "${sourceDir}/out/build/${presetName}",
"installDir": "${sourceDir}/out/install/${presetName}",
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Debug"
"CMAKE_BUILD_TYPE": "Debug",
"BTLA_UT_DEBUG": "ON",
"BTLA_UT_ALL": "OFF"
},
"condition": {
"type": "equals",
@@ -25,18 +27,15 @@
"inherits": "linux-debug",
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Release",
"JBLAS_UT_ALL": "ON"
"BTLA_UT_ALL": "ON"
}
},
{
"name": "linux-release",
"displayName": "linux Release",
"description": "Release",
"inherits": "linux-debug",
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Release",
"JBLAS_UT_ALL": "OFF"
}
"cacheVariables": { "CMAKE_BUILD_TYPE": "Release" }
},
{
"name": "windows-base",
@@ -48,7 +47,7 @@
"cacheVariables": {
"CMAKE_C_COMPILER": "cl.exe",
"CMAKE_CXX_COMPILER": "cl.exe",
"JBLAS_UT_ALL": "OFF"
"BTLA_UT_ALL": "OFF"
},
"condition": {
"type": "equals",
@@ -67,7 +66,8 @@
},
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Debug",
"JBLAS_UT_DEBUG": "ON"
"BTLA_UT_DEBUG": "ON",
"BTLA_UT_ALL": "OFF"
}
},
{
Expand All @@ -82,7 +82,7 @@
"displayName": "x64 Release for UT",
"description": "Target Windows (64-bit) with the Visual Studio development environment. (RelWithDebInfo)",
"inherits": "x64-release",
"cacheVariables": { "JBLAS_UT_ALL": "ON" }
"cacheVariables": { "BTLA_UT_ALL": "ON" }
}
]
}
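
The renamed BTLA_UT_* cache variables now match the option() names in CMakeLists.txt above. With a presets-aware CMake (schema version 3 needs CMake 3.21+), the presets can be consumed directly, e.g. from the bestla/ directory:

    cmake --preset linux-release            # Release configure, UTs off
    cmake --build out/build/linux-release   # binaryDir is out/build/${presetName}
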
7 changes: 4 additions & 3 deletions bestla/bestla/bestla_storage.h
@@ -792,16 +792,17 @@ class StorageWeightKBlockNFloat : public StorageWeightKBlockNInteger {

class PackedWeightParser {
public:
-static gemm::IWeightBase* deserialBuffer(void* serialized_buf) {
+static gemm::IWeightBase* deserialBuffer(const void* serialized_buf) {
if (serialized_buf == nullptr) {
return nullptr;
}
-auto rptr = reinterpret_cast<int8_t*>(serialized_buf);
+auto tmpptr = const_cast<void*>(serialized_buf);
+auto rptr = reinterpret_cast<int8_t*>(tmpptr);
rptr += IWeightBase::offset();
int mProID = utils::deserialize<int>(rptr);
IWeightBase* ptr = nullptr;
if (mProID >= int(BTLA_PROLOGUEB_IDS::Begin) && mProID < int(BTLA_PROLOGUEB_IDS::End)) {
-rptr = reinterpret_cast<int8_t*>(serialized_buf);
+rptr = reinterpret_cast<int8_t*>(tmpptr);
auto type = static_cast<BTLA_PROLOGUEB_IDS>(mProID);
switch (type) {
case BTLA_PROLOGUEB_IDS::WeightPack:
2 changes: 1 addition & 1 deletion developer_document.md
@@ -45,7 +45,7 @@ print(pt_ans)

# itrex infer
# fp32 config
-woq_config = WeightOnlyQuantConfig(use_quant=False)
+woq_config = WeightOnlyQuantConfig(not_quant=True)
# model file should be in `runtime_outs` folder
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=woq_config, trust_remote_code=True)
outputs = model.generate(inputs, do_sample=False, max_new_tokens=128)
109 changes: 109 additions & 0 deletions tests/test_python_api.py
@@ -0,0 +1,109 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2022 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import shutil
import torch
import unittest

from transformers import AutoTokenizer, TextStreamer
from neural_speed import Model

def cmpData(numa, numb):
    totalErr = ((np.abs(numa - numb))**2).sum()
    totalNum = (np.abs(numa)**2).sum()
    diff2 = np.sqrt(totalErr / totalNum)

    cos = np.dot(numa, numb) / (np.linalg.norm(numa) * np.linalg.norm(numb))
    return {"diff2": diff2, "cos": cos}


class TestLLMRUNTIME(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        pass

    @classmethod
    def tearDownClass(cls) -> None:
        shutil.rmtree("./runtime_outs", ignore_errors=True)

    def test_llm_runtime(self):
        model_name = "/tf_dataset2/models/nlp_toolkit/llama-2-7b-chat/Llama-2-7b-chat-hf"
        prompt = "What is the meaning of life?"

        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        inputs = tokenizer(prompt, return_tensors="pt")

        pt_logits = torch.load("/tf_dataset2/inc-ut/nlptoolkit_ut_model/llama2_pt_logits.pth")[:, -1]
        pt_generate_ids = torch.load("/tf_dataset2/inc-ut/nlptoolkit_ut_model/llama2_pt_generate_ids.pth")[0].tolist()
        print(tokenizer.decode(pt_generate_ids))

        # check output ids
        itrex_model = Model()
        itrex_model.init(model_name, not_quant=True)
        itrex_generate_ids = itrex_model.generate(inputs.input_ids, do_sample=False, max_new_tokens=100)[0]
        print(tokenizer.decode(itrex_generate_ids))
        for i in range(len(pt_generate_ids)):
            self.assertEqual(pt_generate_ids[i], itrex_generate_ids[i])

        # check diff of logits
        woq_configs = {
            "fp32": {"use_cache": True, "not_quant": True},
            # "ggml_int4": {"compute_dtype": "int8", "weight_dtype": "int4", "use_cache": True, "use_ggml": True},
            "jblas_int4": {"compute_dtype": "int8", "weight_dtype": "int4", "use_cache": True},
            # "jblas_int8": {"compute_dtype": "bf16", "weight_dtype": "int8", "use_cache": True},
        }
        for config_type in woq_configs:
            itrex_model = Model()
            itrex_model.init(model_name, **woq_configs[config_type])
            itrex_logits = itrex_model(inputs.input_ids)
            diff_data = cmpData(pt_logits.detach().numpy().flatten(), itrex_logits.flatten())
            print(config_type, diff_data)

    def test_beam_search(self):
        model_name = "/tf_dataset2/models/pytorch/gpt-j-6B"  # or local path to model
        prompts = [
            "she opened the door and see",
            "tell me 10 things about jazz music",
            "What is the meaning of life?",
            "To be, or not to be, that is the question: Whether 'tis nobler in the mind to suffer"
            " The slings and arrows of outrageous fortune, "
            "Or to take arms against a sea of troubles."
            "And by opposing end them. To die—to sleep,"
        ]

        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, padding_side="left")
        tokenizer.pad_token = tokenizer.eos_token
        pad_token = tokenizer(tokenizer.pad_token)['input_ids'][0]
        inputs = tokenizer(prompts, padding=True, return_tensors='pt')

        # pytorch fp32
        pt_generate_ids = torch.load("/tf_dataset2/inc-ut/nlptoolkit_ut_model/beam_pt_generate_ids.pth").tolist()

        # llm runtime fp32
        itrex_model = Model()
        itrex_model.init(model_name, not_quant=True)
        itrex_generate_ids = itrex_model.generate(
            inputs.input_ids, num_beams=4, max_new_tokens=128, min_new_tokens=30, early_stopping=True, pad_token=pad_token)
        for i in range(len(itrex_generate_ids)):
            self.assertListEqual(pt_generate_ids[i], itrex_generate_ids[i])


if __name__ == "__main__":
    unittest.main()
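
For reference, cmpData above reports a relative L2 error and a cosine similarity between the PyTorch and Neural Speed logits:

    \mathrm{diff2} = \sqrt{\frac{\sum_i (a_i - b_i)^2}{\sum_i a_i^2}},
    \qquad \cos(a, b) = \frac{a \cdot b}{\lVert a \rVert \, \lVert b \rVert}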
