Merge branch 'main' into clang-tidy
airMeng authored Jan 5, 2024
2 parents 5d73527 + 9e20bd1 commit 96834d6
Showing 9 changed files with 270 additions and 20 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/scripts/install_binary.sh
@@ -1,7 +1,7 @@
#!/bin/bash
-source /intel-extension-for-transformers/.github/workflows/scripts/change_color.sh
+source /neural-speed/.github/workflows/scripts/change_color.sh

-cd /intel-extension-for-transformers
+cd /neural-speed
$BOLD_YELLOW && echo "---------------- git submodule update --init --recursive -------------" && $RESET
git config --global --add safe.directory "*"
git submodule update --init --recursive
@@ -12,5 +12,5 @@ python setup.py bdist_wheel


$BOLD_YELLOW && echo "---------------- pip install binary -------------" && $RESET
-pip install dist/intel_extension_for_transformers*.whl
+pip install dist/neural_speed*.whl
pip list
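
install_binary.sh builds the wheel inside the container and installs it from dist/. A minimal sketch of a post-install smoke check (the neural_speed import path matches the test file added below; the exact distribution name is an assumption):

    # hypothetical quick check that the wheel installed correctly
    pip show neural-speed || pip list | grep -i neural
    python -c "from neural_speed import Model; print('neural_speed importable')"
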
87 changes: 87 additions & 0 deletions .github/workflows/unit-test-llmruntime.yml
@@ -0,0 +1,87 @@
name: Python Unit Test

on:
  pull_request:
    branches: [main]
    paths:
      - neural_speed/**
      - tests/**
      - .github/workflows/unit-test-llmruntime.yml
      - .github/workflows/unitTest/**
  workflow_dispatch:

# If there is a new commit, the previous jobs will be canceled
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

env:
  DOCKER_CONFIG_NAME: "commonDockerConfig"
  REPO_NAME: "neural-speed"
  REPO_TAG: "py39"
  DOCKER_FILE_NAME: "devel"
  CONTAINER_NAME: "utTest"

jobs:
  unit-test:
    runs-on: [self-hosted, linux, X64, llmruntime-node]
    steps:
      - name: Load environment variables
        run: cat ~/actions-runner3/.env >> $GITHUB_ENV

      - name: Docker Clean Up
        run: |
          docker ps -a
          if [[ $(docker ps -a | grep -i '${{ env.CONTAINER_NAME }}-${{ runner.name }}'$) ]]; then
            docker start ${{ env.CONTAINER_NAME }}-${{ runner.name }}
            echo "remove left files through container ..."
            docker exec ${{ env.CONTAINER_NAME }}-${{ runner.name }} bash -c "ls -a /neural-speed && rm -fr /neural-speed/* && rm -fr /neural-speed/.* || true"
          fi
      - name: Checkout Repo
        uses: actions/checkout@v3
        with:
          submodules: "recursive"
          fetch-tags: true

      - name: Docker Build
        run: |
          docker build -f ${{ github.workspace }}/.github/workflows/docker/${{ env.DOCKER_FILE_NAME }}.dockerfile --build-arg http_proxy="${{ env.HTTP_PROXY }}" --build-arg https_proxy="${{ env.HTTPS_PROXY }}" -t ${{ env.REPO_NAME }}:${{ env.REPO_TAG }} .
      - name: Docker Run
        run: |
          if [[ $(docker ps -a | grep -i '${{ env.CONTAINER_NAME }}-${{ runner.name }}'$) ]]; then
            docker stop ${{ env.CONTAINER_NAME }}-${{ runner.name }}
            docker rm -vf ${{ env.CONTAINER_NAME }}-${{ runner.name }} || true
          fi
          docker run -dit --disable-content-trust --privileged --name=${{ env.CONTAINER_NAME }}-${{ runner.name }} -v /dev/shm:/dev/shm \
            -e http_proxy="${{ env.HTTP_PROXY }}" \
            -e https_proxy="${{ env.HTTPS_PROXY }}" \
            -v ${{ github.workspace }}:/neural-speed \
            -v /tf_dataset2:/tf_dataset2 \
            -v ~/.cache/oneAPI:/cache \
            ${{ env.REPO_NAME }}:${{ env.REPO_TAG }}
      - name: Env build
        run: |
          docker exec ${{ env.CONTAINER_NAME }}-${{ runner.name }} \
            bash /neural-speed/.github/workflows/scripts/prepare_env.sh
      - name: Binary build
        run: |
          docker exec ${{ env.CONTAINER_NAME }}-${{ runner.name }} \
            bash -c "cd /neural-speed/.github/workflows/scripts \
            && bash install_binary.sh"
      - name: Run UT
        run: |
          docker exec ${{ env.CONTAINER_NAME }}-${{ runner.name }} \
            bash -c "cd /neural-speed/.github/workflows/unitTest \
            && bash unittest_llmruntime.sh"
      - name: Publish pipeline artifact
        uses: actions/upload-artifact@v3
        if: ${{ !cancelled() }}
        with:
          name: Python Unit Test
          path: ${{ github.workspace }}/log_dir/unit_test*.*
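
Since the workflow also declares a workflow_dispatch trigger, it can be launched by hand. A sketch using the GitHub CLI (assuming gh is installed and authenticated against this repository; run IDs will differ):

    # kick off the unit-test workflow manually against main
    gh workflow run unit-test-llmruntime.yml --ref main
    # check the status of the most recent run
    gh run list --workflow=unit-test-llmruntime.yml --limit 1
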
15 changes: 15 additions & 0 deletions .github/workflows/unitTest/env_setup.sh
@@ -0,0 +1,15 @@
pip list

# Install test requirements
echo "Install Tests Requirements"
cd $1 || exit 1
pwd
if [ -f "requirements.txt" ]; then
    python -m pip install --default-timeout=100 -r requirements.txt
    pip list
else
    echo "requirements.txt file not found."
fi

pip install coverage
pip install pytest
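
env_setup.sh expects the test directory as its first argument ($1) and installs that directory's requirements.txt plus coverage and pytest. It is invoked by main() in unittest_llmruntime.sh (next file) like this:

    # as invoked by the UT driver below
    bash /neural-speed/.github/workflows/unitTest/env_setup.sh /neural-speed/tests
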
38 changes: 38 additions & 0 deletions .github/workflows/unitTest/unittest_llmruntime.sh
@@ -0,0 +1,38 @@
#!/bin/bash
source /neural-speed/.github/workflows/scripts/change_color.sh
test_install_backend="true"
LOG_DIR=/neural-speed/log_dir
mkdir -p ${LOG_DIR}
WORKING_DIR="/neural-speed/tests"

# -------------------LLM Runtime Test-------------------
function llmruntime_test() {
    cd ${WORKING_DIR}
    local ut_log_name=${LOG_DIR}/unit_test_llm_runtime.log
    find . -name "test*.py" | sed 's,\.\/,python ,g' | sed 's/$/ --verbose/' >run.sh
    # run UT
    $BOLD_YELLOW && echo "cat run.sh..." && $RESET
    cat run.sh | tee ${ut_log_name}
    $BOLD_YELLOW && echo "------UT start-------" && $RESET
    bash run.sh 2>&1 | tee -a ${ut_log_name}
    $BOLD_YELLOW && echo "------UT end -------" && $RESET

    if [ $(grep -c "FAILED" ${ut_log_name}) != 0 ] ||
       [ $(grep -c "OK" ${ut_log_name}) == 0 ] ||
       [ $(grep -c "Segmentation fault" ${ut_log_name}) != 0 ] ||
       [ $(grep -c "core dumped" ${ut_log_name}) != 0 ] ||
       [ $(grep -c "==ERROR:" ${ut_log_name}) != 0 ] ||
       [ $(grep -c "ModuleNotFoundError:" ${ut_log_name}) != 0 ]; then
        $BOLD_RED && echo "Found errors in engine test, please check the output..." && $RESET
        exit 1
    else
        $BOLD_GREEN && echo "engine test finished successfully!" && $RESET
    fi
}

function main() {
    bash /neural-speed/.github/workflows/unitTest/env_setup.sh "${WORKING_DIR}"
    llmruntime_test
}

main
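
The find | sed pipeline turns each discovered test file into one python invocation in run.sh. For the single test file added in this PR, the generated script reduces to one line (a sketch of the expansion):

    # find . -name "test*.py"   -> ./test_python_api.py
    # sed 's,\.\/,python ,g'    -> python test_python_api.py
    # sed 's/$/ --verbose/'     -> python test_python_api.py --verbose
    python test_python_api.py --verbose
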
4 changes: 2 additions & 2 deletions bestla/CMakeLists.txt
@@ -5,7 +5,7 @@ file(GLOB headers ${PROJECT_NAME}/*.h ${PROJECT_NAME}/*.hpp)
file(GLOB xbyak_headers ${PROJECT_NAME}/xbyak/*.h ${PROJECT_NAME}/xbyak/*.hpp)

option(BTLA_UT_ALL "Enable all unit tests" OFF)
-option(BTLA_UT_DEBUG "Enable debug unit tests" ON)
+option(BTLA_UT_DEBUG "Enable debug unit tests" OFF)
option(BTLA_UT_EPILOGUE "Enable unit test for epilogue" OFF)
option(BTLA_UT_PROLOGUE_A "Enable unit test for activation prologue" OFF)
option(BTLA_UT_PROLOGUE_B "Enable unit test for weight prologue" OFF)
@@ -110,7 +110,7 @@ if(UT_BUILD)
add_executable(${PROJECT_NAME}_ut ${srcs} ${headers} ${ut_headers})
if(BTLA_UT_OPENMP)
include(FindOpenMP)
-target_link_libraries(${PROJECT_NAME}_ut PRIVATE OpenMP::OpenMP_CXX OpenMP::OpenMP_C)
+target_link_libraries(${PROJECT_NAME}_ut PRIVATE OpenMP::OpenMP_CXX)
endif()
if(NOT WIN32)
if(NOT BTLA_UT_NOASAN)
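
With BTLA_UT_DEBUG now defaulting to OFF, no unit tests are compiled unless an option is switched on explicitly. A configure/build sketch (assuming bestla/ is the CMake source root, that enabling any BTLA_UT_* option turns on UT_BUILD, and that the test binary name comes from ${PROJECT_NAME}_ut):

    cmake -S bestla -B build -DBTLA_UT_DEBUG=ON -DBTLA_UT_OPENMP=ON
    cmake --build build -j
    # hypothetical binary name, derived from ${PROJECT_NAME}_ut
    ./build/bestla_ut
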
22 changes: 11 additions & 11 deletions bestla/CMakePresets.json
@@ -1,4 +1,4 @@
{
"version": 3,
"configurePresets": [
{
@@ -9,7 +9,9 @@
"binaryDir": "${sourceDir}/out/build/${presetName}",
"installDir": "${sourceDir}/out/install/${presetName}",
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Debug"
"CMAKE_BUILD_TYPE": "Debug",
"BTLA_UT_DEBUG": "ON",
"BTLA_UT_ALL": "OFF"
},
"condition": {
"type": "equals",
@@ -25,18 +27,15 @@
"inherits": "linux-debug",
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Release",
"JBLAS_UT_ALL": "ON"
"BTLA_UT_ALL": "ON"
}
},
{
"name": "linux-release",
"displayName": "linux Release",
"description": "Release",
"inherits": "linux-debug",
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Release",
"JBLAS_UT_ALL": "OFF"
}
"cacheVariables": { "CMAKE_BUILD_TYPE": "Release" }
},
{
"name": "windows-base",
@@ -48,7 +47,7 @@
"cacheVariables": {
"CMAKE_C_COMPILER": "cl.exe",
"CMAKE_CXX_COMPILER": "cl.exe",
"JBLAS_UT_ALL": "OFF"
"BTLA_UT_ALL": "OFF"
},
"condition": {
"type": "equals",
@@ -67,7 +66,8 @@
},
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Debug",
"JBLAS_UT_DEBUG": "ON"
"BTLA_UT_DEBUG": "ON",
"BTLA_UT_ALL": "OFF"
}
},
{
Expand All @@ -82,7 +82,7 @@
"displayName": "x64 Release for UT",
"description": "Target Windows (64-bit) with the Visual Studio development environment. (RelWithDebInfo)",
"inherits": "x64-release",
"cacheVariables": { "JBLAS_UT_ALL": "ON" }
"cacheVariables": { "BTLA_UT_ALL": "ON" }
}
]
}
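
The renamed BTLA_UT_* cache variables now match the option() names in CMakeLists.txt above. With a presets-aware CMake (schema version 3 needs CMake 3.21+), the presets can be consumed directly, e.g. from the bestla/ directory:

    cmake --preset linux-release            # Release configure, UTs off
    cmake --build out/build/linux-release   # binaryDir is out/build/${presetName}
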
7 changes: 4 additions & 3 deletions bestla/bestla/bestla_storage.h
@@ -792,16 +792,17 @@ class StorageWeightKBlockNFloat : public StorageWeightKBlockNInteger {

class PackedWeightParser {
public:
-static gemm::IWeightBase* deserialBuffer(void* serialized_buf) {
+static gemm::IWeightBase* deserialBuffer(const void* serialized_buf) {
if (serialized_buf == nullptr) {
return nullptr;
}
-auto rptr = reinterpret_cast<int8_t*>(serialized_buf);
+auto tmpptr = const_cast<void*>(serialized_buf);
+auto rptr = reinterpret_cast<int8_t*>(tmpptr);
rptr += IWeightBase::offset();
int mProID = utils::deserialize<int>(rptr);
IWeightBase* ptr = nullptr;
if (mProID >= int(BTLA_PROLOGUEB_IDS::Begin) && mProID < int(BTLA_PROLOGUEB_IDS::End)) {
-rptr = reinterpret_cast<int8_t*>(serialized_buf);
+rptr = reinterpret_cast<int8_t*>(tmpptr);
auto type = static_cast<BTLA_PROLOGUEB_IDS>(mProID);
switch (type) {
case BTLA_PROLOGUEB_IDS::WeightPack:
2 changes: 1 addition & 1 deletion developer_document.md
@@ -45,7 +45,7 @@ print(pt_ans)

# itrex infer
# fp32 config
-woq_config = WeightOnlyQuantConfig(use_quant=False)
+woq_config = WeightOnlyQuantConfig(not_quant=True)
# model file should be in `runtime_outs` folder
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=woq_config, trust_remote_code=True)
outputs = model.generate(inputs, do_sample=False, max_new_tokens=128)
109 changes: 109 additions & 0 deletions tests/test_python_api.py
@@ -0,0 +1,109 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2022 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import shutil
import torch
import unittest

from transformers import AutoTokenizer, TextStreamer
from neural_speed import Model

def cmpData(numa, numb):
    totalErr = ((np.abs(numa - numb))**2).sum()
    totalNum = (np.abs(numa)**2).sum()
    diff2 = np.sqrt(totalErr / totalNum)

    cos = np.dot(numa, numb) / (np.linalg.norm(numa) * np.linalg.norm(numb))
    return {"diff2": diff2, "cos": cos}


class TestLLMRUNTIME(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        pass

    @classmethod
    def tearDownClass(cls) -> None:
        shutil.rmtree("./runtime_outs", ignore_errors=True)

    def test_llm_runtime(self):
        model_name = "/tf_dataset2/models/nlp_toolkit/llama-2-7b-chat/Llama-2-7b-chat-hf"
        prompt = "What is the meaning of life?"

        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        inputs = tokenizer(prompt, return_tensors="pt")

        pt_logits = torch.load("/tf_dataset2/inc-ut/nlptoolkit_ut_model/llama2_pt_logits.pth")[:, -1]
        pt_generate_ids = torch.load("/tf_dataset2/inc-ut/nlptoolkit_ut_model/llama2_pt_generate_ids.pth")[0].tolist()
        print(tokenizer.decode(pt_generate_ids))

        # check output ids
        itrex_model = Model()
        itrex_model.init(model_name, not_quant=True)
        itrex_generate_ids = itrex_model.generate(inputs.input_ids, do_sample=False, max_new_tokens=100)[0]
        print(tokenizer.decode(itrex_generate_ids))
        for i in range(len(pt_generate_ids)):
            self.assertEqual(pt_generate_ids[i], itrex_generate_ids[i])

        # check diff of logits
        woq_configs = {
            "fp32": {"use_cache": True, "not_quant": True},
            # "ggml_int4": {"compute_dtype": "int8", "weight_dtype": "int4", "use_cache": True, "use_ggml": True},
            "jblas_int4": {"compute_dtype": "int8", "weight_dtype": "int4", "use_cache": True},
            # "jblas_int8": {"compute_dtype": "bf16", "weight_dtype": "int8", "use_cache": True},
        }
        for config_type in woq_configs:
            itrex_model = Model()
            itrex_model.init(model_name, **woq_configs[config_type])
            itrex_logits = itrex_model(inputs.input_ids)
            diff_data = cmpData(pt_logits.detach().numpy().flatten(), itrex_logits.flatten())
            print(config_type, diff_data)

    def test_beam_search(self):
        model_name = "/tf_dataset2/models/pytorch/gpt-j-6B"  # or local path to model
        prompts = [
            "she opened the door and see",
            "tell me 10 things about jazz music",
            "What is the meaning of life?",
            "To be, or not to be, that is the question: Whether 'tis nobler in the mind to suffer"
            " The slings and arrows of outrageous fortune, "
            "Or to take arms against a sea of troubles."
            "And by opposing end them. To die—to sleep,"
        ]

        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, padding_side="left")
        tokenizer.pad_token = tokenizer.eos_token
        pad_token = tokenizer(tokenizer.pad_token)['input_ids'][0]
        inputs = tokenizer(prompts, padding=True, return_tensors='pt')

        # pytorch fp32
        pt_generate_ids = torch.load("/tf_dataset2/inc-ut/nlptoolkit_ut_model/beam_pt_generate_ids.pth").tolist()

        # llm runtime fp32
        itrex_model = Model()
        itrex_model.init(model_name, not_quant=True)
        itrex_generate_ids = itrex_model.generate(
            inputs.input_ids, num_beams=4, max_new_tokens=128, min_new_tokens=30, early_stopping=True, pad_token=pad_token)
        for i in range(len(itrex_generate_ids)):
            self.assertListEqual(pt_generate_ids[i], itrex_generate_ids[i])


if __name__ == "__main__":
    unittest.main()
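
For reference, cmpData above reports a relative L2 error and a cosine similarity between the PyTorch and Neural Speed logits:

    \mathrm{diff2} = \sqrt{\frac{\sum_i (a_i - b_i)^2}{\sum_i a_i^2}},
    \qquad \cos(a, b) = \frac{a \cdot b}{\lVert a \rVert \, \lVert b \rVert}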
