From 3f75e44a020669cb7205c5ac312e326687b25453 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
<66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sun, 4 Feb 2024 05:32:22 +0000
Subject: [PATCH] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
.github/workflows/copyright_check.yml | 6 ++---
.github/workflows/cpp-graph-test.yml | 4 ++--
.../workflows/scripts/formatScan/nlp_dict.txt | 1 -
.../scripts/formatScan/pyspelling_conf.yaml | 2 +-
.../scripts/models/calculate_percertiles.py | 2 +-
.github/workflows/unit-test-bestla.yml | 4 ++--
.github/workflows/unit-test-llmruntime.yml | 2 +-
README.md | 1 -
developer_document.md | 1 -
docs/install.md | 1 -
docs/tensor_parallelism.md | 1 -
neural_speed/__init__.py | 8 +++----
neural_speed/convert/common.py | 2 +-
neural_speed/convert/convert-hf-to-gguf.py | 4 ++--
neural_speed/convert/convert_dolly.py | 2 +-
neural_speed/convert/convert_falcon.py | 2 +-
neural_speed/convert/convert_gptj.py | 2 +-
neural_speed/convert/convert_gptneox.py | 2 +-
neural_speed/convert/convert_llama.py | 2 +-
neural_speed/convert/convert_mistral.py | 4 ++--
neural_speed/convert/convert_opt.py | 2 +-
neural_speed/convert/convert_phi.py | 7 +++---
.../convert/convert_quantized_gptj.py | 6 ++---
.../convert/convert_quantized_llama.py | 4 ++--
.../convert/convert_quantized_mistral.py | 4 ++--
neural_speed/convert/convert_whisper.py | 2 +-
neural_speed/core/README.md | 1 -
neural_speed/models/requirements/common.txt | 16 +++++++-------
requirements.txt | 22 +++++++++----------
scripts/huggingface.py | 4 ++--
tests/model-test/run_tp.sh | 14 ++++++++++++
tests/requirements.txt | 2 +-
tests/test_python_api.py | 4 ++--
33 files changed, 74 insertions(+), 67 deletions(-)
diff --git a/.github/workflows/copyright_check.yml b/.github/workflows/copyright_check.yml
index 2b9644a60..f45d1e3ad 100644
--- a/.github/workflows/copyright_check.yml
+++ b/.github/workflows/copyright_check.yml
@@ -26,7 +26,7 @@ jobs:
job_name: ["copyright"]
fail-fast: false
steps:
-
+
- name: Checkout out Repo
uses: actions/checkout@v3
@@ -40,7 +40,7 @@ jobs:
git --no-pager diff --name-only remotes/origin/${{ github.base_ref }} ${{ github.workspace }}/neural_speed> ${{ env.CODE_SCAN_LOG_PATH }}/diff.log
files=$(cat ${{ env.CODE_SCAN_LOG_PATH }}/diff.log | awk '!a[$0]++')
$LIGHT_PURPLE && echo " ----------------- checking ... --------------------------" && $RESET
- if [[ -f ${{ env.CODE_SCAN_LOG_PATH }}/copyright_issue_summary.log ]]; then
+ if [[ -f ${{ env.CODE_SCAN_LOG_PATH }}/copyright_issue_summary.log ]]; then
rm -f ${{ env.CODE_SCAN_LOG_PATH }}/copyright_issue_summary.log
fi
for file in ${files}
@@ -57,7 +57,7 @@ jobs:
$LIGHT_PURPLE && echo "Skipping ${file}" && $RESET
fi
done
- if [[ -f ${{ env.CODE_SCAN_LOG_PATH }}/copyright_issue_summary.log ]]; then
+ if [[ -f ${{ env.CODE_SCAN_LOG_PATH }}/copyright_issue_summary.log ]]; then
$BOLD_YELLOW && echo " ----------------- Current log file output start --------------------------"
cat ${{ env.CODE_SCAN_LOG_PATH }}/copyright_issue_summary.log
$BOLD_YELLOW && echo " ----------------- Current log file output end --------------------------" && $RESET
diff --git a/.github/workflows/cpp-graph-test.yml b/.github/workflows/cpp-graph-test.yml
index 19b1516fa..7bd0b2940 100644
--- a/.github/workflows/cpp-graph-test.yml
+++ b/.github/workflows/cpp-graph-test.yml
@@ -59,7 +59,7 @@ jobs:
run: |
cd ${{ github.workspace }}/.github/workflows/scripts/models
bash cpp_graph_inference.sh cpp-graph-test-neural-speed ${{ matrix.modelName }} ${{ env.INPUT_COMPILER_VERSION }}
-
+
- name: Rename summary
run: |
cd ${{ github.workspace }}
@@ -93,7 +93,7 @@ jobs:
uses: actions/download-artifact@v3
with:
path: ${{ env.OUT_SCRIPT_PATH }}/generated/log
-
+
- name: Merge CPP Graph Summary Log
run: |
cd ${{ env.OUT_SCRIPT_PATH }}/generated/log/cpp_graph
diff --git a/.github/workflows/scripts/formatScan/nlp_dict.txt b/.github/workflows/scripts/formatScan/nlp_dict.txt
index 8b1378917..e69de29bb 100644
--- a/.github/workflows/scripts/formatScan/nlp_dict.txt
+++ b/.github/workflows/scripts/formatScan/nlp_dict.txt
@@ -1 +0,0 @@
-
diff --git a/.github/workflows/scripts/formatScan/pyspelling_conf.yaml b/.github/workflows/scripts/formatScan/pyspelling_conf.yaml
index 6fb64f3f0..17dc81c19 100644
--- a/.github/workflows/scripts/formatScan/pyspelling_conf.yaml
+++ b/.github/workflows/scripts/formatScan/pyspelling_conf.yaml
@@ -10,4 +10,4 @@ matrix:
output: ${VAL_REPO}/nlp_dict.dic
sources:
- ${SCAN_REPO}/docs/*
- - ${SCAN_REPO}/*.md
\ No newline at end of file
+ - ${SCAN_REPO}/*.md
diff --git a/.github/workflows/scripts/models/calculate_percertiles.py b/.github/workflows/scripts/models/calculate_percertiles.py
index b79c6c6df..f54d0970b 100644
--- a/.github/workflows/scripts/models/calculate_percertiles.py
+++ b/.github/workflows/scripts/models/calculate_percertiles.py
@@ -51,7 +51,7 @@ def parse_memory_file(memory_file):
p99 = calculate_percentile(predictions, 99)
latency_mean = calculate_mean(predictions[1:])
total_latency = np.sum(predictions)
-
+
print("P90: {:.2f} ms".format(p90))
print("P99: {:.2f} ms".format(p99))
print("average_latency: {:.2f} ms".format(latency_mean))
diff --git a/.github/workflows/unit-test-bestla.yml b/.github/workflows/unit-test-bestla.yml
index 0a80f5cc0..6d5dd8be8 100644
--- a/.github/workflows/unit-test-bestla.yml
+++ b/.github/workflows/unit-test-bestla.yml
@@ -33,7 +33,7 @@ jobs:
with:
submodules: "recursive"
fetch-tags: true
-
+
- name: Env build
run: |
echo "do not need conda env"
@@ -42,7 +42,7 @@ jobs:
#if [[ "${{ env.INPUT_COMPILER_VERSION }}" != "11.4.1" ]]; then
# conda install --update-deps -c conda-forge gxx==${{ env.INPUT_COMPILER_VERSION }} gcc==${{ env.INPUT_COMPILER_VERSION }} gxx_linux-64==${{ env.INPUT_COMPILER_VERSION }} libstdcxx-ng sysroot_linux-64 -y
#fi
-
+
- name: Run UT
run: |
source /opt/rh/gcc-toolset-12/enable
diff --git a/.github/workflows/unit-test-llmruntime.yml b/.github/workflows/unit-test-llmruntime.yml
index 974edff44..e5d0e0b6b 100644
--- a/.github/workflows/unit-test-llmruntime.yml
+++ b/.github/workflows/unit-test-llmruntime.yml
@@ -28,7 +28,7 @@ jobs:
steps:
- name: Load environment variables
run: cat ~/actions-runner3/.env >> $GITHUB_ENV
-
+
- name: Docker Clean Up
run: |
docker ps -a
diff --git a/README.md b/README.md
index 347a666c6..bec1e8383 100644
--- a/README.md
+++ b/README.md
@@ -143,4 +143,3 @@ Available modes:
## Enable New Model
You can consider adding your own models, please follow the document: [graph developer document](./developer_document.md).
-
diff --git a/developer_document.md b/developer_document.md
index d06f3eaf5..ffa34083d 100644
--- a/developer_document.md
+++ b/developer_document.md
@@ -426,4 +426,3 @@ We can improve the performance by fusion the FFN process.
- [FFN-Fusion example](https://github.com/intel/intel-extension-for-transformers/pull/160)
# 4. A complete example
- [Enable baichuan](https://github.com/intel/intel-extension-for-transformers/pull/376)
-
diff --git a/docs/install.md b/docs/install.md
index 874913bd1..e270f6174 100644
--- a/docs/install.md
+++ b/docs/install.md
@@ -25,4 +25,3 @@ cd build
cmake ..
cmake --build . -j --config Release
```
-
diff --git a/docs/tensor_parallelism.md b/docs/tensor_parallelism.md
index 9c95e4204..17e84b37d 100644
--- a/docs/tensor_parallelism.md
+++ b/docs/tensor_parallelism.md
@@ -112,4 +112,3 @@ mpirun -n 1 taskset -c 0-47 sh run.sh : -n 1 taskset -c 48-95 sh run.sh
```
**NOTICE**: tensor parallelsim strategy will split the model to specific node/socket, each device already use part of the original weights differently. So we should not use shared-memory of weights to avoid cross numa weight movement. Use option `--no-mmap` to disable shared weights between processes.
-
diff --git a/neural_speed/__init__.py b/neural_speed/__init__.py
index afc1a6100..c73dfe2f4 100644
--- a/neural_speed/__init__.py
+++ b/neural_speed/__init__.py
@@ -113,7 +113,7 @@ def init(self, model_name, use_quant=True, use_gptq=False, use_awq=False,
self.bin_file = fp32_bin
else:
self.bin_file = quant_bin
-
+
if os.path.exists(self.bin_file):
print("{} existed, will use cache file. Otherwise please remove the file".
format(self.bin_file))
@@ -122,7 +122,7 @@ def init(self, model_name, use_quant=True, use_gptq=False, use_awq=False,
if use_gptq or use_awq:
convert_model(model_name, quant_bin, "f32")
return
-
+
if not os.path.exists(fp32_bin):
convert_model(model_name, fp32_bin, "f32")
assert os.path.exists(fp32_bin), "Fail to convert pytorch model"
@@ -142,7 +142,7 @@ def init_from_bin(self, model_type, model_path, **generate_kwargs):
self.__import_package(model_type)
self.model = self.module.Model()
if self.max_request_num == -1:
- self.max_request_num = max(generate_kwargs.get("max_request_num",
+ self.max_request_num = max(generate_kwargs.get("max_request_num",
max_request_num_default), generate_kwargs.get("batch_size", 1))
if "threads" not in generate_kwargs:
threads = os.getenv("OMP_NUM_THREADS")
@@ -174,7 +174,7 @@ def generate(self, input_ids, streamer=None, interactive=False, ignore_prompt=Fa
reinit_from_bin = True
if self.max_request_num > 0:
print("Will start to reinit model from bin due to different max request num.")
- self.max_request_num = max(input_bs, max_request_num)
+ self.max_request_num = max(input_bs, max_request_num)
if self.model is None or reinit_from_bin:
self.init_from_bin(self.model_type, self.bin_file, batch_size=input_bs,
diff --git a/neural_speed/convert/common.py b/neural_speed/convert/common.py
index d0fe62285..093d5cf89 100644
--- a/neural_speed/convert/common.py
+++ b/neural_speed/convert/common.py
@@ -401,7 +401,7 @@ def convert_q4_f32_tensor(src_name, dst_name, model, fout, q_config, n_head, n_h
g_idx = torch.tensor([i // q_config["group_size"] for i in range(infeatures)], dtype=torch.int32)
scale_zeros = gptq_zeros * gptq_scales
weight = (gptq_scales[g_idx.long()] * weight - scale_zeros[g_idx.long()])
-
+
weight = weight.t()
weight = weight.float()
if permute_func:
diff --git a/neural_speed/convert/convert-hf-to-gguf.py b/neural_speed/convert/convert-hf-to-gguf.py
index 2d9a89837..3fbe47b9e 100755
--- a/neural_speed/convert/convert-hf-to-gguf.py
+++ b/neural_speed/convert/convert-hf-to-gguf.py
@@ -369,9 +369,9 @@ def write_tensors(self):
data = data_torch.squeeze().numpy()
# Map bloom-style qkv_linear to gpt-style qkv_linear
- # bloom:
+ # bloom:
# github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252
- # gpt-2:
+ # gpt-2:
# github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312
if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
diff --git a/neural_speed/convert/convert_dolly.py b/neural_speed/convert/convert_dolly.py
index dc77b1c43..61a0bb0ac 100644
--- a/neural_speed/convert/convert_dolly.py
+++ b/neural_speed/convert/convert_dolly.py
@@ -116,7 +116,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor
-
+
fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
diff --git a/neural_speed/convert/convert_falcon.py b/neural_speed/convert/convert_falcon.py
index 9d323f89d..c4a92222b 100644
--- a/neural_speed/convert/convert_falcon.py
+++ b/neural_speed/convert/convert_falcon.py
@@ -110,7 +110,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor
-
+
fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
diff --git a/neural_speed/convert/convert_gptj.py b/neural_speed/convert/convert_gptj.py
index 2f6c8e673..a610032ea 100644
--- a/neural_speed/convert/convert_gptj.py
+++ b/neural_speed/convert/convert_gptj.py
@@ -102,7 +102,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor
-
+
fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
diff --git a/neural_speed/convert/convert_gptneox.py b/neural_speed/convert/convert_gptneox.py
index 409cc05ba..8c50c006b 100644
--- a/neural_speed/convert/convert_gptneox.py
+++ b/neural_speed/convert/convert_gptneox.py
@@ -116,7 +116,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor
-
+
fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
diff --git a/neural_speed/convert/convert_llama.py b/neural_speed/convert/convert_llama.py
index 9dae31bd8..ca638da5a 100644
--- a/neural_speed/convert/convert_llama.py
+++ b/neural_speed/convert/convert_llama.py
@@ -285,7 +285,7 @@ def __init__(self, fname_tokenizer: Path, params_vocab_size: int, fname_added_to
def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
tokenizer = self.sentencepiece_tokenizer
for i in range(self.params_vocab_size):
- text: bytes
+ text: bytes
if i < tokenizer.vocab_size():
if tokenizer.is_unknown(i):
text = " \u2047 ".encode("utf-8")
diff --git a/neural_speed/convert/convert_mistral.py b/neural_speed/convert/convert_mistral.py
index 71a195fcc..be26fd90f 100644
--- a/neural_speed/convert/convert_mistral.py
+++ b/neural_speed/convert/convert_mistral.py
@@ -1067,8 +1067,8 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None:
self.fout.write(
struct.pack("i", 1)
- )
- # TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json
+ )
+ # TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json
# but bos_token_id = 1 in llama.cpp
self.fout.write(struct.pack("i", 2))
diff --git a/neural_speed/convert/convert_opt.py b/neural_speed/convert/convert_opt.py
index 4f487f68c..ab26bc538 100644
--- a/neural_speed/convert/convert_opt.py
+++ b/neural_speed/convert/convert_opt.py
@@ -109,7 +109,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor
-
+
fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
diff --git a/neural_speed/convert/convert_phi.py b/neural_speed/convert/convert_phi.py
index a4c62c89d..f74fdf5d1 100644
--- a/neural_speed/convert/convert_phi.py
+++ b/neural_speed/convert/convert_phi.py
@@ -50,7 +50,7 @@ def bytes_to_unicode():
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
-
+
def phi_convert_gguf(model, tokenizer, dir_model, fname_out, ftype, hparams):
print("phi.gguf converting: ")
list_vars = model.state_dict()
@@ -257,7 +257,7 @@ def phi_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):
print("Done. Output file: " + fname_out)
print("")
-
+
def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file")
parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
@@ -288,9 +288,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
phi_convert_gguf(model, tokenizer, dir_model, fname_out, ftype, hparams)
else:
phi_convert(model, tokenizer, dir_model, fname_out, ftype, hparams)
-
+
if __name__ == '__main__':
main()
-
diff --git a/neural_speed/convert/convert_quantized_gptj.py b/neural_speed/convert/convert_quantized_gptj.py
index 829445707..4e6b18578 100644
--- a/neural_speed/convert/convert_quantized_gptj.py
+++ b/neural_speed/convert/convert_quantized_gptj.py
@@ -99,7 +99,7 @@ def convert_to_qx_bestla_tensor(src_name, dst_name, model, fout, q_config):
print(f"converting {dst_name} qauntized tensor to bestla q4 block")
-def main(args_in: Optional[List[str]] = None) -> None:
+def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file")
parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
@@ -143,7 +143,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor
-
+
fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
@@ -183,7 +183,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
f"transformer.h.{i}.attn.k_proj.weight", list_vars, fout, quantize_config)
convert_to_qx_bestla_tensor(f"transformer.h.{i}.attn.v_proj.weight",
f"transformer.h.{i}.attn.v_proj.weight", list_vars, fout, quantize_config)
-
+
convert_to_qx_bestla_tensor(f"transformer.h.{i}.attn.out_proj.weight",
f"transformer.h.{i}.attn.out_proj.weight", list_vars, fout, quantize_config)
convert_to_qx_bestla_tensor(f"transformer.h.{i}.mlp.fc_in.weight",
diff --git a/neural_speed/convert/convert_quantized_llama.py b/neural_speed/convert/convert_quantized_llama.py
index 4733d6e7f..7ea173ece 100644
--- a/neural_speed/convert/convert_quantized_llama.py
+++ b/neural_speed/convert/convert_quantized_llama.py
@@ -94,7 +94,7 @@ def convert_q4_bestla_tensor(src_name, dst_name, model, fout, q_config, n_head,
dst.flatten()[:byte_size].tofile(fout)
print(f"converting {dst_name} qauntized tensor to bestla q4 block")
-def main(args_in: Optional[List[str]] = None) -> None:
+def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file")
parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
@@ -153,7 +153,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
# TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json
# but bos_token_id = 1 in llama.cpp
- f.write(struct.pack("i", 1))
+ f.write(struct.pack("i", 1))
f.write(struct.pack("i", 2))
f.write(struct.pack("i", 0))
diff --git a/neural_speed/convert/convert_quantized_mistral.py b/neural_speed/convert/convert_quantized_mistral.py
index bb2f97e0d..b5b4881aa 100644
--- a/neural_speed/convert/convert_quantized_mistral.py
+++ b/neural_speed/convert/convert_quantized_mistral.py
@@ -42,7 +42,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
model, config, quantize_config = load_quantized_model(model_path)
f = open(out_path, "wb")
-
+
# 1. write hparams
n_vocab = config["vocab_size"]
n_embd = config["hidden_size"]
@@ -87,7 +87,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
f.write(struct.pack("f", config["rope_theta"] if "rope_theta" in config else 10000))
f.write(struct.pack("f", rope_scale))
- # TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json
+ # TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json
# but bos_token_id = 1 in llama.cpp
f.write(struct.pack("i", 1))
f.write(struct.pack("i", 2))
diff --git a/neural_speed/convert/convert_whisper.py b/neural_speed/convert/convert_whisper.py
index b41debbe6..70100a38a 100644
--- a/neural_speed/convert/convert_whisper.py
+++ b/neural_speed/convert/convert_whisper.py
@@ -240,4 +240,4 @@ def main(args_in: Optional[List[str]] = None) -> None:
if __name__ == "__main__":
main()
-
\ No newline at end of file
+
diff --git a/neural_speed/core/README.md b/neural_speed/core/README.md
index 3aea65b04..ee7548e3c 100644
--- a/neural_speed/core/README.md
+++ b/neural_speed/core/README.md
@@ -73,4 +73,3 @@ Ice Lake<br>Cascade Lake<br>Cooper Lake<br>Tiger Lake<br>Rocket Lake | any int4<
Skylake | any 4bits<br>group size=-1<br>compute type=fp32 | AVX512F
Alder Lake (12th Gen)<br>Raptor Lake (13th and 14th Gen)|any 4bits<br>group size=-1<br>compute type=int8 | AVX_VNNI
Older architecture (before 12th Gen)| any 4bits<br>group size=-1<br>compute type=fp32 | AVX2
-
diff --git a/neural_speed/models/requirements/common.txt b/neural_speed/models/requirements/common.txt
index a6621ec05..4c66a05a5 100644
--- a/neural_speed/models/requirements/common.txt
+++ b/neural_speed/models/requirements/common.txt
@@ -1,13 +1,13 @@
--extra-index-url https://download.pytorch.org/whl/cpu
-torch==2.1.0+cpu
-transformers
-numpy
-sentencepiece
-protobuf<3.20
-einops
accelerate
-peft
datasets
-transformers_stream_generator
+einops
gguf
+numpy
+peft
+protobuf<3.20
+sentencepiece
tiktoken
+torch==2.1.0+cpu
+transformers
+transformers_stream_generator
diff --git a/requirements.txt b/requirements.txt
index 0f37048f8..eca40e698 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,16 +1,16 @@
--extra-index-url https://download.pytorch.org/whl/cpu
-torch==2.1.0+cpu
-transformers
-numpy
-sentencepiece
-protobuf<3.20
-einops
accelerate
-peft
+cmake
datasets
-transformers_stream_generator
-tiktoken
-py-cpuinfo
+einops
gguf
-cmake
+numpy
+peft
+protobuf<3.20
+py-cpuinfo
+sentencepiece
setuptools>=61
+tiktoken
+torch==2.1.0+cpu
+transformers
+transformers_stream_generator
diff --git a/scripts/huggingface.py b/scripts/huggingface.py
index a9319aa96..16ceb55b6 100644
--- a/scripts/huggingface.py
+++ b/scripts/huggingface.py
@@ -351,7 +351,7 @@ def _create_auto_model(
load_in_8bit=load_in_8bit,
trust_remote_code=trust_remote_code,
torch_dtype=torch_dtype
- )
+ )
else:
if load_in_4bit:
assert (
@@ -468,7 +468,7 @@ def add_special_tokens(self) -> bool:
elif self.model_format == "runtime":
return True
elif self.AUTO_MODEL_CLASS is transformers.AutoModelForCausalLM:
- return False
+ return False
elif self.AUTO_MODEL_CLASS is transformers.AutoModel:
return False
elif self.AUTO_MODEL_CLASS is transformers.AutoModelForSeq2SeqLM:
diff --git a/tests/model-test/run_tp.sh b/tests/model-test/run_tp.sh
index 1a99d1d40..5f907aef5 100644
--- a/tests/model-test/run_tp.sh
+++ b/tests/model-test/run_tp.sh
@@ -1,4 +1,18 @@
#!/bin/bash
+# Copyright (c) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
set -eo pipefail
set -x
diff --git a/tests/requirements.txt b/tests/requirements.txt
index a2d6abf48..ade85af11 100644
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -1,3 +1,3 @@
+gguf
optimum==1.13.2
optimum-intel==1.11.0
-gguf
\ No newline at end of file
diff --git a/tests/test_python_api.py b/tests/test_python_api.py
index 56f3041e9..b90169009 100644
--- a/tests/test_python_api.py
+++ b/tests/test_python_api.py
@@ -47,7 +47,7 @@ def test_llm_runtime(self):
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
inputs = tokenizer(prompt, return_tensors="pt")
-
+
pt_logits = torch.load("/tf_dataset2/inc-ut/nlptoolkit_ut_model/llama2_pt_logits.pth")[:,-1]
pt_generate_ids = torch.load("/tf_dataset2/inc-ut/nlptoolkit_ut_model/llama2_pt_generate_ids.pth")[0].tolist()
print(tokenizer.decode(pt_generate_ids))
@@ -117,4 +117,4 @@ def test_beam_search(self):
if __name__ == "__main__":
unittest.main()
-
+