From 3f75e44a020669cb7205c5ac312e326687b25453 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 4 Feb 2024 05:32:22 +0000 Subject: [PATCH] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .github/workflows/copyright_check.yml | 6 ++--- .github/workflows/cpp-graph-test.yml | 4 ++-- .../workflows/scripts/formatScan/nlp_dict.txt | 1 - .../scripts/formatScan/pyspelling_conf.yaml | 2 +- .../scripts/models/calculate_percertiles.py | 2 +- .github/workflows/unit-test-bestla.yml | 4 ++-- .github/workflows/unit-test-llmruntime.yml | 2 +- README.md | 1 - developer_document.md | 1 - docs/install.md | 1 - docs/tensor_parallelism.md | 1 - neural_speed/__init__.py | 8 +++---- neural_speed/convert/common.py | 2 +- neural_speed/convert/convert-hf-to-gguf.py | 4 ++-- neural_speed/convert/convert_dolly.py | 2 +- neural_speed/convert/convert_falcon.py | 2 +- neural_speed/convert/convert_gptj.py | 2 +- neural_speed/convert/convert_gptneox.py | 2 +- neural_speed/convert/convert_llama.py | 2 +- neural_speed/convert/convert_mistral.py | 4 ++-- neural_speed/convert/convert_opt.py | 2 +- neural_speed/convert/convert_phi.py | 7 +++--- .../convert/convert_quantized_gptj.py | 6 ++--- .../convert/convert_quantized_llama.py | 4 ++-- .../convert/convert_quantized_mistral.py | 4 ++-- neural_speed/convert/convert_whisper.py | 2 +- neural_speed/core/README.md | 1 - neural_speed/models/requirements/common.txt | 16 +++++++------- requirements.txt | 22 +++++++++---------- scripts/huggingface.py | 4 ++-- tests/model-test/run_tp.sh | 14 ++++++++++++ tests/requirements.txt | 2 +- tests/test_python_api.py | 4 ++-- 33 files changed, 74 insertions(+), 67 deletions(-) diff --git a/.github/workflows/copyright_check.yml b/.github/workflows/copyright_check.yml index 2b9644a60..f45d1e3ad 100644 --- a/.github/workflows/copyright_check.yml +++ b/.github/workflows/copyright_check.yml @@ -26,7 +26,7 @@ jobs: job_name: ["copyright"] fail-fast: false steps: - + - name: Checkout out Repo uses: actions/checkout@v3 @@ -40,7 +40,7 @@ jobs: git --no-pager diff --name-only remotes/origin/${{ github.base_ref }} ${{ github.workspace }}/neural_speed> ${{ env.CODE_SCAN_LOG_PATH }}/diff.log files=$(cat ${{ env.CODE_SCAN_LOG_PATH }}/diff.log | awk '!a[$0]++') $LIGHT_PURPLE && echo " ----------------- checking ... 
--------------------------" && $RESET - if [[ -f ${{ env.CODE_SCAN_LOG_PATH }}/copyright_issue_summary.log ]]; then + if [[ -f ${{ env.CODE_SCAN_LOG_PATH }}/copyright_issue_summary.log ]]; then rm -f ${{ env.CODE_SCAN_LOG_PATH }}/copyright_issue_summary.log fi for file in ${files} @@ -57,7 +57,7 @@ jobs: $LIGHT_PURPLE && echo "Skipping ${file}" && $RESET fi done - if [[ -f ${{ env.CODE_SCAN_LOG_PATH }}/copyright_issue_summary.log ]]; then + if [[ -f ${{ env.CODE_SCAN_LOG_PATH }}/copyright_issue_summary.log ]]; then $BOLD_YELLOW && echo " ----------------- Current log file output start --------------------------" cat ${{ env.CODE_SCAN_LOG_PATH }}/copyright_issue_summary.log $BOLD_YELLOW && echo " ----------------- Current log file output end --------------------------" && $RESET diff --git a/.github/workflows/cpp-graph-test.yml b/.github/workflows/cpp-graph-test.yml index 19b1516fa..7bd0b2940 100644 --- a/.github/workflows/cpp-graph-test.yml +++ b/.github/workflows/cpp-graph-test.yml @@ -59,7 +59,7 @@ jobs: run: | cd ${{ github.workspace }}/.github/workflows/scripts/models bash cpp_graph_inference.sh cpp-graph-test-neural-speed ${{ matrix.modelName }} ${{ env.INPUT_COMPILER_VERSION }} - + - name: Rename summary run: | cd ${{ github.workspace }} @@ -93,7 +93,7 @@ jobs: uses: actions/download-artifact@v3 with: path: ${{ env.OUT_SCRIPT_PATH }}/generated/log - + - name: Merge CPP Graph Summary Log run: | cd ${{ env.OUT_SCRIPT_PATH }}/generated/log/cpp_graph diff --git a/.github/workflows/scripts/formatScan/nlp_dict.txt b/.github/workflows/scripts/formatScan/nlp_dict.txt index 8b1378917..e69de29bb 100644 --- a/.github/workflows/scripts/formatScan/nlp_dict.txt +++ b/.github/workflows/scripts/formatScan/nlp_dict.txt @@ -1 +0,0 @@ - diff --git a/.github/workflows/scripts/formatScan/pyspelling_conf.yaml b/.github/workflows/scripts/formatScan/pyspelling_conf.yaml index 6fb64f3f0..17dc81c19 100644 --- a/.github/workflows/scripts/formatScan/pyspelling_conf.yaml +++ b/.github/workflows/scripts/formatScan/pyspelling_conf.yaml @@ -10,4 +10,4 @@ matrix: output: ${VAL_REPO}/nlp_dict.dic sources: - ${SCAN_REPO}/docs/* - - ${SCAN_REPO}/*.md \ No newline at end of file + - ${SCAN_REPO}/*.md diff --git a/.github/workflows/scripts/models/calculate_percertiles.py b/.github/workflows/scripts/models/calculate_percertiles.py index b79c6c6df..f54d0970b 100644 --- a/.github/workflows/scripts/models/calculate_percertiles.py +++ b/.github/workflows/scripts/models/calculate_percertiles.py @@ -51,7 +51,7 @@ def parse_memory_file(memory_file): p99 = calculate_percentile(predictions, 99) latency_mean = calculate_mean(predictions[1:]) total_latency = np.sum(predictions) - + print("P90: {:.2f} ms".format(p90)) print("P99: {:.2f} ms".format(p99)) print("average_latency: {:.2f} ms".format(latency_mean)) diff --git a/.github/workflows/unit-test-bestla.yml b/.github/workflows/unit-test-bestla.yml index 0a80f5cc0..6d5dd8be8 100644 --- a/.github/workflows/unit-test-bestla.yml +++ b/.github/workflows/unit-test-bestla.yml @@ -33,7 +33,7 @@ jobs: with: submodules: "recursive" fetch-tags: true - + - name: Env build run: | echo "do not need conda env" @@ -42,7 +42,7 @@ jobs: #if [[ "${{ env.INPUT_COMPILER_VERSION }}" != "11.4.1" ]]; then # conda install --update-deps -c conda-forge gxx==${{ env.INPUT_COMPILER_VERSION }} gcc==${{ env.INPUT_COMPILER_VERSION }} gxx_linux-64==${{ env.INPUT_COMPILER_VERSION }} libstdcxx-ng sysroot_linux-64 -y #fi - + - name: Run UT run: | source /opt/rh/gcc-toolset-12/enable diff --git 
a/.github/workflows/unit-test-llmruntime.yml b/.github/workflows/unit-test-llmruntime.yml index 974edff44..e5d0e0b6b 100644 --- a/.github/workflows/unit-test-llmruntime.yml +++ b/.github/workflows/unit-test-llmruntime.yml @@ -28,7 +28,7 @@ jobs: steps: - name: Load environment variables run: cat ~/actions-runner3/.env >> $GITHUB_ENV - + - name: Docker Clean Up run: | docker ps -a diff --git a/README.md b/README.md index 347a666c6..bec1e8383 100644 --- a/README.md +++ b/README.md @@ -143,4 +143,3 @@ Available modes: ## Enable New Model You can consider adding your own models, please follow the document: [graph developer document](./developer_document.md). - diff --git a/developer_document.md b/developer_document.md index d06f3eaf5..ffa34083d 100644 --- a/developer_document.md +++ b/developer_document.md @@ -426,4 +426,3 @@ We can improve the performance by fusion the FFN process. - [FFN-Fusion example](https://github.com/intel/intel-extension-for-transformers/pull/160) # 4. A complete example - [Enable baichuan](https://github.com/intel/intel-extension-for-transformers/pull/376) - diff --git a/docs/install.md b/docs/install.md index 874913bd1..e270f6174 100644 --- a/docs/install.md +++ b/docs/install.md @@ -25,4 +25,3 @@ cd build cmake .. cmake --build . -j --config Release ``` - diff --git a/docs/tensor_parallelism.md b/docs/tensor_parallelism.md index 9c95e4204..17e84b37d 100644 --- a/docs/tensor_parallelism.md +++ b/docs/tensor_parallelism.md @@ -112,4 +112,3 @@ mpirun -n 1 taskset -c 0-47 sh run.sh : -n 1 taskset -c 48-95 sh run.sh ``` **NOTICE**: tensor parallelsim strategy will split the model to specific node/socket, each device already use part of the original weights differently. So we should not use shared-memory of weights to avoid cross numa weight movement. Use option `--no-mmap` to disable shared weights between processes. - diff --git a/neural_speed/__init__.py b/neural_speed/__init__.py index afc1a6100..c73dfe2f4 100644 --- a/neural_speed/__init__.py +++ b/neural_speed/__init__.py @@ -113,7 +113,7 @@ def init(self, model_name, use_quant=True, use_gptq=False, use_awq=False, self.bin_file = fp32_bin else: self.bin_file = quant_bin - + if os.path.exists(self.bin_file): print("{} existed, will use cache file. Otherwise please remove the file". 
format(self.bin_file)) @@ -122,7 +122,7 @@ def init(self, model_name, use_quant=True, use_gptq=False, use_awq=False, if use_gptq or use_awq: convert_model(model_name, quant_bin, "f32") return - + if not os.path.exists(fp32_bin): convert_model(model_name, fp32_bin, "f32") assert os.path.exists(fp32_bin), "Fail to convert pytorch model" @@ -142,7 +142,7 @@ def init_from_bin(self, model_type, model_path, **generate_kwargs): self.__import_package(model_type) self.model = self.module.Model() if self.max_request_num == -1: - self.max_request_num = max(generate_kwargs.get("max_request_num", + self.max_request_num = max(generate_kwargs.get("max_request_num", max_request_num_default), generate_kwargs.get("batch_size", 1)) if "threads" not in generate_kwargs: threads = os.getenv("OMP_NUM_THREADS") @@ -174,7 +174,7 @@ def generate(self, input_ids, streamer=None, interactive=False, ignore_prompt=Fa reinit_from_bin = True if self.max_request_num > 0: print("Will start to reinit model from bin due to different max request num.") - self.max_request_num = max(input_bs, max_request_num) + self.max_request_num = max(input_bs, max_request_num) if self.model is None or reinit_from_bin: self.init_from_bin(self.model_type, self.bin_file, batch_size=input_bs, diff --git a/neural_speed/convert/common.py b/neural_speed/convert/common.py index d0fe62285..093d5cf89 100644 --- a/neural_speed/convert/common.py +++ b/neural_speed/convert/common.py @@ -401,7 +401,7 @@ def convert_q4_f32_tensor(src_name, dst_name, model, fout, q_config, n_head, n_h g_idx = torch.tensor([i // q_config["group_size"] for i in range(infeatures)], dtype=torch.int32) scale_zeros = gptq_zeros * gptq_scales weight = (gptq_scales[g_idx.long()] * weight - scale_zeros[g_idx.long()]) - + weight = weight.t() weight = weight.float() if permute_func: diff --git a/neural_speed/convert/convert-hf-to-gguf.py b/neural_speed/convert/convert-hf-to-gguf.py index 2d9a89837..3fbe47b9e 100755 --- a/neural_speed/convert/convert-hf-to-gguf.py +++ b/neural_speed/convert/convert-hf-to-gguf.py @@ -369,9 +369,9 @@ def write_tensors(self): data = data_torch.squeeze().numpy() # Map bloom-style qkv_linear to gpt-style qkv_linear - # bloom: + # bloom: # github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 - # gpt-2: + # gpt-2: # github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed)) diff --git a/neural_speed/convert/convert_dolly.py b/neural_speed/convert/convert_dolly.py index dc77b1c43..61a0bb0ac 100644 --- a/neural_speed/convert/convert_dolly.py +++ b/neural_speed/convert/convert_dolly.py @@ -116,7 +116,7 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base fout.write(struct.pack("f", 1.0)) # rope_factor - + fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1)) fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2)) fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1)) diff --git a/neural_speed/convert/convert_falcon.py b/neural_speed/convert/convert_falcon.py index 9d323f89d..c4a92222b 100644 --- a/neural_speed/convert/convert_falcon.py +++ b/neural_speed/convert/convert_falcon.py 
@@ -110,7 +110,7 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base fout.write(struct.pack("f", 1.0)) # rope_factor - + fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1)) fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2)) fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1)) diff --git a/neural_speed/convert/convert_gptj.py b/neural_speed/convert/convert_gptj.py index 2f6c8e673..a610032ea 100644 --- a/neural_speed/convert/convert_gptj.py +++ b/neural_speed/convert/convert_gptj.py @@ -102,7 +102,7 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base fout.write(struct.pack("f", 1.0)) # rope_factor - + fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1)) fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2)) fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1)) diff --git a/neural_speed/convert/convert_gptneox.py b/neural_speed/convert/convert_gptneox.py index 409cc05ba..8c50c006b 100644 --- a/neural_speed/convert/convert_gptneox.py +++ b/neural_speed/convert/convert_gptneox.py @@ -116,7 +116,7 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base fout.write(struct.pack("f", 1.0)) # rope_factor - + fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1)) fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2)) fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1)) diff --git a/neural_speed/convert/convert_llama.py b/neural_speed/convert/convert_llama.py index 9dae31bd8..ca638da5a 100644 --- a/neural_speed/convert/convert_llama.py +++ b/neural_speed/convert/convert_llama.py @@ -285,7 +285,7 @@ def __init__(self, fname_tokenizer: Path, params_vocab_size: int, fname_added_to def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]: tokenizer = self.sentencepiece_tokenizer for i in range(self.params_vocab_size): - text: bytes + text: bytes if i < tokenizer.vocab_size(): if tokenizer.is_unknown(i): text = " \u2047 ".encode("utf-8") diff --git a/neural_speed/convert/convert_mistral.py b/neural_speed/convert/convert_mistral.py index 71a195fcc..be26fd90f 100644 --- a/neural_speed/convert/convert_mistral.py +++ b/neural_speed/convert/convert_mistral.py @@ -1067,8 +1067,8 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None: self.fout.write( struct.pack("i", 1) - ) - # TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json + ) + # TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json # but bos_token_id = 1 in llama.cpp self.fout.write(struct.pack("i", 2)) diff --git a/neural_speed/convert/convert_opt.py b/neural_speed/convert/convert_opt.py index 4f487f68c..ab26bc538 100644 --- a/neural_speed/convert/convert_opt.py +++ b/neural_speed/convert/convert_opt.py @@ -109,7 +109,7 @@ def main(args_in: 
Optional[List[str]] = None) -> None: fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base fout.write(struct.pack("f", 1.0)) # rope_factor - + fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1)) fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2)) fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1)) diff --git a/neural_speed/convert/convert_phi.py b/neural_speed/convert/convert_phi.py index a4c62c89d..f74fdf5d1 100644 --- a/neural_speed/convert/convert_phi.py +++ b/neural_speed/convert/convert_phi.py @@ -50,7 +50,7 @@ def bytes_to_unicode(): n += 1 cs = [chr(n) for n in cs] return dict(zip(bs, cs)) - + def phi_convert_gguf(model, tokenizer, dir_model, fname_out, ftype, hparams): print("phi.gguf converting: ") list_vars = model.state_dict() @@ -257,7 +257,7 @@ def phi_convert(model, tokenizer, dir_model, fname_out, ftype, hparams): print("Done. Output file: " + fname_out) print("") - + def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") @@ -288,9 +288,8 @@ def main(args_in: Optional[List[str]] = None) -> None: phi_convert_gguf(model, tokenizer, dir_model, fname_out, ftype, hparams) else: phi_convert(model, tokenizer, dir_model, fname_out, ftype, hparams) - + if __name__ == '__main__': main() - diff --git a/neural_speed/convert/convert_quantized_gptj.py b/neural_speed/convert/convert_quantized_gptj.py index 829445707..4e6b18578 100644 --- a/neural_speed/convert/convert_quantized_gptj.py +++ b/neural_speed/convert/convert_quantized_gptj.py @@ -99,7 +99,7 @@ def convert_to_qx_bestla_tensor(src_name, dst_name, model, fout, q_config): print(f"converting {dst_name} qauntized tensor to bestla q4 block") -def main(args_in: Optional[List[str]] = None) -> None: +def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") @@ -143,7 +143,7 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base fout.write(struct.pack("f", 1.0)) # rope_factor - + fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1)) fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2)) fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1)) @@ -183,7 +183,7 @@ def main(args_in: Optional[List[str]] = None) -> None: f"transformer.h.{i}.attn.k_proj.weight", list_vars, fout, quantize_config) convert_to_qx_bestla_tensor(f"transformer.h.{i}.attn.v_proj.weight", f"transformer.h.{i}.attn.v_proj.weight", list_vars, fout, quantize_config) - + convert_to_qx_bestla_tensor(f"transformer.h.{i}.attn.out_proj.weight", f"transformer.h.{i}.attn.out_proj.weight", list_vars, fout, quantize_config) convert_to_qx_bestla_tensor(f"transformer.h.{i}.mlp.fc_in.weight", diff --git a/neural_speed/convert/convert_quantized_llama.py 
b/neural_speed/convert/convert_quantized_llama.py index 4733d6e7f..7ea173ece 100644 --- a/neural_speed/convert/convert_quantized_llama.py +++ b/neural_speed/convert/convert_quantized_llama.py @@ -94,7 +94,7 @@ def convert_q4_bestla_tensor(src_name, dst_name, model, fout, q_config, n_head, dst.flatten()[:byte_size].tofile(fout) print(f"converting {dst_name} qauntized tensor to bestla q4 block") -def main(args_in: Optional[List[str]] = None) -> None: +def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") @@ -153,7 +153,7 @@ def main(args_in: Optional[List[str]] = None) -> None: # TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json # but bos_token_id = 1 in llama.cpp - f.write(struct.pack("i", 1)) + f.write(struct.pack("i", 1)) f.write(struct.pack("i", 2)) f.write(struct.pack("i", 0)) diff --git a/neural_speed/convert/convert_quantized_mistral.py b/neural_speed/convert/convert_quantized_mistral.py index bb2f97e0d..b5b4881aa 100644 --- a/neural_speed/convert/convert_quantized_mistral.py +++ b/neural_speed/convert/convert_quantized_mistral.py @@ -42,7 +42,7 @@ def main(args_in: Optional[List[str]] = None) -> None: model, config, quantize_config = load_quantized_model(model_path) f = open(out_path, "wb") - + # 1. write hparams n_vocab = config["vocab_size"] n_embd = config["hidden_size"] @@ -87,7 +87,7 @@ def main(args_in: Optional[List[str]] = None) -> None: f.write(struct.pack("f", config["rope_theta"] if "rope_theta" in config else 10000)) f.write(struct.pack("f", rope_scale)) - # TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json + # TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json # but bos_token_id = 1 in llama.cpp f.write(struct.pack("i", 1)) f.write(struct.pack("i", 2)) diff --git a/neural_speed/convert/convert_whisper.py b/neural_speed/convert/convert_whisper.py index b41debbe6..70100a38a 100644 --- a/neural_speed/convert/convert_whisper.py +++ b/neural_speed/convert/convert_whisper.py @@ -240,4 +240,4 @@ def main(args_in: Optional[List[str]] = None) -> None: if __name__ == "__main__": main() - \ No newline at end of file + diff --git a/neural_speed/core/README.md b/neural_speed/core/README.md index 3aea65b04..ee7548e3c 100644 --- a/neural_speed/core/README.md +++ b/neural_speed/core/README.md @@ -73,4 +73,3 @@ Ice Lake
<br>Cascade Lake<br>Cooper Lake<br>Tiger Lake<br>Rocket Lake | any int4< Skylake | any 4bits<br>group size=-1<br>compute type=fp32 | AVX512F Alder Lake (12th Gen)<br>Raptor Lake (13th and 14th Gen)|any 4bits<br>group size=-1<br>compute type=int8 | AVX_VNNI Older architecture (before 12th Gen)| any 4bits<br>group size=-1<br>
compute type=fp32 | AVX2 - diff --git a/neural_speed/models/requirements/common.txt b/neural_speed/models/requirements/common.txt index a6621ec05..4c66a05a5 100644 --- a/neural_speed/models/requirements/common.txt +++ b/neural_speed/models/requirements/common.txt @@ -1,13 +1,13 @@ --extra-index-url https://download.pytorch.org/whl/cpu -torch==2.1.0+cpu -transformers -numpy -sentencepiece -protobuf<3.20 -einops accelerate -peft datasets -transformers_stream_generator +einops gguf +numpy +peft +protobuf<3.20 +sentencepiece tiktoken +torch==2.1.0+cpu +transformers +transformers_stream_generator diff --git a/requirements.txt b/requirements.txt index 0f37048f8..eca40e698 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,16 +1,16 @@ --extra-index-url https://download.pytorch.org/whl/cpu -torch==2.1.0+cpu -transformers -numpy -sentencepiece -protobuf<3.20 -einops accelerate -peft +cmake datasets -transformers_stream_generator -tiktoken -py-cpuinfo +einops gguf -cmake +numpy +peft +protobuf<3.20 +py-cpuinfo +sentencepiece setuptools>=61 +tiktoken +torch==2.1.0+cpu +transformers +transformers_stream_generator diff --git a/scripts/huggingface.py b/scripts/huggingface.py index a9319aa96..16ceb55b6 100644 --- a/scripts/huggingface.py +++ b/scripts/huggingface.py @@ -351,7 +351,7 @@ def _create_auto_model( load_in_8bit=load_in_8bit, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype - ) + ) else: if load_in_4bit: assert ( @@ -468,7 +468,7 @@ def add_special_tokens(self) -> bool: elif self.model_format == "runtime": return True elif self.AUTO_MODEL_CLASS is transformers.AutoModelForCausalLM: - return False + return False elif self.AUTO_MODEL_CLASS is transformers.AutoModel: return False elif self.AUTO_MODEL_CLASS is transformers.AutoModelForSeq2SeqLM: diff --git a/tests/model-test/run_tp.sh b/tests/model-test/run_tp.sh index 1a99d1d40..5f907aef5 100644 --- a/tests/model-test/run_tp.sh +++ b/tests/model-test/run_tp.sh @@ -1,4 +1,18 @@ #!/bin/bash +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + set -eo pipefail set -x diff --git a/tests/requirements.txt b/tests/requirements.txt index a2d6abf48..ade85af11 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -1,3 +1,3 @@ +gguf optimum==1.13.2 optimum-intel==1.11.0 -gguf \ No newline at end of file diff --git a/tests/test_python_api.py b/tests/test_python_api.py index 56f3041e9..b90169009 100644 --- a/tests/test_python_api.py +++ b/tests/test_python_api.py @@ -47,7 +47,7 @@ def test_llm_runtime(self): tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) inputs = tokenizer(prompt, return_tensors="pt") - + pt_logits = torch.load("/tf_dataset2/inc-ut/nlptoolkit_ut_model/llama2_pt_logits.pth")[:,-1] pt_generate_ids = torch.load("/tf_dataset2/inc-ut/nlptoolkit_ut_model/llama2_pt_generate_ids.pth")[0].tolist() print(tokenizer.decode(pt_generate_ids)) @@ -117,4 +117,4 @@ def test_beam_search(self): if __name__ == "__main__": unittest.main() - +