From 3f75e44a020669cb7205c5ac312e326687b25453 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 4 Feb 2024 05:32:22 +0000 Subject: [PATCH] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .github/workflows/copyright_check.yml | 6 ++--- .github/workflows/cpp-graph-test.yml | 4 ++-- .../workflows/scripts/formatScan/nlp_dict.txt | 1 - .../scripts/formatScan/pyspelling_conf.yaml | 2 +- .../scripts/models/calculate_percertiles.py | 2 +- .github/workflows/unit-test-bestla.yml | 4 ++-- .github/workflows/unit-test-llmruntime.yml | 2 +- README.md | 1 - developer_document.md | 1 - docs/install.md | 1 - docs/tensor_parallelism.md | 1 - neural_speed/__init__.py | 8 +++---- neural_speed/convert/common.py | 2 +- neural_speed/convert/convert-hf-to-gguf.py | 4 ++-- neural_speed/convert/convert_dolly.py | 2 +- neural_speed/convert/convert_falcon.py | 2 +- neural_speed/convert/convert_gptj.py | 2 +- neural_speed/convert/convert_gptneox.py | 2 +- neural_speed/convert/convert_llama.py | 2 +- neural_speed/convert/convert_mistral.py | 4 ++-- neural_speed/convert/convert_opt.py | 2 +- neural_speed/convert/convert_phi.py | 7 +++--- .../convert/convert_quantized_gptj.py | 6 ++--- .../convert/convert_quantized_llama.py | 4 ++-- .../convert/convert_quantized_mistral.py | 4 ++-- neural_speed/convert/convert_whisper.py | 2 +- neural_speed/core/README.md | 1 - neural_speed/models/requirements/common.txt | 16 +++++++------- requirements.txt | 22 +++++++++---------- scripts/huggingface.py | 4 ++-- tests/model-test/run_tp.sh | 14 ++++++++++++ tests/requirements.txt | 2 +- tests/test_python_api.py | 4 ++-- 33 files changed, 74 insertions(+), 67 deletions(-) diff --git a/.github/workflows/copyright_check.yml b/.github/workflows/copyright_check.yml index 2b9644a60..f45d1e3ad 100644 --- a/.github/workflows/copyright_check.yml +++ b/.github/workflows/copyright_check.yml @@ -26,7 +26,7 @@ jobs: job_name: ["copyright"] fail-fast: false steps: - + - name: Checkout out Repo uses: actions/checkout@v3 @@ -40,7 +40,7 @@ jobs: git --no-pager diff --name-only remotes/origin/${{ github.base_ref }} ${{ github.workspace }}/neural_speed> ${{ env.CODE_SCAN_LOG_PATH }}/diff.log files=$(cat ${{ env.CODE_SCAN_LOG_PATH }}/diff.log | awk '!a[$0]++') $LIGHT_PURPLE && echo " ----------------- checking ... 
--------------------------" && $RESET - if [[ -f ${{ env.CODE_SCAN_LOG_PATH }}/copyright_issue_summary.log ]]; then + if [[ -f ${{ env.CODE_SCAN_LOG_PATH }}/copyright_issue_summary.log ]]; then rm -f ${{ env.CODE_SCAN_LOG_PATH }}/copyright_issue_summary.log fi for file in ${files} @@ -57,7 +57,7 @@ jobs: $LIGHT_PURPLE && echo "Skipping ${file}" && $RESET fi done - if [[ -f ${{ env.CODE_SCAN_LOG_PATH }}/copyright_issue_summary.log ]]; then + if [[ -f ${{ env.CODE_SCAN_LOG_PATH }}/copyright_issue_summary.log ]]; then $BOLD_YELLOW && echo " ----------------- Current log file output start --------------------------" cat ${{ env.CODE_SCAN_LOG_PATH }}/copyright_issue_summary.log $BOLD_YELLOW && echo " ----------------- Current log file output end --------------------------" && $RESET diff --git a/.github/workflows/cpp-graph-test.yml b/.github/workflows/cpp-graph-test.yml index 19b1516fa..7bd0b2940 100644 --- a/.github/workflows/cpp-graph-test.yml +++ b/.github/workflows/cpp-graph-test.yml @@ -59,7 +59,7 @@ jobs: run: | cd ${{ github.workspace }}/.github/workflows/scripts/models bash cpp_graph_inference.sh cpp-graph-test-neural-speed ${{ matrix.modelName }} ${{ env.INPUT_COMPILER_VERSION }} - + - name: Rename summary run: | cd ${{ github.workspace }} @@ -93,7 +93,7 @@ jobs: uses: actions/download-artifact@v3 with: path: ${{ env.OUT_SCRIPT_PATH }}/generated/log - + - name: Merge CPP Graph Summary Log run: | cd ${{ env.OUT_SCRIPT_PATH }}/generated/log/cpp_graph diff --git a/.github/workflows/scripts/formatScan/nlp_dict.txt b/.github/workflows/scripts/formatScan/nlp_dict.txt index 8b1378917..e69de29bb 100644 --- a/.github/workflows/scripts/formatScan/nlp_dict.txt +++ b/.github/workflows/scripts/formatScan/nlp_dict.txt @@ -1 +0,0 @@ - diff --git a/.github/workflows/scripts/formatScan/pyspelling_conf.yaml b/.github/workflows/scripts/formatScan/pyspelling_conf.yaml index 6fb64f3f0..17dc81c19 100644 --- a/.github/workflows/scripts/formatScan/pyspelling_conf.yaml +++ b/.github/workflows/scripts/formatScan/pyspelling_conf.yaml @@ -10,4 +10,4 @@ matrix: output: ${VAL_REPO}/nlp_dict.dic sources: - ${SCAN_REPO}/docs/* - - ${SCAN_REPO}/*.md \ No newline at end of file + - ${SCAN_REPO}/*.md diff --git a/.github/workflows/scripts/models/calculate_percertiles.py b/.github/workflows/scripts/models/calculate_percertiles.py index b79c6c6df..f54d0970b 100644 --- a/.github/workflows/scripts/models/calculate_percertiles.py +++ b/.github/workflows/scripts/models/calculate_percertiles.py @@ -51,7 +51,7 @@ def parse_memory_file(memory_file): p99 = calculate_percentile(predictions, 99) latency_mean = calculate_mean(predictions[1:]) total_latency = np.sum(predictions) - + print("P90: {:.2f} ms".format(p90)) print("P99: {:.2f} ms".format(p99)) print("average_latency: {:.2f} ms".format(latency_mean)) diff --git a/.github/workflows/unit-test-bestla.yml b/.github/workflows/unit-test-bestla.yml index 0a80f5cc0..6d5dd8be8 100644 --- a/.github/workflows/unit-test-bestla.yml +++ b/.github/workflows/unit-test-bestla.yml @@ -33,7 +33,7 @@ jobs: with: submodules: "recursive" fetch-tags: true - + - name: Env build run: | echo "do not need conda env" @@ -42,7 +42,7 @@ jobs: #if [[ "${{ env.INPUT_COMPILER_VERSION }}" != "11.4.1" ]]; then # conda install --update-deps -c conda-forge gxx==${{ env.INPUT_COMPILER_VERSION }} gcc==${{ env.INPUT_COMPILER_VERSION }} gxx_linux-64==${{ env.INPUT_COMPILER_VERSION }} libstdcxx-ng sysroot_linux-64 -y #fi - + - name: Run UT run: | source /opt/rh/gcc-toolset-12/enable diff --git 
a/.github/workflows/unit-test-llmruntime.yml b/.github/workflows/unit-test-llmruntime.yml index 974edff44..e5d0e0b6b 100644 --- a/.github/workflows/unit-test-llmruntime.yml +++ b/.github/workflows/unit-test-llmruntime.yml @@ -28,7 +28,7 @@ jobs: steps: - name: Load environment variables run: cat ~/actions-runner3/.env >> $GITHUB_ENV - + - name: Docker Clean Up run: | docker ps -a diff --git a/README.md b/README.md index 347a666c6..bec1e8383 100644 --- a/README.md +++ b/README.md @@ -143,4 +143,3 @@ Available modes: ## Enable New Model You can consider adding your own models, please follow the document: [graph developer document](./developer_document.md). - diff --git a/developer_document.md b/developer_document.md index d06f3eaf5..ffa34083d 100644 --- a/developer_document.md +++ b/developer_document.md @@ -426,4 +426,3 @@ We can improve the performance by fusion the FFN process. - [FFN-Fusion example](https://github.com/intel/intel-extension-for-transformers/pull/160) # 4. A complete example - [Enable baichuan](https://github.com/intel/intel-extension-for-transformers/pull/376) - diff --git a/docs/install.md b/docs/install.md index 874913bd1..e270f6174 100644 --- a/docs/install.md +++ b/docs/install.md @@ -25,4 +25,3 @@ cd build cmake .. cmake --build . -j --config Release ``` - diff --git a/docs/tensor_parallelism.md b/docs/tensor_parallelism.md index 9c95e4204..17e84b37d 100644 --- a/docs/tensor_parallelism.md +++ b/docs/tensor_parallelism.md @@ -112,4 +112,3 @@ mpirun -n 1 taskset -c 0-47 sh run.sh : -n 1 taskset -c 48-95 sh run.sh ``` **NOTICE**: tensor parallelsim strategy will split the model to specific node/socket, each device already use part of the original weights differently. So we should not use shared-memory of weights to avoid cross numa weight movement. Use option `--no-mmap` to disable shared weights between processes. - diff --git a/neural_speed/__init__.py b/neural_speed/__init__.py index afc1a6100..c73dfe2f4 100644 --- a/neural_speed/__init__.py +++ b/neural_speed/__init__.py @@ -113,7 +113,7 @@ def init(self, model_name, use_quant=True, use_gptq=False, use_awq=False, self.bin_file = fp32_bin else: self.bin_file = quant_bin - + if os.path.exists(self.bin_file): print("{} existed, will use cache file. Otherwise please remove the file". 
format(self.bin_file)) @@ -122,7 +122,7 @@ def init(self, model_name, use_quant=True, use_gptq=False, use_awq=False, if use_gptq or use_awq: convert_model(model_name, quant_bin, "f32") return - + if not os.path.exists(fp32_bin): convert_model(model_name, fp32_bin, "f32") assert os.path.exists(fp32_bin), "Fail to convert pytorch model" @@ -142,7 +142,7 @@ def init_from_bin(self, model_type, model_path, **generate_kwargs): self.__import_package(model_type) self.model = self.module.Model() if self.max_request_num == -1: - self.max_request_num = max(generate_kwargs.get("max_request_num", + self.max_request_num = max(generate_kwargs.get("max_request_num", max_request_num_default), generate_kwargs.get("batch_size", 1)) if "threads" not in generate_kwargs: threads = os.getenv("OMP_NUM_THREADS") @@ -174,7 +174,7 @@ def generate(self, input_ids, streamer=None, interactive=False, ignore_prompt=Fa reinit_from_bin = True if self.max_request_num > 0: print("Will start to reinit model from bin due to different max request num.") - self.max_request_num = max(input_bs, max_request_num) + self.max_request_num = max(input_bs, max_request_num) if self.model is None or reinit_from_bin: self.init_from_bin(self.model_type, self.bin_file, batch_size=input_bs, diff --git a/neural_speed/convert/common.py b/neural_speed/convert/common.py index d0fe62285..093d5cf89 100644 --- a/neural_speed/convert/common.py +++ b/neural_speed/convert/common.py @@ -401,7 +401,7 @@ def convert_q4_f32_tensor(src_name, dst_name, model, fout, q_config, n_head, n_h g_idx = torch.tensor([i // q_config["group_size"] for i in range(infeatures)], dtype=torch.int32) scale_zeros = gptq_zeros * gptq_scales weight = (gptq_scales[g_idx.long()] * weight - scale_zeros[g_idx.long()]) - + weight = weight.t() weight = weight.float() if permute_func: diff --git a/neural_speed/convert/convert-hf-to-gguf.py b/neural_speed/convert/convert-hf-to-gguf.py index 2d9a89837..3fbe47b9e 100755 --- a/neural_speed/convert/convert-hf-to-gguf.py +++ b/neural_speed/convert/convert-hf-to-gguf.py @@ -369,9 +369,9 @@ def write_tensors(self): data = data_torch.squeeze().numpy() # Map bloom-style qkv_linear to gpt-style qkv_linear - # bloom: + # bloom: # github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 - # gpt-2: + # gpt-2: # github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed)) diff --git a/neural_speed/convert/convert_dolly.py b/neural_speed/convert/convert_dolly.py index dc77b1c43..61a0bb0ac 100644 --- a/neural_speed/convert/convert_dolly.py +++ b/neural_speed/convert/convert_dolly.py @@ -116,7 +116,7 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base fout.write(struct.pack("f", 1.0)) # rope_factor - + fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1)) fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2)) fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1)) diff --git a/neural_speed/convert/convert_falcon.py b/neural_speed/convert/convert_falcon.py index 9d323f89d..c4a92222b 100644 --- a/neural_speed/convert/convert_falcon.py +++ b/neural_speed/convert/convert_falcon.py 
@@ -110,7 +110,7 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base fout.write(struct.pack("f", 1.0)) # rope_factor - + fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1)) fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2)) fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1)) diff --git a/neural_speed/convert/convert_gptj.py b/neural_speed/convert/convert_gptj.py index 2f6c8e673..a610032ea 100644 --- a/neural_speed/convert/convert_gptj.py +++ b/neural_speed/convert/convert_gptj.py @@ -102,7 +102,7 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base fout.write(struct.pack("f", 1.0)) # rope_factor - + fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1)) fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2)) fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1)) diff --git a/neural_speed/convert/convert_gptneox.py b/neural_speed/convert/convert_gptneox.py index 409cc05ba..8c50c006b 100644 --- a/neural_speed/convert/convert_gptneox.py +++ b/neural_speed/convert/convert_gptneox.py @@ -116,7 +116,7 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base fout.write(struct.pack("f", 1.0)) # rope_factor - + fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1)) fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2)) fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1)) diff --git a/neural_speed/convert/convert_llama.py b/neural_speed/convert/convert_llama.py index 9dae31bd8..ca638da5a 100644 --- a/neural_speed/convert/convert_llama.py +++ b/neural_speed/convert/convert_llama.py @@ -285,7 +285,7 @@ def __init__(self, fname_tokenizer: Path, params_vocab_size: int, fname_added_to def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]: tokenizer = self.sentencepiece_tokenizer for i in range(self.params_vocab_size): - text: bytes + text: bytes if i < tokenizer.vocab_size(): if tokenizer.is_unknown(i): text = " \u2047 ".encode("utf-8") diff --git a/neural_speed/convert/convert_mistral.py b/neural_speed/convert/convert_mistral.py index 71a195fcc..be26fd90f 100644 --- a/neural_speed/convert/convert_mistral.py +++ b/neural_speed/convert/convert_mistral.py @@ -1067,8 +1067,8 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None: self.fout.write( struct.pack("i", 1) - ) - # TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json + ) + # TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json # but bos_token_id = 1 in llama.cpp self.fout.write(struct.pack("i", 2)) diff --git a/neural_speed/convert/convert_opt.py b/neural_speed/convert/convert_opt.py index 4f487f68c..ab26bc538 100644 --- a/neural_speed/convert/convert_opt.py +++ b/neural_speed/convert/convert_opt.py @@ -109,7 +109,7 @@ def main(args_in: 
Optional[List[str]] = None) -> None: fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base fout.write(struct.pack("f", 1.0)) # rope_factor - + fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1)) fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2)) fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1)) diff --git a/neural_speed/convert/convert_phi.py b/neural_speed/convert/convert_phi.py index a4c62c89d..f74fdf5d1 100644 --- a/neural_speed/convert/convert_phi.py +++ b/neural_speed/convert/convert_phi.py @@ -50,7 +50,7 @@ def bytes_to_unicode(): n += 1 cs = [chr(n) for n in cs] return dict(zip(bs, cs)) - + def phi_convert_gguf(model, tokenizer, dir_model, fname_out, ftype, hparams): print("phi.gguf converting: ") list_vars = model.state_dict() @@ -257,7 +257,7 @@ def phi_convert(model, tokenizer, dir_model, fname_out, ftype, hparams): print("Done. Output file: " + fname_out) print("") - + def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") @@ -288,9 +288,8 @@ def main(args_in: Optional[List[str]] = None) -> None: phi_convert_gguf(model, tokenizer, dir_model, fname_out, ftype, hparams) else: phi_convert(model, tokenizer, dir_model, fname_out, ftype, hparams) - + if __name__ == '__main__': main() - diff --git a/neural_speed/convert/convert_quantized_gptj.py b/neural_speed/convert/convert_quantized_gptj.py index 829445707..4e6b18578 100644 --- a/neural_speed/convert/convert_quantized_gptj.py +++ b/neural_speed/convert/convert_quantized_gptj.py @@ -99,7 +99,7 @@ def convert_to_qx_bestla_tensor(src_name, dst_name, model, fout, q_config): print(f"converting {dst_name} qauntized tensor to bestla q4 block") -def main(args_in: Optional[List[str]] = None) -> None: +def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") @@ -143,7 +143,7 @@ def main(args_in: Optional[List[str]] = None) -> None: fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps fout.write(struct.pack("f", 10000.0)) # freq_base fout.write(struct.pack("f", 1.0)) # rope_factor - + fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1)) fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2)) fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1)) @@ -183,7 +183,7 @@ def main(args_in: Optional[List[str]] = None) -> None: f"transformer.h.{i}.attn.k_proj.weight", list_vars, fout, quantize_config) convert_to_qx_bestla_tensor(f"transformer.h.{i}.attn.v_proj.weight", f"transformer.h.{i}.attn.v_proj.weight", list_vars, fout, quantize_config) - + convert_to_qx_bestla_tensor(f"transformer.h.{i}.attn.out_proj.weight", f"transformer.h.{i}.attn.out_proj.weight", list_vars, fout, quantize_config) convert_to_qx_bestla_tensor(f"transformer.h.{i}.mlp.fc_in.weight", diff --git a/neural_speed/convert/convert_quantized_llama.py 
b/neural_speed/convert/convert_quantized_llama.py index 4733d6e7f..7ea173ece 100644 --- a/neural_speed/convert/convert_quantized_llama.py +++ b/neural_speed/convert/convert_quantized_llama.py @@ -94,7 +94,7 @@ def convert_q4_bestla_tensor(src_name, dst_name, model, fout, q_config, n_head, dst.flatten()[:byte_size].tofile(fout) print(f"converting {dst_name} qauntized tensor to bestla q4 block") -def main(args_in: Optional[List[str]] = None) -> None: +def main(args_in: Optional[List[str]] = None) -> None: parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file") parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") @@ -153,7 +153,7 @@ def main(args_in: Optional[List[str]] = None) -> None: # TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json # but bos_token_id = 1 in llama.cpp - f.write(struct.pack("i", 1)) + f.write(struct.pack("i", 1)) f.write(struct.pack("i", 2)) f.write(struct.pack("i", 0)) diff --git a/neural_speed/convert/convert_quantized_mistral.py b/neural_speed/convert/convert_quantized_mistral.py index bb2f97e0d..b5b4881aa 100644 --- a/neural_speed/convert/convert_quantized_mistral.py +++ b/neural_speed/convert/convert_quantized_mistral.py @@ -42,7 +42,7 @@ def main(args_in: Optional[List[str]] = None) -> None: model, config, quantize_config = load_quantized_model(model_path) f = open(out_path, "wb") - + # 1. write hparams n_vocab = config["vocab_size"] n_embd = config["hidden_size"] @@ -87,7 +87,7 @@ def main(args_in: Optional[List[str]] = None) -> None: f.write(struct.pack("f", config["rope_theta"] if "rope_theta" in config else 10000)) f.write(struct.pack("f", rope_scale)) - # TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json + # TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json # but bos_token_id = 1 in llama.cpp f.write(struct.pack("i", 1)) f.write(struct.pack("i", 2)) diff --git a/neural_speed/convert/convert_whisper.py b/neural_speed/convert/convert_whisper.py index b41debbe6..70100a38a 100644 --- a/neural_speed/convert/convert_whisper.py +++ b/neural_speed/convert/convert_whisper.py @@ -240,4 +240,4 @@ def main(args_in: Optional[List[str]] = None) -> None: if __name__ == "__main__": main() - \ No newline at end of file + diff --git a/neural_speed/core/README.md b/neural_speed/core/README.md index 3aea65b04..ee7548e3c 100644 --- a/neural_speed/core/README.md +++ b/neural_speed/core/README.md @@ -73,4 +73,3 @@ Ice Lake
<br>Cascade Lake<br>Cooper Lake<br>Tiger Lake<br>Rocket Lake | any int4< Skylake | any 4bits<br>group size=-1<br>compute type=fp32 | AVX512F Alder Lake (12th Gen)<br>Raptor Lake (13th and 14th Gen)|any 4bits<br>group size=-1<br>compute type=int8 | AVX_VNNI Older architecture (before 12th Gen)| any 4bits<br>group size=-1<br>
compute type=fp32 | AVX2 - diff --git a/neural_speed/models/requirements/common.txt b/neural_speed/models/requirements/common.txt index a6621ec05..4c66a05a5 100644 --- a/neural_speed/models/requirements/common.txt +++ b/neural_speed/models/requirements/common.txt @@ -1,13 +1,13 @@ --extra-index-url https://download.pytorch.org/whl/cpu -torch==2.1.0+cpu -transformers -numpy -sentencepiece -protobuf<3.20 -einops accelerate -peft datasets -transformers_stream_generator +einops gguf +numpy +peft +protobuf<3.20 +sentencepiece tiktoken +torch==2.1.0+cpu +transformers +transformers_stream_generator diff --git a/requirements.txt b/requirements.txt index 0f37048f8..eca40e698 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,16 +1,16 @@ --extra-index-url https://download.pytorch.org/whl/cpu -torch==2.1.0+cpu -transformers -numpy -sentencepiece -protobuf<3.20 -einops accelerate -peft +cmake datasets -transformers_stream_generator -tiktoken -py-cpuinfo +einops gguf -cmake +numpy +peft +protobuf<3.20 +py-cpuinfo +sentencepiece setuptools>=61 +tiktoken +torch==2.1.0+cpu +transformers +transformers_stream_generator diff --git a/scripts/huggingface.py b/scripts/huggingface.py index a9319aa96..16ceb55b6 100644 --- a/scripts/huggingface.py +++ b/scripts/huggingface.py @@ -351,7 +351,7 @@ def _create_auto_model( load_in_8bit=load_in_8bit, trust_remote_code=trust_remote_code, torch_dtype=torch_dtype - ) + ) else: if load_in_4bit: assert ( @@ -468,7 +468,7 @@ def add_special_tokens(self) -> bool: elif self.model_format == "runtime": return True elif self.AUTO_MODEL_CLASS is transformers.AutoModelForCausalLM: - return False + return False elif self.AUTO_MODEL_CLASS is transformers.AutoModel: return False elif self.AUTO_MODEL_CLASS is transformers.AutoModelForSeq2SeqLM: diff --git a/tests/model-test/run_tp.sh b/tests/model-test/run_tp.sh index 1a99d1d40..5f907aef5 100644 --- a/tests/model-test/run_tp.sh +++ b/tests/model-test/run_tp.sh @@ -1,4 +1,18 @@ #!/bin/bash +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + set -eo pipefail set -x diff --git a/tests/requirements.txt b/tests/requirements.txt index a2d6abf48..ade85af11 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -1,3 +1,3 @@ +gguf optimum==1.13.2 optimum-intel==1.11.0 -gguf \ No newline at end of file diff --git a/tests/test_python_api.py b/tests/test_python_api.py index 56f3041e9..b90169009 100644 --- a/tests/test_python_api.py +++ b/tests/test_python_api.py @@ -47,7 +47,7 @@ def test_llm_runtime(self): tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) inputs = tokenizer(prompt, return_tensors="pt") - + pt_logits = torch.load("/tf_dataset2/inc-ut/nlptoolkit_ut_model/llama2_pt_logits.pth")[:,-1] pt_generate_ids = torch.load("/tf_dataset2/inc-ut/nlptoolkit_ut_model/llama2_pt_generate_ids.pth")[0].tolist() print(tokenizer.decode(pt_generate_ids)) @@ -117,4 +117,4 @@ def test_beam_search(self): if __name__ == "__main__": unittest.main() - +