This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

Commit

Merge branch 'add_pre_commit' of https://github.com/intel/neural-speed into add_pre_commit

VincyZhang committed Feb 4, 2024
2 parents 9c0e728 + a145f4f commit 929ef46
Showing 33 changed files with 77 additions and 67 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/copyright_check.yml
@@ -26,7 +26,7 @@ jobs:
job_name: ["copyright"]
fail-fast: false
steps:

- name: Checkout out Repo
uses: actions/checkout@v3

@@ -40,7 +40,7 @@ jobs:
git --no-pager diff --name-only remotes/origin/${{ github.base_ref }} ${{ github.workspace }}/neural_speed> ${{ env.CODE_SCAN_LOG_PATH }}/diff.log
files=$(cat ${{ env.CODE_SCAN_LOG_PATH }}/diff.log | awk '!a[$0]++')
$LIGHT_PURPLE && echo " ----------------- checking ... --------------------------" && $RESET
if [[ -f ${{ env.CODE_SCAN_LOG_PATH }}/copyright_issue_summary.log ]]; then
if [[ -f ${{ env.CODE_SCAN_LOG_PATH }}/copyright_issue_summary.log ]]; then
rm -f ${{ env.CODE_SCAN_LOG_PATH }}/copyright_issue_summary.log
fi
for file in ${files}
@@ -57,7 +57,7 @@ jobs:
$LIGHT_PURPLE && echo "Skipping ${file}" && $RESET
fi
done
if [[ -f ${{ env.CODE_SCAN_LOG_PATH }}/copyright_issue_summary.log ]]; then
if [[ -f ${{ env.CODE_SCAN_LOG_PATH }}/copyright_issue_summary.log ]]; then
$BOLD_YELLOW && echo " ----------------- Current log file output start --------------------------"
cat ${{ env.CODE_SCAN_LOG_PATH }}/copyright_issue_summary.log
$BOLD_YELLOW && echo " ----------------- Current log file output end --------------------------" && $RESET
4 changes: 2 additions & 2 deletions .github/workflows/cpp-graph-test.yml
@@ -59,7 +59,7 @@ jobs:
run: |
cd ${{ github.workspace }}/.github/workflows/scripts/models
bash cpp_graph_inference.sh cpp-graph-test-neural-speed ${{ matrix.modelName }} ${{ env.INPUT_COMPILER_VERSION }}
- name: Rename summary
run: |
cd ${{ github.workspace }}
@@ -93,7 +93,7 @@ jobs:
uses: actions/download-artifact@v3
with:
path: ${{ env.OUT_SCRIPT_PATH }}/generated/log

- name: Merge CPP Graph Summary Log
run: |
cd ${{ env.OUT_SCRIPT_PATH }}/generated/log/cpp_graph
4 changes: 3 additions & 1 deletion .github/workflows/scripts/formatScan/nlp_dict.txt
@@ -12,12 +12,14 @@ haa
inout
mata
matc
mone
nd
ore
ot
parm
ques
rouge
ser
sie
te
tye
@@ -28,4 +30,4 @@ mone
iterm
tne
aadd
endianess
endianess
2 changes: 1 addition & 1 deletion .github/workflows/scripts/formatScan/pyspelling_conf.yaml
@@ -10,4 +10,4 @@ matrix:
output: ${VAL_REPO}/nlp_dict.dic
sources:
- ${SCAN_REPO}/docs/*
- ${SCAN_REPO}/*.md
- ${SCAN_REPO}/*.md
2 changes: 1 addition & 1 deletion .github/workflows/scripts/models/calculate_percertiles.py
@@ -51,7 +51,7 @@ def parse_memory_file(memory_file):
p99 = calculate_percentile(predictions, 99)
latency_mean = calculate_mean(predictions[1:])
total_latency = np.sum(predictions)

print("P90: {:.2f} ms".format(p90))
print("P99: {:.2f} ms".format(p99))
print("average_latency: {:.2f} ms".format(latency_mean))
4 changes: 2 additions & 2 deletions .github/workflows/unit-test-bestla.yml
@@ -33,7 +33,7 @@ jobs:
with:
submodules: "recursive"
fetch-tags: true

- name: Env build
run: |
echo "do not need conda env"
@@ -42,7 +42,7 @@ jobs:
#if [[ "${{ env.INPUT_COMPILER_VERSION }}" != "11.4.1" ]]; then
# conda install --update-deps -c conda-forge gxx==${{ env.INPUT_COMPILER_VERSION }} gcc==${{ env.INPUT_COMPILER_VERSION }} gxx_linux-64==${{ env.INPUT_COMPILER_VERSION }} libstdcxx-ng sysroot_linux-64 -y
#fi
- name: Run UT
run: |
source /opt/rh/gcc-toolset-12/enable
2 changes: 1 addition & 1 deletion .github/workflows/unit-test-llmruntime.yml
@@ -28,7 +28,7 @@ jobs:
steps:
- name: Load environment variables
run: cat ~/actions-runner3/.env >> $GITHUB_ENV

- name: Docker Clean Up
run: |
docker ps -a
1 change: 0 additions & 1 deletion README.md
@@ -143,4 +143,3 @@ Available modes:

## Enable New Model
You can consider adding your own models, please follow the document: [graph developer document](./developer_document.md).

1 change: 0 additions & 1 deletion developer_document.md
@@ -426,4 +426,3 @@ We can improve the performance by fusion the FFN process.
- [FFN-Fusion example](https://github.com/intel/intel-extension-for-transformers/pull/160)
# 4. A complete example
- [Enable baichuan](https://github.com/intel/intel-extension-for-transformers/pull/376)

1 change: 0 additions & 1 deletion docs/install.md
@@ -25,4 +25,3 @@ cd build
cmake ..
cmake --build . -j --config Release
```

1 change: 0 additions & 1 deletion docs/tensor_parallelism.md
@@ -112,4 +112,3 @@ mpirun -n 1 taskset -c 0-47 sh run.sh : -n 1 taskset -c 48-95 sh run.sh

```
**NOTICE**: tensor parallelsim strategy will split the model to specific node/socket, each device already use part of the original weights differently. So we should not use shared-memory of weights to avoid cross numa weight movement. Use option `--no-mmap` to disable shared weights between processes.

8 changes: 4 additions & 4 deletions neural_speed/__init__.py
@@ -113,7 +113,7 @@ def init(self, model_name, use_quant=True, use_gptq=False, use_awq=False,
self.bin_file = fp32_bin
else:
self.bin_file = quant_bin

if os.path.exists(self.bin_file):
print("{} existed, will use cache file. Otherwise please remove the file".
format(self.bin_file))
@@ -122,7 +122,7 @@ def init(self, model_name, use_quant=True, use_gptq=False, use_awq=False,
if use_gptq or use_awq:
convert_model(model_name, quant_bin, "f32")
return

if not os.path.exists(fp32_bin):
convert_model(model_name, fp32_bin, "f32")
assert os.path.exists(fp32_bin), "Fail to convert pytorch model"
@@ -142,7 +142,7 @@ def init_from_bin(self, model_type, model_path, **generate_kwargs):
self.__import_package(model_type)
self.model = self.module.Model()
if self.max_request_num == -1:
self.max_request_num = max(generate_kwargs.get("max_request_num",
self.max_request_num = max(generate_kwargs.get("max_request_num",
max_request_num_default), generate_kwargs.get("batch_size", 1))
if "threads" not in generate_kwargs:
threads = os.getenv("OMP_NUM_THREADS")
@@ -174,7 +174,7 @@ def generate(self, input_ids, streamer=None, interactive=False, ignore_prompt=Fa
reinit_from_bin = True
if self.max_request_num > 0:
print("Will start to reinit model from bin due to different max request num.")
self.max_request_num = max(input_bs, max_request_num)
self.max_request_num = max(input_bs, max_request_num)

if self.model is None or reinit_from_bin:
self.init_from_bin(self.model_type, self.bin_file, batch_size=input_bs,
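For context, a minimal usage sketch built only from the signatures visible in this file's hunks (`init()`, `init_from_bin()`, `generate()`); the class name `Model`, the model id, and the tokenizer step are assumptions, not taken from this commit:

```python
from transformers import AutoTokenizer   # assumption: HF tokenizer provides input_ids
from neural_speed import Model            # assumption: the class defined in this file

model_name = "EleutherAI/gpt-j-6b"        # placeholder HF model id
tokenizer = AutoTokenizer.from_pretrained(model_name)
input_ids = tokenizer("Once upon a time", return_tensors="pt").input_ids

model = Model()
# init() converts/quantizes the checkpoint and caches a .bin file
# (the "existed, will use cache file" branch touched above).
model.init(model_name, use_quant=True)
# generate() lazily (re)loads from that bin; the reinit_from_bin branch above
# re-initializes when the batch size or max_request_num changes.
output_ids = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output_ids[0]))
```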
2 changes: 1 addition & 1 deletion neural_speed/convert/common.py
@@ -401,7 +401,7 @@ def convert_q4_f32_tensor(src_name, dst_name, model, fout, q_config, n_head, n_h
g_idx = torch.tensor([i // q_config["group_size"] for i in range(infeatures)], dtype=torch.int32)
scale_zeros = gptq_zeros * gptq_scales
weight = (gptq_scales[g_idx.long()] * weight - scale_zeros[g_idx.long()])

weight = weight.t()
weight = weight.float()
if permute_func:
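The lines above implement per-group GPTQ dequantization: with group index g = i // group_size, the fp32 weight is scale[g] * (q - zero[g]), which the code expands as scale*q - (zero*scale). A toy illustration with shrunken shapes and random values (not the repo's actual tensors):

```python
import torch

group_size, infeatures, outfeatures = 2, 4, 3
q = torch.randint(0, 16, (infeatures, outfeatures)).float()                         # integer weight codes
gptq_scales = torch.rand(infeatures // group_size, outfeatures)                     # one scale per group
gptq_zeros = torch.randint(0, 16, (infeatures // group_size, outfeatures)).float()  # one zero-point per group

g_idx = torch.tensor([i // group_size for i in range(infeatures)], dtype=torch.int32)
scale_zeros = gptq_zeros * gptq_scales
weight = gptq_scales[g_idx.long()] * q - scale_zeros[g_idx.long()]

# Same thing written as scale[g] * (q - zero[g]):
assert torch.allclose(weight, gptq_scales[g_idx.long()] * (q - gptq_zeros[g_idx.long()]))
```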
4 changes: 2 additions & 2 deletions neural_speed/convert/convert-hf-to-gguf.py
@@ -369,9 +369,9 @@ def write_tensors(self):
data = data_torch.squeeze().numpy()

# Map bloom-style qkv_linear to gpt-style qkv_linear
# bloom:
# bloom:
# github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252
# gpt-2:
# gpt-2:
# github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312
if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
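The reshape above splits bloom's fused QKV weight into per-head [Q, K, V] slices; the lines that follow (outside this hunk) presumably regroup them into the contiguous Q|K|V layout that GPT-2 style code expects. A small numpy sketch of that idea, as an illustration rather than the file's exact code:

```python
import numpy as np

n_head, n_embed = 4, 16
head_dim = n_embed // n_head
data = np.random.rand(3 * n_embed, n_embed).astype(np.float32)  # bloom fused qkv weight

# bloom interleaves q/k/v per head; split them out ...
qkv_weights = data.reshape((n_head, 3, head_dim, n_embed))
# ... and regroup into contiguous Q, K, V blocks (gpt-2 style layout).
q = qkv_weights[:, 0, :, :].reshape((n_embed, n_embed))
k = qkv_weights[:, 1, :, :].reshape((n_embed, n_embed))
v = qkv_weights[:, 2, :, :].reshape((n_embed, n_embed))
gpt_style_qkv = np.concatenate((q, k, v), axis=0)  # shape: (3 * n_embed, n_embed)
```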
2 changes: 1 addition & 1 deletion neural_speed/convert/convert_dolly.py
@@ -116,7 +116,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor

fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
2 changes: 1 addition & 1 deletion neural_speed/convert/convert_falcon.py
@@ -110,7 +110,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor

fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
2 changes: 1 addition & 1 deletion neural_speed/convert/convert_gptj.py
@@ -102,7 +102,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor

fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
2 changes: 1 addition & 1 deletion neural_speed/convert/convert_gptneox.py
@@ -116,7 +116,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor

fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
2 changes: 1 addition & 1 deletion neural_speed/convert/convert_llama.py
@@ -285,7 +285,7 @@ def __init__(self, fname_tokenizer: Path, params_vocab_size: int, fname_added_to
def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float]]:
tokenizer = self.sentencepiece_tokenizer
for i in range(self.params_vocab_size):
text: bytes
text: bytes
if i < tokenizer.vocab_size():
if tokenizer.is_unknown(i):
text = " \u2047 ".encode("utf-8")
4 changes: 2 additions & 2 deletions neural_speed/convert/convert_mistral.py
@@ -1067,8 +1067,8 @@ def write_file_header(self, params: Params, file_type: NEFileType) -> None:

self.fout.write(
struct.pack("i", 1)
)
# TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json
)
# TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json
# but bos_token_id = 1 in llama.cpp
self.fout.write(struct.pack("i", 2))

2 changes: 1 addition & 1 deletion neural_speed/convert/convert_opt.py
@@ -109,7 +109,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor

fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
7 changes: 3 additions & 4 deletions neural_speed/convert/convert_phi.py
@@ -50,7 +50,7 @@ def bytes_to_unicode():
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))

def phi_convert_gguf(model, tokenizer, dir_model, fname_out, ftype, hparams):
print("phi.gguf converting: ")
list_vars = model.state_dict()
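`bytes_to_unicode()` (its tail is shown at the top of this hunk) is the standard GPT-2 byte-to-printable-unicode map used when writing BPE vocab files. For reference, the widely used implementation plus a round-trip check, reproduced from the common GPT-2 tokenizer code, so it may differ trivially from this file:

```python
def bytes_to_unicode():
    # Map every byte 0..255 to a printable unicode character so BPE merges
    # and vocab entries never contain raw control bytes or whitespace.
    bs = (list(range(ord("!"), ord("~") + 1)) +
          list(range(ord("\u00a1"), ord("\u00ac") + 1)) +
          list(range(ord("\u00ae"), ord("\u00ff") + 1)))
    cs = bs[:]
    n = 0
    for b in range(256):
        if b not in bs:
            bs.append(b)
            cs.append(256 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))

byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}
raw = "héllo".encode("utf-8")
printable = "".join(byte_encoder[b] for b in raw)
assert bytes(byte_decoder[c] for c in printable) == raw
```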
@@ -257,7 +257,7 @@ def phi_convert(model, tokenizer, dir_model, fname_out, ftype, hparams):

print("Done. Output file: " + fname_out)
print("")

def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file")
parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
@@ -288,9 +288,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
phi_convert_gguf(model, tokenizer, dir_model, fname_out, ftype, hparams)
else:
phi_convert(model, tokenizer, dir_model, fname_out, ftype, hparams)



if __name__ == '__main__':
main()

6 changes: 3 additions & 3 deletions neural_speed/convert/convert_quantized_gptj.py
@@ -99,7 +99,7 @@ def convert_to_qx_bestla_tensor(src_name, dst_name, model, fout, q_config):
print(f"converting {dst_name} qauntized tensor to bestla q4 block")


def main(args_in: Optional[List[str]] = None) -> None:
def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file")
parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
@@ -143,7 +143,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
fout.write(struct.pack("f", hparams.get("rms_norm_eps", 1e-6))) # rms norm eps
fout.write(struct.pack("f", 10000.0)) # freq_base
fout.write(struct.pack("f", 1.0)) # rope_factor

fout.write(struct.pack("i", tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 1))
fout.write(struct.pack("i", tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 2))
fout.write(struct.pack("i", tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -1))
@@ -183,7 +183,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
f"transformer.h.{i}.attn.k_proj.weight", list_vars, fout, quantize_config)
convert_to_qx_bestla_tensor(f"transformer.h.{i}.attn.v_proj.weight",
f"transformer.h.{i}.attn.v_proj.weight", list_vars, fout, quantize_config)

convert_to_qx_bestla_tensor(f"transformer.h.{i}.attn.out_proj.weight",
f"transformer.h.{i}.attn.out_proj.weight", list_vars, fout, quantize_config)
convert_to_qx_bestla_tensor(f"transformer.h.{i}.mlp.fc_in.weight",
4 changes: 2 additions & 2 deletions neural_speed/convert/convert_quantized_llama.py
@@ -94,7 +94,7 @@ def convert_q4_bestla_tensor(src_name, dst_name, model, fout, q_config, n_head,
dst.flatten()[:byte_size].tofile(fout)
print(f"converting {dst_name} qauntized tensor to bestla q4 block")

def main(args_in: Optional[List[str]] = None) -> None:
def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser(description="Convert a model to a NE compatible file")
parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
@@ -153,7 +153,7 @@ def main(args_in: Optional[List[str]] = None) -> None:

# TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json
# but bos_token_id = 1 in llama.cpp
f.write(struct.pack("i", 1))
f.write(struct.pack("i", 1))
f.write(struct.pack("i", 2))

f.write(struct.pack("i", 0))
4 changes: 2 additions & 2 deletions neural_speed/convert/convert_quantized_mistral.py
@@ -42,7 +42,7 @@ def main(args_in: Optional[List[str]] = None) -> None:

model, config, quantize_config = load_quantized_model(model_path)
f = open(out_path, "wb")

# 1. write hparams
n_vocab = config["vocab_size"]
n_embd = config["hidden_size"]
@@ -87,7 +87,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
f.write(struct.pack("f", config["rope_theta"] if "rope_theta" in config else 10000))
f.write(struct.pack("f", rope_scale))

# TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json
# TODO, bos_token_id = 0 in https://huggingface.co/decapoda-research/llama-7b-hf/blob/main/config.json
# but bos_token_id = 1 in llama.cpp
f.write(struct.pack("i", 1))
f.write(struct.pack("i", 2))
2 changes: 1 addition & 1 deletion neural_speed/convert/convert_whisper.py
@@ -240,4 +240,4 @@ def main(args_in: Optional[List[str]] = None) -> None:

if __name__ == "__main__":
main()

1 change: 0 additions & 1 deletion neural_speed/core/README.md
@@ -73,4 +73,3 @@ Ice Lake<br>Cascade Lake<br>Cooper Lake<br>Tiger Lake<br>Rocket Lake | any int4<
Skylake | any 4bits<br>group size=-1<br>compute type=fp32 | AVX512F
Alder Lake (12th Gen)<br>Raptor Lake (13th and 14th Gen)|any 4bits<br>group size=-1<br>compute type=int8 | AVX_VNNI
Older architecture (before 12th Gen)| any 4bits<br>group size=-1<br>compute type=fp32 | AVX2

16 changes: 8 additions & 8 deletions neural_speed/models/requirements/common.txt
@@ -1,13 +1,13 @@
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.1.0+cpu
transformers
numpy
sentencepiece
protobuf<3.20
einops
accelerate
peft
datasets
transformers_stream_generator
einops
gguf
numpy
peft
protobuf<3.20
sentencepiece
tiktoken
torch==2.1.0+cpu
transformers
transformers_stream_generator