This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

add int3 test (#275)
Signed-off-by: intellinjun <[email protected]>
intellinjun authored Jun 3, 2024
1 parent e0e65bd commit cfc40ab
Showing 2 changed files with 23 additions and 2 deletions.
6 changes: 5 additions & 1 deletion .github/workflows/scripts/models/cpp_graph_inference.sh
@@ -26,7 +26,7 @@ function main() {
quant_script="./build/bin/quant_llama"
infer_cmd="./build/bin/run_llama"
input_model="/tf_dataset2/models/pytorch/Meta-Llama-3-8B"
precision_list=("q4_j_b128" "q4_j_b32" "q4_0")
precision_list=("q4_j_b128" "q4_j_b32" "q4_0" "q5_j_i8_pc_asym" "q3_j_i8_b128_asym")
elif [[ "${model}" == "gpt-neox-20b" ]]; then
convert_script="${scripts_dir}/convert_gptneox.py"
quant_script="./build/bin/quant_gptneox"
@@ -129,6 +129,10 @@ function main() {
${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --nthread $cores_per_instance --weight_dtype int4 --group_size 32 --scale_dtype fp32 --compute_dtype fp32 --alg sym
elif [[ ${precision} == "q4_j_b128" ]]; then
${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --nthread $cores_per_instance --weight_dtype int4 --group_size 128 --scale_dtype fp32 --compute_dtype fp32 --alg sym
elif [[ ${precision} == "q3_j_i8_b128_asym" ]]; then
${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --nthread $cores_per_instance --weight_dtype int3 --group_size 128 --scale_dtype fp32 --compute_dtype int8 --alg asym
elif [[ ${precision} == "q5_j_i8_pc_asym" ]]; then
${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --nthread $cores_per_instance --weight_dtype int5 --group_size -1 --scale_dtype fp32 --compute_dtype int8 --alg asym
elif [[ ${precision} == "q4_j_b128_asym" ]]; then
${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --nthread $cores_per_instance --weight_dtype int4 --group_size 128 --scale_dtype fp32 --compute_dtype fp32 --alg asym
elif [[ ${precision} == "q4_0" ]]; then
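For reference, the new q3_j_i8_b128_asym branch above resolves to a command along the following lines once the script's variables are filled in. This is a sketch for illustration only: the working directory, model filenames, and thread count are placeholders rather than values from the CI job; the quantization flags themselves (int3 weights, group size 128, fp32 scales, int8 compute dtype, asymmetric quantization) come straight from the added branch.

# Illustrative expansion of the q3_j_i8_b128_asym branch for the llama model.
# Paths and --nthread are placeholders; the flags mirror the elif branch added in the diff.
./build/bin/quant_llama \
  --model_file /path/to/workdir/llama-fp32.bin \
  --out_file /path/to/workdir/llama-q3_j_i8_b128_asym.bin \
  --nthread 32 \
  --weight_dtype int3 --group_size 128 --scale_dtype fp32 --compute_dtype int8 --alg asym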
19 changes: 18 additions & 1 deletion tests/model-test/cpp_graph_inference.sh
@@ -206,11 +206,16 @@ function main() {
quant_script="./build/bin/quant_llama"
convert_script="${convert_script}/convert_llama.py"
infer_cmd="./build/bin/run_llama"
precision_list+=("q5_j_i8_g128" "q3_j_i8_g128" "q5_j_i8_g128_asym" "q3_j_i8_g128_asym"
"q3_j_i8_pc_asym" "q5_j_i8_pc_asym"
)
elif [[ "${model}" == "gptj-6b" ]]; then
quant_script="./build/bin/quant_gptj"
convert_script="${convert_script}/convert_gptj.py"
infer_cmd="./build/bin/run_gptj"
precision_list+=("q4_j1_i8_g128" "q4_j1_bf16_pc")
precision_list+=("q4_j1_i8_g128" "q4_j1_bf16_pc" "q5_j_i8_g128" "q3_j_i8_g128" "q5_j_i8_g128_asym" "q3_j_i8_g128_asym"
"q3_j_i8_pc_asym" "q5_j_i8_pc_asym"
)
elif [[ "${model}" == "gpt-neox-20b" ]]; then
quant_script="./build/bin/quant_gptneox"
convert_script="${convert_script}/convert_gptneox.py"
@@ -421,6 +426,18 @@ function main() {
eval "$quant_script_prologue --weight_dtype int4 --group_size 32 --compute_dtype int8 --scale_dtype fp32 --alg sym"
elif [[ ${precision} == "q4_j_f32_g128" ]]; then
eval "$quant_script_prologue --weight_dtype int4 --group_size 128 --compute_dtype fp32 --scale_dtype fp32 --alg sym"
+elif [[ ${precision} == "q3_j_i8_g128" ]]; then
+eval "$quant_script_prologue --weight_dtype int3 --group_size 128 --compute_dtype int8 --scale_dtype fp32 --alg sym"
+elif [[ ${precision} == "q5_j_i8_g128" ]]; then
+eval "$quant_script_prologue --weight_dtype int5 --group_size 128 --compute_dtype int8 --scale_dtype fp32 --alg sym"
+elif [[ ${precision} == "q3_j_i8_g128_asym" ]]; then
+eval "$quant_script_prologue --weight_dtype int3 --group_size 128 --compute_dtype int8 --scale_dtype fp32 --alg asym"
+elif [[ ${precision} == "q5_j_i8_g128_asym" ]]; then
+eval "$quant_script_prologue --weight_dtype int5 --group_size 128 --compute_dtype int8 --scale_dtype fp32 --alg asym"
+elif [[ ${precision} == "q3_j_i8_pc_asym" ]]; then
+eval "$quant_script_prologue --weight_dtype int3 --group_size -1 --compute_dtype int8 --scale_dtype fp32 --alg asym"
+elif [[ ${precision} == "q5_j_i8_pc_asym" ]]; then
+eval "$quant_script_prologue --weight_dtype int5 --group_size -1 --compute_dtype int8 --scale_dtype fp32 --alg asym"
elif [[ ${precision} == "q4_j1_i8_g128" ]]; then
eval "$quant_script_prologue --weight_dtype int4 --group_size 128 --compute_dtype int8 --scale_dtype fp32 --alg asym"
elif [[ ${precision} == "q4_j1_bf16_pc" ]]; then
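The precision tags added in both files follow the naming scheme these scripts already use: the leading q3/q4/q5 token selects --weight_dtype int3/int4/int5, i8 selects --compute_dtype int8, g128 (or b128 in the workflow script) selects --group_size 128, pc stands for per-channel scaling via --group_size -1, and a trailing _asym switches --alg from sym to asym. As a rough illustration of that mapping, and not code from the repository (the scripts themselves use the explicit elif branches shown in the diff), a tag could be decoded with a helper like this:

# Hypothetical decoder for the precision tags used above; the name and structure
# are illustrative only, but the flag mapping follows the branches added in this commit.
decode_precision() {
  local tag="$1"
  local flags="--scale_dtype fp32"
  case "$tag" in
    q3_*) flags+=" --weight_dtype int3" ;;
    q4_*) flags+=" --weight_dtype int4" ;;
    q5_*) flags+=" --weight_dtype int5" ;;
  esac
  [[ "$tag" == *_i8_* ]] && flags+=" --compute_dtype int8"
  [[ "$tag" == *128* ]] && flags+=" --group_size 128"                   # g128 / b128 -> group size 128
  [[ "$tag" == *_pc_* || "$tag" == *_pc ]] && flags+=" --group_size -1" # pc -> per-channel scales
  if [[ "$tag" == *_asym ]]; then flags+=" --alg asym"; else flags+=" --alg sym"; fi
  echo "$flags"
}

# Example: decode_precision q3_j_i8_pc_asym
# -> --scale_dtype fp32 --weight_dtype int3 --compute_dtype int8 --group_size -1 --alg asym
# which matches the flags in the q3_j_i8_pc_asym branch added above.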
