This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

Commit 51088a2
update qwen python api (#84)
zhenwei-intel authored Jan 23, 2024
1 parent abcc0f4 commit 51088a2
Showing 3 changed files with 9 additions and 4 deletions.
2 changes: 2 additions & 0 deletions neural_speed/__init__.py
@@ -60,6 +60,8 @@ def __import_package(self, model_type):
             import neural_speed.baichuan_cpp as cpp_model
         elif model_type == "polyglot":
             import neural_speed.polyglot_cpp as cpp_model
+        elif model_type == "qwen":
+            import neural_speed.qwen_cpp as cpp_model
         elif model_type == "mistral":
             import neural_speed.mistral_cpp as cpp_model
         elif model_type == "whisper":
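The elif ladder in __import_package maps a model-type string to its pybind11 extension module; this change registers "qwen" alongside the existing backends. As a sketch only (not the repository's code), the same lookup can be expressed dynamically, relying on the <type>_cpp naming pattern visible above:

import importlib

def import_backend(model_type: str):
    # Hypothetical helper: resolves e.g. "qwen" to neural_speed.qwen_cpp,
    # following the module-naming pattern of the elif chain above.
    return importlib.import_module(f"neural_speed.{model_type}_cpp")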
9 changes: 6 additions & 3 deletions neural_speed/application/main_pybind.cpp
@@ -304,7 +304,8 @@ const std::vector<float>& Model::evaluate_(const std::vector<std::vector<model_t
       fprintf(stderr, "\n%s: Warning: prompt is too long (%d tokens, max %d), will be truncated\n", __func__,
               input_id_cb.size(), n_ctx - 4);
       curr_input_ids[bs].resize(n_ctx - 4);
-      std::copy(input_id_cb.end() - n_ctx - 4, input_id_cb.end(), curr_input_ids[bs].begin());
+      std::copy(input_id_cb.end() - n_ctx - 8, input_id_cb.end(), curr_input_ids[bs].begin() + 4);
+      std::copy(input_id_cb.begin(), input_id_cb.begin() + 4, curr_input_ids[bs].begin());
     } else {  // good input_id_cb and empty curr_input_ids[bs]
       curr_input_ids[bs] = input_id_cb;
     }
@@ -418,7 +419,9 @@ std::vector<std::vector<model_token>> Model::generate_tokens(const std::vector<s
       fprintf(stderr, "\n%s: Warning: prompt is too long (%d tokens, max %d), will be truncated\n", __func__,
               input_ids[STATIC_INPUT_HEAD_IDX].size(), n_ctx - 4);
       curr_input_ids[STATIC_INPUT_HEAD_IDX].resize(n_ctx - 4);
-      std::copy(input_ids[STATIC_INPUT_HEAD_IDX].end() - n_ctx - 4, input_ids[STATIC_INPUT_HEAD_IDX].end(),
-                curr_input_ids[STATIC_INPUT_HEAD_IDX].begin());
+      std::copy(input_ids[STATIC_INPUT_HEAD_IDX].end() - n_ctx - 8, input_ids[STATIC_INPUT_HEAD_IDX].end(),
+                curr_input_ids[STATIC_INPUT_HEAD_IDX].begin() + 4);
+      std::copy(input_ids[STATIC_INPUT_HEAD_IDX].begin(), input_ids[STATIC_INPUT_HEAD_IDX].begin() + 4,
+                curr_input_ids[STATIC_INPUT_HEAD_IDX].begin());
     } else {
       curr_input_ids[STATIC_INPUT_HEAD_IDX] = input_ids[STATIC_INPUT_HEAD_IDX];
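Both evaluate_ and generate_tokens now truncate over-long prompts the same way: the prompt is capped at n_ctx - 4 tokens, keeping the first 4 tokens (presumably so special prefix tokens survive truncation) plus the most recent n_ctx - 8 tokens, instead of the tail alone as before. A minimal Python sketch of that strategy (a hypothetical mirror of the iterator-based C++ above):

def truncate_prompt(ids: list[int], n_ctx: int) -> list[int]:
    # Cap at n_ctx - 4 tokens: the 4 leading tokens plus the last
    # n_ctx - 8 tokens, since 4 + (n_ctx - 8) = n_ctx - 4.
    if len(ids) <= n_ctx - 4:
        return ids
    return ids[:4] + ids[-(n_ctx - 8):]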
@@ -667,7 +670,7 @@ PYBIND11_MODULE(qwen_cpp, m)
   py::class_<Model>(m, "Model", py::module_local())
       .def(py::init())
       .def("init_model", &Model::init_model, "initial model with model path and parameters", py::arg("model_path"),
-           py::arg("max_new_tokens") = -1, py::arg("n_batch") = 512, py::arg("ctx_size") = 512, py::arg("seed") = -1,
+           py::arg("max_new_tokens") = -1, py::arg("n_batch") = 512, py::arg("ctx_size") = 1024, py::arg("seed") = -1,
            py::arg("threads") = 8, py::arg("repetition_penalty") = 1.1f, py::arg("num_beams") = 1,
            py::arg("do_sample") = false, py::arg("top_k") = 40, py::arg("top_p") = 0.95, py::arg("temperature") = 0.8,
            py::arg("min_new_tokens") = 0, py::arg("length_penalty") = 1.0, py::arg("early_stopping") = false,
2 changes: 1 addition & 1 deletion neural_speed/convert/common.py
@@ -33,7 +33,7 @@
 
 GGML_QK4_0_TYPE = 2
 GGML_QK4_1_TYPE = 3
-GGML_QJBLAS_TYPE = 13
+GGML_QJBLAS_TYPE = 19
 
 def quantize_q4_0(tensor: torch.Tensor) -> torch.CharTensor:
     # equivalent to ggml_quantize_q4_0 in ggml.c
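For context, quantize_q4_0 follows ggml's blockwise 4-bit scheme: values are grouped into blocks of 32, each block stores one scale chosen so its largest-magnitude value maps to the 4-bit code 0, and two codes are packed per byte. A rough torch sketch of that scheme under the standard ggml q4_0 layout (an illustration, not this file's exact implementation):

import torch

def q4_0_sketch(tensor: torch.Tensor):
    QK4_0 = 32  # values per quantization block in ggml's q4_0
    blocks = tensor.float().reshape(-1, QK4_0)
    # Per block, find the signed value with the largest magnitude ...
    idx = blocks.abs().argmax(dim=-1, keepdim=True)
    max_vals = torch.gather(blocks, -1, idx)
    # ... and pick the scale so that value lands on code 0 after the +8 offset.
    scales = max_vals / -8.0
    inv = torch.where(scales == 0, torch.zeros_like(scales), 1.0 / scales)
    codes = (blocks * inv + 8.5).clamp(0, 15).to(torch.uint8)
    # Pack two 4-bit codes per byte: first half of each block in the low
    # nibbles, second half in the high nibbles.
    packed = codes[:, : QK4_0 // 2] | (codes[:, QK4_0 // 2:] << 4)
    return scales.half(), packed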
