BabyLlama with CPP backend #2544
src/examples/babyllama/baby_llama_handler.cc
@@ -0,0 +1,269 @@
#include "src/examples/babyllama/baby_llama_handler.hh" | ||
|
||
#include <typeinfo> | ||
|
||
namespace llm { | ||
|
||
std::pair<std::shared_ptr<torch::jit::script::Module>, | ||
std::shared_ptr<torch::Device>> | ||
LlmHandler::LoadModel( | ||
std::shared_ptr<torchserve::LoadModelRequest>& load_model_request) { | ||
try { | ||
auto device = GetTorchDevice(load_model_request); | ||
// Load dummy model | ||
auto module = std::make_shared<torch::jit::script::Module>( | ||
torch::jit::load(fmt::format("{}/{}", load_model_request->model_dir, | ||
manifest_->GetModel().serialized_file), | ||
*device)); | ||
|
||
char checkpoint_path[] = "/home/ubuntu/serve/cpp/stories15M.bin"; | ||
build_transformer(&transformer, checkpoint_path); | ||
|
||
char tokenizer_path[] = | ||
"/home/ubuntu/serve/cpp/src/examples/image_classifier/babyllama/" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Path is hard coded at present -- read from config file There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @shrinath-suresh you can also add the tokenizer.bin as an additional file when creating the mar file and set the filename as load_model_request->model_dir + "tokenizer.bin" There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Updated the code to read the tokenizer and model path from a config file |
||
"tokenizer.bin"; | ||
build_tokenizer(&tokenizer, tokenizer_path, transformer.config.vocab_size); | ||
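Following the suggestion above, a minimal sketch (assuming stories15M.bin and tokenizer.bin are packaged as additional files in the model archive; not necessarily the merged code) would derive both paths from load_model_request->model_dir instead of hard-coding them:

    std::string checkpoint_path =
        fmt::format("{}/stories15M.bin", load_model_request->model_dir);
    std::string tokenizer_path =
        fmt::format("{}/tokenizer.bin", load_model_request->model_dir);
    // run.c's build_* helpers take char*, hence the const_cast
    build_transformer(&transformer, const_cast<char*>(checkpoint_path.c_str()));
    build_tokenizer(&tokenizer, const_cast<char*>(tokenizer_path.c_str()),
                    transformer.config.vocab_size);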
    float temperature =
        1.0f;  // 0.0 = greedy deterministic. 1.0 = original. don't set higher
    float topp = 0.9f;  // top-p in nucleus sampling. 1.0 = off. 0.9 works well,
                        // but slower
    int steps = 256;    // number of steps to run for
    unsigned long long rng_seed = 0;

Review comment: Initializing an RNG with 0 (all bits zero) can be problematic in some cases.
Author reply: Removed the initialization.
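If the hard-coded zero seed is dropped, one simple alternative (mirroring what llama2.c's run.c does when no seed is supplied; shown only as a sketch) is to seed from the wall clock so the sampler never starts from an all-zero state:

    // requires <ctime>; time(nullptr) is non-zero in practice
    unsigned long long rng_seed = static_cast<unsigned int>(time(nullptr));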
    // build the Sampler
    build_sampler(&sampler, transformer.config.vocab_size, temperature, topp,
                  rng_seed);

    return std::make_pair(module, device);
  } catch (const c10::Error& e) {
    TS_LOGF(ERROR, "loading the model: {}, device id: {}, error: {}",
            load_model_request->model_name, load_model_request->gpu_id,
            e.msg());
    throw e;
  } catch (const std::runtime_error& e) {
    TS_LOGF(ERROR, "loading the model: {}, device id: {}, error: {}",
            load_model_request->model_name, load_model_request->gpu_id,
            e.what());
    throw e;
  }
}
std::vector<torch::jit::IValue> LlmHandler::Preprocess(
    std::shared_ptr<torch::Device>& device,
    std::pair<std::string&, std::map<uint8_t, std::string>&>& idx_to_req_id,
    std::shared_ptr<torchserve::InferenceRequestBatch>& request_batch,
    std::shared_ptr<torchserve::InferenceResponseBatch>& response_batch) {
  std::vector<torch::jit::IValue> batch_ivalue;
  std::vector<torch::Tensor> batch_tensors;
  uint8_t idx = 0;
  for (auto& request : *request_batch) {
    try {
      (*response_batch)[request.request_id] =
          std::make_shared<torchserve::InferenceResponse>(request.request_id);
      idx_to_req_id.first += idx_to_req_id.first.empty()
                                 ? request.request_id
                                 : "," + request.request_id;

      auto data_it = request.parameters.find(
          torchserve::PayloadType::kPARAMETER_NAME_DATA);
      auto dtype_it =
          request.headers.find(torchserve::PayloadType::kHEADER_NAME_DATA_TYPE);
      if (data_it == request.parameters.end()) {
        data_it = request.parameters.find(
            torchserve::PayloadType::kPARAMETER_NAME_BODY);
        dtype_it = request.headers.find(
            torchserve::PayloadType::kHEADER_NAME_BODY_TYPE);
      }

      if (data_it == request.parameters.end() ||
          dtype_it == request.headers.end()) {
        TS_LOGF(ERROR, "Empty payload for request id: {}", request.request_id);
        (*response_batch)[request.request_id]->SetResponse(
            500, "data_type", torchserve::PayloadType::kCONTENT_TYPE_TEXT,
            "Empty payload");
        continue;
      }

      std::string msg = torchserve::Converter::VectorToStr(data_it->second);

      char* msgCStr = new char[msg.size() + 1];  // +1 for the null terminator

Review comment: Please use smart pointers when allocating dynamic memory, and prefer new over malloc. […] should work as well.
Author reply: Updated code to use smart pointers in the necessary places.
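A minimal sketch of what the reviewer is asking for (illustrative only; the merged code may differ) keeps the same encode() call but owns both buffers with std::unique_ptr so they are released on every exit path:

      std::unique_ptr<char[]> msg_cstr(new char[msg.size() + 1]);  // needs <memory>
      std::strcpy(msg_cstr.get(), msg.c_str());

      int num_prompt_tokens = 0;
      // +3 for '\0', ?BOS, ?EOS, as in the malloc version below
      std::unique_ptr<int[]> prompt_tokens(
          new int[std::strlen(msg_cstr.get()) + 3]);
      encode(&tokenizer, msg_cstr.get(), 1, 0, prompt_tokens.get(),
             &num_prompt_tokens);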
      std::strcpy(msgCStr, msg.c_str());
      int num_prompt_tokens = 0;
      int* prompt_tokens = (int*)malloc(
          (strlen(msgCStr) + 3) * sizeof(int));  // +3 for '\0', ?BOS, ?EOS
      encode(&tokenizer, msgCStr, 1, 0, prompt_tokens, &num_prompt_tokens);

      std::vector<torch::Tensor> tensor_vector;

      for (int64_t i = 0; i < num_prompt_tokens; ++i) {
        auto token = prompt_tokens[i];

Review comment: Please avoid using auto for primitive datatypes if it does not clarify the code.
Author reply: Updated the code to declare a specific datatype wherever necessary.
        torch::Tensor tensor = torch::tensor(token, torch::kInt64);
        tensor_vector.push_back(tensor);
      }
      torch::Tensor stacked_tensor = torch::stack(tensor_vector);
      batch_ivalue.push_back(stacked_tensor);

      delete[] msgCStr;
      free(prompt_tokens);

      idx_to_req_id.second[idx++] = request.request_id;

    } catch (const std::runtime_error& e) {
      TS_LOGF(ERROR, "Failed to load tensor for request id: {}, error: {}",
              request.request_id, e.what());
      auto response = (*response_batch)[request.request_id];
      response->SetResponse(500, "data_type",
                            torchserve::PayloadType::kDATA_TYPE_STRING,
                            "runtime_error, failed to load tensor");
    } catch (const c10::Error& e) {
      TS_LOGF(ERROR, "Failed to load tensor for request id: {}, c10 error:{}",
              request.request_id, e.msg());
      auto response = (*response_batch)[request.request_id];
      response->SetResponse(500, "data_type",
                            torchserve::PayloadType::kDATA_TYPE_STRING,
                            "c10 error, failed to load tensor");
    }
  }

  return batch_ivalue;
}
torch::Tensor LlmHandler::Inference(
    std::shared_ptr<torch::jit::script::Module> model,
    std::vector<torch::jit::IValue>& inputs,
    std::shared_ptr<torch::Device>& device,
    std::pair<std::string&, std::map<uint8_t, std::string>&>& idx_to_req_id,
    std::shared_ptr<torchserve::InferenceResponseBatch>& response_batch) {

Review comment: Could you add […]
Review comment: Is it […]
Author reply: torch::InferenceMode is a high-level API, c10::InferenceMode is a low-level API. According to the libtorch doc, they are trying to use torch::xxx to unify the low-level APIs.
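The thread above is about the torch:: versus c10:: spelling of the inference-mode guard. A hedged sketch of what adding it at the top of Inference could look like (assuming the guard should cover the whole generation loop):

  // RAII guard: while it is in scope, autograd recording is disabled and new
  // tensors are created as inference tensors.
  c10::InferenceMode guard;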
  std::vector<torch::Tensor> tensor_vector;
  auto tokens_list_tensor = inputs[0].toTensor();

Review comment: Can we extend this to batched processing, or at least process all entries in the batch?
Author reply: Working on the batch processing part. Will keep you posted once it is done.
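For reference, a rough sketch of the batched shape being discussed (the echo lambda is only a stand-in for the single-prompt generation loop below, so the snippet stays self-contained):

  // Stand-in for the per-prompt generation loop further down; it simply echoes
  // the prompt so this sketch compiles on its own.
  auto generate_for_request = [](const torch::Tensor& prompt) {
    return prompt.clone();
  };
  std::vector<torch::Tensor> per_request_outputs;
  per_request_outputs.reserve(inputs.size());
  for (const auto& ivalue : inputs) {
    // Preprocess pushed one stacked prompt tensor per request, in order.
    per_request_outputs.push_back(generate_for_request(ivalue.toTensor()));
  }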
  int64_t num_elements = tokens_list_tensor.numel();

  int steps = 256;
  // Convert the tensor to a vector of long values
  std::vector<long> long_vector;
  long_vector.reserve(num_elements);

  auto data_ptr = tokens_list_tensor.data_ptr<int64_t>();
  for (int64_t i = 0; i < num_elements; ++i) {
    long_vector.push_back(data_ptr[i]);
  }

  int* prompt_tokens = new int[num_elements];

Review comment: See above (prefer a smart pointer for this allocation as well).
Author reply: Updated code to use smart pointer.

  for (int64_t i = 0; i < num_elements; ++i) {
    prompt_tokens[i] = static_cast<int>(long_vector[i]);
  }

Review comment: Why can't we just copy the data from the tensor instead of going through long_vector?
Author reply: Updated the logic to directly copy the data from the tensor.
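A hedged sketch of the direct copy the reviewer asks about (skipping long_vector entirely; illustrative, not necessarily the merged version):

  std::vector<int> prompt_tokens(num_elements);
  const int64_t* token_ptr = tokens_list_tensor.data_ptr<int64_t>();
  for (int64_t i = 0; i < num_elements; ++i) {
    // narrow each int64 token id down to the int that run.c's forward() expects
    prompt_tokens[i] = static_cast<int>(token_ptr[i]);
  }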
  // start the main loop
  long start =
      0;     // used to time our code, only initialized after first iteration
  int next;  // will store the next token in the sequence
  int token = prompt_tokens[0];  // kick off with the first token in the prompt
  int pos = 0;                   // position in the sequence
  while (pos < steps) {
    // forward the transformer to get logits for the next token
    float* logits = forward(&transformer, token, pos);

    // advance the state machine
    if (pos < num_elements - 1) {
      // if we are still processing the input prompt, force the next prompt
      // token
      next = prompt_tokens[pos + 1];
    } else {
      // otherwise sample the next token from the logits
      next = sample(&sampler, logits);
    }
    pos++;

    torch::Tensor tensor = torch::tensor(next, torch::kLong);
    tensor_vector.push_back(tensor);

    // data-dependent terminating condition: the BOS (=1) token delimits
    // sequences
    if (next == 1) {
      break;
    }

    token = next;

    // init the timer here because the first iteration can be slower
    if (start == 0) {
      start = time_in_ms();
    }
  }

  // report achieved tok/s (pos-1 because the timer starts after first
  // iteration)
  if (pos > 1) {
    long end = time_in_ms();
    auto token_per_sec = (pos - 1) / (double)(end - start) * 1000;
    std::cout << "Achieved tok per sec: " << token_per_sec << std::endl;
  }

  delete[] prompt_tokens;

  torch::Tensor stacked_tensor = torch::stack(tensor_vector);
  return stacked_tensor;
}
void LlmHandler::Postprocess(
    const torch::Tensor& data,
    std::pair<std::string&, std::map<uint8_t, std::string>&>& idx_to_req_id,
    std::shared_ptr<torchserve::InferenceResponseBatch>& response_batch) {
  for (const auto& kv : idx_to_req_id.second) {
    try {
      int64_t num_elements = data.numel();

      auto data_ptr = data.data_ptr<int64_t>();
      int64_t token = 1;
      std::string concatenated_string;
      for (int64_t i = 0; i < num_elements; ++i) {
        char* piece = decode(&tokenizer, token, data_ptr[i]);
        std::string piece_string(piece);
        token = data_ptr[i];
        concatenated_string += piece_string;
      }

      std::cout << "Concatenated String: " << concatenated_string << std::endl;

      auto response = (*response_batch)[kv.second];

      response->SetResponse(200, "data_type",
                            torchserve::PayloadType::kDATA_TYPE_STRING,
                            concatenated_string);
    } catch (const std::runtime_error& e) {
      TS_LOGF(ERROR, "Failed to load tensor for request id: {}, error: {}",
              kv.second, e.what());
      auto response = (*response_batch)[kv.second];
      response->SetResponse(500, "data_type",
                            torchserve::PayloadType::kDATA_TYPE_STRING,
                            "runtime_error, failed to postprocess tensor");
    } catch (const c10::Error& e) {
      TS_LOGF(ERROR,
              "Failed to postprocess tensor for request id: {}, error: {}",
              kv.second, e.msg());
      auto response = (*response_batch)[kv.second];
      response->SetResponse(500, "data_type",
                            torchserve::PayloadType::kDATA_TYPE_STRING,
                            "c10 error, failed to postprocess tensor");
    }
  }

  free_sampler(&sampler);
  free_tokenizer(&tokenizer);
  free_transformer(&transformer);
}
}  // namespace llm

#if defined(__linux__) || defined(__APPLE__)
extern "C" {
torchserve::torchscripted::BaseHandler* allocatorLlmHandler() {
  return new llm::LlmHandler();
}

void deleterLlmHandler(torchserve::torchscripted::BaseHandler* p) {
  if (p != nullptr) {
    delete static_cast<llm::LlmHandler*>(p);
  }
}
}
#endif
src/examples/babyllama/baby_llama_handler.hh
@@ -0,0 +1,47 @@
#ifndef LLM_HANDLER_HH_
#define LLM_HANDLER_HH_

#include "run.c"

Review comment: Is there a reason why run.c gets included here? It is also listed in the cmake file as a source file, which probably did not work as there is no header file to declare the content. I would recommend removing it from the cmake file and including it in the .cc instead to localize visibility.
Author reply: Moved the run.c import to the .cc file.

#include "src/backends/torch_scripted/handler/base_handler.hh"

namespace llm {
class LlmHandler : public torchserve::torchscripted::BaseHandler {
 public:
  Transformer transformer;
  Tokenizer tokenizer;
  Sampler sampler;

  // NOLINTBEGIN(bugprone-exception-escape)
  LlmHandler() = default;
  // NOLINTEND(bugprone-exception-escape)
  ~LlmHandler() override = default;

  void initialize_context();

  virtual std::pair<std::shared_ptr<torch::jit::script::Module>,
                    std::shared_ptr<torch::Device>>
  LoadModel(std::shared_ptr<torchserve::LoadModelRequest>& load_model_request);

  std::vector<torch::jit::IValue> Preprocess(
      std::shared_ptr<torch::Device>& device,
      std::pair<std::string&, std::map<uint8_t, std::string>&>& idx_to_req_id,
      std::shared_ptr<torchserve::InferenceRequestBatch>& request_batch,
      std::shared_ptr<torchserve::InferenceResponseBatch>& response_batch)
      override;

  torch::Tensor Inference(
      std::shared_ptr<torch::jit::script::Module> model,
      std::vector<torch::jit::IValue>& inputs,
      std::shared_ptr<torch::Device>& device,
      std::pair<std::string&, std::map<uint8_t, std::string>&>& idx_to_req_id,
      std::shared_ptr<torchserve::InferenceResponseBatch>& response_batch)
      override;

  void Postprocess(
      const torch::Tensor& data,
      std::pair<std::string&, std::map<uint8_t, std::string>&>& idx_to_req_id,
      std::shared_ptr<torchserve::InferenceResponseBatch>& response_batch)
      override;
};
}  // namespace llm
#endif  // LLM_HANDLER_HH_
Review comment: The current cpp backend only supports one device id, which means there is no cross-GPU device partition. I assume this example only works on a single GPU.