
llama : remove deprecated API (ggerganov#5770)
ggml-ci
ggerganov authored Feb 28, 2024
1 parent 78aacf3 commit 08c5ee8
Showing 2 changed files with 1 addition and 132 deletions.
88 changes: 1 addition & 87 deletions llama.cpp
@@ -7894,9 +7894,9 @@ static int llama_decode_internal(
     const auto n_batch = cparams.n_batch;
 
     GGML_ASSERT(n_tokens <= n_batch);
+    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
 
     int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
-    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
 
     const int64_t t_start_us = ggml_time_us();
 
@@ -10062,10 +10062,6 @@ void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * cand
         }
     }
 
-void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
-    llama_sample_temp(ctx, candidates_p, temp);
-}
-
 void llama_sample_repetition_penalties(
             struct llama_context * ctx,
           llama_token_data_array * candidates,
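
Migration note: the removed llama_sample_temperature() was a pure alias, so updating a call site is a one-line rename. A minimal sketch, assuming a context ctx and a populated candidates array:

    // before (removed): llama_sample_temperature(ctx, &candidates, 0.8f);
    llama_sample_temp(ctx, &candidates, 0.8f); // identical behavior, current name
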
@@ -10192,38 +10188,6 @@ void llama_sample_apply_guidance(
     ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
 }
 
-void llama_sample_classifier_free_guidance(
-          struct llama_context * ctx,
-        llama_token_data_array * candidates,
-          struct llama_context * guidance_ctx,
-                         float   scale) {
-    GGML_ASSERT(ctx);
-    int64_t t_start_sample_us;
-
-    t_start_sample_us = ggml_time_us();
-    const size_t n_vocab = llama_n_vocab(llama_get_model(ctx));
-
-    GGML_ASSERT(n_vocab == candidates->size);
-    GGML_ASSERT(!candidates->sorted);
-
-    std::vector<float> logits_base(n_vocab);
-    for (size_t i = 0; i < n_vocab; ++i) {
-        logits_base[i] = candidates->data[i].logit;
-    }
-
-    float * logits_guidance = llama_get_logits(guidance_ctx);
-
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    llama_sample_apply_guidance(ctx, logits_base.data(), logits_guidance, scale);
-    t_start_sample_us = ggml_time_us();
-
-    for (size_t i = 0; i < n_vocab; ++i) {
-        candidates->data[i].logit = logits_base[i];
-    }
-
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-}
-
 llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
     GGML_ASSERT(ctx);
 
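
Migration note: the removed wrapper's body doubles as the recipe for llama_sample_apply_guidance(): copy the candidate logits into a flat buffer, fetch the guidance context's logits, apply the guidance, and write the results back. A sketch along those lines, with ctx, guidance_ctx, candidates, and scale as in the removed function:

    const size_t n_vocab = llama_n_vocab(llama_get_model(ctx));

    // gather the candidate logits into a contiguous buffer
    std::vector<float> logits_base(n_vocab);
    for (size_t i = 0; i < n_vocab; ++i) {
        logits_base[i] = candidates->data[i].logit;
    }

    // blend in the guidance context's logits, scaled
    float * logits_guidance = llama_get_logits(guidance_ctx);
    llama_sample_apply_guidance(ctx, logits_base.data(), logits_guidance, scale);

    // write the adjusted logits back into the candidates array
    for (size_t i = 0; i < n_vocab; ++i) {
        candidates->data[i].logit = logits_base[i];
    }
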
@@ -11724,15 +11688,6 @@ bool llama_supports_gpu_offload(void) {
 #endif
 }
 
-// deprecated:
-bool llama_mmap_supported(void) {
-    return llama_supports_mmap();
-}
-
-bool llama_mlock_supported(void) {
-    return llama_supports_mlock();
-}
-
 void llama_backend_init(void) {
     ggml_time_init();
 
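
Migration note: the capability queries are unchanged in behavior; only the names differ. For example:

    // before (removed): llama_mmap_supported(), llama_mlock_supported()
    const bool use_mmap  = llama_supports_mmap();
    const bool use_mlock = llama_supports_mlock();
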
@@ -12244,15 +12199,6 @@ uint32_t llama_model_quantize(
     }
 }
 
-int32_t llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
-    try {
-        return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
-    } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
-        return 1;
-    }
-}
-
 int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
     try {
         return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
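
Migration note: the removed overload forwarded to the same internal function with ctx->model, so callers now pass the model explicitly; llama_get_model() recovers it from an existing context. A sketch, with path_lora, scale, path_base_model, and n_threads as before:

    // before (removed): llama_apply_lora_from_file(ctx, path_lora, scale, path_base_model, n_threads);
    const int32_t ret = llama_model_apply_lora_from_file(
            llama_get_model(ctx), path_lora, scale, path_base_model, n_threads);
    // returns 0 on success
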
@@ -12802,38 +12748,6 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
     return true;
 }
 
-int llama_eval(
-        struct llama_context * ctx,
-                 llama_token * tokens,
-                     int32_t   n_tokens,
-                     int32_t   n_past) {
-    llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
-
-    const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
-    if (ret < 0) {
-        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
-    }
-
-    return ret;
-}
-
-int llama_eval_embd(
-        struct llama_context * ctx,
-                       float * embd,
-                     int32_t   n_tokens,
-                     int32_t   n_past) {
-    llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
-
-    llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
-
-    const int ret = llama_decode_internal(*ctx, batch);
-    if (ret < 0) {
-        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
-    }
-
-    return ret;
-}
-
 void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
     ctx->cparams.n_threads = n_threads;
     ctx->cparams.n_threads_batch = n_threads_batch;
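
Migration note: as the removed bodies show, llama_eval() amounted to evicting cached tokens at positions >= n_past and decoding a single-sequence batch, so callers can do the same through the public API. A sketch of the token path, assuming the public llama_kv_cache_seq_rm() mirrors the internal call above:

    llama_kv_cache_seq_rm(ctx, -1, n_past, -1); // drop cache entries at pos >= n_past, all sequences
    if (llama_decode(ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0)) < 0) {
        // decode failed; handle the error
    }

For embedding input, build a llama_batch with the embd field set instead of token, mirroring the removed llama_eval_embd() body.
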
45 changes: 0 additions & 45 deletions llama.h
@@ -364,9 +364,6 @@ extern "C" {
     LLAMA_API bool llama_supports_mlock      (void);
     LLAMA_API bool llama_supports_gpu_offload(void);
 
-    LLAMA_API DEPRECATED(bool llama_mmap_supported (void), "use llama_supports_mmap() instead");
-    LLAMA_API DEPRECATED(bool llama_mlock_supported(void), "use llama_supports_mlock() instead");
-
     LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
 
     LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
@@ -423,14 +420,6 @@ extern "C" {
     // The model needs to be reloaded before applying a new adapter, otherwise the adapter
     // will be applied on top of the previous one
     // Returns 0 on success
-    LLAMA_API DEPRECATED(int32_t llama_apply_lora_from_file(
-            struct llama_context * ctx,
-                      const char * path_lora,
-                           float   scale,
-                      const char * path_base_model,
-                         int32_t   n_threads),
-            "use llama_model_apply_lora_from_file instead");
-
     LLAMA_API int32_t llama_model_apply_lora_from_file(
             const struct llama_model * model,
                           const char * path_lora,
@@ -606,27 +595,6 @@ extern "C" {
     // Decoding
     //
 
-    // Run the llama inference to obtain the logits and probabilities for the next token(s).
-    // tokens + n_tokens is the provided batch of new tokens to process
-    // n_past is the number of tokens to use from previous eval calls
-    // Returns 0 on success
-    // DEPRECATED: use llama_decode() instead
-    LLAMA_API DEPRECATED(int llama_eval(
-            struct llama_context * ctx,
-                     llama_token * tokens,
-                         int32_t   n_tokens,
-                         int32_t   n_past),
-            "use llama_decode() instead");
-
-    // Same as llama_eval, but use float matrix input directly.
-    // DEPRECATED: use llama_decode() instead
-    LLAMA_API DEPRECATED(int llama_eval_embd(
-            struct llama_context * ctx,
-                           float * embd,
-                         int32_t   n_tokens,
-                         int32_t   n_past),
-            "use llama_decode() instead");
-
     // Return batch for single sequence of tokens starting at pos_0
     //
     // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
@@ -800,13 +768,6 @@ extern "C" {
                              float * logits_guidance,
                              float   scale);
 
-    LLAMA_API DEPRECATED(void llama_sample_classifier_free_guidance(
-              struct llama_context * ctx,
-            llama_token_data_array * candidates,
-              struct llama_context * guidance_ctx,
-                             float   scale),
-              "use llama_sample_apply_guidance() instead");
-
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     LLAMA_API void llama_sample_softmax(
             struct llama_context * ctx,
@@ -860,12 +821,6 @@ extern "C" {
             llama_token_data_array * candidates,
                              float   temp);
 
-    LLAMA_API DEPRECATED(void llama_sample_temperature(
-                struct llama_context * ctx,
-              llama_token_data_array * candidates,
-                               float   temp),
-                "use llama_sample_temp instead");
-
     /// @details Apply constraints from grammar
     LLAMA_API void llama_sample_grammar(
             struct llama_context * ctx,
