Merge branch 'layla-build' into merge
l3utterfly authored Aug 2, 2024
2 parents e09a800 + ffe2d2b commit 46bd283
Showing 16 changed files with 241 additions and 25 deletions.
1 change: 0 additions & 1 deletion .gitignore
@@ -72,7 +72,6 @@ tmp/
!.github/workflows/*.yml

# Models

models/*
models-mnt
!models/.editorconfig
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -198,4 +198,4 @@ endif ()
if (LLAMA_BUILD_EXAMPLES)
add_subdirectory(examples)
add_subdirectory(pocs)
endif()
endif()
29 changes: 28 additions & 1 deletion common/common.cpp
@@ -555,6 +555,30 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
sparams.penalty_present = std::stof(argv[i]);
return true;
}
if (arg == "--dry-multiplier") {
if (++i >= argc) {
invalid_param = true;
return true;
}
sparams.dry_multiplier = std::stof(argv[i]);
return true;
}
if (arg == "--dry-base") {
if (++i >= argc) {
invalid_param = true;
return true;
}
sparams.dry_base = std::stof(argv[i]);
return true;
}
if (arg == "--dry-allowed-length") {
if (++i >= argc) {
invalid_param = true;
return true;
}
sparams.dry_allowed_length = std::stoi(argv[i]);
return true;
}
if (arg == "--dynatemp-range") {
CHECK_ARG
sparams.dynatemp_range = std::stof(argv[i]);
@@ -1471,6 +1495,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", " --repeat-penalty N", "penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)sparams.penalty_repeat });
options.push_back({ "*", " --presence-penalty N", "repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_present });
options.push_back({ "*", " --frequency-penalty N", "repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_freq });
options.push_back({ "*", " --dry-multiplier N", "DRY sampler multiplier (default: %.1f, 0.0 = disabled)", (double)sparams.dry_multiplier });
options.push_back({ "*", " --dry-base N", "DRY sampler base (default: %.1f)", (double)sparams.dry_base });
options.push_back({ "*", " --dry-allowed-length N", "DRY sampler allowed length (default: %d)", sparams.dry_allowed_length });
options.push_back({ "*", " --dynatemp-range N", "dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)sparams.dynatemp_range });
options.push_back({ "*", " --dynatemp-exp N", "dynamic temperature exponent (default: %.1f)", (double)sparams.dynatemp_exponent });
options.push_back({ "*", " --mirostat N", "use Mirostat sampling.\n"
@@ -2144,7 +2171,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
if (params.n_gpu_layers != -1) {
mparams.n_gpu_layers = params.n_gpu_layers;
}
mparams.rpc_servers = params.rpc_servers.c_str();
//mparams.rpc_servers = params.rpc_servers.c_str();
mparams.main_gpu = params.main_gpu;
mparams.split_mode = params.split_mode;
mparams.tensor_split = params.tensor_split;
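The three new flags map one-to-one onto the llama_sampling_params fields added in common/sampling.h below. A minimal sketch (not part of this commit) of setting the same values programmatically, using only names that appear in this diff:

#include "common.h"

// Mirrors what --dry-multiplier / --dry-base / --dry-allowed-length do after
// argument parsing; gpt_params::sparams is the usual llama_sampling_params.
static void set_dry_defaults(gpt_params & params) {
    llama_sampling_params & sparams = params.sparams;
    sparams.dry_multiplier     = 0.8f;  // 0.0f disables the DRY sampler
    sparams.dry_base           = 1.75f; // exponential base of the penalty
    sparams.dry_allowed_length = 2;     // repeats up to this length are not penalized
}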
74 changes: 55 additions & 19 deletions common/sampling.cpp
@@ -54,10 +54,10 @@ void llama_sampling_free(struct llama_sampling_context * ctx) {
delete ctx;
}

void llama_sampling_reset(llama_sampling_context * ctx) {
void llama_sampling_reset_grammar(struct llama_sampling_context * ctx) {
if (ctx->grammar != NULL) {
llama_grammar_free(ctx->grammar);
ctx->grammar = NULL;
ctx->grammar = nullptr;
}

if (!ctx->parsed_grammar.rules.empty()) {
@@ -71,6 +71,10 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
}
ctx->grammar = grammar;
}
}

void llama_sampling_reset(llama_sampling_context * ctx) {
llama_sampling_reset_grammar(ctx);

std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
ctx->cur.clear();
@@ -121,10 +125,10 @@ std::string llama_sampling_print(const llama_sampling_params & params) {
snprintf(result, sizeof(result),
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f, dry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d",
params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp,
params.mirostat, params.mirostat_eta, params.mirostat_tau);
params.mirostat, params.mirostat_eta, params.mirostat_tau, params.dry_multiplier, params.dry_base, params.dry_allowed_length);

return std::string(result);
}
@@ -362,13 +366,19 @@ static llama_token_data_array llama_sampling_prepare_impl(

const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));

// repetition penalties
const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
const float penalty_repeat = params.penalty_repeat;
const float penalty_freq = params.penalty_freq;
const float penalty_present = params.penalty_present;

const bool penalize_nl = params.penalize_nl;

// DRY sampler parameters
const float dry_multiplier = params.dry_multiplier;
const float dry_base = params.dry_base;
const uint32_t dry_allowed_length = params.dry_allowed_length;
const uint32_t dry_penalty_last_n = params.dry_penalty_last_n;

auto & prev = ctx_sampling->prev;
auto & cur = ctx_sampling->cur;

@@ -399,26 +409,41 @@ static llama_token_data_array llama_sampling_prepare_impl(

llama_token_data_array cur_p = { cur.data(), cur.size(), false };

// apply penalties
const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
if (penalty_tokens_used_size) {
const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];

llama_sample_repetition_penalties(ctx_main, &cur_p,
penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);

if (!penalize_nl) {
for (size_t idx = 0; idx < cur_p.size; idx++) {
if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
cur_p.data[idx].logit = nl_logit;
break;

// apply repetition penalties
{
const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
if (penalty_tokens_used_size) {
const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];

// repetition penalties
llama_sample_repetition_penalties(ctx_main, &cur_p,
penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);

if (!penalize_nl) {
for (size_t idx = 0; idx < cur_p.size; idx++) {
if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
cur_p.data[idx].logit = nl_logit;
break;
}
}
}
}
}

// apply DRY penalties
{
const int penalty_tokens_used_size = std::min(penalty_tokens.size(), (size_t)dry_penalty_last_n);
if (penalty_tokens_used_size) {
llama_sample_dry(&cur_p,
penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
penalty_tokens_used_size, dry_base, dry_multiplier, dry_allowed_length,
params.dry_seq_breakers.data(), params.dry_seq_breakers.size());
}
}

// apply grammar checks before sampling logic
if (apply_grammar && ctx_sampling->grammar != NULL) {
llama_grammar_sample(ctx_sampling->grammar, ctx_main, &cur_p);
Expand Down Expand Up @@ -458,3 +483,14 @@ void llama_sampling_accept(
llama_grammar_accept_token(ctx_sampling->grammar, ctx_main, id);
}
}


void llama_sampling_rollback(
struct llama_sampling_context * ctx_sampling,
int rollback_num) {
if (rollback_num > (int) ctx_sampling->prev.size()) {
rollback_num = ctx_sampling->prev.size();
}

ctx_sampling->prev.erase(ctx_sampling->prev.end() - rollback_num, ctx_sampling->prev.end());
}
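llama_sampling_rollback only trims the sampler's recent-token history; rewinding the model state is still up to the caller. A usage sketch under that assumption (sequence id 0 and a caller-tracked n_past are illustrative, not part of this commit):

#include "llama.h"
#include "sampling.h"

// Roll back the last n_discard generated tokens: forget them in the sampler so
// repetition/DRY penalties no longer see them, and drop the matching KV cells.
static void rollback_generation(struct llama_context * ctx,
                                struct llama_sampling_context * ctx_sampling,
                                int & n_past, int n_discard) {
    llama_sampling_rollback(ctx_sampling, n_discard);
    llama_kv_cache_seq_rm(ctx, 0, n_past - n_discard, n_past); // sequence 0 assumed
    n_past -= n_discard;
}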
12 changes: 12 additions & 0 deletions common/sampling.h
@@ -41,6 +41,10 @@ typedef struct llama_sampling_params {
float mirostat_eta = 0.10f; // learning rate
bool penalize_nl = false; // consider newlines as a repeatable token
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context
float dry_multiplier = 0.0f; // 0.0f = disabled, recommended value: 0.8f
float dry_base = 1.75f;
uint32_t dry_allowed_length = 2;
uint32_t dry_penalty_last_n = -1; // DRY last n tokens to penalize (0 = disable penalty, -1 = context size)

std::vector<llama_sampler_type> samplers_sequence = {
llama_sampler_type::TOP_K,
@@ -61,6 +65,7 @@ typedef struct llama_sampling_params {
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens

std::vector<llama_token> penalty_prompt_tokens;
std::vector<llama_token> dry_seq_breakers; // sequence breakers for the DRY sampler
bool use_penalty_prompt_tokens = false;
} llama_sampling_params;

@@ -93,6 +98,9 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_

void llama_sampling_free(struct llama_sampling_context * ctx);

// Reset the sampler grammar without resetting the context
void llama_sampling_reset_grammar(struct llama_sampling_context * ctx);

// Reset the sampler context
// - clear prev tokens
// - reset grammar
@@ -158,3 +166,7 @@ void llama_sampling_accept(
struct llama_context * ctx_main,
llama_token id,
bool apply_grammar);

void llama_sampling_rollback(
struct llama_sampling_context * ctx_sampling,
int rollback_num);
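dry_seq_breakers holds token ids, so breaker strings have to be tokenized against the loaded model. A sketch of populating it (the breaker strings and the use of the llama_tokenize helper from common.h are illustrative assumptions, not part of this commit):

#include "common.h"

// Add the token ids of a few boundary strings so the DRY sampler does not
// chain matches across them.
static void add_dry_seq_breakers(const struct llama_context * ctx,
                                 llama_sampling_params & sparams) {
    const char * breakers[] = { "\n", ":", "\"", "*" };
    for (const char * s : breakers) {
        const std::vector<llama_token> toks = llama_tokenize(ctx, s, /*add_special=*/ false);
        sparams.dry_seq_breakers.insert(sparams.dry_seq_breakers.end(), toks.begin(), toks.end());
    }
}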
3 changes: 0 additions & 3 deletions convert_hf_to_gguf.py
@@ -564,9 +564,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
# ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
res = "olmo"
if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
# ref: https://huggingface.co/databricks/dbrx-base
res = "dbrx"
if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
res = "jina-v2-en"
2 changes: 2 additions & 0 deletions examples/main/main.cpp
@@ -139,6 +139,8 @@ int main(int argc, char ** argv) {

llama_sampling_params & sparams = params.sparams;

sparams.dry_multiplier = 0.8f;

#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("main", "log"));
LOG_TEE("Log start\n");
4 changes: 4 additions & 0 deletions grammars/chinese.gbnf
@@ -0,0 +1,4 @@
root ::= cn-char+ ([ \t\n] cn-char+)*
cn-char ::= cjk | punctuation
cjk ::= [一-鿿] | [𠀀-𯿽]
punctuation ::= [、-〾]
8 changes: 8 additions & 0 deletions grammars/korean.gbnf
@@ -0,0 +1,8 @@
root ::= conversation+
conversation ::= assistant-line "\nUSER: "
assistant-line ::= kr-string "\n"
kr-string ::= kr-char*
kr-char ::= hangul | punctuation | whitespace
hangul ::= [가-힣]
punctuation ::= [、-〾]
whitespace ::= [ \t]
3 changes: 3 additions & 0 deletions grammars/schedule.gbnf
@@ -0,0 +1,3 @@
root ::= record
record ::= "Event: " string "\n" "Date: " string "\n" "Time: " string "\n"
string ::= "" [ -~]* ""
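Nothing in this commit wires the new grammars up automatically; they are loaded like any other GBNF file. A sketch, assuming the upstream sparams.grammar string field (not shown in this diff) that llama_sampling_init parses:

#include "sampling.h"
#include <fstream>
#include <sstream>
#include <string>

// Read e.g. grammars/korean.gbnf into the sampling params so generation is
// constrained to the grammar.
static bool load_gbnf(const std::string & path, llama_sampling_params & sparams) {
    std::ifstream file(path);
    if (!file) {
        return false;
    }
    std::stringstream buf;
    buf << file.rdbuf();
    sparams.grammar = buf.str(); // parsed later by llama_sampling_init()
    return true;
}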
11 changes: 11 additions & 0 deletions include/llama.h
@@ -1085,6 +1085,17 @@ extern "C" {
float p,
size_t min_keep);

/// @details DRY sampler as described in: https://github.com/oobabooga/text-generation-webui/pull/5677
LLAMA_API void llama_sample_dry(
llama_token_data_array * candidates,
const llama_token * last_tokens,
size_t last_tokens_size,
float dry_base,
float dry_multiplier,
int dry_allowed_length,
const llama_token * dry_seq_breakers,
size_t dry_seq_breakers_size);

/// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
LLAMA_API void llama_sample_tail_free(
struct llama_context * ctx,
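Based only on the declaration above, a sketch of applying the DRY penalty in a custom sampling loop, before the usual top-k/top-p/temperature steps. The recent-token and breaker buffers are assumed to be maintained by the caller; the constants mirror the defaults introduced elsewhere in this commit.

#include "llama.h"
#include <vector>

// Penalize candidates that would extend an already-seen sequence.
static void apply_dry_penalty(llama_token_data_array * cur_p,
                              const std::vector<llama_token> & recent_tokens,
                              const std::vector<llama_token> & seq_breakers) {
    if (recent_tokens.empty()) {
        return; // nothing to match against yet
    }
    llama_sample_dry(cur_p,
        recent_tokens.data(), recent_tokens.size(),
        /*dry_base          =*/ 1.75f,
        /*dry_multiplier    =*/ 0.8f,
        /*dry_allowed_length=*/ 2,
        seq_breakers.data(), seq_breakers.size());
}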
17 changes: 17 additions & 0 deletions ios.toolchain.cmake
@@ -0,0 +1,17 @@
set(CMAKE_SYSTEM_NAME iOS)

# specify the cross compiler
set(CMAKE_XCODE_ATTRIBUTE_ONLY_ACTIVE_ARCH NO)

# specify which architectures to build for
set(CMAKE_OSX_ARCHITECTURES "$(ARCHS_STANDARD)")

# you can also choose to build for a specific device
# set(CMAKE_OSX_ARCHITECTURES "arm64")
# or for the simulator
# set(CMAKE_OSX_ARCHITECTURES "x86_64")

set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphoneos;-iphonesimulator")

# you might also want to set the deployment target
# set(CMAKE_XCODE_ATTRIBUTE_IPHONEOS_DEPLOYMENT_TARGET "10.0")
5 changes: 5 additions & 0 deletions prompts/chat-with-layla.txt
@@ -0,0 +1,5 @@
Layla is an AI created by Layla Network that is helpful, polite, and to the point. She is here to help User with everyday tasks. Layla's favourite animal is the butterfly because it represents transformation, growth, and beauty.

Conversation between User and an Assistant named "Layla":

User: