Commit

Merge branch 'layla-build' into merge
l3utterfly authored Nov 27, 2024
2 parents 46c69e0 + 17b9dd5 commit 289e208
Showing 20 changed files with 361 additions and 17 deletions.
1 change: 0 additions & 1 deletion .gitignore
@@ -74,7 +74,6 @@ autogen-*.md
!.github/workflows/*.yml

# Models

models/*
models-mnt
!models/.editorconfig
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -223,4 +223,4 @@ endif()
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
    add_subdirectory(examples)
    add_subdirectory(pocs)
endif()
endif()
210 changes: 210 additions & 0 deletions common/common.cpp
@@ -1959,3 +1959,213 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
return result;
}

//
// YAML utils
//

void yaml_dump_vector_float(FILE * stream, const char * prop_name, const std::vector<float> & data) {
    if (data.empty()) {
        fprintf(stream, "%s:\n", prop_name);
        return;
    }

    fprintf(stream, "%s: [", prop_name);
    for (size_t i = 0; i < data.size() - 1; ++i) {
        fprintf(stream, "%e, ", data[i]);
    }
    fprintf(stream, "%e]\n", data.back());
}

void yaml_dump_vector_int(FILE * stream, const char * prop_name, const std::vector<int> & data) {
    if (data.empty()) {
        fprintf(stream, "%s:\n", prop_name);
        return;
    }

    fprintf(stream, "%s: [", prop_name);
    for (size_t i = 0; i < data.size() - 1; ++i) {
        fprintf(stream, "%d, ", data[i]);
    }
    fprintf(stream, "%d]\n", data.back());
}

void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data) {
    std::string data_str(data == NULL ? "" : data);

    if (data_str.empty()) {
        fprintf(stream, "%s:\n", prop_name);
        return;
    }

    size_t pos_start = 0;
    size_t pos_found = 0;

    if (std::isspace(data_str[0]) || std::isspace(data_str.back())) {
        data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
        data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
        data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
        data_str = "\"" + data_str + "\"";
        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
        return;
    }

    if (data_str.find('\n') == std::string::npos) {
        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
        return;
    }

    fprintf(stream, "%s: |\n", prop_name);
    while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
        fprintf(stream, " %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str());
        pos_start = pos_found + 1;
    }
}

void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
ggml_cpu_init(); // some ARM features are detected at runtime

const auto & sparams = params.sparams;

fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false");
fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false");
fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
fprintf(stream, "cpu_has_riscv_v: %s\n", ggml_cpu_has_riscv_v() ? "true" : "false");
fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");

#ifdef NDEBUG
fprintf(stream, "debug: false\n");
#else
fprintf(stream, "debug: true\n");
#endif // NDEBUG

fprintf(stream, "model_desc: %s\n", model_desc);
fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));

#ifdef __OPTIMIZE__
fprintf(stream, "optimize: true\n");
#else
fprintf(stream, "optimize: false\n");
#endif // __OPTIMIZE__

fprintf(stream, "time: %s\n", timestamp.c_str());

fprintf(stream, "\n");
fprintf(stream, "###############\n");
fprintf(stream, "# User Inputs #\n");
fprintf(stream, "###############\n");
fprintf(stream, "\n");

fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
fprintf(stream, "dry_allowed_length: %d # default: 2\n", sparams.dry_allowed_length);
fprintf(stream, "dry_base: %.2f # default: 1.75\n", sparams.dry_base);
fprintf(stream, "dry_multiplier: %.1f # default: 0.0\n", sparams.dry_multiplier);
fprintf(stream, "dry_penalty_last_n: %d # default: -1 (0 = disable, -1 = context size)\n", sparams.dry_penalty_last_n);
fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
yaml_dump_string_multiline(stream, "grammar", sparams.grammar.c_str());
fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
fprintf(stream, "ignore_eos: %s # default: false\n", sparams.ignore_eos ? "true" : "false");

yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str());
fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str());
fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());

fprintf(stream, "logit_bias:\n");
for (const auto & logit_bias : sparams.logit_bias) {
    fprintf(stream, " %d: %f", logit_bias.token, logit_bias.bias);
}

fprintf(stream, "lora:\n");
for (auto & la : params.lora_adapters) {
    if (la.scale == 1.0f) {
        fprintf(stream, " - %s\n", la.path.c_str());
    }
}
fprintf(stream, "lora_scaled:\n");
for (auto & la : params.lora_adapters) {
    if (la.scale != 1.0f) {
        fprintf(stream, " - %s: %f\n", la.path.c_str(), la.scale);
    }
}
fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
yaml_dump_string_multiline(stream, "prompt", params.prompt.c_str());
fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens);
fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);

fprintf(stream, "reverse_prompt:\n");
for (std::string ap : params.antiprompt) {
    size_t pos = 0;
    while ((pos = ap.find('\n', pos)) != std::string::npos) {
        ap.replace(pos, 1, "\\n");
        pos += 1;
    }

    fprintf(stream, " - %s\n", ap.c_str());
}

fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);

const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);

fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability);
fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold);
fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
}
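
For reference, the sketch below (not part of the commit) shows what the re-added YAML helpers emit. The file name, values, and main() are illustrative assumptions; it presumes compilation inside the llama.cpp tree so that common.h (whose matching declarations appear in the next file section) and the common library are available.

// yaml_dump_demo.cpp, a hypothetical usage sketch, not from this commit.
// Assumes the llama.cpp common library is linked and the declarations come
// from common/common.h.
#include <cstdio>
#include <vector>

#include "common.h"

int main() {
    const std::vector<int>   tokens = {1, 15043, 3186};
    const std::vector<float> split  = {0.5f, 0.5f};

    // prints: prompt_tokens: [1, 15043, 3186]
    yaml_dump_vector_int(stdout, "prompt_tokens", tokens);

    // prints: tensor_split: [5.000000e-01, 5.000000e-01]
    yaml_dump_vector_float(stdout, "tensor_split", split);

    // single-line strings are emitted inline: model_desc: my test model
    yaml_dump_string_multiline(stdout, "model_desc", "my test model");

    // strings with leading or trailing whitespace (here the trailing newline)
    // are escaped and double-quoted: prompt: "Hello\nworld\n"
    yaml_dump_string_multiline(stdout, "prompt", "Hello\nworld\n");

    return 0;
}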
14 changes: 14 additions & 0 deletions common/common.h
@@ -610,3 +610,17 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
static const char * const LLM_KV_SPLIT_NO = "split.no";
static const char * const LLM_KV_SPLIT_COUNT = "split.count";
static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";

=======

//
// YAML utils
//

void yaml_dump_vector_float (FILE * stream, const char * prop_name, const std::vector<float> & data);
void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector<int> & data);
void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);

void yaml_dump_non_result_info(
FILE * stream, const common_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
54 changes: 45 additions & 9 deletions common/sampling.cpp
@@ -60,6 +60,17 @@ struct ring_buffer {
return value;
}

T pop_back() {
    if (sz == 0) {
        throw std::runtime_error("ring buffer is empty");
    }
    // Move pos backwards, wrapping around if necessary
    pos = (pos == 0) ? capacity - 1 : pos - 1;
    T value = data[pos];
    sz--;
    return value;
}

const T & rat(size_t i) const {
if (i >= sz) {
throw std::runtime_error("ring buffer: index out of bounds");
@@ -163,15 +174,15 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co

llama_sampler_chain_add(result->chain,
llama_sampler_init_penalties(
llama_n_vocab (model),
llama_token_eos(model),
llama_token_nl (model),
params.penalty_last_n,
params.penalty_repeat,
params.penalty_freq,
params.penalty_present,
params.penalize_nl,
params.ignore_eos));
llama_n_vocab (model),
llama_token_eos(model),
llama_token_nl (model),
params.penalty_last_n,
params.penalty_repeat,
params.penalty_freq,
params.penalty_present,
params.penalize_nl,
params.ignore_eos));

if (params.mirostat == 0) {
for (const auto & cnstr : params.samplers) {
@@ -252,6 +263,16 @@ void common_sampler_reset(struct common_sampler * gsmpl) {
llama_sampler_reset(gsmpl->chain);
}

void common_sampler_reinit_grammar(struct common_sampler * gsmpl, const struct llama_model * model, const char * grammar) {
    llama_sampler_reset(gsmpl->grmr);

    gsmpl->grmr = llama_sampler_init_grammar(model, grammar, "root");
}

void common_sampler_reset_grammar(struct common_sampler * gsmpl) {
    llama_sampler_reset(gsmpl->grmr);
}

struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
return new common_sampler {
/* .params = */ gsmpl->params,
@@ -405,6 +426,21 @@ std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_
return result;
}

const std::vector<llama_token> common_sampler_prev(common_sampler * gsmpl) {
    return gsmpl->prev.to_vector();
}

void common_sampler_rollback(common_sampler * gsmpl, int rollback_num) {
    if(rollback_num > gsmpl->prev.size()) {
        rollback_num = gsmpl->prev.size();
    }

    // continuously pop the last token
    for(int i = 0; i < rollback_num; i++) {
        gsmpl->prev.pop_back();
    }
}

char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
switch (cnstr) {
case COMMON_SAMPLER_TYPE_DRY: return 'd';
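
The new pop_back treats pos as pointing one past the most recently pushed element: it steps pos back, wrapping to capacity - 1 when pos is at slot 0, and returns the value at the new position. The standalone snippet below is illustrative only; it uses plain local variables rather than the actual ring_buffer members to show that wrap-around.

// ring_pop_demo.cpp, a standalone illustration of the pop_back wrap-around.
// Variable names mirror the ring_buffer fields, but this is not the real struct.
#include <cstdio>
#include <vector>

int main() {
    const size_t capacity = 4;
    std::vector<int> data(capacity);
    size_t pos = 0; // next write slot
    size_t sz  = 0;

    // fill a few slots, advancing the cursor with wrap-around
    const int vals[] = {1, 2, 3};
    for (int v : vals) {
        data[pos] = v;
        pos = (pos + 1) % capacity;
        sz++;
    }

    // pop_back: move pos backwards, wrapping around if necessary
    pos = (pos == 0) ? capacity - 1 : pos - 1;
    const int value = data[pos];
    sz--;

    std::printf("popped %d, %zu element(s) remain\n", value, sz); // popped 3, 2 element(s) remain
    return 0;
}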
4 changes: 4 additions & 0 deletions common/sampling.h
@@ -43,6 +43,8 @@ void common_sampler_free(struct common_sampler * gsmpl);
// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar);
void common_sampler_reset (struct common_sampler * gsmpl);
void common_sampler_reinit_grammar(struct common_sampler * gsmpl, const struct llama_model * model, const char * grammar);
void common_sampler_reset_grammar(struct common_sampler * gsmpl);
struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);

// arguments can be nullptr to skip printing
@@ -96,6 +98,8 @@ std::string common_sampler_print(const struct common_sampler * gsmpl);

// get a string representation of the last accepted tokens
std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx, int n);
const std::vector<llama_token> common_sampler_prev(common_sampler * gsmpl);
void common_sampler_rollback(common_sampler * gsmpl, int rollback_num);

char common_sampler_type_to_chr(enum common_sampler_type cnstr);
std::string common_sampler_type_to_str(enum common_sampler_type cnstr);
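
Taken together, the additions suggest a rewind-and-retry flow: common_sampler_prev exposes the accepted-token history, common_sampler_rollback drops the most recent entries from it, and common_sampler_reinit_grammar rebuilds the grammar sampler so constrained generation can resume from the earlier point. The helper below is an illustrative sketch only (rewind_sampler is not part of the commit); it assumes a gsmpl obtained from common_sampler_init and the same model and grammar the sampler was created with.

// Illustrative glue only: rewind_sampler is not part of this commit, it just
// strings together the functions declared above.
#include "sampling.h"

static void rewind_sampler(struct common_sampler * gsmpl,
                           const struct llama_model * model,
                           const char * grammar,
                           int n_discard) {
    // drop the last n_discard tokens from the sampler's accepted-token history
    common_sampler_rollback(gsmpl, n_discard);

    // the grammar state may still reflect the discarded tokens, so rebuild it
    common_sampler_reinit_grammar(gsmpl, model, grammar);
}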
7 changes: 7 additions & 0 deletions ggml/CMakeLists.txt
@@ -234,6 +234,13 @@ set(GGML_PUBLIC_HEADERS
include/ggml-vulkan.h)

set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")

# link android log library
if(ANDROID)
    find_library(log-lib log)
    target_link_libraries(ggml PRIVATE ${log-lib})
endif()

#if (GGML_METAL)
# set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal")
#endif()
8 changes: 8 additions & 0 deletions ggml/src/CMakeLists.txt
@@ -224,6 +224,14 @@ add_library(ggml-base
ggml-aarch64.c
ggml-aarch64.h)

# Search for the 'log' library on Android
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
    find_library(log-lib log)
    set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${log-lib})

    target_link_libraries(ggml-base PUBLIC ${GGML_EXTRA_LIBS})
endif()

target_include_directories(ggml-base PRIVATE .)

add_library(ggml
2 changes: 1 addition & 1 deletion ggml/src/ggml-aarch64.c
@@ -126,4 +126,4 @@ size_t quantize_q4_0_4x8(const float * restrict src, void * restrict dst, int64_
size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
UNUSED(quant_weights);
return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
}
}
2 changes: 1 addition & 1 deletion ggml/src/ggml-quants.c
@@ -5244,4 +5244,4 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
}

return true;
}
}