Merge branch 'layla-build' into merge
l3utterfly authored May 3, 2024
2 parents 60325fa + da1a628 commit c93b977
Showing 13 changed files with 260 additions and 99 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -40,6 +40,7 @@ cmake-build-*
out/
tmp/

loras/*
models/*
models-mnt

100 changes: 22 additions & 78 deletions CMakeLists.txt
@@ -291,6 +291,8 @@ if (LLAMA_METAL)
endif()

if (LLAMA_BLAS)
message(STATUS "Building with OpenBLAS")

if (LLAMA_STATIC)
set(BLA_STATIC ON)
endif()
@@ -299,77 +301,14 @@ if (LLAMA_BLAS)
endif()

set(BLA_VENDOR ${LLAMA_BLAS_VENDOR})
find_package(BLAS)

if (BLAS_FOUND)
message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")

if ("${BLAS_INCLUDE_DIRS}" STREQUAL "")
# BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
# see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
find_package(PkgConfig REQUIRED)
if (${LLAMA_BLAS_VENDOR} MATCHES "Generic")
pkg_check_modules(DepBLAS REQUIRED blas)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "OpenBLAS")
# As of openblas v0.3.22, the 64-bit is named openblas64.pc
pkg_check_modules(DepBLAS openblas64)
if (NOT DepBLAS_FOUND)
pkg_check_modules(DepBLAS REQUIRED openblas)
endif()
elseif (${LLAMA_BLAS_VENDOR} MATCHES "FLAME")
pkg_check_modules(DepBLAS REQUIRED blis)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "ATLAS")
pkg_check_modules(DepBLAS REQUIRED blas-atlas)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "FlexiBLAS")
pkg_check_modules(DepBLAS REQUIRED flexiblas_api)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "Intel")
# all Intel* libraries share the same include path
pkg_check_modules(DepBLAS REQUIRED mkl-sdl)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "NVHPC")
# this doesn't provide pkg-config
# suggest to assign BLAS_INCLUDE_DIRS on your own
if ("${NVHPC_VERSION}" STREQUAL "")
message(WARNING "Better to set NVHPC_VERSION")
else()
set(DepBLAS_FOUND ON)
set(DepBLAS_INCLUDE_DIRS "/opt/nvidia/hpc_sdk/${CMAKE_SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR}/${NVHPC_VERSION}/math_libs/include")
endif()
endif()
if (DepBLAS_FOUND)
set(BLAS_INCLUDE_DIRS ${DepBLAS_INCLUDE_DIRS})
else()
message(WARNING "BLAS_INCLUDE_DIRS neither been provided nor been automatically"
" detected by pkgconfig, trying to find cblas.h from possible paths...")
find_path(BLAS_INCLUDE_DIRS
NAMES cblas.h
HINTS
/usr/include
/usr/local/include
/usr/include/openblas
/opt/homebrew/opt/openblas/include
/usr/local/opt/openblas/include
/usr/include/x86_64-linux-gnu/openblas/include
)
endif()
endif()
add_compile_options(${BLAS_LINKER_FLAGS})

message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
add_compile_definitions(GGML_USE_OPENBLAS)

add_compile_options(${BLAS_LINKER_FLAGS})
add_subdirectory(../OpenBLAS ${CMAKE_CURRENT_BINARY_DIR}/OpenBLAS)

add_compile_definitions(GGML_USE_OPENBLAS)

if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
add_compile_definitions(GGML_BLAS_USE_MKL)
endif()

set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
else()
message(WARNING "BLAS not found, please refer to "
"https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
" to set correct LLAMA_BLAS_VENDOR")
endif()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} openblas_shared)
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ../OpenBLAS ${CMAKE_CURRENT_BINARY_DIR}/OpenBLAS)
endif()

if (LLAMA_LLAMAFILE)
@@ -485,19 +424,24 @@ if (LLAMA_MPI)
endif()

if (LLAMA_CLBLAST)
find_package(CLBlast)
if (CLBlast_FOUND)
message(STATUS "CLBlast found")
message(STATUS "Building with CLBlast")

set(GGML_HEADERS_OPENCL ggml-opencl.h)
set(GGML_SOURCES_OPENCL ggml-opencl.cpp)
set(GGML_HEADERS_OPENCL ggml-opencl.h)
set(GGML_SOURCES_OPENCL ggml-opencl.cpp)

add_compile_definitions(GGML_USE_CLBLAST)
add_compile_definitions(GGML_USE_CLBLAST)

set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} clblast)
else()
message(WARNING "CLBlast not found")
endif()
# link our libOpenCL.so (this is only used during compile time)
add_library(OpenCL SHARED IMPORTED)
set_target_properties(OpenCL PROPERTIES IMPORTED_LOCATION ${PROJECT_SOURCE_DIR}/../OpenCL/lib/libOpenCL.so)

# add our prebuilt clblast library
add_library(clblast SHARED IMPORTED)
set_target_properties(clblast PROPERTIES IMPORTED_LOCATION ${PROJECT_SOURCE_DIR}/../../android/app/src/main/jniLibs/${ANDROID_ABI}/libclblast.so)

set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} clblast OpenCL)
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ../CLBlast/include)
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ../OpenCL/include)
endif()

if (LLAMA_VULKAN)
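Taken together, the CMakeLists.txt changes swap upstream's package discovery for vendored dependencies: BLAS is built from an ../OpenBLAS subdirectory instead of being located via find_package(BLAS), and CLBlast/OpenCL are linked as prebuilt shared libraries out of the Android jniLibs tree (selected by ANDROID_ABI). The relative layout this assumes (../OpenBLAS, ../OpenCL, ../CLBlast next to the repository, plus the android/app/src/main/jniLibs folder) comes from the Layla build setup rather than from anything documented in this commit.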
28 changes: 28 additions & 0 deletions common/common.cpp
@@ -629,6 +629,30 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
sparams.penalty_present = std::stof(argv[i]);
return true;
}
if (arg == "--dry-multiplier") {
if (++i >= argc) {
invalid_param = true;
return true;
}
sparams.dry_multiplier = std::stof(argv[i]);
return true;
}
if (arg == "--dry-base") {
if (++i >= argc) {
invalid_param = true;
return true;
}
        sparams.dry_base = std::stof(argv[i]);
return true;
}
if (arg == "--dry-allowed-length") {
if (++i >= argc) {
invalid_param = true;
return true;
}
sparams.dry_allowed_length = std::stoi(argv[i]);
return true;
}
if (arg == "--dynatemp-range") {
if (++i >= argc) {
invalid_param = true;
@@ -1464,6 +1488,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat);
printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present);
printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq);
printf(" --dry-multiplier N DRY sampler multiplier (default: %.1f, 0.0 = disabled)\n", (double)sparams.dry_multiplier);
printf(" --dry-base N DRY sampler base (default: %.1f)\n", (double)sparams.dry_base);
printf(" --dry-allowed-length N\n");
printf(" DRY sampler allowed length (default: %d)\n", sparams.dry_allowed_length);
printf(" --dynatemp-range N dynamic temperature range (default: %.1f, 0.0 = disabled)\n", (double)sparams.dynatemp_range);
printf(" --dynatemp-exp N dynamic temperature exponent (default: %.1f)\n", (double)sparams.dynatemp_exponent);
printf(" --mirostat N use Mirostat sampling.\n");
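As a side note, the three new flags map directly onto fields of llama_sampling_params (defined in common/sampling.h below). A minimal sketch of the programmatic equivalent; the helper name make_dry_params and the concrete values are illustrative only, not something this commit prescribes:

#include "common.h"

// Sketch only: roughly what --dry-multiplier 0.8 --dry-base 1.75 --dry-allowed-length 2
// ends up setting after argument parsing.
static gpt_params make_dry_params() {
    gpt_params params;
    params.sparams.dry_multiplier     = 0.8f;   // 0.0f would leave DRY disabled
    params.sparams.dry_base           = 1.75f;  // same as the header default
    params.sparams.dry_allowed_length = 2;      // repeats up to this length are not penalized
    return params;
}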
74 changes: 55 additions & 19 deletions common/sampling.cpp
@@ -48,10 +48,10 @@ void llama_sampling_free(struct llama_sampling_context * ctx) {
delete ctx;
}

void llama_sampling_reset(llama_sampling_context * ctx) {
void llama_sampling_reset_grammar(struct llama_sampling_context * ctx) {
if (ctx->grammar != NULL) {
llama_grammar_free(ctx->grammar);
ctx->grammar = NULL;
ctx->grammar = nullptr;
}

if (!ctx->parsed_grammar.rules.empty()) {
@@ -61,6 +61,10 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
grammar_rules.data(),
grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root"));
}
}

void llama_sampling_reset(llama_sampling_context * ctx) {
llama_sampling_reset_grammar(ctx);

std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
ctx->cur.clear();
@@ -110,10 +114,10 @@ std::string llama_sampling_print(const llama_sampling_params & params) {
snprintf(result, sizeof(result),
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f, dry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d",
params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp,
params.mirostat, params.mirostat_eta, params.mirostat_tau);
params.mirostat, params.mirostat_eta, params.mirostat_tau, params.dry_multiplier, params.dry_base, params.dry_allowed_length);

return std::string(result);
}
@@ -267,13 +271,19 @@ static llama_token_data_array llama_sampling_prepare_impl(

const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));

// repetition penalties
const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
const float penalty_repeat = params.penalty_repeat;
const float penalty_freq = params.penalty_freq;
const float penalty_present = params.penalty_present;

const bool penalize_nl = params.penalize_nl;

// DRY sampler parameters
const float dry_multiplier = params.dry_multiplier;
const float dry_base = params.dry_base;
const uint32_t dry_allowed_length = params.dry_allowed_length;
const uint32_t dry_penalty_last_n = params.dry_penalty_last_n;

auto & prev = ctx_sampling->prev;
auto & cur = ctx_sampling->cur;

@@ -303,26 +313,41 @@

llama_token_data_array cur_p = { cur.data(), cur.size(), false };

// apply penalties
const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
if (penalty_tokens_used_size) {
const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];

llama_sample_repetition_penalties(ctx_main, &cur_p,
penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);

if (!penalize_nl) {
for (size_t idx = 0; idx < cur_p.size; idx++) {
if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
cur_p.data[idx].logit = nl_logit;
break;

// apply repetition penalties
{
const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
if (penalty_tokens_used_size) {
const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];

// repetition penalties
llama_sample_repetition_penalties(ctx_main, &cur_p,
penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);

if (!penalize_nl) {
for (size_t idx = 0; idx < cur_p.size; idx++) {
if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
cur_p.data[idx].logit = nl_logit;
break;
}
}
}
}
}

// apply DRY penalties
{
const int penalty_tokens_used_size = std::min(penalty_tokens.size(), (size_t)dry_penalty_last_n);
if (penalty_tokens_used_size) {
llama_sample_dry(&cur_p,
penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
penalty_tokens_used_size, dry_base, dry_multiplier, dry_allowed_length,
params.dry_seq_breakers.data(), params.dry_seq_breakers.size());
}
}

// apply grammar checks before sampling logic
if (apply_grammar && ctx_sampling->grammar != NULL) {
llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
@@ -362,3 +387,14 @@ void llama_sampling_accept(
llama_grammar_accept_token(ctx_main, ctx_sampling->grammar, id);
}
}


void llama_sampling_rollback(
struct llama_sampling_context * ctx_sampling,
int rollback_num) {
    if (rollback_num < 0 || (size_t) rollback_num > ctx_sampling->prev.size()) {
        rollback_num = (int) ctx_sampling->prev.size();
    }

ctx_sampling->prev.erase(ctx_sampling->prev.end() - rollback_num, ctx_sampling->prev.end());
}
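A note on the new rollback helper: it trims the most recent entries from the sampler's prev history, which matters because the repetition and DRY penalties are computed from that history. A hedged sketch of how a caller might pair it with a KV-cache rewind when discarding tokens; the function name backtrack and the variables n_past / n_reject are illustrative, not from this commit:

#include "common.h"
#include "sampling.h"

// Sketch only: undo the last n_reject generated tokens so that both the model's
// KV cache and the sampler's repetition/DRY history forget them.
// Assumes a single sequence (seq_id 0), as in examples/main.
static void backtrack(struct llama_context * ctx_main,
                      struct llama_sampling_context * ctx_sampling,
                      int & n_past, int n_reject) {
    llama_kv_cache_seq_rm(ctx_main, 0, n_past - n_reject, -1); // drop from the KV cache
    n_past -= n_reject;
    llama_sampling_rollback(ctx_sampling, n_reject);           // drop from the sampler history
}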
12 changes: 12 additions & 0 deletions common/sampling.h
@@ -41,6 +41,10 @@ typedef struct llama_sampling_params {
float mirostat_eta = 0.10f; // learning rate
bool penalize_nl = false; // consider newlines as a repeatable token
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context
float dry_multiplier = 0.0f; // 0.0f = disabled, recommended value: 0.8f
float dry_base = 1.75f;
uint32_t dry_allowed_length = 2;
uint32_t dry_penalty_last_n = -1; // DRY last n tokens to penalize (0 = disable penalty, -1 = context size)

std::vector<llama_sampler_type> samplers_sequence = {
llama_sampler_type::TOP_K,
@@ -61,6 +65,7 @@ typedef struct llama_sampling_params {
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens

std::vector<llama_token> penalty_prompt_tokens;
std::vector<llama_token> dry_seq_breakers; // sequence breakers for the DRY sampler
bool use_penalty_prompt_tokens = false;
} llama_sampling_params;

@@ -92,6 +97,9 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_

void llama_sampling_free(struct llama_sampling_context * ctx);

// Reset the sampler grammar without resetting the context
void llama_sampling_reset_grammar(struct llama_sampling_context * ctx);

// Reset the sampler context
// - clear prev tokens
// - reset grammar
@@ -152,3 +160,7 @@ void llama_sampling_accept(
struct llama_context * ctx_main,
llama_token id,
bool apply_grammar);

void llama_sampling_rollback(
struct llama_sampling_context * ctx_sampling,
int rollback_num);
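One field worth calling out is dry_seq_breakers, which this commit adds to the params but does not expose as a command-line flag: tokens in this list interrupt the DRY matcher so that structural markers do not count toward repeats. A hedged sketch of populating it with the common llama_tokenize helper; the breaker strings "\n" and ":" and the helper name add_dry_breakers are assumptions for illustration:

#include "common.h"
#include "sampling.h"

// Sketch only: tokenize a couple of illustrative breaker strings and append the
// resulting token IDs to the DRY sequence-breaker list.
static void add_dry_breakers(const struct llama_context * ctx, llama_sampling_params & sparams) {
    for (const std::string & brk : { std::string("\n"), std::string(":") }) {
        const std::vector<llama_token> toks = llama_tokenize(ctx, brk, /*add_special*/ false);
        sparams.dry_seq_breakers.insert(sparams.dry_seq_breakers.end(), toks.begin(), toks.end());
    }
}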
3 changes: 3 additions & 0 deletions examples/main/main.cpp
@@ -126,6 +126,8 @@ int main(int argc, char ** argv) {
}
llama_sampling_params & sparams = params.sparams;

sparams.dry_multiplier = 0.8f;

#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("main", "log"));
LOG_TEE("Log start\n");
@@ -735,6 +737,7 @@ int main(int argc, char ** argv) {
for (auto id : embd) {
const std::string token_str = llama_token_to_piece(ctx, id);
printf("%s", token_str.c_str());
//printf("(%d)", id);

if (embd.size() > 1) {
input_tokens.push_back(id);
4 changes: 4 additions & 0 deletions grammars/chinese.gbnf
@@ -0,0 +1,4 @@
root ::= cn-char+ ([ \t\n] cn-char+)*
cn-char ::= cjk | punctuation
cjk ::= [一-鿿] | [𠀀-𯿽]
punctuation ::= [、-〾]
8 changes: 8 additions & 0 deletions grammars/korean.gbnf
@@ -0,0 +1,8 @@
root ::= conversation+
conversation ::= assistant-line "\nUSER: "
assistant-line ::= kr-string "\n"
kr-string ::= kr-char*
kr-char ::= hangul | punctuation | whitespace
hangul ::= [가-힣]
punctuation ::= [、-〾]
whitespace ::= [ \t]
3 changes: 3 additions & 0 deletions grammars/schedule.gbnf
@@ -0,0 +1,3 @@
root ::= record
record ::= "Event: " string "\n" "Date: " string "\n" "Time: " string "\n"
string ::= "" [ -~]* ""
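As an illustration (not from the commit), schedule.gbnf restricts output to a single record of three printable-ASCII lines, for example "Event: Team sync", "Date: 2024-05-03", "Time: 10:00", each terminated by a newline; these grammars can be supplied to the example programs through main's existing --grammar-file option.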
17 changes: 17 additions & 0 deletions ios.toolchain.cmake
@@ -0,0 +1,17 @@
set(CMAKE_SYSTEM_NAME iOS)

# specify the cross compiler
set(CMAKE_XCODE_ATTRIBUTE_ONLY_ACTIVE_ARCH NO)

# specify which architectures to build for
set(CMAKE_OSX_ARCHITECTURES "$(ARCHS_STANDARD)")

# you can also choose to build for a specific device
# set(CMAKE_OSX_ARCHITECTURES "arm64")
# or for the simulator
# set(CMAKE_OSX_ARCHITECTURES "x86_64")

set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphoneos;-iphonesimulator")

# you might also want to set the deployment target
# set(CMAKE_XCODE_ATTRIBUTE_IPHONEOS_DEPLOYMENT_TARGET "10.0")
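A brief note, not part of the commit: a toolchain file like this is normally supplied at configure time with -DCMAKE_TOOLCHAIN_FILE=ios.toolchain.cmake, so the iOS settings take effect before the project's own CMakeLists.txt is processed.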