Skip to content

Commit

Permalink
Merge pull request #47 from l3utterfly/master
Browse files Browse the repository at this point in the history
merge upstream
  • Loading branch information
l3utterfly authored Nov 29, 2024
2 parents bca159c + 266b851 commit 61607e8
Show file tree
Hide file tree
Showing 56 changed files with 1,640 additions and 486 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -904,6 +904,8 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Install Cuda Toolkit 11.7
if: ${{ matrix.cuda == '11.7' }}
Expand Down Expand Up @@ -1139,6 +1141,8 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Install
id: depends
Expand Down
186 changes: 185 additions & 1 deletion AUTHORS

Large diffs are not rendered by default.

16 changes: 11 additions & 5 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,11 @@ static void common_params_handle_model_default(common_params & params) {
}
params.hf_file = params.model;
} else if (params.model.empty()) {
params.model = fs_get_cache_file(string_split<std::string>(params.hf_file, '/').back());
// this is to avoid different repo having same file name, or same file name in different subdirs
std::string filename = params.hf_repo + "_" + params.hf_file;
// to make sure we don't have any slashes in the filename
string_replace_all(filename, "/", "_");
params.model = fs_get_cache_file(filename);
}
} else if (!params.model_url.empty()) {
if (params.model.empty()) {
Expand Down Expand Up @@ -1366,8 +1370,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.n_gpu_layers = value;
if (!llama_supports_gpu_offload()) {
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n");
fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
}
}
).set_env("LLAMA_ARG_N_GPU_LAYERS"));
Expand Down Expand Up @@ -2100,8 +2105,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.speculative.n_gpu_layers = value;
if (!llama_supports_gpu_offload()) {
fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n");
fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
}
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
Expand Down
54 changes: 27 additions & 27 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -829,9 +829,9 @@ struct common_init_result common_init_from_params(common_params & params) {
llama_model * model = nullptr;

if (!params.hf_repo.empty() && !params.hf_file.empty()) {
model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);
} else if (!params.model_url.empty()) {
model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
} else {
model = llama_load_model_from_file(params.model.c_str(), mparams);
}
Expand Down Expand Up @@ -1342,17 +1342,17 @@ static bool common_download_file(const std::string & url, const std::string & pa
}

struct llama_model * common_load_model_from_url(
const char * model_url,
const char * path_model,
const char * hf_token,
const std::string & model_url,
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params) {
// Basic validation of the model_url
if (!model_url || strlen(model_url) == 0) {
if (model_url.empty()) {
LOG_ERR("%s: invalid model_url\n", __func__);
return NULL;
}

if (!common_download_file(model_url, path_model, hf_token)) {
if (!common_download_file(model_url, local_path, hf_token)) {
return NULL;
}

Expand All @@ -1363,9 +1363,9 @@ struct llama_model * common_load_model_from_url(
/*.no_alloc = */ true,
/*.ctx = */ NULL,
};
auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
if (!ctx_gguf) {
LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, path_model);
LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, local_path.c_str());
return NULL;
}

Expand All @@ -1384,13 +1384,13 @@ struct llama_model * common_load_model_from_url(
// Verify the first split file format
// and extract split URL and PATH prefixes
{
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split);
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) {
LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split);
return NULL;
}

if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split);
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) {
LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split);
return NULL;
}
}
Expand All @@ -1417,14 +1417,14 @@ struct llama_model * common_load_model_from_url(
}
}

return llama_load_model_from_file(path_model, params);
return llama_load_model_from_file(local_path.c_str(), params);
}

struct llama_model * common_load_model_from_hf(
const char * repo,
const char * model,
const char * path_model,
const char * hf_token,
const std::string & repo,
const std::string & remote_path,
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params) {
// construct hugging face model url:
//
Expand All @@ -1438,27 +1438,27 @@ struct llama_model * common_load_model_from_hf(
std::string model_url = "https://huggingface.co/";
model_url += repo;
model_url += "/resolve/main/";
model_url += model;
model_url += remote_path;

return common_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
return common_load_model_from_url(model_url, local_path, hf_token, params);
}

#else

struct llama_model * common_load_model_from_url(
const char * /*model_url*/,
const char * /*path_model*/,
const char * /*hf_token*/,
const std::string & /*model_url*/,
const std::string & /*local_path*/,
const std::string & /*hf_token*/,
const struct llama_model_params & /*params*/) {
LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
return nullptr;
}

struct llama_model * common_load_model_from_hf(
const char * /*repo*/,
const char * /*model*/,
const char * /*path_model*/,
const char * /*hf_token*/,
const std::string & /*repo*/,
const std::string & /*remote_path*/,
const std::string & /*local_path*/,
const std::string & /*hf_token*/,
const struct llama_model_params & /*params*/) {
LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
return nullptr;
Expand Down
13 changes: 11 additions & 2 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -470,8 +470,17 @@ struct llama_model_params common_model_params_to_llama ( common_params
struct llama_context_params common_context_params_to_llama(const common_params & params);
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
struct llama_model * common_load_model_from_url(
const std::string & model_url,
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params);
struct llama_model * common_load_model_from_hf(
const std::string & repo,
const std::string & remote_path,
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params);

// clear LoRA adapters from context, then apply new list of adapters
void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
Expand Down
4 changes: 2 additions & 2 deletions docs/android.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@ $ curl -L {model-url} -o ~/{model}.gguf
Then, if you are not already in the repo directory, `cd` into `llama.cpp` and:

```
$ ./build/bin/llama-simple -m ~/{model}.gguf -c {context-size} -p "{your-prompt}"
$ ./build/bin/llama-cli -m ~/{model}.gguf -c {context-size} -p "{your-prompt}"
```

Here, we show `llama-simple`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal.
Here, we show `llama-cli`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal.

To see what it might look like visually, here's an old demo of an interactive session running on a Pixel 5 phone:

Expand Down
4 changes: 4 additions & 0 deletions docs/backend/CANN.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi

## News

- 2024.11
- Support F16 and F32 data type model for Ascend 310P NPU.
- 2024.8
- Support `Q4_0` and `Q8_0` data type for Ascend NPU.
- 2024.7
Expand All @@ -40,9 +42,11 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
### Ascend NPU

**Verified devices**

| Ascend NPU | Status |
|:-----------------------------:|:-------:|
| Atlas 300T A2 | Support |
| Atlas 300I Duo | Support |

*Notes:*

Expand Down
15 changes: 11 additions & 4 deletions examples/llava/clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,17 @@
#include <cinttypes>
#include <limits>

#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#if defined(LLAVA_LOG_OFF)
# define LOG_INF(...)
# define LOG_WRN(...)
# define LOG_ERR(...)
# define LOG_DBG(...)
#else // defined(LLAVA_LOG_OFF)
# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#endif // defined(LLAVA_LOG_OFF)

//#define CLIP_DEBUG_FUNCTIONS

Expand Down
28 changes: 19 additions & 9 deletions examples/llava/llava.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,17 @@
#include <limits>
#include <vector>

#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)

#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#if defined(LLAVA_LOG_OFF)
# define LOG_INF(...)
# define LOG_WRN(...)
# define LOG_ERR(...)
# define LOG_DBG(...)
#else // defined(LLAVA_LOG_OFF)
# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#endif // defined(LLAVA_LOG_OFF)

// RGB uint8 image
struct clip_image_u8 {
Expand Down Expand Up @@ -498,10 +502,16 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long
errno = 0;
size_t ret = fread(buffer, 1, fileSize, file); // Read the file into the buffer
if (ferror(file)) {
die_fmt("read error: %s", strerror(errno));
LOG_ERR("read error: %s", strerror(errno));
free(buffer);
fclose(file);
return false;
}
if (ret != (size_t) fileSize) {
die("unexpectedly reached end of file");
LOG_ERR("unexpectedly reached end of file");
free(buffer);
fclose(file);
return false;
}
fclose(file); // Close the file

Expand Down
2 changes: 1 addition & 1 deletion examples/server/tests/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@ aiohttp~=3.9.3
pytest~=8.3.3
huggingface_hub~=0.23.2
numpy~=1.26.4
openai~=1.30.3
openai~=1.55.3
prometheus-client~=0.20.0
requests~=2.32.3
20 changes: 2 additions & 18 deletions examples/server/tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import re
import json
import sys
import threading
import requests
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
Expand Down Expand Up @@ -161,26 +160,12 @@ def start(self, timeout_seconds: int = 10) -> None:
self.process = subprocess.Popen(
[str(arg) for arg in [server_path, *server_args]],
creationflags=flags,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
stdout=sys.stdout,
stderr=sys.stdout,
env={**os.environ, "LLAMA_CACHE": "tmp"},
)
server_instances.add(self)

def server_log(in_stream, out_stream):
for line in iter(in_stream.readline, b""):
print(line.decode("utf-8"), end="", file=out_stream)

thread_stdout = threading.Thread(
target=server_log, args=(self.process.stdout, sys.stdout), daemon=True
)
thread_stdout.start()

thread_stderr = threading.Thread(
target=server_log, args=(self.process.stderr, sys.stderr), daemon=True
)
thread_stderr.start()

print(f"server pid={self.process.pid}, pytest pid={os.getpid()}")

# wait for server to start
Expand Down Expand Up @@ -319,7 +304,6 @@ def jina_reranker_tiny() -> ServerProcess:
server.model_hf_repo = "ggml-org/models"
server.model_hf_file = "jina-reranker-v1-tiny-en/ggml-model-f16.gguf"
server.model_alias = "jina-reranker"
server.model_file = "./tmp/jina-reranker-v1-tiny-en.gguf"
server.n_ctx = 512
server.n_batch = 512
server.n_slots = 1
Expand Down
2 changes: 1 addition & 1 deletion examples/simple/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
The purpose of this example is to demonstrate a minimal usage of llama.cpp for generating text with a given prompt.

```bash
./llama-simple -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is"
./llama-simple -m ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is"

...

Expand Down
1 change: 1 addition & 0 deletions ggml/include/ggml-cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ extern "C" {
GGML_BACKEND_API int ggml_cpu_has_neon (void);
GGML_BACKEND_API int ggml_cpu_has_arm_fma (void);
GGML_BACKEND_API int ggml_cpu_has_fp16_va (void);
GGML_BACKEND_API int ggml_cpu_has_dotprod (void);
GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
GGML_BACKEND_API int ggml_cpu_has_sve (void);
GGML_BACKEND_API int ggml_cpu_get_sve_cnt (void); // sve vector length in bytes
Expand Down
3 changes: 3 additions & 0 deletions ggml/include/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,9 @@ extern "C" {
GGML_TYPE_Q4_0_8_8 = 33,
GGML_TYPE_TQ1_0 = 34,
GGML_TYPE_TQ2_0 = 35,
GGML_TYPE_IQ4_NL_4_4 = 36,
// GGML_TYPE_IQ4_NL_4_8 = 37,
// GGML_TYPE_IQ4_NL_8_8 = 38,
GGML_TYPE_COUNT,
};

Expand Down
7 changes: 4 additions & 3 deletions ggml/src/ggml-cann/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,14 @@ if(NOT SOC_TYPE)
detect_ascend_soc_type(SOC_VERSION)
set(SOC_TYPE "${SOC_VERSION}")
message(STATUS "CANN: SOC_VERSION auto-detected is:${SOC_VERSION}")
else()
string(TOLOWER ${SOC_TYPE} SOC_VERSION)
endif()

# Construct Soc specify compile option: ASCEND_#Soc_Major_SN. Such as ASCEND_910B, ASCEND310P.
string(TOLOWER ${SOC_TYPE} SOC_VERSION) # SOC_VERSION need lower

# Construct Soc specify compile option: ASCEND_#Soc_Major_SN. Such as ASCEND_910B, ASCEND_310P.
string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
string(TOUPPER ${SOC_TYPE_COMPILE_OPTION} SOC_TYPE_COMPILE_OPTION)

if (CANN_INSTALL_DIR)
# Only Support Linux.
Expand Down
Loading

0 comments on commit 61607e8

Please sign in to comment.