diff --git a/common/arg.cpp b/common/arg.cpp
index 272492e50df15..a6b7a1394f735 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -128,7 +128,11 @@ static void common_params_handle_model_default(common_params & params) {
             }
             params.hf_file = params.model;
         } else if (params.model.empty()) {
-            params.model = fs_get_cache_file(string_split<std::string>(params.hf_file, '/').back());
+            // this is to avoid different repo having same file name, or same file name in different subdirs
+            std::string filename = params.hf_repo + "_" + params.hf_file;
+            // to make sure we don't have any slashes in the filename
+            string_replace_all(filename, "/", "_");
+            params.model = fs_get_cache_file(filename);
         }
     } else if (!params.model_url.empty()) {
         if (params.model.empty()) {
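For illustration only, not part of the patch: a self-contained sketch of the cache-naming scheme introduced above. default_cache_name() is a hypothetical stand-in for the string_replace_all() + fs_get_cache_file() pair (it omits the cache-directory prefix that fs_get_cache_file() prepends), and the sample repo/file values are borrowed from the server test fixture at the end of this diff.

#include <iostream>
#include <string>

// Hypothetical re-implementation of the new default cache-file naming:
// prefix with the repo, then flatten every '/' to '_'.
static std::string default_cache_name(const std::string & hf_repo, const std::string & hf_file) {
    // repo prefix: the same file name in two different repos cannot collide
    std::string filename = hf_repo + "_" + hf_file;
    // flatten subdirectories: the cache name must be a single path component
    for (std::string::size_type pos = 0; (pos = filename.find('/', pos)) != std::string::npos; ++pos) {
        filename[pos] = '_';
    }
    return filename;
}

int main() {
    // prints: ggml-org_models_jina-reranker-v1-tiny-en_ggml-model-f16.gguf
    std::cout << default_cache_name("ggml-org/models",
                                    "jina-reranker-v1-tiny-en/ggml-model-f16.gguf") << '\n';
}

Because the derived name is now unique per (repo, file) pair, tests no longer need to pin an explicit local model path to avoid collisions, which is what the utils.py hunk at the end of this diff cleans up.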
diff --git a/common/common.cpp b/common/common.cpp
index 09ec9f2388afb..2b2f0009897f3 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -829,9 +829,9 @@ struct common_init_result common_init_from_params(common_params & params) {
     llama_model * model = nullptr;
 
     if (!params.hf_repo.empty() && !params.hf_file.empty()) {
-        model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+        model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);
     } else if (!params.model_url.empty()) {
-        model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+        model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
     } else {
         model = llama_load_model_from_file(params.model.c_str(), mparams);
     }
@@ -1342,17 +1342,17 @@ static bool common_download_file(const std::string & url, const std::string & pa
 }
 
 struct llama_model * common_load_model_from_url(
-        const char * model_url,
-        const char * path_model,
-        const char * hf_token,
+        const std::string & model_url,
+        const std::string & local_path,
+        const std::string & hf_token,
         const struct llama_model_params & params) {
     // Basic validation of the model_url
-    if (!model_url || strlen(model_url) == 0) {
+    if (model_url.empty()) {
         LOG_ERR("%s: invalid model_url\n", __func__);
         return NULL;
     }
 
-    if (!common_download_file(model_url, path_model, hf_token)) {
+    if (!common_download_file(model_url, local_path, hf_token)) {
         return NULL;
     }
 
@@ -1363,9 +1363,9 @@ struct llama_model * common_load_model_from_url(
         /*.no_alloc = */ true,
         /*.ctx      = */ NULL,
     };
-    auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
+    auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
     if (!ctx_gguf) {
-        LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, path_model);
+        LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, local_path.c_str());
         return NULL;
     }
 
@@ -1384,13 +1384,13 @@ struct llama_model * common_load_model_from_url(
         // Verify the first split file format
         // and extract split URL and PATH prefixes
         {
-            if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
-                LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split);
+            if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) {
+                LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split);
                 return NULL;
             }
 
-            if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
-                LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split);
+            if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) {
+                LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split);
                 return NULL;
             }
         }
@@ -1417,14 +1417,14 @@ struct llama_model * common_load_model_from_url(
         }
     }
 
-    return llama_load_model_from_file(path_model, params);
+    return llama_load_model_from_file(local_path.c_str(), params);
 }
 
 struct llama_model * common_load_model_from_hf(
-        const char * repo,
-        const char * model,
-        const char * path_model,
-        const char * hf_token,
+        const std::string & repo,
+        const std::string & remote_path,
+        const std::string & local_path,
+        const std::string & hf_token,
         const struct llama_model_params & params) {
     // construct hugging face model url:
     //
@@ -1438,27 +1438,27 @@ struct llama_model * common_load_model_from_hf(
     std::string model_url = "https://huggingface.co/";
     model_url += repo;
     model_url += "/resolve/main/";
-    model_url += model;
+    model_url += remote_path;
 
-    return common_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
+    return common_load_model_from_url(model_url, local_path, hf_token, params);
 }
 
 #else
 
 struct llama_model * common_load_model_from_url(
-        const char * /*model_url*/,
-        const char * /*path_model*/,
-        const char * /*hf_token*/,
+        const std::string & /*model_url*/,
+        const std::string & /*local_path*/,
+        const std::string & /*hf_token*/,
         const struct llama_model_params & /*params*/) {
     LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
     return nullptr;
 }
 
 struct llama_model * common_load_model_from_hf(
-        const char * /*repo*/,
-        const char * /*model*/,
-        const char * /*path_model*/,
-        const char * /*hf_token*/,
+        const std::string & /*repo*/,
+        const std::string & /*remote_path*/,
+        const std::string & /*local_path*/,
+        const std::string & /*hf_token*/,
         const struct llama_model_params & /*params*/) {
     LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
     return nullptr;
diff --git a/common/common.h b/common/common.h
index 286642db24158..9b1508a15fb43 100644
--- a/common/common.h
+++ b/common/common.h
@@ -470,8 +470,17 @@ struct llama_model_params common_model_params_to_llama ( common_params
 struct llama_context_params common_context_params_to_llama(const common_params & params);
 struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
 
-struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
-struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * common_load_model_from_url(
+    const std::string & model_url,
+    const std::string & local_path,
+    const std::string & hf_token,
+    const struct llama_model_params & params);
+struct llama_model * common_load_model_from_hf(
+    const std::string & repo,
+    const std::string & remote_path,
+    const std::string & local_path,
+    const std::string & hf_token,
+    const struct llama_model_params & params);
 
 // clear LoRA adapters from context, then apply new list of adapters
 void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
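A hypothetical call site under the new signatures, for illustration only; llama.cpp's own caller goes through common_init_from_params() as shown in the common.cpp hunk above, and the local path chosen here is a placeholder.

#include "common.h"
#include "llama.h"

int main() {
    llama_model_params mparams = llama_model_default_params();

    // std::string arguments are passed directly; no more .c_str() noise
    // at every call site as with the old const char * API.
    llama_model * model = common_load_model_from_hf(
        "ggml-org/models",                              // repo
        "jina-reranker-v1-tiny-en/ggml-model-f16.gguf", // remote_path (may contain subdirs)
        "/tmp/reranker.gguf",                           // local_path, arbitrary for this sketch
        "",                                             // hf_token, empty for anonymous access
        mparams);

    if (model == nullptr) {
        // either the download/load failed, or llama.cpp was built without
        // libcurl, in which case the stubs above warn and return nullptr
        return 1;
    }

    llama_free_model(model);
    return 0;
}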
diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py
index bc590bcb31547..e31743c505d8e 100644
--- a/examples/server/tests/utils.py
+++ b/examples/server/tests/utils.py
@@ -319,7 +319,6 @@ def jina_reranker_tiny() -> ServerProcess:
     server.model_hf_repo = "ggml-org/models"
     server.model_hf_file = "jina-reranker-v1-tiny-en/ggml-model-f16.gguf"
     server.model_alias = "jina-reranker"
-    server.model_file = "./tmp/jina-reranker-v1-tiny-en.gguf"
     server.n_ctx = 512
     server.n_batch = 512
     server.n_slots = 1