diff --git a/common/arg.cpp b/common/arg.cpp
index 12f05cc20cb4c..de8e6bac8ca59 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1093,7 +1093,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
}
).set_sparam());
add_opt(llama_arg(
- {"--pooling"}, "{none,mean,cls,last, rank}",
+ {"--pooling"}, "{none,mean,cls,last,rank}",
"pooling type for embeddings, use model default if unspecified",
[](gpt_params & params, const std::string & value) {
/**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
diff --git a/examples/server/README.md b/examples/server/README.md
index dfca07f988824..2562680cb6c98 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -7,6 +7,7 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
**Features:**
* LLM inference of F16 and quantized models on GPU and CPU
* [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
+ * Reranking endpoint (WIP: https://github.com/ggerganov/llama.cpp/pull/9510)
* Parallel decoding with multi-user support
* Continuous batching
* Multimodal (wip)
@@ -130,7 +131,7 @@ The project is under active development, and we are [looking for feedback and co
| `--no-context-shift` | disables context shift on inifinite text generation (default: disabled)
(env: LLAMA_ARG_NO_CONTEXT_SHIFT) |
| `-sp, --special` | special tokens output enabled (default: false) |
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
-| `--pooling {none,mean,cls,last}` | pooling type for embeddings, use model default if unspecified
(env: LLAMA_ARG_POOLING) |
+| `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified
(env: LLAMA_ARG_POOLING) |
| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)
(env: LLAMA_ARG_CONT_BATCHING) |
| `-nocb, --no-cont-batching` | disable continuous batching
(env: LLAMA_ARG_NO_CONT_BATCHING) |
| `-a, --alias STRING` | set alias for model name (to be used by REST API)
(env: LLAMA_ARG_ALIAS) |
@@ -478,6 +479,39 @@ The same as [the embedding example](../embedding) does.
`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
+### POST `/reranking`: Rerank documents according to a given query
+
+This endpoint is similar to https://jina.ai/reranker/ but might change in the future.
+Requires a reranker model (such as [bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3)) and the `--embedding --pooling rank` options.
+
+ *Options:*
+
+ `query`: The query against which the documents will be ranked.
+
+ `documents`: An array of strings representing the documents to be ranked.
+
+ *Aliases:*
+ - `/rerank`
+ - `/v1/rerank`
+ - `/v1/reranking`
+
+ *Examples:*
+
+ ```shell
+ curl http://127.0.0.1:8012/v1/rerank \
+ -H "Content-Type: application/json" \
+ -d '{
+ "model": "some-model",
+ "query": "What is panda?",
+ "top_n": 3,
+ "documents": [
+ "hi",
+ "it is a bear",
+ "The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China."
+ ]
+ }' | jq
+ ```
+
### POST `/infill`: For code infilling.
Takes a prefix and a suffix and returns the predicted completion as stream.
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 084dea212cbd7..726d4a7e37fa5 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -3297,7 +3297,9 @@ int main(int argc, char ** argv) {
svr->Post("/embeddings", handle_embeddings);
svr->Post("/v1/embeddings", handle_embeddings);
svr->Post("/rerank", handle_rerank);
+ svr->Post("/reranking", handle_rerank);
svr->Post("/v1/rerank", handle_rerank);
+ svr->Post("/v1/reranking", handle_rerank);
svr->Post("/tokenize", handle_tokenize);
svr->Post("/detokenize", handle_detokenize);
// LoRA adapters hotswap