Skip to content

Commit

Permalink
Merge branch 'master' into layla-build
Browse files Browse the repository at this point in the history
  • Loading branch information
l3utterfly committed Oct 1, 2024
2 parents 76fe14c + a90484c commit a063017
Show file tree
Hide file tree
Showing 49 changed files with 1,830 additions and 514 deletions.
6 changes: 3 additions & 3 deletions .devops/full-rocm.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
ARG ROCM_DOCKER_ARCH="\
gfx803 \
gfx900 \
gfx906 \
Expand All @@ -21,7 +21,7 @@ ARG ROCM_DOCKER_ARCH=\
gfx1030 \
gfx1100 \
gfx1101 \
gfx1102
gfx1102"

COPY requirements.txt requirements.txt
COPY requirements requirements
Expand All @@ -34,7 +34,7 @@ WORKDIR /app
COPY . .

# Set nvcc architecture
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV GGML_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
Expand Down
6 changes: 3 additions & 3 deletions .devops/llama-cli-rocm.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
ARG ROCM_DOCKER_ARCH="\
gfx803 \
gfx900 \
gfx906 \
Expand All @@ -21,7 +21,7 @@ ARG ROCM_DOCKER_ARCH=\
gfx1030 \
gfx1100 \
gfx1101 \
gfx1102
gfx1102"

COPY requirements.txt requirements.txt
COPY requirements requirements
Expand All @@ -34,7 +34,7 @@ WORKDIR /app
COPY . .

# Set nvcc architecture
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV GGML_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
Expand Down
6 changes: 3 additions & 3 deletions .devops/llama-server-rocm.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
ARG ROCM_DOCKER_ARCH="\
gfx803 \
gfx900 \
gfx906 \
Expand All @@ -21,7 +21,7 @@ ARG ROCM_DOCKER_ARCH=\
gfx1030 \
gfx1100 \
gfx1101 \
gfx1102
gfx1102"

COPY requirements.txt requirements.txt
COPY requirements requirements
Expand All @@ -34,7 +34,7 @@ WORKDIR /app
COPY . .

# Set nvcc architecture
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV GGML_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -956,6 +956,7 @@ jobs:
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl7.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
echo "cp oneAPI running time dll files to ./build/bin done"
7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
Expand Down Expand Up @@ -1031,7 +1032,7 @@ jobs:
run: |
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON
cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON
cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
md "build\bin\rocblas\library\"
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
Expand Down
4 changes: 3 additions & 1 deletion .github/workflows/python-type-check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@ on:
push:
paths:
- '.github/workflows/python-type-check.yml'
- 'pyrightconfig.json'
- '**.py'
- '**/requirements*.txt'
pull_request:
paths:
- '.github/workflows/python-type-check.yml'
- 'pyrightconfig.json'
- '**.py'
- '**/requirements*.txt'

Expand All @@ -33,6 +35,6 @@ jobs:
- name: Type-check with Pyright
uses: jakebailey/pyright-action@v2
with:
version: 1.1.370
version: 1.1.382
level: warning
warnings: true
11 changes: 8 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@ option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF)
option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)

# utils
option(LLAMA_BUILD_COMMON "llama: build common utils library" ON)

# extra artifacts
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
Expand Down Expand Up @@ -191,15 +194,17 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
DESTINATION lib/pkgconfig)

#
# programs, examples and tests
# utils, programs, examples and tests
#

add_subdirectory(common)
if (LLAMA_BUILD_COMMON)
add_subdirectory(common)
endif()

if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
include(CTest)
add_subdirectory(tests)
endif ()
endif()

if (LLAMA_BUILD_EXAMPLES)
add_subdirectory(examples)
Expand Down
5 changes: 5 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,8 @@

![matmul](media/matmul.png)

# Resources

The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects:

https://github.com/ggerganov/llama.cpp/projects
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

## Hot topics

- Huggingface GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
- **Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669**
- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)

----

Expand Down Expand Up @@ -173,6 +174,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
**Tools:**

- [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
- [akx/ollama-dl](https://github.com/akx/ollama-dl) – download models from the Ollama library to be used directly with llama.cpp
- [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
- [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example)
Expand Down Expand Up @@ -441,7 +443,7 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio
- Contributors can open PRs
- Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
- Collaborators will be invited based on contributions
- Any help with managing issues and PRs is very appreciated!
- Any help with managing issues, PRs and projects is very appreciated!
- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
- Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information
- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
Expand Down
85 changes: 76 additions & 9 deletions ci/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -712,6 +712,81 @@ function gg_run_embd_bge_small {
set +e
}

function gg_sum_embd_bge_small {
gg_printf '### %s\n\n' "${ci}"

gg_printf 'BGE Small (BERT):\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
}

# rerank_tiny

function gg_run_rerank_tiny {
cd ${SRC}

gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer.json
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer_config.json
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/special_tokens_map.json
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/resolve/main/pytorch_model.bin
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/sentence_bert_config.json
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/vocab.txt
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/modules.json
gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json

gg_wget models-mnt/rerank-tiny/1_Pooling https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/1_Pooling/config.json

path_models="../models-mnt/rerank-tiny"

rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

set -e

(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log

python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

model_f16="${path_models}/ggml-model-f16.gguf"

(time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s><s>hi\nwhat is panda?</s><s>it's a bear\nwhat is panda?</s><s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log

# sample output
# rerank score 0: 0.029
# rerank score 1: 0.029
# rerank score 2: 0.135

# check that the score is in the range [$3, $4]
function check_score {
qnt="$1"
score=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)

if [ $(echo "$score < $3" | bc) -eq 1 ] || [ $(echo "$score > $4" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: score not in range [%s, %s])\n' "$qnt" "$score" "$3" "$4"
return 20
fi

printf ' - %s @ %s OK\n' "$qnt" "$score"
return 0
}

check_score "rerank score 0" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 0")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
check_score "rerank score 1" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 1")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
check_score "rerank score 2" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 2")" "0.10" "0.15" | tee -a $OUT/${ci}-rk-f16.log

set +e
}

function gg_sum_rerank_tiny {
gg_printf '### %s\n\n' "${ci}"

gg_printf 'Rerank Tiny (Jina):\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-rk-f16.log)"
}

function gg_check_build_requirements {
if ! command -v cmake &> /dev/null; then
gg_printf 'cmake not found, please install'
Expand All @@ -726,15 +801,6 @@ function gg_check_build_requirements {
fi
}

function gg_sum_embd_bge_small {
gg_printf '### %s\n\n' "${ci}"

gg_printf 'BGE Small (BERT):\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
}

## main

export LLAMA_LOG_PREFIX=1
Expand Down Expand Up @@ -762,6 +828,7 @@ test $ret -eq 0 && gg_run ctest_release

if [ -z ${GG_BUILD_LOW_PERF} ]; then
test $ret -eq 0 && gg_run embd_bge_small
test $ret -eq 0 && gg_run rerank_tiny

if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
test $ret -eq 0 && gg_run test_scripts_debug
Expand Down
18 changes: 15 additions & 3 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,10 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx
params.kv_overrides.back().key[0] = 0;
}

if (params.reranking && params.embedding) {
throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
}

return true;
}

Expand Down Expand Up @@ -391,7 +395,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
[](gpt_params & params) {
params.verbose_prompt = true;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
));
add_opt(llama_arg(
{"--no-display-prompt"},
format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
Expand Down Expand Up @@ -1093,13 +1097,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
}
).set_sparam());
add_opt(llama_arg(
{"--pooling"}, "{none,mean,cls,last}",
{"--pooling"}, "{none,mean,cls,last,rank}",
"pooling type for embeddings, use model default if unspecified",
[](gpt_params & params, const std::string & value) {
/**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
else if (value == "rank") { params.pooling_type = LLAMA_POOLING_TYPE_RANK; }
else { throw std::invalid_argument("invalid value"); }
}
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
Expand Down Expand Up @@ -1749,6 +1754,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
params.embedding = true;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
add_opt(llama_arg(
{"--reranking", "--rerank"},
format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"),
[](gpt_params & params) {
params.reranking = true;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
add_opt(llama_arg(
{"--api-key"}, "KEY",
"API key to use for authentication (default: none)",
Expand Down
7 changes: 7 additions & 0 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1023,6 +1023,11 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
cparams.flash_attn = params.flash_attn;
cparams.no_perf = params.no_perf;

if (params.reranking) {
cparams.embeddings = true;
cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
}

cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
cparams.type_v = kv_cache_type_from_str(params.cache_type_v);

Expand Down Expand Up @@ -1432,6 +1437,8 @@ void llama_batch_add(
llama_pos pos,
const std::vector<llama_seq_id> & seq_ids,
bool logits) {
GGML_ASSERT(batch.seq_id[batch.n_tokens] && "llama_batch size exceeded");

batch.token [batch.n_tokens] = id;
batch.pos [batch.n_tokens] = pos;
batch.n_seq_id[batch.n_tokens] = seq_ids.size();
Expand Down
1 change: 1 addition & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,7 @@ struct gpt_params {
int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
std::string embd_sep = "\n"; // separator of embendings
bool reranking = false; // enable reranking support on server

// server params
int32_t port = 8080; // server listens on this network port
Expand Down
3 changes: 3 additions & 0 deletions common/console.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,9 @@ namespace console {
simple_io = true;
}
}
if (simple_io) {
_setmode(_fileno(stdin), _O_U8TEXT);
}
#else
// POSIX-specific console initialization
if (!simple_io) {
Expand Down
Loading

0 comments on commit a063017

Please sign in to comment.