From cf8e0a3bb9c0e93e371773b282054cdbbb231038 Mon Sep 17 00:00:00 2001
From: R0CKSTAR <xiaodong.ye@mthreads.com>
Date: Fri, 11 Oct 2024 02:10:37 +0800
Subject: [PATCH] musa: add docker image support (#9685)

* mtgpu: add docker image support

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>

* mtgpu: enable docker workflow

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>

---------

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
---
 .devops/full-musa.Dockerfile         | 26 +++++++++++++++++++
 .devops/llama-cli-musa.Dockerfile    | 30 +++++++++++++++++++++
 .devops/llama-server-musa.Dockerfile | 35 +++++++++++++++++++++++++
 .github/workflows/docker.yml         |  3 +++
 docs/docker.md                       | 39 +++++++++++++++++++++++++++-
 ggml/src/CMakeLists.txt              |  4 +--
 6 files changed, 134 insertions(+), 3 deletions(-)
 create mode 100644 .devops/full-musa.Dockerfile
 create mode 100644 .devops/llama-cli-musa.Dockerfile
 create mode 100644 .devops/llama-server-musa.Dockerfile

diff --git a/.devops/full-musa.Dockerfile b/.devops/full-musa.Dockerfile
new file mode 100644
index 0000000000000..34ba856d3d1ca
--- /dev/null
+++ b/.devops/full-musa.Dockerfile
@@ -0,0 +1,26 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG MUSA_VERSION=rc3.1.0
+# Target the MUSA build image
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_MUSA_DEV_CONTAINER} AS build
+
+RUN apt-get update && \
+    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
+
+COPY requirements.txt   requirements.txt
+COPY requirements       requirements
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release -j$(nproc) && \
+    cp build/bin/* .
+
+ENTRYPOINT ["/app/.devops/tools.sh"]
diff --git a/.devops/llama-cli-musa.Dockerfile b/.devops/llama-cli-musa.Dockerfile
new file mode 100644
index 0000000000000..b5696794f1a56
--- /dev/null
+++ b/.devops/llama-cli-musa.Dockerfile
@@ -0,0 +1,30 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG MUSA_VERSION=rc3.1.0
+# Target the MUSA build image
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+# Target the MUSA runtime image
+ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_MUSA_DEV_CONTAINER} AS build
+
+RUN apt-get update && \
+    apt-get install -y build-essential git cmake
+
+WORKDIR /app
+
+COPY . .
+
+RUN cmake -B build -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-cli -j$(nproc)
+
+FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
+
+RUN apt-get update && \
+    apt-get install -y libgomp1
+
+COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
+COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/build/bin/llama-cli /llama-cli
+
+ENTRYPOINT [ "/llama-cli" ]
diff --git a/.devops/llama-server-musa.Dockerfile b/.devops/llama-server-musa.Dockerfile
new file mode 100644
index 0000000000000..193a6d77cb9ed
--- /dev/null
+++ b/.devops/llama-server-musa.Dockerfile
@@ -0,0 +1,35 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG MUSA_VERSION=rc3.1.0
+# Target the MUSA build image
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+# Target the MUSA runtime image
+ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_MUSA_DEV_CONTAINER} AS build
+
+RUN apt-get update && \
+    apt-get install -y build-essential git cmake libcurl4-openssl-dev
+
+WORKDIR /app
+
+COPY . .
+
+RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-server -j$(nproc)
+
+FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
+
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev libgomp1 curl
+
+COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
+COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/build/bin/llama-server /llama-server
+
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/llama-server" ]
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index a4ac9b21792e0..a953cdac907ae 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -43,6 +43,9 @@ jobs:
           - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
           - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
           - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "light-musa", dockerfile: ".devops/llama-cli-musa.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "server-musa", dockerfile: ".devops/llama-server-musa.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "full-musa", dockerfile: ".devops/full-musa.Dockerfile", platforms: "linux/amd64" }
           # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
           #- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
           #- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
diff --git a/docs/docker.md b/docs/docker.md
index e8a084173e87e..8d90e6ded5738 100644
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -19,8 +19,11 @@ Additionally, there the following images, similar to the above:
 - `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
 - `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
 - `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
+- `ghcr.io/ggerganov/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggerganov/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggerganov/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`)
 
-The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
+The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library, you'll need to build the images locally for now).
 
 ## Usage
 
@@ -84,3 +87,37 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run
 docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
 docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
 ```
+
+## Docker With MUSA
+
+Assuming one has the [mt-container-toolkit](https://developer.mthreads.com/musa/native) properly installed on Linux, `muBLAS` should be accessible inside the container.
+
+## Building Docker locally
+
+```bash
+docker build -t local/llama.cpp:full-musa -f .devops/full-musa.Dockerfile .
+docker build -t local/llama.cpp:light-musa -f .devops/llama-cli-musa.Dockerfile .
+docker build -t local/llama.cpp:server-musa -f .devops/llama-server-musa.Dockerfile .
+```
+
+You may want to pass in some different `ARGS`, depending on the MUSA environment supported by your container host, as well as the GPU architecture.
+
+The defaults are:
+
+- `MUSA_VERSION` set to `rc3.1.0`
+
+The resulting images, are essentially the same as the non-MUSA images:
+
+1. `local/llama.cpp:full-musa`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
+2. `local/llama.cpp:light-musa`: This image only includes the main executable file.
+3. `local/llama.cpp:server-musa`: This image only includes the server executable file.
+
+## Usage
+
+After building locally, Usage is similar to the non-MUSA examples, but you'll need to set `mthreads` as default Docker runtime. This can be done by executing `(cd /usr/bin/musa && sudo ./docker setup $PWD)` and verifying the changes by executing `docker info | grep mthreads` on the host machine. You will also want to use the `--n-gpu-layers` flag.
+
+```bash
+docker run -v /path/to/models:/models local/llama.cpp:full-musa --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run -v /path/to/models:/models local/llama.cpp:light-musa -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
+```
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index f126ebf7e9282..676f85a369bc6 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -163,8 +163,8 @@ if (GGML_OPENMP)
         list(APPEND GGML_EXTRA_LIBS_PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
 
         if (GGML_MUSA)
-            list(APPEND GGML_EXTRA_INCLUDES     "/usr/lib/llvm-10/include/openmp")
-            list(APPEND GGML_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-10/lib/libomp.so")
+            list(APPEND GGML_EXTRA_INCLUDES     "/usr/lib/llvm-14/lib/clang/14.0.0/include")
+            list(APPEND GGML_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-14/lib/libomp.so")
         endif()
     else()
         message(WARNING "OpenMP not found")