diff --git a/CMakeLists.txt b/CMakeLists.txt index 8c04e4c19c5d7..427015be52f69 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -809,9 +809,9 @@ if (LLAMA_CCACHE) if (LLAMA_CCACHE_FOUND) set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) set(ENV{CCACHE_SLOPPINESS} time_macros) - message(STATUS "Using ccache") + message(STATUS "ccache found, compilation results will be cached. Disable with LLAMA_CCACHE=OFF.") else() - message(STATUS "Warning: ccache not found - consider installing it or use LLAMA_CCACHE=OFF") + message(STATUS "Warning: ccache not found - consider installing it for faster compilation or disable this warning with LLAMA_CCACHE=OFF") endif () endif() diff --git a/Makefile b/Makefile index 21d5e15baa764..ba73f063709c7 100644 --- a/Makefile +++ b/Makefile @@ -112,6 +112,18 @@ MK_CXXFLAGS += -O3 MK_NVCCFLAGS += -O3 endif +ifndef LLAMA_NO_CCACHE +CCACHE := $(shell which ccache) +ifdef CCACHE +export CCACHE_SLOPPINESS = time_macros +$(info I ccache found, compilation results will be cached. Disable with LLAMA_NO_CCACHE.) +CC := $(CCACHE) $(CC) +CXX := $(CCACHE) $(CXX) +else +$(info I ccache not found. Consider installing it for faster compilation.) +endif # CCACHE +endif # LLAMA_NO_CCACHE + # clock_gettime came in POSIX.1b (1993) # CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional # posix_memalign came in POSIX.1-2001 / SUSv3 @@ -374,9 +386,9 @@ ifdef LLAMA_DEBUG MK_NVCCFLAGS += -lineinfo endif # LLAMA_DEBUG ifdef LLAMA_CUDA_NVCC - NVCC = $(LLAMA_CUDA_NVCC) + NVCC = $(CCACHE) $(LLAMA_CUDA_NVCC) else - NVCC = nvcc + NVCC = $(CCACHE) nvcc endif #LLAMA_CUDA_NVCC ifdef CUDA_DOCKER_ARCH MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH) @@ -483,7 +495,7 @@ ifdef LLAMA_HIPBLAS ROCM_PATH ?= /opt/rocm GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch) endif - HIPCC ?= $(ROCM_PATH)/bin/hipcc + HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc LLAMA_CUDA_DMMV_X ?= 32 LLAMA_CUDA_MMV_Y ?= 1 LLAMA_CUDA_KQUANTS_ITER ?= 2 @@ -607,97 +619,135 @@ libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS) clean: rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS) + find examples pocs -type f -name "*.o" -delete # # Examples # +# $< is the first prerequisite, i.e. the source file. +# Explicitly compile this to an object file so that it can be cached with ccache. +# The source file is then filtered out from $^ (the list of all prerequisites) and the object file is added instead. + +# Helper function that replaces .c, .cpp, and .cu file endings with .o: +GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1)))) + main: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @echo @echo '==== Run ./main -h for help. ====' @echo infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) tokenize: examples/tokenize/tokenize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) quantize: examples/quantize/quantize.cpp build-info.o ggml.o llama.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) - $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual + $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2) gguf: examples/gguf/gguf.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) libllava.a: examples/llava/llava.cpp examples/llava/llava.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h common/base64.hpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -Wno-cast-qual + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual + $(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp) + $(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS) baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) ifeq ($(UNAME_S),Darwin) swift: examples/batched.swift @@ -705,7 +755,7 @@ swift: examples/batched.swift endif common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh - @sh scripts/build-info.sh $(CC) > $@.tmp + @sh scripts/build-info.sh "$(CC)" > $@.tmp @if ! cmp -s $@.tmp $@; then \ mv $@.tmp $@; \ else \ @@ -722,7 +772,8 @@ build-info.o: common/build-info.cpp tests: $(TEST_TARGETS) benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) run-benchmark-matmult: benchmark-matmult ./$@ @@ -730,58 +781,76 @@ run-benchmark-matmult: benchmark-matmult .PHONY: run-benchmark-matmult swift vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o grammar-parser.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-parser.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) tests/test-grad0: tests/test-grad0.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) tests/test-opt: tests/test-opt.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) tests/test-c.o: tests/test-c.c llama.h $(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@ tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) diff --git a/README.md b/README.md index 4a9bdf314b910..bb6c49338d5d6 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml) -Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ +Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++ ### Hot topics @@ -58,18 +58,20 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ ## Description -The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quantization on a MacBook +The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide +variety of hardware - locally and in the cloud. -- Plain C/C++ implementation without dependencies -- Apple silicon first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks +- Plain C/C++ implementation without any dependencies +- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks - AVX, AVX2 and AVX512 support for x86 architectures -- Mixed F16 / F32 precision -- 2-bit, 3-bit, 4-bit, 5-bit, 6-bit and 8-bit integer quantization support -- CUDA, Metal, OpenCL, SYCL GPU backend support +- 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use +- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP) +- Vulkan, SYCL, and (partial) OpenCL backend support +- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity -The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022). -Since then, the project has improved significantly thanks to many contributions. This project is mainly for educational purposes and serves -as the main playground for developing new features for the [ggml](https://github.com/ggerganov/ggml) library. +Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has +improved significantly thanks to many contributions. It is the main playground for developing new features for the +[ggml](https://github.com/ggerganov/ggml) library. **Supported platforms:** @@ -77,11 +79,14 @@ as the main playground for developing new features for the [ggml](https://github - [X] Linux - [X] Windows (via CMake) - [X] Docker +- [X] FreeBSD **Supported models:** - [X] LLaMA 🦙 - [x] LLaMA 2 🦙🦙 +- [X] [Mistral AI v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) +- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral) - [X] Falcon - [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca) - [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all) @@ -95,7 +100,6 @@ as the main playground for developing new features for the [ggml](https://github - [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft) - [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila) - [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187) -- [X] [Mistral AI v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) - [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim) - [X] [Persimmon 8B](https://github.com/ggerganov/llama.cpp/pull/3410) - [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417) @@ -104,14 +108,14 @@ as the main playground for developing new features for the [ggml](https://github - [X] [StableLM-3b-4e1t](https://github.com/ggerganov/llama.cpp/pull/3586) - [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek) - [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen) -- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral) - [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557) - [x] [GPT-2](https://huggingface.co/gpt2) +- [x] [CodeShell](https://github.com/WisdomShell/codeshell) **Multimodal models:** -- [x] [Llava 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e) -- [x] [Bakllava](https://huggingface.co/models?search=SkunkworksAI/Bakllava) +- [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e) +- [x] [BakLLaVA](https://huggingface.co/models?search=SkunkworksAI/Bakllava) - [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5) - [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V) - [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM) @@ -136,14 +140,22 @@ as the main playground for developing new features for the [ggml](https://github **UI:** +Unless otherwise noted these projects are open-source with permissive licensing: + +- [iohub/collama](https://github.com/iohub/coLLaMA) +- [janhq/jan](https://github.com/janhq/jan) (AGPL) - [nat/openplayground](https://github.com/nat/openplayground) -- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) -- [withcatai/catai](https://github.com/withcatai/catai) -- [semperai/amica](https://github.com/semperai/amica) +- [LMStudio](https://lmstudio.ai/) (proprietary) +- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL) +- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile) +- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all) +- [ollama/ollama](https://github.com/ollama/ollama) +- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL) - [psugihara/FreeChat](https://github.com/psugihara/FreeChat) - [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal) -- [iohub/collama](https://github.com/iohub/coLLaMA) -- [pythops/tenere](https://github.com/pythops/tenere) +- [pythops/tenere](https://github.com/pythops/tenere) (AGPL) +- [semperai/amica](https://github.com/semperai/amica) +- [withcatai/catai](https://github.com/withcatai/catai) --- diff --git a/common/common.cpp b/common/common.cpp index 3302caa200459..8c1a60583f276 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -399,6 +399,18 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } sparams.penalty_present = std::stof(argv[i]); + } else if (arg == "--dynatemp-range") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.dynatemp_range = std::stof(argv[i]); + } else if (arg == "--dynatemp-exp") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.dynatemp_exponent = std::stof(argv[i]); } else if (arg == "--mirostat") { if (++i >= argc) { invalid_param = true; @@ -942,6 +954,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat); printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present); printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq); + printf(" --dynatemp-range N dynamic temperature range (default: %.1f, 0.0 = disabled)\n", (double)sparams.dynatemp_range); + printf(" --dynatemp-exp N dynamic temperature exponent (default: %.1f)\n", (double)sparams.dynatemp_exponent); printf(" --mirostat N use Mirostat sampling.\n"); printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"); printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat); diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index a6ffd128b6bbb..5e343742d2484 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1416,8 +1416,32 @@ def set_vocab(self): self.gguf_writer.add_add_space_prefix(add_prefix) special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + old_eos = special_vocab.special_token_ids["eos"] + if "chat" in os.path.basename(self.dir_model.absolute()): + # For the chat model, we replace the eos with '<|im_end|>'. + special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer) + print(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \ +in chat mode so that the conversation can end normally.") + special_vocab.add_to_gguf(self.gguf_writer) + def _try_get_sft_eos(self, tokenizer): + unused_145_list = tokenizer.encode('[UNUSED_TOKEN_145]') + im_end_list = tokenizer.encode('<|im_end|>') + assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1) + if len(unused_145_list) == 1: + eos_token = unused_145_list[0] + if len(im_end_list) == 1: + eos_token = im_end_list[0] + return eos_token + + def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + def set_gguf_parameters(self): self.gguf_writer.add_name("InternLM2") self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) @@ -1486,8 +1510,9 @@ def write_tensors(self): qkv = data_torch qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim) q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :] - q = rearrange(q, " o g n i -> o (g n i)").T - k = rearrange(k, " o g n i -> o (g n i)").T + # The model weights of q and k equire additional reshape. + q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads) + k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads) v = rearrange(v, " o g n i -> o (g n i)").T self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wq.weight", q) self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wk.weight", k) diff --git a/examples/server/README.md b/examples/server/README.md index fe934dab11010..d8e7c313e1732 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -264,7 +264,21 @@ Notice that each `probs` is an array of length `n_probs`. It also accepts all the options of `/completion` except `stream` and `prompt`. -- **GET** `/props`: Return the required assistant name and anti-prompt to generate the prompt in case you have specified a system prompt for all slots. +- **GET** `/props`: Return current server settings. + +### Result JSON + +```json +{ + "assistant_name": "", + "user_name": "", + "default_generation_settings": { ... } +} +``` + +- `assistant_name` - the required assistant name to generate the prompt in case you have specified a system prompt for all slots. +- `user_name` - the required anti-prompt to generate the prompt in case you have specified a system prompt for all slots. +- `default_generation_settings` - the default generation settings for the `/completion` endpoint, has the same fields as the `generation_settings` response object from the `/completion` endpoint. - **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint. Compared to `api_like_OAI.py` this API implementation does not require a wrapper to be served. diff --git a/examples/server/server.cpp b/examples/server/server.cpp index a9f8cb3692158..8000fee5c90d7 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -334,6 +334,7 @@ struct llama_server_context // slots / clients std::vector slots; + json default_generation_settings_for_props; llama_server_queue queue_tasks; llama_server_response queue_results; @@ -430,6 +431,9 @@ struct llama_server_context slots.push_back(slot); } + default_generation_settings_for_props = get_formated_generation(slots.front()); + default_generation_settings_for_props["seed"] = -1; + batch = llama_batch_init(n_ctx, 0, params.n_parallel); // empty system prompt @@ -2614,7 +2618,8 @@ int main(int argc, char **argv) res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); json data = { { "user_name", llama.name_user.c_str() }, - { "assistant_name", llama.name_assistant.c_str() } + { "assistant_name", llama.name_assistant.c_str() }, + { "default_generation_settings", llama.default_generation_settings_for_props } }; res.set_content(data.dump(), "application/json; charset=utf-8"); }); diff --git a/ggml-impl.h b/ggml-impl.h index 2c58075ac7c56..19df66bceee4a 100644 --- a/ggml-impl.h +++ b/ggml-impl.h @@ -19,6 +19,7 @@ extern "C" { // fall back to the _Static_assert C11 keyword. // if C99 - static_assert is noop // ref: https://stackoverflow.com/a/53923785/4039976 +#ifndef __cplusplus #ifndef static_assert #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) #define static_assert(cond, msg) _Static_assert(cond, msg) @@ -26,6 +27,7 @@ extern "C" { #define static_assert(cond, msg) struct global_scope_noop_trick #endif #endif +#endif // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 #if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)) diff --git a/ggml-quants.c b/ggml-quants.c index 8236385bce8e9..014c0525abd1b 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -9048,8 +9048,6 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict int8_t L[32]; int8_t Laux[32]; float waux[32]; - bool is_on_grid[4]; - bool is_on_grid_aux[4]; uint8_t block_signs[4]; uint32_t q2[2*(QK_K/32)]; @@ -9099,10 +9097,11 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict memset(L, 0, 32); continue; } + float scale = make_qp_quants(32, kMaxQ+1, xval, (uint8_t*)L, weight); + float eff_max = scale*kMaxQ; float best = 0; - float scale = max/(2*kMaxQ-1); - for (int is = -9; is <= 9; ++is) { - float id = (2*kMaxQ-1+is*0.1f)/max; + for (int is = -6; is <= 6; ++is) { + float id = (2*kMaxQ-1+is*0.1f)/eff_max; float this_scale = 1/id; for (int k = 0; k < 4; ++k) { for (int i = 0; i < 8; ++i) { @@ -9112,9 +9111,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict uint16_t u = 0; for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i); int grid_index = kmap_q2xs[u]; - is_on_grid_aux[k] = true; if (grid_index < 0) { - is_on_grid_aux[k] = false; const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1; grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k); } @@ -9128,16 +9125,12 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict } if (sumq2 > 0 && sumqx*sumqx > best*sumq2) { scale = sumqx/sumq2; best = scale*sumqx; - for (int i = 0; i < 32; ++i) L[i] = Laux[i]; - for (int k = 0; k < 4; ++k) is_on_grid[k] = is_on_grid_aux[k]; + memcpy(L, Laux, 32); } } - int n_not_ongrid = 0; - for (int k = 0; k < 4; ++k) if (!is_on_grid[k]) ++n_not_ongrid; - if (n_not_ongrid > 0 && scale > 0) { + if (scale > 0) { float id = 1/scale; for (int k = 0; k < 4; ++k) { - if (is_on_grid[k]) continue; uint16_t u = 0; for (int i = 0; i < 8; ++i) { int l = nearest_int(0.5f*(id*xval[8*k+i]-1)); @@ -9193,49 +9186,10 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict float d = max_scale/31; y[ibl].d = GGML_FP32_TO_FP16(d); float id = 1/d; - float sumqx = 0, sumq2 = 0; for (int ib = 0; ib < QK_K/32; ++ib) { int l = nearest_int(0.5f*(id*scales[ib]-1)); l = MAX(0, MIN(15, l)); q2[2*ib+1] |= ((uint32_t)l << 28); - const float * xb = xbl + 32*ib; - const float * qw = quant_weights + QK_K*ibl + 32*ib; - for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]); - const uint8_t * aux8 = (const uint8_t *)(q2 + 2*ib); - const float db = d * (1 + 2*l); - uint32_t u = 0; - for (int k = 0; k < 4; ++k) { - const int8_t * signs = keven_signs_q2xs + 8*((q2[2*ib+1] >> 7*k) & 127); - const float * xk = xb + 8*k; - const float * wk = weight + 8*k; - const uint8_t * grid = (const uint8_t *)(kgrid_q2xs + aux8[k]); - float best_mse = 0; int best_index = aux8[k]; - for (int j = 0; j < 8; ++j) { - float diff = db * grid[j] * signs[j] - xk[j]; - best_mse += wk[j] * diff * diff; - } - for (int idx = 0; idx < 256; ++idx) { - grid = (const uint8_t *)(kgrid_q2xs + idx); - float mse = 0; - for (int j = 0; j < 8; ++j) { - float diff = db * grid[j] * signs[j] - xk[j]; - mse += wk[j] * diff * diff; - } - if (mse < best_mse) { - best_mse = mse; best_index = idx; - } - } - u |= (best_index << 8*k); - grid = (const uint8_t *)(kgrid_q2xs + best_index); - //grid = (const uint8_t *)(kgrid_q2xs + aux8[k]); - for (int j = 0; j < 8; ++j) { - float q = db * grid[j] * signs[j]; - sumqx += wk[j] * q * xk[j]; - sumq2 += wk[j] * q * q; - } - } - q2[2*ib] = u; - if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(d*sumqx/sumq2); } memcpy(y[ibl].qs, q2, QK_K/4); } diff --git a/ggml-quants.h b/ggml-quants.h index 5c9f63bd9b102..bfdf3c99718ee 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -191,70 +191,74 @@ typedef struct { } block_iq3_xxs; static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding"); +#ifdef __cplusplus +extern "C" { +#endif + // Quantization -void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k); -void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k); -void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k); -void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k); -void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k); -void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k); - -void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k); -void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k); -void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k); -void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k); -void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k); -void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k); -void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k); - -void quantize_row_q4_0(const float * restrict x, void * restrict y, int k); -void quantize_row_q4_1(const float * restrict x, void * restrict y, int k); -void quantize_row_q5_0(const float * restrict x, void * restrict y, int k); -void quantize_row_q5_1(const float * restrict x, void * restrict y, int k); -void quantize_row_q8_0(const float * restrict x, void * restrict y, int k); -void quantize_row_q8_1(const float * restrict x, void * restrict y, int k); - -void quantize_row_q2_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q3_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q4_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q5_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q6_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q8_K(const float * restrict x, void * restrict y, int k); -void quantize_row_iq3_xxs(const float * restrict x, void * restrict y, int k); +void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int k); +void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int k); +void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int k); +void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int k); +void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int k); +void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int k); + +void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k); +void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int k); +void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int k); +void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k); +void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k); +void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k); +void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k); + +void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); + +void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); // Dequantization -void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k); -void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k); -void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k); -void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k); -void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k); -//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k); - -void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k); -void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k); -void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k); -void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k); -void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k); -void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k); -void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k); -void dequantize_row_iq2_xs (const block_iq2_xs * restrict x, float * restrict y, int k); -void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k); +void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); + +void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); // Dot product -void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); - -void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); +void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); +void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); +void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); + +void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); +void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); +void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); +void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); +void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); +void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); +void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); +void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); // // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization") @@ -276,3 +280,8 @@ void iq2xs_init_impl(int grid_size); void iq2xs_free_impl(int grid_size); void iq3xs_init_impl(int grid_size); void iq3xs_free_impl(int grid_size); + +#ifdef __cplusplus +} +#endif + diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 51445b5e7eaaf..a03df4c654303 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -7693,6 +7693,13 @@ static void cpy_1_f16_f16(const char * cxi, char * cdsti) { *dsti = *xi; } +static void cpy_1_f16_f32(const char * cxi, char * cdsti) { + const sycl::half *xi = (const sycl::half *)cxi; + float *dsti = (float *)cdsti; + + *dsti = *xi; +} + static void cpy_1_i16_i16(const char * cxi, char * cdsti) { const int16_t *xi = (const int16_t *)cxi; int16_t *dsti = (int16_t *)cdsti; @@ -7709,9 +7716,9 @@ static void cpy_1_i32_i32(const char * cxi, char * cdsti) { template static void cpy_f32_f16(const char * cx, char * cdst, const int ne, - const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, - const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, - const sycl::nd_item<3> &item_ct1) { + const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, const sycl::nd_item<3> &item_ct1) { const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2); @@ -7721,15 +7728,17 @@ static void cpy_f32_f16(const char * cx, char * cdst, const int ne, // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor // then combine those indices with the corresponding byte offsets to get the total offsets - const int i02 = i / (ne00*ne01); - const int i01 = (i - i02*ne01*ne00) / ne00; - const int i00 = i - i02*ne01*ne00 - i01*ne00; - const int x_offset = i00*nb00 + i01*nb01 + i02*nb02; - - const int i12 = i / (ne10*ne11); - const int i11 = (i - i12*ne10*ne11) / ne10; - const int i10 = i - i12*ne10*ne11 - i11*ne10; - const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12; + const int i03 = i/(ne00 * ne01 * ne02); + const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01); + const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00; + const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00; + const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03; + + const int i13 = i/(ne10 * ne11 * ne12); + const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11); + const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10; + const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; + const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13; cpy_1(cx + x_offset, cdst + dst_offset); } @@ -7823,9 +7832,9 @@ static void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) { template static void cpy_f32_q(const char * cx, char * cdst, const int ne, - const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, - const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, - const sycl::nd_item<3> &item_ct1) { + const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, const sycl::nd_item<3> &item_ct1) { const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2)) * qk; @@ -7834,15 +7843,17 @@ static void cpy_f32_q(const char * cx, char * cdst, const int ne, return; } - const int i02 = i / (ne00*ne01); - const int i01 = (i - i02*ne01*ne00) / ne00; - const int i00 = (i - i02*ne01*ne00 - i01*ne00); - const int x_offset = i00*nb00 + i01*nb01 + i02*nb02; + const int i03 = i/(ne00 * ne01 * ne02); + const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01); + const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00; + const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00; + const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03; - const int i12 = i / (ne10*ne11); - const int i11 = (i - i12*ne10*ne11) / ne10; - const int i10 = (i - i12*ne10*ne11 - i11*ne10)/qk; - const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12; + const int i13 = i/(ne10 * ne11 * ne12); + const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11); + const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10; + const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; + const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13; cpy_blck(cx + x_offset, cdst + dst_offset); } @@ -10599,10 +10610,12 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl( static void ggml_cpy_f32_f32_sycl(const char *cx, char *cdst, const int ne, const int ne00, const int ne01, - const int nb00, const int nb01, - const int nb02, const int ne10, - const int ne11, const int nb10, - const int nb11, const int nb12, + const int ne02, const int nb00, + const int nb01, const int nb02, + const int nb03, const int ne10, + const int ne11, const int ne12, + const int nb10, const int nb11, + const int nb12, const int nb13, dpct::queue_ptr stream) { const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; @@ -10615,8 +10628,8 @@ static void ggml_cpy_f32_f32_sycl(const char *cx, char *cdst, const int ne, sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) { - cpy_f32_f16(cx, cdst, ne, ne00, ne01, nb00, nb01, - nb02, ne10, ne11, nb10, nb11, nb12, + cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, + nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); }); } @@ -10624,10 +10637,12 @@ static void ggml_cpy_f32_f32_sycl(const char *cx, char *cdst, const int ne, static void ggml_cpy_f32_f16_sycl(const char *cx, char *cdst, const int ne, const int ne00, const int ne01, - const int nb00, const int nb01, - const int nb02, const int ne10, - const int ne11, const int nb10, - const int nb11, const int nb12, + const int ne02, const int nb00, + const int nb01, const int nb02, + const int nb03, const int ne10, + const int ne11, const int ne12, + const int nb10, const int nb11, + const int nb12, const int nb13, dpct::queue_ptr stream) { const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; @@ -10640,8 +10655,8 @@ static void ggml_cpy_f32_f16_sycl(const char *cx, char *cdst, const int ne, sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) { - cpy_f32_f16(cx, cdst, ne, ne00, ne01, nb00, nb01, - nb02, ne10, ne11, nb10, nb11, nb12, + cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, + nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); }); } @@ -10649,10 +10664,12 @@ static void ggml_cpy_f32_f16_sycl(const char *cx, char *cdst, const int ne, static void ggml_cpy_f32_q8_0_sycl(const char *cx, char *cdst, const int ne, const int ne00, const int ne01, - const int nb00, const int nb01, - const int nb02, const int ne10, - const int ne11, const int nb10, - const int nb11, const int nb12, + const int ne02, const int nb00, + const int nb01, const int nb02, + const int nb03, const int ne10, + const int ne11, const int ne12, + const int nb10, const int nb11, + const int nb12, const int nb13, dpct::queue_ptr stream) { GGML_ASSERT(ne % QK8_0 == 0); @@ -10661,17 +10678,20 @@ static void ggml_cpy_f32_q8_0_sycl(const char *cx, char *cdst, const int ne, sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) { cpy_f32_q( - cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, - ne10, ne11, nb10, nb11, nb12, item_ct1); + cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, + nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, + item_ct1); }); } static void ggml_cpy_f32_q4_0_sycl(const char *cx, char *cdst, const int ne, const int ne00, const int ne01, - const int nb00, const int nb01, - const int nb02, const int ne10, - const int ne11, const int nb10, - const int nb11, const int nb12, + const int ne02, const int nb00, + const int nb01, const int nb02, + const int nb03, const int ne10, + const int ne11, const int ne12, + const int nb10, const int nb11, + const int nb12, const int nb13, dpct::queue_ptr stream) { GGML_ASSERT(ne % QK4_0 == 0); @@ -10680,17 +10700,20 @@ static void ggml_cpy_f32_q4_0_sycl(const char *cx, char *cdst, const int ne, sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) { cpy_f32_q( - cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, - ne10, ne11, nb10, nb11, nb12, item_ct1); + cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, + nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, + item_ct1); }); } static void ggml_cpy_f32_q4_1_sycl(const char *cx, char *cdst, const int ne, const int ne00, const int ne01, - const int nb00, const int nb01, - const int nb02, const int ne10, - const int ne11, const int nb10, - const int nb11, const int nb12, + const int ne02, const int nb00, + const int nb01, const int nb02, + const int nb03, const int ne10, + const int ne11, const int ne12, + const int nb10, const int nb11, + const int nb12, const int nb13, dpct::queue_ptr stream) { GGML_ASSERT(ne % QK4_1 == 0); @@ -10699,17 +10722,20 @@ static void ggml_cpy_f32_q4_1_sycl(const char *cx, char *cdst, const int ne, sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) { cpy_f32_q( - cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, - ne10, ne11, nb10, nb11, nb12, item_ct1); + cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, + nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, + item_ct1); }); } static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne, const int ne00, const int ne01, - const int nb00, const int nb01, - const int nb02, const int ne10, - const int ne11, const int nb10, - const int nb11, const int nb12, + const int ne02, const int nb00, + const int nb01, const int nb02, + const int nb03, const int ne10, + const int ne11, const int ne12, + const int nb10, const int nb11, + const int nb12, const int nb13, dpct::queue_ptr stream) { const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; @@ -10722,8 +10748,8 @@ static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne, sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) { - cpy_f32_f16(cx, cdst, ne, ne00, ne01, nb00, nb01, - nb02, ne10, ne11, nb10, nb11, nb12, + cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, + nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); }); } @@ -10731,10 +10757,12 @@ static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne, static void ggml_cpy_i16_i16_sycl(const char *cx, char *cdst, const int ne, const int ne00, const int ne01, - const int nb00, const int nb01, - const int nb02, const int ne10, - const int ne11, const int nb10, - const int nb11, const int nb12, + const int ne02, const int nb00, + const int nb01, const int nb02, + const int nb03, const int ne10, + const int ne11, const int ne12, + const int nb10, const int nb11, + const int nb12, const int nb13, dpct::queue_ptr stream) { const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; @@ -10747,8 +10775,8 @@ static void ggml_cpy_i16_i16_sycl(const char *cx, char *cdst, const int ne, sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) { - cpy_f32_f16(cx, cdst, ne, ne00, ne01, nb00, nb01, - nb02, ne10, ne11, nb10, nb11, nb12, + cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, + nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); }); } @@ -10756,10 +10784,12 @@ static void ggml_cpy_i16_i16_sycl(const char *cx, char *cdst, const int ne, static void ggml_cpy_i32_i32_sycl(const char *cx, char *cdst, const int ne, const int ne00, const int ne01, - const int nb00, const int nb01, - const int nb02, const int ne10, - const int ne11, const int nb10, - const int nb11, const int nb12, + const int ne02, const int nb00, + const int nb01, const int nb02, + const int nb03, const int ne10, + const int ne11, const int ne12, + const int nb10, const int nb11, + const int nb12, const int nb13, dpct::queue_ptr stream) { const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; @@ -10772,8 +10802,8 @@ static void ggml_cpy_i32_i32_sycl(const char *cx, char *cdst, const int ne, sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) { - cpy_f32_f16(cx, cdst, ne, ne00, ne01, nb00, nb01, - nb02, ne10, ne11, nb10, nb11, nb12, + cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, + nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); }); } @@ -13910,19 +13940,23 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1, const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; - GGML_ASSERT(src0->ne[3] == 1); + const int64_t ne02 = src0->ne[2]; + const int64_t nb00 = src0->nb[0]; const int64_t nb01 = src0->nb[1]; const int64_t nb02 = src0->nb[2]; + const int64_t nb03 = src0->nb[3]; const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1]; - GGML_ASSERT(src1->ne[3] == 1); + const int64_t ne12 = src1->ne[2]; + const int64_t nb10 = src1->nb[0]; const int64_t nb11 = src1->nb[1]; const int64_t nb12 = src1->nb[2]; + const int64_t nb13 = src1->nb[3]; SYCL_CHECK(ggml_sycl_set_device(g_main_device)); dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0]; @@ -13934,21 +13968,21 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1, char * src1_ddc = (char *) src1_extra->data_device[g_main_device_index]; if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { - ggml_cpy_f32_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + ggml_cpy_f32_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) { - ggml_cpy_f32_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + ggml_cpy_f32_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) { - ggml_cpy_f32_q8_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + ggml_cpy_f32_q8_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) { - ggml_cpy_f32_q4_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + ggml_cpy_f32_q4_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) { - ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) { - ggml_cpy_f16_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + ggml_cpy_f16_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16) { - ggml_cpy_i16_i16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + ggml_cpy_i16_i16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) { - ggml_cpy_i32_i32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + ggml_cpy_i32_i32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } else { fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type), ggml_type_name(src1->type)); diff --git a/ggml.c b/ggml.c index ee994c8757d9d..b9ec0c981b630 100644 --- a/ggml.c +++ b/ggml.c @@ -2470,7 +2470,8 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) { size_t max_size = 0; for (struct ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor != NULL; tensor = ggml_get_next_tensor(ctx, tensor)) { - max_size = MAX(max_size, ggml_nbytes(tensor)); + size_t bytes = ggml_nbytes(tensor); + max_size = MAX(max_size, bytes); } return max_size; @@ -11887,8 +11888,10 @@ GGML_CALL void ggml_rope_yarn_corr_dims( int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2] ) { // start and end correction dims - dims[0] = MAX(0, floorf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base))); - dims[1] = MIN(n_dims - 1, ceilf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base))); + float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base)); + float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base)); + dims[0] = MAX(0, start); + dims[1] = MIN(n_dims - 1, end); } static void ggml_compute_forward_rope_f32( diff --git a/llama.cpp b/llama.cpp index 4787a92fea030..65e399adca60e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -9456,8 +9456,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) { new_type = GGML_TYPE_Q4_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && qs.model.hparams.n_gqa() >= 4) { - new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { + new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_Q3_K : GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; @@ -9496,9 +9496,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) { if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K; } - //else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { - // if (i_layer < n_layer/8) new_type = GGML_TYPE_Q5_K; - //} + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) { + new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K diff --git a/scripts/server-llm.sh b/scripts/server-llm.sh index 0b83cdbbca821..30bbac3215f96 100644 --- a/scripts/server-llm.sh +++ b/scripts/server-llm.sh @@ -14,16 +14,17 @@ # - Might be unstable! # # Usage: -# ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] +# ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] [-non-interactive] # -# --port: port number, default is 8888 -# --repo: path to a repo containing GGUF model files -# --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input -# --backend: cpu, cuda, metal, opencl, depends on the OS -# --gpu-id: gpu id, default is 0 -# --n-parallel: number of parallel requests, default is 8 -# --n-kv: KV cache size, default is 4096 -# --verbose: verbose output +# --port: port number, default is 8888 +# --repo: path to a repo containing GGUF model files +# --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input +# --backend: cpu, cuda, metal, opencl, depends on the OS +# --gpu-id: gpu id, default is 0 +# --n-parallel: number of parallel requests, default is 8 +# --n-kv: KV cache size, default is 4096 +# --verbose: verbose output +# --non-interactive: run without asking a permission to run # # Example: # @@ -47,6 +48,7 @@ if ! command -v make &> /dev/null; then fi # parse arguments +is_interactive=1 port=8888 repo="" wtype="" @@ -66,15 +68,16 @@ verbose=0 function print_usage { printf "Usage:\n" - printf " ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]\n\n" - printf " --port: port number, default is 8888\n" - printf " --repo: path to a repo containing GGUF model files\n" - printf " --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input\n" - printf " --backend: cpu, cuda, metal, opencl, depends on the OS\n" - printf " --gpu-id: gpu id, default is 0\n" - printf " --n-parallel: number of parallel requests, default is 8\n" - printf " --n-kv: KV cache size, default is 4096\n" - printf " --verbose: verbose output\n\n" + printf " ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] [-non-interactive]\n\n" + printf " --port: port number, default is 8888\n" + printf " --repo: path to a repo containing GGUF model files\n" + printf " --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input\n" + printf " --backend: cpu, cuda, metal, opencl, depends on the OS\n" + printf " --gpu-id: gpu id, default is 0\n" + printf " --n-parallel: number of parallel requests, default is 8\n" + printf " --n-kv: KV cache size, default is 4096\n" + printf " --verbose: verbose output\n\n" + printf " --non-interactive: run without asking a permission to run\n" printf "Example:\n\n" printf ' bash -c "$(curl -s https://ggml.ai/server-llm.sh)"\n\n' } @@ -82,6 +85,10 @@ function print_usage { while [[ $# -gt 0 ]]; do key="$1" case $key in + --non-interactive) + is_interactive=0 + shift + ;; --port) port="$2" shift @@ -176,31 +183,32 @@ repos=( "https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF" "https://huggingface.co/TheBloke/CausalLM-7B-GGUF" ) +if [ $is_interactive -eq 1 ]; then + printf "\n" + printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n" + printf " Based on the options that follow, the script might download a model file\n" + printf " from the internet, which can be a few GBs in size. The script will also\n" + printf " build the latest llama.cpp source code from GitHub, which can be unstable.\n" + printf "\n" + printf " Upon success, an HTTP server will be started and it will serve the selected\n" + printf " model using llama.cpp for demonstration purposes.\n" + printf "\n" + printf " Please note:\n" + printf "\n" + printf " - All new data will be stored in the current folder\n" + printf " - The server will be listening on all network interfaces\n" + printf " - The server will run with default settings which are not always optimal\n" + printf " - Do not judge the quality of a model based on the results from this script\n" + printf " - Do not use this script to benchmark llama.cpp\n" + printf " - Do not use this script in production\n" + printf " - This script is only for demonstration purposes\n" + printf "\n" + printf " If you don't know what you are doing, please press Ctrl-C to abort now\n" + printf "\n" + printf " Press Enter to continue ...\n\n" -printf "\n" -printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n" -printf " Based on the options that follow, the script might download a model file\n" -printf " from the internet, which can be a few GBs in size. The script will also\n" -printf " build the latest llama.cpp source code from GitHub, which can be unstable.\n" -printf "\n" -printf " Upon success, an HTTP server will be started and it will serve the selected\n" -printf " model using llama.cpp for demonstration purposes.\n" -printf "\n" -printf " Please note:\n" -printf "\n" -printf " - All new data will be stored in the current folder\n" -printf " - The server will be listening on all network interfaces\n" -printf " - The server will run with default settings which are not always optimal\n" -printf " - Do not judge the quality of a model based on the results from this script\n" -printf " - Do not use this script to benchmark llama.cpp\n" -printf " - Do not use this script in production\n" -printf " - This script is only for demonstration purposes\n" -printf "\n" -printf " If you don't know what you are doing, please press Ctrl-C to abort now\n" -printf "\n" -printf " Press Enter to continue ...\n\n" - -read + read +fi if [[ -z "$repo" ]]; then printf "[+] No repo provided from the command line\n"