diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
new file mode 100644
index 0000000..cab6b7d
--- /dev/null
+++ b/.github/workflows/build.yml
@@ -0,0 +1,150 @@
+name: Build
+
+permissions:
+  contents: write
+
+on:
+  push:
+    tags:
+      - v*
+  workflow_dispatch:
+
+jobs:
+  build_cu121:
+    name: Build Shared Library (CUDA 12.1)
+    runs-on: ubuntu-latest
+    container:
+      image: pytorch/manylinux2_28-builder:cuda12.1
+
+    # see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Set up Environments
+        run: |
+          echo "PYTHON_VERSION=$(python3 -c "from os import environ as env; print({'3.7': 'cp37-cp37m', '3.8': 'cp38-cp38', '3.9': 'cp39-cp39', '3.10': 'cp310-cp310', '3.11': 'cp311-cp311', '3.12': 'cp312-cp312' }['${{ matrix.python-version }}'])")" >> $GITHUB_ENV
+          echo "PATH=$PATH" >> $GITHUB_ENV
+
+      - name: Set up Packages
+        env:
+          PYBIN: /opt/python/${{ env.PYTHON_VERSION }}/bin
+        run: |
+          dnf -y install epel-release && dnf upgrade
+          dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo && dnf makecache
+          dnf module install -y nvidia-driver:latest && dnf clean all
+          yum install -y pigz clang-tools-extra openmpi openmpi-devel spdlog spdlog-devel libibverbs rdma-core-devel numactl numactl-devel numactl-libs
+          pwd && ls -la
+          sed -i '256d' 3rdparty/mscclpp/src/bootstrap/bootstrap.cc
+          cat 3rdparty/patches/spdlog/*.patch | patch -p1 -d 3rdparty/spdlog
+          $PYBIN/pip install cmake mypy pybind11 black
+          cd pipeline/src/generate-gemm && $PYBIN/python genGEMM.py && cd ../../
+
+      - name: Build Library
+        env:
+          PATH: ${{ env.PATH }}:/usr/lib64/openmpi/bin
+          PYBIN: /opt/python/${{ env.PYTHON_VERSION }}/bin
+          PREFIX: /opt/python/${{ env.PYTHON_VERSION }};/usr/lib64/openmpi
+          pybind11_DIR: /opt/python/${{ env.PYTHON_VERSION }}/lib/python${{ matrix.python-version }}/site-packages/pybind11
+          CUDACXX: "/usr/local/cuda-12.1/bin/nvcc"
+          CUDA_HOME: "/usr/local/cuda-12.1"
+          CFLAGS: "-I/usr/include"
+          LDFLAGS: "-L/usr/lib"
+        run: |
+          cd pipeline && mkdir -p build && cd build
+          which cmake && cmake --version
+          which mpicxx && mpicxx --version
+          ${PYBIN}/cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH=${PREFIX} -DBUILD_PYTHON_BINDINGS=OFF -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON ..
+          make -j$(nproc)
+          tar -zcvf pllm_cu121_py${{ matrix.python-version }}.tgz pllm_python.* test_*
+          ls -la
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: pllm_cu121_py${{ matrix.python-version }}.tgz
+          path: |
+            pipeline/build/pllm_cu121_py${{ matrix.python-version }}.tgz
+
+      - name: Upload Release Asset
+        uses: softprops/action-gh-release@v2
+        if: startsWith(github.ref, 'refs/tags/')
+        with:
+          files: |
+            pipeline/build/pllm_cu121_py${{ matrix.python-version }}.tgz
+
+  build_cu124:
+    name: Build Shared Library (CUDA 12.4)
+    runs-on: ubuntu-latest
+    container:
+      image: pytorch/manylinux2_28-builder:cuda12.4
+
+    # see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Set up Environments
+        run: |
+          echo "PYTHON_VERSION=$(python3 -c "from os import environ as env; print({'3.7': 'cp37-cp37m', '3.8': 'cp38-cp38', '3.9': 'cp39-cp39', '3.10': 'cp310-cp310', '3.11': 'cp311-cp311', '3.12': 'cp312-cp312' }['${{ matrix.python-version }}'])")" >> $GITHUB_ENV
+          echo "PATH=$PATH" >> $GITHUB_ENV
+
+      - name: Set up Packages
+        env:
+          PYBIN: /opt/python/${{ env.PYTHON_VERSION }}/bin
+        run: |
+          dnf -y install epel-release && dnf upgrade
+          dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo && dnf makecache
+          dnf module install -y nvidia-driver:latest && dnf clean all
+          yum install -y pigz clang-tools-extra openmpi openmpi-devel spdlog spdlog-devel libibverbs rdma-core-devel numactl numactl-devel numactl-libs
+          pwd && ls -la
+          sed -i '256d' 3rdparty/mscclpp/src/bootstrap/bootstrap.cc
+          cat 3rdparty/patches/spdlog/*.patch | patch -p1 -d 3rdparty/spdlog
+          $PYBIN/pip install cmake mypy pybind11 black
+          cd pipeline/src/generate-gemm && $PYBIN/python genGEMM.py && cd ../../
+
+      - name: Build Library
+        env:
+          PATH: ${{ env.PATH }}:/usr/lib64/openmpi/bin
+          PYBIN: /opt/python/${{ env.PYTHON_VERSION }}/bin
+          PREFIX: /opt/python/${{ env.PYTHON_VERSION }};/usr/lib64/openmpi
+          pybind11_DIR: /opt/python/${{ env.PYTHON_VERSION }}/lib/python${{ matrix.python-version }}/site-packages/pybind11
+          CUDACXX: "/usr/local/cuda-12.4/bin/nvcc"
+          CUDA_HOME: "/usr/local/cuda-12.4"
+          CFLAGS: "-I/usr/include"
+          LDFLAGS: "-L/usr/lib"
+        run: |
+          cd pipeline && mkdir -p build && cd build
+          which cmake && cmake --version
+          which mpicxx && mpicxx --version
+          ${PYBIN}/cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH=${PREFIX} -DBUILD_PYTHON_BINDINGS=OFF -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON ..
+          make -j$(nproc)
+          tar -zcvf pllm_cu124_py${{ matrix.python-version }}.tgz pllm_python.* test_*
+          ls -la
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: pllm_cu124_py${{ matrix.python-version }}.tgz
+          path: |
+            pipeline/build/pllm_cu124_py${{ matrix.python-version }}.tgz
+
+      - name: Upload Release Asset
+        uses: softprops/action-gh-release@v2
+        if: startsWith(github.ref, 'refs/tags/')
+        with:
+          files: |
+            pipeline/build/pllm_cu124_py${{ matrix.python-version }}.tgz
+
diff --git a/.gitmodules b/.gitmodules
index 142dd8b..b239e5a 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,7 +1,6 @@
 [submodule "3rdparty/flashinfer"]
 	path = 3rdparty/flashinfer
 	url = https://github.com/happierpig/flashinfer-ai.git
-
 [submodule "3rdparty/nvbench"]
 	path = 3rdparty/nvbench
 	url = https://github.com/NVIDIA/nvbench.git
diff --git a/3rdparty/mscclpp b/3rdparty/mscclpp
index 7ed13ec..cdaf3ae 160000
--- a/3rdparty/mscclpp
+++ b/3rdparty/mscclpp
@@ -1 +1 @@
-Subproject commit 7ed13ec4b57a0805aadd0a3d33613569b0c03d6d
+Subproject commit cdaf3aea3d767ba65dd3b08984d76bd50615f92e
diff --git a/3rdparty/spdlog b/3rdparty/spdlog
index 238c9ff..a3a0c9d 160000
--- a/3rdparty/spdlog
+++ b/3rdparty/spdlog
@@ -1 +1 @@
-Subproject commit 238c9ffa5d1a14226eeabe10c9b63ffff3ed8b8e
+Subproject commit a3a0c9d66386962fcaf0178fcae03ac77c1e0257
diff --git a/pipeline/CMakeLists.txt b/pipeline/CMakeLists.txt
index c7f26ef..c8a1608 100644
--- a/pipeline/CMakeLists.txt
+++ b/pipeline/CMakeLists.txt
@@ -15,7 +15,7 @@ set(NVBENCH_DIR ${PARENT_DIR}/3rdparty/nvbench CACHE PATH "Path to nvbench src")
 set(GTEST_DIR ${PARENT_DIR}/3rdparty/googletest CACHE PATH "Path to googletest src")
 set(CUTLASS_DIR ${PARENT_DIR}/3rdparty/cutlass CACHE PATH "Path to modified cutlass src")
 set(SPDLOG_DIR ${PARENT_DIR}/3rdparty/spdlog CACHE PATH "Path to spdlog src")
-set(MSCCLPP_DIR /usr/local/mscclpp CACHE PATH "Path to mscclpp install")
+set(MSCCLPP_DIR ${PARENT_DIR}/3rdparty/mscclpp CACHE PATH "Path to mscclpp install")
 set(FLASHINFER_DIR ${PARENT_DIR}/3rdparty/flashinfer CACHE PATH "Path to flashinfer src") # override by SMALL_FLASHINFER
 #set(FLASHINFER_DIR ${PARENT_DIR}/3rdparty/flashinfer CACHE PATH "Path to flashinfer src")
@@ -36,6 +36,7 @@ if (NOT MPI_FOUND)
 endif()
 
 add_subdirectory(${SPDLOG_DIR} spdlog)
+add_subdirectory(${MSCCLPP_DIR} mscclpp)
 
 find_package(pybind11 REQUIRED)
 if (NOT pybind11_FOUND)
@@ -45,7 +46,6 @@ endif()
 
 find_program(STUBGEN_EXECUTABLE NAMES stubgen)
 add_compile_options(-w)
-include_directories(${MSCCLPP_DIR}/include)
 
 include_directories(${CUTLASS_DIR}/include)
 include_directories(${CUTLASS_DIR}/tools/util/include)
@@ -53,13 +53,11 @@ include_directories(${CUTLASS_DIR}/tools/util/include)
 #include_directories(${FLASHINFER_DIR}/include)
 
 # ------------- Build Network Test -----------------#
-find_library(MSCCLPP_LIBRARY NAMES mscclpp PATHS ${MSCCLPP_DIR}/lib NO_DEFAULT_PATH)
-
 add_executable(test_comm "${CMAKE_SOURCE_DIR}/src/comm_test.cu" "${CMAKE_SOURCE_DIR}/src/comm.cu")
 target_include_directories(test_comm PRIVATE ${CMAKE_SOURCE_DIR}/include)
 target_include_directories(test_comm PRIVATE ${PARENT_DIR}/3rdparty/cutlass/include)
 target_link_libraries(test_comm PRIVATE MPI::MPI_CXX)
-target_link_libraries(test_comm PRIVATE ${MSCCLPP_LIBRARY})
+target_link_libraries(test_comm PRIVATE mscclpp_static)
 target_link_libraries(test_comm PRIVATE spdlog::spdlog)
 target_compile_definitions(test_comm PRIVATE -DENABLE_MPI)
@@ -193,7 +191,7 @@ target_include_directories(shared_lib PRIVATE ${PARENT_DIR}/3rdparty/cutlass/inc
 target_include_directories(shared_lib PRIVATE ${PARENT_DIR}/3rdparty/cutlass/tools/util/include)
 target_include_directories(shared_lib PRIVATE ${SMALL_FLASHINFER_DIR}/include ${FLASHINFER_DIR}/include)
 target_link_libraries(shared_lib PRIVATE decode_kernels prefill_kernels)
-target_link_libraries(shared_lib PRIVATE ${MSCCLPP_LIBRARY})
+target_link_libraries(shared_lib PRIVATE mscclpp_static)
 target_link_libraries(shared_lib PRIVATE MPI::MPI_CXX)
 target_link_libraries(shared_lib PRIVATE spdlog::spdlog)
 target_link_libraries(shared_lib PRIVATE gemm_lib)
@@ -210,7 +208,7 @@ target_include_directories(test_compute PRIVATE ${CMAKE_SOURCE_DIR}/include)
 target_include_directories(test_compute PRIVATE ${SMALL_FLASHINFER_DIR}/include ${FLASHINFER_DIR}/include)
 target_link_libraries(test_compute PRIVATE decode_kernels prefill_kernels)
 # Include Network Libraries
-target_link_libraries(test_compute PRIVATE ${MSCCLPP_LIBRARY})
+target_link_libraries(test_compute PRIVATE mscclpp_static)
 target_link_libraries(test_compute PRIVATE MPI::MPI_CXX)
 # Include Log Libraries
 target_link_libraries(test_compute PRIVATE spdlog::spdlog)
@@ -229,7 +227,7 @@ target_include_directories(pllm_python PRIVATE ${CMAKE_SOURCE_DIR}/include)
 target_include_directories(pllm_python PRIVATE ${SMALL_FLASHINFER_DIR}/include ${FLASHINFER_DIR}/include)
 target_link_libraries(pllm_python PRIVATE decode_kernels prefill_kernels)
 # Include Network Libraries
-target_link_libraries(pllm_python PRIVATE ${MSCCLPP_LIBRARY})
+target_link_libraries(pllm_python PRIVATE mscclpp_static)
 target_link_libraries(pllm_python PRIVATE MPI::MPI_CXX)
 # Include Log Libraries
 target_link_libraries(pllm_python PRIVATE spdlog::spdlog)
diff --git a/pipeline/utils/kv_cache.py b/pipeline/utils/kv_cache.py
index bcd1251..a5faa6d 100644
--- a/pipeline/utils/kv_cache.py
+++ b/pipeline/utils/kv_cache.py
@@ -14,7 +14,7 @@
 from pybindUtil import toGPU, toGPUTensor
 from torch.profiler import profile, record_function, ProfilerActivity
 import time
-
+from typing import List
 
 class DistKVPool:
@@ -73,7 +73,7 @@ def __init__(self, pool: DistKVPool):
         self._pool = pool
-        self._indices : list[int] = []
+        self._indices : List[int] = []
         self._seqlen : int = 0
 
     @property
@@ -81,7 +81,7 @@ def seqlen(self) -> int:
         return self._seqlen
 
     @property
-    def indicies(self) -> list[int]:
+    def indicies(self) -> List[int]:
         return self._indices
 
     @property
@@ -116,11 +116,11 @@ def __init__(self, decode_kvs: Sequence[DistKVCache], prefill_kvs: Sequence[Dist
         """
         # batch_size = len(decode_kvs) + len(prefill_kvs)
         # [batch_size + 1,]
-        self._kv_indptr : list[int] = [0]
+        self._kv_indptr : List[int] = [0]
         # [num_pages_in_total, ]
-        self._kv_indices : list[int] = []
+        self._kv_indices : List[int] = []
         # [batch_size, ]
-        self._kv_last_page_len : list[int] = []
+        self._kv_last_page_len : List[int] = []
 
         # Here we do not materialize data into specific devices,
         # for distributed assignment.
diff --git a/pipeline/utils/pybindUtil.py b/pipeline/utils/pybindUtil.py
index 4379580..581d8ac 100644
--- a/pipeline/utils/pybindUtil.py
+++ b/pipeline/utils/pybindUtil.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pllm_python
 from torch.profiler import profile, record_function, ProfilerActivity
+from typing import List
 
 
 def toGPU(data, nranks, dtype, non_blocking=True):
@@ -88,8 +89,8 @@ def initUpdateData(
     kv_last_page_len: int,
     rev_input_indptr: int,
     per_token_offset: int,
-    gemv_batch_size: list[int],
-    gemv_block_num: list[int] ) -> list[pllm_python.VortexUpdateData]:
+    gemv_batch_size: List[int],
+    gemv_block_num: List[int] ) -> List[pllm_python.VortexUpdateData]:
     updateDataList = []
     gemv_batch_size = np.array(gemv_batch_size, dtype=np.int32)
     gemv_block_num = np.array(gemv_block_num, dtype=np.int32)
@@ -118,4 +119,4 @@ def initUpdateData(
 
     print("Time taken: ", time.time() - t)
 
-
\ No newline at end of file
+
diff --git a/pipeline/utils/request_info.py b/pipeline/utils/request_info.py
index 4413d1b..c42dd39 100644
--- a/pipeline/utils/request_info.py
+++ b/pipeline/utils/request_info.py
@@ -1,11 +1,13 @@
 from collections import deque
+from typing import List
+
 
 class NewRequestInfo:
     """
    Request info for incoming request
    NOTE (Yilong): add support for offloading / onloading KV-Cache
    """
     req_idx: int
-    prompt: list[int]
+    prompt: List[int]
     output_len : int
     start_time: float
@@ -36,7 +38,7 @@ class FlyRequestInfo:
    NOTE (Yilong): add support for offloading / onloading KV-Cache
    """
 
-    def __init__(self, req_idx: int, input: list[int], output: list[int], prompt: list[int], request_comein_time: float,
+    def __init__(self, req_idx: int, input: List[int], output: List[int], prompt: List[int], request_comein_time: float,
                  chunked_prefill: bool, kv_cache, encode_latency: float, decode_start_at: float, decode_latency: float,
                  output_len: int, input_len: int):
         self.req_idx = req_idx
diff --git a/pipeline/utils/scheduler.py b/pipeline/utils/scheduler.py
index 44baf8a..291edc0 100644
--- a/pipeline/utils/scheduler.py
+++ b/pipeline/utils/scheduler.py
@@ -13,6 +13,7 @@
 import pickle
 
 from collections import deque
+from typing import List
 
 from kv_cache import DistKVPool, DistKVCache, BatchedDistKVCache
 from request_info import NewRequestInfo, NewRequestQueue, FlyRequestInfo
@@ -38,7 +39,7 @@ class WorkingSet:
    """
 
     def __init__(self) -> None:
-        self._set : list[FlyRequestInfo] = []
+        self._set : List[FlyRequestInfo] = []
 
     def put(self, req: FlyRequestInfo):
         self._set.append(req)
@@ -209,12 +210,12 @@ def bench_text_gen(self, retired_rq, actualRun = True):
         self._prefill_workset.adjust_kv_cache()
         self._decode_workset.adjust_kv_cache()
 
-        input_ids : list[int] = []
-        input_indptr : list[int] = [0]
-        prev_len : list[int] = []
+        input_ids : List[int] = []
+        input_indptr : List[int] = [0]
+        prev_len : List[int] = []
         decodePrefillBorder = self._decode_workset.effective_bsz
-        decode_kvs : list[DistKVCache] = []
-        prefill_kvs : list[DistKVCache] = []
+        decode_kvs : List[DistKVCache] = []
+        prefill_kvs : List[DistKVCache] = []
 
         with record_function("calc batch size"):
             t3 = time.perf_counter()
@@ -409,7 +410,7 @@ def print_and_to_file(string, f):
 scheduler = Scheduler(pool, request_manager.avaliable_request_queue)
 scheduler.init_pipe(args.config_path)
 
-retired_rq : list[FlyRequestInfo] = []
+retired_rq : List[FlyRequestInfo] = []
 totalCycle = 0
 
 skip_cycle = args.skip_cycles
diff --git a/pipeline/utils/serve.py b/pipeline/utils/serve.py
index adf4a88..c1a6b81 100644
--- a/pipeline/utils/serve.py
+++ b/pipeline/utils/serve.py
@@ -13,6 +13,7 @@
 import pickle
 
 from collections import deque
+from typing import List
 
 from kv_cache import DistKVPool, DistKVCache, BatchedDistKVCache
 from request_info import NewRequestInfo, NewRequestQueue, FlyRequestInfo
@@ -41,7 +42,7 @@ class WorkingSet:
    """
 
     def __init__(self) -> None:
-        self._set : list[FlyRequestInfo] = []
+        self._set : List[FlyRequestInfo] = []
 
     def put(self, req: FlyRequestInfo):
         self._set.append(req)
@@ -70,7 +71,7 @@ def adjust_kv_cache(self) -> None:
             req.kv_cache.allocate_tokens(len(req.input))
 
 class Scheduler:
-    def __init__(self, memory_pool: DistKVPool, request_queue: NewRequestQueue, weight: list[pllm_python.VortexModelWeight], decode_length, prefill_length):
+    def __init__(self, memory_pool: DistKVPool, request_queue: NewRequestQueue, weight: List[pllm_python.VortexModelWeight], decode_length, prefill_length):
 
         self._memory_pool = memory_pool
         self._request_queue = request_queue
@@ -248,12 +249,12 @@ def bench_text_gen(self, retired_rq, actualRun = True):
         self._prefill_workset.adjust_kv_cache()
         self._decode_workset.adjust_kv_cache()
 
-        input_ids : list[int] = []
-        input_indptr : list[int] = [0]
-        prev_len : list[int] = []
+        input_ids : List[int] = []
+        input_indptr : List[int] = [0]
+        prev_len : List[int] = []
         decodePrefillBorder = self._decode_workset.effective_bsz
-        decode_kvs : list[DistKVCache] = []
-        prefill_kvs : list[DistKVCache] = []
+        decode_kvs : List[DistKVCache] = []
+        prefill_kvs : List[DistKVCache] = []
 
         with record_function("calc batch size"):
             t3 = time.perf_counter()
@@ -475,7 +476,7 @@ def bench_text_gen(self, retired_rq, actualRun = True):
 scheduler = Scheduler(pool, request_manager.avaliable_request_queue, model_weights, request_manager.average_decode_length, request_manager.average_prefill_length)
 scheduler.init_pipe(args.config_path)
 
-retired_rq : list[FlyRequestInfo] = []
+retired_rq : List[FlyRequestInfo] = []
 totalCycle = 0
 
 t1 = time.perf_counter()
diff --git a/pipeline/utils/weightLoader.py b/pipeline/utils/weightLoader.py
index 92e3fb0..e035556 100644
--- a/pipeline/utils/weightLoader.py
+++ b/pipeline/utils/weightLoader.py
@@ -7,6 +7,7 @@
 import tqdm
 from transformers import LlamaTokenizer
 import concurrent.futures
+from typing import List
 
 def to_vortex_weight(tensor):
     w = pllm_python.VortexWeight()
diff --git a/setup.sh b/setup.sh
index 3db180a..5e7c15f 100755
--- a/setup.sh
+++ b/setup.sh
@@ -41,16 +41,8 @@ fi
 
 cd Nanoflow
 
-# build mscclpp
-cd 3rdparty/mscclpp
-git reset --hard cdaf3aea3d767ba65dd3b08984d76bd50615f92e
-sed -i '256d' ./src/bootstrap/bootstrap.cc
-mkdir -p build
-cd build
-cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr/local/mscclpp -DBUILD_PYTHON_BINDINGS=OFF ..
-make -j mscclpp mscclpp_static
-make install/fast
-cd ../../../
+# fix mscclpp
+sed -i '256d' 3rdparty/mscclpp/src/bootstrap/bootstrap.cc
 
 # fix spdlog v1.14.0 + cuda 12.1 compatibility bug
 for repo in spdlog; do