diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
new file mode 100644
index 0000000..cab6b7d
--- /dev/null
+++ b/.github/workflows/build.yml
@@ -0,0 +1,150 @@
+name: Build
+
+permissions:
+  contents: write
+
+on:
+  push:
+    tags:
+      - v*
+  workflow_dispatch:
+
+jobs:
+  build_cu121:
+    name: Build Shared Library (CUDA 12.1)
+    runs-on: ubuntu-latest
+    container:
+      image: pytorch/manylinux2_28-builder:cuda12.1
+
+    # see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Set up Environments
+        run: |
+          echo "PYTHON_VERSION=$(python3 -c "from os import environ as env; print({'3.7': 'cp37-cp37m', '3.8': 'cp38-cp38', '3.9': 'cp39-cp39', '3.10': 'cp310-cp310', '3.11': 'cp311-cp311', '3.12': 'cp312-cp312' }['${{ matrix.python-version }}'])")" >> $GITHUB_ENV
+          echo "PATH=$PATH" >> $GITHUB_ENV
+
+      - name: Set up Packages
+        env:
+          PYBIN: /opt/python/${{ env.PYTHON_VERSION }}/bin
+        run: |
+          dnf -y install epel-release && dnf upgrade
+          dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo && dnf makecache
+          dnf module install -y nvidia-driver:latest && dnf clean all
+          yum install -y pigz clang-tools-extra openmpi openmpi-devel spdlog spdlog-devel libibverbs rdma-core-devel numactl numactl-devel numactl-libs
+          pwd && ls -la
+          sed -i '256d' 3rdparty/mscclpp/src/bootstrap/bootstrap.cc
+          cat 3rdparty/patches/spdlog/*.patch | patch -p1 -d 3rdparty/spdlog
+          $PYBIN/pip install cmake mypy pybind11 black
+          cd pipeline/src/generate-gemm && $PYBIN/python genGEMM.py && cd ../../
+
+      - name: Build Library
+        env:
+          PATH: ${{ env.PATH }}:/usr/lib64/openmpi/bin
+          PYBIN: /opt/python/${{ env.PYTHON_VERSION }}/bin
+          PREFIX: /opt/python/${{ env.PYTHON_VERSION }};/usr/lib64/openmpi
+          pybind11_DIR: /opt/python/${{ env.PYTHON_VERSION }}/lib/python${{ matrix.python-version }}/site-packages/pybind11
+          CUDACXX: "/usr/local/cuda-12.1/bin/nvcc"
+          CUDA_HOME: "/usr/local/cuda-12.1"
+          CFLAGS: "-I/usr/include"
+          LDFLAGS: "-L/usr/lib"
+        run: |
+          cd pipeline && mkdir -p build && cd build
+          which cmake && cmake --version
+          which mpicxx && mpicxx --version
+          ${PYBIN}/cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH=${PREFIX} -DBUILD_PYTHON_BINDINGS=OFF -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON ..
+          make -j$(nproc)
+          tar -zcvf pllm_cu121_py${{ matrix.python-version }}.tgz pllm_python.* test_*
+          ls -la
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: pllm_cu121_py${{ matrix.python-version }}.tgz
+          path: |
+            pipeline/build/pllm_cu121_py${{ matrix.python-version }}.tgz
+
+      - name: Upload Release Asset
+        uses: softprops/action-gh-release@v2
+        if: startsWith(github.ref, 'refs/tags/')
+        with:
+          files: |
+            pipeline/build/pllm_cu121_py${{ matrix.python-version }}.tgz
+
+  build_cu124:
+    name: Build Shared Library (CUDA 12.4)
+    runs-on: ubuntu-latest
+    container:
+      image: pytorch/manylinux2_28-builder:cuda12.4
+
+    # see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Set up Environments
+        run: |
+          echo "PYTHON_VERSION=$(python3 -c "from os import environ as env; print({'3.7': 'cp37-cp37m', '3.8': 'cp38-cp38', '3.9': 'cp39-cp39', '3.10': 'cp310-cp310', '3.11': 'cp311-cp311', '3.12': 'cp312-cp312' }['${{ matrix.python-version }}'])")" >> $GITHUB_ENV
+          echo "PATH=$PATH" >> $GITHUB_ENV
+
+      - name: Set up Packages
+        env:
+          PYBIN: /opt/python/${{ env.PYTHON_VERSION }}/bin
+        run: |
+          dnf -y install epel-release && dnf upgrade
+          dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo && dnf makecache
+          dnf module install -y nvidia-driver:latest && dnf clean all
+          yum install -y pigz clang-tools-extra openmpi openmpi-devel spdlog spdlog-devel libibverbs rdma-core-devel numactl numactl-devel numactl-libs
+          pwd && ls -la
+          sed -i '256d' 3rdparty/mscclpp/src/bootstrap/bootstrap.cc
+          cat 3rdparty/patches/spdlog/*.patch | patch -p1 -d 3rdparty/spdlog
+          $PYBIN/pip install cmake mypy pybind11 black
+          cd pipeline/src/generate-gemm && $PYBIN/python genGEMM.py && cd ../../
+
+      - name: Build Library
+        env:
+          PATH: ${{ env.PATH }}:/usr/lib64/openmpi/bin
+          PYBIN: /opt/python/${{ env.PYTHON_VERSION }}/bin
+          PREFIX: /opt/python/${{ env.PYTHON_VERSION }};/usr/lib64/openmpi
+          pybind11_DIR: /opt/python/${{ env.PYTHON_VERSION }}/lib/python${{ matrix.python-version }}/site-packages/pybind11
+          CUDACXX: "/usr/local/cuda-12.4/bin/nvcc"
+          CUDA_HOME: "/usr/local/cuda-12.4"
+          CFLAGS: "-I/usr/include"
+          LDFLAGS: "-L/usr/lib"
+        run: |
+          cd pipeline && mkdir -p build && cd build
+          which cmake && cmake --version
+          which mpicxx && mpicxx --version
+          ${PYBIN}/cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH=${PREFIX} -DBUILD_PYTHON_BINDINGS=OFF -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON ..
+          make -j$(nproc)
+          tar -zcvf pllm_cu124_py${{ matrix.python-version }}.tgz pllm_python.* test_*
+          ls -la
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: pllm_cu124_py${{ matrix.python-version }}.tgz
+          path: |
+            pipeline/build/pllm_cu124_py${{ matrix.python-version }}.tgz
+
+      - name: Upload Release Asset
+        uses: softprops/action-gh-release@v2
+        if: startsWith(github.ref, 'refs/tags/')
+        with:
+          files: |
+            pipeline/build/pllm_cu124_py${{ matrix.python-version }}.tgz
+
diff --git a/.gitmodules b/.gitmodules
index 142dd8b..b239e5a 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,7 +1,6 @@
 [submodule "3rdparty/flashinfer"]
 	path = 3rdparty/flashinfer
 	url = https://github.com/happierpig/flashinfer-ai.git
-
 [submodule "3rdparty/nvbench"]
 	path = 3rdparty/nvbench
 	url = https://github.com/NVIDIA/nvbench.git
diff --git a/3rdparty/mscclpp b/3rdparty/mscclpp
index 7ed13ec..cdaf3ae 160000
--- a/3rdparty/mscclpp
+++ b/3rdparty/mscclpp
@@ -1 +1 @@
-Subproject commit 7ed13ec4b57a0805aadd0a3d33613569b0c03d6d
+Subproject commit cdaf3aea3d767ba65dd3b08984d76bd50615f92e
diff --git a/3rdparty/spdlog b/3rdparty/spdlog
index 238c9ff..a3a0c9d 160000
--- a/3rdparty/spdlog
+++ b/3rdparty/spdlog
@@ -1 +1 @@
-Subproject commit 238c9ffa5d1a14226eeabe10c9b63ffff3ed8b8e
+Subproject commit a3a0c9d66386962fcaf0178fcae03ac77c1e0257
diff --git a/pipeline/CMakeLists.txt b/pipeline/CMakeLists.txt
index c7f26ef..c8a1608 100644
--- a/pipeline/CMakeLists.txt
+++ b/pipeline/CMakeLists.txt
@@ -15,7 +15,7 @@ set(NVBENCH_DIR ${PARENT_DIR}/3rdparty/nvbench CACHE PATH "Path to nvbench src")
 set(GTEST_DIR ${PARENT_DIR}/3rdparty/googletest CACHE PATH "Path to googletest src")
 set(CUTLASS_DIR ${PARENT_DIR}/3rdparty/cutlass CACHE PATH "Path to modified cutlass src")
 set(SPDLOG_DIR ${PARENT_DIR}/3rdparty/spdlog CACHE PATH "Path to spdlog src")
-set(MSCCLPP_DIR /usr/local/mscclpp CACHE PATH "Path to mscclpp install")
+set(MSCCLPP_DIR ${PARENT_DIR}/3rdparty/mscclpp CACHE PATH "Path to mscclpp install")
 set(FLASHINFER_DIR ${PARENT_DIR}/3rdparty/flashinfer CACHE PATH "Path to flashinfer src") # override by SMALL_FLASHINFER
 #set(FLASHINFER_DIR ${PARENT_DIR}/3rdparty/flashinfer CACHE PATH "Path to flashinfer src")
@@ -36,6 +36,7 @@ if (NOT MPI_FOUND)
 endif()
 
 add_subdirectory(${SPDLOG_DIR} spdlog)
+add_subdirectory(${MSCCLPP_DIR} mscclpp)
 
 find_package(pybind11 REQUIRED)
 if (NOT pybind11_FOUND)
@@ -45,7 +46,6 @@ endif()
 
 find_program(STUBGEN_EXECUTABLE NAMES stubgen)
 add_compile_options(-w)
-include_directories(${MSCCLPP_DIR}/include)
 
 include_directories(${CUTLASS_DIR}/include)
 include_directories(${CUTLASS_DIR}/tools/util/include)
@@ -53,13 +53,11 @@ include_directories(${CUTLASS_DIR}/tools/util/include)
 #include_directories(${FLASHINFER_DIR}/include)
 
 # ------------- Build Network Test -----------------#
-find_library(MSCCLPP_LIBRARY NAMES mscclpp PATHS ${MSCCLPP_DIR}/lib NO_DEFAULT_PATH)
-
 add_executable(test_comm "${CMAKE_SOURCE_DIR}/src/comm_test.cu" "${CMAKE_SOURCE_DIR}/src/comm.cu")
 target_include_directories(test_comm PRIVATE ${CMAKE_SOURCE_DIR}/include)
 target_include_directories(test_comm PRIVATE ${PARENT_DIR}/3rdparty/cutlass/include)
 target_link_libraries(test_comm PRIVATE MPI::MPI_CXX)
-target_link_libraries(test_comm PRIVATE ${MSCCLPP_LIBRARY})
+target_link_libraries(test_comm PRIVATE mscclpp_static)
 target_link_libraries(test_comm PRIVATE spdlog::spdlog)
 target_compile_definitions(test_comm PRIVATE -DENABLE_MPI)
@@ -193,7 +191,7 @@ target_include_directories(shared_lib PRIVATE ${PARENT_DIR}/3rdparty/cutlass/inc
 target_include_directories(shared_lib PRIVATE ${PARENT_DIR}/3rdparty/cutlass/tools/util/include)
 target_include_directories(shared_lib PRIVATE ${SMALL_FLASHINFER_DIR}/include ${FLASHINFER_DIR}/include)
 target_link_libraries(shared_lib PRIVATE decode_kernels prefill_kernels)
-target_link_libraries(shared_lib PRIVATE ${MSCCLPP_LIBRARY})
+target_link_libraries(shared_lib PRIVATE mscclpp_static)
 target_link_libraries(shared_lib PRIVATE MPI::MPI_CXX)
 target_link_libraries(shared_lib PRIVATE spdlog::spdlog)
 target_link_libraries(shared_lib PRIVATE gemm_lib)
@@ -210,7 +208,7 @@ target_include_directories(test_compute PRIVATE ${CMAKE_SOURCE_DIR}/include)
 target_include_directories(test_compute PRIVATE ${SMALL_FLASHINFER_DIR}/include ${FLASHINFER_DIR}/include)
 target_link_libraries(test_compute PRIVATE decode_kernels prefill_kernels)
 # Include Network Libraries
-target_link_libraries(test_compute PRIVATE ${MSCCLPP_LIBRARY})
+target_link_libraries(test_compute PRIVATE mscclpp_static)
 target_link_libraries(test_compute PRIVATE MPI::MPI_CXX)
 # Include Log Libraries
 target_link_libraries(test_compute PRIVATE spdlog::spdlog)
@@ -229,7 +227,7 @@ target_include_directories(pllm_python PRIVATE ${CMAKE_SOURCE_DIR}/include)
 target_include_directories(pllm_python PRIVATE ${SMALL_FLASHINFER_DIR}/include ${FLASHINFER_DIR}/include)
 target_link_libraries(pllm_python PRIVATE decode_kernels prefill_kernels)
 # Include Network Libraries
-target_link_libraries(pllm_python PRIVATE ${MSCCLPP_LIBRARY})
+target_link_libraries(pllm_python PRIVATE mscclpp_static)
 target_link_libraries(pllm_python PRIVATE MPI::MPI_CXX)
 # Include Log Libraries
 target_link_libraries(pllm_python PRIVATE spdlog::spdlog)
diff --git a/pipeline/utils/kv_cache.py b/pipeline/utils/kv_cache.py
index bcd1251..a5faa6d 100644
--- a/pipeline/utils/kv_cache.py
+++ b/pipeline/utils/kv_cache.py
@@ -14,7 +14,7 @@
 from pybindUtil import toGPU, toGPUTensor
 from torch.profiler import profile, record_function, ProfilerActivity
 import time
-
+from typing import List
 
 class DistKVPool:
@@ -73,7 +73,7 @@ def __init__(self, pool: DistKVPool):
         self._pool = pool
-        self._indices : list[int] = []
+        self._indices : List[int] = []
         self._seqlen : int = 0
 
     @property
@@ -81,7 +81,7 @@ def seqlen(self) -> int:
         return self._seqlen
 
     @property
-    def indicies(self) -> list[int]:
+    def indicies(self) -> List[int]:
         return self._indices
 
     @property
@@ -116,11 +116,11 @@ def __init__(self, decode_kvs: Sequence[DistKVCache], prefill_kvs: Sequence[Dist
         """
         # batch_size = len(decode_kvs) + len(prefill_kvs)
         # [batch_size + 1,]
-        self._kv_indptr : list[int] = [0]
+        self._kv_indptr : List[int] = [0]
         # [num_pages_in_total, ]
-        self._kv_indices : list[int] = []
+        self._kv_indices : List[int] = []
         # [batch_size, ]
-        self._kv_last_page_len : list[int] = []
+        self._kv_last_page_len : List[int] = []
 
         # Here we do not materialize data into specific devices,
         # for distributed assignment.
diff --git a/pipeline/utils/pybindUtil.py b/pipeline/utils/pybindUtil.py
index 4379580..581d8ac 100644
--- a/pipeline/utils/pybindUtil.py
+++ b/pipeline/utils/pybindUtil.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pllm_python
 from torch.profiler import profile, record_function, ProfilerActivity
+from typing import List
 
 
 def toGPU(data, nranks, dtype, non_blocking=True):
@@ -88,8 +89,8 @@ def initUpdateData(
     kv_last_page_len: int,
     rev_input_indptr: int,
     per_token_offset: int,
-    gemv_batch_size: list[int],
-    gemv_block_num: list[int] ) -> list[pllm_python.VortexUpdateData]:
+    gemv_batch_size: List[int],
+    gemv_block_num: List[int] ) -> List[pllm_python.VortexUpdateData]:
     updateDataList = []
     gemv_batch_size = np.array(gemv_batch_size, dtype=np.int32)
     gemv_block_num = np.array(gemv_block_num, dtype=np.int32)
@@ -118,4 +119,4 @@ def initUpdateData(
 
     print("Time taken: ", time.time() - t)
 
-
\ No newline at end of file
+
diff --git a/pipeline/utils/request_info.py b/pipeline/utils/request_info.py
index 4413d1b..c42dd39 100644
--- a/pipeline/utils/request_info.py
+++ b/pipeline/utils/request_info.py
@@ -1,11 +1,13 @@
 from collections import deque
+from typing import List
+
 
 class NewRequestInfo:
     """
    Request info for incoming request
    NOTE (Yilong): add support for offloading / onloading KV-Cache
    """
     req_idx: int
-    prompt: list[int]
+    prompt: List[int]
     output_len : int
     start_time: float
@@ -36,7 +38,7 @@ class FlyRequestInfo:
    NOTE (Yilong): add support for offloading / onloading KV-Cache
    """
 
-    def __init__(self, req_idx: int, input: list[int], output: list[int], prompt: list[int], request_comein_time: float,
+    def __init__(self, req_idx: int, input: List[int], output: List[int], prompt: List[int], request_comein_time: float,
                  chunked_prefill: bool, kv_cache, encode_latency: float, decode_start_at: float, decode_latency: float,
                  output_len: int, input_len: int):
         self.req_idx = req_idx
diff --git a/pipeline/utils/scheduler.py b/pipeline/utils/scheduler.py
index 44baf8a..291edc0 100644
--- a/pipeline/utils/scheduler.py
+++ b/pipeline/utils/scheduler.py
@@ -13,6 +13,7 @@
 import pickle
 
 from collections import deque
+from typing import List
 
 from kv_cache import DistKVPool, DistKVCache, BatchedDistKVCache
 from request_info import NewRequestInfo, NewRequestQueue, FlyRequestInfo
@@ -38,7 +39,7 @@ class WorkingSet:
    """
 
     def __init__(self) -> None:
-        self._set : list[FlyRequestInfo] = []
+        self._set : List[FlyRequestInfo] = []
 
     def put(self, req: FlyRequestInfo):
         self._set.append(req)
@@ -209,12 +210,12 @@ def bench_text_gen(self, retired_rq, actualRun = True):
         self._prefill_workset.adjust_kv_cache()
         self._decode_workset.adjust_kv_cache()
 
-        input_ids : list[int] = []
-        input_indptr : list[int] = [0]
-        prev_len : list[int] = []
+        input_ids : List[int] = []
+        input_indptr : List[int] = [0]
+        prev_len : List[int] = []
         decodePrefillBorder = self._decode_workset.effective_bsz
-        decode_kvs : list[DistKVCache] = []
-        prefill_kvs : list[DistKVCache] = []
+        decode_kvs : List[DistKVCache] = []
+        prefill_kvs : List[DistKVCache] = []
 
         with record_function("calc batch size"):
             t3 = time.perf_counter()
@@ -409,7 +410,7 @@ def print_and_to_file(string, f):
 scheduler = Scheduler(pool, request_manager.avaliable_request_queue)
 scheduler.init_pipe(args.config_path)
 
-retired_rq : list[FlyRequestInfo] = []
+retired_rq : List[FlyRequestInfo] = []
 totalCycle = 0
 
 skip_cycle = args.skip_cycles
diff --git a/pipeline/utils/serve.py b/pipeline/utils/serve.py
index adf4a88..c1a6b81 100644
--- a/pipeline/utils/serve.py
+++ b/pipeline/utils/serve.py
@@ -13,6 +13,7 @@
 import pickle
 
 from collections import deque
+from typing import List
 
 from kv_cache import DistKVPool, DistKVCache, BatchedDistKVCache
 from request_info import NewRequestInfo, NewRequestQueue, FlyRequestInfo
@@ -41,7 +42,7 @@ class WorkingSet:
    """
 
     def __init__(self) -> None:
-        self._set : list[FlyRequestInfo] = []
+        self._set : List[FlyRequestInfo] = []
 
     def put(self, req: FlyRequestInfo):
         self._set.append(req)
@@ -70,7 +71,7 @@ def adjust_kv_cache(self) -> None:
             req.kv_cache.allocate_tokens(len(req.input))
 
 class Scheduler:
-    def __init__(self, memory_pool: DistKVPool, request_queue: NewRequestQueue, weight: list[pllm_python.VortexModelWeight], decode_length, prefill_length):
+    def __init__(self, memory_pool: DistKVPool, request_queue: NewRequestQueue, weight: List[pllm_python.VortexModelWeight], decode_length, prefill_length):
 
         self._memory_pool = memory_pool
         self._request_queue = request_queue
@@ -248,12 +249,12 @@ def bench_text_gen(self, retired_rq, actualRun = True):
         self._prefill_workset.adjust_kv_cache()
         self._decode_workset.adjust_kv_cache()
 
-        input_ids : list[int] = []
-        input_indptr : list[int] = [0]
-        prev_len : list[int] = []
+        input_ids : List[int] = []
+        input_indptr : List[int] = [0]
+        prev_len : List[int] = []
         decodePrefillBorder = self._decode_workset.effective_bsz
-        decode_kvs : list[DistKVCache] = []
-        prefill_kvs : list[DistKVCache] = []
+        decode_kvs : List[DistKVCache] = []
+        prefill_kvs : List[DistKVCache] = []
 
         with record_function("calc batch size"):
             t3 = time.perf_counter()
@@ -475,7 +476,7 @@ def bench_text_gen(self, retired_rq, actualRun = True):
 scheduler = Scheduler(pool, request_manager.avaliable_request_queue, model_weights, request_manager.average_decode_length, request_manager.average_prefill_length)
 scheduler.init_pipe(args.config_path)
 
-retired_rq : list[FlyRequestInfo] = []
+retired_rq : List[FlyRequestInfo] = []
 totalCycle = 0
 
 t1 = time.perf_counter()
diff --git a/pipeline/utils/weightLoader.py b/pipeline/utils/weightLoader.py
index 92e3fb0..e035556 100644
--- a/pipeline/utils/weightLoader.py
+++ b/pipeline/utils/weightLoader.py
@@ -7,6 +7,7 @@
 import tqdm
 from transformers import LlamaTokenizer
 import concurrent.futures
+from typing import List
 
 def to_vortex_weight(tensor):
     w = pllm_python.VortexWeight()
diff --git a/setup.sh b/setup.sh
index 3db180a..5e7c15f 100755
--- a/setup.sh
+++ b/setup.sh
@@ -41,16 +41,8 @@ fi
 
 cd Nanoflow
 
-# build mscclpp
-cd 3rdparty/mscclpp
-git reset --hard cdaf3aea3d767ba65dd3b08984d76bd50615f92e
-sed -i '256d' ./src/bootstrap/bootstrap.cc
-mkdir -p build
-cd build
-cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr/local/mscclpp -DBUILD_PYTHON_BINDINGS=OFF ..
-make -j mscclpp mscclpp_static
-make install/fast
-cd ../../../
+# fix mscclpp
+sed -i '256d' 3rdparty/mscclpp/src/bootstrap/bootstrap.cc
 
 # fix spdlog v1.14.0 + cuda 12.1 compatibility bug
 for repo in spdlog; do