More parameters for OpenAI API #8

Open · wants to merge 21 commits into base: main

36 changes: 36 additions & 0 deletions .github/workflows/docker-build-push.yml
@@ -0,0 +1,36 @@
+name: Docker Build and Push
+
+on:
+  push:
+    branches: [ main ]  # Adjust this if your main branch has a different name
+
+jobs:
+  build-and-push:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v2
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v1
+
+      - name: Login to DockerHub
+        uses: docker/login-action@v1
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Build and push
+        uses: docker/build-push-action@v2
+        with:
+          context: .
+          push: true
+          tags: ${{ secrets.DOCKERHUB_USERNAME }}/worker-sglang:latest
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+
+      - name: Cleanup
+        if: always()
+        run: |
+          docker system prune -af
+          df -h
54 changes: 45 additions & 9 deletions Dockerfile
@@ -1,19 +1,55 @@
-FROM nvidia/cuda:12.1.0-base-ubuntu22.04
+ARG CUDA_VERSION=12.4.1
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
 ENV DEBIAN_FRONTEND=noninteractive
 
-RUN apt-get update -y \
-    && apt-get install -y python3-pip
+RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
+    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
+    && apt update -y \
+    && apt install software-properties-common -y \
+    && add-apt-repository ppa:deadsnakes/ppa -y && apt update \
+    && apt install python3.10 python3.10-dev -y \
+    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 2 \
+    && update-alternatives --set python3 /usr/bin/python3.10 && apt install python3.10-distutils -y \
+    && apt install curl git sudo -y \
+    && curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3 get-pip.py \
+    && python3 --version \
+    && python3 -m pip --version \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt clean
+
+RUN ldconfig /usr/local/cuda-12.1/compat/
+
+WORKDIR /sgl-workspace
+
+RUN python3 -m pip install --upgrade pip setuptools wheel html5lib six \
+    && git clone --depth=1 https://github.com/sgl-project/sglang.git \
+    && cd sglang \
+    && if [ "$BUILD_TYPE" = "srt" ]; then \
+        python3 -m pip --no-cache-dir install -e "python[srt]"; \
+    else \
+        python3 -m pip --no-cache-dir install -e "python[all]"; \
+    fi
+
+ARG CUDA_VERSION
+RUN if [ "$CUDA_VERSION" = "12.1.1" ]; then \
+        export CUDA_IDENTIFIER=cu121 && \
+        python3 -m pip --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/; \
+    elif [ "$CUDA_VERSION" = "12.4.1" ]; then \
+        export CUDA_IDENTIFIER=cu124 && \
+        python3 -m pip --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu124/torch2.4/; \
+    elif [ "$CUDA_VERSION" = "11.8.0" ]; then \
+        export CUDA_IDENTIFIER=cu118 && \
+        python3 -m pip install torch==2.4.0 --index-url https://download.pytorch.org/whl/cu118 && \
+        python3 -m pip --no-cache-dir install flashinfer -i https://flashinfer.ai/whl/cu118/torch2.4/; \
+    else \
+        echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1; \
+    fi
 
 # Install Python dependencies
 COPY builder/requirements.txt /requirements.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install --upgrade pip && \
-    python3 -m pip install --upgrade -r /requirements.txt
+RUN python3 -m pip install --upgrade -r /requirements.txt
 
-# Install vLLM (switching back to pip installs since issues that required building fork are fixed and space optimization is not as important since caching) and FlashInfer
-RUN python3 -m pip install "sglang[all]" && \
-    python3 -m pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3
+RUN python3 -m pip cache purge
 
 # Setup for Option 2: Building the Image with the Model included
 ARG MODEL_NAME=""
6 changes: 3 additions & 3 deletions README.md
@@ -1,15 +1,15 @@
 <div align="center">
 
-<h1> SgLang Worker</h1>
+<h1> SgLang Worker</h1>
 
 🚀 | SGLang is yet another fast serving framework for large language models and vision language models.
 </div>
 
 ## 📖 | Getting Started
 
 1. Clone this repository.
-2. Build a docker image - ```docker build -t <your_username>:worker-sglang:v1 .```
-3. ```docker push <your_username>:worker-sglang:v1```
+2. Build a docker image - ```docker build -t <your_username>/worker-sglang:v1 .```
+3. ```docker push <your_username>/worker-sglang:v1```
 
 
 ***Once you have built the Docker image and deployed the endpoint, you can use the code below to interact with the endpoint***:
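
The interaction snippet the README points to is truncated in this view. As a stand-in, here is a minimal sketch using the RunPod Python SDK; the endpoint ID, API key, and input schema (`openai_route` / `openai_input`) are illustrative assumptions, not this repository's documented interface:

```python
# Illustrative sketch only: endpoint ID, API key, and input schema are
# assumptions, not taken from this repository.
import runpod

runpod.api_key = "YOUR_RUNPOD_API_KEY"
endpoint = runpod.Endpoint("YOUR_ENDPOINT_ID")

# Synchronous round trip; the handler is assumed to accept an
# OpenAI-style chat payload under "input".
result = endpoint.run_sync(
    {
        "input": {
            "openai_route": "/v1/chat/completions",
            "openai_input": {
                "messages": [{"role": "user", "content": "Hello!"}],
                "max_tokens": 100,
            },
        }
    },
    timeout=60,
)
print(result)
```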
4 changes: 2 additions & 2 deletions builder/requirements.txt
@@ -1,10 +1,10 @@
 ray
 pandas
 pyarrow
-runpod==1.6.2
+runpod==1.7.0
 huggingface-hub
 packaging
-typing-extensions==4.7.1
+typing-extensions==4.11.0
 pydantic
 pydantic-settings
 hf-transfer
22 changes: 17 additions & 5 deletions src/engine.py
@@ -74,6 +74,8 @@ def start_server(self):
             if os.getenv(flag, '').lower() in ('true', '1', 'yes'):
                 command.append(f"--{flag.lower().replace('_', '-')}")
 
+        print("LAUNCH SERVER COMMAND:")
+        print(command)
         self.process = subprocess.Popen(command, stdout=None, stderr=None)
         print(f"Server started with PID: {self.process.pid}")
 
@@ -93,14 +95,14 @@ def wait_for_server(self, timeout=300, interval=5):
     def shutdown(self):
         if self.process:
             self.process.terminate()
-        self.process.wait()
+            self.process.wait()
         print("Server shut down.")
 
 class OpenAIRequest:
     def __init__(self, base_url="http://0.0.0.0:30000/v1", api_key="EMPTY"):
         self.client = openai.Client(base_url=base_url, api_key=api_key)
 
-    async def request_chat_completions(self, model="default", messages=None, max_tokens=100, stream=False):
+    async def request_chat_completions(self, model="default", messages=None, max_tokens=100, stream=False, frequency_penalty=0.0, n=1, stop=None, temperature=1.0, top_p=1.0):
         if messages is None:
             messages = [
                 {"role": "system", "content": "You are a helpful AI assistant"},
@@ -111,7 +113,12 @@ async def request_chat_completions(self, model="default", messages=None, max_tok
             model=model,
             messages=messages,
             max_tokens=max_tokens,
-            stream=stream
+            stream=stream,
+            frequency_penalty=frequency_penalty,
+            n=n,
+            stop=stop,
+            temperature=temperature,
+            top_p=top_p
         )
 
         if stream:
@@ -120,12 +127,17 @@ async def request_chat_completions(self, model="default", messages=None, max_tok
         else:
             yield response.to_dict()
 
-    async def request_completions(self, model="default", prompt="The capital of France is", max_tokens=100, stream=False):
+    async def request_completions(self, model="default", prompt="The capital of France is", max_tokens=100, stream=False, frequency_penalty=0.0, n=1, stop=None, temperature=1.0, top_p=1.0):
         response = self.client.completions.create(
             model=model,
             prompt=prompt,
             max_tokens=max_tokens,
-            stream=stream
+            stream=stream,
+            frequency_penalty=frequency_penalty,
+            n=n,
+            stop=stop,
+            temperature=temperature,
+            top_p=top_p
         )
 
         if stream:
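
To illustrate what this change enables, here is a minimal usage sketch of the extended methods. It assumes the SGLang server is already listening on http://0.0.0.0:30000/v1 and that `src/engine.py` is importable as `engine`; the parameter values are arbitrary:

```python
# Minimal sketch for the extended OpenAIRequest methods.
# Assumptions: the server is already up, and src/engine.py imports as `engine`.
import asyncio

from engine import OpenAIRequest

async def main():
    client = OpenAIRequest()
    # The five new sampling parameters are forwarded unchanged to the
    # OpenAI-compatible /v1/chat/completions endpoint.
    async for response in client.request_chat_completions(
        messages=[{"role": "user", "content": "Name three French cities."}],
        max_tokens=64,
        stream=False,
        frequency_penalty=0.5,
        n=1,
        stop=["\n\n"],
        temperature=0.7,
        top_p=0.9,
    ):
        print(response)

asyncio.run(main())
```

Since both methods are async generators, the streaming and non-streaming cases are consumed the same way with `async for`; with `stream=False` the generator yields a single response dict.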
47 changes: 0 additions & 47 deletions src/handler.py

This file was deleted.
