Commit

updated MLC containers
dusty-nv committed Apr 17, 2024
1 parent ecfc334 commit 83abd0d
Showing 3 changed files with 81 additions and 32 deletions.
15 changes: 11 additions & 4 deletions packages/llm/mlc/benchmark.py
@@ -6,10 +6,17 @@
 import argparse
 import resource

-from mlc_chat import ChatModule, ChatConfig
-from mlc_chat.callback import StreamToStdout
+
+try:
+    from mlc_chat import ChatModule, ChatConfig
+    from mlc_chat.callback import StreamToStdout
+except Exception as error:
+    print(f"failed to import ChatModule from mlc_chat ({error})")
+    print(f"trying to import ChatModule from mlc_llm instead...")
+
+    from mlc_llm import ChatModule, ChatConfig
+    from mlc_llm.callback import StreamToStdout


 parser = argparse.ArgumentParser()

 parser.add_argument('--model', type=str, default="Llama-2-7b-chat-hf-q4f16_1")
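For orientation (not part of the diff): benchmark.py is driven from test.sh later in this commit, so a standalone run against an already-quantized model directory would look roughly like the lines below. The model path is illustrative, following the MODEL_ROOT/MODEL_NAME-QUANTIZATION layout that test.sh uses; the flags are the ones test.sh passes.

    # illustrative invocation; adjust the --model path to your quantized artifacts
    python3 benchmark.py \
        --model /data/models/mlc/dist/Llama-2-7b-hf-q4f16_ft/params \
        --max-new-tokens 128 \
        --max-num-prompts 4 \
        --prompt /data/prompts/completion.json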
13 changes: 13 additions & 0 deletions packages/llm/mlc/patches/3403a4e.diff
@@ -321,6 +321,19 @@ index 4c36cab9..6afe6d0c 100644
MLC_LIBRARY_PATH = os.environ.get("MLC_LIBRARY_PATH", None)


diff --git a/python/mlc_llm/quantization/ft_quantization.py b/python/mlc_llm/quantization/ft_quantization.py
index 4a158460..7cd8fc6a 100644
--- a/python/mlc_llm/quantization/ft_quantization.py
+++ b/python/mlc_llm/quantization/ft_quantization.py
@@ -207,7 +207,7 @@ class FTQuantize: # pylint: disable=too-many-instance-attributes
relax.call_pure_packed(
"cutlass.ft_preprocess_weight",
lv1,
- detect_cuda_arch_list(target=target)[0],
+ int(detect_cuda_arch_list(target=target)[0]),
DataType(self.quantize_dtype).bits == 4,
sinfo_args=lv1.struct_info,
)
diff --git a/python/setup.py b/python/setup.py
index 2f1b632b..6c7c2088 100644
--- a/python/setup.py
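Note (assumption, not shown in this commit): patches/3403a4e.diff is presumably named after the upstream mlc-llm commit it targets and gets applied to the MLC source tree during the container build, along these lines:

    # hypothetical application step; the actual Dockerfile/build logic is not part of this diff
    cd /opt/mlc-llm              # MLC source checkout inside the container (path assumed)
    git apply 3403a4e.diff       # patch copied in from packages/llm/mlc/patches/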
85 changes: 57 additions & 28 deletions packages/llm/mlc/test.sh
@@ -1,40 +1,69 @@
 #!/usr/bin/env bash
-set -e
+set -ex

+MODEL_NAME=${1:-"Llama-2-7b-hf"}
+MODEL_PATH=${2:-"https://nvidia.box.com/shared/static/i3jtp8jdmdlsq4qkjof8v4muth8ar7fo.gz"}
 MODEL_ROOT="/data/models/mlc/dist"
-MLC_USE_CACHE=${MLC_USE_CACHE:-1}

-test_model()
+QUANTIZATION=${QUANTIZATION:-"q4f16_ft"}
+QUANTIZATION_PATH="${MODEL_ROOT}/${MODEL_NAME}-${QUANTIZATION}"
+SKIP_QUANTIZATION=${SKIP_QUANTIZATION:-"no"}
+
+USE_CACHE=${USE_CACHE:-1}
+CONV_TEMPLATE=${CONV_TEMPLATE:-"llama-2"}
+MAX_CONTEXT_LEN=${MAX_CONTEXT_LEN:-4096}
+
+
+quantize() # mlc_llm >= 0.1.1
 {
-    local MODEL_NAME=$1
-    local MODEL_URL=$2
-    local MODEL_TAR="$MODEL_NAME.tar.gz"
-    local QUANTIZATION=${3:-q4f16_ft}
+    QUANTIZATION_LIB="$QUANTIZATION_PATH/$MODEL_NAME-$QUANTIZATION-cuda.so"

-    if [ ! -d "$MODEL_ROOT/models/$MODEL_NAME" ]; then
+    if [ $SKIP_QUANTIZATION != "yes" ]; then
+        mlc_llm convert_weight $MODEL_PATH --quantization $QUANTIZATION --output $QUANTIZATION_PATH
+        mlc_llm gen_config $MODEL_PATH --quantization $QUANTIZATION --conv-template $CONV_TEMPLATE --context-window-size $MAX_CONTEXT_LEN --max-batch-size 1 --output $QUANTIZATION_PATH
+        mlc_llm compile $QUANTIZATION_PATH --device cuda --opt O3 --output $QUANTIZATION_LIB
+    fi
+
+    QUANTIZATION_LIB="--model-lib-path $QUANTIZATION_LIB"
+}
+
+quantize_legacy() # mlc_llm == 0.1.0
+{
+    if [ $SKIP_QUANTIZATION != "yes" ]; then
+        python3 -m mlc_llm.build \
+            --model $MODEL_NAME \
+            --target cuda \
+            --use-cuda-graph \
+            --use-flash-attn-mqa \
+            --use-cache $USE_CACHE \
+            --quantization $QUANTIZATION \
+            --artifact-path $MODEL_ROOT \
+            --max-seq-len 4096
+
+        if [ $? != 0 ]; then
+            return 1
+        fi
+    fi
+
+    QUANTIZATION_PATH="$QUANTIZATION_PATH/params"
+}
+
+if [[ $MODEL_PATH == http* ]]; then
+    MODEL_EXTRACTED="$MODEL_ROOT/models/$MODEL_NAME"
+    if [ ! -d "$MODEL_EXTRACTED" ]; then
         cd $MODEL_ROOT/models
-        wget --quiet --show-progress --progress=bar:force:noscroll --no-check-certificate $MODEL_URL -O $MODEL_TAR
+        MODEL_TAR="$MODEL_NAME.tar.gz"
+        wget --quiet --show-progress --progress=bar:force:noscroll --no-check-certificate $MODEL_PATH -O $MODEL_TAR
         tar -xzvf $MODEL_TAR
         #rm $MODEL_TAR
     fi
+    MODEL_PATH=$MODEL_EXTRACTED
+fi

-    cd $MODEL_ROOT
-
-    python3 -m mlc_llm.build \
-        --model $MODEL_NAME \
-        --target cuda \
-        --use-cuda-graph \
-        --use-flash-attn-mqa \
-        --use-cache $MLC_USE_CACHE \
-        --quantization $QUANTIZATION \
-        --artifact-path $MODEL_ROOT \
-        --max-seq-len 4096
-
-    python3 /opt/mlc-llm/benchmark.py \
-        --model ${MODEL_ROOT}/${MODEL_NAME}-${QUANTIZATION}/params \
-        --max-new-tokens 128 \
-        --max-num-prompts 4 \
-        --prompt /data/prompts/completion.json
-}
+quantize_legacy || quantize

-test_model "Llama-2-7b-hf" "https://nvidia.box.com/shared/static/i3jtp8jdmdlsq4qkjof8v4muth8ar7fo.gz"
+python3 benchmark.py \
+    --model $QUANTIZATION_PATH $QUANTIZATION_LIB \
+    --max-new-tokens 128 \
+    --max-num-prompts 4 \
+    --prompt /data/prompts/completion.json
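Usage sketch (illustrative, not part of the commit): the reworked test.sh takes the model name and a local path or download URL as positional arguments, with quantization behavior controlled by environment variables, so a run inside the container might look like:

    # values shown are the script defaults; SKIP_QUANTIZATION=yes would reuse existing artifacts
    QUANTIZATION=q4f16_ft MAX_CONTEXT_LEN=4096 \
        bash test.sh Llama-2-7b-hf https://nvidia.box.com/shared/static/i3jtp8jdmdlsq4qkjof8v4muth8ar7fo.gz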
