From 83abd0dc663a31964fb8a4a82d5b7f6c058b504b Mon Sep 17 00:00:00 2001
From: Dustin Franklin
Date: Wed, 17 Apr 2024 10:14:26 -0400
Subject: [PATCH] updated MLC containers

---
 packages/llm/mlc/benchmark.py         | 15 +++--
 packages/llm/mlc/patches/3403a4e.diff | 13 ++++
 packages/llm/mlc/test.sh              | 85 ++++++++++++++++++---------
 3 files changed, 81 insertions(+), 32 deletions(-)

diff --git a/packages/llm/mlc/benchmark.py b/packages/llm/mlc/benchmark.py
index 32a181440..3aba9ef2b 100644
--- a/packages/llm/mlc/benchmark.py
+++ b/packages/llm/mlc/benchmark.py
@@ -6,10 +6,17 @@
 import argparse
 import resource
 
-from mlc_chat import ChatModule, ChatConfig
-from mlc_chat.callback import StreamToStdout
-
-
+try:
+    from mlc_chat import ChatModule, ChatConfig
+    from mlc_chat.callback import StreamToStdout
+except Exception as error:
+    print(f"failed to import ChatModule from mlc_chat ({error})")
+    print(f"trying to import ChatModule from mlc_llm instead...")
+
+    from mlc_llm import ChatModule, ChatConfig
+    from mlc_llm.callback import StreamToStdout
+
+
 parser = argparse.ArgumentParser()
 
 parser.add_argument('--model', type=str, default="Llama-2-7b-chat-hf-q4f16_1")
diff --git a/packages/llm/mlc/patches/3403a4e.diff b/packages/llm/mlc/patches/3403a4e.diff
index 04bb894f1..eb74fd8f2 100644
--- a/packages/llm/mlc/patches/3403a4e.diff
+++ b/packages/llm/mlc/patches/3403a4e.diff
@@ -321,6 +321,19 @@ index 4c36cab9..6afe6d0c 100644
 
 MLC_LIBRARY_PATH = os.environ.get("MLC_LIBRARY_PATH", None)
 
+diff --git a/python/mlc_llm/quantization/ft_quantization.py b/python/mlc_llm/quantization/ft_quantization.py
+index 4a158460..7cd8fc6a 100644
+--- a/python/mlc_llm/quantization/ft_quantization.py
++++ b/python/mlc_llm/quantization/ft_quantization.py
+@@ -207,7 +207,7 @@ class FTQuantize:  # pylint: disable=too-many-instance-attributes
+                 relax.call_pure_packed(
+                     "cutlass.ft_preprocess_weight",
+                     lv1,
+-                    detect_cuda_arch_list(target=target)[0],
++                    int(detect_cuda_arch_list(target=target)[0]),
+                     DataType(self.quantize_dtype).bits == 4,
+                     sinfo_args=lv1.struct_info,
+                 )
 diff --git a/python/setup.py b/python/setup.py
 index 2f1b632b..6c7c2088 100644
 --- a/python/setup.py
diff --git a/packages/llm/mlc/test.sh b/packages/llm/mlc/test.sh
index 6f3cda361..876838ebf 100644
--- a/packages/llm/mlc/test.sh
+++ b/packages/llm/mlc/test.sh
@@ -1,40 +1,69 @@
 #!/usr/bin/env bash
-set -e
+set -ex
 
+MODEL_NAME=${1:-"Llama-2-7b-hf"}
+MODEL_PATH=${2:-"https://nvidia.box.com/shared/static/i3jtp8jdmdlsq4qkjof8v4muth8ar7fo.gz"}
 MODEL_ROOT="/data/models/mlc/dist"
-MLC_USE_CACHE=${MLC_USE_CACHE:-1}
 
-test_model()
+QUANTIZATION=${QUANTIZATION:-"q4f16_ft"}
+QUANTIZATION_PATH="${MODEL_ROOT}/${MODEL_NAME}-${QUANTIZATION}"
+SKIP_QUANTIZATION=${SKIP_QUANTIZATION:-"no"}
+
+USE_CACHE=${USE_CACHE:-1}
+CONV_TEMPLATE=${CONV_TEMPLATE:-"llama-2"}
+MAX_CONTEXT_LEN=${MAX_CONTEXT_LEN:-4096}
+
+
+quantize()  # mlc_llm >= 0.1.1
 {
-	local MODEL_NAME=$1
-	local MODEL_URL=$2
-	local MODEL_TAR="$MODEL_NAME.tar.gz"
-	local QUANTIZATION=${3:-q4f16_ft}
+	QUANTIZATION_LIB="$QUANTIZATION_PATH/$MODEL_NAME-$QUANTIZATION-cuda.so"
 
-	if [ ! -d "$MODEL_ROOT/models/$MODEL_NAME" ]; then
+	if [ $SKIP_QUANTIZATION != "yes" ]; then
+		mlc_llm convert_weight $MODEL_PATH --quantization $QUANTIZATION --output $QUANTIZATION_PATH
+		mlc_llm gen_config $MODEL_PATH --quantization $QUANTIZATION --conv-template $CONV_TEMPLATE --context-window-size $MAX_CONTEXT_LEN --max-batch-size 1 --output $QUANTIZATION_PATH
+		mlc_llm compile $QUANTIZATION_PATH --device cuda --opt O3 --output $QUANTIZATION_LIB
+	fi
+
+	QUANTIZATION_LIB="--model-lib-path $QUANTIZATION_LIB"
+}
+
+quantize_legacy()  # mlc_llm == 0.1.0
+{
+	if [ $SKIP_QUANTIZATION != "yes" ]; then
+		python3 -m mlc_llm.build \
+			--model $MODEL_NAME \
+			--target cuda \
+			--use-cuda-graph \
+			--use-flash-attn-mqa \
+			--use-cache $USE_CACHE \
+			--quantization $QUANTIZATION \
+			--artifact-path $MODEL_ROOT \
+			--max-seq-len 4096
+
+		if [ $? != 0 ]; then
+			return 1
+		fi
+	fi
+
+	QUANTIZATION_PATH="$QUANTIZATION_PATH/params"
+}
+
+if [[ $MODEL_PATH == http* ]]; then
+	MODEL_EXTRACTED="$MODEL_ROOT/models/$MODEL_NAME"
+	if [ ! -d "$MODEL_EXTRACTED" ]; then
 		cd $MODEL_ROOT/models
-		wget --quiet --show-progress --progress=bar:force:noscroll --no-check-certificate $MODEL_URL -O $MODEL_TAR
+		MODEL_TAR="$MODEL_NAME.tar.gz"
+		wget --quiet --show-progress --progress=bar:force:noscroll --no-check-certificate $MODEL_PATH -O $MODEL_TAR
 		tar -xzvf $MODEL_TAR
 		#rm $MODEL_TAR
 	fi
+	MODEL_PATH=$MODEL_EXTRACTED
+fi
 
-	cd $MODEL_ROOT
-
-	python3 -m mlc_llm.build \
-		--model $MODEL_NAME \
-		--target cuda \
-		--use-cuda-graph \
-		--use-flash-attn-mqa \
-		--use-cache $MLC_USE_CACHE \
-		--quantization $QUANTIZATION \
-		--artifact-path $MODEL_ROOT \
-		--max-seq-len 4096
-
-	python3 /opt/mlc-llm/benchmark.py \
-		--model ${MODEL_ROOT}/${MODEL_NAME}-${QUANTIZATION}/params \
-		--max-new-tokens 128 \
-		--max-num-prompts 4 \
-		--prompt /data/prompts/completion.json
-}
+quantize_legacy || quantize
 
-test_model "Llama-2-7b-hf" "https://nvidia.box.com/shared/static/i3jtp8jdmdlsq4qkjof8v4muth8ar7fo.gz"
+python3 benchmark.py \
+	--model $QUANTIZATION_PATH $QUANTIZATION_LIB \
+	--max-new-tokens 128 \
+	--max-num-prompts 4 \
+	--prompt /data/prompts/completion.json