diff --git a/.github/workflows/docker/ut.dockerfile b/.github/workflows/docker/common.dockerfile similarity index 100% rename from .github/workflows/docker/ut.dockerfile rename to .github/workflows/docker/common.dockerfile diff --git a/.github/workflows/model_test.yml b/.github/workflows/model_test.yml index 7a481c8b..4f18c630 100644 --- a/.github/workflows/model_test.yml +++ b/.github/workflows/model_test.yml @@ -25,9 +25,8 @@ permissions: write-all env: OUT_SCRIPT_PATH: ${{ github.workspace }}/.github/workflows/scripts/models SCRIPT_PATH: /GenAIEval/.github/workflows/scripts - REPO_NAME: "GenAIEval" + DOCKER_NAME: "genaieval" DOCKER_TAG: "latest" - DOCKER_FILE_NAME: "model.dockerfile" CONTAINER_NAME: "modelTest" @@ -37,9 +36,10 @@ jobs: strategy: matrix: include: - - modelName: "EleutherAI/gpt-j-6B" - task: "hellaswag" + - modelName: "facebook/opt-125m" + datasets: "piqa" device: "cpu" + tasks: "text-generation" fail-fast: true steps: @@ -54,7 +54,7 @@ jobs: # We need this because GitHub needs to clone the branch to pipeline - name: Docker Build run: | - docker build -f ${{ github.workspace }}/.github/workflows/docker/${{ env.DOCKER_FILE_NAME }} -t ${{ env.REPO_NAME }}:${{ env.DOCKER_TAG }} . + docker build -f ${{ github.workspace }}/.github/workflows/docker/common.dockerfile -t ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }} . 
- name: Docker Run run: | @@ -64,7 +64,7 @@ jobs: fi docker run -dit --disable-content-trust --privileged --name=${{ env.CONTAINER_NAME }} -v /dev/shm:/dev/shm \ -v ${{ github.workspace }}:/GenAIEval \ - ${{ env.REPO_NAME }}:${{ env.DOCKER_TAG }} + ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }} - name: Binary build run: | @@ -93,7 +93,7 @@ jobs: run: | docker exec ${{ env.CONTAINER_NAME }} \ bash -c "cd /GenAIEval/.github/workflows/scripts/models \ - && bash model_test.sh --model=${{ matrix.modelName }} --device=${{ matrix.device }} --tasks=${{ matrix.task }}" + && bash model_test.sh --model=${{ matrix.modelName }} --device=${{ matrix.device }} --datasets=${{ matrix.datasets }} --tasks=${{ matrix.tasks }}" - name: Collect Log run: | @@ -101,7 +101,8 @@ jobs: bash -c "cd /GenAIEval/.github/workflows/scripts/models \ && bash -x collect_log.sh --model=${{ matrix.modelName }} \ --device=${{ matrix.device }} \ - --task=${{ matrix.task }} + --datasets=${{ matrix.datasets }} \ + --tasks=${{ matrix.tasks }}" - name: Publish pipeline artifact uses: actions/upload-artifact@v4 diff --git a/.github/workflows/scripts/codeScan/trellix.sh b/.github/workflows/scripts/codeScan/trellix.sh new file mode 100644 index 00000000..8cd13e09 --- /dev/null +++ b/.github/workflows/scripts/codeScan/trellix.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +source ${workspace}/.github/workflows/scripts/change_color +log_dir=${workspace}/.github/workflows/scripts/codeScan + + +echo "---Updating definition (DAT) files ---" +DEFS_URL=https://update.nai.com/products/commonupdater/current/vscandat1000/dat/0000 +echo "Finding latest defs at $DEFS_URL/avvdat.ini..." \ + && wget -q $DEFS_URL/avvdat.ini \ + && echo "SUCCESS" || fail + +inifile="avvdat.ini" +filename=`awk -F"=" '$2 ~ /avvdat.*zip/ { print $2 } ' $inifile` +filename2="$(echo -e "${filename}" | tr -d '[:space:]')" + +if [ -z "$filename2" ] +then + echo "Cannot get defs information from INI file:" + cat $inifile + fail +fi + +echo "Downloading latest defs from $DEFS_URL/$filename2..." \ + && wget -q $DEFS_URL/$filename2 \ + && echo "SUCCESS" || fail + +echo "Extracting latest defs..." \ + && unzip -o $filename2 -d /usr/local/uvscan \ + && echo "SUCCESS" || fail + +echo "--- Scanning ---" +ENV_SCAN_OPTS="--analyze --mime --program --recursive --unzip --threads 4 --summary --verbose --html=${workspace}/.github/workflows/scripts/codeScan/report.html" +echo "Scan Options: $ENV_SCAN_OPTS" + +rm -r "${workspace:?}"/avvdat* +rm -r "${workspace:?}/.git" +uvscan $ENV_SCAN_OPTS ${workspace} 2>&1 | tee ${log_dir}/trellix.log + + +if [[ $(grep "Possibly Infected" ${log_dir}/trellix.log | sed 's/[^0-9]//g') != 0 ]]; then + $BOLD_RED && echo "Error!! Please Click on the artifact button to download and check error details." && $RESET + exit 1 +fi + +$BOLD_PURPLE && echo "Congratulations, Trellix Scan passed!" && $LIGHT_PURPLE && echo " You can click on the artifact button to see the log details." && $RESET +exit 0 diff --git a/.github/workflows/scripts/models/collect_log.sh b/.github/workflows/scripts/models/collect_log.sh index f6a68330..a197393a 100644 --- a/.github/workflows/scripts/models/collect_log.sh +++ b/.github/workflows/scripts/models/collect_log.sh @@ -14,32 +14,34 @@ # limitations under the License. 
set -eo pipefail -source /GenAIEval/.github/workflows/script/change_color.sh +source /GenAIEval/.github/workflows/scripts/change_color WORKSPACE="/GenAIEval" # get parameters PATTERN='[-a-zA-Z0-9_]*=' PERF_STABLE_CHECK=true for i in "$@"; do case $i in + --datasets=*) + datasets=`echo $i | sed "s/${PATTERN}//"`;; --device=*) device=`echo $i | sed "s/${PATTERN}//"`;; --model=*) model=`echo $i | sed "s/${PATTERN}//"`;; - --task=*) - task=`echo $i | sed "s/${PATTERN}//"`;; + --tasks=*) + tasks=`echo $i | sed "s/${PATTERN}//"`;; *) echo "Parameter $i not recognized."; exit 1;; esac done -output_file="/GenAIEval/${device}/${model}/${device}-${model}-${task}.log" +log_file="/GenAIEval/${device}/${model}/${device}-${model}-${tasks}-${datasets}.log" $BOLD_YELLOW && echo "-------- Collect logs --------" && $RESET echo "working in" pwd -if [[ ! -f ${output_file} ]]; then - echo "${device};${model};${task};;${logfile}" >> ${WORKSPACE}/summary.log +if [[ ! -f ${log_file} ]]; then + echo "${device};${model};${tasks};${datasets};;${log_file}" >> ${WORKSPACE}/summary.log else - acc=$(grep -Po "Accuracy .* is:\\s+(\\d+(\\.\\d+)?)" ${acc_log_name} | head -n 1 | sed 's/.*://;s/[^0-9.]//g') - echo "${device};${model};${task};${acc};${logfile}" >> ${WORKSPACE}/summary.log + acc=$(grep -Po "Accuracy .* is:\\s+(\\d+(\\.\\d+)?)" ${log_file} | head -n 1 | sed 's/.*://;s/[^0-9.]//g') + echo "${device};${model};${tasks};${datasets};${acc};${log_file}" >> ${WORKSPACE}/summary.log fi diff --git a/.github/workflows/scripts/models/model_test.sh b/.github/workflows/scripts/models/model_test.sh index fae21add..fb420086 100644 --- a/.github/workflows/scripts/models/model_test.sh +++ b/.github/workflows/scripts/models/model_test.sh @@ -14,19 +14,21 @@ # limitations under the License. 
set -eo pipefail -source /GenAIEval/.github/workflows/script/change_color.sh +source /GenAIEval/.github/workflows/scripts/change_color # get parameters PATTERN='[-a-zA-Z0-9_]*=' PERF_STABLE_CHECK=true for i in "$@"; do case $i in + --datasets=*) + datasets=`echo $i | sed "s/${PATTERN}//"`;; --device=*) device=`echo $i | sed "s/${PATTERN}//"`;; --model=*) model=`echo $i | sed "s/${PATTERN}//"`;; - --task=*) - task=`echo $i | sed "s/${PATTERN}//"`;; + --tasks=*) + tasks=`echo $i | sed "s/${PATTERN}//"`;; *) echo "Parameter $i not recognized."; exit 1;; esac @@ -34,17 +36,23 @@ done log_dir="/GenAIEval/${device}/${model}" mkdir -p ${log_dir} - +working_dir="" $BOLD_YELLOW && echo "-------- evaluation start --------" && $RESET main() { - #prepare + case ${tasks} in + "text-generation") + working_dir="/GenAIEval/GenAIEval/evaluation/lm_evaluation_harness/examples";; + "code-generation") + working_dir="/GenAIEval/GenAIEval/evaluation/bigcode_evaluation_harness/examples";; + *) + echo "Not supported task"; exit 1;; + esac run_benchmark } function prepare() { ## prepare env - working_dir="/GenAIEval" cd ${working_dir} echo "Working in ${working_dir}" echo -e "\nInstalling model requirements..." @@ -54,18 +62,21 @@ function prepare() { else echo "Not found requirements.txt file." fi + if [[ ${device} == "hpu" ]]; then + pip install --upgrade-strategy eager optimum[habana] + fi } function run_benchmark() { cd ${working_dir} - pip install --upgrade-strategy eager optimum[habana] - overall_log="${log_dir}/${device}-${model}-${task}.log" + overall_log="${log_dir}/${device}-${model}-${tasks}-${datasets}.log" + mkdir -p "$(dirname "${overall_log}")" python main.py \ --model hf \ --model_args pretrained=${model} \ - --tasks ${task} \ + --tasks ${datasets} \ --device ${device} \ - --batch_size 8 + --batch_size 112 2>&1 | tee ${overall_log} status=$? 
diff --git a/.github/workflows/scripts/unittest/calc_coverage.sh b/.github/workflows/scripts/unittest/calc_coverage.sh index e00df60d..a2149f3e 100644 --- a/.github/workflows/scripts/unittest/calc_coverage.sh +++ b/.github/workflows/scripts/unittest/calc_coverage.sh @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -source ../../change_color.sh +source ../../change_color LOG_DIR=$1 coverage_compare="${LOG_DIR}/coverage_compare.html" coverage_log_pr="${LOG_DIR}/UnitTestPR-test/coverage_pr" diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index 8562165d..261b775f 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -62,7 +62,7 @@ jobs: - name: Docker Build run: | - docker build -f ${{ github.workspace }}/.github/workflows/docker/ut.dockerfile -t ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }} . + docker build -f ${{ github.workspace }}/.github/workflows/docker/common.dockerfile -t ${{ env.DOCKER_NAME }}:${{ env.DOCKER_TAG }} . 
- name: Docker Run run: | diff --git a/GenAIEval/evaluation/bigcode_evaluation_harness/main.py b/GenAIEval/evaluation/bigcode_evaluation_harness/examples/main.py similarity index 100% rename from GenAIEval/evaluation/bigcode_evaluation_harness/main.py rename to GenAIEval/evaluation/bigcode_evaluation_harness/examples/main.py diff --git a/GenAIEval/evaluation/lm_evaluation_harness/main.py b/GenAIEval/evaluation/lm_evaluation_harness/examples/main.py similarity index 100% rename from GenAIEval/evaluation/lm_evaluation_harness/main.py rename to GenAIEval/evaluation/lm_evaluation_harness/examples/main.py diff --git a/README.md b/README.md index 830274de..68744fe1 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ For evaluating the models on text-generation tasks, we follow the [lm-evaluation ```shell # pip install --upgrade-strategy eager optimum[habana] -cd GenAIEval/evaluation/lm_evaluation_harness +cd GenAIEval/evaluation/lm_evaluation_harness/examples python main.py \ --model gaudi-hf \ --model_args pretrained=EleutherAI/gpt-j-6B \ @@ -29,7 +29,7 @@ python main.py \ ##### CPU ```shell -cd GenAIEval/evaluation/lm_evaluation_harness +cd GenAIEval/evaluation/lm_evaluation_harness/examples python main.py \ --model hf \ --model_args pretrained=EleutherAI/gpt-j-6B \ @@ -57,7 +57,7 @@ For evaluating the models on coding tasks or specifically coding LLMs, we follow #### command line usage ```shell -cd GenAIEval/evaluation/bigcode_evaluation_harness +cd GenAIEval/evaluation/bigcode_evaluation_harness/examples python main.py \ --model "codeparrot/codeparrot-small" \ --tasks "humaneval" \