From 5777912c7b0110242b0caab80f33bf3fb239c305 Mon Sep 17 00:00:00 2001
From: Wovchena
Date: Mon, 18 Dec 2023 14:44:31 +0400
Subject: [PATCH] causal_lm->greedy_causal_lm

---
 .github/workflows/causal_lm_cpp.yml             |  7 ++--
 llm_bench/python/requirements.txt               |  1 +
 text_generation/causal_lm/cpp/CMakeLists.txt    | 14 ++++----
 text_generation/causal_lm/cpp/README.md         | 33 ++++++++++---------
 .../{causal_lm.cpp => greedy_causal_lm.cpp}     |  0
 text_generation/causal_lm/cpp/set_up_and_run.sh |  6 ++--
 6 files changed, 33 insertions(+), 28 deletions(-)
 rename text_generation/causal_lm/cpp/{causal_lm.cpp => greedy_causal_lm.cpp} (100%)

diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml
index 9cd685cf8b..859aa52c58 100644
--- a/.github/workflows/causal_lm_cpp.yml
+++ b/.github/workflows/causal_lm_cpp.yml
@@ -2,15 +2,16 @@ name: causal_lm_cpp
 on:
   pull_request:
     paths:
-      - text_generation/causal_lm/cpp/**
-      - '!text_generation/causal_lm/cpp/README.md'
+      - text_generation/causal_lm/cpp/*
+      - llm_bench/python/**
       - thirdparty/openvino_contrib
       - .github/workflows/causal_lm_cpp.yml
+      - '!**.md'
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true
 jobs:
-  causal_lm_cpp:
+  cpp-greedy_causal_lm-ubuntu:
     runs-on: ubuntu-20.04-8-cores
     steps:
       - uses: actions/checkout@v4
diff --git a/llm_bench/python/requirements.txt b/llm_bench/python/requirements.txt
index 103450fc03..39fef9cb29 100644
--- a/llm_bench/python/requirements.txt
+++ b/llm_bench/python/requirements.txt
@@ -7,6 +7,7 @@ torch
 transformers>=4.33.0
 diffusers>=0.22.0
 optimum>=1.14.0,<1.15.0
+# TODO: replace with slyalin after https://github.com/huggingface/optimum-intel/pull/486/#discussion_r1428853330 is resolved
 git+https://github.com/Wovchena/optimum-intel.git@stateful
 git+https://github.com/openvinotoolkit/nncf.git
 packaging
diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt
index 35c0428576..3ffb385e00 100644
--- a/text_generation/causal_lm/cpp/CMakeLists.txt
+++ b/text_generation/causal_lm/cpp/CMakeLists.txt
@@ -8,19 +8,19 @@ project(causal_lm)
 list(APPEND CUSTOM_OPERATIONS tokenizer)
 add_subdirectory(../../../thirdparty/openvino_contrib/modules/custom_operations/ "${CMAKE_CURRENT_BINARY_DIR}/custom_operations/")
 
-add_executable(causal_lm causal_lm.cpp)
-target_compile_definitions(causal_lm PRIVATE USER_OV_EXTENSIONS_PATH=\"$<TARGET_FILE:user_ov_extensions>\")
+add_executable(greedy_causal_lm greedy_causal_lm.cpp)
+target_compile_definitions(greedy_causal_lm PRIVATE USER_OV_EXTENSIONS_PATH=\"$<TARGET_FILE:user_ov_extensions>\")
 find_package(OpenVINO REQUIRED COMPONENTS Runtime)
-target_link_libraries(causal_lm PRIVATE openvino::runtime user_ov_extensions)
-set_target_properties(causal_lm PROPERTIES CXX_STANDARD 17)
-set_target_properties(causal_lm PROPERTIES CXX_STANDARD_REQUIRED ON)
+target_link_libraries(greedy_causal_lm PRIVATE openvino::runtime user_ov_extensions)
+set_target_properties(greedy_causal_lm PROPERTIES CXX_STANDARD 17)
+set_target_properties(greedy_causal_lm PROPERTIES CXX_STANDARD_REQUIRED ON)
 if(MSVC)
   target_compile_options(
-    causal_lm PRIVATE
+    greedy_causal_lm PRIVATE
     /Wall  # Display all warnings
     /wd4710 /wd4711  # Disable the inline warnings
     /EHsc  # Enable standard C++ stack unwinding, assume functions with extern "C" never throw
   )
 else()
-  target_compile_options(causal_lm PRIVATE -Wall)  # Display all warnings
+  target_compile_options(greedy_causal_lm PRIVATE -Wall)  # Display all warnings
 endif()
diff --git a/text_generation/causal_lm/cpp/README.md b/text_generation/causal_lm/cpp/README.md
index 4bdf33f557..3e0f321d40 100644
--- a/text_generation/causal_lm/cpp/README.md
+++ b/text_generation/causal_lm/cpp/README.md
@@ -1,19 +1,19 @@
-# Casual LM
+# Causal LM
 
-This application showcases inference of a casual language model (LM). It doesn't have many configuration options to encourage the reader to explore and modify the source code. There's a Jupyter notebook which corresponds to this pipeline and discusses how to create an LLM-powered Chatbot: https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/254-llm-chatbot.
+This application showcases inference of a causal language model (LM). It doesn't have many configuration options to encourage the reader to explore and modify the source code. There's a Jupyter notebook which corresponds to this pipeline and discusses how to create an LLM-powered Chatbot: https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/254-llm-chatbot.
 
 > [!NOTE]
 > This project is not for production use.
 
 ## How it works
 
-The program loads a tokenizer, detokenizer, and a model (`.xml` and `.bin`) to OpenVINO. The model is reshaped to batch 1 and variable prompt length. A prompt is tokenized and passed to the model. The model greedily generates token by token until the special end of sequence (EOS) token is obtained. The predicted tokens are converted to chars and printed in a streaming fashion.
+The program loads a model, a tokenizer and a detokenizer (`.xml` and `.bin`) to OpenVINO. A prompt is tokenized and passed to the model. The model greedily generates token by token until the special end of sequence (EOS) token is obtained. The predicted tokens are converted to chars and printed in a streaming fashion.
 
 ## Install OpenVINO Runtime
 
 Install OpenVINO Runtime from an archive: [Linux](https://docs.openvino.ai/2023.2/openvino_docs_install_guides_installing_openvino_from_archive_linux.html). `<INSTALL_DIR>` below refers to the extraction location.
 
-## Build `Casual LM` and `user_ov_extensions`
+## Build `greedy_causal_lm` and `user_ov_extensions`
 
 ```sh
 git submodule update --init
@@ -24,19 +24,22 @@ cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ && cmake --build ./build/ --c
 ## Supported models
 
 1. LLaMA 2
-   1. https://huggingface.co/meta-llama/Llama-2-7b-hf
-   2. https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
-   3. https://huggingface.co/meta-llama/Llama-2-13b-hf
    4. https://huggingface.co/meta-llama/Llama-2-13b-chat-hf
-   5. https://huggingface.co/meta-llama/Llama-2-70b-hf
+   3. https://huggingface.co/meta-llama/Llama-2-13b-hf
+   2. https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
+   1. https://huggingface.co/meta-llama/Llama-2-7b-hf
    6. https://huggingface.co/meta-llama/Llama-2-70b-chat-hf
-2. OpenLLaMA
-   1. https://huggingface.co/openlm-research/open_llama_3b
-   2. https://huggingface.co/openlm-research/open_llama_7b
+   5. https://huggingface.co/meta-llama/Llama-2-70b-hf
+2. [Llama2-7b-WhoIsHarryPotter](https://huggingface.co/microsoft/Llama2-7b-WhoIsHarryPotter)
+3. OpenLLaMA
    3. https://huggingface.co/openlm-research/open_llama_13b
+   1. https://huggingface.co/openlm-research/open_llama_3b
    4. https://huggingface.co/openlm-research/open_llama_3b_v2
+   2. https://huggingface.co/openlm-research/open_llama_7b
    5. https://huggingface.co/openlm-research/open_llama_7b_v2
-3. [Llama2-7b-WhoIsHarryPotter](https://huggingface.co/microsoft/Llama2-7b-WhoIsHarryPotter)
+4. TinyLlama
+   1. https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.6
+   2. https://huggingface.co/TinyLlama/TinyLlama-1.1B-intermediate-step-1195k-token-2.5T
 
 This pipeline can work with other similar topologies produced by `optimum-intel` with the same model signature.
 
@@ -47,14 +50,14 @@ The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upg
 ```sh
 source <INSTALL_DIR>/setupvars.sh
 python -m pip install --upgrade-strategy eager "optimum[openvino]>=1.14" -r ../../../llm_bench/python/requirements.txt ../../../thirdparty/openvino_contrib/modules/custom_operations/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu
-python ../../../llm_bench/python/convert.py --model_id meta-llama/Llama-2-7b-hf --output_dir ./Llama-2-7b-hf/ --save_orig --stateful
+python ../../../llm_bench/python/convert.py --model_id meta-llama/Llama-2-7b-hf --output_dir ./Llama-2-7b-hf/ --precision FP16 --stateful
 python ./convert_tokenizers.py ./Llama-2-7b-hf/
 ```
 
 ## Run
 
-Usage: `causal_lm <openvino_model.xml> <tokenizer.xml> <detokenizer.xml> "<prompt>"`
+Usage: `greedy_causal_lm <openvino_model.xml> <tokenizer.xml> <detokenizer.xml> "<prompt>"`
 
-Example: `./build/causal_lm ./Llama-2-7b-hf/pytorch/dldt/FP32/openvino_model.xml ./tokenizer.xml ./detokenizer.xml "Why is the Sun yellow?"`
+Example: `./build/greedy_causal_lm ./Llama-2-7b-hf/pytorch/dldt/FP16/openvino_model.xml ./tokenizer.xml ./detokenizer.xml "Why is the Sun yellow?"`
 
 To enable Unicode characters for Windows cmd open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot.
diff --git a/text_generation/causal_lm/cpp/causal_lm.cpp b/text_generation/causal_lm/cpp/greedy_causal_lm.cpp
similarity index 100%
rename from text_generation/causal_lm/cpp/causal_lm.cpp
rename to text_generation/causal_lm/cpp/greedy_causal_lm.cpp
diff --git a/text_generation/causal_lm/cpp/set_up_and_run.sh b/text_generation/causal_lm/cpp/set_up_and_run.sh
index 5258d0283b..78bb721e34 100755
--- a/text_generation/causal_lm/cpp/set_up_and_run.sh
+++ b/text_generation/causal_lm/cpp/set_up_and_run.sh
@@ -17,10 +17,10 @@ curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/
 sudo ./ov/install_dependencies/install_openvino_dependencies.sh
 
 source ./ov/setupvars.sh
-python -m pip install --upgrade-strategy eager "optimum[openvino]>=1.14" -r ../../../llm_bench/python/requirements.txt ../../../thirdparty/openvino_contrib/modules/custom_operations/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu && python ../../../llm_bench/python/convert.py --model_id openlm-research/open_llama_3b_v2 --output_dir ./open_llama_3b_v2/ --stateful &
+python -m pip install --upgrade-strategy eager "optimum[openvino]>=1.14" -r ../../../llm_bench/python/requirements.txt ../../../thirdparty/openvino_contrib/modules/custom_operations/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu && python ../../../llm_bench/python/convert.py --model_id openlm-research/open_llama_3b_v2 --output_dir ./open_llama_3b_v2/ --precision FP16 --stateful &
 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
 cmake --build ./build/ --config Release -j
 wait
-python ./convert_tokenizers.py ./open_llama_3b_v2/pytorch/dldt/FP32/
-./build/causal_lm ./open_llama_3b_v2/pytorch/dldt/FP32/openvino_model.xml ./tokenizer.xml ./detokenizer.xml "return 0"
+python ./convert_tokenizers.py ./open_llama_3b_v2/pytorch/dldt/FP16/
+./build/greedy_causal_lm ./open_llama_3b_v2/pytorch/dldt/FP16/openvino_model.xml ./tokenizer.xml ./detokenizer.xml "return 0"
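
For reference, the greedy loop the README's "How it works" section describes boils down to roughly the following. This is a minimal sketch against OpenVINO's C++ API, not the verbatim contents of `greedy_causal_lm.cpp`; the tensor names `input_ids`/`logits`, the EOS token id, the pre-tokenized prompt, and the stateless full-sequence re-feeding are all assumptions made for illustration:

```cpp
#include <openvino/openvino.hpp>
#include <algorithm>
#include <iostream>
#include <vector>

int main(int argc, char* argv[]) {
    constexpr int64_t EOS_TOKEN_ID = 2;  // assumption: LLaMA-style EOS id
    ov::Core core;
    // The real sample also registers the tokenizer extension and runs the
    // tokenizer/detokenizer models; here the prompt is assumed pre-tokenized.
    ov::InferRequest lm = core.compile_model(argv[1], "CPU").create_infer_request();

    std::vector<int64_t> tokens{1, 22172};  // assumption: [BOS, "hello"]
    for (int step = 0; step < 100; ++step) {
        // Stateless variant: re-feed the whole sequence each step. A stateful
        // (KV-cache) export would feed only the newest token after the first run.
        ov::Tensor input_ids{ov::element::i64, {1, tokens.size()}, tokens.data()};
        lm.set_tensor("input_ids", input_ids);
        lm.infer();
        ov::Tensor logits = lm.get_tensor("logits");
        size_t vocab_size = logits.get_shape().back();
        const float* last = logits.data<float>() + logits.get_size() - vocab_size;
        int64_t next = std::max_element(last, last + vocab_size) - last;  // greedy argmax
        if (next == EOS_TOKEN_ID) {
            break;  // stop at the special end-of-sequence token
        }
        tokens.push_back(next);
        std::cout << next << ' ' << std::flush;  // real program detokenizes and streams text
    }
    return 0;
}
```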
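The `USER_OV_EXTENSIONS_PATH` definition that the CMakeLists.txt hunk carries over to the renamed target expands to the path of the built `user_ov_extensions` library, which is how the sample can find the custom tokenizer operations at runtime. Presumably the program consumes it along these lines (hedged sketch; the patch itself does not show this code):

```cpp
#include <openvino/openvino.hpp>

ov::Core make_core_with_tokenizer_ops() {
    ov::Core core;
    // USER_OV_EXTENSIONS_PATH is stringified by the CMake generator expression
    // $<TARGET_FILE:user_ov_extensions> into the shared library's build path,
    // so the tokenizer/detokenizer custom ops can be registered before loading
    // tokenizer.xml and detokenizer.xml.
    core.add_extension(USER_OV_EXTENSIONS_PATH);
    return core;
}
```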