Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Clean up VLMPipeline #923

Merged
merged 229 commits into from
Oct 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
229 commits
Select commit Hold shift + click to select a range
8d793c5
Create /chatglm_cpp
wenyi5608 Nov 29, 2023
212de8b
Delete llm/chatglm_cpp
wenyi5608 Nov 29, 2023
669dd53
Create CMakeLists.txt
wenyi5608 Nov 29, 2023
c04d1c7
Update CMakeLists.txt
wenyi5608 Nov 29, 2023
6847898
Add files via upload
wenyi5608 Nov 29, 2023
9a368bb
add chatglm_cpp demo
wenyi5608 Nov 29, 2023
b9fe37f
Update README.md
wenyi5608 Nov 29, 2023
a2bbd4a
Update chatglm.cpp
wenyi5608 Nov 29, 2023
0b480c9
Update README.md
wenyi5608 Dec 1, 2023
5eb8c4a
Update model compile from xml
wenyi5608 Dec 7, 2023
995c82d
Update README.md
wenyi5608 Dec 7, 2023
0823c6c
Update README.md
wenyi5608 Dec 7, 2023
8d70a45
Update README.md
wenyi5608 Dec 7, 2023
d74fddd
Update README.md
wenyi5608 Dec 13, 2023
cafdec4
Update README.md
wenyi5608 Dec 13, 2023
3319533
Update chatglm.cpp
wenyi5608 Dec 13, 2023
417555f
Update chatglm.cpp
wenyi5608 Dec 13, 2023
a244f9b
Update chatglm.cpp
wenyi5608 Dec 13, 2023
20931c6
Update chatglm.cpp
wenyi5608 Dec 14, 2023
5c09800
Update README.md
wenyi5608 Dec 14, 2023
85a3945
Update README.md
wenyi5608 Dec 14, 2023
6af1acd
Update chatglm.cpp
wenyi5608 Dec 15, 2023
fefc606
Add files via upload
wenyi5608 Dec 15, 2023
36ca098
Update chatglm.cpp
wenyi5608 Dec 15, 2023
c45b85c
Update README.md
wenyi5608 Dec 15, 2023
b2909b8
Update README.md
wenyi5608 Dec 15, 2023
67b68f5
Update README.md
wenyi5608 Dec 15, 2023
dd60a8f
Update README.md
wenyi5608 Dec 15, 2023
035a7d6
Update chatglm.cpp
wenyi5608 Dec 15, 2023
7b6f607
Update chatglm.cpp
wenyi5608 Dec 20, 2023
ba888bb
Update chatglm.cpp
wenyi5608 Dec 27, 2023
673d86e
Update README.md
wenyi5608 Dec 27, 2023
c30bdf8
Update chatglm.cpp
wenyi5608 Dec 27, 2023
fcadd45
Update README.md
wenyi5608 Dec 27, 2023
35a9d66
Update chatglm.cpp
wenyi5608 Dec 29, 2023
57b9f68
Update chatglm.cpp
wenyi5608 Jan 23, 2024
1703821
Create clip.h
wenyi5608 Jul 26, 2024
8fd2fe7
minicmpv_2 demo
wenyi5608 Jul 26, 2024
2dda201
Update submodul
yangsu2022 Jul 26, 2024
59b394f
Add minicpmv2
wenyi5608 Jul 26, 2024
392dd62
multi-round chat
wenyi5608 Aug 11, 2024
c4f9a19
multi-round chat
wenyi5608 Aug 11, 2024
ae735a8
multi-round chat
wenyi5608 Aug 11, 2024
4879efc
Update minicpmv.h
wenyi5608 Aug 11, 2024
6507280
multi-round chat
wenyi5608 Aug 11, 2024
9b67765
Add files via upload
wenyi5608 Aug 11, 2024
f10989f
Update README.md
wenyi5608 Aug 11, 2024
a20db15
Update minicpmv_ov.cpp
wenyi5608 Aug 11, 2024
5c4f254
using namespace ov
Wovchena Aug 12, 2024
bbefc17
Merge branch 'wenyi5608-stateful' into add-openbmb/MiniCPM-V-2-sample
Wovchena Aug 12, 2024
73db9a6
Add VLMPipeline
Wovchena Aug 12, 2024
8d0ad82
Add python
Wovchena Aug 13, 2024
4bc8a6d
Add constructor
Wovchena Aug 13, 2024
bea9a46
Move ModelConfig
Wovchena Aug 14, 2024
08ad5a3
generate(imgage)
Wovchena Aug 14, 2024
9dbf0e0
rename
Wovchena Aug 15, 2024
448a35e
Tensor
Wovchena Aug 15, 2024
8f02b9f
Remove output_fixed_len
Wovchena Aug 15, 2024
ceba7c9
merge generate
Wovchena Aug 15, 2024
6420a39
Add Clip
Wovchena Aug 15, 2024
6f5c256
remove n_threads
Wovchena Aug 16, 2024
fb2dcd0
VisionEncoder
Wovchena Aug 16, 2024
6b2aaa3
tensor
Wovchena Aug 16, 2024
f54e458
tensor
Wovchena Aug 16, 2024
4611834
tensor
Wovchena Aug 19, 2024
45d2f85
tensor
Wovchena Aug 19, 2024
431be5f
drop n_img_pos
Wovchena Aug 19, 2024
93d3eba
Split encoder and resampler
Wovchena Aug 19, 2024
1b2bc1e
Add encode for VisionEncoder
Wovchena Aug 19, 2024
0ea4c09
clean up
Wovchena Aug 19, 2024
18fa984
Merge branch 'master' into add-openbmb/MiniCPM-V-2-sample
Wovchena Aug 20, 2024
3e4af2c
Remove int4
Wovchena Aug 21, 2024
02c79fc
workaround
Wovchena Aug 21, 2024
1124ddf
fix
Wovchena Aug 21, 2024
79e3e04
public
Wovchena Aug 21, 2024
3edfd91
workaround
Wovchena Aug 21, 2024
b5f04fc
x and y ratios
Wovchena Aug 21, 2024
2514ae2
Merge branch 'master' into add-openbmb/MiniCPM-V-2-sample
Wovchena Aug 21, 2024
066ff0c
Merge branch 'master' into add-openbmb/MiniCPM-V-2-sample
Wovchena Aug 21, 2024
5785cb2
Delete resize from ov_minicpm-v2-test.py
Wovchena Aug 21, 2024
0d28a64
resample
Wovchena Aug 22, 2024
c18219b
Update llm/mincpmv2_cpp/export_MiniCPM-V-2.py
Wovchena Aug 22, 2024
db8330a
ref
Wovchena Aug 23, 2024
b176c26
fix dims
Wovchena Aug 23, 2024
7c5f631
Batch inference
Wovchena Aug 26, 2024
7972bc2
Hide headers
Wovchena Aug 26, 2024
800bcc6
Return string
Wovchena Aug 26, 2024
15dbbaf
Remove
Wovchena Aug 26, 2024
7b4713c
Factor vision_encoder.hpp out
Wovchena Aug 28, 2024
e3dd4f6
Merge branch 'master' into add-openbmb/MiniCPM-V-2-sample
Wovchena Aug 28, 2024
2d1d6b7
Update README.md
Wovchena Aug 28, 2024
8a9f4b9
fix warnings
Wovchena Aug 28, 2024
b51b981
Encoder AnyMap
Wovchena Aug 28, 2024
3fce7c7
Reproduce
Wovchena Aug 29, 2024
f5eb2b3
Remove members
Wovchena Aug 29, 2024
9c18ff9
relative link
Wovchena Aug 29, 2024
16d9185
relative link
Wovchena Aug 29, 2024
f6cc888
vlm_config.hpp
Wovchena Aug 29, 2024
19381a0
Add generate methods
Wovchena Aug 29, 2024
914293f
Add vlm_config
Wovchena Aug 29, 2024
dd30ca0
m_
Wovchena Aug 29, 2024
8a155b5
Merge branch 'master' into add-openbmb/MiniCPM-V-2-sample
Wovchena Aug 29, 2024
8e1819f
[GHA] Update OV refs to 2024.4
akladiev Aug 30, 2024
f169ec8
Remove unused
Wovchena Aug 30, 2024
21110e2
Add overloads
Wovchena Aug 30, 2024
4cfba8a
[GHA] Update OV refs to 2024.4 (#810)
Wovchena Aug 30, 2024
0052922
Tensor->EncodedImage
Wovchena Sep 2, 2024
86187dd
Add VisionEncoder docstrings
Wovchena Sep 2, 2024
4d79fda
public
Wovchena Sep 2, 2024
52a7c12
Update src/cpp/src/vision_encoder.cpp
Wovchena Sep 3, 2024
278ff39
Document shapes
Wovchena Sep 4, 2024
94a2f3f
Remove generate(AnyMap)
Wovchena Sep 4, 2024
675c8f0
Beautify sample
Wovchena Sep 4, 2024
b79874b
fix PerfMetrics
pavel-esir Sep 4, 2024
b4955f5
TTTFT -> TTFT
pavel-esir Sep 4, 2024
c3f895c
Merge branch 'releases/2024/4' into add-openbmb/MiniCPM-V-2-sample
Wovchena Sep 4, 2024
94573f3
Replace stable diffusion 1.5 model
yatarkan Sep 4, 2024
88c100c
std::move
Wovchena Sep 4, 2024
9a737f2
add tests
pavel-esir Sep 4, 2024
8ccfdf7
Change Stable Diffusion v1.5 model (#824)
Wovchena Sep 4, 2024
5773ae3
some corrections
pavel-esir Sep 4, 2024
41db203
temporary disable causal_lm win tests
pavel-esir Sep 4, 2024
6321b4d
fix PerfMetrics (#823)
Wovchena Sep 5, 2024
d1e5c13
Merge branch 'releases/2024/4' into add-openbmb/MiniCPM-V-2-sample
Wovchena Sep 5, 2024
6c0e656
[2024.4] update optimum intel commit to include mxfp4 conversion (#828)
eaidova Sep 5, 2024
ba0491a
[2024.4] use perf metrics for genai in llm bench (#830)
eaidova Sep 5, 2024
0c2f4c9
Tokenizerrs update
ilya-lavrenov Sep 5, 2024
7734510
Use tarballs
ilya-lavrenov Sep 5, 2024
18fc740
Use OpenVINO 2024.4 RC2
ilya-lavrenov Sep 6, 2024
1914a70
updated pybind version
mryzhov Sep 6, 2024
c8b3e3f
Merge branch 'releases/2024/4' into add-openbmb/MiniCPM-V-2-sample
Wovchena Sep 6, 2024
2d99414
improve utf8 handling for DecodedResults in pybind
pavel-esir Sep 6, 2024
52ea56d
alighn printing DecodedResults scores in py_generate_pipeline and llm…
pavel-esir Sep 6, 2024
fb8c7a4
Improve utf8 handling for DecodedResults in pybind (#837)
pavel-esir Sep 6, 2024
a75df54
replace invalid utf8 bytes when file is opened
pavel-esir Sep 6, 2024
355240e
print with 'cat pred.txt' for debug
pavel-esir Sep 6, 2024
1aa61f8
replace � -> ""
pavel-esir Sep 6, 2024
6d3d6b6
Apply suggestions from code review
pavel-esir Sep 6, 2024
95ceb92
Merge branch 'releases/2024/4' into add-openbmb/MiniCPM-V-2-sample
Wovchena Sep 6, 2024
8f87768
accept vector of tensors
Wovchena Sep 6, 2024
1a896d2
Fails at runtime with out of bounds
Wovchena Sep 10, 2024
adf0db3
temp
Wovchena Sep 10, 2024
3928162
Temo
Wovchena Sep 10, 2024
7920053
Temo
Wovchena Sep 10, 2024
8f51232
nan or seg fault
Wovchena Sep 19, 2024
daa8f6c
Still need to fix images
Wovchena Sep 19, 2024
348ec37
update README.md
Wovchena Sep 19, 2024
f9358d4
uncomment conversion
Wovchena Sep 19, 2024
e92b778
Comment out dumped tokens
Wovchena Sep 19, 2024
7de6d00
Merge branch 'master' into miniCPM-V-2_6
Wovchena Sep 20, 2024
f88b45e
Resolve conflicts
Wovchena Sep 20, 2024
3a0ad0b
Resolve conflicts
Wovchena Sep 20, 2024
dce2d79
Remove dump
Wovchena Sep 20, 2024
2450d96
Merge branch 'master' into miniCPM-V-2_6
Wovchena Sep 30, 2024
1bcecdf
chat template
Wovchena Oct 1, 2024
bdc4a65
temp
Wovchena Oct 2, 2024
5f30821
Tokenizers bug
Wovchena Oct 2, 2024
d43d06b
Merge branch 'master' into miniCPM-V-2_6
Wovchena Oct 2, 2024
bde9fb7
Read use_image_id
Wovchena Oct 2, 2024
6a44ba7
Merge branch 'miniCPM-V-2_6' into image-miniCPM-V-2_6
Wovchena Oct 2, 2024
c4d6fec
increase timeout
Wovchena Oct 2, 2024
a7e18a7
Delete comment
Wovchena Oct 2, 2024
5f9e5f7
prints
Wovchena Oct 3, 2024
63cdd24
Fix
Wovchena Oct 4, 2024
1e68d7a
Clean up
Wovchena Oct 4, 2024
17f69ec
workflow
Wovchena Oct 4, 2024
ce665b8
unpipe
Wovchena Oct 4, 2024
ea946d1
unzip
Wovchena Oct 4, 2024
64c93b8
dot
Wovchena Oct 4, 2024
450b6f7
cmd
Wovchena Oct 4, 2024
3356f2a
l
Wovchena Oct 4, 2024
23d087f
ls
Wovchena Oct 4, 2024
c796323
ls
Wovchena Oct 4, 2024
1e15496
env
Wovchena Oct 4, 2024
ca6b9ac
ubuntu
Wovchena Oct 4, 2024
416323d
torchvision
Wovchena Oct 4, 2024
de7d768
rename
Wovchena Oct 4, 2024
08d133c
docstrings
Wovchena Oct 4, 2024
447e745
add set_chat_template
Wovchena Oct 4, 2024
ebaadb7
Finilize chat_template
Wovchena Oct 4, 2024
bfad182
Simplify test
Wovchena Oct 4, 2024
be9e203
Fix multiple images
Wovchena Oct 4, 2024
3d0e5ba
Temp
Wovchena Oct 4, 2024
57bb044
Add macos
Wovchena Oct 6, 2024
5efc24b
and
Wovchena Oct 6, 2024
c473d2a
cmp
Wovchena Oct 6, 2024
c780a2e
cmp
Wovchena Oct 6, 2024
68d0134
matrix
Wovchena Oct 6, 2024
7f4cafb
runner
Wovchena Oct 6, 2024
b316c92
env
Wovchena Oct 6, 2024
797d8ed
shorten
Wovchena Oct 6, 2024
3d9886b
matrix
Wovchena Oct 6, 2024
5b00440
4 cores
Wovchena Oct 6, 2024
798f42d
./
Wovchena Oct 6, 2024
ee057ba
env
Wovchena Oct 6, 2024
e88ae49
ld
Wovchena Oct 6, 2024
382bcbc
ld
Wovchena Oct 6, 2024
30dc931
tbb
Wovchena Oct 6, 2024
4c61b12
and
Wovchena Oct 6, 2024
1ff0889
8
Wovchena Oct 6, 2024
3a800a1
LD_LIBRARY_PATH
Wovchena Oct 6, 2024
8a3a21b
TBB_DIR
Wovchena Oct 6, 2024
99911bc
move
Wovchena Oct 6, 2024
6ac98c5
echo
Wovchena Oct 6, 2024
53f3154
16
Wovchena Oct 6, 2024
55af01e
LD
Wovchena Oct 6, 2024
f064bdc
Remove ls
Wovchena Oct 6, 2024
21da18b
timeout-minutes: 1
Wovchena Oct 6, 2024
44f21c0
timeout-minutes: 2
Wovchena Oct 6, 2024
02c928e
setuvars
Wovchena Oct 6, 2024
f327e8c
fix jump
Wovchena Oct 6, 2024
0eb693e
tbb
Wovchena Oct 6, 2024
ddb1c29
DYLD_LIBRARY_PATH
Wovchena Oct 6, 2024
0ea2656
source ./ov/setupvars.sh
Wovchena Oct 6, 2024
c1d1f3b
4
Wovchena Oct 6, 2024
7a00aa5
timeout-minutes: 8
Wovchena Oct 7, 2024
33e1cef
timeout-minutes: 16
Wovchena Oct 7, 2024
12a7134
timeout-minutes: 32
Wovchena Oct 7, 2024
c166e5f
Update the model
Wovchena Oct 7, 2024
df9420d
Fix macos compilation
Wovchena Oct 7, 2024
871f334
spelling
Wovchena Oct 7, 2024
d3bb229
Clean up VLMPipeline
Wovchena Oct 7, 2024
a5a58c2
Remove error handling
Wovchena Oct 7, 2024
7441a18
Allow [NHWC] and [HWC]
Wovchena Oct 8, 2024
eaaa971
Move subtract_chat_tokenized_inputs' implementation to .cpp
Wovchena Oct 8, 2024
5eb7011
Revert test to drop mac
Wovchena Oct 8, 2024
ff27cf7
Fix layout description
Wovchena Oct 8, 2024
770f7ed
Merge branch 'master' into clean-up
Wovchena Oct 9, 2024
6609a08
Remove py constructor
Wovchena Oct 9, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/causal_lm_cpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -708,12 +708,12 @@ jobs:
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python ./samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py ./miniCPM-V-2_6/
wget https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11
wget https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --output-document cat.jpg

- name: Run chat sample
run: >
source ./ov/setupvars.sh
&& timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ d5fbbd1a-d484-415c-88cb-9986625b7b11
&& timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ cat.jpg
<<< $'What is on the image?\nWhat is special on the image?'

cpp-continuous-batching-ubuntu:
Expand Down
2 changes: 1 addition & 1 deletion samples/cpp/visual_language_chat/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ export_MiniCPM-V-2_6.py miniCPM-V-2_6

## Run

https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 can be used as a sample image.
[This image](https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11) can be used as a sample image.

`visual_language_chat miniCPM-V-2_6 319483352-d5fbbd1a-d484-415c-88cb-9986625b7b11.jpg`

Expand Down
6 changes: 2 additions & 4 deletions samples/cpp/visual_language_chat/visual_language_chat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,10 @@ int main(int argc, char* argv[]) try {

pipe.start_chat();
std::cout << "question:\n";
if (!std::getline(std::cin, prompt)) {
throw std::runtime_error("std::cin failed");
}
std::getline(std::cin, prompt);
pipe.generate(
prompt,
ov::genai::image(std::move(image)),
ov::genai::image(image),
ov::genai::streamer(print_subword)
);
std::cout << "\n----------\n"
Expand Down
8 changes: 4 additions & 4 deletions src/cpp/include/openvino/genai/vision_encoder.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

namespace ov::genai {
/// @brief A pair describing image size.
struct HeightWidth {
struct ImageSize {
/// @brief Height of a corresponding image.
size_t height;
/// @brief Width of a corresponding image.
Expand All @@ -25,16 +25,16 @@ struct EncodedImage {
ov::Tensor resized_source;
/// @brief A size of an image used to compute embeddings for
/// divided by ProcessorConfig's patch_size.
HeightWidth resized_source_size;
ImageSize resized_source_size;
/// @brief Embeddings of images obtained from a source image by
/// slicing at no more than max_slice_nums pieces and resizing.
/// The tensor's shape is
/// [slice_y, slice_x, number_of_embeddings, embedding_size].
/// slices_sizes.size() == slice_y * slice_x.
ov::Tensor slices;
/// @brief Flattened sizes of images used to compute embeddings
/// @brief A size of images used to compute embeddings
/// stored in slices member divided by ProcessorConfig's patch_size.
std::vector<HeightWidth> slices_sizes;
ImageSize slices_size;
};

/// @brief A class used to infer embeddings of an image using
Expand Down
31 changes: 4 additions & 27 deletions src/cpp/include/openvino/genai/vlm_pipeline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,37 +65,14 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
explicit VLMPipeline(
const std::filesystem::path& model_dir,
const std::string& device="CPU",
const ov::AnyMap device_config={},
ov::Core core=ov::Core{}
) : VLMPipeline{
model_dir,
Tokenizer(model_dir.string(), device_config),
device,
device_config,
core
} {}

/// @brief Construct a pipeline form a folder containing model IRs
/// and from a Tokenizer instance.
/// @param model_dir A folder to read model IRs.
/// @param tokenizer An instance of Tokenizer to use.
/// @param device Inference device.
/// @param device_config A config to pass to ov::Core.set_property()
/// and ov::Core::compile_model().
/// @param core ov::Core instance to use.
VLMPipeline(
const std::filesystem::path& model_dir,
const ov::genai::Tokenizer& tokenizer,
const std::string& device="CPU",
const ov::AnyMap device_config={},
ov::Core core=ov::Core{}
const ov::AnyMap device_config={}
);

/// @brief Default destructor.
~VLMPipeline();

/// @brief Generate a response given a prompt and any number of
/// uint8 RGB images.
/// uint8 RGB images with [NCHW] or [CHW] layout.
/// @param prompt A prompt to respond to.
/// @param images Images to be prepended to a prompt.
/// @param generation_config A config to follow for text generation.
Expand All @@ -120,7 +97,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
/// @brief Generate a response given a prompt and arbitrary number
/// of ov::Property instances.
/// Example:
/// generate("text", image(std::move(rgb)), do_sample(true));
/// generate("text", image(rgb), do_sample(true));
/// @param prompt A prompt to respond to.
/// @param ...properties ov::Property instances to be combined into
/// ov::AnyMap.
Expand Down Expand Up @@ -166,7 +143,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {

/*
* utils that allow to use generate() in the following way:
* pipe.generate(prompt, ov::genai::image(std::move(image_tensor))).
* pipe.generate(prompt, ov::genai::image(image_tensor)).
*/
static constexpr ov::Property<ov::Tensor> image{"image"};
static constexpr ov::Property<std::vector<ov::Tensor>> images{"images"};
Expand Down
3 changes: 0 additions & 3 deletions src/cpp/src/clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,6 @@
// I'll gradually clean and extend it
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch

#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.hpp"
ilya-lavrenov marked this conversation as resolved.
Show resolved Hide resolved

#include <cassert>
#include <cmath>
#include <cstdlib>
Expand Down
4 changes: 1 addition & 3 deletions src/cpp/src/clip.hpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#ifndef CLIP_H
#define CLIP_H
#pragma once

#include <vector>
#include <numeric>
Expand Down Expand Up @@ -53,4 +52,3 @@ bool bicubic_resize(const clip_image_u8& img, clip_image_u8& dst, int target_wid

/** preprocess img and store the result in res_imgs, pad_to_square may be overriden to false depending on model configuration */
clip_image_f32 clip_image_preprocess(struct clip_ctx& ctx, const clip_image_u8& img);
#endif // CLIP_H
14 changes: 14 additions & 0 deletions src/cpp/src/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,20 @@ std::pair<ov::AnyMap, ov::AnyMap> split_core_complile_config(const ov::AnyMap& p
return {core_config, compile_config};
};

/// Return the tail of `minuend` not covered by `subtrahend`: a TokenizedInputs
/// holding the trailing tokens of minuend.input_ids (the part appended after
/// the shorter tokenization) together with an all-ones attention mask.
ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& minuend, const ov::genai::TokenizedInputs& subtrahend) {
    const size_t total = minuend.input_ids.get_size();
    const size_t prefix = subtrahend.input_ids.get_size();
    // NOTE(review): assumes prefix <= total; size_t subtraction would wrap otherwise — confirm callers guarantee this.
    ov::Shape tail_shape{1, total - prefix};

    ov::Tensor tail_ids(ov::element::i64, tail_shape);
    const int64_t* src = minuend.input_ids.data<int64_t>();
    std::copy(src + prefix, src + total, tail_ids.data<int64_t>());

    ov::Tensor tail_mask(ov::element::i64, tail_shape);
    std::fill_n(tail_mask.data<int64_t>(), tail_shape[1], 1);

    return {tail_ids, tail_mask};
}
} // namespace utils
} // namespace genai
} // namespace ov
15 changes: 1 addition & 14 deletions src/cpp/src/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,20 +86,7 @@ ProcessorConfig from_any_map(

std::pair<ov::AnyMap, ov::AnyMap> split_core_complile_config(const ov::AnyMap& plugin_config);

/// @brief Compute the suffix of chat-tokenized inputs: the tokens present in
/// `first` but not in its prefix `second`, with a matching all-ones attention mask.
/// @param first The longer tokenization (e.g. full chat history).
/// @param second The shorter tokenization, assumed to be a prefix of `first`.
/// @return TokenizedInputs covering only the trailing
/// first.input_ids.get_size() - second.input_ids.get_size() tokens.
inline ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& first, const ov::genai::TokenizedInputs& second){
    auto first_size = first.input_ids.get_size();
    auto second_size = second.input_ids.get_size();
    // size_t subtraction: callers must guarantee `second` is not longer than `first`.
    ov::Shape new_shape{1, first_size - second_size};

    ov::Tensor new_input_ids(ov::element::i64, new_shape);
    auto data_ptr = first.input_ids.data<int64_t>();
    std::copy(data_ptr + second_size, data_ptr + first_size, new_input_ids.data<int64_t>());

    ov::Tensor new_attention_mask(ov::element::i64, new_shape);
    std::fill_n(new_attention_mask.data<int64_t>(), new_shape[1], 1);

    return {new_input_ids, new_attention_mask};
}
ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& minuend, const ov::genai::TokenizedInputs& subtrahend);
} // namespace utils
} // namespace genai
} // namespace ov
40 changes: 29 additions & 11 deletions src/cpp/src/vision_encoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ std::vector<int64_t> bucket_size_right(const std::vector<float>& fractional_coor
ov::Tensor prepare_vis_position_ids(
const ov::Tensor& pixel_values,
const ov::Tensor& patch_attention_mask,
const std::vector<HeightWidth> tgt_sizes,
const std::vector<ImageSize> tgt_sizes,
size_t patch_size,
size_t num_patches_per_side
) {
Expand Down Expand Up @@ -283,7 +283,7 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
};
std::vector<std::vector<clip_image_u8>> imgs = ::slice_image(source, max_slice_nums, scale_resolution, patch_size, never_split);
std::vector<std::vector<ov::Tensor>> results;
std::vector<std::vector<HeightWidth>> sizes;
std::vector<std::vector<ImageSize>> sizes;

// std::vector<clip_image_f32*> img_res_v; // format N x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
std::vector<std::vector<clip_image_f32>> preprocessed{imgs.size()};
Expand All @@ -296,7 +296,7 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
});

const clip_image_f32& resized_preprocessed = preprocessed.at(0).at(0);
HeightWidth resized_source_size{resized_preprocessed.ny / patch_size, resized_preprocessed.nx / patch_size};
ImageSize resized_source_size{resized_preprocessed.ny / patch_size, resized_preprocessed.nx / patch_size};
ov::Tensor input_tensor{ov::element::f32, {1, 3, size_t(resized_preprocessed.ny), size_t(resized_preprocessed.nx)}, (void*)(resized_preprocessed.buf.data())};
ov::Tensor pixel_values = preprocess_for_encoder(input_tensor, patch_size);
encoder.set_tensor("pixel_values", pixel_values);
Expand All @@ -314,35 +314,53 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
return {std::move(resized_source), resized_source_size};
}

HeightWidth size{
ImageSize raw_size{
size_t(preprocessed.at(1).at(0).ny),
size_t(preprocessed.at(1).at(0).nx)
};
std::vector<HeightWidth> sliced_sizes;
size_t n_patches = size.height / patch_size * size.width / patch_size,
ImageSize slices_size{
raw_size.height / patch_size,
raw_size.width / patch_size
};
size_t n_patches = slices_size.height * slices_size.width,
old_hidden_size = resized_source.get_shape().at(2);
ov::Tensor encoded_slices{ov::element::f32, {preprocessed.size() - 1, preprocessed.at(1).size(), n_patches, old_hidden_size}};
for (size_t row = 1; row < preprocessed.size(); ++row) {
for (size_t col = 0; col < preprocessed.at(row).size(); ++col) {
clip_image_f32& elem = preprocessed.at(row).at(col);
sliced_sizes.push_back({elem.ny / patch_size, elem.nx / patch_size});
ov::Tensor pixel_values = preprocess_for_encoder(
{ov::element::f32, {1, 3, size_t(elem.ny), size_t(elem.nx)}, elem.buf.data()},
patch_size
);
encoder.set_tensor("pixel_values", pixel_values);
ov::Tensor patch_attention_mask{ov::element::boolean, {1, 1, sliced_sizes.back().height * sliced_sizes.back().width}};
ov::Tensor patch_attention_mask{ov::element::boolean, {1, 1, slices_size.height * slices_size.width}};
std::fill_n(patch_attention_mask.data<bool>(), patch_attention_mask.get_size(), true);
encoder.set_tensor("patch_attention_mask", patch_attention_mask);
ov::Tensor position_ids = prepare_vis_position_ids(pixel_values, patch_attention_mask, {sliced_sizes.back()}, ctx_clip.patch_size, ctx_clip.image_size / ctx_clip.patch_size);
ov::Tensor position_ids = prepare_vis_position_ids(pixel_values, patch_attention_mask, {slices_size}, ctx_clip.patch_size, ctx_clip.image_size / ctx_clip.patch_size);
encoder.set_tensor("position_ids", position_ids);
const ov::Tensor& old = encoder.get_output_tensor();
encoder.set_output_tensor({ov::element::f32, {1, n_patches, old_hidden_size}, encoded_slices.data<float>() + ((row - 1) * preprocessed.at(row).size() + col) * n_patches * old_hidden_size});
encoder.infer();
encoder.set_output_tensor(old);
}
}
return {resized_source, resized_source_size, encoded_slices, sliced_sizes};
return {resized_source, resized_source_size, encoded_slices, slices_size};
}

/// Build a ProcessorConfig from a config map: start from the map's
/// "processor_config" entry when present (otherwise from `initial`),
/// then let individual top-level keys override single fields.
ProcessorConfig from_any_map(
    const ov::AnyMap& config_map,
    const ProcessorConfig& initial
) {
    ProcessorConfig result = initial;
    auto found = config_map.find("processor_config");
    if (found != config_map.end()) {
        result = found->second.as<ProcessorConfig>();
    }
    using utils::read_anymap_param;
    read_anymap_param(config_map, "patch_size", result.patch_size);
    read_anymap_param(config_map, "scale_resolution", result.scale_resolution);
    read_anymap_param(config_map, "max_slice_nums", result.max_slice_nums);
    read_anymap_param(config_map, "norm_mean", result.norm_mean);
    read_anymap_param(config_map, "norm_std", result.norm_std);
    return result;
}
}

Expand All @@ -366,7 +384,7 @@ EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ProcessorConfi
}

/// @brief Encode an image using per-call overrides supplied in an AnyMap.
/// Merges config_map over the member m_processor_config via the file-local
/// from_any_map() and delegates to the ProcessorConfig overload.
/// (The page scrape left both the old `utils::from_any_map` and the new
/// `from_any_map` return lines; this resolves to the new, file-local helper.)
EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ov::AnyMap& config_map) {
    return encode(image, from_any_map(
        config_map, m_processor_config
    ));
}
Loading
Loading