diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index 9825356234..916074ced1 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -2,7 +2,8 @@ name: Build PR documentation on: pull_request: - branches: [ main ] + branches: + - main concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -10,7 +11,8 @@ concurrency: jobs: build_documentation: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 + env: COMMIT_SHA: ${{ github.event.pull_request.head.sha }} PR_NUMBER: ${{ github.event.number }} @@ -18,16 +20,21 @@ jobs: PR_CLONE_URL: ${{ github.event.pull_request.head.repo.clone_url }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: - repository: 'huggingface/doc-builder' + repository: "huggingface/doc-builder" path: doc-builder - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: - repository: 'huggingface/optimum-intel' + repository: "huggingface/optimum-intel" path: optimum-intel + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: 3.9 + - name: Setup environment run: | pip uninstall -y doc-builder diff --git a/.github/workflows/check_code_quality.yml b/.github/workflows/check_code_quality.yml deleted file mode 100644 index c76b6f8042..0000000000 --- a/.github/workflows/check_code_quality.yml +++ /dev/null @@ -1,54 +0,0 @@ -name: check_code_quality - -on: - push: - branches: [ main ] - paths: - - "optimum/**.py" - - "tests/**.py" - - "examples/**.py" - - pull_request: - branches: [ main ] - paths: - - "optimum/**.py" - - "tests/**.py" - - "examples/**.py" - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - build: - strategy: - fail-fast: false - matrix: - python-version: [3.8] - os: [ubuntu-latest] - - runs-on: ${{ matrix.os }} - steps: - - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Create and start a virtual environment - run: | - python -m venv venv - source venv/bin/activate - - name: Install dependencies - run: | - source venv/bin/activate - pip install --upgrade pip - pip install .[quality] - - - name: Check style with black - run: | - source venv/bin/activate - black --check . - - name: Check style with ruff - run: | - source venv/bin/activate - ruff check . diff --git a/.github/workflows/dockerfile_sanity.yml b/.github/workflows/dockerfile_sanity.yml index 12be9a5b15..060b80ca45 100644 --- a/.github/workflows/dockerfile_sanity.yml +++ b/.github/workflows/dockerfile_sanity.yml @@ -5,40 +5,40 @@ on: branches: - main paths: - - 'docker/Dockerfile.intel' - + - "docker/Dockerfile.intel" + pull_request: branches: - main paths: - - 'docker/Dockerfile.intel' + - "docker/Dockerfile.intel" jobs: build_and_run: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - - name: Checkout code - uses: actions/checkout@v4 + - name: Checkout code + uses: actions/checkout@v4 - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 - - name: Build and Run Docker Image - run: | - IMAGE_NAME="intel_image:latest" - docker build -f docker/Dockerfile.intel -t $IMAGE_NAME . - if [ $? -ne 0 ]; then - echo "Docker image build failed." 
- exit 1 - fi - CONTAINER_ID=$(docker run -d $IMAGE_NAME tail -f /dev/null) - if docker inspect -f '{{.State.Running}}' $CONTAINER_ID 2>/dev/null | grep -q 'true'; then - echo "Container is running." - else - echo "Container failed to start." - docker logs $CONTAINER_ID 2>/dev/null || echo "No container ID found." - exit 1 - fi - docker stop $CONTAINER_ID - docker rm $CONTAINER_ID \ No newline at end of file + - name: Build and Run Docker Image + run: | + IMAGE_NAME="intel_image:latest" + docker build -f docker/Dockerfile.intel -t $IMAGE_NAME . + if [ $? -ne 0 ]; then + echo "Docker image build failed." + exit 1 + fi + CONTAINER_ID=$(docker run -d $IMAGE_NAME tail -f /dev/null) + if docker inspect -f '{{.State.Running}}' $CONTAINER_ID 2>/dev/null | grep -q 'true'; then + echo "Container is running." + else + echo "Container failed to start." + docker logs $CONTAINER_ID 2>/dev/null || echo "No container ID found." + exit 1 + fi + docker stop $CONTAINER_ID + docker rm $CONTAINER_ID diff --git a/.github/workflows/quality.yml b/.github/workflows/quality.yml new file mode 100644 index 0000000000..3895038204 --- /dev/null +++ b/.github/workflows/quality.yml @@ -0,0 +1,41 @@ +name: Quality Checks +on: + push: + branches: + - main + - v*-release + pull_request: + branches: + - main + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + quality: + runs-on: ubuntu-22.04 + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: 3.9 + + - name: Install dependencies + run: | + pip install --upgrade pip + # .[quality] installs too many dependencies + # TODO: we should remove the the version pinning at some point + pip install "black~=23.1" "ruff==0.4.4" + + - name: Check style with black + run: | + black --check . + + - name: Check style with ruff + run: | + ruff check . 
diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml index 3c2dc94dac..378c78da41 100644 --- a/.github/workflows/security.yml +++ b/.github/workflows/security.yml @@ -9,6 +9,7 @@ permissions: jobs: secrets: runs-on: ubuntu-latest + steps: - shell: bash run: | diff --git a/.github/workflows/test_generation.yml b/.github/workflows/test_generation.yml index 3c27473185..cfa3fde404 100644 --- a/.github/workflows/test_generation.yml +++ b/.github/workflows/test_generation.yml @@ -1,12 +1,13 @@ -# This workflow will install Python dependencies, run tests and lint with a variety of Python versions -# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions -name: Intel Generation Utils - Test +name: Generation Utils - Test (deprecated) on: push: - branches: [ main ] + branches: + - main + - v*-release pull_request: - branches: [ main ] + branches: + - main concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -14,25 +15,22 @@ concurrency: jobs: build: - strategy: - fail-fast: false - matrix: - python-version: [3.8, 3.9] - os: [ubuntu-latest] + runs-on: ubuntu-22.04 - runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install transformers==4.43.* - pip install optimum[exporters] - pip install .[tests] - - name: Test with Pytest - run: | - pytest tests/generation/ + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: 3.9 + + - name: Install dependencies + run: | + pip install --upgrade pip + pip install .[tests] transformers[testing]==4.43.* + + - name: Test with Pytest + run: | + pytest tests/generation/ diff --git a/.github/workflows/test_inc.yml b/.github/workflows/test_inc.yml index 6e99923487..c1a75a6e3e 100644 --- a/.github/workflows/test_inc.yml +++ b/.github/workflows/test_inc.yml @@ -1,6 +1,4 @@ -# This workflow will install Python dependencies, run tests and lint with a variety of Python versions -# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions -name: Intel Neural Compressor - Test +name: INC - Test on: push: @@ -20,31 +18,34 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] - os: [ubuntu-latest] + torch-version: ["2.2.0", "2.3.*", "2.4.*"] + + runs-on: ubuntu-22.04 - runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install cmake - pip install py-cpuinfo - pip install torch==2.3.0 torchaudio==2.3.0 torchvision==0.18 --index-url https://download.pytorch.org/whl/cpu - pip install intel-extension-for-pytorch==2.3.0 - pip install datasets==2.19.0 - pip install .[neural-compressor,diffusers,tests] - pip install peft - - - name: Test with Pytest - run: | - pytest tests/neural_compressor/ --ignore tests/neural_compressor/test_ipex.py --durations=0 - - name: Test IPEX - run: | - pytest tests/neural_compressor/test_ipex.py + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + 
python-version: 3.9 + + - name: Install dependencies + run: | + pip install --upgrade pip + pip install torch==${{ matrix.torch-version }} torchaudio torchvision --index-url https://download.pytorch.org/whl/cpu + pip install .[neural-compressor,ipex,diffusers,peft,tests] transformers[testing] intel-extension-for-pytorch==${{ matrix.torch-version }} + + - if: ${{ matrix.torch-version == '2.2.0' }} + name: Downgrade Numpy + run: pip install numpy==1.* + + - name: Assert versions + run: | + python -c "import torch; print(torch.__version__); assert torch.__version__.startswith('${{ matrix.torch-version }}'.replace('.*', ''))" + python -c "import intel_extension_for_pytorch; print(intel_extension_for_pytorch.__version__); assert intel_extension_for_pytorch.__version__.startswith('${{ matrix.torch-version }}'.replace('.*', ''))" + - name: Test with Pytest + run: | + pytest tests/neural_compressor diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index 2050c065a9..64a5c07a6a 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -1,6 +1,4 @@ -# This workflow will install Python dependencies, run tests and lint with a variety of Python versions -# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions -name: Intel IPEX - Test +name: IPEX - Test on: push: @@ -17,29 +15,36 @@ concurrency: jobs: build: - runs-on: ubuntu-latest strategy: fail-fast: false matrix: python-version: [3.10] - transformers-version: ["4.39.0", "4.45.*"] + transformers-version: ["4.44.0", "4.45.*"] ipex-version: ["2.4.0", "2.5.*"] + runs-on: ubuntu-22.04 + steps: - - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 with: - python-version: ${{ matrix.python-version }} + python-version: 3.9 + - name: Install dependencies run: | - python -m pip install --upgrade pip - pip install torch==${{ matrix.ipex-version }} --extra-index-url https://download.pytorch.org/whl/cpu - pip install intel_extension_for_pytorch==${{ matrix.ipex-version }} - pip install Pillow parameterized - pip install transformers[testing]==${{ matrix.transformers-version }} - pip install .[ipex] + pip install --upgrade pip + pip install torch==${{ matrix.torch-version }} torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu + pip install .[ipex,tests] transformers[testing]==${{ matrix.transformers-version }} intel_extension_for_pytorch==${{ matrix.torch-version }} + + - name: Assert versions + run: | + python -c "import torch; print(torch.__version__); assert torch.__version__.startswith('${{ matrix.torch-version }}'.replace('.*', ''))" + python -c "import intel_extension_for_pytorch; print(intel_extension_for_pytorch.__version__); assert intel_extension_for_pytorch.__version__.startswith('${{ matrix.torch-version }}'.replace('.*', ''))" + python -c "import transformers; print(transformers.__version__); assert transformers.__version__.startswith('${{ matrix.transformers-version }}'.replace('.*', ''))" - name: Test with Pytest run: | - pytest tests/ipex/ + pytest tests/ipex diff --git a/.github/workflows/test_offline.yaml b/.github/workflows/test_offline.yaml index a54ba20766..d2599faa56 100644 --- a/.github/workflows/test_offline.yaml +++ b/.github/workflows/test_offline.yaml @@ -1,10 +1,13 @@ -name: Offline usage / Python - Test +name: Offline - 
Test on: push: - branches: [main] + branches: + - main + - v*-release pull_request: - branches: [main] + branches: + - main concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -12,22 +15,21 @@ concurrency: jobs: build: - strategy: - fail-fast: false - matrix: - python-version: [3.9] - os: [ubuntu-latest] + runs-on: ubuntu-22.04 - runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v3 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v3 + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 with: - python-version: ${{ matrix.python-version }} + python-version: 3.9 + - name: Install dependencies run: | pip install .[tests,openvino] + - name: Test run: | HF_HOME=/tmp/ huggingface-cli download hf-internal-testing/tiny-random-gpt2 diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index bbdfdb32a0..e2889cb4e0 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -1,5 +1,3 @@ -# This workflow will install Python dependencies, run tests and lint with a variety of Python versions -# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions name: OpenVINO - Test on: @@ -20,40 +18,55 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.8", "3.12"] - transformers-version: ["4.36.0", "4.44.*"] - os: [ubuntu-latest] + test-pattern: + [ + "*modeling*", + "*diffusion*", + "*quantization*", + "*training*", + "*export*", + ] + transformers-version: ["4.36.0", "latest"] + + runs-on: ubuntu-22.04 - runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v4 - - name: Setup Python ${{ matrix.python-version }} + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup Python uses: actions/setup-python@v5 with: - python-version: ${{ matrix.python-version }} + python-version: 3.9 - name: Install dependencies run: | - python -m pip install --upgrade pip - # install PyTorch CPU version to avoid installing CUDA packages on GitHub runner without GPU + pip install --upgrade pip pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu - pip install .[openvino,openvino-tokenizers,tests,diffusers] onnxruntime - pip install transformers==${{ matrix.transformers-version }} + pip install .[openvino,openvino-tokenizers,diffusers,tests] transformers[testing] + + - if: ${{ matrix.transformers-version != 'latest' }} + name: Downgrade Transformers and Accelerate + run: | + pip install transformers==${{ matrix.transformers-version }} accelerate==0.* - - if: ${{ matrix.transformers-version == '4.36.0' }} - run: pip install accelerate==0.* + - if: ${{ matrix.test-pattern == '*modeling*' }} + name: Uninstall NNCF + run: | + pip uninstall -y nncf - name: Test with Pytest + run: | + pytest tests/openvino/${{ matrix.test-pattern }} --durations=0 env: HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} + + - if: ${{ matrix.test-pattern == '*modeling*' }} + name: Install dependencies (nightly) run: | - pytest tests/openvino/ --ignore tests/openvino/test_modeling_basic.py --durations=0 - - name: Test basic - run: | - pip uninstall -y nncf - pytest tests/openvino/test_modeling_basic.py - - name: Test openvino-nightly + pip install --upgrade --pre openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + + - if: ${{ matrix.test-pattern == '*modeling*' }} + name: Test with 
Pytest (nightly) run: | - pip install -U --pre openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -c "from optimum.intel import OVModelForCausalLM; OVModelForCausalLM.from_pretrained('hf-internal-testing/tiny-random-gpt2', export=True, compile=False)" - optimum-cli export openvino -m hf-internal-testing/tiny-random-gpt2 gpt2-ov + pytest tests/openvino/test_modeling_basic.py --durations=0 diff --git a/.github/workflows/test_openvino_basic.yml b/.github/workflows/test_openvino_basic.yml deleted file mode 100644 index 82c39da371..0000000000 --- a/.github/workflows/test_openvino_basic.yml +++ /dev/null @@ -1,79 +0,0 @@ -name: OpenVINO - Basic Test - -on: - workflow_dispatch: - schedule: - - cron: "41 1 * * *" # run every day at 1:41 - push: - branches: - - v*-release - pull_request: - types: [opened, synchronize, reopened, labeled] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - build: - if: ${{ (github.event_name == 'workflow_dispatch') || (github.event_name == 'schedule') || (github.event_name == 'push') || contains( github.event.pull_request.labels.*.name, 'openvino-test') }} - strategy: - fail-fast: false - matrix: - # Testing lower and upper bound of supported Python versions - # This also ensures that the test fails if dependencies break for Python 3.7 - python-version: ["3.9", "3.12"] - os: ["ubuntu-22.04", "windows-latest"] - transformers-version: ["4.45.*"] - openvino: ["openvino openvino-tokenizers"] - nncf: ["nncf"] - include: - - python-version: "3.12" - os: "ubuntu-22.04" - transformers-version: "4.36.0" - openvino: "openvino openvino-tokenizers" - nncf: "nncf" - - python-version: "3.12" - os: "ubuntu-22.04" - transformers-version: "4.45.*" - openvino: "--pre -U openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly" - nncf: "nncf" - - python-version: "3.12" - os: "ubuntu-22.04" - transformers-version: "4.45.*" - openvino: "--pre -U openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly" - nncf: "git+https://github.com/openvinotoolkit/nncf.git" - - runs-on: ${{ matrix.os }} - - steps: - - uses: actions/checkout@v4 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - # Install PyTorch CPU to prevent unnecessary downloading/installing of CUDA packages - pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu - # Install openvino manually to prevent dependency conflicts when .[openvino] pins - # optimum or transformers to a specific version - pip install ${{ matrix.openvino }} - pip install transformers==${{ matrix.transformers-version }} - pip install .[tests] - - - name: Pip freeze - run: pip freeze - - - name: Test with Pytest - run: | - pytest tests/openvino/test_modeling_basic.py - - - name: Slow tests - run: | - pip install ${{ matrix.nncf }} - pytest tests/openvino -s -m "run_slow" --durations=0 - env: - RUN_SLOW: 1 diff --git a/.github/workflows/test_openvino_examples.yml b/.github/workflows/test_openvino_examples.yml index c76374e9ea..5b1e8e9dff 100644 --- a/.github/workflows/test_openvino_examples.yml +++ b/.github/workflows/test_openvino_examples.yml @@ -3,15 +3,15 @@ name: OpenVINO - Examples Test on: workflow_dispatch: schedule: - - 
cron: 0 1 * * 1 # run weekly: every Monday at 1am + - cron: 0 1 * * 1 # run weekly: every Monday at 1am push: paths: - - '.github/workflows/test_openvino_examples.yml' - - 'examples/openvino/**' + - ".github/workflows/test_openvino_examples.yml" + - "examples/openvino/**" pull_request: paths: - - '.github/workflows/test_openvino_examples.yml' - - 'examples/openvino/**' + - ".github/workflows/test_openvino_examples.yml" + - "examples/openvino/**" concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -22,25 +22,27 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.8", "3.12"] + python-version: ["3.9", "3.12"] runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - name: Install dependencies - run: | - pip install .[openvino] jstyleson pytest - pip install -r examples/openvino/audio-classification/requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu - pip install -r examples/openvino/image-classification/requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu - pip install -r examples/openvino/question-answering/requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu - pip install -r examples/openvino/text-classification/requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu - - - name: Test examples - run: | - python -m pytest examples/openvino/test_examples.py + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + pip install -r examples/openvino/audio-classification/requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu + pip install -r examples/openvino/image-classification/requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu + pip install -r examples/openvino/question-answering/requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu + pip install -r examples/openvino/text-classification/requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu + pip install .[openvino] jstyleson pytest + + - name: Test examples + run: | + pytest examples/openvino/test_examples.py diff --git a/.github/workflows/test_openvino_notebooks.yml b/.github/workflows/test_openvino_notebooks.yml index ded091d5ad..8e3095b67e 100644 --- a/.github/workflows/test_openvino_notebooks.yml +++ b/.github/workflows/test_openvino_notebooks.yml @@ -3,16 +3,15 @@ name: OpenVINO - Notebooks Test on: workflow_dispatch: schedule: - - cron: '14 3 * * 1' # run weekly: every Monday at 3:14 + - cron: "14 3 * * 1" # run weekly: every Monday at 3:14 push: paths: - - '.github/workflows/test_openvino_notebooks.yml' - - 'notebooks/openvino/*' + - ".github/workflows/test_openvino_notebooks.yml" + - "notebooks/openvino/*" pull_request: paths: - - '.github/workflows/test_openvino_notebooks.yml' - - 'notebooks/openvino/*' - + - ".github/workflows/test_openvino_notebooks.yml" + - "notebooks/openvino/*" concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -23,33 +22,29 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.8", "3.12"] + python-version: ["3.9", "3.12"] runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} - uses: 
actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - name: Install dependencies - run: | - # Install PyTorch CPU to prevent unnecessary downloading/installing of CUDA packages - # ffmpeg, torchaudio and pillow are required for image classification and audio classification pipelines - sudo apt-get install ffmpeg - pip install torch torchaudio --extra-index-url https://download.pytorch.org/whl/cpu - pip install -r notebooks/openvino/requirements.txt - pip install .[tests,openvino] nbval - - - run: free -h - - run: lscpu - - run: pip freeze - - - name: Test with Pytest - run: | - sed -i 's/NUM_TRAIN_ITEMS = 600/NUM_TRAIN_ITEMS = 10/' notebooks/openvino/question_answering_quantization.ipynb - sed -i 's/# %pip install/%pip install/' notebooks/openvino/optimum_openvino_inference.ipynb - python -m pytest --nbval-lax notebooks/openvino/optimum_openvino_inference.ipynb notebooks/openvino/question_answering_quantization.ipynb - + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + # ffmpeg is required for image classification and audio classification pipelines + sudo apt-get install ffmpeg + pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu + pip install -r notebooks/openvino/requirements.txt + pip install .[tests,openvino] nbval + + - name: Test with Pytest + run: | + sed -i 's/NUM_TRAIN_ITEMS = 600/NUM_TRAIN_ITEMS = 10/' notebooks/openvino/question_answering_quantization.ipynb + sed -i 's/# %pip install/%pip install/' notebooks/openvino/optimum_openvino_inference.ipynb + python -m pytest --nbval-lax notebooks/openvino/optimum_openvino_inference.ipynb notebooks/openvino/question_answering_quantization.ipynb diff --git a/.github/workflows/test_openvino_slow.yml b/.github/workflows/test_openvino_slow.yml new file mode 100644 index 0000000000..bf52413a7d --- /dev/null +++ b/.github/workflows/test_openvino_slow.yml @@ -0,0 +1,75 @@ +name: OpenVINO - Slow Test + +on: + workflow_dispatch: + schedule: + - cron: "41 1 * * *" # run every day at 1:41 + push: + branches: + - v*-release + pull_request: + types: + - opened + - labeled + - reopened + - synchronize + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + build: + if: ${{ (github.event_name == 'workflow_dispatch') || (github.event_name == 'schedule') || (github.event_name == 'push') || contains( github.event.pull_request.labels.*.name, 'openvino-test') }} + strategy: + fail-fast: false + matrix: + os: ["ubuntu-22.04", "windows-2019"] + openvino-version: ["stable", "nightly"] + transformers-version: ["4.36.0", "latest"] + nncf: ["nncf", "git+https://github.com/openvinotoolkit/nncf.git"] + + runs-on: ${{ matrix.os }} + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: 3.9 + + - name: Install dependencies + run: | + pip install --upgrade pip + pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu + pip install .[openvino,tests] transformers[testing] + pip uninstall -y nncf + + - if: ${{ matrix.openvino-version == 'nightly' }} + name: Install nightly OpenVINO + run: | + pip install openvino openvino-tokenizers --pre --upgrade --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + + - 
if: ${{ matrix.transformers-version != 'latest' }} + name: Downgrade Transformers and Accelerate + run: pip install transformers==${{ matrix.transformers-version }} accelerate==0.* + + - name: Pip freeze + run: pip freeze + + - name: Test with Pytest (basic) + run: | + pytest tests/openvino/test_modeling_basic.py + + - name: Install dependencies (slow) + run: | + pip install ${{ matrix.nncf }} + + - name: Test with Pytest (slow) + run: | + pytest tests/openvino -m "run_slow" --durations=0 + env: + RUN_SLOW: 1 + HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} diff --git a/docs/Dockerfile b/docs/Dockerfile index 4acde4e659..40142a2c0d 100644 --- a/docs/Dockerfile +++ b/docs/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.8 +FROM python:3.9 ARG commit_sha ARG clone_url diff --git a/examples/neural_compressor/optical-character-recognition/README.md b/examples/neural_compressor/optical-character-recognition/README.md index f701badfc2..7a80ed035b 100644 --- a/examples/neural_compressor/optical-character-recognition/README.md +++ b/examples/neural_compressor/optical-character-recognition/README.md @@ -16,7 +16,7 @@ limitations under the License. # Optical Character Recognition -The script [`run_ocr.py`](https://github.com/huggingface/optimum-intel/blob/main/examples/neural_compressor/optical-character-recognition/run_ocr.py) +The script [`run_ocr_post_training.py`](https://github.com/huggingface/optimum-intel/blob/main/examples/neural_compressor/optical-character-recognition/run_ocr_post_training.py) allows us to apply different quantization approaches (such as dynamic and static quantization) using the [Intel Neural Compressor ](https://github.com/intel/neural-compressor) library for optical character recognition tasks and [IAM](https://fki.tic.heia-fr.ch/databases/iam-handwriting-database) datasets. 
diff --git a/examples/openvino/audio-classification/configs/wav2vec2-base-jpqd.json b/examples/openvino/audio-classification/configs/wav2vec2-base-jpqd.json index c58903da17..41e53f5cbb 100644 --- a/examples/openvino/audio-classification/configs/wav2vec2-base-jpqd.json +++ b/examples/openvino/audio-classification/configs/wav2vec2-base-jpqd.json @@ -54,6 +54,9 @@ "activations": { "{re}.*matmul_0": { "mode": "symmetric" + }, + "{re}.*scaled_dot_product_attention_0": { + "mode": "symmetric" } } }, diff --git a/examples/openvino/audio-classification/configs/wav2vec2-base-qat.json b/examples/openvino/audio-classification/configs/wav2vec2-base-qat.json index 8edc51cf24..191f266a65 100644 --- a/examples/openvino/audio-classification/configs/wav2vec2-base-qat.json +++ b/examples/openvino/audio-classification/configs/wav2vec2-base-qat.json @@ -16,6 +16,9 @@ "activations": { "{re}.*matmul_0": { "mode": "symmetric" + }, + "{re}.*scaled_dot_product_attention_0": { + "mode": "symmetric" } } }, diff --git a/examples/openvino/audio-classification/requirements.txt b/examples/openvino/audio-classification/requirements.txt index f88b156da1..89569575f5 100644 --- a/examples/openvino/audio-classification/requirements.txt +++ b/examples/openvino/audio-classification/requirements.txt @@ -1,3 +1,4 @@ +transformers>=4.36.0,<4.46.0 datasets>=1.14.0,<2.20.0 evaluate librosa diff --git a/examples/openvino/image-classification/configs/swin-base-jpqd.json b/examples/openvino/image-classification/configs/swin-base-jpqd.json index 23b2fd3d84..a6057f6d71 100644 --- a/examples/openvino/image-classification/configs/swin-base-jpqd.json +++ b/examples/openvino/image-classification/configs/swin-base-jpqd.json @@ -32,7 +32,16 @@ "num_bn_adaptation_samples": 200 } }, - "scope_overrides": {"activations": {"{re}.*matmul_0": {"mode": "symmetric"}}}, + "scope_overrides": { + "activations": { + "{re}.*matmul_0": { + "mode": "symmetric" + }, + "{re}.*scaled_dot_product_attention_0": { + "mode": "symmetric" + } + } + }, "ignored_scopes": [ "{re}.*__add___[0-1]", "{re}.*layer_norm_0", diff --git a/examples/openvino/image-classification/requirements.txt b/examples/openvino/image-classification/requirements.txt index 4c467e0d80..4ef9212757 100644 --- a/examples/openvino/image-classification/requirements.txt +++ b/examples/openvino/image-classification/requirements.txt @@ -1,3 +1,4 @@ +transformers>=4.36.0,<4.46.0 datasets>=1.14.0,<2.20.0 torch >= 1.9.0 torchvision>=0.6.0 diff --git a/examples/openvino/question-answering/requirements.txt b/examples/openvino/question-answering/requirements.txt index 2af8f02688..b4e37df13b 100644 --- a/examples/openvino/question-answering/requirements.txt +++ b/examples/openvino/question-answering/requirements.txt @@ -1,3 +1,4 @@ +transformers>=4.36.0,<4.46.0 datasets>=1.14.0,<2.20.0 torch >= 1.9.0 evaluate diff --git a/examples/openvino/text-classification/requirements.txt b/examples/openvino/text-classification/requirements.txt index bcf3f8025d..f8b37a9e56 100644 --- a/examples/openvino/text-classification/requirements.txt +++ b/examples/openvino/text-classification/requirements.txt @@ -1,3 +1,4 @@ +transformers>=4.36.0,<4.46.0 datasets>=1.14.0,<2.20.0 sentencepiece != 0.1.92 scipy diff --git a/notebooks/openvino/README.md b/notebooks/openvino/README.md index f63c13c55b..31c2580996 100644 --- a/notebooks/openvino/README.md +++ b/notebooks/openvino/README.md @@ -12,4 +12,5 @@ The notebooks have been tested with Python 3.8 and 3.10 on Ubuntu Linux. 
|:----------|:-------------|:-------------|------:| | [How to run inference with the OpenVINO](https://github.com/huggingface/optimum-intel/blob/main/notebooks/openvino/optimum_openvino_inference.ipynb) | Explains how to export your model to OpenVINO and to run inference with OpenVINO Runtime on various tasks| [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/optimum-intel/blob/main/notebooks/openvino/optimum_openvino_inference.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/optimum-intel/blob/main/notebooks/openvino/optimum_openvino_inference.ipynb)| | [How to quantize a question answering model with OpenVINO NNCF](https://github.com/huggingface/optimum-intel/blob/main/notebooks/openvino/question_answering_quantization.ipynb) | Show how to apply post-training quantization on a question answering model using [NNCF](https://github.com/openvinotoolkit/nncf) and to accelerate inference with OpenVINO| [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/optimum-intel/blob/main/notebooks/openvino/question_answering_quantization.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/optimum-intel/blob/main/notebooks/openvino/question_answering_quantization.ipynb)| -| [How to quantize Stable Diffusion model with OpenVINO NNCF](https://github.com/huggingface/optimum-intel/blob/main/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb)| Show how to apply post-training hybrid quantization on a Stable Diffusion model using [NNCF](https://github.com/openvinotoolkit/nncf) and to accelerate inference with OpenVINO| [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/optimum-intel/blob/main/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/optimum-intel/blob/main/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb)| \ No newline at end of file +| [How to quantize Stable Diffusion model with OpenVINO NNCF](https://github.com/huggingface/optimum-intel/blob/main/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb)| Show how to apply post-training hybrid quantization on a Stable Diffusion model using [NNCF](https://github.com/openvinotoolkit/nncf) and to accelerate inference with OpenVINO| [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/optimum-intel/blob/main/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/optimum-intel/blob/main/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb)| +| [How to quantize Sentence Transformer model with OpenVINO NNCF](https://github.com/huggingface/optimum-intel/blob/main/notebooks/openvino/sentence_transformer_quantization.ipynb)| Show how to apply post-training 8-bit quantization on a Sentence Transformer model using [NNCF](https://github.com/openvinotoolkit/nncf) and to accelerate inference with OpenVINO| [![Open in 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/optimum-intel/blob/main/notebooks/openvino/sentence_transformer_quantization.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/optimum-intel/blob/main/notebooks/openvino/sentence_transformer_quantization.ipynb)| diff --git a/notebooks/openvino/requirements.txt b/notebooks/openvino/requirements.txt index bb7a517cff..64ccd6d8cc 100644 --- a/notebooks/openvino/requirements.txt +++ b/notebooks/openvino/requirements.txt @@ -4,4 +4,3 @@ evaluate[evaluator] ipywidgets pillow torchaudio - diff --git a/notebooks/openvino/sentence_transformer_quantization.ipynb b/notebooks/openvino/sentence_transformer_quantization.ipynb new file mode 100644 index 0000000000..714544aa9a --- /dev/null +++ b/notebooks/openvino/sentence_transformer_quantization.ipynb @@ -0,0 +1,625 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Quantization of Text Embedding model from Sentence Transformers library" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install optimum[openvino]\n", + "%pip install evaluate" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Quantize staticly model to 8-bit with NNCF via Optimum-Intel API" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The code snippet below shows how to use Optimum-Intel [Model Optimization API](https://huggingface.co/docs/optimum/en/intel/openvino/optimization#static-quantization) to quantize the model staticly. It leaverages [NNCF](https://github.com/openvinotoolkit/nncf) capabilites for static quantization of Transformer models where a combination of the special quantization scheme + SmoothQuant method + Bias Correction method are used to provide state-of-the-art accuracy.\n", + "\n", + "The static quantization requires some data to estimate quantization parameters of activations. It means that some calibration dataset should be provided. `OVQuantizer` class used for quantization provides an API to build such a dataset with `.get_calibration_dataset()` method." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "No OpenVINO files were found for sentence-transformers/all-MiniLM-L6-v2, setting `export=True` to convert the model to the OpenVINO IR. Don't forget to save the resulting model with `.save_pretrained()`\n", + "Framework not specified. Using pt to export the model.\n", + "Using framework PyTorch: 2.4.1+cpu\n", + "Overriding 1 configuration item(s)\n", + "\t- use_cache -> False\n", + "Compiling the model to CPU ...\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a9bd847756fd467e905a7ad7a243640c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9d8ad91623d642f48e85b60ac823aca4",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Output()"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a2a7d09a573c4092a830bbaadc39f756",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Output()"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b67c493aab36426090f8fafd25a17a00",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Output()"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Configuration saved in all-MiniLM-L6-v2_int8/openvino_config.json\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "('all-MiniLM-L6-v2_int8/tokenizer_config.json',\n",
+       " 'all-MiniLM-L6-v2_int8/special_tokens_map.json',\n",
+       " 'all-MiniLM-L6-v2_int8/vocab.txt',\n",
+       " 'all-MiniLM-L6-v2_int8/added_tokens.json',\n",
+       " 'all-MiniLM-L6-v2_int8/tokenizer.json')"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from functools import partial\n",
+    "import datasets\n",
+    "from transformers import AutoTokenizer\n",
+    "from optimum.intel import OVModelForFeatureExtraction, OVQuantizer, OVQuantizationConfig, OVConfig\n",
+    "\n",
+    "MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
+    "base_model_path = \"all-MiniLM-L6-v2\"\n",
+    "int8_ptq_model_path = \"all-MiniLM-L6-v2_int8\"\n",
+    "\n",
+    "model = OVModelForFeatureExtraction.from_pretrained(MODEL_ID)\n",
+    "model.save_pretrained(base_model_path)\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)\n",
+    "tokenizer.save_pretrained(base_model_path)\n",
+    "\n",
+    "\n",
+    "quantizer = OVQuantizer.from_pretrained(model)\n",
+    "\n",
+    "def preprocess_function(examples, tokenizer):\n",
+    "    return tokenizer(examples[\"sentence\"], padding=\"max_length\", max_length=384, truncation=True)\n",
+    "\n",
+    "\n",
+    "calibration_dataset = quantizer.get_calibration_dataset(\n",
+    "    \"glue\",\n",
+    "    dataset_config_name=\"sst2\",\n",
+    "    preprocess_function=partial(preprocess_function, tokenizer=tokenizer),\n",
+    "    num_samples=300,\n",
+    "    dataset_split=\"train\",\n",
+    ")\n",
+    "\n",
+    "ov_config = OVConfig(quantization_config=OVQuantizationConfig())\n",
+    "\n",
+    "quantizer.quantize(ov_config=ov_config, calibration_dataset=calibration_dataset, save_directory=int8_ptq_model_path)\n",
+    "tokenizer.save_pretrained(int8_ptq_model_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Benchmark model accuracy on GLUE STSB task"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here we estimate the accuracy impact of quantization by evaluating both the baseline and the quantized model on STSB, a GLUE task different from the one used for calibration."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import Pipeline\n",
+    "import torch.nn.functional as F\n",
+    "import torch\n",
+    "\n",
+    "\n",
+    "# copied from the model card \"sentence-transformers/all-MiniLM-L6-v2\"\n",
+    "def mean_pooling(model_output, attention_mask):\n",
+    "    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings\n",
+    "    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()\n",
+    "    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)\n",
+    "\n",
+    "\n",
+    "class SentenceEmbeddingPipeline(Pipeline):\n",
+    "    def _sanitize_parameters(self, **kwargs):\n",
+    "        # we don't have any hyperparameters to sanitize\n",
+    "        preprocess_kwargs = {}\n",
+    "        return preprocess_kwargs, {}, {}\n",
+    "\n",
+    "    def preprocess(self, inputs):\n",
+    "        encoded_inputs = self.tokenizer(inputs, padding=True, truncation=True, return_tensors=\"pt\")\n",
+    "        return encoded_inputs\n",
+    "\n",
+    "    def _forward(self, model_inputs):\n",
+    "        outputs = self.model(**model_inputs)\n",
+    "        return {\"outputs\": outputs, \"attention_mask\": model_inputs[\"attention_mask\"]}\n",
+    "\n",
+    "    def postprocess(self, model_outputs):\n",
+    "        # Perform pooling\n",
+    "        sentence_embeddings = mean_pooling(model_outputs[\"outputs\"], model_outputs[\"attention_mask\"])\n",
+    "        # Normalize embeddings\n",
+    "        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)\n",
+    "        return sentence_embeddings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Compiling the model to CPU ...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Compiling the model to CPU ...\n"
+     ]
+    }
+   ],
+   "source": [
+    "model = OVModelForFeatureExtraction.from_pretrained(base_model_path)\n",
+    "vanilla_emb = SentenceEmbeddingPipeline(model=model, tokenizer=tokenizer)\n",
+    "\n",
+    "q_model = OVModelForFeatureExtraction.from_pretrained(int8_ptq_model_path)\n",
+    "q8_emb = SentenceEmbeddingPipeline(model=q_model, tokenizer=tokenizer)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import load_dataset\n",
+    "from evaluate import load\n",
+    "\n",
+    "eval_dataset = load_dataset(\"glue\", \"stsb\", split=\"validation\")\n",
+    "metric = load(\"glue\", \"stsb\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Parameter 'function'= of the transform datasets.arrow_dataset.Dataset._map_single couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5cab9e8fc58245a4b395a9575017633b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Map:   0%|          | 0/1500 [00:00\n",
+      "[ INFO ]   PERFORMANCE_HINT: LATENCY\n",
+      "[ INFO ]   EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE\n",
+      "[ INFO ]   PERFORMANCE_HINT_NUM_REQUESTS: 0\n",
+      "[ INFO ]   ENABLE_CPU_PINNING: True\n",
+      "[ INFO ]   SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE\n",
+      "[ INFO ]   MODEL_DISTRIBUTION_POLICY: set()\n",
+      "[ INFO ]   ENABLE_HYPER_THREADING: False\n",
+      "[ INFO ]   EXECUTION_DEVICES: ['CPU']\n",
+      "[ INFO ]   CPU_DENORMALS_OPTIMIZATION: False\n",
+      "[ INFO ]   LOG_LEVEL: Level.NO\n",
+      "[ INFO ]   CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0\n",
+      "[ INFO ]   DYNAMIC_QUANTIZATION_GROUP_SIZE: 32\n",
+      "[ INFO ]   KV_CACHE_PRECISION: \n",
+      "[ INFO ]   AFFINITY: Affinity.CORE\n",
+      "[Step 9/11] Creating infer requests and preparing input tensors\n",
+      "[ WARNING ] No input files were given for input 'input_ids'!. This input will be filled with random values!\n",
+      "[ WARNING ] No input files were given for input 'attention_mask'!. This input will be filled with random values!\n",
+      "[ WARNING ] No input files were given for input 'token_type_ids'!. This input will be filled with random values!\n",
+      "[ INFO ] Fill input 'input_ids' with random values \n",
+      "[ INFO ] Fill input 'attention_mask' with random values \n",
+      "[ INFO ] Fill input 'token_type_ids' with random values \n",
+      "[Step 10/11] Measuring performance (Start inference synchronously, limits: 200 iterations)\n",
+      "[ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop).\n",
+      "[ INFO ] First inference took 12.27 ms\n",
+      "[Step 11/11] Dumping statistics report\n",
+      "[ INFO ] Execution Devices:['CPU']\n",
+      "[ INFO ] Count:            200 iterations\n",
+      "[ INFO ] Duration:         1988.84 ms\n",
+      "[ INFO ] Latency:\n",
+      "[ INFO ]    Median:        9.74 ms\n",
+      "[ INFO ]    Average:       9.77 ms\n",
+      "[ INFO ]    Min:           9.59 ms\n",
+      "[ INFO ]    Max:           11.12 ms\n",
+      "[ INFO ] Throughput:   100.56 FPS\n"
+     ]
+    }
+   ],
+   "source": [
+    "# FP32 baseline model\n",
+    "!benchmark_app -m all-MiniLM-L6-v2/openvino_model.xml -shape \"input_ids[1,384],attention_mask[1,384],token_type_ids[1,384]\" -api sync -niter 200"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+      "To disable this warning, you can either:\n",
+      "\t- Avoid using `tokenizers` before the fork if possible\n",
+      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[Step 1/11] Parsing and validating input arguments\n",
+      "[ INFO ] Parsing input parameters\n",
+      "[Step 2/11] Loading OpenVINO Runtime\n",
+      "[ INFO ] OpenVINO:\n",
+      "[ INFO ] Build ................................. 2024.4.1-16618-643f23d1318-releases/2024/4\n",
+      "[ INFO ] \n",
+      "[ INFO ] Device info:\n",
+      "[ INFO ] CPU\n",
+      "[ INFO ] Build ................................. 2024.4.1-16618-643f23d1318-releases/2024/4\n",
+      "[ INFO ] \n",
+      "[ INFO ] \n",
+      "[Step 3/11] Setting device configuration\n",
+      "[ WARNING ] Performance hint was not explicitly specified in command line. Device(CPU) performance hint will be set to PerformanceMode.LATENCY.\n",
+      "[Step 4/11] Reading model files\n",
+      "[ INFO ] Loading model files\n",
+      "[ INFO ] Read model took 20.87 ms\n",
+      "[ INFO ] Original model I/O parameters:\n",
+      "[ INFO ] Model inputs:\n",
+      "[ INFO ]     input_ids (node: input_ids) : i64 / [...] / [?,?]\n",
+      "[ INFO ]     attention_mask (node: attention_mask) : i64 / [...] / [?,?]\n",
+      "[ INFO ]     token_type_ids (node: token_type_ids) : i64 / [...] / [?,?]\n",
+      "[ INFO ] Model outputs:\n",
+      "[ INFO ]     last_hidden_state (node: __module.encoder.layer.5.output.LayerNorm/aten::layer_norm/Add) : f32 / [...] / [?,?,384]\n",
+      "[Step 5/11] Resizing model to match image sizes and given batch\n",
+      "[ INFO ] Model batch size: 1\n",
+      "[ INFO ] Reshaping model: 'input_ids': [1,384], 'attention_mask': [1,384], 'token_type_ids': [1,384]\n",
+      "[ INFO ] Reshape model took 3.42 ms\n",
+      "[Step 6/11] Configuring input of the model\n",
+      "[ INFO ] Model inputs:\n",
+      "[ INFO ]     input_ids (node: input_ids) : i64 / [...] / [1,384]\n",
+      "[ INFO ]     attention_mask (node: attention_mask) : i64 / [...] / [1,384]\n",
+      "[ INFO ]     token_type_ids (node: token_type_ids) : i64 / [...] / [1,384]\n",
+      "[ INFO ] Model outputs:\n",
+      "[ INFO ]     last_hidden_state (node: __module.encoder.layer.5.output.LayerNorm/aten::layer_norm/Add) : f32 / [...] / [1,384,384]\n",
+      "[Step 7/11] Loading the model to the device\n",
+      "[ INFO ] Compile model took 323.91 ms\n",
+      "[Step 8/11] Querying optimal runtime parameters\n",
+      "[ INFO ] Model:\n",
+      "[ INFO ]   NETWORK_NAME: Model0\n",
+      "[ INFO ]   OPTIMAL_NUMBER_OF_INFER_REQUESTS: 1\n",
+      "[ INFO ]   NUM_STREAMS: 1\n",
+      "[ INFO ]   INFERENCE_NUM_THREADS: 18\n",
+      "[ INFO ]   PERF_COUNT: NO\n",
+      "[ INFO ]   INFERENCE_PRECISION_HINT: \n",
+      "[ INFO ]   PERFORMANCE_HINT: LATENCY\n",
+      "[ INFO ]   EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE\n",
+      "[ INFO ]   PERFORMANCE_HINT_NUM_REQUESTS: 0\n",
+      "[ INFO ]   ENABLE_CPU_PINNING: True\n",
+      "[ INFO ]   SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE\n",
+      "[ INFO ]   MODEL_DISTRIBUTION_POLICY: set()\n",
+      "[ INFO ]   ENABLE_HYPER_THREADING: False\n",
+      "[ INFO ]   EXECUTION_DEVICES: ['CPU']\n",
+      "[ INFO ]   CPU_DENORMALS_OPTIMIZATION: False\n",
+      "[ INFO ]   LOG_LEVEL: Level.NO\n",
+      "[ INFO ]   CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0\n",
+      "[ INFO ]   DYNAMIC_QUANTIZATION_GROUP_SIZE: 32\n",
+      "[ INFO ]   KV_CACHE_PRECISION: \n",
+      "[ INFO ]   AFFINITY: Affinity.CORE\n",
+      "[Step 9/11] Creating infer requests and preparing input tensors\n",
+      "[ WARNING ] No input files were given for input 'input_ids'!. This input will be filled with random values!\n",
+      "[ WARNING ] No input files were given for input 'attention_mask'!. This input will be filled with random values!\n",
+      "[ WARNING ] No input files were given for input 'token_type_ids'!. This input will be filled with random values!\n",
+      "[ INFO ] Fill input 'input_ids' with random values \n",
+      "[ INFO ] Fill input 'attention_mask' with random values \n",
+      "[ INFO ] Fill input 'token_type_ids' with random values \n",
+      "[Step 10/11] Measuring performance (Start inference synchronously, limits: 200 iterations)\n",
+      "[ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop).\n",
+      "[ INFO ] First inference took 6.72 ms\n",
+      "[Step 11/11] Dumping statistics report\n",
+      "[ INFO ] Execution Devices:['CPU']\n",
+      "[ INFO ] Count:            200 iterations\n",
+      "[ INFO ] Duration:         853.85 ms\n",
+      "[ INFO ] Latency:\n",
+      "[ INFO ]    Median:        4.13 ms\n",
+      "[ INFO ]    Average:       4.15 ms\n",
+      "[ INFO ]    Min:           4.05 ms\n",
+      "[ INFO ]    Max:           5.13 ms\n",
+      "[ INFO ] Throughput:   234.23 FPS\n"
+     ]
+    }
+   ],
+   "source": [
+    "# INT8 counterpart\n",
+    "!benchmark_app -m all-MiniLM-L6-v2_int8/openvino_model.xml -shape \"input_ids[1,384],attention_mask[1,384],token_type_ids[1,384]\" -api sync -niter 200"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "test3.11_cpu",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
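A minimal sketch of how the STSB comparison described in the notebook can be computed, reusing the `vanilla_emb`, `q8_emb`, `eval_dataset` and `metric` objects defined above; the helper function, subset size and printout are illustrative assumptions rather than the notebook's own evaluation cell:

import torch

def stsb_similarity(pipe, example):
    # embeddings are L2-normalized by the pipeline, so the dot product equals cosine similarity
    emb1 = pipe(example["sentence1"])
    emb2 = pipe(example["sentence2"])
    return torch.matmul(emb1, emb2.T).item()

subset = eval_dataset.select(range(100))  # small slice keeps the comparison quick
fp32_scores = [stsb_similarity(vanilla_emb, example) for example in subset]
int8_scores = [stsb_similarity(q8_emb, example) for example in subset]
print("FP32:", metric.compute(predictions=fp32_scores, references=subset["label"]))
print("INT8:", metric.compute(predictions=int8_scores, references=subset["label"]))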
diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index 93528e0085..3b6b4de69f 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -21,9 +21,10 @@
 from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
 
 from ...exporters import TasksManager
+from ...exporters.openvino.convert import save_preprocessors
 from ...intel.utils.import_utils import DIFFUSERS_IMPORT_ERROR, is_diffusers_available
 from ...intel.utils.modeling_utils import _infer_library_from_model_name_or_path
-from ...utils.save_utils import maybe_load_preprocessors, maybe_save_preprocessors
+from ...utils.save_utils import maybe_load_preprocessors
 from ..base import BaseOptimumCLICommand, CommandInfo
 
 
@@ -71,7 +72,7 @@ def parse_args_openvino(parser: "ArgumentParser"):
     optional_group.add_argument(
         "--weight-format",
         type=str,
-        choices=["fp32", "fp16", "int8", "int4", "mxfp4"],
+        choices=["fp32", "fp16", "int8", "int4", "mxfp4", "nf4"],
         default=None,
         help="The weight format of the exported model.",
     )
@@ -262,8 +263,9 @@ def run(self):
         if self.args.weight_format is None:
             ov_config = None
             if not no_compression_parameter_provided(self.args):
-                logger.warning(
-                    "The provided compression parameters will not affect conversion because of the missing --weight-format argument."
+                raise ValueError(
+                    "Some compression parameters are provided, but the weight format is not specified. "
+                    "Please provide it with the --weight-format argument."
                 )
         elif self.args.weight_format in {"fp16", "fp32"}:
             ov_config = OVConfig(dtype=self.args.weight_format)
@@ -318,6 +320,10 @@ def run(self):
                 from optimum.intel import OVStableDiffusionPipeline
 
                 model_cls = OVStableDiffusionPipeline
+            elif class_name == "StableDiffusion3Pipeline":
+                from optimum.intel import OVStableDiffusion3Pipeline
+
+                model_cls = OVStableDiffusion3Pipeline
             else:
                 raise NotImplementedError(f"Quantization in hybrid mode isn't supported for class {class_name}.")
 
@@ -325,11 +331,18 @@ def run(self):
             model.save_pretrained(self.args.output)
             if not self.args.disable_convert_tokenizer:
                 maybe_convert_tokenizers(library_name, self.args.output, model, task=task)
-        elif task.startswith("text-generation") and quantize_with_dataset:
-            from optimum.intel import OVModelForCausalLM
+        elif (task.startswith("text-generation") or task == "image-text-to-text") and quantize_with_dataset:
+            if task.startswith("text-generation"):
+                from optimum.intel import OVModelForCausalLM
+
+                model_cls = OVModelForCausalLM
+            else:
+                from optimum.intel import OVModelForVisualCausalLM
+
+                model_cls = OVModelForVisualCausalLM
 
-            # To quantize a text-generation model with a dataset, an instantiated OVModelForCausalLM is required
-            model = OVModelForCausalLM.from_pretrained(
+            # To quantize a model with a dataset, an instance of a model class is required
+            model = model_cls.from_pretrained(
                 self.args.model,
                 export=True,
                 quantization_config=quantization_config,
@@ -338,11 +351,9 @@ def run(self):
             )
             model.save_pretrained(self.args.output)
 
-            maybe_save_preprocessors(self.args.model, self.args.output, trust_remote_code=self.args.trust_remote_code)
+            preprocessors = maybe_load_preprocessors(self.args.model, trust_remote_code=self.args.trust_remote_code)
+            save_preprocessors(preprocessors, model.config, self.args.output, self.args.trust_remote_code)
             if not self.args.disable_convert_tokenizer:
-                preprocessors = maybe_load_preprocessors(
-                    self.args.model, trust_remote_code=self.args.trust_remote_code
-                )
                 maybe_convert_tokenizers(library_name, self.args.output, preprocessors=preprocessors, task=task)
         else:
             # TODO : add input shapes
diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py
index 69cfec1d96..dba4628d79 100644
--- a/optimum/exporters/openvino/__main__.py
+++ b/optimum/exporters/openvino/__main__.py
@@ -45,6 +45,8 @@
 from .utils import _MAX_UNCOMPRESSED_SIZE, MULTI_MODAL_TEXT_GENERATION_MODELS, clear_class_registry
 
 
+FORCE_ATTN_MODEL_CLASSES = {"phi3-v": "eager"}
+
 if TYPE_CHECKING:
     from optimum.intel.openvino.configuration import OVConfig
 
@@ -55,6 +57,9 @@
 
 logger = logging.getLogger(__name__)
 
+# initialize Core before importing openvino tokenizers to prevent a failed attempt at loading the extension
+core = Core()
+
 
 def infer_task(
     task,
@@ -261,6 +266,10 @@ def main_export(
 
         if is_transformers_version(">=", "4.36") and model_type in SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED:
             loading_kwargs["attn_implementation"] = "eager"
+
+        # some models force flash_attn attention by default, which does not support loading the model on CPU
+        if is_transformers_version(">=", "4.36") and model_type in FORCE_ATTN_MODEL_CLASSES:
+            loading_kwargs["_attn_implementation"] = FORCE_ATTN_MODEL_CLASSES[model_type]
         # there are some difference between remote and in library representation of past key values for some models,
         # for avoiding confusion we disable remote code for them
         if (
@@ -413,7 +422,6 @@ class StoreAttr(object):
     del model
     gc.collect()
 
-    core = Core()
     for submodel_path in submodel_paths:
         submodel_path = Path(output) / submodel_path
         submodel = core.read_model(submodel_path)
@@ -491,7 +499,7 @@ def maybe_convert_tokenizers(library_name: str, output: Path, model=None, prepro
                         f"models won't be generated. Exception: {exception}"
                     )
         elif model:
-            for tokenizer_name in ("tokenizer", "tokenizer_2"):
+            for tokenizer_name in ("tokenizer", "tokenizer_2", "tokenizer_3"):
                 tokenizer = getattr(model, tokenizer_name, None)
                 if tokenizer:
                     export_tokenizer(tokenizer, output / tokenizer_name, task=task)
diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py
index 4db452dbd4..fdcfbecf53 100644
--- a/optimum/exporters/openvino/convert.py
+++ b/optimum/exporters/openvino/convert.py
@@ -12,6 +12,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
+import copy
 import functools
 import gc
 import logging
@@ -20,6 +21,7 @@
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
 
 import onnx
+from transformers import PretrainedConfig
 from transformers.generation import GenerationMixin
 from transformers.utils import is_tf_available, is_torch_available
 
@@ -31,8 +33,14 @@
 from optimum.exporters.onnx.convert import check_dummy_inputs_are_allowed
 from optimum.exporters.onnx.convert import export_pytorch as export_pytorch_to_onnx
 from optimum.exporters.onnx.convert import export_tensorflow as export_tensorflow_onnx
-from optimum.exporters.utils import _get_submodels_and_export_configs as _default_get_submodels_and_export_configs
+from optimum.exporters.utils import (
+    _get_submodels_and_export_configs as _default_get_submodels_and_export_configs,
+)
+from optimum.exporters.utils import (
+    get_diffusion_models_for_export,
+)
 from optimum.intel.utils.import_utils import (
+    _diffusers_version,
     _nncf_version,
     _open_clip_version,
     _optimum_intel_version,
@@ -41,6 +49,8 @@
     _torch_version,
     _transformers_version,
     compare_versions,
+    is_openvino_tokenizers_version,
+    is_tokenizers_version,
     is_transformers_version,
 )
 from optimum.utils import DEFAULT_DUMMY_SHAPES, is_diffusers_available
@@ -62,6 +72,7 @@
     _get_open_clip_submodels_fn_and_export_configs,
     clear_class_registry,
     remove_none_from_dummy_inputs,
+    save_config,
 )
 
 
@@ -82,9 +93,31 @@
     from optimum.intel.openvino.configuration import OVConfig
 
 
-def _save_model(model, path: str, ov_config: Optional["OVConfig"] = None, library_name: Optional[str] = None):
+def _set_runtime_options(
+    models_and_export_configs: Dict[
+        str,
+        Tuple[Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin", "DiffusionPipeline"], "OnnxConfig"],
+    ],
+    task: str,
+):
+    for model_name in models_and_export_configs.keys():
+        _, sub_export_config = models_and_export_configs[model_name]
+        if "vae_" in model_name or "text-generation" in task:
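+            # runtime hint written into the model's rt_info at save time; presumably chosen to improve fp16 accuracy for VAE and text-generation submodels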
+            sub_export_config.runtime_options = {"ACTIVATIONS_SCALE_FACTOR": "8.0"}
+
+
+def _save_model(
+    model,
+    path: str,
+    ov_config: Optional["OVConfig"] = None,
+    library_name: Optional[str] = None,
+    config: OnnxConfig = None,
+):
     compress_to_fp16 = ov_config is not None and ov_config.dtype == "fp16"
     model = _add_version_info_to_model(model, library_name)
+
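+    # propagate any runtime options declared on the export config into the saved model's rt_info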
+    if hasattr(config, "runtime_options"):
+        model = _add_runtime_options_to_rt_info(model, config.runtime_options)
     save_model(model, path, compress_to_fp16)
 
 
@@ -204,7 +237,9 @@ def export_tensorflow(
         output.parent / output,
         ov_config=ov_config,
         library_name=library_name,
+        config=config,
     )
+    del ov_model
     return input_names, output_names, True
 
 
@@ -266,7 +301,9 @@ def export_pytorch_via_onnx(
         output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output,
         ov_config=ov_config,
         library_name=library_name,
+        config=config,
     )
+    del ov_model
     return input_names, output_names, True
 
 
@@ -439,8 +476,10 @@ def ts_patched_forward(*args, **kwargs):
             output,
             ov_config=ov_config,
             library_name=library_name,
+            config=config,
         )
         clear_class_registry()
+        del ov_model
         del model
         gc.collect()
     return input_names, output_names, False
@@ -614,23 +653,27 @@ def export_from_model(
             model, library_name, task, preprocessors, custom_export_configs, fn_get_submodels
         )
 
-    logging.disable(logging.INFO)
-    export_config, models_and_export_configs, stateful_submodels = _get_submodels_and_export_configs(
-        model=model,
-        task=task,
-        monolith=False,
-        custom_export_configs=custom_export_configs if custom_export_configs is not None else {},
-        custom_architecture=custom_architecture,
-        fn_get_submodels=fn_get_submodels,
-        preprocessors=preprocessors,
-        library_name=library_name,
-        model_kwargs=model_kwargs,
-        _variant="default",
-        legacy=False,
-        exporter="openvino",
-        stateful=stateful,
-    )
-    logging.disable(logging.NOTSET)
+    if library_name == "diffusers":
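+        # diffusers pipelines take a dedicated export path that also covers SD3 and Flux (see get_diffusion_models_for_export_ext)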
+        export_config, models_and_export_configs = get_diffusion_models_for_export_ext(model, exporter="openvino")
+        stateful_submodels = False
+    else:
+        logging.disable(logging.INFO)
+        export_config, models_and_export_configs, stateful_submodels = _get_submodels_and_export_configs(
+            model=model,
+            task=task,
+            monolith=False,
+            custom_export_configs=custom_export_configs if custom_export_configs is not None else {},
+            custom_architecture=custom_architecture,
+            fn_get_submodels=fn_get_submodels,
+            preprocessors=preprocessors,
+            library_name=library_name,
+            model_kwargs=model_kwargs,
+            _variant="default",
+            legacy=False,
+            exporter="openvino",
+            stateful=stateful,
+        )
+        logging.disable(logging.NOTSET)
 
     if library_name == "open_clip":
         if hasattr(model.config, "save_pretrained"):
@@ -643,7 +686,11 @@ def export_from_model(
         files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_export_configs.keys()]
     elif library_name != "diffusers":
         if is_transformers_version(">=", "4.44.99"):
-            misplaced_generation_parameters = model.config._get_non_default_generation_parameters()
+            # some model configs may raise an error here when loaded without full parameter initialization
+            try:
+                misplaced_generation_parameters = model.config._get_non_default_generation_parameters()
+            except KeyError:
+                misplaced_generation_parameters = {}
             if isinstance(model, GenerationMixin) and len(misplaced_generation_parameters) > 0:
                 logger.warning(
                     "Moving the following attributes in the config to the generation config: "
@@ -655,7 +702,7 @@ def export_from_model(
                     setattr(model.config, param_name, None)
 
         # Saving the model config and preprocessor as this is needed sometimes.
-        model.config.save_pretrained(output)
+        save_config(model.config, output)
         generation_config = getattr(model, "generation_config", None)
         if generation_config is not None:
             try:
@@ -665,8 +712,7 @@ def export_from_model(
                     f"The generation config will not be saved, saving failed with following error:\n{exception}"
                 )
 
-        model_name_or_path = model.config._name_or_path
-        maybe_save_preprocessors(model_name_or_path, output, trust_remote_code=trust_remote_code)
+        save_preprocessors(preprocessors, model.config, output, trust_remote_code)
 
         files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_export_configs.keys()]
 
@@ -696,8 +742,17 @@ def export_from_model(
         if tokenizer_2 is not None:
             tokenizer_2.save_pretrained(output.joinpath("tokenizer_2"))
 
+        tokenizer_3 = getattr(model, "tokenizer_3", None)
+        if tokenizer_3 is not None:
+            tokenizer_3.save_pretrained(output.joinpath("tokenizer_3"))
+        safety_checker = getattr(model, "safety_checker", None)
+        if safety_checker is not None:
+            safety_checker.save_pretrained(output.joinpath("safety_checker"))
+
         model.save_config(output)
 
+    _set_runtime_options(models_and_export_configs, task)
+
     export_models(
         models_and_export_configs=models_and_export_configs,
         output_dir=output,
@@ -730,6 +785,12 @@ def export_tokenizer(
     except ModuleNotFoundError:
         return
 
+    if is_tokenizers_version(">", "0.19") and is_openvino_tokenizers_version("<", "2024.5.0.0"):
+        logger.warning(
+            "Exporting tokenizers to OpenVINO is not supported for tokenizers version > 0.19 combined with openvino-tokenizers version < 2024.5. "
+            "Please downgrade tokenizers to version <= 0.19 to export tokenizers to OpenVINO."
+        )
+
     if not isinstance(output, Path):
         output = Path(output)
 
@@ -766,6 +827,41 @@ def export_tokenizer(
         save_model(model, output / file_name.format(suffix))
 
 
+def save_preprocessors(
+    preprocessors: List, config: PretrainedConfig, output: Union[str, Path], trust_remote_code: bool
+):
+    model_name_or_path = config._name_or_path
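+    # prefer export_model_type when present, falling back to the config's model_type (normalized to use dashes)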
+    if hasattr(config, "export_model_type"):
+        model_type = config.export_model_type.replace("_", "-")
+    else:
+        model_type = config.model_type.replace("_", "-")
+    if preprocessors is not None:
+        # the phi3-vision processor has no chat_template attribute, which breaks saving the processor to disk
+        if is_transformers_version(">=", "4.45") and model_type == "phi3-v" and len(preprocessors) > 1:
+            if not hasattr(preprocessors[1], "chat_template"):
+                preprocessors[1].chat_template = getattr(preprocessors[0], "chat_template", None)
+        for processor in preprocessors:
+            try:
+                processor.save_pretrained(output)
+            except Exception as ex:
+                logger.error(f"Saving {type(processor)} failed with {ex}")
+    else:
+        maybe_save_preprocessors(model_name_or_path, output, trust_remote_code=trust_remote_code)
+
+
+def _add_runtime_options_to_rt_info(model: Model, options: Dict):
+    """
+    Add runtime options to the model's rt_info
+    """
+    try:
+        for name, value in options.items():
+            model.set_rt_info(value, ["runtime_options", name])
+    except Exception:
+        pass
+
+    return model
+
+
 def _add_version_info_to_model(model: Model, library_name: Optional[str] = None):
     """
     Add dependency versions to OpenVINO model
@@ -781,7 +877,7 @@ def _add_version_info_to_model(model: Model, library_name: Optional[str] = None)
 
             model.set_rt_info(sentence_transformers.__version__, ["optimum", "sentence_transformers_version"])
         if library_name == "diffusers":
-            model.set_rt_info(_optimum_version, ["optimum", "diffusers_version"])
+            model.set_rt_info(_diffusers_version, ["optimum", "diffusers_version"])
         elif library_name == "timm":
             model.set_rt_info(_timm_version, ["optimum", "timm_version"])
         elif library_name == "open_clip":
@@ -817,6 +913,10 @@ def _get_multi_modal_submodels_and_export_configs(
     if model_type == "internvl-chat" and preprocessors is not None:
         model.config.img_context_token_id = preprocessors[0].convert_tokens_to_ids("")
 
+    if model_type == "phi3-v":
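+        # copy the glb_GN/sub_GN tensors from the vision embedding module into the config so they are preserved alongside the exported model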
+        model.config.glb_GN = model.model.vision_embed_tokens.glb_GN.tolist()
+        model.config.sub_GN = model.model.vision_embed_tokens.sub_GN.tolist()
+
     if hasattr(model, "image_newline"):
         model.config.image_newline = model.image_newline.tolist()
     main_config_cls = TasksManager.get_exporter_config_constructor(
@@ -878,3 +978,218 @@ def _get_submodels_and_export_configs(
     )
     stateful_per_model = [stateful] * len(models_for_export)
     return export_config, models_for_export, stateful_per_model
+
+
+def get_diffusion_models_for_export_ext(
+    pipeline: "DiffusionPipeline", int_dtype: str = "int64", float_dtype: str = "fp32", exporter: str = "openvino"
+):
+    try:
+        from diffusers import (
+            StableDiffusion3Img2ImgPipeline,
+            StableDiffusion3InpaintPipeline,
+            StableDiffusion3Pipeline,
+        )
+
+        is_sd3 = isinstance(
+            pipeline, (StableDiffusion3Pipeline, StableDiffusion3InpaintPipeline, StableDiffusion3Img2ImgPipeline)
+        )
+    except ImportError:
+        is_sd3 = False
+
+    try:
+        from diffusers import FluxPipeline
+
+        is_flux = isinstance(pipeline, FluxPipeline)
+    except ImportError:
+        is_flux = False
+
+    if not is_sd3 and not is_flux:
+        return None, get_diffusion_models_for_export(pipeline, int_dtype, float_dtype, exporter)
+    if is_sd3:
+        models_for_export = get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype)
+    else:
+        models_for_export = get_flux_models_for_export(pipeline, exporter, int_dtype, float_dtype)
+
+    return None, models_for_export
+
+
+def get_sd3_models_for_export(pipeline, exporter, int_dtype, float_dtype):
+    models_for_export = {}
+
+    # Text encoder
+    text_encoder = getattr(pipeline, "text_encoder", None)
+    if text_encoder is not None:
+        text_encoder.config.output_hidden_states = True
+        text_encoder.text_model.config.output_hidden_states = True
+        text_encoder_config_constructor = TasksManager.get_exporter_config_constructor(
+            model=text_encoder,
+            exporter=exporter,
+            library_name="diffusers",
+            task="feature-extraction",
+            model_type="clip-text-with-projection",
+        )
+        text_encoder_export_config = text_encoder_config_constructor(
+            pipeline.text_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype
+        )
+        models_for_export["text_encoder"] = (text_encoder, text_encoder_export_config)
+
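+    # Transformer (SD3 uses a diffusion transformer in place of the UNet); mirror the config attributes expected by the UNet-derived export config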
+    transformer = pipeline.transformer
+    transformer.config.text_encoder_projection_dim = transformer.config.joint_attention_dim
+    transformer.config.requires_aesthetics_score = getattr(pipeline.config, "requires_aesthetics_score", False)
+    transformer.config.time_cond_proj_dim = None
+    export_config_constructor = TasksManager.get_exporter_config_constructor(
+        model=transformer,
+        exporter=exporter,
+        library_name="diffusers",
+        task="semantic-segmentation",
+        model_type="sd3-transformer",
+    )
+    transformer_export_config = export_config_constructor(
+        pipeline.transformer.config, int_dtype=int_dtype, float_dtype=float_dtype
+    )
+    models_for_export["transformer"] = (transformer, transformer_export_config)
+
+    # VAE Encoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565
+    vae_encoder = copy.deepcopy(pipeline.vae)
+    vae_encoder.forward = lambda sample: {"latent_parameters": vae_encoder.encode(x=sample)["latent_dist"].parameters}
+    vae_config_constructor = TasksManager.get_exporter_config_constructor(
+        model=vae_encoder,
+        exporter=exporter,
+        library_name="diffusers",
+        task="semantic-segmentation",
+        model_type="vae-encoder",
+    )
+    vae_encoder_export_config = vae_config_constructor(
+        vae_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype
+    )
+    models_for_export["vae_encoder"] = (vae_encoder, vae_encoder_export_config)
+
+    # VAE Decoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L600
+    vae_decoder = copy.deepcopy(pipeline.vae)
+    vae_decoder.forward = lambda latent_sample: vae_decoder.decode(z=latent_sample)
+    vae_config_constructor = TasksManager.get_exporter_config_constructor(
+        model=vae_decoder,
+        exporter=exporter,
+        library_name="diffusers",
+        task="semantic-segmentation",
+        model_type="vae-decoder",
+    )
+    vae_decoder_export_config = vae_config_constructor(
+        vae_decoder.config, int_dtype=int_dtype, float_dtype=float_dtype
+    )
+    models_for_export["vae_decoder"] = (vae_decoder, vae_decoder_export_config)
+
+    text_encoder_2 = getattr(pipeline, "text_encoder_2", None)
+    if text_encoder_2 is not None:
+        text_encoder_2.config.output_hidden_states = True
+        text_encoder_2.text_model.config.output_hidden_states = True
+        export_config_constructor = TasksManager.get_exporter_config_constructor(
+            model=text_encoder_2,
+            exporter=exporter,
+            library_name="diffusers",
+            task="feature-extraction",
+            model_type="clip-text-with-projection",
+        )
+        export_config = export_config_constructor(text_encoder_2.config, int_dtype=int_dtype, float_dtype=float_dtype)
+        models_for_export["text_encoder_2"] = (text_encoder_2, export_config)
+
+    text_encoder_3 = getattr(pipeline, "text_encoder_3", None)
+    if text_encoder_3 is not None:
+        export_config_constructor = TasksManager.get_exporter_config_constructor(
+            model=text_encoder_3,
+            exporter=exporter,
+            library_name="diffusers",
+            task="feature-extraction",
+            model_type="t5-encoder-model",
+        )
+        export_config = export_config_constructor(
+            text_encoder_3.config,
+            int_dtype=int_dtype,
+            float_dtype=float_dtype,
+        )
+        models_for_export["text_encoder_3"] = (text_encoder_3, export_config)
+
+    return models_for_export
+
+
+def get_flux_models_for_export(pipeline, exporter, int_dtype, float_dtype):
+    models_for_export = {}
+
+    # Text encoder
+    text_encoder = getattr(pipeline, "text_encoder", None)
+    if text_encoder is not None:
+        text_encoder_config_constructor = TasksManager.get_exporter_config_constructor(
+            model=text_encoder,
+            exporter=exporter,
+            library_name="diffusers",
+            task="feature-extraction",
+            model_type="clip-text-model",
+        )
+        text_encoder_export_config = text_encoder_config_constructor(
+            pipeline.text_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype
+        )
+        models_for_export["text_encoder"] = (text_encoder, text_encoder_export_config)
+
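+    # Flux transformer; mirror the config attributes expected by the UNet-derived export config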
+    transformer = pipeline.transformer
+    transformer.config.text_encoder_projection_dim = transformer.config.joint_attention_dim
+    transformer.config.requires_aesthetics_score = getattr(pipeline.config, "requires_aesthetics_score", False)
+    transformer.config.time_cond_proj_dim = None
+    export_config_constructor = TasksManager.get_exporter_config_constructor(
+        model=transformer,
+        exporter=exporter,
+        library_name="diffusers",
+        task="semantic-segmentation",
+        model_type="flux-transformer",
+    )
+    transformer_export_config = export_config_constructor(
+        pipeline.transformer.config, int_dtype=int_dtype, float_dtype=float_dtype
+    )
+    models_for_export["transformer"] = (transformer, transformer_export_config)
+
+    # VAE Encoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565
+    vae_encoder = copy.deepcopy(pipeline.vae)
+    vae_encoder.forward = lambda sample: {"latent_parameters": vae_encoder.encode(x=sample)["latent_dist"].parameters}
+    vae_config_constructor = TasksManager.get_exporter_config_constructor(
+        model=vae_encoder,
+        exporter=exporter,
+        library_name="diffusers",
+        task="semantic-segmentation",
+        model_type="vae-encoder",
+    )
+    vae_encoder_export_config = vae_config_constructor(
+        vae_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype
+    )
+    models_for_export["vae_encoder"] = (vae_encoder, vae_encoder_export_config)
+
+    # VAE Decoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L600
+    vae_decoder = copy.deepcopy(pipeline.vae)
+    vae_decoder.forward = lambda latent_sample: vae_decoder.decode(z=latent_sample)
+    vae_config_constructor = TasksManager.get_exporter_config_constructor(
+        model=vae_decoder,
+        exporter=exporter,
+        library_name="diffusers",
+        task="semantic-segmentation",
+        model_type="vae-decoder",
+    )
+    vae_decoder_export_config = vae_config_constructor(
+        vae_decoder.config, int_dtype=int_dtype, float_dtype=float_dtype
+    )
+    models_for_export["vae_decoder"] = (vae_decoder, vae_decoder_export_config)
+
+    text_encoder_2 = getattr(pipeline, "text_encoder_2", None)
+    if text_encoder_2 is not None:
+        export_config_constructor = TasksManager.get_exporter_config_constructor(
+            model=text_encoder_2,
+            exporter=exporter,
+            library_name="diffusers",
+            task="feature-extraction",
+            model_type="t5-encoder-model",
+        )
+        export_config = export_config_constructor(
+            text_encoder_2.config,
+            int_dtype=int_dtype,
+            float_dtype=float_dtype,
+        )
+        models_for_export["text_encoder_2"] = (text_encoder_2, export_config)
+
+    return models_for_export
diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index bb036fdc1a..b8310882ba 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -17,16 +17,19 @@
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
 from packaging import version
-from transformers import PretrainedConfig, PreTrainedModel, TFPreTrainedModel
+from transformers import AutoConfig, PretrainedConfig, PreTrainedModel, TFPreTrainedModel
 from transformers.utils import is_tf_available
 
 from optimum.exporters.onnx.config import OnnxConfig, TextDecoderOnnxConfig, TextDecoderWithPositionIdsOnnxConfig
 from optimum.exporters.onnx.model_configs import (
     CLIPOnnxConfig,
     CLIPTextOnnxConfig,
+    CLIPTextWithProjectionOnnxConfig,
+    CLIPVisionModelOnnxConfig,
     CodeGenOnnxConfig,
     FalconOnnxConfig,
     GemmaOnnxConfig,
+    GPTJOnnxConfig,
     GPTNeoXOnnxConfig,
     IBertOnnxConfig,
     LlamaOnnxConfig,
@@ -34,23 +37,25 @@
     MPTOnnxConfig,
     PhiOnnxConfig,
     UNetOnnxConfig,
-    VaeDecoderOnnxConfig,
-    VaeEncoderOnnxConfig,
     VisionOnnxConfig,
 )
+from optimum.exporters.onnx.model_patcher import ModelPatcher
 from optimum.exporters.tasks import TasksManager
 from optimum.utils import DEFAULT_DUMMY_SHAPES
 from optimum.utils.input_generators import (
+    DTYPE_MAPPER,
     DummyInputGenerator,
     DummyPastKeyValuesGenerator,
+    DummySeq2SeqDecoderTextInputGenerator,
     DummyTextInputGenerator,
+    DummyTimestepInputGenerator,
     DummyVisionInputGenerator,
     FalconDummyPastKeyValuesGenerator,
     MistralDummyPastKeyValuesGenerator,
 )
-from optimum.utils.normalized_config import NormalizedTextConfig, NormalizedVisionConfig
+from optimum.utils.normalized_config import NormalizedConfig, NormalizedTextConfig, NormalizedVisionConfig
 
-from ...intel.utils.import_utils import _transformers_version, is_transformers_version
+from ...intel.utils.import_utils import _transformers_version, is_diffusers_version, is_transformers_version
 from .model_patcher import (
     AquilaModelPatcher,
     ArcticModelPatcher,
@@ -60,21 +65,28 @@
     DBRXModelPatcher,
     DeciLMModelPatcher,
     FalconModelPatcher,
+    FluxTransfromerModelPatcher,
     Gemma2ModelPatcher,
+    GptJModelPatcher,
     GptNeoxJapaneseModelPatcher,
     GptNeoxModelPatcher,
     IBertModelPatcher,
+    InputEmbeddingPatcher,
     InternLM2Patcher,
     InternLMModelPatcher,
     InternVLChatImageEmbeddingModelPatcher,
     JaisModelPatcher,
     LlamaModelPatcher,
     LlavaImageEmbeddingModelPatcher,
+    LlavaQwen2ImageEmbeddingsModelPatcher,
+    MiniCPMVImageEmbeddingsModelPatcher,
+    MiniCPMVResamplerModelPatcher,
     MistralModelPatcher,
     MixtralModelPatcher,
     MPTModelPatcher,
     PersimmonModelPatcher,
     Phi3ModelPatcher,
+    Phi3VisionImageEmbeddingsPatcher,
     QwenModelPatcher,
     RotaryEmbPatcher,
     UpdateCausalMaskModelPatcher,
@@ -119,12 +131,12 @@ def init_model_configs():
 
 
 if TYPE_CHECKING:
-    from transformers.modeling_utils import PreTrainedModel
+    from transformers.modeling_utils import PreTrainedModel  # noqa: F811
 
-    from optimum.exporters.onnx.model_patcher import ModelPatcher
+    from optimum.exporters.onnx.model_patcher import ModelPatcher  # noqa: F811
 
     if is_tf_available():
-        from transformers.modeling_tf_utils import TFPreTrainedModel
+        from transformers.modeling_tf_utils import TFPreTrainedModel  # noqa: F811
 
 
 register_in_tasks_manager = TasksManager.create_register("openvino", overwrite_existing=True)
@@ -675,62 +687,6 @@ def patch_model_for_export(
         return FalconModelPatcher(self, model, model_kwargs=model_kwargs)
 
 
-@register_in_tasks_manager("unet", *["semantic-segmentation"], library_name="diffusers")
-class UNetOpenVINOConfig(UNetOnnxConfig):
-    @property
-    def inputs(self) -> Dict[str, Dict[int, str]]:
-        common_inputs = {
-            "sample": {0: "batch_size", 2: "height", 3: "width"},
-            "timestep": {0: "steps"},
-            "encoder_hidden_states": {0: "batch_size", 1: "sequence_length"},
-        }
-
-        # TODO : add text_image, image and image_embeds
-        if getattr(self._normalized_config, "addition_embed_type", None) == "text_time":
-            common_inputs["text_embeds"] = {0: "batch_size"}
-            common_inputs["time_ids"] = {0: "batch_size"}
-
-        if getattr(self._normalized_config, "time_cond_proj_dim", None) is not None:
-            common_inputs["timestep_cond"] = {0: "batch_size"}
-        return common_inputs
-
-    @property
-    def outputs(self) -> Dict[str, Dict[int, str]]:
-        return {
-            "out_sample": {0: "batch_size", 2: "height", 3: "width"},
-        }
-
-
-@register_in_tasks_manager("vae-encoder", *["semantic-segmentation"], library_name="diffusers")
-class VaeEncoderOpenVINOConfig(VaeEncoderOnnxConfig):
-    @property
-    def inputs(self) -> Dict[str, Dict[int, str]]:
-        return {
-            "sample": {0: "batch_size", 2: "height", 3: "width"},
-        }
-
-    @property
-    def outputs(self) -> Dict[str, Dict[int, str]]:
-        return {
-            "latent_sample": {0: "batch_size", 2: "height_latent", 3: "width_latent"},
-        }
-
-
-@register_in_tasks_manager("vae-decoder", *["semantic-segmentation"], library_name="diffusers")
-class VaeDecoderOpenVINOConfig(VaeDecoderOnnxConfig):
-    @property
-    def inputs(self) -> Dict[str, Dict[int, str]]:
-        return {
-            "latent_sample": {0: "batch_size", 2: "height_latent", 3: "width_latent"},
-        }
-
-    @property
-    def outputs(self) -> Dict[str, Dict[int, str]]:
-        return {
-            "sample": {0: "batch_size", 2: "height", 3: "width"},
-        }
-
-
 @register_in_tasks_manager(
     "persimmon",
     *[
@@ -773,6 +729,24 @@ def patch_model_for_export(
         return GptNeoxJapaneseModelPatcher(self, model, model_kwargs=model_kwargs)
 
 
+@register_in_tasks_manager(
+    "gptj",
+    *[
+        "feature-extraction",
+        "feature-extraction-with-past",
+        "text-generation",
+        "text-generation-with-past",
+        "text-classification",
+    ],
+    library_name="transformers",
+)
+class GPTJOpenVINOConfig(GPTJOnnxConfig):
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ) -> "ModelPatcher":
+        return GptJModelPatcher(self, model, model_kwargs=model_kwargs)
+
+
 @register_in_tasks_manager(
     "cohere",
     *[
@@ -1138,6 +1112,11 @@ def generate_dummy_inputs_for_validation(
             reference_model_inputs["text"] = reference_model_inputs.pop("input_ids")
         return super().generate_dummy_inputs_for_validation(reference_model_inputs)
 
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ) -> ModelPatcher:
+        return ModelPatcher(self, model, model_kwargs=model_kwargs)
+
 
 @register_in_tasks_manager("clip-text-model", *["feature-extraction"], library_name="open_clip")
 class OpenCLIPTextOpenVINOConfig(CLIPTextOnnxConfig):
@@ -1168,6 +1147,11 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs):
         dummy_inputs = super().generate_dummy_inputs(framework=framework, **kwargs)
         return dummy_inputs
 
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ) -> ModelPatcher:
+        return ModelPatcher(self, model, model_kwargs=model_kwargs)
+
 
 @register_in_tasks_manager("clip-vision-model", *["feature-extraction"], library_name="open_clip")
 class OpenCLIPVisualOpenVINOConfig(VisionOnnxConfig):
@@ -1193,6 +1177,42 @@ def rename_ambiguous_inputs(self, inputs):
         return model_inputs
 
 
+@register_in_tasks_manager(
+    "clip", *["feature-extraction", "zero-shot-image-classification"], library_name="transformers"
+)
+class CLIPOpenVINOConfig(CLIPOnnxConfig):
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ) -> ModelPatcher:
+        return ModelPatcher(self, model, model_kwargs=model_kwargs)
+
+
+@register_in_tasks_manager("clip-text-model", *["feature-extraction"], library_name="transformers")
+@register_in_tasks_manager("clip-text-model", *["feature-extraction"], library_name="diffusers")
+class CLIPTextOpenVINOConfig(CLIPTextOnnxConfig):
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ) -> ModelPatcher:
+        return ModelPatcher(self, model, model_kwargs=model_kwargs)
+
+
+@register_in_tasks_manager("clip-text-with-projection", *["feature-extraction"], library_name="transformers")
+@register_in_tasks_manager("clip-text-with-projection", *["feature-extraction"], library_name="diffusers")
+class CLIPTextWithProjectionOpenVINOConfig(CLIPTextWithProjectionOnnxConfig):
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ) -> ModelPatcher:
+        return ModelPatcher(self, model, model_kwargs=model_kwargs)
+
+
+@register_in_tasks_manager("clip-vision-model", *["feature-extraction"], library_name="transformers")
+class CLIPVisionModelOpenVINOConfig(CLIPVisionModelOnnxConfig):
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ) -> ModelPatcher:
+        return ModelPatcher(self, model, model_kwargs=model_kwargs)
+
+
 @register_in_tasks_manager(
     "ibert",
     *[
@@ -1266,6 +1286,54 @@ def rename_ambiguous_inputs(self, inputs):
         model_inputs["input"] = inputs["input_ids"]
         return model_inputs
 
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ) -> "ModelPatcher":
+        # making the model traceable in 16-bit overrides the embeddings input signature; this patcher is required to prevent that issue
+        return InputEmbeddingPatcher(self, model, model_kwargs)
+
+
+def get_vlm_internal_text_generation_config(model_type, model_config, int_dtype, float_dtype):
+    model_type = model_type.replace("_", "-")
+
+    if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
+        raise ValueError(
+            f"Unsupported language model type provided `{model_type}`. Please define custom export config"
+        )
+
+    if "text-generation-with-past" not in TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]:
+        raise ValueError(
+            f"Export config for text generation for `{model_type}` is not available. Please define custom export config"
+        )
+    export_config_class = TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]["text-generation-with-past"]
+    export_config = export_config_class(
+        model_config,
+        use_past=True,
+        use_past_in_inputs=True,
+        int_dtype=int_dtype,
+        float_dtype=float_dtype,
+    )
+    return export_config
+
+
+def get_vlm_text_embeddings_config(model_type, model_config, int_dtype, float_dtype):
+    internal_export_config = get_vlm_internal_text_generation_config(model_type, model_config, int_dtype, float_dtype)
+    InputEmbedOpenvVINOConfig.NORMALIZED_CONFIG_CLASS = internal_export_config.NORMALIZED_CONFIG_CLASS
+    export_config = InputEmbedOpenvVINOConfig(
+        model_config,
+        task="feature-extraction",
+        int_dtype=int_dtype,
+        float_dtype=float_dtype,
+    )
+    return export_config
+
+
+def get_vlm_text_generation_config(model_type, model_config, int_dtype, float_dtype):
+    internal_export_config = get_vlm_internal_text_generation_config(model_type, model_config, int_dtype, float_dtype)
+    export_config = LMInputEmbedsConfigHelper(internal_export_config)
+    export_config._normalized_config = internal_export_config._normalized_config
+    return export_config
+
 
 class LlavaConfigBehavior(str, enum.Enum):
     LANGUAGE = "language"
@@ -1330,61 +1398,15 @@ def with_behavior(
 
         if behavior == LlavaConfigBehavior.TEXT_EMBEDDINGS:
             model_type = self._orig_config.text_config.model_type
-            model_type = model_type.replace("_", "-")
-            if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
-                raise ValueError(
-                    f"Unsupported language model type provided `{model_type}`. Please define custom export config"
-                )
-
-            if "text-generation-with-past" not in TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]:
-                raise ValueError(
-                    f"Export config for text generation for `{model_type}` is not available. Please define custom export config"
-                )
-            internal_export_config_class = TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"][
-                "text-generation-with-past"
-            ]
-            internal_export_config = internal_export_config_class(
-                self._orig_config.text_config,
-                use_past=True,
-                use_past_in_inputs=True,
-                int_dtype=self.int_dtype,
-                float_dtype=self.float_dtype,
+            return get_vlm_text_embeddings_config(
+                model_type, self._orig_config.text_config, self.int_dtype, self.float_dtype
             )
-            InputEmbedOpenvVINOConfig.NORMALIZED_CONFIG_CLASS = internal_export_config.NORMALIZED_CONFIG_CLASS
-            export_config = InputEmbedOpenvVINOConfig(
-                self._orig_config.text_config,
-                task="feature-extraction",
-                int_dtype=self.int_dtype,
-                float_dtype=self.float_dtype,
-            )
-            return export_config
 
         if behavior == LlavaConfigBehavior.LANGUAGE:
             model_type = self._orig_config.text_config.model_type
-            model_type = model_type.replace("_", "-")
-
-            if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
-                raise ValueError(
-                    f"Unsupported language model type provided `{model_type}`. Please define custom export config"
-                )
-
-            if "text-generation-with-past" not in TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]:
-                raise ValueError(
-                    f"Export config for text generation for `{model_type}` is not available. Please define custom export config"
-                )
-            internal_export_config_class = TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"][
-                "text-generation-with-past"
-            ]
-            internal_export_config = internal_export_config_class(
-                self._orig_config.text_config,
-                use_past=True,
-                use_past_in_inputs=True,
-                int_dtype=self.int_dtype,
-                float_dtype=self.float_dtype,
+            return get_vlm_text_generation_config(
+                model_type, self._orig_config.text_config, self.int_dtype, self.float_dtype
             )
-            export_config = LMInputEmbedsConfigHelper(internal_export_config)
-            export_config._normalized_config = internal_export_config._normalized_config
-            return export_config
 
         if behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
             return self.__class__(
@@ -1439,7 +1461,7 @@ class InternVLChatConfigBehavior(str, enum.Enum):
 @register_in_tasks_manager("internvl-chat", *["image-text-to-text"], library_name="transformers")
 class InternVLChatOpenVINOConfig(OnnxConfig):
     SUPPORTED_BEHAVIORS = [model_type.value for model_type in InternVLChatConfigBehavior]
-    NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
+    NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
     DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator,)
 
     def __init__(
@@ -1492,61 +1514,15 @@ def with_behavior(
 
         if behavior == InternVLChatConfigBehavior.TEXT_EMBEDDINGS:
             model_type = self._orig_config.llm_config.model_type
-            model_type = model_type.replace("_", "-")
-            if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
-                raise ValueError(
-                    f"Unsupported language model type provided `{model_type}`. Please define custom export config"
-                )
-
-            if "text-generation-with-past" not in TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]:
-                raise ValueError(
-                    f"Export config for text generation for `{model_type}` is not available. Please define custom export config"
-                )
-            internal_export_config_class = TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"][
-                "text-generation-with-past"
-            ]
-            internal_export_config = internal_export_config_class(
-                self._orig_config.llm_config,
-                use_past=True,
-                use_past_in_inputs=True,
-                int_dtype=self.int_dtype,
-                float_dtype=self.float_dtype,
+            return get_vlm_text_embeddings_config(
+                model_type, self._orig_config.llm_config, self.int_dtype, self.float_dtype
             )
-            InputEmbedOpenvVINOConfig.NORMALIZED_CONFIG_CLASS = internal_export_config.NORMALIZED_CONFIG_CLASS
-            export_config = InputEmbedOpenvVINOConfig(
-                self._orig_config.llm_config,
-                task="feature-extraction",
-                int_dtype=self.int_dtype,
-                float_dtype=self.float_dtype,
-            )
-            return export_config
 
         if behavior == InternVLChatConfigBehavior.LANGUAGE:
             model_type = self._orig_config.llm_config.model_type
-            model_type = model_type.replace("_", "-")
-
-            if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
-                raise ValueError(
-                    f"Unsupported language model type provided `{model_type}`. Please define custom export config"
-                )
-
-            if "text-generation-with-past" not in TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]:
-                raise ValueError(
-                    f"Export config for text generation for `{model_type}` is not available. Please define custom export config"
-                )
-            internal_export_config_class = TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"][
-                "text-generation-with-past"
-            ]
-            internal_export_config = internal_export_config_class(
-                self._orig_config.llm_config,
-                use_past=True,
-                use_past_in_inputs=True,
-                int_dtype=self.int_dtype,
-                float_dtype=self.float_dtype,
+            return get_vlm_text_generation_config(
+                model_type, self._orig_config.llm_config, self.int_dtype, self.float_dtype
             )
-            export_config = LMInputEmbedsConfigHelper(internal_export_config)
-            export_config._normalized_config = internal_export_config._normalized_config
-            return export_config
 
         if behavior == InternVLChatConfigBehavior.VISION_EMBEDDINGS:
             return self.__class__(
@@ -1558,7 +1534,8 @@ def with_behavior(
                 preprocessors=self._preprocessors,
             )
 
-    def get_model_for_behavior(self, model, behavior: Union[str, LlavaConfigBehavior]):
+    @staticmethod
+    def get_model_for_behavior(model, behavior: Union[str, LlavaConfigBehavior]):
         if isinstance(behavior, str) and not isinstance(behavior, LlavaConfigBehavior):
             behavior = InternVLChatConfigBehavior(behavior)
 
@@ -1580,3 +1557,650 @@ def patch_model_for_export(
         if self._behavior != InternVLChatConfigBehavior.VISION_EMBEDDINGS:
             return super().patch_model_for_export(model, model_kwargs)
         return InternVLChatImageEmbeddingModelPatcher(self, model, model_kwargs)
+
+
+@register_in_tasks_manager(
+    "llava-qwen2", *["image-text-to-text", "text-generation", "text-generation-with-past"], library_name="transformers"
+)
+class LlavaQwen2OpenVINOConfig(OnnxConfig):
+    SUPPORTS_PAST = True
+    MIN_TRANSFORMERS_VERSION = version.parse("4.40.0")
+    SUPPORTED_BEHAVIORS = [model_type.value for model_type in LlavaConfigBehavior]
+    NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
+    DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator,)
+
+    def __init__(
+        self,
+        config: "PretrainedConfig",
+        task: str = "feature-extraction",
+        int_dtype: str = "int64",
+        float_dtype: str = "fp32",
+        behavior: LlavaConfigBehavior = LlavaConfigBehavior.VISION_EMBEDDINGS,
+        preprocessors: Optional[List[Any]] = None,
+        use_past: bool = False,
+    ):
+        self._behavior = behavior
+        self._orig_config = config
+        if self._behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
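+            # the vision tower is a separate model referenced by mm_vision_tower; load its config for the vision-embeddings export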
+            config = AutoConfig.from_pretrained(config.mm_vision_tower, trust_remote_code=True)
+            if hasattr(config, "vision_config"):
+                config = config.vision_config
+        super().__init__(
+            config=config,
+            task=task,
+            int_dtype=int_dtype,
+            float_dtype=float_dtype,
+            preprocessors=preprocessors,
+        )
+
+    @property
+    def inputs(self) -> Dict[str, Dict[int, str]]:
+        if self._behavior != LlavaConfigBehavior.VISION_EMBEDDINGS:
+            return {}
+        return {"pixel_values": {0: "batch_size", 2: "height", 3: "width"}}
+
+    @property
+    def outputs(self) -> Dict[str, Dict[int, str]]:
+        if self._behavior != LlavaConfigBehavior.VISION_EMBEDDINGS:
+            return {}
+        return {"last_hidden_state": {0: "batch_size"}}
+
+    @staticmethod
+    def get_model_for_behavior(model, behavior: Union[str, LlavaConfigBehavior]):
+        if isinstance(behavior, str) and not isinstance(behavior, LlavaConfigBehavior):
+            behavior = LlavaConfigBehavior(behavior)
+
+        if behavior == LlavaConfigBehavior.LANGUAGE:
+            model.forward = super(type(model), model).forward
+            return model
+
+        if behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
+            return model
+
+        if behavior == LlavaConfigBehavior.TEXT_EMBEDDINGS:
+            text_embedding = model.model.embed_tokens
+            text_embedding.config = model.model.config
+            return text_embedding
+
+    def with_behavior(
+        self,
+        behavior: Union[str, LlavaConfigBehavior],
+    ):
+        """
+        Creates a config for a different behaviour.
+        Args:
+            behavior ([`ConfigBehavior`]):
+                The behavior to use for the new instance.
+        """
+        if isinstance(behavior, str) and not isinstance(behavior, LlavaConfigBehavior):
+            behavior = LlavaConfigBehavior(behavior)
+
+        if behavior == LlavaConfigBehavior.TEXT_EMBEDDINGS:
+            model_type = self._orig_config.model_type.replace("llava-", "")
+            return get_vlm_text_embeddings_config(model_type, self._orig_config, self.int_dtype, self.float_dtype)
+
+        if behavior == LlavaConfigBehavior.LANGUAGE:
+            model_type = self._orig_config.model_type.replace("llava-", "")
+            return get_vlm_text_generation_config(model_type, self._orig_config, self.int_dtype, self.float_dtype)
+
+        if behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
+            return self.__class__(
+                self._orig_config,
+                task=self.task,
+                int_dtype=self.int_dtype,
+                float_dtype=self.float_dtype,
+                behavior=behavior,
+                preprocessors=self._preprocessors,
+            )
+
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ):
+        model_kwargs = model_kwargs or {}
+        if self._behavior != LlavaConfigBehavior.VISION_EMBEDDINGS:
+            return super().patch_model_for_export(model, model_kwargs)
+        return LlavaQwen2ImageEmbeddingsModelPatcher(self, model, model_kwargs)
+
+    def rename_ambiguous_inputs(self, inputs):
+        if self._behavior == LlavaConfigBehavior.VISION_EMBEDDINGS:
+            model_inputs = {}
+            model_inputs["images"] = inputs["pixel_values"]
+            return model_inputs
+        return super().rename_ambiguous_inputs(inputs)
+
+
+class PooledProjectionsDummyInputGenerator(DummyInputGenerator):
+    SUPPORTED_INPUT_NAMES = ["pooled_projections"]
+
+    def __init__(
+        self,
+        task: str,
+        normalized_config: NormalizedConfig,
+        batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
+        random_batch_size_range: Optional[Tuple[int, int]] = None,
+        **kwargs,
+    ):
+        self.task = task
+        self.batch_size = batch_size
+        self.pooled_projection_dim = normalized_config.config.pooled_projection_dim
+
+    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
+        shape = [self.batch_size, self.pooled_projection_dim]
+        return self.random_float_tensor(shape, framework=framework, dtype=float_dtype)
+
+
+class DummyTransformerTimestpsInputGenerator(DummyTimestepInputGenerator):
+    SUPPORTED_INPUT_NAMES = ("timestep", "text_embeds", "time_ids", "timestep_cond", "guidance")
+
+    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
+        if input_name in ["timestep", "guidance"]:
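+            # timestep and guidance are per-sample float values here; the base generator would produce integer timesteps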
+            shape = [self.batch_size]
+            return self.random_float_tensor(shape, max_value=self.vocab_size, framework=framework, dtype=float_dtype)
+        return super().generate(input_name, framework, int_dtype, float_dtype)
+
+
+@register_in_tasks_manager("sd3-transformer", *["semantic-segmentation"], library_name="diffusers")
+class SD3TransformerOpenVINOConfig(UNetOnnxConfig):
+    DUMMY_INPUT_GENERATOR_CLASSES = (
+        (DummyTransformerTimestpsInputGenerator,)
+        + UNetOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES
+        + (PooledProjectionsDummyInputGenerator,)
+    )
+    NORMALIZED_CONFIG_CLASS = NormalizedConfig.with_args(
+        image_size="sample_size",
+        num_channels="in_channels",
+        hidden_size="joint_attention_dim",
+        vocab_size="attention_head_dim",
+        allow_new=True,
+    )
+
+    @property
+    def inputs(self):
+        common_inputs = super().inputs
+        common_inputs["pooled_projections"] = {0: "batch_size"}
+        return common_inputs
+
+    def rename_ambiguous_inputs(self, inputs):
+        # The input in the model's forward signature is named `hidden_states` rather than `sample`, hence the export input name is updated.
+        hidden_states = inputs.pop("sample", None)
+        if hidden_states is not None:
+            inputs["hidden_states"] = hidden_states
+        return inputs
+
+
+@register_in_tasks_manager("t5-encoder-model", *["feature-extraction"], library_name="diffusers")
+class T5EncoderOpenVINOConfig(CLIPTextOpenVINOConfig):
+    pass
+
+
+class DummyFluxTransformerInputGenerator(DummyVisionInputGenerator):
+    SUPPORTED_INPUT_NAMES = (
+        "pixel_values",
+        "pixel_mask",
+        "sample",
+        "latent_sample",
+        "hidden_states",
+        "img_ids",
+    )
+
+    def __init__(
+        self,
+        task: str,
+        normalized_config: NormalizedVisionConfig,
+        batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
+        num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
+        width: int = DEFAULT_DUMMY_SHAPES["width"],
+        height: int = DEFAULT_DUMMY_SHAPES["height"],
+        **kwargs,
+    ):
+        super().__init__(task, normalized_config, batch_size, num_channels, width, height, **kwargs)
+        if getattr(normalized_config, "in_channels", None):
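+            # in_channels presumably counts packed (2x2) latent channels, hence the division by 4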
+            self.num_channels = normalized_config.in_channels // 4
+
+    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
+        if input_name in ["hidden_states", "sample"]:
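+            # Flux packs 2x2 latent patches, so spatial dimensions are halved and the channel count is quadrupled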
+            shape = [self.batch_size, (self.height // 2) * (self.width // 2), self.num_channels * 4]
+            return self.random_float_tensor(shape, framework=framework, dtype=float_dtype)
+        if input_name == "img_ids":
+            img_ids_height = self.height // 2
+            img_ids_width = self.width // 2
+            return self.random_int_tensor(
+                [self.batch_size, img_ids_height * img_ids_width, 3]
+                if is_diffusers_version("<", "0.31.0")
+                else [img_ids_height * img_ids_width, 3],
+                min_value=0,
+                max_value=min(img_ids_height, img_ids_width),
+                framework=framework,
+                dtype=float_dtype,
+            )
+
+        return super().generate(input_name, framework, int_dtype, float_dtype)
+
+
+class DummyFluxTextInputGenerator(DummySeq2SeqDecoderTextInputGenerator):
+    SUPPORTED_INPUT_NAMES = (
+        "decoder_input_ids",
+        "decoder_attention_mask",
+        "encoder_outputs",
+        "encoder_hidden_states",
+        "txt_ids",
+    )
+
+    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
+        if input_name == "txt_ids":
+            import torch
+
+            shape = (
+                [self.batch_size, self.sequence_length, 3]
+                if is_diffusers_version("<", "0.31.0")
+                else [self.sequence_length, 3]
+            )
+            dtype = DTYPE_MAPPER.pt(float_dtype)
+            return torch.full(shape, 0, dtype=dtype)
+        return super().generate(input_name, framework, int_dtype, float_dtype)
+
+
+@register_in_tasks_manager("flux-transformer", *["semantic-segmentation"], library_name="diffusers")
+class FluxTransformerOpenVINOConfig(SD3TransformerOpenVINOConfig):
+    DUMMY_INPUT_GENERATOR_CLASSES = (
+        DummyTransformerTimestpsInputGenerator,
+        DummyFluxTransformerInputGenerator,
+        DummyFluxTextInputGenerator,
+        PooledProjectionsDummyInputGenerator,
+    )
+
+    @property
+    def inputs(self):
+        common_inputs = super().inputs
+        common_inputs.pop("sample", None)
+        common_inputs["hidden_states"] = {0: "batch_size", 1: "packed_height_width"}
+        common_inputs["txt_ids"] = (
+            {0: "batch_size", 1: "sequence_length"} if is_diffusers_version("<", "0.31.0") else {0: "sequence_length"}
+        )
+        common_inputs["img_ids"] = (
+            {0: "batch_size", 1: "packed_height_width"}
+            if is_diffusers_version("<", "0.31.0")
+            else {0: "packed_height_width"}
+        )
+        if getattr(self._normalized_config, "guidance_embeds", False):
+            common_inputs["guidance"] = {0: "batch_size"}
+        return common_inputs
+
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ) -> ModelPatcher:
+        return FluxTransfromerModelPatcher(self, model, model_kwargs=model_kwargs)
+
+
+class DummyMiniCPMVImageInputGenerator(DummyVisionInputGenerator):
+    SUPPORTED_INPUT_NAMES = ("pixel_values", "patch_attention_mask", "position_ids")
+
+    def __init__(
+        self,
+        task: str,
+        normalized_config: NormalizedVisionConfig,
+        batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
+        num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
+        width: int = DEFAULT_DUMMY_SHAPES["width"],
+        height: int = DEFAULT_DUMMY_SHAPES["height"],
+        **kwargs,
+    ):
+        super().__init__(task, normalized_config, batch_size, num_channels, width, height)
+        self.patch_size = normalized_config.config.patch_size
+
+    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
+        if input_name == "pixel_values":
+            return self.random_float_tensor(
+                shape=[
+                    self.batch_size,
+                    self.num_channels,
+                    self.patch_size,
+                    (self.height * self.width) // self.patch_size,
+                ],
+                framework=framework,
+                dtype=float_dtype,
+            )
+
+        if input_name == "patch_attention_mask":
+            return self.random_int_tensor(
+                shape=[self.batch_size, 1, (self.height // self.patch_size) * (self.width // self.patch_size)],
+                framework=framework,
+                dtype=float_dtype,
+                min_value=0,
+                max_value=2,
+            )
+
+        if input_name == "position_ids":
+            return self.random_int_tensor(
+                shape=[self.batch_size, (self.height // self.patch_size) * (self.width // self.patch_size)],
+                max_value=self.patch_size,
+            )
+
+
+class DummyMiniCPMVResampleInputGenerator(DummyVisionInputGenerator):
+    SUPPORTED_INPUT_NAMES = ("image_feature", "pos_embed", "key_padding_mask")
+
+    def __init__(
+        self,
+        task: str,
+        normalized_config: NormalizedVisionConfig,
+        batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
+        num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
+        width: int = DEFAULT_DUMMY_SHAPES["width"],
+        height: int = DEFAULT_DUMMY_SHAPES["height"],
+        **kwargs,
+    ):
+        super().__init__(task, normalized_config, batch_size, num_channels, width, height)
+        self.patch_size = normalized_config.config.patch_size
+        self.hidden_size = normalized_config.config.hidden_size
+        self.img_hidden_size = normalized_config.config.vision_config.hidden_size
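+        # number of patch tokens produced by the vision encoder for a single image: (image_size / patch_size) ** 2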
+        self.feat_size = (normalized_config.config.vision_config.image_size // self.patch_size) * (
+            normalized_config.config.vision_config.image_size // self.patch_size
+        )
+
+    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
+        if input_name == "image_feature":
+            return self.random_float_tensor(
+                shape=[self.batch_size, self.feat_size, self.img_hidden_size], framework=framework, dtype=float_dtype
+            )
+
+        if input_name == "key_padding_mask":
+            return self.constant_tensor(
+                shape=[self.batch_size, self.feat_size],
+                framework=framework,
+                value=1,
+                dtype=DTYPE_MAPPER.pt(float_dtype),
+            )
+
+        if input_name == "pos_embed":
+            return self.random_float_tensor(shape=[self.feat_size, self.batch_size, self.hidden_size])
+
+
+class MiniCPMVConfigBehavior(str, enum.Enum):
+    RESAMPLER = "resampler"
+    LANGUAGE = "language"
+    VISION_EMBEDDINGS = "vision_embeddings"
+    TEXT_EMBEDDINGS = "text_embeddings"
+
+
+@register_in_tasks_manager("minicpmv", *["image-text-to-text"], library_name="transformers")
+class MiniCPMVOpenVINOConfig(OnnxConfig):
+    SUPPORTED_BEHAVIORS = [model_type.value for model_type in MiniCPMVConfigBehavior]
+    NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
+    DUMMY_INPUT_GENERATOR_CLASSES = ()
+
+    def __init__(
+        self,
+        config: "PretrainedConfig",
+        task: str = "feature-extraction",
+        int_dtype: str = "int64",
+        float_dtype: str = "fp32",
+        behavior: MiniCPMVConfigBehavior = MiniCPMVConfigBehavior.VISION_EMBEDDINGS,
+        preprocessors: Optional[List[Any]] = None,
+    ):
+        super().__init__(
+            config=config,
+            task=task,
+            int_dtype=int_dtype,
+            float_dtype=float_dtype,
+            preprocessors=preprocessors,
+        )
+        self._behavior = behavior
+        self._orig_config = config
+        if self._behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS and hasattr(config, "vision_config"):
+            self._config = config.vision_config
+            self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyMiniCPMVImageInputGenerator,)
+        if self._behavior == MiniCPMVConfigBehavior.RESAMPLER:
+            self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyMiniCPMVResampleInputGenerator,)
+        self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config)
+
+    @property
+    def inputs(self) -> Dict[str, Dict[int, str]]:
+        if self._behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS:
+            return {
+                "pixel_values": {0: "batch_size", 2: "height", 3: "width"},
+                "patch_attention_mask": {0: "batch_size", 1: "num_patches", 2: "patch_size"},
+                "position_ids": {0: "batch_size", 1: "patch_size"},
+            }
+        if self._behavior == MiniCPMVConfigBehavior.RESAMPLER:
+            return {
+                "image_feature": {0: "batch_size", 1: "patch_height", 2: "patch_width"},
+                "pos_embed": {0: "patch_size", 1: "batch_size", 2: "num_patches"},
+                "key_padding_mask": {0: "batch_size", 1: "patch_size"},
+            }
+        return {}
+
+    @property
+    def outputs(self) -> Dict[str, Dict[int, str]]:
+        if self._behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS:
+            return {"last_hidden_state": {0: "batch_size", 1: "patch_height", 2: "patch_width"}}
+        if self._behavior == MiniCPMVConfigBehavior.RESAMPLER:
+            return {"last_hidden_state": {0: "batch_size"}}
+
+        return {}
+
+    def with_behavior(
+        self,
+        behavior: Union[str, MiniCPMVConfigBehavior],
+    ):
+        """
+        Creates a config for a different behavior.
+        Args:
+            behavior ([`MiniCPMVConfigBehavior`]):
+                The behavior to use for the new instance.
+        """
+        if isinstance(behavior, str) and not isinstance(behavior, MiniCPMVConfigBehavior):
+            behavior = MiniCPMVConfigBehavior(behavior)
+
+        if behavior == MiniCPMVConfigBehavior.TEXT_EMBEDDINGS:
+            return get_vlm_text_embeddings_config("qwen2", self._orig_config, self.int_dtype, self.float_dtype)
+
+        if behavior == MiniCPMVConfigBehavior.LANGUAGE:
+            return get_vlm_text_generation_config("qwen2", self._orig_config, self.int_dtype, self.float_dtype)
+
+        if behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS:
+            return self.__class__(
+                self._orig_config,
+                task=self.task,
+                int_dtype=self.int_dtype,
+                float_dtype=self.float_dtype,
+                behavior=behavior,
+                preprocessors=self._preprocessors,
+            )
+
+        if behavior == MiniCPMVConfigBehavior.RESAMPLER:
+            return self.__class__(
+                self._orig_config,
+                task=self.task,
+                int_dtype=self.int_dtype,
+                float_dtype=self.float_dtype,
+                behavior=behavior,
+                preprocessors=self._preprocessors,
+            )
+
+    @staticmethod
+    def get_model_for_behavior(model, behavior: Union[str, MiniCPMVConfigBehavior]):
+        if isinstance(behavior, str) and not isinstance(behavior, MiniCPMVConfigBehavior):
+            behavior = MiniCPMVConfigBehavior(behavior)
+
+        if behavior == MiniCPMVConfigBehavior.LANGUAGE:
+            return model.llm
+
+        if behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS:
+            return model.vpm
+
+        if behavior == MiniCPMVConfigBehavior.TEXT_EMBEDDINGS:
+            text_embedding = model.get_input_embeddings()
+            text_embedding.config = model.llm.config
+            return text_embedding
+        if behavior == MiniCPMVConfigBehavior.RESAMPLER:
+            model.resampler.config = model.vpm.config
+            return model.resampler
+
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ):
+        model_kwargs = model_kwargs or {}
+        if self._behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS:
+            return MiniCPMVImageEmbeddingsModelPatcher(self, model, model_kwargs)
+
+        if self._behavior == MiniCPMVConfigBehavior.RESAMPLER:
+            return MiniCPMVResamplerModelPatcher(self, model, model_kwargs)
+
+        return super().patch_model_for_export(model, model_kwargs)
+
+
+class Phi3VisionConfigBehavior(str, enum.Enum):
+    LANGUAGE = "language"
+    VISION_PROJECTION = "vision_projection"
+    VISION_EMBEDDINGS = "vision_embeddings"
+    TEXT_EMBEDDINGS = "text_embeddings"
+
+
+class DummyPhi3VisionProjectionInputGenerator(DummyVisionInputGenerator):
+    SUPPORTED_INPUT_NAMES = ("input",)
+
+    def __init__(
+        self,
+        task: str,
+        normalized_config: NormalizedVisionConfig,
+        batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
+        num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
+        width: int = 336,
+        height: int = 336,
+        **kwargs,
+    ):
+        self.batch_size = batch_size
+        self._embed_layer_realization = normalized_config.config.embd_layer["embedding_cls"]
+        self.image_dim_out = normalized_config.config.img_processor["image_dim_out"]
+        self.height = height
+        self.width = width
+
+    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
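+        # expected number of projected visual tokens for an image split into h x w crops of 336x336
+        # (a 12x12 token grid per crop plus global-image and separator tokens) - inferred from the formula below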
+        h = self.height // 336
+        w = self.width // 336
+        feat_size = (h * w + 1) * 144 + 1 + (h + 1) * 12
+        if self._embed_layer_realization == "linear":
+            shape = [self.batch_size, feat_size, self.image_dim_out]
+        else:
+            shape = [self.batch_size, feat_size, self.image_dim_out * 4]
+        return self.random_float_tensor(shape, framework=framework, dtype=float_dtype)
+
+
+@register_in_tasks_manager("phi3-v", *["image-text-to-text"], library_name="transformers")
+class Phi3VisionOpenVINOConfig(OnnxConfig):
+    SUPPORTED_BEHAVIORS = [model_type.value for model_type in Phi3VisionConfigBehavior]
+    NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
+    DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator,)
+    MIN_TRANSFORMERS_VERSION = version.parse("4.40.0")
+
+    def __init__(
+        self,
+        config: "PretrainedConfig",
+        task: str = "feature-extraction",
+        int_dtype: str = "int64",
+        float_dtype: str = "fp32",
+        behavior: Phi3VisionConfigBehavior = Phi3VisionConfigBehavior.VISION_EMBEDDINGS,
+        preprocessors: Optional[List[Any]] = None,
+    ):
+        super().__init__(
+            config=config,
+            task=task,
+            int_dtype=int_dtype,
+            float_dtype=float_dtype,
+            preprocessors=preprocessors,
+        )
+        self._behavior = behavior
+        self._orig_config = config
+        if self._behavior == Phi3VisionConfigBehavior.VISION_EMBEDDINGS and hasattr(config, "img_processor"):
+            self._config = AutoConfig.from_pretrained(
+                config.img_processor["model_name"], trust_remote_code=True
+            ).vision_config
+            self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config)
+            self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator,)
+        if self._behavior == Phi3VisionConfigBehavior.VISION_PROJECTION and hasattr(config, "img_processor"):
+            self._config = config
+            self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config)
+            self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyPhi3VisionProjectionInputGenerator,)
+
+    @property
+    def inputs(self) -> Dict[str, Dict[int, str]]:
+        if self._behavior == Phi3VisionConfigBehavior.VISION_EMBEDDINGS:
+            return {"pixel_values": {0: "batch_size", 2: "height", 3: "width"}}
+        if self._behavior == Phi3VisionConfigBehavior.VISION_PROJECTION:
+            return {"input": {0: "batch_size", 1: "img_feat_size"}}
+
+    @property
+    def outputs(self) -> Dict[str, Dict[int, str]]:
+        if self._behavior in [Phi3VisionConfigBehavior.VISION_EMBEDDINGS, Phi3VisionConfigBehavior.VISION_PROJECTION]:
+            return {"last_hidden_state": {0: "batch_size", 1: "height_width_projection"}}
+        return {}
+
+    def with_behavior(
+        self,
+        behavior: Union[str, Phi3VisionConfigBehavior],
+    ):
+        """
+        Creates a config for a different behavior.
+        Args:
+            behavior ([`Phi3VisionConfigBehavior`]):
+                The behavior to use for the new instance.
+        """
+        if isinstance(behavior, str) and not isinstance(behavior, Phi3VisionConfigBehavior):
+            behavior = Phi3VisionConfigBehavior(behavior)
+
+        if behavior == Phi3VisionConfigBehavior.TEXT_EMBEDDINGS:
+            return get_vlm_text_embeddings_config("phi3", self._orig_config, self.int_dtype, self.float_dtype)
+
+        if behavior == Phi3VisionConfigBehavior.LANGUAGE:
+            return get_vlm_text_generation_config("phi3", self._orig_config, self.int_dtype, self.float_dtype)
+
+        if behavior == Phi3VisionConfigBehavior.VISION_EMBEDDINGS:
+            return self.__class__(
+                self._orig_config,
+                task=self.task,
+                int_dtype=self.int_dtype,
+                float_dtype=self.float_dtype,
+                behavior=behavior,
+                preprocessors=self._preprocessors,
+            )
+        if behavior == Phi3VisionConfigBehavior.VISION_PROJECTION:
+            return self.__class__(
+                self._orig_config,
+                task=self.task,
+                int_dtype=self.int_dtype,
+                float_dtype=self.float_dtype,
+                behavior=behavior,
+                preprocessors=self._preprocessors,
+            )
+
+    @staticmethod
+    def get_model_for_behavior(model, behavior: Union[str, Phi3VisionConfigBehavior]):
+        if isinstance(behavior, str) and not isinstance(behavior, Phi3VisionConfigBehavior):
+            behavior = Phi3VisionConfigBehavior(behavior)
+
+        if behavior == Phi3VisionConfigBehavior.LANGUAGE:
+            return model
+
+        if behavior == Phi3VisionConfigBehavior.VISION_EMBEDDINGS:
+            vision_embeddings = model.model.vision_embed_tokens
+            vision_embeddings.config = model.config
+            return vision_embeddings
+
+        if behavior == Phi3VisionConfigBehavior.VISION_PROJECTION:
+            projection = model.model.vision_embed_tokens.img_projection
+            projection.config = model.config
+            return projection
+
+        if behavior == Phi3VisionConfigBehavior.TEXT_EMBEDDINGS:
+            text_embedding = model.model.embed_tokens
+            text_embedding.config = model.config
+            return text_embedding
+
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ):
+        model_kwargs = model_kwargs or {}
+        if self._behavior == Phi3VisionConfigBehavior.VISION_EMBEDDINGS:
+            return Phi3VisionImageEmbeddingsPatcher(self, model, model_kwargs)
+        return super().patch_model_for_export(model, model_kwargs)
diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index eadce6d382..58659e637b 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -21,7 +21,7 @@
 
 import torch
 import torch.nn.functional as F
-from transformers.modeling_outputs import BaseModelOutputWithPast
+from transformers.modeling_outputs import BaseModelOutputWithPast, BaseModelOutputWithPooling
 from transformers.utils import is_tf_available
 
 from optimum.exporters.onnx.model_patcher import DecoderModelPatcher, ModelPatcher, override_arguments
@@ -29,6 +29,7 @@
     _openvino_version,
     _torch_version,
     _transformers_version,
+    is_diffusers_version,
     is_openvino_version,
     is_torch_version,
     is_transformers_version,
@@ -108,11 +109,20 @@ def patch_model_with_bettertransformer(model):
     return model
 
 
-def patch_update_causal_mask(model, transformers_version):
+def patch_update_causal_mask(model, transformers_version, inner_model_name="model", patch_fn=None):
     if is_transformers_version(">=", transformers_version):
-        inner_model = getattr(model, "model", getattr(model, "transformer", None))
+        inner_model = getattr(model, inner_model_name, None)
         if inner_model is not None:
-            inner_model._update_causal_mask = types.MethodType(_llama_gemma_update_causal_mask, inner_model)
+            if hasattr(inner_model, "_update_causal_mask"):
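+                # keep a reference to the original implementation so unpatch_update_causal_mask can restore it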
+                inner_model._orig_update_causal_mask = inner_model._update_causal_mask
+            patch_fn = patch_fn or _llama_gemma_update_causal_mask
+            inner_model._update_causal_mask = types.MethodType(patch_fn, inner_model)
+
+
+def unpatch_update_causal_mask(model, inner_model_name="model"):
+    inner_model = getattr(model, inner_model_name, None)
+    if inner_model is not None and hasattr(inner_model, "_orig_update_causal_mask"):
+        inner_model._update_causal_mask = inner_model._orig_update_causal_mask
 
 
 # initialization of sin/cos cached in bf16/fp16 leads to accuracy loss
@@ -578,13 +588,11 @@ def __enter__(self):
 
         # llama/gemma has some accuracy issues with bf16 with transformers >= 4.39
         # fill causal mask in slightly different way for avoid overflow on some platforms
-        patch_update_causal_mask(self._model, "4.39.0")
+        patch_update_causal_mask(self._model, "4.39.0", "model" if hasattr(self._model, "model") else "transformer")
 
     def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
-        inner_model = getattr(self._model, "model", getattr(self._model, "transformer", None))
-        if hasattr(inner_model, "_orig_update_causal_mask"):
-            inner_model._update_causal_mask = inner_model._orig_update_causal_mask
+        unpatch_update_causal_mask(self._model, "model" if hasattr(self._model, "model") else "transformer")
 
 
 # copied from https://github.com/huggingface/transformers/commit/57d7594a79a9f5d835abf2d4d384db0e4818e548 to unblock export with transformers 4.42
@@ -1361,6 +1369,7 @@ def phi3_442_forward(
     output_attentions: Optional[bool] = None,
     output_hidden_states: Optional[bool] = None,
     return_dict: Optional[bool] = None,
+    **kwargs,
 ) -> Union[Tuple, BaseModelOutputWithPast]:
     from transformers.cache_utils import Cache, DynamicCache
     from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
@@ -1864,6 +1873,67 @@ def __exit__(self, exc_type, exc_value, traceback):
                 layer.self_attn.forward = layer.self_attn._orig_forward
 
 
+# copied from https://github.com/huggingface/optimum/blob/2112e99122d7f23a1da1a9d263fef64301050ea7/optimum/bettertransformer/models/attention.py#L168
+# to preserve backward compatibility between outdated codegen remote code and new transformers
+def _codegen_wrapped_scaled_dot_product_legacy(
+    self,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor] = None,
+    head_mask: Optional[torch.Tensor] = None,
+):
+    from optimum.bettertransformer.models.attention import raise_on_head_mask
+
+    raise_on_head_mask(head_mask)
+    batch_size = query.shape[0]
+    mask_value = torch.finfo(value.dtype).min
+    mask_value = torch.full([], mask_value, dtype=value.dtype)
+
+    if batch_size == 1 and attention_mask is not None and attention_mask[0, 0, -1, -1] < -1:
+        raise ValueError("BetterTransformer does not support padding='max_length' with a batch size of 1.")
+
+    # in codegen the query and key are always in fp32 regardless of the dtype of the model
+    # https://github.com/huggingface/transformers/blob/5b28b7833297adf65c5160a685425ddb1eee5ce2/src/transformers/models/codegen/modeling_codegen.py#L226
+    query = query.to(value.dtype)
+    key = key.to(value.dtype)
+
+    dropout_p = self.dropout_prob_attn if self.training else 0.0
+    if batch_size == 1 or self.training:
+        if query.shape[2] > 1:
+            # first step of the decoding
+            sdpa_result = torch.nn.functional.scaled_dot_product_attention(
+                query, key, value, attn_mask=None, dropout_p=dropout_p, is_causal=True
+            )
+        else:
+            # in this case, which is the later decoding steps, the `causal_mask` in
+            # https://github.com/huggingface/transformers/blob/ae54e3c3b18bac0832ad62ea9b896dfd52a09850/src/transformers/models/gpt2/modeling_gpt2.py#L195
+            # is [True, ..., True] so actually not causal
+            sdpa_result = torch.nn.functional.scaled_dot_product_attention(
+                query, key, value, attn_mask=None, dropout_p=dropout_p, is_causal=False
+            )
+    else:
+        query_length, key_length = query.size(-2), key.size(-2)
+
+        # causal_mask is always [True, ..., True] otherwise, so executing this is unnecessary
+        if query_length > 1:
+            causal_mask = self.causal_mask[:, :, key_length - query_length : key_length, :key_length].to(torch.bool)
+
+            causal_mask = torch.where(causal_mask, 0, mask_value)
+
+            # torch.Tensor.expand does no memory copy
+            causal_mask = causal_mask.expand(batch_size, -1, -1, -1)
+
+            # we use torch.min to avoid having tensor(-inf)
+            attention_mask = torch.min(causal_mask, attention_mask)
+
+        sdpa_result = torch.nn.functional.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=dropout_p, is_causal=False
+        )
+
+    return sdpa_result, None
+
+
 class CodeGenModelPatcher(DecoderModelPatcher):
     def __enter__(self):
         super().__enter__()
@@ -1872,14 +1942,23 @@ def __enter__(self):
         # For avoiding breaking model on tracing stage, we reduce area of bettertransformer patch only for _attn.
         from optimum.bettertransformer.models.attention import codegen_wrapped_scaled_dot_product
 
+        attn_fn = codegen_wrapped_scaled_dot_product
+        if is_torch_version(">=", "2.1.0") and is_transformers_version(">=", "4.45"):
+            # in transformers 4.45 causal_mask const buffer was removed from the model
+            # if it still exists, it means legacy remote code was loaded
+            if hasattr(self._model.transformer.h[0].attn, "causal_mask"):
+                attn_fn = _codegen_wrapped_scaled_dot_product_legacy
+
         for layer in self._model.transformer.h:
             if is_torch_version(">=", "2.1.0") and not self._model.config.output_attentions:
                 orig_self_attn_fwd = layer.attn._attn
-                layer.attn._attn = types.MethodType(codegen_wrapped_scaled_dot_product, layer.attn)
+                layer.attn._attn = types.MethodType(attn_fn, layer.attn)
                 layer.attn._orig_attn = orig_self_attn_fwd
+        patch_update_causal_mask(self._model, "4.45.0", "transformer")
 
     def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
+        unpatch_update_causal_mask(self._model, "transformer")
         for layer in self._model.transformer.h:
             if hasattr(layer.attn, "_orig_attn"):
                 layer.attn._attn = layer.attn._orig_attn
@@ -2274,8 +2353,7 @@ def __enter__(self):
 
     def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
-        if hasattr(self._model.model, "_orig_update_causal_mask"):
-            self._model.model._update_causal_mask = self._model.model._orig_update_causal_mask
+        unpatch_update_causal_mask(self._model)
         for layer in self._model.model.layers:
             if hasattr(layer.self_attn, "_orig_forward"):
                 layer.self_attn.forward = layer.self_attn._orig_forward
@@ -2412,8 +2490,7 @@ def __enter__(self):
 
     def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
-        if hasattr(self._model.model, "_orig_update_causal_mask"):
-            self._model.model._update_causal_mask = self._model.model._orig_update_causal_mask
+        unpatch_update_causal_mask(self._model)
 
 
 class RotaryEmbPatcher(DecoderModelPatcher):
@@ -2424,12 +2501,119 @@ def __enter__(self):
                 _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb)
 
 
+def _falcon_update_causal_mask(
+    self,
+    attention_mask: torch.Tensor,
+    input_tensor: torch.Tensor,
+    cache_position: torch.Tensor,
+    past_key_values: "Cache",
+    output_attentions: bool,
+    head_mask: torch.Tensor,
+    alibi: torch.Tensor,
+):
+    # copied from https://github.com/huggingface/transformers/blob/a30c865f991dfec9452cc64bd9a97bfbb96be036/src/transformers/models/falcon/modeling_falcon.py#L1130
+    from transformers.cache_utils import StaticCache
+    from transformers.modeling_attn_mask_utils import AttentionMaskConverter
+
+    # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
+    # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
+    # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
+    # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114
+
+    if hasattr(self, "_prepare_4d_causal_attention_mask_with_cache_position"):
+        _prepare_4d_causal_attention_mask_with_cache_position = (
+            self._prepare_4d_causal_attention_mask_with_cache_position
+        )
+    else:
+        from transformers.models.falcon.modeling_falcon import _prepare_4d_causal_attention_mask_with_cache_position
+
+    if self.config._attn_implementation == "flash_attention_2":
+        if attention_mask is not None and 0.0 in attention_mask:
+            return attention_mask
+        return None
+
+    # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+    # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+    # to infer the attention mask.
+    past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+    using_static_cache = isinstance(past_key_values, StaticCache)
+
+    # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+    if (
+        self.config._attn_implementation == "sdpa"
+        and not using_static_cache
+        and not output_attentions
+        and head_mask is None
+        and alibi is None
+    ):
+        if AttentionMaskConverter._ignore_causal_mask_sdpa(
+            attention_mask,
+            inputs_embeds=input_tensor,
+            past_key_values_length=past_seen_tokens,
+            is_training=self.training,
+        ):
+            return None
+
+    dtype, device = input_tensor.dtype, input_tensor.device
+    # difference from the original: use the float16 minimum instead of torch.finfo(dtype).min to prevent overflow during fp16/bf16 execution
+    min_dtype = torch.finfo(torch.float16).min
+    batch_size, sequence_length, _ = input_tensor.shape
+    if using_static_cache:
+        target_length = past_key_values.get_max_length()
+    else:
+        target_length = (
+            attention_mask.shape[-1]
+            if isinstance(attention_mask, torch.Tensor)
+            else past_seen_tokens + sequence_length
+        )
+
+    # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
+    causal_mask = _prepare_4d_causal_attention_mask_with_cache_position(
+        attention_mask,
+        sequence_length=sequence_length,
+        target_length=target_length,
+        dtype=dtype,
+        device=device,
+        min_dtype=min_dtype,
+        cache_position=cache_position,
+        batch_size=input_tensor.shape[0],
+    )
+
+    # We take care to integrate alibi bias in the causal_mask here
+    if head_mask is None and alibi is not None:
+        alibi = alibi.reshape(batch_size, -1, *alibi.shape[1:])
+        causal_mask = torch.masked_fill(
+            alibi / math.sqrt(self.config.hidden_size // self.num_heads),
+            causal_mask < -1,
+            min_dtype,
+        )
+
+    if (
+        self.config._attn_implementation == "sdpa"
+        and attention_mask is not None
+        and attention_mask.device.type == "cuda"
+        and not output_attentions
+    ):
+        # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+        # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+        # Details: https://github.com/pytorch/pytorch/issues/110213
+        causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+    return causal_mask
+
+
 class FalconModelPatcher(DecoderModelPatcher):
     def __enter__(self):
         super().__enter__()
         if is_transformers_version("<", "4.44.99"):
             for layer in self._model.transformer.h:
                 _reinitialize_cos_sin_cached_fp32(layer.self_attention.rotary_emb)
+        else:
+            patch_update_causal_mask(self._model, "4.45.0", "transformer", _falcon_update_causal_mask)
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        unpatch_update_causal_mask(self._model, "transformer")
 
 
 class GptNeoxModelPatcher(DecoderModelPatcher):
@@ -2438,6 +2622,22 @@ def __enter__(self):
         if is_transformers_version("<", "4.44.99"):
             for layer in self._model.gpt_neox.layers:
                 _reinitialize_cos_sin_cached_fp32(layer.attention.rotary_emb)
+        else:
+            patch_update_causal_mask(self._model, "4.45.0", "gpt_neox")
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        unpatch_update_causal_mask(self._model, "gpt_neox")
+
+
+class GptJModelPatcher(DecoderModelPatcher):
+    def __enter__(self):
+        super().__enter__()
+        patch_update_causal_mask(self._model, "4.45.0", "transformer")
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        unpatch_update_causal_mask(self._model, "transformer")
 
 
 class GptNeoxJapaneseModelPatcher(DecoderModelPatcher):
@@ -2446,6 +2646,12 @@ def __enter__(self):
         if is_transformers_version("<", "4.44.99"):
             for layer in self._model.gpt_neox_japanese.layers:
                 _reinitialize_cos_sin_cached_fp32(layer.attention.rotary_emb)
+        else:
+            patch_update_causal_mask(self._model, "4.45.0", "gpt_neox_japanese")
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        unpatch_update_causal_mask(self._model, "gpt_neox_japanese")
 
 
 class Gemma2ModelPatcher(LlamaModelPatcher):
@@ -2504,6 +2710,26 @@ def patched_forward(*args, **kwargs):
 
         self.patched_forward = patched_forward
 
+    def __enter__(self):
+        super().__enter__()
+        if is_transformers_version(">=", "4.45.0"):
+            from transformers.models.gemma2.modeling_gemma2 import GEMMA2_ATTENTION_CLASSES
+
+            sdpa_attn = GEMMA2_ATTENTION_CLASSES["sdpa"]
+            eager_attn = GEMMA2_ATTENTION_CLASSES["eager"]
+
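+            # replace the eager attention forward with the sdpa one so the exported graph uses scaled_dot_product_attention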
+            for layer in self._model.model.layers:
+                if isinstance(layer.self_attn, eager_attn):
+                    layer.self_attn._orig_forward = layer.self_attn.forward
+                    layer.self_attn.forward = types.MethodType(sdpa_attn.forward, layer.self_attn)
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        if is_transformers_version(">=", "4.45.0"):
+            for layer in self._model.model.layers:
+                if hasattr(layer.self_attn, "_orig_forward"):
+                    layer.self_attn.forward = layer.self_attn._orig_forward
+
 
 def _decilm_attn_forward(
     self,
@@ -2705,3 +2931,309 @@ def __init__(
     def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
         self._model.forward = self._model.__orig_forward
+
+
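+# Re-implementation of the rotary position embedding used by Flux's EmbedND in diffusers < 0.31,
+# computed in float32 so that the positional embedding module stays export-friendly (assumed reason for the patch)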
+def _embednb_forward(self, ids: torch.Tensor) -> torch.Tensor:
+    def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
+        assert dim % 2 == 0, "The dimension must be even."
+
+        scale = torch.arange(0, dim, 2, dtype=torch.float32, device=pos.device) / dim
+        omega = 1.0 / (theta**scale)
+
+        batch_size, seq_length = pos.shape
+        out = pos.unsqueeze(-1) * omega.unsqueeze(0).unsqueeze(0)
+        cos_out = torch.cos(out)
+        sin_out = torch.sin(out)
+
+        stacked_out = torch.stack([cos_out, -sin_out, sin_out, cos_out], dim=-1)
+        out = stacked_out.view(batch_size, -1, dim // 2, 2, 2)
+        return out.float()
+
+    n_axes = ids.shape[-1]
+    emb = torch.cat(
+        [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
+        dim=-3,
+    )
+    return emb.unsqueeze(1)
+
+
+class FluxTransfromerModelPatcher(ModelPatcher):
+    def __enter__(self):
+        super().__enter__()
+        if is_diffusers_version("<", "0.31.0"):
+            self._model.pos_embed._orig_forward = self._model.pos_embed.forward
+            self._model.pos_embed.forward = types.MethodType(_embednb_forward, self._model.pos_embed)
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        if hasattr(self._model.pos_embed, "_orig_forward"):
+            self._model.pos_embed.forward = self._model.pos_embed._orig_forward
+
+
+def _minicpmv_resampler_forward(self, image_feature, pos_embed, key_padding_mask):
+    bs = image_feature.shape[0]
+    image_feature = self.kv_proj(image_feature)  # B * L * D
+    image_feature = self.ln_kv(image_feature).permute(1, 0, 2)  # L * B * D
+
+    q = self.ln_q(self.query)  # Q * D
+
+    q_bs = q.unsqueeze(1).repeat(1, bs, 1)
+
+    out = self.attn(q_bs, image_feature + pos_embed, image_feature, key_padding_mask=key_padding_mask)[
+        0
+    ]  # Q * B * D  # L * B * D +  L * B * D
+    #  out: Q * B * D
+    x = out.permute(1, 0, 2)  # B * Q * D
+
+    x = self.ln_post(x)
+    x = x @ self.proj
+    return x
+
+
+def _minicpmv_siglip_vis_embed_forward(
+    self,
+    pixel_values: torch.FloatTensor,
+    patch_attention_mask: torch.BoolTensor,
+    tgt_sizes: Optional[torch.IntTensor] = None,
+    position_ids: Optional[torch.FloatTensor] = None,
+) -> torch.Tensor:
+    patch_embeds = self.patch_embedding(pixel_values)
+    embeddings = patch_embeds.flatten(2).transpose(1, 2)
+
+    if position_ids is None:
+        batch_size = pixel_values.size(0)
+        max_im_h, max_im_w = pixel_values.size(2), pixel_values.size(3)
+        max_nb_patches_h, max_nb_patches_w = max_im_h // self.patch_size, max_im_w // self.patch_size
+        boundaries = torch.arange(1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side)
+        position_ids = torch.full(
+            size=(
+                batch_size,
+                max_nb_patches_h * max_nb_patches_w,
+            ),
+            fill_value=0,
+        )
+
+        for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
+            if tgt_sizes is not None:
+                nb_patches_h = tgt_sizes[batch_idx][0]
+                nb_patches_w = tgt_sizes[batch_idx][1]
+            else:
+                nb_patches_h = p_attn_mask[:, 0].sum()
+                nb_patches_w = p_attn_mask[0].sum()
+
+            fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
+            fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)
+
+            bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True)
+            bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True)
+
+            pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten()
+            position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids
+
+    position_ids = position_ids.to(self.position_embedding.weight.device)
+
+    embeddings = embeddings + self.position_embedding(position_ids)
+    return embeddings
+
+
+def _minicpmv_siglip_attn_forward(
+    self,
+    hidden_states: torch.Tensor,
+    attention_mask: Optional[torch.Tensor] = None,
+    output_attentions: Optional[bool] = False,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+    """Input shape: Batch x Time x Channel"""
+
+    batch_size, q_len, _ = hidden_states.size()
+
+    query_states = self.q_proj(hidden_states)
+    key_states = self.k_proj(hidden_states)
+    value_states = self.v_proj(hidden_states)
+
+    query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+    key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+    value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+
+    attn_output = torch.nn.functional.scaled_dot_product_attention(
+        query_states, key_states, value_states, attention_mask, is_causal=attention_mask is None
+    )
+
+    attn_output = attn_output.transpose(1, 2).contiguous()
+    attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim)
+
+    attn_output = self.out_proj(attn_output)
+
+    return attn_output, None
+
+
+def _minicpmv_siglip_transformer_forward(
+    self,
+    pixel_values,
+    patch_attention_mask: Optional[torch.BoolTensor] = None,
+    tgt_sizes: Optional[torch.IntTensor] = None,
+    position_ids: Optional[torch.FloatTensor] = None,
+    output_attentions: Optional[bool] = None,
+    output_hidden_states: Optional[bool] = None,
+    return_dict: Optional[bool] = None,
+) -> Union[Tuple, BaseModelOutputWithPooling]:
+    from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
+
+    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+    output_hidden_states = (
+        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+    )
+    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+    batch_size = pixel_values.size(0)
+    if patch_attention_mask is None:
+        patch_attention_mask = torch.ones(
+            size=(
+                batch_size,
+                pixel_values.size(2) // self.config.patch_size,
+                pixel_values.size(3) // self.config.patch_size,
+            ),
+            dtype=torch.bool,
+            device=pixel_values.device,
+        )
+
+    hidden_states = self.embeddings(
+        pixel_values=pixel_values,
+        patch_attention_mask=patch_attention_mask,
+        tgt_sizes=tgt_sizes,
+        position_ids=position_ids,
+    )
+
+    patch_attention_mask = patch_attention_mask.view(batch_size, -1)
+    attention_mask = (
+        _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype)
+        if not self._use_flash_attention_2
+        else patch_attention_mask
+    )
+
+    encoder_outputs = self.encoder(
+        inputs_embeds=hidden_states,
+        attention_mask=attention_mask,
+        output_attentions=output_attentions,
+        output_hidden_states=output_hidden_states,
+        return_dict=return_dict,
+    )
+
+    last_hidden_state = encoder_outputs[0]
+    last_hidden_state = self.post_layernorm(last_hidden_state)
+
+    if not return_dict:
+        return (last_hidden_state, None) + encoder_outputs[1:]
+
+    return BaseModelOutputWithPooling(
+        last_hidden_state=last_hidden_state,
+        pooler_output=None,
+        hidden_states=encoder_outputs.hidden_states,
+        attentions=encoder_outputs.attentions,
+    )
+
+
+class MiniCPMVResamplerModelPatcher(ModelPatcher):
+    def __init__(
+        self,
+        config: "OnnxConfig",
+        model: Union["PreTrainedModel", "TFPreTrainedModel"],
+        model_kwargs: Dict[str, Any],
+    ):
+        model.__orig_forward = model.forward
+        model.forward = types.MethodType(_minicpmv_resampler_forward, model)
+
+        super().__init__(config, model, model_kwargs)
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        self._model.forward = self._model.__orig_forward
+
+
+class MiniCPMVImageEmbeddingsModelPatcher(ModelPatcher):
+    def __init__(
+        self,
+        config: "OnnxConfig",
+        model: Union["PreTrainedModel", "TFPreTrainedModel"],
+        model_kwargs: Dict[str, Any],
+    ):
+        model.__orig_forward = model.forward
+        model.forward = types.MethodType(_minicpmv_siglip_transformer_forward, model)
+
+        super().__init__(config, model, model_kwargs)
+
+    def __enter__(self):
+        super().__enter__()
+        self._model.embeddings._orig_forward = self._model.embeddings.forward
+        self._model.embeddings.forward = types.MethodType(_minicpmv_siglip_vis_embed_forward, self._model.embeddings)
+
+        if is_torch_version(">=", "2.0.0"):
+            for layer in self._model.encoder.layers:
+                layer.self_attn._orig_forward = layer.self_attn.forward
+                layer.self_attn.forward = types.MethodType(_minicpmv_siglip_attn_forward, layer.self_attn)
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        self._model.forward = self._model.__orig_forward
+        self._model.embeddings.forward = self._model.embeddings._orig_forward
+        if is_torch_version(">=", "2.0.0"):
+            for layer in self._model.encoder.layers:
+                layer.self_attn.forward = layer.self_attn._orig_forward
+
+
+class LlavaQwen2ImageEmbeddingsModelPatcher(ModelPatcher):
+    def __init__(
+        self,
+        config: "OnnxConfig",
+        model: Union["PreTrainedModel", "TFPreTrainedModel"],
+        model_kwargs: Dict[str, Any],
+    ):
+        model.__orig_forward = model.forward
+        model.forward = model.encode_images
+        super().__init__(config, model, model_kwargs)
+        if not self._model.get_vision_tower().is_loaded:
+            self._model.get_vision_tower().load_model()
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        self._model.forward = self._model.__orig_forward
+
+
+class InputEmbeddingPatcher(ModelPatcher):
+    def __init__(
+        self,
+        config: "OnnxConfig",
+        model: Union["PreTrainedModel", "TFPreTrainedModel"],
+        model_kwargs: Dict[str, Any],
+    ):
+        model.__orig_forward = model.forward
+
+        def forward(self, input):
+            return self.__orig_forward(input)
+
+        model.forward = types.MethodType(forward, model)
+
+        super().__init__(config, model, model_kwargs)
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        self._model.forward = self._model.__orig_forward
+
+
+def phi3_vision_embeddings_forward(self, pixel_values: torch.FloatTensor):
+    return self.get_img_features(pixel_values)
+
+
+class Phi3VisionImageEmbeddingsPatcher(ModelPatcher):
+    def __init__(
+        self,
+        config: "OnnxConfig",
+        model: Union["PreTrainedModel", "TFPreTrainedModel"],
+        model_kwargs: Dict[str, Any],
+    ):
+        model.__orig_forward = model.forward
+        model.forward = types.MethodType(phi3_vision_embeddings_forward, model)
+        super().__init__(config, model, model_kwargs)
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        super().__exit__(exc_type, exc_value, traceback)
+        self._model.forward = self._model.__orig_forward
diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py
index 9b87472432..7fb1bb5f1d 100644
--- a/optimum/exporters/openvino/utils.py
+++ b/optimum/exporters/openvino/utils.py
@@ -13,7 +13,9 @@
 #  limitations under the License.
 
 import inspect
+import logging
 from collections import namedtuple
+from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 from transformers.utils import is_torch_available
@@ -25,6 +27,9 @@
 from optimum.utils import is_diffusers_available
 
 
+logger = logging.getLogger(__name__)
+
+
 InputInfo = namedtuple("InputInfo", ["name", "shape", "type", "example"])
 
 
@@ -103,7 +108,7 @@ def _get_input_info(
                     symbol = name_to_symbol[dim_name]
                 else:
                     symbol = Symbol()
-                    name_to_symbol[name] = symbol
+                    name_to_symbol[dim_name] = symbol
                 dim = Dimension(-1)
                 dim.set_symbol(symbol)
                 shape[idx] = dim
@@ -208,4 +213,17 @@ def get_submodels(model):
     return custom_export, fn_get_submodels
 
 
-MULTI_MODAL_TEXT_GENERATION_MODELS = ["llava", "llava-next", "internvl-chat"]
+MULTI_MODAL_TEXT_GENERATION_MODELS = ["llava", "llava-next", "llava-qwen2", "internvl-chat", "minicpmv", "phi3-v"]
+
+
+def save_config(config, save_dir):
+    try:
+        config.save_pretrained(save_dir)
+    except Exception as exp:
+        logger.warning(
+            f"Attempt to save config using standard API has failed with {exp}. There may be an issue with model config, please check its correctness before usage."
+        )
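+        # fall back to dumping the config to config.json directly if save_pretrained rejects the config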
+        save_dir = Path(save_dir)
+        save_dir.mkdir(exist_ok=True, parents=True)
+        output_config_file = Path(save_dir / "config.json")
+        config.to_json_file(output_config_file, use_diff=True)
diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py
index fc6b0a7756..67a01011a2 100644
--- a/optimum/intel/__init__.py
+++ b/optimum/intel/__init__.py
@@ -99,7 +99,17 @@
         "OVStableDiffusionInpaintPipeline",
         "OVStableDiffusionXLPipeline",
         "OVStableDiffusionXLImg2ImgPipeline",
+        "OVStableDiffusionXLInpaintPipeline",
+        "OVStableDiffusion3Pipeline",
+        "OVStableDiffusion3Image2ImagePipeline",
+        "OVStableDiffusion3InpaintPipeline",
         "OVLatentConsistencyModelPipeline",
+        "OVLatentConsistencyModelImg2ImgPipeline",
+        "OVFluxPipeline",
+        "OVPipelineForImage2Image",
+        "OVPipelineForText2Image",
+        "OVPipelineForInpainting",
+        "OVDiffusionPipeline",
     ]
 else:
     _import_structure["openvino"].extend(
@@ -109,7 +119,17 @@
             "OVStableDiffusionInpaintPipeline",
             "OVStableDiffusionXLPipeline",
             "OVStableDiffusionXLImg2ImgPipeline",
+            "OVStableDiffusionXLInpaintPipeline",
+            "OVStableDiffusion3Pipeline",
+            "OVStableDiffusion3Image2ImagePipeline",
+            "OVStableDiffusion3InpaintPipeline",
             "OVLatentConsistencyModelPipeline",
+            "OVLatentConsistencyModelImg2ImgPipeline",
+            "OVFluxPipeline",
+            "OVPipelineForImage2Image",
+            "OVPipelineForText2Image",
+            "OVPipelineForInpainting",
+            "OVDiffusionPipeline",
         ]
     )
 
@@ -250,7 +270,15 @@
             raise OptionalDependencyNotAvailable()
     except OptionalDependencyNotAvailable:
         from .utils.dummy_openvino_and_diffusers_objects import (
+            OVDiffusionPipeline,
+            OVFluxPipeline,
             OVLatentConsistencyModelPipeline,
+            OVPipelineForImage2Image,
+            OVPipelineForInpainting,
+            OVPipelineForText2Image,
+            OVStableDiffusion3Img2ImgPipeline,
+            OVStableDiffusion3InpaintPipeline,
+            OVStableDiffusion3Pipeline,
             OVStableDiffusionImg2ImgPipeline,
             OVStableDiffusionInpaintPipeline,
             OVStableDiffusionPipeline,
@@ -259,11 +287,21 @@
         )
     else:
         from .openvino import (
+            OVDiffusionPipeline,
+            OVFluxPipeline,
+            OVLatentConsistencyModelImg2ImgPipeline,
             OVLatentConsistencyModelPipeline,
+            OVPipelineForImage2Image,
+            OVPipelineForInpainting,
+            OVPipelineForText2Image,
+            OVStableDiffusion3Img2ImgPipeline,
+            OVStableDiffusion3InpaintPipeline,
+            OVStableDiffusion3Pipeline,
             OVStableDiffusionImg2ImgPipeline,
             OVStableDiffusionInpaintPipeline,
             OVStableDiffusionPipeline,
             OVStableDiffusionXLImg2ImgPipeline,
+            OVStableDiffusionXLInpaintPipeline,
             OVStableDiffusionXLPipeline,
         )
 
diff --git a/optimum/intel/neural_compressor/trainer.py b/optimum/intel/neural_compressor/trainer.py
index c0fe0cf6d2..c4ecf8570c 100644
--- a/optimum/intel/neural_compressor/trainer.py
+++ b/optimum/intel/neural_compressor/trainer.py
@@ -39,6 +39,7 @@
 from transformers import Trainer
 from transformers.data.data_collator import DataCollator
 from transformers.debug_utils import DebugOption, DebugUnderflowOverflow
+from transformers.feature_extraction_utils import FeatureExtractionMixin
 from transformers.modeling_utils import PreTrainedModel, get_parameter_dtype, unwrap_model
 from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
@@ -104,7 +105,7 @@
     from neural_compressor.config import _BaseQuantizationConfig
 
 
-__version__ = "4.22.2"
+__version__ = "4.46.0"
 
 
 logger = logging.get_logger(__name__)
@@ -122,8 +123,9 @@ def __init__(
         data_collator: Optional[DataCollator] = None,
         train_dataset: Optional[Dataset] = None,
         eval_dataset: Optional[Dataset] = None,
-        tokenizer: Optional[PreTrainedTokenizerBase] = None,
+        processing_class: Optional[Union[PreTrainedTokenizerBase, FeatureExtractionMixin]] = None,
         model_init: Callable[[], PreTrainedModel] = None,
+        compute_loss_func: Optional[Callable] = None,
         compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
         callbacks: Optional[List[TrainerCallback]] = None,
         optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
@@ -132,6 +134,7 @@ def __init__(
         pruning_config: Optional[_BaseQuantizationConfig] = None,
         distillation_config: Optional[_BaseQuantizationConfig] = None,
         task: Optional[str] = None,
+        **kwargs,
     ):
         self.neftune_noise_alpha = None
 
@@ -141,12 +144,12 @@ def __init__(
             data_collator,
             train_dataset,
             eval_dataset,
-            tokenizer,
-            model_init,
-            compute_metrics,
-            callbacks,
-            optimizers,
-            preprocess_logits_for_metrics,
+            processing_class or kwargs.get("tokenizer", None),
+            model_init=model_init,
+            compute_metrics=compute_metrics,
+            callbacks=callbacks,
+            optimizers=optimizers,
+            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
         )
 
         if self.args.device.type == "cuda" and not is_neural_compressor_version(">", "2.0.0"):
@@ -766,7 +769,7 @@ def _get_logits(model_outputs):
         output_names = ["logits", "start_logits", "end_logits"]
         return tuple(model_outputs.get(name) for name in output_names if name in model_outputs)
 
-    def compute_loss(self, model, inputs, return_outputs=False):
+    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
         """
         How the loss is computed by Trainer. By default, all models return the loss in the first element.
         """
diff --git a/optimum/intel/openvino/__init__.py b/optimum/intel/openvino/__init__.py
index 9f3e983ff2..589a0938e3 100644
--- a/optimum/intel/openvino/__init__.py
+++ b/optimum/intel/openvino/__init__.py
@@ -81,11 +81,21 @@
 
 if is_diffusers_available():
     from .modeling_diffusion import (
+        OVDiffusionPipeline,
+        OVFluxPipeline,
+        OVLatentConsistencyModelImg2ImgPipeline,
         OVLatentConsistencyModelPipeline,
+        OVPipelineForImage2Image,
+        OVPipelineForInpainting,
+        OVPipelineForText2Image,
+        OVStableDiffusion3Img2ImgPipeline,
+        OVStableDiffusion3InpaintPipeline,
+        OVStableDiffusion3Pipeline,
         OVStableDiffusionImg2ImgPipeline,
         OVStableDiffusionInpaintPipeline,
         OVStableDiffusionPipeline,
         OVStableDiffusionXLImg2ImgPipeline,
+        OVStableDiffusionXLInpaintPipeline,
         OVStableDiffusionXLPipeline,
     )
 
diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py
index 403498ff70..1dba6c32f1 100644
--- a/optimum/intel/openvino/configuration.py
+++ b/optimum/intel/openvino/configuration.py
@@ -26,6 +26,7 @@
 from optimum.configuration_utils import BaseConfig
 
 from ..utils.import_utils import is_nncf_available
+from .utils import PREDEFINED_SD_DATASETS, PREDEFINED_VISUAL_LM_DATASETS
 
 
 if is_nncf_available():
@@ -346,10 +347,15 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
             Indicates whether to apply a scale estimation algorithm that minimizes the L2 error between the original and
             compressed layers. Providing a dataset is required to run scale estimation.
         weight_format (`str`, defaults to 'int'):
-            Data format weights are compressed to. Possible values: ['int4', 'int8', 'mxfp4'].
+            Data format weights are compressed to. Possible values: ['int4', 'int8', 'mxfp4', 'nf4'].
         gptq (`bool`, *optional*):
             Whether to apply GPTQ algorithm. GPTQ optimizes compressed weights in a layer-wise fashion to minimize the
             difference between activations of a compressed and original layer. Dataset is required to run GPTQ.
+        processor (`str`, *optional*):
+            A transformers processor used to process inputs for multi-modal models. You can pass either:
+                - A string, the *model id* of a predefined processor hosted inside a model repo on huggingface.co.
+                - A path to a *directory* containing files required by the processor, for instance saved
+                    using the [`~AutoProcessor.save_pretrained`] method, e.g., `./my_model_directory/`.
     """
 
     def __init__(
@@ -369,6 +375,7 @@ def __init__(
         scale_estimation: bool = None,
         weight_format: Optional[str] = None,
         gptq: bool = None,
+        processor: Optional[str] = None,
         **kwargs,
     ):
         super().__init__(bits=bits, sym=sym, ignored_scope=ignored_scope, num_samples=num_samples)
@@ -383,6 +390,7 @@ def __init__(
         self.scale_estimation = scale_estimation
         self.weight_format = weight_format
         self.gptq = gptq
+        self.processor = processor
         self.post_init()
 
     def post_init(self):
@@ -400,16 +408,14 @@ def post_init(self):
                 f"If you wish to provide a custom dataset, please use the `OVQuantizer` instead."
             )
         if self.dataset is not None and isinstance(self.dataset, str):
-            llm_datasets = ["wikitext2", "c4", "c4-new"]
-            stable_diffusion_datasets = [
-                "conceptual_captions",
-                "laion/220k-GPT4Vision-captions-from-LIVIS",
-                "laion/filtered-wit",
-            ]
-            if self.dataset not in llm_datasets + stable_diffusion_datasets:
+            lm_datasets = ["wikitext2", "c4", "c4-new"]
+            visual_lm_datasets = list(PREDEFINED_VISUAL_LM_DATASETS.keys())
+            stable_diffusion_datasets = list(PREDEFINED_SD_DATASETS.keys())
+            if self.dataset not in lm_datasets + visual_lm_datasets + stable_diffusion_datasets:
                 raise ValueError(
                     f"""You have entered a string value for dataset. You can only choose between
-                    {llm_datasets} for LLLMs or {stable_diffusion_datasets} for diffusion models, but we found {self.dataset}"""
+                    {lm_datasets} for LLMs, {visual_lm_datasets} for visual LLMs
+                    or {stable_diffusion_datasets} for diffusion models, but we found {self.dataset}"""
                 )
 
         if self.bits not in [4, 8]:
@@ -444,22 +450,27 @@ def post_init(self):
         if self.tokenizer is not None and not isinstance(self.tokenizer, str):
             raise ValueError(f"Tokenizer is expected to be a string, but found {self.tokenizer}")
 
+        if self.processor is not None and not isinstance(self.processor, str):
+            raise ValueError(f"Processor is expected to be a string, but found {self.processor}")
+
         if self.weight_format is None:
             self.weight_format = "int4" if self.bits == 4 else "int8"
-        if self.weight_format not in ["int4", "int8", "mxfp4"]:
+        if self.weight_format not in ["int4", "int8", "mxfp4", "nf4"]:
             raise ValueError(
-                f"Weight format must be one of the following: ['int4', 'int8', 'mxfp4'], but found: {self.weight_format}."
+                f"Weight format must be one of the following: ['int4', 'int8', 'mxfp4', 'nf4'], but found: {self.weight_format}."
             )
-        if self.weight_format == "mxfp4":
+        if self.weight_format in ["mxfp4", "nf4"]:
             if self.bits != 4:
                 raise ValueError(
-                    f"When applying weight compression with 'mxfp4' weight format the `bits` parameters must be set to 4, but found {self.bits}"
+                    f"When applying weight compression with '{self.weight_format}' weight format, the `bits` parameter must be set to 4, but found {self.bits}"
                 )
             if self.quant_method == OVQuantizationMethod.AWQ:
-                raise ValueError("The AWQ algorithm is not supported for 'mxfp4' weight format")
+                raise ValueError(f"The AWQ algorithm is not supported for '{self.weight_format}' weight format")
             if self.scale_estimation:
-                raise ValueError("The Scale Estimation algorithm is not supported for 'mxfp4' weight format")
-            if self.gptq:
+                raise ValueError(
+                    f"The Scale Estimation algorithm is not supported for '{self.weight_format}' weight format"
+                )
+            if self.weight_format == "mxfp4" and self.gptq:
                 raise ValueError("The GPTQ algorithm is not supported for 'mxfp4' weight format")
 
 
diff --git a/optimum/intel/openvino/loaders.py b/optimum/intel/openvino/loaders.py
index fc5ae97495..5da2877002 100644
--- a/optimum/intel/openvino/loaders.py
+++ b/optimum/intel/openvino/loaders.py
@@ -13,26 +13,18 @@
 #  limitations under the License.
 
 import logging
-import warnings
 from typing import Dict, List, Optional, Union
 
-import torch
-from diffusers.utils import _get_model_file
-
-from ..utils.import_utils import is_safetensors_available
-
-
-if is_safetensors_available():
-    import safetensors
-
 import openvino
-from huggingface_hub.constants import HF_HUB_OFFLINE, HUGGINGFACE_HUB_CACHE
+import torch
+from diffusers.loaders.textual_inversion import TextualInversionLoaderMixin, load_textual_inversion_state_dicts
+from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
 from openvino.runtime import Type
 from openvino.runtime import opset11 as ops
 from openvino.runtime.passes import Manager, Matcher, MatcherPass, WrapType
 from transformers import PreTrainedTokenizer
 
-from .utils import TEXTUAL_INVERSION_EMBEDDING_KEY, TEXTUAL_INVERSION_NAME, TEXTUAL_INVERSION_NAME_SAFE
+from .utils import TEXTUAL_INVERSION_EMBEDDING_KEY
 
 
 try:
@@ -49,17 +41,17 @@ class InsertTextEmbedding(MatcherPass):
     OpenVINO ngraph transformation for inserting a pre-trained textual inversion embedding into the text encoder
     """
 
-    def __init__(self, token_ids_and_embeddings):
+    def __init__(self, tokens_ids, embeddings):
         MatcherPass.__init__(self)
-        self.model_changed = False
+
         param = WrapType("opset1.Constant")
 
         def callback(matcher: Matcher) -> bool:
             root = matcher.get_match_root()
-            if root.get_friendly_name() == TEXTUAL_INVERSION_EMBEDDING_KEY:
+            if root.get_friendly_name() == TEXTUAL_INVERSION_EMBEDDING_KEY:  # TODO: matching by friendly name may not be the most robust way to locate this constant
                 add_ti = root
                 consumers = matcher.get_match_value().get_target_inputs()
-                for token_id, embedding in token_ids_and_embeddings:
+                for token_id, embedding in zip(tokens_ids, embeddings):
                     ti_weights = ops.constant(embedding, Type.f32, name=str(token_id))
                     ti_weights_unsqueeze = ops.unsqueeze(ti_weights, axes=0)
                     add_ti = ops.concat(
@@ -81,341 +73,74 @@ def callback(matcher: Matcher) -> bool:
 
 
 # Adapted from diffusers.loaders.TextualInversionLoaderMixin
-class OVTextualInversionLoaderMixin:
-    r"""
-    Load textual inversion tokens and embeddings to the tokenizer and text encoder.
-    """
-
-    def maybe_convert_prompt(self, prompt: Union[str, List[str]], tokenizer: "PreTrainedTokenizer"):
-        r"""
-        Processes prompts that include a special token corresponding to a multi-vector textual inversion embedding to
-        be replaced with multiple special tokens each corresponding to one of the vectors. If the prompt has no textual
-        inversion token or if the textual inversion token is a single vector, the input prompt is returned.
-
-        Parameters:
-            prompt (`str` or list of `str`):
-                The prompt or prompts to guide the image generation.
-            tokenizer (`PreTrainedTokenizer`):
-                The tokenizer responsible for encoding the prompt into input tokens.
-
-        Returns:
-            `str` or list of `str`: The converted prompt
-        """
-        if not isinstance(prompt, List):
-            prompts = [prompt]
-        else:
-            prompts = prompt
-
-        prompts = [self._maybe_convert_prompt(p, tokenizer) for p in prompts]
-
-        if not isinstance(prompt, List):
-            return prompts[0]
-
-        return prompts
-
-    def _maybe_convert_prompt(self, prompt: str, tokenizer: "PreTrainedTokenizer"):
-        r"""
-        Maybe convert a prompt into a "multi vector"-compatible prompt. If the prompt includes a token that corresponds
-        to a multi-vector textual inversion embedding, this function will process the prompt so that the special token
-        is replaced with multiple special tokens each corresponding to one of the vectors. If the prompt has no textual
-        inversion token or a textual inversion token that is a single vector, the input prompt is simply returned.
-
-        Parameters:
-            prompt (`str`):
-                The prompt to guide the image generation.
-            tokenizer (`PreTrainedTokenizer`):
-                The tokenizer responsible for encoding the prompt into input tokens.
-
-        Returns:
-            `str`: The converted prompt
-        """
-        tokens = tokenizer.tokenize(prompt)
-        unique_tokens = set(tokens)
-        for token in unique_tokens:
-            if token in tokenizer.added_tokens_encoder:
-                replacement = token
-                i = 1
-                while f"{token}_{i}" in tokenizer.added_tokens_encoder:
-                    replacement += f" {token}_{i}"
-                    i += 1
-
-                prompt = prompt.replace(token, replacement)
-
-        return prompt
-
+class OVTextualInversionLoaderMixin(TextualInversionLoaderMixin):
     def load_textual_inversion(
         self,
         pretrained_model_name_or_path: Union[str, List[str], Dict[str, torch.Tensor], List[Dict[str, torch.Tensor]]],
         token: Optional[Union[str, List[str]]] = None,
+        tokenizer: Optional["PreTrainedTokenizer"] = None,  # noqa: F821
+        text_encoder: Optional["openvino.runtime.Model"] = None,  # noqa: F821
         **kwargs,
     ):
-        r"""
-        Load textual inversion embeddings into the text encoder of [`StableDiffusionPipeline`] (both 🤗 Diffusers and
-        Automatic1111 formats are supported).
-
-        Parameters:
-            pretrained_model_name_or_path (`str` or `os.PathLike` or `List[str or os.PathLike]` or `Dict` or `List[Dict]`):
-                Can be either one of the following or a list of them:
-
-                    - A string, the *model id* (for example `sd-concepts-library/low-poly-hd-logos-icons`) of a
-                      pretrained model hosted on the Hub.
-                    - A path to a *directory* (for example `./my_text_inversion_directory/`) containing the textual
-                      inversion weights.
-                    - A path to a *file* (for example `./my_text_inversions.pt`) containing textual inversion weights.
-                    - A [torch state
-                      dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict).
-
-            token (`str` or `List[str]`, *optional*):
-                Override the token to use for the textual inversion weights. If `pretrained_model_name_or_path` is a
-                list, then `token` must also be a list of equal length.
-            weight_name (`str`, *optional*):
-                Name of a custom weight file. This should be used when:
-
-                    - The saved textual inversion file is in 🤗 Diffusers format, but was saved under a specific weight
-                      name such as `text_inv.bin`.
-                    - The saved textual inversion file is in the Automatic1111 format.
-            cache_dir (`Union[str, os.PathLike]`, *optional*):
-                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
-                is not used.
-            force_download (`bool`, *optional*, defaults to `False`):
-                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
-                cached versions if they exist.
-            resume_download (`bool`, *optional*, defaults to `False`):
-                Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
-                incompletely downloaded files are deleted.
-            proxies (`Dict[str, str]`, *optional*):
-                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
-                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
-            local_files_only (`bool`, *optional*, defaults to `False`):
-                Whether to only load local model weights and configuration files or not. If set to `True`, the model
-                won't be downloaded from the Hub.
-            use_auth_token (Optional[Union[bool, str]], defaults to `None`):
-                Deprecated. Please use `token` instead.
-            token (Optional[Union[bool, str]], defaults to `None`):
-                The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
-                when running `huggingface-cli login` (stored in `~/.huggingface`).
-            revision (`str`, *optional*, defaults to `"main"`):
-                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
-                allowed by Git.
-            subfolder (`str`, *optional*, defaults to `""`):
-                The subfolder location of a model file within a larger model repository on the Hub or locally.
-            mirror (`str`, *optional*):
-                Mirror source to resolve accessibility issues if you're downloading a model in China. We do not
-                guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
-                information.
-
-        Example:
-
-        To load a textual inversion embedding vector in 🤗 Diffusers format:
-
-        ```py
-        from optimum.intel import OVStableDiffusionPipeline
-
-        model_id = "runwayml/stable-diffusion-v1-5"
-        pipe = OVStableDiffusionPipeline.from_pretrained(model_id, compile=False)
-
-        pipe.load_textual_inversion("sd-concepts-library/cat-toy")
-        pipe.compile()
-
-        prompt = "A  backpack"
-
-        image = pipe(prompt, num_inference_steps=50).images[0]
-        image.save("cat-backpack.png")
-        ```
-
-        To load a textual inversion embedding vector in Automatic1111 format, make sure to download the vector first
-        (for example from [civitAI](https://civitai.com/models/3036?modelVersionId=9857)) and then load the vector
-        locally:
-
-        ```py
-        from optimum.intel import OVStableDiffusionPipeline
-
-        model_id = "runwayml/stable-diffusion-v1-5"
-        pipe = StableDiffusionPipeline.from_pretrained(model_id, compile=False)
-
-        pipe.load_textual_inversion("./charturnerv2.pt", token="charturnerv2")
-        pipe.compile()
-
-        prompt = "charturnerv2, multiple views of the same character in the same outfit, a character turnaround of a woman wearing a black jacket and red shirt, best quality, intricate details."
-
-        image = pipe(prompt, num_inference_steps=50).images[0]
-        image.save("character.png")
-        ```
-        """
-
-        if not hasattr(self, "tokenizer") or not isinstance(self.tokenizer, PreTrainedTokenizer):
+        if not hasattr(self, "tokenizer"):
             raise ValueError(
-                f"{self.__class__.__name__} requires `self.tokenizer` of type `PreTrainedTokenizer` for calling"
-                f" `{self.load_textual_inversion.__name__}`"
+                f"{self.__class__.__name__} requires `self.tokenizer` for calling `{self.load_textual_inversion.__name__}`"
             )
-
-        if not hasattr(self, "text_encoder") or not isinstance(self.text_encoder.model, openvino.runtime.Model):
+        elif not isinstance(self.tokenizer, PreTrainedTokenizer):
             raise ValueError(
-                f"{self.__class__.__name__} requires `self.text_encoder.model` of type `openvino.runtime.Model` for calling"
-                f" `{self.load_textual_inversion.__name__}`"
-            )
-
-        cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
-        force_download = kwargs.pop("force_download", False)
-        resume_download = kwargs.pop("resume_download", False)
-        proxies = kwargs.pop("proxies", None)
-        local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE)
-        use_auth_token = kwargs.pop("use_auth_token", None)
-        token = kwargs.pop("token", None)
-        revision = kwargs.pop("revision", None)
-        subfolder = kwargs.pop("subfolder", None)
-        weight_name = kwargs.pop("weight_name", None)
-        use_safetensors = kwargs.pop("use_safetensors", None)
-
-        if use_auth_token is not None:
-            warnings.warn(
-                "The `use_auth_token` argument is deprecated and will be removed soon. Please use the `token` argument instead.",
-                FutureWarning,
+                f"{self.__class__.__name__} requires `self.tokenizer` of type `PreTrainedTokenizer` for calling `{self.load_textual_inversion.__name__}`"
             )
-            if token is not None:
-                raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.")
-            token = use_auth_token
 
-        if use_safetensors and not is_safetensors_available():
+        if not hasattr(self, "text_encoder"):
             raise ValueError(
-                "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors"
+                f"{self.__class__.__name__} requires `self.text_encoder` for calling `{self.load_textual_inversion.__name__}`"
             )
-
-        allow_pickle = False
-        if use_safetensors is None:
-            use_safetensors = is_safetensors_available()
-            allow_pickle = True
-
-        user_agent = {
-            "file_type": "text_inversion",
-            "framework": "pytorch",
-        }
-
-        if not isinstance(pretrained_model_name_or_path, list):
-            pretrained_model_name_or_paths = [pretrained_model_name_or_path]
-        else:
-            pretrained_model_name_or_paths = pretrained_model_name_or_path
-
-        if isinstance(token, str):
-            tokens = [token]
-        elif token is None:
-            tokens = [None] * len(pretrained_model_name_or_paths)
-        else:
-            tokens = token
-
-        if len(pretrained_model_name_or_paths) != len(tokens):
+        elif not isinstance(self.text_encoder.model, openvino.runtime.Model):
             raise ValueError(
-                f"You have passed a list of models of length {len(pretrained_model_name_or_paths)}, and list of tokens of length {len(tokens)}"
-                f"Make sure both lists have the same length."
+                f"{self.__class__.__name__} requires `self.text_encoder` of type `openvino.runtime.Model` for calling `{self.load_textual_inversion.__name__}`"
             )
 
-        valid_tokens = [t for t in tokens if t is not None]
-        if len(set(valid_tokens)) < len(valid_tokens):
-            raise ValueError(f"You have passed a list of tokens that contains duplicates: {tokens}")
-
-        token_ids_and_embeddings = []
-
-        for pretrained_model_name_or_path, token in zip(pretrained_model_name_or_paths, tokens):
-            if not isinstance(pretrained_model_name_or_path, dict):
-                # 1. Load textual inversion file
-                model_file = None
-                # Let's first try to load .safetensors weights
-                if (use_safetensors and weight_name is None) or (
-                    weight_name is not None and weight_name.endswith(".safetensors")
-                ):
-                    try:
-                        model_file = _get_model_file(
-                            pretrained_model_name_or_path,
-                            weights_name=weight_name or TEXTUAL_INVERSION_NAME_SAFE,
-                            cache_dir=cache_dir,
-                            force_download=force_download,
-                            resume_download=resume_download,
-                            proxies=proxies,
-                            local_files_only=local_files_only,
-                            use_auth_token=token,  # still uses use_auth_token
-                            revision=revision,
-                            subfolder=subfolder,
-                            user_agent=user_agent,
-                        )
-                        state_dict = safetensors.torch.load_file(model_file, device="cpu")
-                    except Exception as e:
-                        if not allow_pickle:
-                            raise e
-
-                        model_file = None
-
-                if model_file is None:
-                    model_file = _get_model_file(
-                        pretrained_model_name_or_path,
-                        weights_name=weight_name or TEXTUAL_INVERSION_NAME,
-                        cache_dir=cache_dir,
-                        force_download=force_download,
-                        resume_download=resume_download,
-                        proxies=proxies,
-                        local_files_only=local_files_only,
-                        use_auth_token=token,  # still uses use_auth_token
-                        revision=revision,
-                        subfolder=subfolder,
-                        user_agent=user_agent,
-                    )
-                    state_dict = torch.load(model_file, map_location="cpu")
-            else:
-                state_dict = pretrained_model_name_or_path
-
-            # 2. Load token and embedding correcly from file
-            loaded_token = None
-            if isinstance(state_dict, torch.Tensor):
-                if token is None:
+        # 1. Set correct tokenizer and text encoder
+        tokenizer = tokenizer or getattr(self, "tokenizer", None)
+        text_encoder = text_encoder or getattr(self, "text_encoder", None)
+
+        # 2. Normalize inputs
+        pretrained_model_name_or_paths = (
+            [pretrained_model_name_or_path]
+            if not isinstance(pretrained_model_name_or_path, list)
+            else pretrained_model_name_or_path
+        )
+        tokens = [token] if not isinstance(token, list) else token
+        if tokens[0] is None:
+            tokens = tokens * len(pretrained_model_name_or_paths)
+
+        # 3. Check inputs
+        self._check_text_inv_inputs(tokenizer, text_encoder, pretrained_model_name_or_paths, tokens)
+
+        # 4. Load state dicts of textual embeddings
+        state_dicts = load_textual_inversion_state_dicts(pretrained_model_name_or_paths, **kwargs)
+
+        # 4.1 Handle the special case when state_dict is a tensor that contains n embeddings for n tokens
+        if len(tokens) > 1 and len(state_dicts) == 1:
+            if isinstance(state_dicts[0], torch.Tensor):
+                state_dicts = list(state_dicts[0])
+                if len(tokens) != len(state_dicts):
                     raise ValueError(
-                        "You are trying to load a textual inversion embedding that has been saved as a PyTorch tensor. Make sure to pass the name of the corresponding token in this case: `token=...`."
+                        f"You have passed a state_dict containing {len(state_dicts)} embeddings and a list of tokens of length {len(tokens)}. "
+                        f"Make sure both have the same length."
                     )
-                embedding = state_dict
-            elif len(state_dict) == 1:
-                # diffusers
-                loaded_token, embedding = next(iter(state_dict.items()))
-            elif "string_to_param" in state_dict:
-                # A1111
-                loaded_token = state_dict["name"]
-                embedding = state_dict["string_to_param"]["*"]
-
-            if token is not None and loaded_token != token:
-                logger.info(f"The loaded token: {loaded_token} is overwritten by the passed token {token}.")
-            else:
-                token = loaded_token
-
-            embedding = embedding.detach().cpu().numpy()
 
-            # 3. Make sure we don't mess up the tokenizer or text encoder
-            vocab = self.tokenizer.get_vocab()
-            if token in vocab:
-                raise ValueError(
-                    f"Token {token} already in tokenizer vocabulary. Please choose a different token name or remove {token} and embedding from the tokenizer and text encoder."
-                )
-            elif f"{token}_1" in vocab:
-                multi_vector_tokens = [token]
-                i = 1
-                while f"{token}_{i}" in self.tokenizer.added_tokens_encoder:
-                    multi_vector_tokens.append(f"{token}_{i}")
-                    i += 1
+        # 5. Retrieve tokens and embeddings
+        tokens, embeddings = self._retrieve_tokens_and_embeddings(tokens, state_dicts, tokenizer)
 
-                raise ValueError(
-                    f"Multi-vector Token {multi_vector_tokens} already in tokenizer vocabulary. Please choose a different token name or remove the {multi_vector_tokens} and embedding from the tokenizer and text encoder."
-                )
-            is_multi_vector = len(embedding.shape) > 1 and embedding.shape[0] > 1
-            if is_multi_vector:
-                tokens = [token] + [f"{token}_{i}" for i in range(1, embedding.shape[0])]
-                embeddings = [e for e in embedding]  # noqa: C416
-            else:
-                tokens = [token]
-                embeddings = [embedding[0]] if len(embedding.shape) > 1 else [embedding]
-            # add tokens and get ids
-            self.tokenizer.add_tokens(tokens)
-            token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
-            token_ids_and_embeddings += zip(token_ids, embeddings)
+        # 6. Extend tokens and embeddings to handle multi-vector embeddings
+        tokens, embeddings = self._extend_tokens_and_embeddings(tokens, embeddings, tokenizer)
 
-            logger.info(f"Loaded textual inversion embedding for {token}.")
+        # 7. Add tokens to the tokenizer and get their ids (modified from the diffusers implementation)
+        tokenizer.add_tokens(tokens)
+        token_ids = tokenizer.convert_tokens_to_ids(tokens)
 
         # Insert textual inversion embeddings to text encoder with OpenVINO ngraph transformation
         manager = Manager()
-        manager.register_pass(InsertTextEmbedding(token_ids_and_embeddings))
-        manager.run_passes(self.text_encoder.model)
+        manager.register_pass(InsertTextEmbedding(token_ids, embeddings))
+        manager.run_passes(text_encoder.model)
diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py
index 6a1d0cea3e..4c91169bba 100644
--- a/optimum/intel/openvino/modeling_base.py
+++ b/optimum/intel/openvino/modeling_base.py
@@ -16,7 +16,7 @@
 import os
 import warnings
 from pathlib import Path
-from tempfile import TemporaryDirectory, gettempdir
+from tempfile import gettempdir
 from typing import Dict, Optional, Union
 
 import openvino
@@ -41,6 +41,7 @@
     ONNX_WEIGHTS_NAME,
     OV_TO_PT_TYPE,
     OV_XML_FILE_NAME,
+    TemporaryDirectory,
     _print_compiled_model_properties,
     model_has_dynamic_inputs,
 )
@@ -135,7 +136,11 @@ def __init__(
             self.generation_config = generation_config or GenerationConfig.from_model_config(config)
 
             if is_transformers_version(">=", "4.44.99"):
-                misplaced_generation_parameters = self.config._get_non_default_generation_parameters()
+                # some model configs may raise a KeyError here when loaded without parameter initialization
+                try:
+                    misplaced_generation_parameters = self.config._get_non_default_generation_parameters()
+                except KeyError:
+                    misplaced_generation_parameters = {}
                 if len(misplaced_generation_parameters) > 0:
                     logger.warning(
                         "Moving the following attributes in the config to the generation config: "
@@ -439,7 +444,7 @@ def from_pretrained(
 
             ov_files = _find_files_matching_pattern(
                 model_dir,
-                pattern=r"(.*)?openvino(.*)?\_model.xml",
+                pattern=r"(.*)?openvino(.*)?\_model(.*)?.xml$",
                 subfolder=subfolder,
                 use_auth_token=token,
                 revision=revision,
@@ -512,7 +517,7 @@ def _cached_file(
         # locates a file in a local folder and repo, downloads and cache it if necessary.
         model_path = Path(model_path)
         if model_path.is_dir():
-            model_cache_path = model_path / file_name
+            model_cache_path = model_path / subfolder / file_name
         else:
             file_name = Path(file_name)
             if file_name.suffix != ".onnx":
@@ -777,7 +782,7 @@ def __init__(
             for inputs in self.model.inputs
         }
         self.ov_config = ov_config or {**self.parent_model.ov_config}
-        self.request = None
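+        # in compile_only mode the model is already a compiled model, so it can be reused directly as the request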
+        self.request = None if not self.parent_model._compile_only else self.model
         self._model_name = model_name
         self.config = self.parent_model.config
         self._model_dir = Path(model_dir or parent_model._model_save_dir)
@@ -827,3 +832,6 @@ def __call__(self, *args, **kwargs):
 
     def forward(self, *args, **kwargs):
         raise NotImplementedError
+
+    def clear_requests(self):
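+        # drop any compiled inference request held by this model part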
+        self.request = None
diff --git a/optimum/intel/openvino/modeling_base_seq2seq.py b/optimum/intel/openvino/modeling_base_seq2seq.py
index 763dd2b50e..0ce15641fe 100644
--- a/optimum/intel/openvino/modeling_base_seq2seq.py
+++ b/optimum/intel/openvino/modeling_base_seq2seq.py
@@ -15,7 +15,6 @@
 import logging
 import os
 from pathlib import Path
-from tempfile import TemporaryDirectory
 from typing import Dict, Optional, Union
 
 import openvino
@@ -36,6 +35,7 @@
     OV_DECODER_NAME,
     OV_DECODER_WITH_PAST_NAME,
     OV_ENCODER_NAME,
+    TemporaryDirectory,
 )
 
 
@@ -84,7 +84,11 @@ def __init__(
         self.generation_config = generation_config or GenerationConfig.from_model_config(config)
 
         if is_transformers_version(">=", "4.44.99"):
-            misplaced_generation_parameters = self.config._get_non_default_generation_parameters()
+            # some model configs may raise a KeyError here when loaded without parameter initialization
+            try:
+                misplaced_generation_parameters = self.config._get_non_default_generation_parameters()
+            except KeyError:
+                misplaced_generation_parameters = {}
             if len(misplaced_generation_parameters) > 0:
                 logger.warning(
                     "Moving the following attributes in the config to the generation config: "
diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py
index 733f5a4119..4897db1459 100644
--- a/optimum/intel/openvino/modeling_decoder.py
+++ b/optimum/intel/openvino/modeling_decoder.py
@@ -15,8 +15,7 @@
 import logging
 import os
 from pathlib import Path
-from tempfile import TemporaryDirectory
-from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 import openvino
@@ -31,7 +30,7 @@
 from transformers.generation.logits_process import LogitsProcessorList
 from transformers.generation.stopping_criteria import StoppingCriteriaList
 from transformers.generation.utils import GenerateOutput, GenerationMode
-from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers.modeling_outputs import CausalLMOutputWithPast, ModelOutput
 
 from optimum.utils.normalized_config import NormalizedConfigManager
 
@@ -50,6 +49,7 @@
     ONNX_WEIGHTS_NAME,
     OV_XML_FILE_NAME,
     STR_TO_OV_TYPE,
+    TemporaryDirectory,
     get_export_transformers_version,
     model_has_dynamic_inputs,
 )
@@ -504,8 +504,8 @@ def prepare_inputs(
             else:
                 position_ids = np.cumsum(attention_mask, axis=1) - 1
                 position_ids[attention_mask == 0] = 1
-                if past_key_values:
-                    position_ids = position_ids[:, -input_ids.shape[1] :]
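+            # when a past is provided, keep only the position ids of the newly supplied tokens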
+            if past_key_values:
+                position_ids = position_ids[:, -input_ids.shape[1] :]
 
             inputs["position_ids"] = position_ids
 
@@ -522,9 +522,12 @@ def forward(
         attention_mask: Optional[torch.LongTensor] = None,
         past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
         position_ids: Optional[torch.LongTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
         **kwargs,
     ) -> CausalLMOutputWithPast:
         self.compile()
+        # added because model.generate validates model inputs against the forward signature
+        kwargs["token_type_ids"] = token_type_ids
 
         inputs = self.prepare_inputs(
             input_ids=input_ids,
@@ -604,6 +607,24 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg
 
         return model_inputs
 
+    def _update_model_kwargs_for_generation(
+        self,
+        outputs: ModelOutput,
+        model_kwargs: Dict[str, Any],
+        is_encoder_decoder: bool = False,
+        **kwargs,
+    ) -> Dict[str, Any]:
+        model_kwargs = super()._update_model_kwargs_for_generation(
+            outputs=outputs, model_kwargs=model_kwargs, is_encoder_decoder=is_encoder_decoder, **kwargs
+        )
+
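+        # append the next position id so that position_ids stays in sync with the growing sequence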
+        if "position_ids" in model_kwargs:
+            position_ids = model_kwargs["position_ids"]
+            new_position_id = position_ids[..., -1:].clone()
+            new_position_id += 1
+            model_kwargs["position_ids"] = torch.cat([position_ids, new_position_id], dim=-1)
+        return model_kwargs
+
     def _expand_outputs_for_generation(self, indicies, logits: torch.Tensor, past_key_values: Tuple):
         batch_size = logits.shape[0]
         if indicies.shape[0] != 1:
diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py
index 5c80fe255e..3ce1cc73f0 100644
--- a/optimum/intel/openvino/modeling_diffusion.py
+++ b/optimum/intel/openvino/modeling_diffusion.py
@@ -13,41 +13,47 @@
 #  limitations under the License.
 
 import importlib
+import inspect
 import logging
 import os
 import shutil
+from abc import abstractmethod
+from collections import OrderedDict
 from copy import deepcopy
 from pathlib import Path
-from tempfile import TemporaryDirectory
+from tempfile import gettempdir
 from typing import Any, Dict, List, Optional, Union
 
 import numpy as np
 import openvino
-import PIL
+import torch
 from diffusers import (
-    DDIMScheduler,
-    LMSDiscreteScheduler,
-    PNDMScheduler,
+    AutoPipelineForImage2Image,
+    AutoPipelineForInpainting,
+    AutoPipelineForText2Image,
+    DiffusionPipeline,
+    LatentConsistencyModelImg2ImgPipeline,
+    LatentConsistencyModelPipeline,
+    StableDiffusionImg2ImgPipeline,
+    StableDiffusionInpaintPipeline,
     StableDiffusionPipeline,
     StableDiffusionXLImg2ImgPipeline,
+    StableDiffusionXLInpaintPipeline,
     StableDiffusionXLPipeline,
+    pipelines,
 )
-from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.configuration_utils import ConfigMixin
+from diffusers.schedulers import SchedulerMixin
 from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
-from diffusers.utils import CONFIG_NAME, is_invisible_watermark_available
+from diffusers.utils.constants import CONFIG_NAME
 from huggingface_hub import snapshot_download
 from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
+from huggingface_hub.utils import validate_hf_hub_args
 from openvino._offline_transformations import compress_model_transformation
 from openvino.runtime import Core
 from transformers import CLIPFeatureExtractor, CLIPTokenizer
+from transformers.modeling_outputs import ModelOutput
 
-from optimum.pipelines.diffusers.pipeline_latent_consistency import LatentConsistencyPipelineMixin
-from optimum.pipelines.diffusers.pipeline_stable_diffusion import StableDiffusionPipelineMixin
-from optimum.pipelines.diffusers.pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipelineMixin
-from optimum.pipelines.diffusers.pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipelineMixin
-from optimum.pipelines.diffusers.pipeline_stable_diffusion_xl import StableDiffusionXLPipelineMixin
-from optimum.pipelines.diffusers.pipeline_stable_diffusion_xl_img2img import StableDiffusionXLImg2ImgPipelineMixin
-from optimum.pipelines.diffusers.pipeline_utils import VaeImageProcessor
 from optimum.utils import (
     DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER,
     DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER,
@@ -57,107 +63,189 @@
 )
 
 from ...exporters.openvino import main_export
+from ..utils.import_utils import is_diffusers_version
 from .configuration import OVConfig, OVQuantizationMethod, OVWeightQuantizationConfig
 from .loaders import OVTextualInversionLoaderMixin
-from .modeling_base import OVBaseModel, OVModelPart
-from .utils import ONNX_WEIGHTS_NAME, OV_TO_NP_TYPE, OV_XML_FILE_NAME
+from .modeling_base import OVBaseModel
+from .utils import (
+    ONNX_WEIGHTS_NAME,
+    OV_TO_PT_TYPE,
+    OV_XML_FILE_NAME,
+    TemporaryDirectory,
+    _print_compiled_model_properties,
+    model_has_dynamic_inputs,
+    np_to_pt_generators,
+)
+
+
+if is_diffusers_version(">=", "0.25.0"):
+    from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution
+else:
+    from diffusers.models.vae import DiagonalGaussianDistribution
+
+if is_diffusers_version(">=", "0.29.0"):
+    from diffusers import StableDiffusion3Img2ImgPipeline, StableDiffusion3Pipeline
+else:
+    StableDiffusion3Pipeline, StableDiffusion3Img2ImgPipeline = StableDiffusionPipeline, StableDiffusionImg2ImgPipeline
 
+if is_diffusers_version(">=", "0.30.0"):
+    from diffusers import FluxPipeline, StableDiffusion3InpaintPipeline
+else:
+    StableDiffusion3InpaintPipeline = StableDiffusionInpaintPipeline
+    FluxPipeline = StableDiffusionPipeline
+
+
+DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER = "transformer"
+DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER = "text_encoder_3"
 
 core = Core()
 
 logger = logging.getLogger(__name__)
 
 
-class OVStableDiffusionPipelineBase(OVBaseModel, OVTextualInversionLoaderMixin):
-    auto_model_class = StableDiffusionPipeline
+# TODO: support DiffusionPipeline.from_pipe()
+# TODO: it would make more sense to have a compositional OVMixin class
+# TODO: instead of one bloated __init__, we should consider an __init__ per pipeline
+class OVDiffusionPipeline(OVBaseModel, DiffusionPipeline):
+    auto_model_class = DiffusionPipeline
     config_name = "model_index.json"
-    export_feature = "text-to-image"
     _library_name = "diffusers"
 
     def __init__(
         self,
-        unet: openvino.runtime.Model,
-        config: Dict[str, Any],
-        scheduler: Union["DDIMScheduler", "PNDMScheduler", "LMSDiscreteScheduler"],
+        scheduler: SchedulerMixin,
+        unet: Optional[openvino.runtime.Model] = None,
         vae_decoder: Optional[openvino.runtime.Model] = None,
+        # optional pipeline models
         vae_encoder: Optional[openvino.runtime.Model] = None,
         text_encoder: Optional[openvino.runtime.Model] = None,
         text_encoder_2: Optional[openvino.runtime.Model] = None,
-        tokenizer: Optional["CLIPTokenizer"] = None,
-        tokenizer_2: Optional["CLIPTokenizer"] = None,
-        feature_extractor: Optional["CLIPFeatureExtractor"] = None,
-        safety_checker: Optional["StableDiffusionSafetyChecker"] = None,
+        text_encoder_3: Optional[openvino.runtime.Model] = None,
+        transformer: Optional[openvino.runtime.Model] = None,
+        # optional pipeline submodels
+        tokenizer: Optional[CLIPTokenizer] = None,
+        tokenizer_2: Optional[CLIPTokenizer] = None,
+        tokenizer_3: Optional[CLIPTokenizer] = None,
+        feature_extractor: Optional[CLIPFeatureExtractor] = None,
+        # stable diffusion xl specific arguments
+        force_zeros_for_empty_prompt: bool = True,
+        requires_aesthetics_score: bool = False,
+        add_watermarker: Optional[bool] = None,
+        # openvino specific arguments
         device: str = "CPU",
-        dynamic_shapes: bool = True,
         compile: bool = True,
+        compile_only: bool = False,
+        dynamic_shapes: bool = True,
         ov_config: Optional[Dict[str, str]] = None,
         model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
         quantization_config: Optional[Union[OVWeightQuantizationConfig, Dict]] = None,
         **kwargs,
     ):
-        self._internal_dict = config
         self._device = device.upper()
         self.is_dynamic = dynamic_shapes
+        self._compile_only = compile_only
+        self.model_save_dir = model_save_dir
         self.ov_config = {} if ov_config is None else {**ov_config}
-        self._compile_only = kwargs.get("compile_only", False)
-
-        # This attribute is needed to keep one reference on the temporary directory, since garbage collecting
-        # would end-up removing the directory containing the underlying OpenVINO model
-        self._model_save_dir_tempdirectory_instance = None
-        if isinstance(model_save_dir, TemporaryDirectory):
-            self._model_save_dir_tempdirectory_instance = model_save_dir
-            self._model_save_dir = Path(model_save_dir.name)
-        elif isinstance(model_save_dir, str):
-            self._model_save_dir = Path(model_save_dir)
-        else:
-            self._model_save_dir = model_save_dir
+        self.preprocessors = kwargs.get("preprocessors", [])
 
-        self.vae_decoder = OVModelVaeDecoder(vae_decoder, self)
-        self.unet = OVModelUnet(unet, self)
-        self.text_encoder = OVModelTextEncoder(text_encoder, self) if text_encoder is not None else None
+        if self._compile_only:
+            if not compile:
+                raise ValueError(
+                    "`compile_only` mode does not support disabling compilation. "
+                    "Please pass `compile=True` if you want to use `compile_only=True`, or set `compile_only=False`."
+                )
+
+            if not isinstance(unet, openvino.runtime.CompiledModel):
+                raise ValueError("`compile_only` expects an already compiled model to be provided")
+
+            model_is_dynamic = model_has_dynamic_inputs(unet)
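+            # XOR: error out when the requested shape mode (dynamic vs. static) disagrees with the compiled model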
+            if dynamic_shapes ^ model_is_dynamic:
+                requested_shapes = "dynamic" if dynamic_shapes else "static"
+                compiled_shapes = "dynamic" if model_is_dynamic else "static"
+                raise ValueError(
+                    f"The provided compiled model has {compiled_shapes} shapes, but {requested_shapes} shapes were requested. "
+                    f"Please set `compile_only=False` or `dynamic_shapes={model_is_dynamic}`."
+                )
+
+        self.unet = OVModelUnet(unet, self, DIFFUSION_MODEL_UNET_SUBFOLDER) if unet is not None else None
+        self.transformer = (
+            OVModelTransformer(transformer, self, DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER)
+            if transformer is not None
+            else None
+        )
+
+        if unet is None and transformer is None:
+            raise ValueError("Either a `unet` or a `transformer` model must be provided for the pipeline to work")
+        self.vae_decoder = OVModelVaeDecoder(vae_decoder, self, DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER)
+        self.vae_encoder = (
+            OVModelVaeEncoder(vae_encoder, self, DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER)
+            if vae_encoder is not None
+            else None
+        )
+        self.text_encoder = (
+            OVModelTextEncoder(text_encoder, self, DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER)
+            if text_encoder is not None
+            else None
+        )
         self.text_encoder_2 = (
-            OVModelTextEncoder(text_encoder_2, self, model_name=DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER)
+            OVModelTextEncoder(text_encoder_2, self, DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER)
             if text_encoder_2 is not None
             else None
         )
-        self.vae_encoder = OVModelVaeEncoder(vae_encoder, self) if vae_encoder is not None else None
-
-        if "block_out_channels" in self.vae_decoder.config:
-            self.vae_scale_factor = 2 ** (len(self.vae_decoder.config["block_out_channels"]) - 1)
-        else:
-            self.vae_scale_factor = 8
-
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        self.text_encoder_3 = (
+            OVModelTextEncoder(text_encoder_3, self, DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER)
+            if text_encoder_3 is not None
+            else None
+        )
+        # We wrap the VAE decoder & encoder in a single object to mimic the diffusers API
+        self.vae = OVModelVae(decoder=self.vae_decoder, encoder=self.vae_encoder)
 
+        self.scheduler = scheduler
         self.tokenizer = tokenizer
         self.tokenizer_2 = tokenizer_2
-        self.scheduler = scheduler
+        self.tokenizer_3 = tokenizer_3
         self.feature_extractor = feature_extractor
-        self.safety_checker = safety_checker
-        self.preprocessors = []
-
-        if self.is_dynamic and not self._compile_only:
-            self.reshape(batch_size=-1, height=-1, width=-1, num_images_per_prompt=-1)
 
-        sub_models = {
-            DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER: self.text_encoder,
-            DIFFUSION_MODEL_UNET_SUBFOLDER: self.unet,
-            DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER: self.vae_decoder,
-            DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER: self.vae_encoder,
-            DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER: self.text_encoder_2,
+        # we allow passing these as torch models for now
+        self.image_encoder = kwargs.pop("image_encoder", None)  # TODO: maybe implement OVModelImageEncoder
+        self.safety_checker = kwargs.pop("safety_checker", None)  # TODO: maybe implement OVModelSafetyChecker
+
+        all_pipeline_init_args = {
+            "vae": self.vae,
+            "unet": self.unet,
+            "transformer": self.transformer,
+            "text_encoder": self.text_encoder,
+            "text_encoder_2": self.text_encoder_2,
+            "text_encoder_3": self.text_encoder_3,
+            "safety_checker": self.safety_checker,
+            "image_encoder": self.image_encoder,
+            "scheduler": self.scheduler,
+            "tokenizer": self.tokenizer,
+            "tokenizer_2": self.tokenizer_2,
+            "tokenizer_3": self.tokenizer_3,
+            "feature_extractor": self.feature_extractor,
+            "requires_aesthetics_score": requires_aesthetics_score,
+            "force_zeros_for_empty_prompt": force_zeros_for_empty_prompt,
+            "add_watermarker": add_watermarker,
         }
-        for name in sub_models.keys():
-            self._internal_dict[name] = (
-                ("optimum", sub_models[name].__class__.__name__) if sub_models[name] is not None else (None, None)
-            )
 
-        self._internal_dict.pop("vae", None)
+        diffusers_pipeline_args = {}
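+        # only forward the arguments that the wrapped diffusers pipeline class accepts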
+        for key in inspect.signature(self.auto_model_class).parameters.keys():
+            if key in all_pipeline_init_args:
+                diffusers_pipeline_args[key] = all_pipeline_init_args[key]
+        # initializes diffusers pipeline-specific attributes (registers modules and config)
+        self.auto_model_class.__init__(self, **diffusers_pipeline_args)
+        # we use auto_model_class.__init__ here because we can't call super().__init__
+        # as OptimizedModel already defines an __init__ which is the first in the MRO
 
         self._openvino_config = None
         if quantization_config:
             self._openvino_config = OVConfig(quantization_config=quantization_config)
         self._set_ov_config_parameters()
 
+        if self.is_dynamic and not self._compile_only:
+            self.reshape(batch_size=-1, height=-1, width=-1, num_images_per_prompt=-1)
+
         if compile and not self._compile_only:
             self.compile()
 
@@ -177,34 +265,61 @@ def _save_pretrained(self, save_directory: Union[str, Path]):
 
         save_directory = Path(save_directory)
 
-        sub_models_to_save = {
-            self.unet: DIFFUSION_MODEL_UNET_SUBFOLDER,
-            self.vae_decoder: DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER,
-            self.vae_encoder: DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER,
-            self.text_encoder: DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER,
-            self.text_encoder_2: DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER,
+        models_to_save_paths = {
+            (self.unet, save_directory / DIFFUSION_MODEL_UNET_SUBFOLDER),
+            (self.vae_decoder, save_directory / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER),
+            (self.vae_encoder, save_directory / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER),
+            (self.text_encoder, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER),
+            (self.text_encoder_2, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER),
+            (self.text_encoder_3, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER),
+            (self.transformer, save_directory / DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER),
         }
-
-        for ov_model, dst_path in sub_models_to_save.items():
-            if ov_model is not None:
-                dst_path = save_directory / dst_path / OV_XML_FILE_NAME
+        for model, save_path in models_to_save_paths:
+            if model is not None:
+                dst_path = save_path / OV_XML_FILE_NAME
                 dst_path.parent.mkdir(parents=True, exist_ok=True)
-                openvino.save_model(ov_model.model, dst_path, compress_to_fp16=False)
-                model_dir = ov_model.config.get("_name_or_path", None) or ov_model._model_dir / ov_model._model_name
-                config_path = Path(model_dir) / ov_model.CONFIG_NAME
+                openvino.save_model(model.model, dst_path, compress_to_fp16=False)
+                model_dir = model.config.get("_name_or_path", None) or model.model_save_dir
+                config_path = Path(model_dir) / CONFIG_NAME
                 if config_path.is_file():
-                    shutil.copyfile(config_path, dst_path.parent / ov_model.CONFIG_NAME)
+                    config_save_path = save_path / CONFIG_NAME
+                    shutil.copyfile(config_path, config_save_path)
 
         self.scheduler.save_pretrained(save_directory / "scheduler")
-        if self.feature_extractor is not None:
-            self.feature_extractor.save_pretrained(save_directory / "feature_extractor")
+
         if self.tokenizer is not None:
             self.tokenizer.save_pretrained(save_directory / "tokenizer")
         if self.tokenizer_2 is not None:
             self.tokenizer_2.save_pretrained(save_directory / "tokenizer_2")
+        if self.tokenizer_3 is not None:
+            self.tokenizer_3.save_pretrained(save_directory / "tokenizer_3")
+        if self.feature_extractor is not None:
+            self.feature_extractor.save_pretrained(save_directory / "feature_extractor")
+        if getattr(self, "safety_checker", None) is not None:
+            self.safety_checker.save_pretrained(save_directory / "safety_checker")
 
         self._save_openvino_config(save_directory)
 
+    def _save_config(self, save_directory):
+        """
+        Saves a model configuration into a directory, so that it can be re-loaded using the
+        [`from_pretrained`] class method.
+        """
+        model_dir = (
+            self.model_save_dir
+            if not isinstance(self.model_save_dir, TemporaryDirectory)
+            else self.model_save_dir.name
+        )
+        save_dir = Path(save_directory)
+        original_config = Path(model_dir) / self.config_name
+        if original_config.exists():
+            if not save_dir.exists():
+                save_dir.mkdir(parents=True)
+
+            shutil.copy(original_config, save_dir)
+        else:
+            self.config.save_pretrained(save_dir)
+
     @classmethod
     def _from_pretrained(
         cls,
@@ -212,143 +327,195 @@ def _from_pretrained(
         config: Dict[str, Any],
         token: Optional[Union[bool, str]] = None,
         revision: Optional[str] = None,
+        force_download: bool = False,
+        local_files_only: bool = False,
         cache_dir: str = HUGGINGFACE_HUB_CACHE,
-        vae_decoder_file_name: Optional[str] = None,
-        text_encoder_file_name: Optional[str] = None,
         unet_file_name: Optional[str] = None,
+        vae_decoder_file_name: Optional[str] = None,
         vae_encoder_file_name: Optional[str] = None,
+        text_encoder_file_name: Optional[str] = None,
         text_encoder_2_file_name: Optional[str] = None,
-        local_files_only: bool = False,
+        text_encoder_3_file_name: Optional[str] = None,
+        transformer_file_name: Optional[str] = None,
         from_onnx: bool = False,
-        model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
         load_in_8bit: bool = False,
         quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
+        model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
         **kwargs,
     ):
+        # same as DiffusionPipeline.from_pretrained: when called directly, load the pipeline class specified in the config
+        if cls.__name__ == "OVDiffusionPipeline":
+            class_name = config["_class_name"]
+            ov_pipeline_class = _get_ov_class(class_name)
+        else:
+            ov_pipeline_class = cls
+
         default_file_name = ONNX_WEIGHTS_NAME if from_onnx else OV_XML_FILE_NAME
+
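+        # fall back to the default OpenVINO (or ONNX) file name for any submodel file name that is not explicitly provided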
+        unet_file_name = unet_file_name or default_file_name
+        vae_encoder_file_name = vae_encoder_file_name or default_file_name
         vae_decoder_file_name = vae_decoder_file_name or default_file_name
         text_encoder_file_name = text_encoder_file_name or default_file_name
         text_encoder_2_file_name = text_encoder_2_file_name or default_file_name
-        unet_file_name = unet_file_name or default_file_name
-        vae_encoder_file_name = vae_encoder_file_name or default_file_name
-        model_id = str(model_id)
-        patterns = set(config.keys())
-        sub_models_names = patterns.intersection({"feature_extractor", "tokenizer", "tokenizer_2", "scheduler"})
-        if not os.path.isdir(model_id):
-            patterns.update({"vae_encoder", "vae_decoder"})
-            allow_patterns = {os.path.join(k, "*") for k in patterns if not k.startswith("_")}
+        text_encoder_3_file_name = text_encoder_3_file_name or default_file_name
+        transformer_file_name = transformer_file_name or default_file_name
+
+        if not os.path.isdir(str(model_id)):
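+            # model_index.json keys name the pipeline components; exported OpenVINO models split the vae into encoder/decoder subfolders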
+            all_components = {key for key in config.keys() if not key.startswith("_")} | {"vae_encoder", "vae_decoder"}
+            allow_patterns = {os.path.join(component, "*") for component in all_components}
             allow_patterns.update(
                 {
+                    unet_file_name,
+                    transformer_file_name,
+                    vae_encoder_file_name,
                     vae_decoder_file_name,
                     text_encoder_file_name,
                     text_encoder_2_file_name,
-                    unet_file_name,
-                    vae_encoder_file_name,
+                    text_encoder_3_file_name,
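+                    # every OpenVINO .xml file has a companion .bin file holding the weights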
+                    unet_file_name.replace(".xml", ".bin"),
+                    transformer_file_name.replace(".xml", ".bin"),
+                    vae_encoder_file_name.replace(".xml", ".bin"),
                     vae_decoder_file_name.replace(".xml", ".bin"),
                     text_encoder_file_name.replace(".xml", ".bin"),
                     text_encoder_2_file_name.replace(".xml", ".bin"),
-                    unet_file_name.replace(".xml", ".bin"),
-                    vae_encoder_file_name.replace(".xml", ".bin"),
+                    text_encoder_3_file_name.replace(".xml", ".bin"),
                     SCHEDULER_CONFIG_NAME,
-                    CONFIG_NAME,
                     cls.config_name,
+                    CONFIG_NAME,
                 }
             )
             ignore_patterns = ["*.msgpack", "*.safetensors", "*pytorch_model.bin"]
             if not from_onnx:
                 ignore_patterns.extend(["*.onnx", "*.onnx_data"])
-            # Downloads all repo's files matching the allowed patterns
-            model_id = snapshot_download(
+
+            model_save_folder = snapshot_download(
                 model_id,
                 cache_dir=cache_dir,
+                force_download=force_download,
                 local_files_only=local_files_only,
-                token=token,
                 revision=revision,
+                token=token,
                 allow_patterns=allow_patterns,
                 ignore_patterns=ignore_patterns,
             )
-        new_model_save_dir = Path(model_id)
-
-        for name in sub_models_names:
-            # Check if the subcomponent needs to be loaded
-            if kwargs.get(name, None) is not None:
-                continue
-            library_name, library_classes = config[name]
-            if library_classes is not None:
-                library = importlib.import_module(library_name)
-                class_obj = getattr(library, library_classes)
+        else:
+            model_save_folder = str(model_id)
+
+        model_save_path = Path(model_save_folder)
+
+        if model_save_dir is None:
+            model_save_dir = model_save_path
+
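+        # auxiliary (non-OpenVINO) subcomponents declared in the pipeline config, each loaded with its own from_pretrained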
+        submodels = {
+            "scheduler": None,
+            "tokenizer": None,
+            "tokenizer_2": None,
+            "tokenizer_3": None,
+            "feature_extractor": None,
+            "safety_checker": None,
+            "image_encoder": None,
+        }
+        for name in submodels.keys():
+            if name in kwargs:
+                submodels[name] = kwargs.pop(name)
+            elif config.get(name, (None, None))[0] is not None:
+                module_name, module_class = config.get(name)
+                if hasattr(pipelines, module_name):
+                    module = getattr(pipelines, module_name)
+                else:
+                    module = importlib.import_module(module_name)
+                class_obj = getattr(module, module_class)
                 load_method = getattr(class_obj, "from_pretrained")
                 # Check if the module is in a subdirectory
-                if (new_model_save_dir / name).is_dir():
-                    kwargs[name] = load_method(new_model_save_dir / name)
+                if (model_save_path / name).is_dir():
+                    submodels[name] = load_method(model_save_path / name)
+                # For backward compatibility with models exported with a previous optimum version, where safety_checker saving was disabled
+                elif name == "safety_checker":
+                    logger.warning(
+                        "Pipeline config contains a `safety_checker` subcomponent, but no `safety_checker` was found in the model directory. "
+                        "`safety_checker` will be disabled. To enable it, pass it explicitly to the `from_pretrained` method "
+                        "or re-export the model with a newer optimum-intel version."
+                    )
+                    submodels[name] = None
                 else:
-                    kwargs[name] = load_method(new_model_save_dir)
-
-        unet_path = new_model_save_dir / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name
-        components = {
-            "vae_encoder": new_model_save_dir / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name,
-            "vae_decoder": new_model_save_dir / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / vae_decoder_file_name,
-            "text_encoder": new_model_save_dir / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / text_encoder_file_name,
-            "text_encoder_2": new_model_save_dir / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / text_encoder_2_file_name,
+                    submodels[name] = load_method(model_save_path)
+
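+        # expected paths of the exported OpenVINO models for each diffusion component (missing files are resolved to None below)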
+        models = {
+            "unet": model_save_path / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name,
+            "transformer": model_save_path / DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER / transformer_file_name,
+            "vae_decoder": model_save_path / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / vae_decoder_file_name,
+            "vae_encoder": model_save_path / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name,
+            "text_encoder": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / text_encoder_file_name,
+            "text_encoder_2": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / text_encoder_2_file_name,
+            "text_encoder_3": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER / text_encoder_3_file_name,
         }
 
-        compile_only = kwargs.get("compile_only", False)
-
-        if model_save_dir is None:
-            model_save_dir = new_model_save_dir
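+        # forward remaining pipeline config entries (neither models, submodels, nor existing kwargs) through kwargs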
+        for config_key, value in config.items():
+            if config_key not in models and config_key not in kwargs and config_key not in submodels:
+                kwargs[config_key] = value
 
+        compile_only = kwargs.get("compile_only", False)
         quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)
         if (quantization_config is None or quantization_config.dataset is None) and not compile_only:
-            unet = cls.load_model(unet_path, quantization_config)
-            for key, value in components.items():
-                components[key] = cls.load_model(value, quantization_config) if value.is_file() else None
+            for name, path in models.items():
+                if name in kwargs:
+                    models[name] = kwargs.pop(name)
+                else:
+                    models[name] = cls.load_model(path, quantization_config) if path.is_file() else None
         elif compile_only:
             ov_config = kwargs.get("ov_config", {})
             device = kwargs.get("device", "CPU")
             vae_ov_conifg = {**ov_config}
             if "GPU" in device.upper() and "INFERENCE_PRECISION_HINT" not in vae_ov_conifg:
                 vae_ov_conifg["INFERENCE_PRECISION_HINT"] = "f32"
-            unet = cls._compile_model(unet_path, device, ov_config, Path(model_save_dir) / "unet")
-            for key, value in components.items():
-                components[key] = (
-                    cls._compile_model(
-                        value, device, ov_config if "vae" not in key else vae_ov_conifg, Path(model_save_dir) / key
+            for name, path in models.items():
+                if name in kwargs:
+                    models[name] = kwargs.pop(name)
+                else:
+                    models[name] = (
+                        cls._compile_model(
+                            path,
+                            device,
+                            ov_config if "vae" not in name else vae_ov_conifg,
+                            Path(model_save_dir) / name,
+                        )
+                        if path.is_file()
+                        else None
                     )
-                    if value.is_file()
-                    else None
-                )
         else:
-            # Load uncompressed models to apply hybrid quantization further
-            unet = cls.load_model(unet_path)
-            for key, value in components.items():
-                components[key] = cls.load_model(value) if value.is_file() else None
-            sd_model = cls(unet=unet, config=config, model_save_dir=model_save_dir, **components, **kwargs)
-
-            supported_pipelines = (
-                OVStableDiffusionPipeline,
-                OVStableDiffusionXLPipeline,
-                OVLatentConsistencyModelPipeline,
-            )
-            if not isinstance(sd_model, supported_pipelines):
+            # TODO: consider performing hybrid quantization in __init__ instead
+            if ov_pipeline_class.export_feature != "text-to-image":
                 raise NotImplementedError(f"Quantization in hybrid mode is not supported for {cls.__name__}")
 
             from optimum.intel import OVQuantizer
 
+            for name, path in models.items():
+                if name in kwargs:
+                    models[name] = kwargs.pop(name)
+                else:
+                    models[name] = cls.load_model(path) if path.is_file() else None
+
+            ov_pipeline = ov_pipeline_class(**models, **submodels, model_save_dir=model_save_dir, **kwargs)
+            # same as in DiffusionPipeline.from_pretrained, we save where the model was instantiated from
+            ov_pipeline.register_to_config(_name_or_path=config.get("_name_or_path", str(model_id)))
+
             hybrid_quantization_config = deepcopy(quantization_config)
             hybrid_quantization_config.quant_method = OVQuantizationMethod.HYBRID
-            quantizer = OVQuantizer(sd_model)
+            quantizer = OVQuantizer(ov_pipeline)
             quantizer.quantize(ov_config=OVConfig(quantization_config=hybrid_quantization_config))
 
-            return sd_model
-
-        return cls(
-            unet=unet,
-            config=config,
+            return ov_pipeline
+        ov_pipeline = ov_pipeline_class(
+            **models,
+            **submodels,
             model_save_dir=model_save_dir,
             quantization_config=quantization_config,
-            **components,
             **kwargs,
         )
+        # same as in DiffusionPipeline.from_pretrained, we save where the model was instantiated from
+        ov_pipeline.register_to_config(_name_or_path=config.get("_name_or_path", str(model_id)))
+
+        return ov_pipeline
 
     @classmethod
     def _from_transformers(
@@ -360,25 +527,11 @@ def _from_transformers(
         force_download: bool = False,
         cache_dir: str = HUGGINGFACE_HUB_CACHE,
         local_files_only: bool = False,
-        tokenizer: Optional["CLIPTokenizer"] = None,
-        scheduler: Union["DDIMScheduler", "PNDMScheduler", "LMSDiscreteScheduler"] = None,
-        feature_extractor: Optional["CLIPFeatureExtractor"] = None,
-        tokenizer_2: Optional["CLIPTokenizer"] = None,
         load_in_8bit: Optional[bool] = None,
         quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
+        compile_only: bool = False,
         **kwargs,
     ):
-        save_dir = TemporaryDirectory()
-        save_dir_path = Path(save_dir.name)
-
-        # If load_in_8bit and quantization_config not specified then ov_config is set to None and will be set by default in convert depending on the model size
-        if load_in_8bit is None and not quantization_config:
-            ov_config = None
-        else:
-            ov_config = OVConfig(dtype="fp32")
-
-        compile_only = kwargs.pop("compile_only", False)
-
         if compile_only:
             logger.warning(
                 "`compile_only` mode will be disabled because it does not support model export."
@@ -386,14 +539,24 @@ def _from_transformers(
             )
             compile_only = False
 
+        # If load_in_8bit and quantization_config not specified then ov_config is set
+        # to None and will be set by default in convert depending on the model size
+        if load_in_8bit is None and not quantization_config:
+            ov_config = None
+        else:
+            ov_config = OVConfig(dtype="fp32")
+
+        model_save_dir = TemporaryDirectory()
+        model_save_path = Path(model_save_dir.name)
+
         main_export(
             model_name_or_path=model_id,
-            output=save_dir_path,
-            task=cls.export_feature,
+            output=model_save_path,
             do_validation=False,
             no_post_process=True,
             revision=revision,
             cache_dir=cache_dir,
+            task=cls.export_feature,
             token=token,
             local_files_only=local_files_only,
             force_download=force_download,
@@ -402,42 +565,64 @@ def _from_transformers(
         )
 
         return cls._from_pretrained(
-            model_id=save_dir_path,
+            model_id=model_save_path,
             config=config,
             from_onnx=False,
             token=token,
             revision=revision,
-            force_download=force_download,
             cache_dir=cache_dir,
+            force_download=force_download,
             local_files_only=local_files_only,
-            model_save_dir=save_dir,
-            tokenizer=tokenizer,
-            tokenizer_2=tokenizer_2,
-            scheduler=scheduler,
-            feature_extractor=feature_extractor,
-            load_in_8bit=load_in_8bit,
+            model_save_dir=model_save_dir,
             quantization_config=quantization_config,
+            load_in_8bit=load_in_8bit,
             compile_only=compile_only,
             **kwargs,
         )
 
+    def to(self, *args, device: Optional[str] = None, dtype: Optional[torch.dtype] = None):
+        for arg in args:
+            if isinstance(arg, str):
+                device = arg
+            elif isinstance(arg, torch.dtype):
+                dtype = arg
+
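+        # OpenVINO devices are selected by name (e.g. "CPU", "GPU"); switching devices clears compiled requests so they get recompiled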
+        if isinstance(device, str):
+            self._device = device.upper()
+            self.clear_requests()
+        elif device is not None:
+            raise ValueError(
+                "The `device` argument should be a string representing the device on which the model should be loaded."
+            )
+
+        if dtype is not None and dtype != self.dtype:
+            raise NotImplementedError(
+                f"Cannot change the dtype of the model from {self.dtype} to {dtype}. "
+                f"Please export the model with the desired dtype."
+            )
+
+        return self
+
     @property
     def height(self) -> int:
-        height = self.unet.model.inputs[0].get_partial_shape()[2]
+        model = self.vae.decoder.model
+        height = model.inputs[0].get_partial_shape()[2]
         if height.is_dynamic:
             return -1
         return height.get_length() * self.vae_scale_factor
 
     @property
     def width(self) -> int:
-        width = self.unet.model.inputs[0].get_partial_shape()[3]
+        model = self.vae.decoder.model
+        width = model.inputs[0].get_partial_shape()[3]
         if width.is_dynamic:
             return -1
         return width.get_length() * self.vae_scale_factor
 
     @property
-    def _batch_size(self) -> int:
-        batch_size = self.unet.model.inputs[0].get_partial_shape()[0]
+    def batch_size(self) -> int:
+        model = self.unet.model if self.unet is not None else self.transformer.model
+        batch_size = model.inputs[0].get_partial_shape()[0]
         if batch_size.is_dynamic:
             return -1
         return batch_size.get_length()
@@ -489,6 +674,65 @@ def _reshape_unet(
         model.reshape(shapes)
         return model
 
+    def _reshape_transformer(
+        self,
+        model: openvino.runtime.Model,
+        batch_size: int = -1,
+        height: int = -1,
+        width: int = -1,
+        num_images_per_prompt: int = -1,
+        tokenizer_max_length: int = -1,
+    ):
+        if batch_size == -1 or num_images_per_prompt == -1:
+            batch_size = -1
+        else:
+            batch_size *= num_images_per_prompt
+            # the factor of 2 comes from classifier-free guidance (guidance scale > 1)
+            if "img_ids" not in {inputs.get_any_name() for inputs in model.inputs}:
+                batch_size *= 2
+
+        height = height // self.vae_scale_factor if height > 0 else height
+        width = width // self.vae_scale_factor if width > 0 else width
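+        # some transformers (e.g. Flux) take 2x2-packed latent patches as a flat sequence, hence the halved spatial dims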
+        packed_height = height // 2 if height > 0 else height
+        packed_width = width // 2 if width > 0 else width
+        packed_height_width = packed_width * packed_height if height > 0 and width > 0 else -1
+        shapes = {}
+        for inputs in model.inputs:
+            shapes[inputs] = inputs.get_partial_shape()
+            if inputs.get_any_name() in ["timestep", "guidance"]:
+                shapes[inputs][0] = batch_size
+            elif inputs.get_any_name() == "hidden_states":
+                in_channels = self.transformer.config.get("in_channels", None)
+                if in_channels is None:
+                    in_channels = (
+                        shapes[inputs][1] if inputs.get_partial_shape().rank.get_length() == 4 else shapes[inputs][2]
+                    )
+                    if in_channels.is_dynamic:
+                        logger.warning(
+                            "Could not identify `in_channels` from the unet configuration, to statically reshape the unet please provide a configuration."
+                        )
+                        self.is_dynamic = True
+                if inputs.get_partial_shape().rank.get_length() == 4:
+                    shapes[inputs] = [batch_size, in_channels, height, width]
+                else:
+                    shapes[inputs] = [batch_size, packed_height_width, in_channels]
+
+            elif inputs.get_any_name() == "pooled_projections":
+                shapes[inputs] = [batch_size, self.transformer.config["pooled_projection_dim"]]
+            elif inputs.get_any_name() == "img_ids":
+                shapes[inputs] = (
+                    [batch_size, packed_height_width, 3]
+                    if is_diffusers_version("<", "0.31.0")
+                    else [packed_height_width, 3]
+                )
+            elif inputs.get_any_name() == "txt_ids":
+                shapes[inputs] = [batch_size, -1, 3] if is_diffusers_version("<", "0.31.0") else [-1, 3]
+            else:
+                shapes[inputs][0] = batch_size
+                shapes[inputs][1] = -1  # text_encoder_3 outputs may have a variable sequence length
+        model.reshape(shapes)
+        return model
+
     def _reshape_text_encoder(
         self, model: openvino.runtime.Model, batch_size: int = -1, tokenizer_max_length: int = -1
     ):
@@ -497,21 +741,6 @@ def _reshape_text_encoder(
             model.reshape(shapes)
         return model
 
-    def _reshape_vae_decoder(self, model: openvino.runtime.Model, height: int = -1, width: int = -1):
-        height = height // self.vae_scale_factor if height > -1 else height
-        width = width // self.vae_scale_factor if width > -1 else width
-        latent_channels = self.vae_decoder.config.get("latent_channels", None)
-        if latent_channels is None:
-            latent_channels = model.inputs[0].get_partial_shape()[1]
-            if latent_channels.is_dynamic:
-                logger.warning(
-                    "Could not identify `latent_channels` from the VAE decoder configuration, to statically reshape the VAE decoder please provide a configuration."
-                )
-                self.is_dynamic = True
-        shapes = {model.inputs[0]: [1, latent_channels, height, width]}
-        model.reshape(shapes)
-        return model
-
     def _reshape_vae_encoder(
         self, model: openvino.runtime.Model, batch_size: int = -1, height: int = -1, width: int = -1
     ):
@@ -527,6 +756,23 @@ def _reshape_vae_encoder(
         model.reshape(shapes)
         return model
 
+    def _reshape_vae_decoder(
+        self, model: openvino.runtime.Model, height: int = -1, width: int = -1, num_images_per_prompt: int = -1
+    ):
+        height = height // self.vae_scale_factor if height > -1 else height
+        width = width // self.vae_scale_factor if width > -1 else width
+        latent_channels = self.vae_decoder.config.get("latent_channels", None)
+        if latent_channels is None:
+            latent_channels = model.inputs[0].get_partial_shape()[1]
+            if latent_channels.is_dynamic:
+                logger.warning(
+                    "Could not identify `latent_channels` from the VAE decoder configuration, to statically reshape the VAE decoder please provide a configuration."
+                )
+                self.is_dynamic = True
+        shapes = {model.inputs[0]: [num_images_per_prompt, latent_channels, height, width]}
+        model.reshape(shapes)
+        return model
+
     def reshape(
         self,
         batch_size: int,
@@ -540,17 +786,29 @@ def reshape(
             )
 
         self.is_dynamic = -1 in {batch_size, height, width, num_images_per_prompt}
-        self.vae_decoder.model = self._reshape_vae_decoder(self.vae_decoder.model, height, width)
+
         if self.tokenizer is None and self.tokenizer_2 is None:
             tokenizer_max_len = -1
         else:
             tokenizer_max_len = (
                 self.tokenizer.model_max_length if self.tokenizer is not None else self.tokenizer_2.model_max_length
             )
-        self.unet.model = self._reshape_unet(
-            self.unet.model, batch_size, height, width, num_images_per_prompt, tokenizer_max_len
+
+        if self.unet is not None:
+            self.unet.model = self._reshape_unet(
+                self.unet.model, batch_size, height, width, num_images_per_prompt, tokenizer_max_len
+            )
+        if self.transformer is not None:
+            self.transformer.model = self._reshape_transformer(
+                self.transformer.model, batch_size, height, width, num_images_per_prompt, tokenizer_max_len
+            )
+        self.vae_decoder.model = self._reshape_vae_decoder(
+            self.vae_decoder.model, height, width, num_images_per_prompt
         )
 
+        if self.vae_encoder is not None:
+            self.vae_encoder.model = self._reshape_vae_encoder(self.vae_encoder.model, batch_size, height, width)
+
         if self.text_encoder is not None:
             self.text_encoder.model = self._reshape_text_encoder(
                 self.text_encoder.model, batch_size, self.tokenizer.model_max_length
@@ -561,8 +819,10 @@ def reshape(
                 self.text_encoder_2.model, batch_size, self.tokenizer_2.model_max_length
             )
 
-        if self.vae_encoder is not None:
-            self.vae_encoder.model = self._reshape_vae_encoder(self.vae_encoder.model, batch_size, height, width)
+        if self.text_encoder_3 is not None:
+            self.text_encoder_3.model = self._reshape_text_encoder(
+                self.text_encoder_3.model, batch_size, self.tokenizer_3.model_max_length
+            )
 
         self.clear_requests()
         return self
@@ -576,12 +836,20 @@ def half(self):
                 "`half()` is not supported with `compile_only` mode, please intialize model without this option"
             )
 
-        compress_model_transformation(self.vae_decoder.model)
-        compress_model_transformation(self.unet.model)
-        for component in {self.text_encoder, self.text_encoder_2, self.vae_encoder}:
+        for component in {
+            self.unet,
+            self.transformer,
+            self.vae_encoder,
+            self.vae_decoder,
+            self.text_encoder,
+            self.text_encoder_2,
+            self.text_encoder_3,
+        }:
             if component is not None:
                 compress_model_transformation(component.model)
+
         self.clear_requests()
+
         return self
 
     def clear_requests(self):
@@ -590,16 +858,28 @@ def clear_requests(self):
                 "`clear_requests()` is not supported with `compile_only` mode, please intialize model without this option"
             )
 
-        self.vae_decoder.request = None
-        self.unet.request = None
-        for component in {self.text_encoder, self.text_encoder_2, self.vae_encoder}:
+        for component in [
+            self.unet,
+            self.transformer,
+            self.vae_encoder,
+            self.vae_decoder,
+            self.text_encoder,
+            self.text_encoder_2,
+            self.text_encoder_3,
+        ]:
             if component is not None:
                 component.request = None
 
     def compile(self):
-        self.vae_decoder._compile()
-        self.unet._compile()
-        for component in {self.text_encoder, self.text_encoder_2, self.vae_encoder}:
+        for component in [
+            self.unet,
+            self.transformer,
+            self.vae_encoder,
+            self.vae_decoder,
+            self.text_encoder,
+            self.text_encoder_2,
+            self.text_encoder_3,
+        ]:
             if component is not None:
                 component._compile()
 
@@ -610,97 +890,301 @@ def _load_config(cls, config_name_or_path: Union[str, os.PathLike], **kwargs):
     def _save_config(self, save_directory):
         self.save_config(save_directory)
 
+    @property
+    def components(self) -> Dict[str, Any]:
+        components = {
+            "vae": self.vae,
+            "unet": self.unet,
+            "transformer": self.transformer,
+            "text_encoder": self.text_encoder,
+            "text_encoder_2": self.text_encoder_2,
+            "text_encoder_3": self.text_encoder_2,
+            "safety_checker": self.safety_checker,
+            "image_encoder": self.image_encoder,
+        }
+        components = {k: v for k, v in components.items() if v is not None}
+        return components
 
-class OVDiffusersModelPart(OVModelPart):
-    CONFIG_NAME = "config.json"
+    def __call__(self, *args, **kwargs):
+        # we do this to keep numpy random states support for now
+        # TODO: deprecate and add warnings when a random state is passed
+
+        args = list(args)
+        for i in range(len(args)):
+            args[i] = np_to_pt_generators(args[i], self.device)
+
+        for k, v in kwargs.items():
+            kwargs[k] = np_to_pt_generators(v, self.device)
+
+        # we use auto_model_class.__call__ here because we can't call super().__call__
+        # as OptimizedModel already defines a __call__ which is the first in the MRO
+        return self.auto_model_class.__call__(self, *args, **kwargs)
 
-    def __init__(
-        self,
-        model: openvino.runtime.Model,
-        parent_model: OVBaseModel,
-        ov_config: Optional[Dict[str, str]] = None,
-        model_name: str = "encoder",
-        model_dir: str = None,
-    ):
-        super().__init__(
-            model=model, parent_model=parent_model, ov_config=ov_config, model_name=model_name, model_dir=model_dir
-        )
-        config_path = self._model_dir / model_name / self.CONFIG_NAME
-        self.config = self.parent_model._dict_from_json_file(config_path) if config_path.is_file() else {}
-        self.input_dtype = {
-            inputs.get_any_name(): OV_TO_NP_TYPE[inputs.get_element_type().get_type_name()]
-            for inputs in self.model.inputs
-        }
 
+class OVPipelinePart(ConfigMixin):
+    config_name: str = CONFIG_NAME
 
-class OVModelTextEncoder(OVDiffusersModelPart):
     def __init__(
         self,
         model: openvino.runtime.Model,
-        parent_model: OVBaseModel,
-        ov_config: Optional[Dict[str, str]] = None,
-        model_name: str = "text_encoder",
+        parent_pipeline: OVDiffusionPipeline,
+        model_name: str = "",
     ):
-        super().__init__(model, parent_model, ov_config, model_name)
+        self.model = model
+        self.model_name = model_name
+        self.parent_pipeline = parent_pipeline
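+        # in compile_only mode the provided model is already a compiled model and is used directly as the inference request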
+        self.request = None if not parent_pipeline._compile_only else self.model
+        self.ov_config = parent_pipeline.ov_config
+
+        if isinstance(parent_pipeline.model_save_dir, TemporaryDirectory):
+            self.model_save_dir = Path(parent_pipeline.model_save_dir.name) / self.model_name
+        else:
+            self.model_save_dir = Path(parent_pipeline.model_save_dir) / self.model_name
 
-    def forward(self, input_ids: np.ndarray):
-        self._compile()
+        config_file_path = self.model_save_dir / self.config_name
 
-        inputs = {
-            "input_ids": input_ids,
-        }
-        outputs = self.request(inputs, share_inputs=True)
-        return list(outputs.values())
+        if not config_file_path.is_file():
+            # config is mandatory for the model part to be used for inference
+            raise ValueError(f"Configuration file for {self.__class__.__name__} not found at {config_file_path}")
 
+        config_dict = self._dict_from_json_file(config_file_path)
+        self.register_to_config(**config_dict)
 
-class OVModelUnet(OVDiffusersModelPart):
-    def __init__(
-        self, model: openvino.runtime.Model, parent_model: OVBaseModel, ov_config: Optional[Dict[str, str]] = None
+    @property
+    def _device(self) -> str:
+        return self.parent_pipeline._device
+
+    @property
+    def device(self) -> torch.device:
+        return self.parent_pipeline.device
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return OV_TO_PT_TYPE[self.ov_config.get("dtype", "f32")]
+
+    def _compile(self):
+        if self.request is None:
+            if (
+                "CACHE_DIR" not in self.ov_config.keys()
+                and not str(self.model_save_dir).startswith(gettempdir())
+                and "GPU" in self._device
+            ):
+                self.ov_config["CACHE_DIR"] = os.path.join(self.model_save_dir, "model_cache")
+
+            logger.info(f"Compiling the {self.model_name} to {self._device} ...")
+            self.request = core.compile_model(self.model, self._device, self.ov_config)
+            # OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html
+            if "OPENVINO_LOG_LEVEL" in os.environ and int(os.environ["OPENVINO_LOG_LEVEL"]) > 2:
+                logger.info(f"{self._device} SUPPORTED_PROPERTIES:")
+                _print_compiled_model_properties(self.request)
+
+    def to(self, *args, device: Optional[str] = None, dtype: Optional[torch.dtype] = None):
+        for arg in args:
+            if isinstance(arg, str):
+                device = arg
+            elif isinstance(arg, torch.dtype):
+                dtype = arg
+
+        if isinstance(device, str):
+            self._device = device.upper()
+            self.request = None
+        elif device is not None:
+            raise ValueError(
+                "The `device` argument should be a string representing the device on which the model should be loaded."
+            )
+
+        if dtype is not None and dtype != self.dtype:
+            raise NotImplementedError(
+                f"Cannot change the dtype of the model from {self.dtype} to {dtype}. "
+                f"Please export the model with the desired dtype."
+            )
+
+        return self
+
+    @abstractmethod
+    def forward(self, *args, **kwargs):
+        raise NotImplementedError
+
+    def __call__(self, *args, **kwargs):
+        return self.forward(*args, **kwargs)
+
+    def modules(self):
+        return []
+
+
+class OVModelTextEncoder(OVPipelinePart):
+    def __init__(self, model: openvino.runtime.Model, parent_pipeline: OVDiffusionPipeline, model_name: str = ""):
+        super().__init__(model, parent_pipeline, model_name)
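+        # collect any intermediate hidden-state outputs exposed by the exported text encoder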
+        self.hidden_states_output_names = [
+            name for out in self.model.outputs for name in out.names if name.startswith("hidden_states")
+        ]
+
+    def forward(
+        self,
+        input_ids: Union[np.ndarray, torch.Tensor],
+        attention_mask: Optional[Union[np.ndarray, torch.Tensor]] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: bool = False,
     ):
-        super().__init__(model, parent_model, ov_config, "unet")
+        self._compile()
+        model_inputs = {"input_ids": input_ids}
+
+        ov_outputs = self.request(model_inputs, share_inputs=True)
+        main_out = ov_outputs[0]
+        model_outputs = {}
+        model_outputs[self.model.outputs[0].get_any_name()] = torch.from_numpy(main_out)
+        if len(self.model.outputs) > 1 and "pooler_output" in self.model.outputs[1].get_any_name():
+            model_outputs["pooler_output"] = torch.from_numpy(ov_outputs[1])
+        if self.hidden_states_output_names and "last_hidden_state" not in model_outputs:
+            model_outputs["last_hidden_state"] = torch.from_numpy(ov_outputs[self.hidden_states_output_names[-1]])
+        if self.hidden_states_output_names and (
+            output_hidden_states or getattr(self.config, "output_hidden_states", False)
+        ):
+            hidden_states = [torch.from_numpy(ov_outputs[out_name]) for out_name in self.hidden_states_output_names]
+            model_outputs["hidden_states"] = hidden_states
+
+        if return_dict:
+            return model_outputs
+        return ModelOutput(**model_outputs)
+
+
+class OVModelUnet(OVPipelinePart):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        if not hasattr(self.config, "time_cond_proj_dim"):
+            logger.warning(
+                "The `time_cond_proj_dim` attribute is missing from the UNet configuration. "
+                "Please re-export the model with newer version of optimum and diffusers."
+            )
+            self.register_to_config(time_cond_proj_dim=None)
 
     def forward(
         self,
-        sample: np.ndarray,
-        timestep: np.ndarray,
-        encoder_hidden_states: np.ndarray,
-        text_embeds: Optional[np.ndarray] = None,
-        time_ids: Optional[np.ndarray] = None,
-        timestep_cond: Optional[np.ndarray] = None,
+        sample: Union[np.ndarray, torch.Tensor],
+        timestep: Union[np.ndarray, torch.Tensor],
+        encoder_hidden_states: Union[np.ndarray, torch.Tensor],
+        text_embeds: Optional[Union[np.ndarray, torch.Tensor]] = None,
+        time_ids: Optional[Union[np.ndarray, torch.Tensor]] = None,
+        timestep_cond: Optional[Union[np.ndarray, torch.Tensor]] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        added_cond_kwargs: Optional[Dict[str, Any]] = None,
+        return_dict: bool = False,
     ):
         self._compile()
 
-        inputs = {
+        model_inputs = {
             "sample": sample,
             "timestep": timestep,
             "encoder_hidden_states": encoder_hidden_states,
         }
 
         if text_embeds is not None:
-            inputs["text_embeds"] = text_embeds
+            model_inputs["text_embeds"] = text_embeds
         if time_ids is not None:
-            inputs["time_ids"] = time_ids
+            model_inputs["time_ids"] = time_ids
         if timestep_cond is not None:
-            inputs["timestep_cond"] = timestep_cond
+            model_inputs["timestep_cond"] = timestep_cond
+        if cross_attention_kwargs is not None:
+            model_inputs.update(cross_attention_kwargs)
+        if added_cond_kwargs is not None:
+            model_inputs.update(added_cond_kwargs)
 
-        outputs = self.request(inputs, share_inputs=True)
-        return list(outputs.values())
+        ov_outputs = self.request(model_inputs, share_inputs=True).to_dict()
 
+        model_outputs = {}
+        for key, value in ov_outputs.items():
+            model_outputs[next(iter(key.names))] = torch.from_numpy(value)
 
-class OVModelVaeDecoder(OVDiffusersModelPart):
-    def __init__(
-        self, model: openvino.runtime.Model, parent_model: OVBaseModel, ov_config: Optional[Dict[str, str]] = None
-    ):
-        super().__init__(model, parent_model, ov_config, "vae_decoder")
+        if return_dict:
+            return model_outputs
+
+        return ModelOutput(**model_outputs)
 
-    def forward(self, latent_sample: np.ndarray):
+
+class OVModelTransformer(OVPipelinePart):
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: torch.FloatTensor = None,
+        pooled_projections: torch.FloatTensor = None,
+        timestep: torch.LongTensor = None,
+        img_ids: torch.Tensor = None,
+        txt_ids: torch.Tensor = None,
+        guidance: torch.Tensor = None,
+        block_controlnet_hidden_states: List = None,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+        return_dict: bool = True,
+    ):
         self._compile()
 
-        inputs = {
-            "latent_sample": latent_sample,
+        model_inputs = {
+            "hidden_states": hidden_states,
+            "timestep": timestep,
+            "encoder_hidden_states": encoder_hidden_states,
+            "pooled_projections": pooled_projections,
         }
-        outputs = self.request(inputs, share_inputs=True)
-        return list(outputs.values())
+
+        if img_ids is not None:
+            model_inputs["img_ids"] = img_ids
+        if txt_ids is not None:
+            model_inputs["txt_ids"] = txt_ids
+        if guidance is not None:
+            model_inputs["guidance"] = guidance
+
+        ov_outputs = self.request(model_inputs, share_inputs=True).to_dict()
+
+        model_outputs = {}
+        for key, value in ov_outputs.items():
+            model_outputs[next(iter(key.names))] = torch.from_numpy(value)
+
+        if return_dict:
+            return model_outputs
+
+        return ModelOutput(**model_outputs)
+
+
+class OVModelVaeEncoder(OVPipelinePart):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        if not hasattr(self.config, "scaling_factor"):
+            logger.warning(
+                "The `scaling_factor` attribute is missing from the VAE encoder configuration. "
+                "Please re-export the model with newer version of optimum and diffusers."
+            )
+            self.register_to_config(scaling_factor=2 ** (len(self.config.block_out_channels) - 1))
+
+    def forward(
+        self,
+        sample: Union[np.ndarray, torch.Tensor],
+        generator: Optional[torch.Generator] = None,
+        return_dict: bool = False,
+    ):
+        self._compile()
+
+        model_inputs = {"sample": sample}
+
+        ov_outputs = self.request(model_inputs, share_inputs=True).to_dict()
+
+        model_outputs = {}
+        for key, value in ov_outputs.items():
+            model_outputs[next(iter(key.names))] = torch.from_numpy(value)
+
+        if "latent_sample" in model_outputs:
+            model_outputs["latents"] = model_outputs.pop("latent_sample")
+
+        if "latent_parameters" in model_outputs:
+            model_outputs["latent_dist"] = DiagonalGaussianDistribution(
+                parameters=model_outputs.pop("latent_parameters")
+            )
+
+        if return_dict:
+            return model_outputs
+
+        return ModelOutput(**model_outputs)
 
     def _compile(self):
         if "GPU" in self._device and "INFERENCE_PRECISION_HINT" not in self.ov_config:
@@ -708,20 +1192,38 @@ def _compile(self):
         super()._compile()
 
 
-class OVModelVaeEncoder(OVDiffusersModelPart):
-    def __init__(
-        self, model: openvino.runtime.Model, parent_model: OVBaseModel, ov_config: Optional[Dict[str, str]] = None
-    ):
-        super().__init__(model, parent_model, ov_config, "vae_encoder")
+class OVModelVaeDecoder(OVPipelinePart):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
 
-    def forward(self, sample: np.ndarray):
+        # can be missing from models exported long ago
+        if not hasattr(self.config, "scaling_factor"):
+            logger.warning(
+                "The `scaling_factor` attribute is missing from the VAE decoder configuration. "
+                "Please re-export the model with newer version of optimum and diffusers."
+            )
+            self.register_to_config(scaling_factor=2 ** (len(self.config.block_out_channels) - 1))
+
+    def forward(
+        self,
+        latent_sample: Union[np.ndarray, torch.Tensor],
+        generator: Optional[torch.Generator] = None,
+        return_dict: bool = False,
+    ):
         self._compile()
 
-        inputs = {
-            "sample": sample,
-        }
-        outputs = self.request(inputs, share_inputs=True)
-        return list(outputs.values())
+        model_inputs = {"latent_sample": latent_sample}
+
+        ov_outputs = self.request(model_inputs, share_inputs=True).to_dict()
+
+        model_outputs = {}
+        for key, value in ov_outputs.items():
+            model_outputs[next(iter(key.names))] = torch.from_numpy(value)
+
+        if return_dict:
+            return model_outputs
+
+        return ModelOutput(**model_outputs)
 
     def _compile(self):
         if "GPU" in self._device and "INFERENCE_PRECISION_HINT" not in self.ov_config:
@@ -729,382 +1231,351 @@ def _compile(self):
         super()._compile()
 
 
-class OVStableDiffusionPipeline(OVStableDiffusionPipelineBase, StableDiffusionPipelineMixin):
-    def __call__(
+class OVModelVae:
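+    # thin wrapper exposing a diffusers-like AutoencoderKL interface (config, encode, decode, to) over the OpenVINO encoder and decoder parts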
+    def __init__(self, decoder: OVModelVaeDecoder, encoder: OVModelVaeEncoder):
+        self.decoder = decoder
+        self.encoder = encoder
+
+    @property
+    def config(self):
+        return self.decoder.config
+
+    @property
+    def dtype(self):
+        return self.decoder.dtype
+
+    @property
+    def device(self):
+        return self.decoder.device
+
+    def decode(self, *args, **kwargs):
+        return self.decoder(*args, **kwargs)
+
+    def encode(self, *args, **kwargs):
+        return self.encoder(*args, **kwargs)
+
+    def to(self, *args, **kwargs):
+        self.decoder.to(*args, **kwargs)
+        if self.encoder is not None:
+            self.encoder.to(*args, **kwargs)
+
+
+class OVStableDiffusionPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusionPipeline):
+    """
+    OpenVINO-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion#diffusers.StableDiffusionPipeline).
+    """
+
+    main_input_name = "prompt"
+    export_feature = "text-to-image"
+    auto_model_class = StableDiffusionPipeline
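+    # usage sketch (the model path and prompt are illustrative):
+    #   pipeline = OVStableDiffusionPipeline.from_pretrained("path/to/exported/openvino/model")
+    #   image = pipeline("a sailboat at sunset").images[0]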
+
+
+class OVStableDiffusionImg2ImgPipeline(
+    OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusionImg2ImgPipeline
+):
+    """
+    OpenVINO-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_img2img#diffusers.StableDiffusionImg2ImgPipeline).
+    """
+
+    main_input_name = "image"
+    export_feature = "image-to-image"
+    auto_model_class = StableDiffusionImg2ImgPipeline
+
+
+class OVStableDiffusionInpaintPipeline(
+    OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusionInpaintPipeline
+):
+    """
+    OpenVINO-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_inpaint#diffusers.StableDiffusionInpaintPipeline).
+    """
+
+    main_input_name = "image"
+    export_feature = "inpainting"
+    auto_model_class = StableDiffusionInpaintPipeline
+
+
+class OVStableDiffusionXLPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusionXLPipeline):
+    """
+    OpenVINO-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLPipeline).
+    """
+
+    main_input_name = "prompt"
+    export_feature = "text-to-image"
+    auto_model_class = StableDiffusionXLPipeline
+
+    def _get_add_time_ids(
         self,
-        prompt: Optional[Union[str, List[str]]] = None,
-        height: Optional[int] = None,
-        width: Optional[int] = None,
-        num_inference_steps: int = 50,
-        guidance_scale: float = 7.5,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        num_images_per_prompt: int = 1,
-        **kwargs,
+        original_size,
+        crops_coords_top_left,
+        target_size,
+        dtype,
+        text_encoder_projection_dim=None,
     ):
-        height = height or self.unet.config.get("sample_size", 64) * self.vae_scale_factor
-        width = width or self.unet.config.get("sample_size", 64) * self.vae_scale_factor
-        _height = self.height
-        _width = self.width
-        expected_batch_size = self._batch_size
+        add_time_ids = list(original_size + crops_coords_top_left + target_size)
 
-        if _height != -1 and height != _height:
-            logger.warning(
-                f"`height` was set to {height} but the static model will output images of height {_height}."
-                "To fix the height, please reshape your model accordingly using the `.reshape()` method."
-            )
-            height = _height
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        return add_time_ids
 
-        if _width != -1 and width != _width:
-            logger.warning(
-                f"`width` was set to {width} but the static model will output images of width {_width}."
-                "To fix the width, please reshape your model accordingly using the `.reshape()` method."
-            )
-            width = _width
-
-        if expected_batch_size != -1:
-            if isinstance(prompt, str):
-                batch_size = 1
-            elif isinstance(prompt, list):
-                batch_size = len(prompt)
-            else:
-                batch_size = kwargs.get("prompt_embeds").shape[0]
-
-            _raise_invalid_batch_size(expected_batch_size, batch_size, num_images_per_prompt, guidance_scale)
-
-        return StableDiffusionPipelineMixin.__call__(
-            self,
-            prompt=prompt,
-            height=height,
-            width=width,
-            num_inference_steps=num_inference_steps,
-            guidance_scale=guidance_scale,
-            negative_prompt=negative_prompt,
-            num_images_per_prompt=num_images_per_prompt,
-            **kwargs,
-        )
 
+class OVStableDiffusionXLImg2ImgPipeline(
+    OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusionXLImg2ImgPipeline
+):
+    """
+    OpenVINO-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLImg2ImgPipeline).
+    """
 
-class OVStableDiffusionImg2ImgPipeline(OVStableDiffusionPipelineBase, StableDiffusionImg2ImgPipelineMixin):
+    main_input_name = "image"
     export_feature = "image-to-image"
+    auto_model_class = StableDiffusionXLImg2ImgPipeline
 
-    def __call__(
+    def _get_add_time_ids(
         self,
-        prompt: Optional[Union[str, List[str]]] = None,
-        image: Union[np.ndarray, PIL.Image.Image] = None,
-        strength: float = 0.8,
-        num_inference_steps: int = 50,
-        guidance_scale: float = 7.5,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        num_images_per_prompt: int = 1,
-        **kwargs,
+        original_size,
+        crops_coords_top_left,
+        target_size,
+        aesthetic_score,
+        negative_aesthetic_score,
+        negative_original_size,
+        negative_crops_coords_top_left,
+        negative_target_size,
+        dtype,
+        text_encoder_projection_dim=None,
     ):
-        _height = self.height
-        _width = self.width
-        expected_batch_size = self._batch_size
-
-        if _height != -1 and _width != -1:
-            image = self.image_processor.preprocess(image, height=_height, width=_width).transpose(0, 2, 3, 1)
-
-        if expected_batch_size != -1:
-            if isinstance(prompt, str):
-                batch_size = 1
-            elif isinstance(prompt, list):
-                batch_size = len(prompt)
-            else:
-                batch_size = kwargs.get("prompt_embeds").shape[0]
-
-            _raise_invalid_batch_size(expected_batch_size, batch_size, num_images_per_prompt, guidance_scale)
-
-        return StableDiffusionImg2ImgPipelineMixin.__call__(
-            self,
-            prompt=prompt,
-            image=image,
-            strength=strength,
-            num_inference_steps=num_inference_steps,
-            guidance_scale=guidance_scale,
-            negative_prompt=negative_prompt,
-            num_images_per_prompt=num_images_per_prompt,
-            **kwargs,
-        )
+        if self.config.requires_aesthetics_score:
+            add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,))
+            add_neg_time_ids = list(
+                negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,)
+            )
+        else:
+            add_time_ids = list(original_size + crops_coords_top_left + target_size)
+            add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size)
+
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype)
 
+        return add_time_ids, add_neg_time_ids
 
-class OVStableDiffusionInpaintPipeline(OVStableDiffusionPipelineBase, StableDiffusionInpaintPipelineMixin):
+
+class OVStableDiffusionXLInpaintPipeline(
+    OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusionXLInpaintPipeline
+):
+    """
+    OpenVINO-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLInpaintPipeline).
+    """
+
+    main_input_name = "image"
     export_feature = "inpainting"
+    auto_model_class = StableDiffusionXLInpaintPipeline
 
-    def __call__(
+    def _get_add_time_ids(
         self,
-        prompt: Optional[Union[str, List[str]]],
-        image: PIL.Image.Image,
-        mask_image: PIL.Image.Image,
-        height: Optional[int] = None,
-        width: Optional[int] = None,
-        num_inference_steps: int = 50,
-        guidance_scale: float = 7.5,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        num_images_per_prompt: int = 1,
-        **kwargs,
+        original_size,
+        crops_coords_top_left,
+        target_size,
+        aesthetic_score,
+        negative_aesthetic_score,
+        negative_original_size,
+        negative_crops_coords_top_left,
+        negative_target_size,
+        dtype,
+        text_encoder_projection_dim=None,
     ):
-        height = height or self.unet.config.get("sample_size", 64) * self.vae_scale_factor
-        width = width or self.unet.config.get("sample_size", 64) * self.vae_scale_factor
-        _height = self.height
-        _width = self.width
-        expected_batch_size = self._batch_size
-
-        if _height != -1 and _width != -1:
-            if height != _height:
-                logger.warning(
-                    f"`height` was set to {height} but the static model will output images of height {_height}."
-                    "To fix the height, please reshape your model accordingly using the `.reshape()` method."
-                )
-                height = _height
+        if self.config.requires_aesthetics_score:
+            add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,))
+            add_neg_time_ids = list(
+                negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,)
+            )
+        else:
+            add_time_ids = list(original_size + crops_coords_top_left + target_size)
+            add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size)
 
-            if width != _width:
-                logger.warning(
-                    f"`width` was set to {width} but the static model will output images of width {_width}."
-                    "To fix the width, please reshape your model accordingly using the `.reshape()` method."
-                )
-                width = _width
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype)
 
-            if isinstance(image, list):
-                image = [self.image_processor.resize(i, _height, _width) for i in image]
-            else:
-                image = self.image_processor.resize(image, _height, _width)
+        return add_time_ids, add_neg_time_ids
 
-            if isinstance(mask_image, list):
-                mask_image = [self.image_processor.resize(i, _height, _width) for i in mask_image]
-            else:
-                mask_image = self.image_processor.resize(mask_image, _height, _width)
 
-        if expected_batch_size != -1:
-            if isinstance(prompt, str):
-                batch_size = 1
-            elif isinstance(prompt, list):
-                batch_size = len(prompt)
-            else:
-                batch_size = kwargs.get("prompt_embeds").shape[0]
-
-            _raise_invalid_batch_size(expected_batch_size, batch_size, num_images_per_prompt, guidance_scale)
-
-        return StableDiffusionInpaintPipelineMixin.__call__(
-            self,
-            prompt=prompt,
-            image=image,
-            mask_image=mask_image,
-            height=height,
-            width=width,
-            num_inference_steps=num_inference_steps,
-            guidance_scale=guidance_scale,
-            negative_prompt=negative_prompt,
-            num_images_per_prompt=num_images_per_prompt,
-            **kwargs,
-        )
+class OVLatentConsistencyModelPipeline(
+    OVDiffusionPipeline, OVTextualInversionLoaderMixin, LatentConsistencyModelPipeline
+):
+    """
+    OpenVINO-powered latent consistency model pipeline corresponding to [diffusers.LatentConsistencyModelPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency#diffusers.LatentConsistencyModelPipeline).
+    """
 
+    main_input_name = "prompt"
+    export_feature = "text-to-image"
+    auto_model_class = LatentConsistencyModelPipeline
 
-class OVStableDiffusionXLPipelineBase(OVStableDiffusionPipelineBase):
-    auto_model_class = StableDiffusionXLPipeline
 
-    def __init__(self, *args, add_watermarker: Optional[bool] = None, **kwargs):
-        super().__init__(*args, **kwargs)
+class OVLatentConsistencyModelImg2ImgPipeline(
+    OVDiffusionPipeline, OVTextualInversionLoaderMixin, LatentConsistencyModelImg2ImgPipeline
+):
+    """
+    OpenVINO-powered latent consistency model pipeline corresponding to [diffusers.LatentConsistencyModelImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency_img2img#diffusers.LatentConsistencyModelImg2ImgPipeline).
+    """
 
-        add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
+    main_input_name = "image"
+    export_feature = "image-to-image"
+    auto_model_class = LatentConsistencyModelImg2ImgPipeline
 
-        if add_watermarker:
-            if not is_invisible_watermark_available():
-                raise ImportError(
-                    "`add_watermarker` requires invisible-watermark to be installed, which can be installed with `pip install invisible-watermark`."
-                )
-            from optimum.pipelines.diffusers.watermark import StableDiffusionXLWatermarker
 
-            self.watermark = StableDiffusionXLWatermarker()
-        else:
-            self.watermark = None
+class OVStableDiffusion3Pipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusion3Pipeline):
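+    """
+    OpenVINO-powered pipeline corresponding to `diffusers.StableDiffusion3Pipeline`.
+    """
+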
+    main_input_name = "prompt"
+    export_feature = "text-to-image"
+    auto_model_class = StableDiffusion3Pipeline
 
 
-class OVStableDiffusionXLPipeline(OVStableDiffusionXLPipelineBase, StableDiffusionXLPipelineMixin):
-    def __call__(
-        self,
-        prompt: Optional[Union[str, List[str]]] = None,
-        height: Optional[int] = None,
-        width: Optional[int] = None,
-        num_inference_steps: int = 50,
-        guidance_scale: float = 5.0,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        num_images_per_prompt: int = 1,
-        **kwargs,
-    ):
-        height = height or self.unet.config["sample_size"] * self.vae_scale_factor
-        width = width or self.unet.config["sample_size"] * self.vae_scale_factor
-        _height = self.height
-        _width = self.width
-        expected_batch_size = self._batch_size
+class OVStableDiffusion3Img2ImgPipeline(
+    OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusion3Img2ImgPipeline
+):
+    main_input_name = "image"
+    export_feature = "image-to-image"
+    auto_model_class = StableDiffusion3Img2ImgPipeline
 
-        if _height != -1 and height != _height:
-            logger.warning(
-                f"`height` was set to {height} but the static model will output images of height {_height}."
-                "To fix the height, please reshape your model accordingly using the `.reshape()` method."
-            )
-            height = _height
 
-        if _width != -1 and width != _width:
-            logger.warning(
-                f"`width` was set to {width} but the static model will output images of width {_width}."
-                "To fix the width, please reshape your model accordingly using the `.reshape()` method."
-            )
-            width = _width
+class OVStableDiffusion3InpaintPipeline(
+    OVDiffusionPipeline, OVTextualInversionLoaderMixin, StableDiffusion3InpaintPipeline
+):
+    main_input_name = "image"
+    export_feature = "inpainting"
+    auto_model_class = StableDiffusion3InpaintPipeline
 
-        if expected_batch_size != -1:
-            if isinstance(prompt, str):
-                batch_size = 1
-            elif isinstance(prompt, list):
-                batch_size = len(prompt)
-            else:
-                batch_size = kwargs.get("prompt_embeds").shape[0]
-
-            _raise_invalid_batch_size(expected_batch_size, batch_size, num_images_per_prompt, guidance_scale)
-
-        return StableDiffusionXLPipelineMixin.__call__(
-            self,
-            prompt=prompt,
-            height=height,
-            width=width,
-            num_inference_steps=num_inference_steps,
-            guidance_scale=guidance_scale,
-            negative_prompt=negative_prompt,
-            num_images_per_prompt=num_images_per_prompt,
-            **kwargs,
-        )
 
+class OVFluxPipeline(OVDiffusionPipeline, OVTextualInversionLoaderMixin, FluxPipeline):
+    main_input_name = "prompt"
+    export_feature = "text-to-image"
+    auto_model_class = FluxPipeline
+
+
+SUPPORTED_OV_PIPELINES = [
+    OVStableDiffusionPipeline,
+    OVStableDiffusionImg2ImgPipeline,
+    OVStableDiffusionInpaintPipeline,
+    OVStableDiffusionXLPipeline,
+    OVStableDiffusionXLImg2ImgPipeline,
+    OVStableDiffusionXLInpaintPipeline,
+    OVLatentConsistencyModelPipeline,
+    OVLatentConsistencyModelImg2ImgPipeline,
+]
+
+
+def _get_ov_class(pipeline_class_name: str, throw_error_if_not_exist: bool = True):
+    for ov_pipeline_class in SUPPORTED_OV_PIPELINES:
+        if (
+            ov_pipeline_class.__name__ == pipeline_class_name
+            or ov_pipeline_class.auto_model_class.__name__ == pipeline_class_name
+        ):
+            return ov_pipeline_class
+
+    if throw_error_if_not_exist:
+        raise ValueError(f"OVDiffusionPipeline can't find a pipeline linked to {pipeline_class_name}")
+
+
+OV_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
+    [
+        ("stable-diffusion", OVStableDiffusionPipeline),
+        ("stable-diffusion-xl", OVStableDiffusionXLPipeline),
+        ("latent-consistency", OVLatentConsistencyModelPipeline),
+    ]
+)
 
-class OVStableDiffusionXLImg2ImgPipeline(OVStableDiffusionXLPipelineBase, StableDiffusionXLImg2ImgPipelineMixin):
-    auto_model_class = StableDiffusionXLImg2ImgPipeline
-    export_feature = "image-to-image"
+OV_IMAGE2IMAGE_PIPELINES_MAPPING = OrderedDict(
+    [
+        ("stable-diffusion", OVStableDiffusionImg2ImgPipeline),
+        ("stable-diffusion-xl", OVStableDiffusionXLImg2ImgPipeline),
+        ("latent-consistency", OVLatentConsistencyModelImg2ImgPipeline),
+    ]
+)
 
-    def __call__(
-        self,
-        prompt: Optional[Union[str, List[str]]] = None,
-        image: Union[np.ndarray, PIL.Image.Image] = None,
-        strength: float = 0.3,
-        num_inference_steps: int = 50,
-        guidance_scale: float = 5.0,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        num_images_per_prompt: int = 1,
-        **kwargs,
-    ):
-        _height = self.height
-        _width = self.width
-        expected_batch_size = self._batch_size
-
-        if _height != -1 and _width != -1:
-            image = self.image_processor.preprocess(image, height=_height, width=_width).transpose(0, 2, 3, 1)
-
-        if expected_batch_size != -1:
-            if isinstance(prompt, str):
-                batch_size = 1
-            elif isinstance(prompt, list):
-                batch_size = len(prompt)
-            else:
-                batch_size = kwargs.get("prompt_embeds").shape[0]
-
-            _raise_invalid_batch_size(expected_batch_size, batch_size, num_images_per_prompt, guidance_scale)
-
-        return StableDiffusionXLImg2ImgPipelineMixin.__call__(
-            self,
-            prompt=prompt,
-            image=image,
-            strength=strength,
-            num_inference_steps=num_inference_steps,
-            guidance_scale=guidance_scale,
-            negative_prompt=negative_prompt,
-            num_images_per_prompt=num_images_per_prompt,
-            **kwargs,
-        )
+OV_INPAINT_PIPELINES_MAPPING = OrderedDict(
+    [
+        ("stable-diffusion", OVStableDiffusionInpaintPipeline),
+        ("stable-diffusion-xl", OVStableDiffusionXLInpaintPipeline),
+    ]
+)
 
+if is_diffusers_version(">=", "0.29.0"):
+    SUPPORTED_OV_PIPELINES.extend(
+        [
+            OVStableDiffusion3Pipeline,
+            OVStableDiffusion3Img2ImgPipeline,
+        ]
+    )
 
-class OVLatentConsistencyModelPipeline(OVStableDiffusionPipelineBase, LatentConsistencyPipelineMixin):
-    def __call__(
-        self,
-        prompt: Optional[Union[str, List[str]]] = None,
-        height: Optional[int] = None,
-        width: Optional[int] = None,
-        num_inference_steps: int = 4,
-        original_inference_steps: int = None,
-        guidance_scale: float = 8.5,
-        num_images_per_prompt: int = 1,
-        **kwargs,
-    ):
-        height = height or self.unet.config["sample_size"] * self.vae_scale_factor
-        width = width or self.unet.config["sample_size"] * self.vae_scale_factor
-        _height = self.height
-        _width = self.width
-        expected_batch_size = self._batch_size
+    OV_TEXT2IMAGE_PIPELINES_MAPPING["stable-diffusion-3"] = OVStableDiffusion3Pipeline
+    OV_IMAGE2IMAGE_PIPELINES_MAPPING["stable-diffusion-3"] = OVStableDiffusion3Img2ImgPipeline
 
-        if _height != -1 and height != _height:
-            logger.warning(
-                f"`height` was set to {height} but the static model will output images of height {_height}."
-                "To fix the height, please reshape your model accordingly using the `.reshape()` method."
-            )
-            height = _height
+if is_diffusers_version(">=", "0.30.0"):
+    SUPPORTED_OV_PIPELINES.extend([OVStableDiffusion3InpaintPipeline, OVFluxPipeline])
+    OV_INPAINT_PIPELINES_MAPPING["stable-diffusion-3"] = OVStableDiffusion3InpaintPipeline
+    OV_TEXT2IMAGE_PIPELINES_MAPPING["flux"] = OVFluxPipeline
 
-        if _width != -1 and width != _width:
-            logger.warning(
-                f"`width` was set to {width} but the static model will output images of width {_width}."
-                "To fix the width, please reshape your model accordingly using the `.reshape()` method."
-            )
-            width = _width
 
-        if expected_batch_size != -1:
-            if isinstance(prompt, str):
-                batch_size = 1
-            elif isinstance(prompt, list):
-                batch_size = len(prompt)
-            else:
-                batch_size = kwargs.get("prompt_embeds").shape[0]
-
-            _raise_invalid_batch_size(expected_batch_size, batch_size, num_images_per_prompt, guidance_scale=0.0)
-
-        return LatentConsistencyPipelineMixin.__call__(
-            self,
-            prompt=prompt,
-            height=height,
-            width=width,
-            num_inference_steps=num_inference_steps,
-            original_inference_steps=original_inference_steps,
-            guidance_scale=guidance_scale,
-            num_images_per_prompt=num_images_per_prompt,
-            **kwargs,
-        )
+SUPPORTED_OV_PIPELINES_MAPPINGS = [
+    OV_TEXT2IMAGE_PIPELINES_MAPPING,
+    OV_IMAGE2IMAGE_PIPELINES_MAPPING,
+    OV_INPAINT_PIPELINES_MAPPING,
+]
 
-    def run_safety_checker(self, image: np.ndarray):
-        if self.safety_checker is None:
-            has_nsfw_concept = None
-        else:
-            # Transpose the image to NHWC
-            image = image.transpose(0, 2, 3, 1)
 
-            feature_extractor_input = self.image_processor.numpy_to_pil(image)
-            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt")
-            image, has_nsfw_concept = self.safety_checker(images=image, clip_input=safety_checker_input.pixel_values)
+def _get_task_ov_class(mapping, pipeline_class_name):
+    def _get_model_name(pipeline_class_name):
+        for ov_pipelines_mapping in SUPPORTED_OV_PIPELINES_MAPPINGS:
+            for model_name, ov_pipeline_class in ov_pipelines_mapping.items():
+                if (
+                    ov_pipeline_class.__name__ == pipeline_class_name
+                    or ov_pipeline_class.auto_model_class.__name__ == pipeline_class_name
+                ):
+                    return model_name
 
-            # Transpose the image back to NCHW
-            image = image.transpose(0, 3, 1, 2)
+    model_name = _get_model_name(pipeline_class_name)
 
-        return image, has_nsfw_concept
+    if model_name is not None:
+        task_class = mapping.get(model_name, None)
+        if task_class is not None:
+            return task_class
 
+    raise ValueError(f"OVPipelineForTask can't find a pipeline linked to {pipeline_class_name} for {model_name}")
 
-def _raise_invalid_batch_size(
-    expected_batch_size: int, batch_size: int, num_images_per_prompt: int, guidance_scale: float
-):
-    current_batch_size = batch_size * num_images_per_prompt * (1 if guidance_scale <= 1 else 2)
-
-    if expected_batch_size != current_batch_size:
-        msg = ""
-        if guidance_scale is not None and guidance_scale <= 1:
-            msg = f"`guidance_scale` was set to {guidance_scale}, static shapes are currently only supported for `guidance_scale` > 1 "
-
-        raise ValueError(
-            "The model was statically reshaped and the pipeline inputs do not match the expected shapes. "
-            f"The `batch_size`, `num_images_per_prompt` and `guidance_scale` were respectively set to {batch_size}, {num_images_per_prompt} and {guidance_scale}. "
-            f"The static model expects an input of size equal to {expected_batch_size} and got the following value instead : {current_batch_size}. "
-            f"To fix this, please either provide a different inputs to your model so that `batch_size` * `num_images_per_prompt` * 2 is equal to {expected_batch_size} "
-            "or reshape it again accordingly using the `.reshape()` method by setting `batch_size` to -1. " + msg
-        )
+
+class OVPipelineForTask(ConfigMixin):
+    auto_model_class = DiffusionPipeline
+    config_name = "model_index.json"
+
+    @classmethod
+    @validate_hf_hub_args
+    def from_pretrained(cls, pretrained_model_or_path, **kwargs):
+        load_config_kwargs = {
+            "force_download": kwargs.get("force_download", False),
+            "resume_download": kwargs.get("resume_download", None),
+            "local_files_only": kwargs.get("local_files_only", False),
+            "cache_dir": kwargs.get("cache_dir", None),
+            "revision": kwargs.get("revision", None),
+            "proxies": kwargs.get("proxies", None),
+            "token": kwargs.get("token", None),
+        }
+        config = cls.load_config(pretrained_model_or_path, **load_config_kwargs)
+        config = config[0] if isinstance(config, tuple) else config
+        class_name = config["_class_name"]
+
+        ov_pipeline_class = _get_task_ov_class(cls.ov_pipelines_mapping, class_name)
+
+        return ov_pipeline_class.from_pretrained(pretrained_model_or_path, **kwargs)
+
+
+class OVPipelineForText2Image(OVPipelineForTask):
+    auto_model_class = AutoPipelineForText2Image
+    ov_pipelines_mapping = OV_TEXT2IMAGE_PIPELINES_MAPPING
+    export_feature = "text-to-image"
+
+
+class OVPipelineForImage2Image(OVPipelineForTask):
+    auto_model_class = AutoPipelineForImage2Image
+    ov_pipelines_mapping = OV_IMAGE2IMAGE_PIPELINES_MAPPING
+    export_feature = "image-to-image"
+
+
+class OVPipelineForInpainting(OVPipelineForTask):
+    auto_model_class = AutoPipelineForInpainting
+    ov_pipelines_mapping = OV_INPAINT_PIPELINES_MAPPING
+    export_feature = "inpainting"
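
A minimal usage sketch of the task-dispatch classes added above (not part of this diff): OVPipelineForTask.from_pretrained reads model_index.json, looks up the stored _class_name in its task mapping, and delegates to the matching OV pipeline class. The optimum.intel import path, the export=True flag, and the checkpoint id below are assumptions for illustration.

    from optimum.intel import OVPipelineForText2Image  # assumed re-export path

    pipeline = OVPipelineForText2Image.from_pretrained(
        "runwayml/stable-diffusion-v1-5",  # placeholder repo whose _class_name is StableDiffusionPipeline
        export=True,  # assumed: convert the checkpoint to OpenVINO IR on load
    )
    image = pipeline("a sailboat at sunset", num_inference_steps=20).images[0]
    image.save("sailboat.png")
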
diff --git a/optimum/intel/openvino/modeling_open_clip.py b/optimum/intel/openvino/modeling_open_clip.py
index 967153a037..ef00c182e5 100644
--- a/optimum/intel/openvino/modeling_open_clip.py
+++ b/optimum/intel/openvino/modeling_open_clip.py
@@ -16,7 +16,6 @@
 import logging
 import os
 from pathlib import Path
-from tempfile import TemporaryDirectory
 from typing import Dict, Optional, Union
 
 import numpy as np
@@ -39,6 +38,7 @@
 from ..utils.modeling_utils import _find_files_matching_pattern, _OpenClipForZeroShotImageClassification
 from .configuration import OVConfig, OVWeightQuantizationConfig
 from .modeling import MODEL_START_DOCSTRING, OVModel
+from .utils import TemporaryDirectory
 
 
 logger = logging.getLogger(__name__)
@@ -152,7 +152,7 @@ def from_pretrained(
 
             ov_files = _find_files_matching_pattern(
                 model_dir,
-                pattern=r"(.*)?openvino(.*)?\_model\_(.*)?.xml",
+                pattern=r"(.*)?openvino(.*)?\_model\_(.*)?.xml$",
                 subfolder=subfolder,
                 use_auth_token=token,
                 revision=revision,
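
A small self-contained check of the stricter pattern introduced above; the search semantics of _find_files_matching_pattern are assumed here, and the file names are illustrative.

    import re

    pattern_old = r"(.*)?openvino(.*)?\_model\_(.*)?.xml"
    pattern_new = r"(.*)?openvino(.*)?\_model\_(.*)?.xml$"

    candidates = ["openvino_text_model_fp16.xml", "openvino_text_model_fp16.xml.bak"]
    print([f for f in candidates if re.search(pattern_old, f)])  # both names match
    print([f for f in candidates if re.search(pattern_new, f)])  # only the name ending in .xml matches
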
diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index cf6aee7b10..a1b531a1f5 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -1,8 +1,9 @@
+import copy
 import logging
 import os
 import warnings
+from abc import abstractmethod
 from pathlib import Path
-from tempfile import TemporaryDirectory
 from typing import Dict, Optional, Tuple, Union
 
 import numpy as np
@@ -11,14 +12,42 @@
 from huggingface_hub import hf_hub_download
 from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
 from openvino._offline_transformations import apply_moc_transformations, compress_model_transformation
-from transformers import AutoConfig, GenerationConfig, GenerationMixin, PretrainedConfig
+from PIL.Image import Image
+from transformers import (
+    AutoConfig,
+    AutoImageProcessor,
+    AutoModelForCausalLM,
+    GenerationConfig,
+    GenerationMixin,
+    PretrainedConfig,
+    PreTrainedTokenizer,
+)
 from transformers.modeling_outputs import BaseModelOutputWithPooling
 
 from ...exporters.openvino import main_export
-from ...exporters.openvino.stateful import ensure_stateful_is_available
+from ...exporters.openvino.stateful import ensure_stateful_is_available, model_has_input_output_name
+from ...exporters.openvino.utils import save_config
+from .. import OVQuantizer
 from .configuration import OVConfig, OVWeightQuantizationConfig
 from .modeling_base import OVBaseModel, OVModelPart
 from .modeling_decoder import CausalLMOutputWithPast, OVModelForCausalLM
+from .utils import (
+    OV_LANGUAGE_MODEL_NAME,
+    OV_TEXT_EMBEDDINGS_MODEL_NAME,
+    OV_VISION_EMBEDDINGS_MODEL_NAME,
+    TemporaryDirectory,
+)
+
+
+try:
+    from transformers import LlavaForConditionalGeneration
+except ImportError:
+    LlavaForConditionalGeneration = None
+
+try:
+    from transformers import LlavaNextForConditionalGeneration
+except ImportError:
+    LlavaNextForConditionalGeneration = None
 
 
 logger = logging.getLogger(__name__)
@@ -55,13 +84,19 @@ def __init__(
     def compile(self):
         if self.request is None:
             logger.info(f"Compiling the Language model to {self._device} ...")
-            self.request = core.compile_model(self.model, self._device, self.ov_config).create_infer_request()
+            super().compile()
         self._compile_text_emb()
 
     def _compile_text_emb(self):
         if self.text_emb_request is None:
             logger.info(f"Compiling the Text embeddings model to {self._device} ...")
-            self.text_emb_request = core.compile_model(self.text_emb_model, self._device, self.ov_config)
+            if self._compile_only:
+                self.text_emb_request = self.text_emb_model
+            else:
+                logger.info(f"Compiling the Text embeddings model to {self._device} ...")
+                self.text_emb_request = self._compile_model(
+                    self.text_emb_model, self._device, self.ov_config, self.model_save_dir
+                )
 
     def clear_requests(self):
         if self._compile_only:
@@ -122,8 +157,8 @@ def prepare_inputs(
             else:
                 position_ids = np.cumsum(attention_mask, axis=1) - 1
                 position_ids[attention_mask == 0] = 1
-                if past_key_values:
-                    position_ids = position_ids[:, -input_ids.shape[1] :]
+            if past_len:
+                position_ids = position_ids[:, -inputs_embeds.shape[1] :]
 
             inputs["position_ids"] = position_ids
 
@@ -171,14 +206,23 @@ def __init__(self, model: ov.Model, parent_model: OVBaseModel) -> None:
         super().__init__(model, parent_model, model_name=self._model_name)
         self.output_dtypes = {key.get_any_name(): key.get_element_type().get_type_name() for key in self.model.outputs}
         self.output_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.outputs)}
+        self.input_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.inputs)}
         self.hidden_states_output_names = []
         if len(self.model.outputs) > 2:
             self.hidden_states_output_names = [
                 key.get_any_name() for key in self.model.outputs[2:] if "hidden_states" in key.get_any_name()
             ]
+        self.input_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.inputs)}
+        self._main_input = "images" if model_has_input_output_name(self.model, "images") else "pixel_values"
 
     def forward(self, pixel_values, **kwargs):
-        result = self.request({"pixel_values": pixel_values})
+        self._compile()
+        inputs = {self._main_input: pixel_values}
+        if len(self.input_names) > 1:
+            for name in self.input_names:
+                if name in kwargs:
+                    inputs[name] = kwargs[name]
+        result = self.request(inputs)
         last_hidden_state = result[0]
         hidden_states = None
         pooler_out = None
@@ -193,12 +237,42 @@ def forward(self, pixel_values, **kwargs):
         )
 
 
-MODEL_PARTS_CLS_MAPPING = {}
+class OVResampler(OVModelPart):
+    _model_name = "resampler"
+
+    def __init__(self, model: ov.Model, parent_model: OVBaseModel) -> None:
+        super().__init__(model, parent_model, model_name=self._model_name)
+        self.output_dtypes = {key.get_any_name(): key.get_element_type().get_type_name() for key in self.model.outputs}
+        self.output_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.outputs)}
+
+    def forward(self, image_feature, pos_embed, key_padding_mask):
+        self._compile()
+        result = self.request(
+            {"image_feature": image_feature, "pos_embed": pos_embed, "key_padding_mask": key_padding_mask}
+        )[0]
+        return result
+
+
+class OVVisionProjection(OVModelPart):
+    _model_name = "vision_projection"
+
+    def forward(self, img_features):
+        self._compile()
+        return self.request(img_features)[0]
+
+
+MODEL_PARTS_CLS_MAPPING = {
+    "resampler": OVResampler,
+    "language_model": OVModelWithEmbedForCausalLM,
+    "vision_embeddings": OVVisionEmbedding,
+    "vision_projection": OVVisionProjection,
+}
 
 
 class OVModelForVisualCausalLM(OVBaseModel, GenerationMixin):
     export_feature = "image-text-to-text"
     additional_parts = []
+    auto_model_class = AutoModelForCausalLM
 
     def __init__(
         self,
@@ -221,7 +295,7 @@ def __init__(
         self.ov_config = {} if ov_config is None else {**ov_config}
         self.preprocessors = kwargs.get("preprocessors", [])
         self.lm_model = language_model
-        self.text_embdings_model = text_embeddings
+        self.text_embeddings_model = text_embeddings
         self.vision_embeddings_model = vision_embeddings
         self._supports_cache_class = False
         self.main_input_name = "input_ids"
@@ -238,13 +312,13 @@ def __init__(
         self._set_ov_config_parameters()
         self.language_model = OVModelWithEmbedForCausalLM(
             self.lm_model,
-            self.text_embdings_model,
+            self.text_embeddings_model,
             config=config,
-            deivce=device,
+            device=device,
             ov_config=ov_config,
             model_save_dir=model_save_dir,
             quantization_config=quantization_config,
-            compile=not self._compile_only,
+            compile=self._compile_only or enable_compilation,
             compile_only=self._compile_only,
         )
         self.vision_embeddings = OVVisionEmbedding(self.vision_embeddings_model, self)
@@ -264,13 +338,28 @@ def __init__(
         except AttributeError:
             pass
 
+    def clear_requests(self):
+        if self._compile_only:
+            raise ValueError(
+                "`clear_requests()` is not supported with `compile_only` mode, please initialize the model without this option"
+            )
+
+        for _, component in self.components.items():
+            component.clear_requests()
+
     def compile(self):
-        self.language_model.compile()
-        self.vision_embeddings._compile()
-        for part in self.additional_parts:
-            part_model = getattr(self, part, None)
-            if part_model is not None:
-                part_model._compile()
+        for _, component in self.components.items():
+            if isinstance(component, OVModelPart):
+                component._compile()
+            else:
+                component.compile()
+
+    def _save_config(self, save_directory):
+        """
+        Saves a model configuration into a directory, so that it can be re-loaded using the
+        [`from_pretrained`] class method.
+        """
+        save_config(self.config, save_directory)
 
     def _save_pretrained(self, save_directory: Union[str, Path]):
         """
@@ -281,21 +370,21 @@ def _save_pretrained(self, save_directory: Union[str, Path]):
             save_directory (`str` or `Path`):
                 The directory where to save the model files.
         """
-        src_files = [self.lm_model, self.text_embdings_model, self.vision_embeddings_model]
-        dst_file_names = [
-            "openvino_language_model.xml",
-            "openvino_text_embeddings_model.xml",
-            "openvino_vision_embeddings.xml",
-        ]
-        for part in self.additional_parts:
-            model = getattr(self, f"{part}_model", None)
-            if model is not None:
-                src_files.append(model)
-                dst_file_names.append(f"openvino_{part}_model.xml")
+        src_models = self.submodels
+        dst_file_names = {
+            "lm_model": OV_LANGUAGE_MODEL_NAME,
+            "text_embeddings_model": OV_TEXT_EMBEDDINGS_MODEL_NAME,
+            "vision_embeddings_model": OV_VISION_EMBEDDINGS_MODEL_NAME,
+        }
+        for name in self._submodel_names:
+            if name not in dst_file_names:
+                dst_file_names[name] = f"openvino_{name}.xml"
 
-        for src_file, dst_file_name in zip(src_files, dst_file_names):
+        for name in self._submodel_names:
+            model = src_models[name]
+            dst_file_name = dst_file_names[name]
             dst_path = os.path.join(save_directory, dst_file_name)
-            ov.save_model(src_file, dst_path, compress_to_fp16=False)
+            ov.save_model(model, dst_path, compress_to_fp16=False)
 
         self._save_openvino_config(save_directory)
         if self.generation_config is not None:
@@ -364,26 +453,22 @@ def _from_pretrained(
                 raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.")
             token = use_auth_token
 
-        model_cls = MODEL_TYPE_TO_CLS_MAPPING[config.model_type]
-
-        quantization_config = model_cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)
-        compile_only = kwargs.get("compile_only", False)
-
-        # Load model from a local directory
-        if os.path.isdir(model_id):
-            model_save_dir = Path(model_id)
         model_file_names = {
-            "language_model": "openvino_language_model.xml",
-            "text_embeddings": "openvino_text_embeddings_model.xml",
-            "vision_embeddings": "openvino_vision_embeddings_model.xml",
+            "language_model": OV_LANGUAGE_MODEL_NAME,
+            "language_model_bin": OV_LANGUAGE_MODEL_NAME.replace(".xml", ".bin"),
+            "text_embeddings": OV_TEXT_EMBEDDINGS_MODEL_NAME,
+            "text_embeddings_bin": OV_TEXT_EMBEDDINGS_MODEL_NAME.replace(".xml", ".bin"),
+            "vision_embeddings": OV_VISION_EMBEDDINGS_MODEL_NAME,
+            "vision_embeddings_bin": OV_VISION_EMBEDDINGS_MODEL_NAME.replace(".xml", ".bin"),
         }
 
+        model_cls = MODEL_TYPE_TO_CLS_MAPPING[config.model_type]
         for part in model_cls.additional_parts:
             model_file_names[part] = f"openvino_{part}_model.xml"
-        model_cls = MODEL_TYPE_TO_CLS_MAPPING[config.model_type]
-        quantization_config = model_cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)
+            model_file_names[part + "_bin"] = f"openvino_{part}_model.bin"
         compile_only = kwargs.get("compile_only", False)
         if os.path.isdir(model_id):
+            # Load model from a local directory
             model_save_dir = Path(model_id)
             file_names = {k: os.path.join(model_id, model_file_names[k]) for k in model_file_names}
         else:
@@ -401,11 +486,11 @@ def _from_pretrained(
                 file_names[name] = model_cache_path
             model_save_dir = Path(model_cache_path).parent
         if not compile_only:
-            language_model = model_cls.load_model(file_names["language_model"], quantization_config)
-            text_embeddings = model_cls.load_model(file_names["text_embeddings"], quantization_config)
-            vision_embeddings = model_cls.load_model(file_names["vision_embeddings"], quantization_config)
+            language_model = model_cls.load_model(file_names["language_model"])
+            text_embeddings = model_cls.load_model(file_names["text_embeddings"])
+            vision_embeddings = model_cls.load_model(file_names["vision_embeddings"])
             for part in model_cls.additional_parts:
-                kwargs[part] = model_cls.load_model(file_names[part], quantization_config)
+                kwargs[part] = model_cls.load_model(file_names[part])
         else:
             language_model = model_cls._compile_model(
                 file_names["language_model"],
@@ -445,7 +530,12 @@ def _from_pretrained(
         except Exception:
             pass
 
-        return model_cls(
+        quantization_config = model_cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)
+        to_quantize = not compile_only and quantization_config is not None
+        if to_quantize:
+            kwargs["compile"] = False
+
+        model = model_cls(
             language_model=language_model,
             text_embeddings=text_embeddings,
             vision_embeddings=vision_embeddings,
@@ -455,6 +545,15 @@ def _from_pretrained(
             **kwargs,
         )
 
+        if to_quantize:
+            quantization_config_copy = copy.deepcopy(quantization_config)
+            quantization_config_copy.tokenizer = quantization_config.tokenizer or model_id
+            potential_processor_id = config.mm_vision_tower if isinstance(model, _OVNanoLlavaForCausalLM) else model_id
+            quantization_config_copy.processor = quantization_config.processor or potential_processor_id
+            OVQuantizer(model).quantize(ov_config=OVConfig(quantization_config=quantization_config_copy))
+
+        return model
+
     @classmethod
     def _from_transformers(
         cls,
@@ -513,7 +612,7 @@ def _from_transformers(
             ov_config=ov_config,
             stateful=stateful,
         )
-        config = AutoConfig.from_pretrained(save_dir_path)
+        config = AutoConfig.from_pretrained(save_dir_path, trust_remote_code=trust_remote_code)
         return cls._from_pretrained(
             model_id=save_dir_path,
             config=config,
@@ -523,6 +622,28 @@ def _from_transformers(
             **kwargs,
         )
 
+    @property
+    def _component_names(self):
+        base_components = ["language_model", "vision_embeddings"]
+        additional_components = [part for part in self.additional_parts if getattr(self, part, None) is not None]
+        return base_components + additional_components
+
+    @property
+    def components(self):
+        return {component_name: getattr(self, component_name) for component_name in self._component_names}
+
+    @property
+    def _submodel_names(self):
+        model_names = ["lm_model", "text_embeddings_model", "vision_embeddings_model"]
+        for part in self.additional_parts:
+            if getattr(self, part, None) is not None:
+                model_names.append(part + "_model")
+        return model_names
+
+    @property
+    def submodels(self):
+        return {submodel_name: getattr(self, submodel_name) for submodel_name in self._submodel_names}
+
     def reshape(self, batch_size: int, sequence_length: int):
         logger.warning("Static shapes are not supported for causal language model.")
         return self
@@ -531,28 +652,27 @@ def half(self):
         """
         Converts all the model weights to FP16 for more efficient inference on GPU.
         """
-        apply_moc_transformations(self.lm_model, cf=False)
-        compress_model_transformation(self.lm_model)
-        apply_moc_transformations(self.text_embdings_model, cf=False)
-        compress_model_transformation(self.text_embdings_model)
-        apply_moc_transformations(self.vision_embeddings_model, cf=False)
-        compress_model_transformation(self.vision_embeddings_model)
-        for part in self.additional_parts:
-            model = getattr(self, f"{part}_model", None)
-            if model is not None:
-                apply_moc_transformations(model, cf=False)
-                compress_model_transformation(model)
+        for _, submodel in self.submodels.items():
+            apply_moc_transformations(submodel, cf=False)
+            compress_model_transformation(submodel)
+        return self
+
+    def to(self, device):
+        self.language_model.to(device)
+        super().to(device)
         return self
 
     def forward(
         self,
         input_ids,
-        pixel_values,
+        pixel_values=None,
         past_key_values=None,
         inputs_embeds=None,
         image_sizes=None,
         attention_mask=None,
         position_ids=None,
+        image_bound=None,
+        tgt_sizes=None,
         **kwargs,
     ):
         inputs_embeds, attention_mask, position_ids = self.get_multimodal_embeddings(
@@ -562,6 +682,8 @@ def forward(
             attention_mask=attention_mask,
             position_ids=position_ids,
             past_key_values=past_key_values,
+            image_bound=image_bound,
+            tgt_sizes=tgt_sizes,
             **kwargs,
         )
         return self.language_model.forward(
@@ -604,6 +726,7 @@ def get_multimodal_embeddings(
                 )
         return inputs_embeds, attention_mask, position_ids
 
+    # Adopted from https://github.com/huggingface/transformers/blob/v4.44.2/src/transformers/models/llava/modeling_llava.py#L521
     def prepare_inputs_for_generation(
         self,
         input_ids,
@@ -621,21 +744,22 @@ def prepare_inputs_for_generation(
             # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
             # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
             # input)
-            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
-                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
+            if attention_mask is not None and past_length + 1 > input_ids.shape[1]:
+                input_discount = max(attention_mask.shape[1] - past_length, 1)
+                input_ids = input_ids[:, -input_discount:]
             # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
             # input_ids based on the past_length.
             elif past_length < input_ids.shape[1]:
                 input_ids = input_ids[:, past_length:]
             # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
-            elif self.config.image_token_index in input_ids:
+            elif getattr(self.config, "image_token_index", -1) in input_ids:
                 input_ids = input_ids[:, input_ids.shape[1] - 1 :]
 
         position_ids = kwargs.get("position_ids", None)
         if attention_mask is not None and position_ids is None:
             position_ids = attention_mask.long().cumsum(-1) - 1
             position_ids.masked_fill_(attention_mask == 0, 1)
-            if past_key_values:
+            if past_key_values is not None:
                 position_ids = position_ids[:, -input_ids.shape[1] :]
 
         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
@@ -652,6 +776,8 @@ def prepare_inputs_for_generation(
                 "attention_mask": attention_mask,
                 "pixel_values": pixel_values,
                 "image_sizes": image_sizes,
+                "image_bound": kwargs.get("image_bound"),
+                "tgt_sizes": kwargs.get("tgt_sizes"),
             }
         )
         return model_inputs
@@ -660,8 +786,50 @@ def can_generate(self):
         """Returns True to validate the check that the model using `GenerationMixin.generate()` can indeed generate."""
         return True
 
+    @staticmethod
+    @abstractmethod
+    def preprocess_inputs(
+        text: str,
+        image: Optional[Image] = None,
+        processor: Optional[AutoImageProcessor] = None,
+        tokenizer: Optional[PreTrainedTokenizer] = None,
+        config: Optional[PretrainedConfig] = None,
+    ):
+        """
+        Preprocess an input instruction and an optional image.
+        """
+
 
 class _OVLlavaForCausalLM(OVModelForVisualCausalLM):
+    auto_model_class = LlavaForConditionalGeneration
+
+    def __init__(
+        self,
+        language_model: ov.Model,
+        text_embeddings: ov.Model,
+        vision_embeddings: ov.Model,
+        config: PretrainedConfig = None,
+        device: str = "CPU",
+        dynamic_shapes: bool = True,
+        ov_config: Optional[Dict[str, str]] = None,
+        model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
+        quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
+        **kwargs,
+    ):
+        super().__init__(
+            language_model=language_model,
+            text_embeddings=text_embeddings,
+            vision_embeddings=vision_embeddings,
+            config=config,
+            device=device,
+            dynamic_shapes=dynamic_shapes,
+            ov_config=ov_config,
+            model_save_dir=model_save_dir,
+            quantization_config=quantization_config,
+            **kwargs,
+        )
+        self._support_new_processing = hasattr(self.config, "image_seq_length")
+
     def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs):
         if input_ids is not None and input_ids.shape[1] == 1:
             return None
@@ -690,17 +858,11 @@ def merge_vision_text_embeddings(
         input_ids,
         attention_mask,
         position_ids=None,
-        legacy_processing=None,
+        legacy_processing=False,
         **kwargs,
     ):
         image_features = torch.from_numpy(vision_embeds) if isinstance(vision_embeds, np.ndarray) else vision_embeds
         inputs_embeds = torch.from_numpy(inputs_embeds) if isinstance(inputs_embeds, np.ndarray) else inputs_embeds
-        if legacy_processing is None:
-            legacy_processing = (
-                not hasattr(self.config, "image_seq_length")
-                or ((input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length)
-                or (input_ids.shape[-1] == 1)
-            )
 
         if legacy_processing:
             pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
@@ -733,15 +895,6 @@ def merge_vision_text_embeddings(
             final_attention_mask = torch.zeros(
                 batch_size, max_embed_dim, dtype=attention_mask.dtype, device=inputs_embeds.device
             )
-            # In case the Vision model or the Language model has been offloaded to CPU, we need to manually
-            # set the corresponding tensors into their correct target device.
-            target_device = inputs_embeds.device
-            batch_indices, non_image_indices, text_to_overwrite = (
-                batch_indices.to(target_device),
-                non_image_indices.to(target_device),
-                text_to_overwrite.to(target_device),
-            )
-            attention_mask = attention_mask.to(target_device)
 
             # 4. Fill the embeddings based on the mask. If we have ["hey" "<image>", "how", "are"]
             # we need to index copy on [0, 577, 578, 579] for the text and [1:576] for the image features
@@ -752,7 +905,7 @@ def merge_vision_text_embeddings(
                 (batch_size, max_embed_dim), True, dtype=torch.bool, device=inputs_embeds.device
             )
             image_to_overwrite[batch_indices, text_to_overwrite] = False
-            image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(target_device)
+            image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None]
 
             if image_to_overwrite.sum() != image_features.shape[:-1].numel():
                 raise ValueError(
@@ -760,7 +913,7 @@ def merge_vision_text_embeddings(
                     f" the number of image given to the model is {num_images}. This prevents correct indexing and breaks batch generation."
                 )
 
-            final_embedding[image_to_overwrite] = image_features.contiguous().reshape(-1, embed_dim).to(target_device)
+            final_embedding[image_to_overwrite] = image_features.contiguous().reshape(-1, embed_dim)
             final_attention_mask |= image_to_overwrite
             position_ids = (final_attention_mask.cumsum(-1) - 1).masked_fill_((final_attention_mask == 0), 1)
 
@@ -780,11 +933,12 @@ def merge_vision_text_embeddings(
     def get_multimodal_embeddings(
         self, input_ids, pixel_values=None, attention_mask=None, position_ids=None, past_key_values=None, **kwargs
     ):
-        legacy_processing = (
-            not hasattr(self.config, "image_seq_length")
-            or ((input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length)
-            or (input_ids.shape[-1] == 1 and pixel_values is not None)
-        )
+        if pixel_values is not None and self._support_new_processing and past_key_values is None:
+            legacy_processing = (input_ids == self.config.image_token_index).sum(
+                1
+            ).max() < self.config.image_seq_length
+        else:
+            legacy_processing = True
         inputs_embeds, attention_mask, position_ids = super().get_multimodal_embeddings(
             input_ids, pixel_values, attention_mask, position_ids, legacy_processing=legacy_processing, **kwargs
         )
@@ -795,19 +949,9 @@ def get_multimodal_embeddings(
         return inputs_embeds, attention_mask, position_ids
 
     def _filter_unattended_tokens(self, input_ids, attention_mask, past_key_values):
-        if not self.language_model.stateful:
-            first_layer_past_key_value = torch.from_numpy(past_key_values[0][0][:, :, :, 0])
-        else:
-            first_layer_past_key_value = torch.from_numpy(
-                self.language_model.request.query_state()[0].state.data[:, :, :, 0]
-            )
-
-        # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
-        batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
-
         # Get the target length
         target_length = input_ids.shape[1]
-        past_length = first_layer_past_key_value.shape[-1]
+        past_length = self.language_model._get_past_length(past_key_values)
 
         extended_attention_mask = torch.ones(
             (attention_mask.shape[0], past_length),
@@ -815,22 +959,38 @@ def _filter_unattended_tokens(self, input_ids, attention_mask, past_key_values):
             device=attention_mask.device,
         )
 
-        # Filter out only the tokens that can be un-attended, this can happen
-        # if one uses Llava + Fused modules where the cache on the
-        # first iteration is already big enough, or if one passes custom cache
-        valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
-        new_batch_index = batch_index[valid_indices]
-        new_non_attended_tokens = non_attended_tokens[valid_indices]
-
-        # Zero-out the places where we don't need to attend
-        extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
-
         attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1)
-        position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
+        position_ids = torch.cumsum(attention_mask, axis=1) - 1
+        position_ids[attention_mask == 0] = 1
         return attention_mask, position_ids
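
A tiny standalone check of the cumsum-based position ids used above (illustrative values only):

    import torch

    # Left-padded batch: padding positions are clamped to 1, attended tokens count up from 0.
    attention_mask = torch.tensor([[0, 0, 1, 1, 1]])
    position_ids = torch.cumsum(attention_mask, axis=1) - 1
    position_ids[attention_mask == 0] = 1
    print(position_ids)  # tensor([[1, 1, 0, 1, 2]])
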
 
+    @staticmethod
+    def preprocess_inputs(
+        text: str,
+        image: Optional[Image] = None,
+        processor: Optional[AutoImageProcessor] = None,
+        tokenizer: Optional[PreTrainedTokenizer] = None,
+        config: Optional[PretrainedConfig] = None,
+    ):
+        if processor is None:
+            raise ValueError("Processor is required.")
+        if getattr(processor, "chat_template", None) is not None:
+            chat_prompt = [{"role": "user", "content": [{"type": "text", "text": text}]}]
+            if image is not None:
+                chat_prompt[0]["content"].append({"type": "image"})
+            prompt = processor.apply_chat_template(chat_prompt, add_generation_prompt=True, tokenize=False)
+        else:
+            if image is not None and "<image>" not in text:
+                prompt = "<image>\n" + text
+            else:
+                prompt = text
+        inputs = processor(images=image, text=prompt, return_tensors="pt")
+        return inputs
+
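
A hedged end-to-end sketch of how preprocess_inputs is meant to be used; the optimum.intel import path, the export=True flag, and the checkpoint id are assumptions, and any LLaVA checkpoint with a processor should behave similarly.

    from PIL import Image
    from transformers import AutoProcessor
    from optimum.intel import OVModelForVisualCausalLM  # assumed re-export path

    model_id = "llava-hf/llava-1.5-7b-hf"  # placeholder checkpoint
    processor = AutoProcessor.from_pretrained(model_id)
    model = OVModelForVisualCausalLM.from_pretrained(model_id, export=True)

    image = Image.open("cat.png")  # any local RGB image
    inputs = model.preprocess_inputs("What is in this image?", image=image, processor=processor)
    output_ids = model.generate(**inputs, max_new_tokens=50)
    print(processor.batch_decode(output_ids, skip_special_tokens=True)[0])
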
 
 class _OVLlavaNextForCausalLM(_OVLlavaForCausalLM):
+    auto_model_class = LlavaNextForConditionalGeneration
+
     # Adopted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava_next/modeling_llava_next.py#L655
     def pack_image_features(self, image_features, image_sizes, image_newline=None):
         from transformers.models.llava_next.modeling_llava_next import get_anyres_image_grid_shape, unpad_image
@@ -903,11 +1063,13 @@ def get_multimodal_embeddings(
 
         inputs_embeds = self.get_text_embeddings(input_ids, **kwargs)
 
-        legacy_processing = (
-            not hasattr(self.config, "image_seq_length")
-            or ((input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length)
-            or (input_ids.shape[-1] == 1 and pixel_values is not None)
-        )
+        if pixel_values is not None and self._support_new_processing and past_key_values is None:
+            legacy_processing = (input_ids == self.config.image_token_index).sum(
+                1
+            ).max() < self.config.image_seq_length
+        else:
+            legacy_processing = True
+
         if pixel_values is not None and pixel_values.size(0) > 0:
             # ! infer image_num_patches from image_sizes
             image_num_patches = [
@@ -961,7 +1123,7 @@ def merge_vision_text_embeddings(
         input_ids,
         attention_mask,
         position_ids=None,
-        legacy_processing=None,
+        legacy_processing=False,
         **kwargs,
     ):
         image_token_index = self.config.image_token_index
@@ -1099,7 +1261,7 @@ def get_text_embeddings(self, input_ids, **kwargs):
         return super().get_text_embeddings(for_inputs_embeds_ids, **kwargs)
 
 
-class _OvInternVLForCausalLM(OVModelForVisualCausalLM):
+class _OVInternVLForCausalLM(OVModelForVisualCausalLM):
     def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs):
         if input_ids is not None and input_ids.shape[1] == 1:
             return None
@@ -1122,9 +1284,760 @@ def merge_vision_text_embeddings(
         input_embeds = input_embeds.reshape(B, N, C)
         return input_embeds, attention_mask, position_ids
 
+    @staticmethod
+    def preprocess_inputs(
+        text: str,
+        image: Optional[Image] = None,
+        processor: Optional[AutoImageProcessor] = None,
+        tokenizer: Optional[PreTrainedTokenizer] = None,
+        config: Optional[PretrainedConfig] = None,
+    ):
+        if tokenizer is None:
+            raise ValueError("Tokenizer is required.")
+        import torchvision.transforms as T
+        from torchvision.transforms.functional import InterpolationMode
+
+        IMG_START_TOKEN = "<img>"
+        IMG_END_TOKEN = "</img>"
+        IMG_CONTEXT_TOKEN = "<IMG_CONTEXT>"
+
+        IMAGENET_MEAN = (0.485, 0.456, 0.406)
+        IMAGENET_STD = (0.229, 0.224, 0.225)
+
+        def build_transform(input_size):
+            MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
+            transform = T.Compose(
+                [
+                    T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
+                    T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
+                    T.ToTensor(),
+                    T.Normalize(mean=MEAN, std=STD),
+                ]
+            )
+            return transform
+
+        def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
+            best_ratio_diff = float("inf")
+            best_ratio = (1, 1)
+            area = width * height
+            for ratio in target_ratios:
+                target_aspect_ratio = ratio[0] / ratio[1]
+                ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+                if ratio_diff < best_ratio_diff:
+                    best_ratio_diff = ratio_diff
+                    best_ratio = ratio
+                elif ratio_diff == best_ratio_diff:
+                    if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+                        best_ratio = ratio
+            return best_ratio
+
+        def dynamic_preprocess(image, min_num=1, max_num=12, image_size=28, use_thumbnail=False):
+            orig_width, orig_height = image.size
+            aspect_ratio = orig_width / orig_height
+
+            # calculate the existing image aspect ratio
+            target_ratios = {
+                (i, j)
+                for n in range(min_num, max_num + 1)
+                for i in range(1, n + 1)
+                for j in range(1, n + 1)
+                if i * j <= max_num and i * j >= min_num
+            }
+            target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+            # find the closest aspect ratio to the target
+            target_aspect_ratio = find_closest_aspect_ratio(
+                aspect_ratio, target_ratios, orig_width, orig_height, image_size
+            )
+
+            # calculate the target width and height
+            target_width = image_size * target_aspect_ratio[0]
+            target_height = image_size * target_aspect_ratio[1]
+            blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+
+            # resize the image
+            resized_img = image.resize((target_width, target_height))
+            processed_images = []
+            for i in range(blocks):
+                box = (
+                    (i % (target_width // image_size)) * image_size,
+                    (i // (target_width // image_size)) * image_size,
+                    ((i % (target_width // image_size)) + 1) * image_size,
+                    ((i // (target_width // image_size)) + 1) * image_size,
+                )
+                # split the image
+                split_img = resized_img.crop(box)
+                processed_images.append(split_img)
+            assert len(processed_images) == blocks
+            if use_thumbnail and len(processed_images) != 1:
+                thumbnail_img = image.resize((image_size, image_size))
+                processed_images.append(thumbnail_img)
+            return processed_images
+
+        def load_image(image, input_size=448, max_num=12):
+            transform = build_transform(input_size=input_size)
+            images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
+            pixel_values = [transform(image) for image in images]
+            pixel_values = torch.stack(pixel_values)
+            return pixel_values
+
+        if image is not None:
+            if config is None:
+                raise ValueError("Config is required.")
+            if "<image>" not in text:
+                text = "<image>\n" + text
+            pixel_values = load_image(image, input_size=config.vision_config.image_size)
+            num_patches = pixel_values.shape[0]
+            num_image_token = int(
+                (config.vision_config.image_size // config.vision_config.patch_size) ** 2
+                * (config.downsample_ratio**2)
+            )
+            image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * num_image_token * num_patches + IMG_END_TOKEN
+            text = text.replace("<image>", image_tokens, 1)
+            text_inputs = tokenizer(text, return_tensors="pt")
+            inputs = dict(text_inputs)
+            inputs.update({"pixel_values": pixel_values})
+        else:
+            inputs = tokenizer(text, return_tensors="pt")
+        return inputs
+
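
A short worked example of the image-token arithmetic above; the numbers are illustrative, the real values come from the checkpoint's vision_config and downsample_ratio.

    # 448x448 tile, 14x14 patches, downsample ratio 0.5 -> 256 IMG_CONTEXT tokens per tile.
    image_size, patch_size, downsample_ratio = 448, 14, 0.5
    num_image_token = int((image_size // patch_size) ** 2 * downsample_ratio**2)
    print(num_image_token)  # 256
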
+    # internvl has an issue with the _get_non_default_parameters check; as a workaround, override _prepare_generation_config
+    def _prepare_generation_config(
+        self, generation_config: Optional[GenerationConfig], **kwargs: Dict
+    ) -> Tuple[GenerationConfig, Dict]:
+        using_model_generation_config = False
+        if generation_config is None:
+            if (
+                self.generation_config._from_model_config  # 1)
+                and self.generation_config._original_object_hash == hash(self.generation_config)  # 2)
+            ):
+                new_generation_config = GenerationConfig.from_model_config(self.config)
+                if new_generation_config != self.generation_config:  # 4)
+                    warnings.warn(
+                        "You have modified the pretrained model configuration to control generation. This is a"
+                        " deprecated strategy to control generation and will be removed in v5."
+                        " Please use and modify the model generation configuration (see"
+                        " https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )",
+                        UserWarning,
+                    )
+                    self.generation_config = new_generation_config
+
+            generation_config = self.generation_config
+            using_model_generation_config = True
+
+        generation_config = copy.deepcopy(generation_config)
+        model_kwargs = generation_config.update(**kwargs)
+        # If `generation_config` is provided, let's fallback ALL special tokens to the default values for the model
+        if not using_model_generation_config:
+            if generation_config.bos_token_id is None:
+                generation_config.bos_token_id = self.generation_config.bos_token_id
+            if generation_config.eos_token_id is None:
+                generation_config.eos_token_id = self.generation_config.eos_token_id
+            if generation_config.pad_token_id is None:
+                generation_config.pad_token_id = self.generation_config.pad_token_id
+            if generation_config.decoder_start_token_id is None:
+                generation_config.decoder_start_token_id = self.generation_config.decoder_start_token_id
+
+        return generation_config, model_kwargs
+
+
+class _OVMiniCPMVForCausalLM(OVModelForVisualCausalLM):
+    additional_parts = ["resampler"]
+
+    def __init__(
+        self,
+        language_model: ov.Model,
+        text_embeddings: ov.Model,
+        vision_embeddings: ov.Model,
+        config: PretrainedConfig = None,
+        device: str = "CPU",
+        dynamic_shapes: bool = True,
+        ov_config: Optional[Dict[str, str]] = None,
+        model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
+        quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
+        **kwargs,
+    ):
+        super().__init__(
+            language_model,
+            text_embeddings,
+            vision_embeddings,
+            config,
+            device,
+            dynamic_shapes,
+            ov_config,
+            model_save_dir,
+            quantization_config,
+            **kwargs,
+        )
+        self.embed_dim = self.language_model.config.hidden_size
+        max_size = self.config.vision_config.image_size // self.config.vision_config.patch_size
+        self._pos_embeds = torch.from_numpy(self._get_2d_sincos_pos_embed(self.embed_dim, max_size)).float()
+        self.max_size = (max_size, max_size)
+
+    def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs):
+        if input_ids is not None and input_ids.shape[1] == 1:
+            return None
+        tgt_sizes = kwargs["tgt_sizes"]
+        pixel_values_list = pixel_values
+        vision_hidden_states = []
+        all_pixel_values = []
+        img_cnt = []
+        for pixel_value in pixel_values_list:
+            img_cnt.append(len(pixel_value))
+            all_pixel_values.extend([i.flatten(end_dim=1).permute(1, 0) for i in pixel_value])
+
+        vision_embedding = None
+        # at least one image is present
+        if all_pixel_values:
+            tgt_sizes = [tgt_size for tgt_size in tgt_sizes if isinstance(tgt_size, torch.Tensor)]
+            tgt_sizes = torch.vstack(tgt_sizes).type(torch.int32)
+
+            max_patches = torch.max(tgt_sizes[:, 0] * tgt_sizes[:, 1])
+
+            all_pixel_values = torch.nn.utils.rnn.pad_sequence(all_pixel_values, batch_first=True, padding_value=0.0)
+            B, L, _ = all_pixel_values.shape
+            all_pixel_values = all_pixel_values.permute(0, 2, 1).reshape(B, 3, -1, L)
+
+            patch_attn_mask = torch.zeros((B, 1, max_patches), dtype=torch.bool)
+            for i in range(B):
+                patch_attn_mask[i, 0, : tgt_sizes[i][0] * tgt_sizes[i][1]] = True
+            position_ids = self._prepare_vis_position_ids(
+                all_pixel_values,
+                patch_attn_mask,
+                tgt_sizes,
+                self.config.vision_config.patch_size,
+                self.config.vision_config.image_size // self.config.patch_size,
+            )
+            vision_embedding = torch.from_numpy(
+                self.vision_embeddings(
+                    pixel_values=all_pixel_values, patch_attention_mask=patch_attn_mask, position_ids=position_ids
+                )[0]
+            )
+            vision_embedding = self.resampling(vision_embedding, tgt_sizes)
+
+            start = 0
+            for pixel_value in pixel_values_list:
+                img_cnt = len(pixel_value)
+                if img_cnt > 0:
+                    vision_hidden_states.append(vision_embedding[start : start + img_cnt])
+                    start += img_cnt
+                else:
+                    vision_hidden_states.append([])
+        else:  # no image
+            dummy_feature = []
+            for _ in range(len(pixel_values_list)):
+                vision_hidden_states.append(dummy_feature)
+        return vision_hidden_states
+
+    def resampling(self, x, tgt_sizes):
+        bs = x.shape[0]
+
+        patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1]
+
+        self._adjust_pos_cache(tgt_sizes)
+
+        max_patch_len = torch.max(patch_len)
+        key_padding_mask = torch.zeros((bs, max_patch_len), dtype=torch.bool)
+
+        pos_embed = []
+        for i in range(bs):
+            tgt_h, tgt_w = tgt_sizes[i]
+            pos_embed.append(self._pos_embeds[:tgt_h, :tgt_w, :].reshape((tgt_h * tgt_w, -1)))  # patches * D
+            key_padding_mask[i, patch_len[i] :] = True
+
+        pos_embed = torch.nn.utils.rnn.pad_sequence(pos_embed, batch_first=True, padding_value=0.0).permute(
+            1, 0, 2
+        )  # BLD => L * B * D
+        res = torch.from_numpy(self.resampler(image_feature=x, pos_embed=pos_embed, key_padding_mask=key_padding_mask))
+        return res
+
+    def _set_2d_pos_cache(self, max_size):
+        pos_embed = torch.from_numpy(self._get_2d_sincos_pos_embed(self.embed_dim, max_size)).float()
+        self._pos_embeds = pos_embed
+
+    def _adjust_pos_cache(self, tgt_sizes):
+        max_h = torch.max(tgt_sizes[:, 0])
+        max_w = torch.max(tgt_sizes[:, 1])
+        if max_h > self.max_size[0] or max_w > self.max_size[1]:
+            self.max_size = [max(max_h, self.max_size[0]), max(max_w, self.max_size[1])]
+            self._set_2d_pos_cache(self.max_size)
+
+    def _get_2d_sincos_pos_embed(self, embed_dim, image_size):
+        """
+        image_size: image_size or (image_height, image_width)
+        return:
+        pos_embed: [image_height, image_width, embed_dim]
+        """
+        if isinstance(image_size, int):
+            grid_h_size, grid_w_size = image_size, image_size
+        else:
+            grid_h_size, grid_w_size = image_size[0], image_size[1]
+
+        grid_h = np.arange(grid_h_size, dtype=np.float32)
+        grid_w = np.arange(grid_w_size, dtype=np.float32)
+        grid = np.meshgrid(grid_w, grid_h)  # here w goes first
+        grid = np.stack(grid, axis=0)
+
+        pos_embed = self._get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
+        return pos_embed
+
+    def _get_2d_sincos_pos_embed_from_grid(self, embed_dim, grid):
+        assert embed_dim % 2 == 0
+
+        # use half of dimensions to encode grid_h
+        emb_h = self._get_1d_sincos_pos_embed_from_grid_new(embed_dim // 2, grid[0])  # (H, W, D/2)
+        emb_w = self._get_1d_sincos_pos_embed_from_grid_new(embed_dim // 2, grid[1])  # (H, W, D/2)
+
+        emb = np.concatenate([emb_h, emb_w], axis=-1)  # (H, W, D)
+        return emb
+
+    def _get_1d_sincos_pos_embed_from_grid_new(self, embed_dim, pos):
+        """
+        embed_dim: output dimension for each position
+        pos: a list of positions to be encoded: size (H, W)
+        out: (H, W, D)
+        """
+        assert embed_dim % 2 == 0
+        omega = np.arange(embed_dim // 2, dtype=np.float32)
+        omega /= embed_dim / 2.0
+        omega = 1.0 / 10000**omega  # (D/2,)
+
+        out = np.einsum("hw,d->hwd", pos, omega)  # (H, W, D/2), outer product
+
+        emb_sin = np.sin(out)  # (H, W, D/2)
+        emb_cos = np.cos(out)  # (H, W, D/2)
+
+        emb = np.concatenate([emb_sin, emb_cos], axis=-1)  # (H, W, D)
+        return emb
+
+    def _prepare_vis_position_ids(
+        self, pixel_values, patch_attention_mask, tgt_sizes, patch_size, num_patches_per_side
+    ):
+        batch_size = pixel_values.size(0)
+        max_im_h, max_im_w = pixel_values.size(2), pixel_values.size(3)
+        max_nb_patches_h, max_nb_patches_w = max_im_h // patch_size, max_im_w // patch_size
+        boundaries = torch.arange(1 / num_patches_per_side, 1.0, 1 / num_patches_per_side)
+        position_ids = torch.full(size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0)
+
+        for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
+            if tgt_sizes is not None:
+                nb_patches_h = tgt_sizes[batch_idx][0]
+                nb_patches_w = tgt_sizes[batch_idx][1]
+            else:
+                nb_patches_h = p_attn_mask[:, 0].sum()
+                nb_patches_w = p_attn_mask[0].sum()
+
+            fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
+            fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)
+
+            bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True)
+            bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True)
+
+            pos_ids = (bucket_coords_h[:, None] * num_patches_per_side + bucket_coords_w).flatten()
+            position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids
+
+        return position_ids
+
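+    # Insert the resampled image features into the text embedding sequence at the slots
+    # delimited by `image_bound`.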
+    def merge_vision_text_embeddings(
+        self, vision_embeds, input_embeds, input_ids, attention_mask, position_ids=None, **kwargs
+    ):
+        bs = input_ids.shape[0]
+        image_bound = kwargs["image_bound"]
+        vllm_embedding = torch.from_numpy(input_embeds)
+        for i in range(bs):
+            cur_vs_hs = vision_embeds[i]
+            if len(cur_vs_hs) > 0:
+                cur_vllm_emb = vllm_embedding[i]
+                cur_image_bound = image_bound[i]
+                if len(cur_image_bound) > 0:
+                    image_indices = torch.stack([torch.arange(r[0], r[1], dtype=torch.long) for r in cur_image_bound])
+
+                    cur_vllm_emb.scatter_(
+                        0,
+                        image_indices.view(-1, 1).repeat(1, cur_vllm_emb.shape[-1]),
+                        cur_vs_hs.view(-1, cur_vs_hs.shape[-1]),
+                    )
+        return vllm_embedding, attention_mask, position_ids
+
+    @staticmethod
+    def preprocess_inputs(
+        text: str,
+        image: Optional[Image] = None,
+        processor: Optional[AutoImageProcessor] = None,
+        tokenizer: Optional[PreTrainedTokenizer] = None,
+        config: Optional[PretrainedConfig] = None,
+    ):
+        if processor is None:
+            raise ValueError("Processor is required.")
+        if getattr(processor, "chat_template", None) is not None:
+            messages = [{"role": "user", "content": text if image is None else "(./)\n" + text}]
+            prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        else:
+            prompt = (
+                f"<|im_start|>user\n(./)\n{text}<|im_end|>\n<|im_start|>assistant\n"
+                if image is not None
+                else text
+            )
+        inputs = processor([prompt], [image], return_tensors="pt")
+        inputs.pop("image_sizes", None)
+        return inputs
+
+
+class _OVNanoLlavaForCausalLM(OVModelForVisualCausalLM):
+    def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs):
+        if input_ids is not None and input_ids.shape[1] == 1:
+            return None
+        if isinstance(pixel_values, list) or pixel_values.ndim == 5:
+            concat_images = torch.cat(pixel_values, dim=0) if isinstance(pixel_values, list) else pixel_values
+            image_features = torch.from_numpy(self.vision_embeddings(concat_images).last_hidden_state)
+            split_sizes = [image.shape[0] for image in pixel_values]
+            image_features = torch.split(image_features, split_sizes, dim=0)
+            image_features = [x.flatten(0, 1).to(self.device) for x in image_features]
+        else:
+            image_features = self.vision_embeddings(pixel_values).last_hidden_state
+
+        return image_features
+
+    def get_multimodal_embeddings(
+        self, input_ids, pixel_values=None, attention_mask=None, position_ids=None, **kwargs
+    ):
+        vision_embeds = None
+        IGNORE_INDEX = -100
+        IMAGE_TOKEN_INDEX = -200
+        if pixel_values is not None:
+            vision_embeds = self.get_vision_embeddings(pixel_values, input_ids=input_ids, **kwargs)
+        if vision_embeds is None:
+            inputs_embeds = torch.from_numpy(self.get_text_embeddings(input_ids))
+            past_len = self.language_model._get_past_length(kwargs.get("past_key_values"))
+            if attention_mask is not None and attention_mask.shape[1] < past_len + input_ids.shape[1]:
+                attention_mask = torch.cat(
+                    [
+                        attention_mask,
+                        torch.ones(attention_mask.shape[0], past_len + input_ids.shape[1] - attention_mask.shape[1]),
+                    ],
+                    dim=1,
+                )
+                position_ids = None
+            return inputs_embeds, attention_mask, position_ids
+
+        vision_embeds = torch.from_numpy(vision_embeds) if isinstance(vision_embeds, np.ndarray) else vision_embeds
+
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids, dtype=torch.long)
+        if position_ids is None:
+            position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
+        labels = torch.full_like(input_ids, IGNORE_INDEX)
+
+        # remove the padding using attention_mask -- TODO: double check
+        input_ids = [
+            cur_input_ids[cur_attention_mask]
+            for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask.bool())
+        ]
+        labels = [
+            cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask.bool())
+        ]
+
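+        # Split each prompt at image token positions and interleave the text embeddings
+        # with the corresponding image features.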
+        new_input_embeds = []
+        new_labels = []
+        cur_image_idx = 0
+        for batch_idx, cur_input_ids in enumerate(input_ids):
+            num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
+            if num_images == 0:
+                cur_image_features = vision_embeds[cur_image_idx]
+                cur_input_embeds_1 = torch.from_numpy(self.get_text_embeddings(cur_input_ids.unsqueeze(0))[0])
+                cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0)
+                new_input_embeds.append(cur_input_embeds)
+                new_labels.append(labels[batch_idx])
+                cur_image_idx += 1
+                continue
+
+            image_token_indices = (
+                [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
+            )
+            cur_input_ids_noim = []
+            cur_labels = labels[batch_idx]
+            cur_labels_noim = []
+            for i in range(len(image_token_indices) - 1):
+                cur_input_ids_noim.append(cur_input_ids[image_token_indices[i] + 1 : image_token_indices[i + 1]])
+                cur_labels_noim.append(cur_labels[image_token_indices[i] + 1 : image_token_indices[i + 1]])
+            split_sizes = [x.shape[0] for x in cur_labels_noim]
+            cur_input_embeds = torch.from_numpy(
+                self.get_text_embeddings(torch.cat(cur_input_ids_noim).unsqueeze(0))[0]
+            )
+            cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0)
+            cur_new_input_embeds = []
+            cur_new_labels = []
+
+            for i in range(num_images + 1):
+                cur_new_input_embeds.append(cur_input_embeds_no_im[i])
+                cur_new_labels.append(cur_labels_noim[i])
+                if i < num_images:
+                    cur_image_features = vision_embeds[cur_image_idx]
+                    cur_image_idx += 1
+                    cur_new_input_embeds.append(cur_image_features)
+                    cur_new_labels.append(
+                        torch.full(
+                            (cur_image_features.shape[0],),
+                            IGNORE_INDEX,
+                            device=cur_labels.device,
+                            dtype=cur_labels.dtype,
+                        )
+                    )
+
+            cur_new_input_embeds = torch.cat(cur_new_input_embeds)
+            cur_new_labels = torch.cat(cur_new_labels)
+
+            new_input_embeds.append(cur_new_input_embeds)
+            new_labels.append(cur_new_labels)
+
+        # Truncate sequences to max length as image embeddings can make the sequence longer
+        tokenizer_model_max_length = getattr(self.config, "tokenizer_model_max_length", None)
+        if tokenizer_model_max_length is not None:
+            new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
+            new_labels = [x[:tokenizer_model_max_length] for x in new_labels]
+
+        # Combine them
+        max_len = max(x.shape[0] for x in new_input_embeds)
+        batch_size = len(new_input_embeds)
+
+        new_input_embeds_padded = []
+        new_labels_padded = torch.full(
+            (batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device
+        )
+        attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device)
+        position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)
+
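+        # Pad every merged sequence to max_len on the side configured by the tokenizer,
+        # rebuilding the attention mask and position ids accordingly.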
+        for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
+            cur_len = cur_new_embed.shape[0]
+            if getattr(self.config, "tokenizer_padding_side", "right") == "left":
+                new_input_embeds_padded.append(
+                    torch.cat(
+                        (
+                            torch.zeros(
+                                (max_len - cur_len, cur_new_embed.shape[1]),
+                                dtype=cur_new_embed.dtype,
+                                device=cur_new_embed.device,
+                            ),
+                            cur_new_embed,
+                        ),
+                        dim=0,
+                    )
+                )
+                if cur_len > 0:
+                    new_labels_padded[i, -cur_len:] = cur_new_labels
+                    attention_mask[i, -cur_len:] = True
+                    position_ids[i, -cur_len:] = torch.arange(
+                        0, cur_len, dtype=position_ids.dtype, device=position_ids.device
+                    )
+            else:
+                new_input_embeds_padded.append(
+                    torch.cat(
+                        (
+                            cur_new_embed,
+                            torch.zeros(
+                                (max_len - cur_len, cur_new_embed.shape[1]),
+                                dtype=cur_new_embed.dtype,
+                                device=cur_new_embed.device,
+                            ),
+                        ),
+                        dim=0,
+                    )
+                )
+                if cur_len > 0:
+                    new_labels_padded[i, :cur_len] = cur_new_labels
+                    attention_mask[i, :cur_len] = True
+                    position_ids[i, :cur_len] = torch.arange(
+                        0, cur_len, dtype=position_ids.dtype, device=position_ids.device
+                    )
+
+        new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
+
+        return new_input_embeds, attention_mask, position_ids
+
+    @staticmethod
+    def preprocess_inputs(
+        text: str,
+        image: Optional[Image] = None,
+        processor: Optional[AutoImageProcessor] = None,
+        tokenizer: Optional[PreTrainedTokenizer] = None,
+        config: Optional[PretrainedConfig] = None,
+    ):
+        if tokenizer is None:
+            raise ValueError("Tokenizer is required.")
+        if image is not None and processor is None:
+            raise ValueError("Processor is required.")
+        text = f"\n{text}" if image is not None else text
+        messages = [{"role": "user", "content": text}]
+        if tokenizer.chat_template is not None:
+            text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        if image is not None:
+            text_chunks = [tokenizer(chunk).input_ids for chunk in text.split("<image>")]
+            input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
+        else:
+            input_ids = tokenizer(text, return_tensors="pt").input_ids
+        attention_mask = torch.ones_like(input_ids, dtype=torch.int64)
+        result = {"input_ids": input_ids, "attention_mask": attention_mask}
+        if image is not None:
+            result["pixel_values"] = processor(images=[image], return_tensors="pt")["pixel_values"]
+        return result
+
+
+class _OVPhi3VisionForCausalLM(OVModelForVisualCausalLM):
+    additional_parts = ["vision_projection"]
+
+    def __init__(
+        self,
+        language_model: ov.Model,
+        text_embeddings: ov.Model,
+        vision_embeddings: ov.Model,
+        config: PretrainedConfig = None,
+        device: str = "CPU",
+        dynamic_shapes: bool = True,
+        ov_config: Optional[Dict[str, str]] = None,
+        model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
+        quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
+        **kwargs,
+    ):
+        super().__init__(
+            language_model,
+            text_embeddings,
+            vision_embeddings,
+            config,
+            device,
+            dynamic_shapes,
+            ov_config,
+            model_save_dir,
+            quantization_config,
+            **kwargs,
+        )
+        self.sub_GN = torch.tensor(self.config.sub_GN)
+        self.glb_GN = torch.tensor(self.config.glb_GN)
+
+    def get_vision_embeddings(self, pixel_values, image_sizes, **kwargs):
+        num_images, num_crops, c, h, w = pixel_values.shape
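+        # encode all crops of all images in a single batch, then restore the per-image,
+        # per-crop grouping expected by the HD feature transform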
+        img_features = self.vision_embeddings(pixel_values.flatten(0, 1)).last_hidden_state.reshape(
+            num_images, num_crops, -1, self.config.img_processor["image_dim_out"]
+        )
+        image_features_proj = self.hd_feature_transform(img_features, image_sizes)
+        return image_features_proj
+
+    def hd_feature_transform(self, image_features, image_sizes):
+        """
+        image_features: (num_images, num_crops+1, 24*24, 1024)
+        """
+
+        image_features = torch.from_numpy(image_features)
+        global_image_features = image_features[:, 0]  # (num_images, 24*24, 1024)
+        # global feature can be viewed as a special HD case with num_crops 1x1
+        global_image_features_hd = self.reshape_hd_patches_2x2merge(global_image_features, 1, 1)
+        global_image_features_hd_newline = self.add_image_newline(global_image_features_hd)
+
+        all_image_embeddings = []
+        # need a for loop to process each image because of different image sizes
+        # (patch arrangement is different for each image)
+        for i, img_size in enumerate(image_sizes):
+            h, w = img_size
+            h_crop = h // 336
+            w_crop = w // 336
+            num_crops = h_crop * w_crop
+
+            # NOTE: real num_crops is padded
+            # (num_crops, 24*24, 1024)
+            sub_image_features = image_features[i, 1 : 1 + num_crops]
+            sub_image_features_hd = self.reshape_hd_patches_2x2merge(sub_image_features, h_crop, w_crop)
+            sub_image_features_hd_newline = self.add_image_newline(sub_image_features_hd)
+
+            # [sub features, separator, global features]
+            all_image_embeddings.extend(
+                [
+                    sub_image_features_hd_newline.squeeze(0),  # (h_crop*12*(w_crop*12+1), 4096)
+                    self.glb_GN.squeeze(0),
+                    global_image_features_hd_newline[i],
+                ]
+            )
+        image_features_proj = self.vision_projection(torch.cat(all_image_embeddings, dim=0).unsqueeze(0))[0]
+
+        return image_features_proj
+
+    def reshape_hd_patches_2x2merge(self, image_features, h_crop, w_crop):
+        """
+        image_features: (num_images*num_crops, 24*24, 1024)
+        output: (num_images, h_crop*12, w_crop*12, 4096), h_crop*w_crop == num_crops
+        """
+        N, L, C = image_features.shape
+        assert L == 24 * 24 and C == 1024 and N % (h_crop * w_crop) == 0
+        num_images = N // (h_crop * w_crop)
+        H = int(L**0.5)
+        image_features_hd = (
+            image_features.reshape(N, H, H, C)  # N, 24, 24, 1024
+            .reshape(N, H // 2, 2, H // 2, 2, C)  # N, 12, 2, 12, 2, 1024
+            .permute(0, 1, 3, 2, 4, 5)  # N, 12, 12, 2, 2, 1024
+            .reshape(N, -1, 4 * C)  # N, 144, 4096
+            .reshape(num_images, h_crop, w_crop, H // 2, H // 2, -1)  # n_img, h_crop, w_crop, 12, 12, 4096
+            .permute(0, 1, 3, 2, 4, 5)  # n_img, h_crop, 12, w_crop, 12, 4096
+            .reshape(num_images, h_crop * H // 2, w_crop * H // 2, 4 * C)  # n_img, h_crop*12, w_crop*12, 4096
+        )
+
+        return image_features_hd
+
+    def add_image_newline(self, image_features_hd):
+        """
+        image_features_hd: (num_images, h_crop*12, w_crop*12, 4096)
+        output: (num_images, (h_crop*12) * (w_crop*12+1), 4096)
+        """
+        num_images, h, w, hid_dim = image_features_hd.shape
+        # add the newline token to the HD image feature patches
+        newline_embeddings = self.sub_GN.expand(num_images, h, -1, -1)  # (n_img, h, 1, hid_dim)
+        image_features_hd_newline = torch.cat([image_features_hd, newline_embeddings], dim=2).reshape(
+            num_images, -1, hid_dim
+        )
+        return image_features_hd_newline
+
+    def get_multimodal_embeddings(
+        self, input_ids, pixel_values=None, attention_mask=None, position_ids=None, image_sizes=None, **kwargs
+    ):
+        MAX_INPUT_ID = int(1e9)
+        input_shape = input_ids.size()
+        input_ids = input_ids.view(-1, input_shape[-1])
+
+        # positions for image tokens
+        positions = torch.nonzero((input_ids < 0) & (input_ids > -MAX_INPUT_ID), as_tuple=True)
+        has_image = len(positions[0].tolist()) > 0
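+        # image placeholder tokens are stored as negative ids; clamp them back into the
+        # vocabulary range before the text embedding lookup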
+        input_ids = input_ids.clamp_min(0).clamp_max(self.config.vocab_size)
+        inputs_embeds = torch.from_numpy(self.get_text_embeddings(input_ids, **kwargs))
+        if has_image:
+            vision_embeds = self.get_vision_embeddings(
+                pixel_values, input_ids=input_ids, image_sizes=image_sizes, **kwargs
+            )
+            image_features_proj = torch.from_numpy(vision_embeds)
+            inputs_embeds = inputs_embeds.index_put(positions, image_features_proj, accumulate=False)
+
+        return inputs_embeds, attention_mask, position_ids
+
+    @staticmethod
+    def preprocess_inputs(
+        text: str,
+        image: Optional[Image] = None,
+        processor: Optional[AutoImageProcessor] = None,
+        tokenizer: Optional[PreTrainedTokenizer] = None,
+        config: Optional[PretrainedConfig] = None,
+    ):
+        if processor is None:
+            raise ValueError("Processor is required.")
+        if image is not None and "<|image_1|>" not in text:
+            text = "<|image_1|>\n" + text
+        if getattr(processor.tokenizer, "chat_template", None) is not None:
+            chat_prompt = [{"role": "user", "content": text}]
+            text = processor.tokenizer.apply_chat_template(chat_prompt, add_generation_prompt=True, tokenize=False)
+        inputs = processor(images=image, text=text, return_tensors="pt")
+        return inputs
+
 
 MODEL_TYPE_TO_CLS_MAPPING = {
     "llava": _OVLlavaForCausalLM,
     "llava_next": _OVLlavaNextForCausalLM,
-    "internvl_chat": _OvInternVLForCausalLM,
+    "minicpmv": _OVMiniCPMVForCausalLM,
+    "llava-qwen2": _OVNanoLlavaForCausalLM,
+    "phi3_v": _OVPhi3VisionForCausalLM,
+    "internvl_chat": _OVInternVLForCausalLM,
 }
diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index 6395880e44..1b36c98b47 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -19,11 +19,14 @@
 import os
 import warnings
 from collections import deque
+from itertools import islice
 from pathlib import Path
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 
+import datasets
 import nncf
 import openvino
+import requests
 import torch
 import transformers
 from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
@@ -33,9 +36,11 @@
 from nncf.torch.initialization import PTInitializingDataLoader
 from openvino._offline_transformations import compress_quantize_weights_transformation
 from openvino.runtime import Core, Tensor
+from PIL import Image
 from torch.utils._pytree import tree_map
 from torch.utils.data import DataLoader, RandomSampler
-from transformers import AutoTokenizer, DataCollator, PreTrainedModel, default_data_collator
+from tqdm import tqdm
+from transformers import AutoProcessor, AutoTokenizer, DataCollator, PreTrainedModel, default_data_collator
 from transformers.pytorch_utils import Conv1D
 from transformers.utils import is_accelerate_available
 
@@ -62,6 +67,7 @@
     ONNX_WEIGHTS_NAME,
     OV_XML_FILE_NAME,
     PREDEFINED_SD_DATASETS,
+    PREDEFINED_VISUAL_LM_DATASETS,
 )
 
 
@@ -313,8 +319,10 @@ def _quantize_ovbasemodel(
         remove_unused_columns: bool = True,
         **kwargs,
     ):
+        from optimum.intel.openvino.modeling_visual_language import OVModelForVisualCausalLM
+
         if is_diffusers_available():
-            from optimum.intel.openvino.modeling_diffusion import OVStableDiffusionPipelineBase
+            from optimum.intel.openvino.modeling_diffusion import OVDiffusionPipeline
 
         if save_directory is not None:
             save_directory = Path(save_directory)
@@ -324,7 +332,7 @@ def _quantize_ovbasemodel(
         if calibration_dataset is not None:
             # Process custom calibration dataset
 
-            if is_diffusers_available() and isinstance(self.model, OVStableDiffusionPipelineBase):
+            if is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline):
                 calibration_dataset = self._prepare_unet_dataset(
                     quantization_config.num_samples, dataset=calibration_dataset
                 )
@@ -361,7 +369,9 @@ def _quantize_ovbasemodel(
 
                 if isinstance(self.model, OVModelForCausalLM):
                     calibration_dataset = self._prepare_causal_lm_dataset(quantization_config)
-                elif is_diffusers_available() and isinstance(self.model, OVStableDiffusionPipelineBase):
+                elif isinstance(self.model, OVModelForVisualCausalLM):
+                    calibration_dataset = self._prepare_visual_causal_lm_dataset(quantization_config)
+                elif is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline):
                     if not isinstance(quantization_config.dataset, str):
                         raise ValueError("Please provide dataset as one of the accepted dataset labels.")
                     calibration_dataset = self._prepare_unet_dataset(
@@ -375,32 +385,60 @@ def _quantize_ovbasemodel(
             if quantization_config.quant_method == OVQuantizationMethod.HYBRID:
                 if calibration_dataset is None:
                     raise ValueError("Calibration dataset is required to run hybrid quantization.")
-                if is_diffusers_available() and isinstance(self.model, OVStableDiffusionPipelineBase):
+                if is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline):
                     # Apply weight-only quantization to all SD submodels except UNet
                     quantization_config_copy = copy.deepcopy(quantization_config)
                     quantization_config_copy.dataset = None
                     quantization_config_copy.quant_method = OVQuantizationMethod.DEFAULT
-                    sub_model_names = ["vae_encoder", "vae_decoder", "text_encoder", "text_encoder_2"]
+                    sub_model_names = [
+                        "vae_encoder",
+                        "vae_decoder",
+                        "text_encoder",
+                        "text_encoder_2",
+                        "text_encoder_3",
+                    ]
                     sub_models = filter(lambda x: x, (getattr(self.model, name) for name in sub_model_names))
                     for sub_model in sub_models:
                         _weight_only_quantization(sub_model.model, quantization_config_copy)
 
-                    # Apply hybrid quantization to UNet
-                    self.model.unet.model = _hybrid_quantization(
-                        self.model.unet.model, quantization_config, calibration_dataset
-                    )
+                    if self.model.unet is not None:
+                        # Apply hybrid quantization to UNet
+                        self.model.unet.model = _hybrid_quantization(
+                            self.model.unet.model, quantization_config, calibration_dataset
+                        )
+                    else:
+                        self.model.transformer.model = _hybrid_quantization(
+                            self.model.transformer.model, quantization_config, calibration_dataset
+                        )
+
                     self.model.clear_requests()
                 else:
                     # The model may be for example OVModelForImageClassification, OVModelForAudioClassification, etc.
                     self.model.model = _hybrid_quantization(self.model.model, quantization_config, calibration_dataset)
                     self.model.request = None
             else:
-                if is_diffusers_available() and isinstance(self.model, OVStableDiffusionPipelineBase):
-                    sub_model_names = ["vae_encoder", "vae_decoder", "text_encoder", "text_encoder_2", "unet"]
+                if is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline):
+                    sub_model_names = [
+                        "vae_encoder",
+                        "vae_decoder",
+                        "text_encoder",
+                        "text_encoder_2",
+                        "unet",
+                        "transformer",
+                        "text_encoder_3",
+                    ]
                     sub_models = filter(lambda x: x, (getattr(self.model, name) for name in sub_model_names))
                     for sub_model in sub_models:
                         _weight_only_quantization(sub_model.model, quantization_config)
                     self.model.clear_requests()
+                elif isinstance(self.model, OVModelForVisualCausalLM):
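+                    # Apply the requested (possibly data-aware) compression to the language model
+                    # and a lighter 8-bit symmetric weight compression to the remaining submodels.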
+                    language_model = self.model.language_model
+                    _weight_only_quantization(language_model.model, quantization_config, calibration_dataset)
+                    sub_model_names = ["vision_embeddings", "text_embeddings"] + self.model.additional_parts
+                    sub_models = [getattr(self.model, f"{name}_model") for name in sub_model_names]
+                    for sub_model in sub_models:
+                        _weight_only_quantization(sub_model, OVWeightQuantizationConfig(bits=8, sym=True))
+                    self.model.clear_requests()
                 else:
                     _weight_only_quantization(self.model.model, quantization_config, calibration_dataset)
                     self.model.request = None
@@ -713,6 +751,67 @@ def _prepare_causal_lm_dataset(self, quantization_config: OVWeightQuantizationCo
 
         return calibration_dataset
 
+    def _prepare_visual_causal_lm_dataset(self, config: OVWeightQuantizationConfig):
+        dataset_name = config.dataset
+        if dataset_name not in PREDEFINED_VISUAL_LM_DATASETS:
+            raise ValueError(
+                "You have entered a string value for dataset. You can only choose between"
+                f"{list(PREDEFINED_VISUAL_LM_DATASETS.keys())}, but the {dataset_name} was found"
+            )
+        if config.processor is None:
+            raise ValueError(
+                "`processor` must be specified in order to run data-aware weight compression. "
+                "Please provide it as a model id, or a path to a directory containing all the required "
+                "configuration files."
+            )
+
+        processor = AutoProcessor.from_pretrained(config.processor, trust_remote_code=config.trust_remote_code)
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(config.tokenizer, trust_remote_code=config.trust_remote_code)
+            tokenizer_error = None
+        except Exception as e:
+            tokenizer = None
+            tokenizer_error = e
+
+        dataset_metadata = PREDEFINED_VISUAL_LM_DATASETS[dataset_name]
+        dataset = datasets.load_dataset(dataset_metadata["name"], split=dataset_metadata["split"]).shuffle(seed=0)
+        num_samples = min(config.num_samples or 128, len(dataset))
+        dataset = islice(dataset, num_samples)
+
+        calibration_dataset = []
+        for item in tqdm(dataset, desc="Collecting calibration dataset", total=num_samples):
+            instruction = item[dataset_metadata["inputs"]["instruction"]]
+            image_url = item[dataset_metadata["inputs"]["image_url"]]
+            image = Image.open(requests.get(image_url, stream=True).raw)
+
+            try:
+                inputs = self.model.preprocess_inputs(
+                    text=instruction, image=image, processor=processor, tokenizer=tokenizer, config=self.model.config
+                )
+            except ValueError as value_error:
+                if "Tokenizer is required." in str(value_error) and tokenizer_error is not None:
+                    raise tokenizer_error
+                raise value_error
+
+            input_ids = inputs.get("input_ids")
+            position_ids = torch.arange(input_ids.size(1)).unsqueeze(0).to(input_ids.device)
+
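+            # Run the multimodal front-end once so that each calibration sample matches the
+            # inputs the language model receives during generation.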
+            inputs_embeds, attention_mask, position_ids = self.model.get_multimodal_embeddings(
+                **inputs,
+                position_ids=position_ids,
+            )
+
+            language_model_inputs = self.model.language_model.prepare_inputs(
+                input_ids=None,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                inputs_embeds=inputs_embeds,
+            )
+
+            calibration_dataset.append(language_model_inputs)
+
+        calibration_dataset = nncf.Dataset(calibration_dataset)
+        return calibration_dataset
+
     def _prepare_text_generation_dataset(
         self, quantization_config: OVQuantizationConfig, calibration_dataloader: OVDataLoader
     ) -> nncf.Dataset:
@@ -743,7 +842,9 @@ def _prepare_unet_dataset(
     ) -> nncf.Dataset:
         self.model.compile()
 
-        size = self.model.unet.config.get("sample_size", 64) * self.model.vae_scale_factor
+        diffuser = self.model.unet if self.model.unet is not None else self.model.transformer
+
+        size = diffuser.config.get("sample_size", 64) * self.model.vae_scale_factor
         height, width = 2 * (min(size, 512),)
         num_samples = num_samples or 200
 
@@ -784,7 +885,7 @@ def transform_fn(data_item):
 
         calibration_data = []
         try:
-            self.model.unet.request = InferRequestWrapper(self.model.unet.request, calibration_data)
+            diffuser.request = InferRequestWrapper(diffuser.request, calibration_data)
 
             for inputs in dataset:
                 inputs = transform_fn(inputs)
@@ -795,7 +896,7 @@ def transform_fn(data_item):
                 if len(calibration_data) >= num_samples:
                     break
         finally:
-            self.model.unet.request = self.model.unet.request.request
+            diffuser.request = diffuser.request.request
 
         calibration_dataset = nncf.Dataset(calibration_data[:num_samples])
         return calibration_dataset
@@ -829,6 +930,8 @@ def _weight_only_quantization(
 
     if config.weight_format == "mxfp4":
         mode = CompressWeightsMode.E2M1
+    elif config.weight_format == "nf4":
+        mode = CompressWeightsMode.NF4
     else:
         if config.bits == 8:
             mode = CompressWeightsMode.INT8_SYM if config.sym else CompressWeightsMode.INT8_ASYM
diff --git a/optimum/intel/openvino/trainer.py b/optimum/intel/openvino/trainer.py
index a2f08b647f..0edb3a7307 100644
--- a/optimum/intel/openvino/trainer.py
+++ b/optimum/intel/openvino/trainer.py
@@ -148,7 +148,12 @@
         "range": {"num_init_samples": 300, "type": "mean_min_max"},
         "batchnorm_adaptation": {"num_bn_adaptation_samples": 0},
     },
-    "scope_overrides": {"activations": {"{re}.*matmul_0": {"mode": "symmetric"}}},
+    "scope_overrides": {
+        "activations": {
+            "{re}.*matmul_0": {"mode": "symmetric"},
+            "{re}.*scaled_dot_product_attention_0": {"mode": "symmetric"},
+        }
+    },
     "ignored_scopes": [
         "{re}.*Embedding.*",
         "{re}.*add___.*",
@@ -216,6 +221,11 @@ def __init__(
         logger.warning("OVTrainer is deprecated and will be removed in optimum-intel v1.22.0.")
 
         if is_transformers_version(">=", "4.45.0"):
+            if is_transformers_version(">=", "4.46.0"):
+                raise ImportError(
+                    f"Unsupported transformers version found is {_transformers_version} which is not supported by the OVTrainer. Please downgrade to v4.44"
+                )
+
             logger.warning(
                 f"The transformers version found is {_transformers_version} which is not officially supported by the OVTrainer, use at your own risk"
             )
diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py
index 3426abd5f1..cf5060f420 100644
--- a/optimum/intel/openvino/utils.py
+++ b/optimum/intel/openvino/utils.py
@@ -16,8 +16,12 @@
 import json
 import logging
 import os
+import stat
+import warnings
+import weakref
 from glob import glob
 from pathlib import Path
+from tempfile import TemporaryDirectory as OrigTemporaryDirectory
 from typing import Tuple, Type, Union
 
 import numpy as np
@@ -38,6 +42,9 @@
 OV_ENCODER_NAME = "openvino_encoder_model.xml"
 OV_DECODER_NAME = "openvino_decoder_model.xml"
 OV_DECODER_WITH_PAST_NAME = "openvino_decoder_with_past_model.xml"
+OV_TEXT_EMBEDDINGS_MODEL_NAME = "openvino_text_embeddings_model.xml"
+OV_LANGUAGE_MODEL_NAME = "openvino_language_model.xml"
+OV_VISION_EMBEDDINGS_MODEL_NAME = "openvino_vision_embeddings_model.xml"
 
 OV_TOKENIZER_NAME = "openvino_tokenizer{}.xml"
 OV_DETOKENIZER_NAME = "openvino_detokenizer{}.xml"
@@ -53,9 +60,7 @@
 
 EXTERNAL_DATA_FORMAT_SIZE_LIMIT = 2 * 1024 * 1024 * 1024
 
-TEXTUAL_INVERSION_NAME = "learned_embeds.bin"
-TEXTUAL_INVERSION_NAME_SAFE = "learned_embeds.safetensors"
-TEXTUAL_INVERSION_EMBEDDING_KEY = "text_model.embeddings.token_embedding.weight"
+TEXTUAL_INVERSION_EMBEDDING_KEY = "self.text_model.embeddings.token_embedding.weight"
 
 OV_TO_NP_TYPE = {
     "boolean": np.bool_,
@@ -114,9 +119,12 @@
     "token-classification": "OVModelForTokenClassification",
     "question-answering": "OVModelForQuestionAnswering",
     "image-classification": "OVModelForImageClassification",
+    "image-text-to-text": "OVModelForVisualCausalLM",
     "audio-classification": "OVModelForAudioClassification",
     "stable-diffusion": "OVStableDiffusionPipeline",
     "stable-diffusion-xl": "OVStableDiffusionXLPipeline",
+    "stable-diffusion-3": "OVStableDiffusion3Pipeline",
+    "flux": "OVFluxPipeline",
     "pix2struct": "OVModelForPix2Struct",
     "latent-consistency": "OVLatentConsistencyModelPipeline",
     "open_clip_text": "OVModelOpenCLIPText",
@@ -131,6 +139,14 @@
     "laion/filtered-wit": {"split": "train", "inputs": {"prompt": "caption"}},
 }
 
+PREDEFINED_VISUAL_LM_DATASETS = {
+    "contextual": {
+        "name": "ucla-contextual/contextual_test",
+        "split": "test",
+        "inputs": {"image_url": "image_url", "instruction": "instruction"},
+    }
+}
+
 
 NEED_CONVERT_TO_FAST_TOKENIZER: Tuple[Type[PreTrainedTokenizer]] = (CLIPTokenizer,)
 
@@ -207,6 +223,40 @@ def _print_compiled_model_properties(compiled_model):
         logger.error("[error] Get FULL_DEVICE_NAME failed")
 
 
+def np_to_pt_generators(np_object, device):
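+    # map numpy RandomState/Generator objects (or lists/dicts of them) to torch.Generator
+    # instances seeded from the numpy state; any other object is returned unchanged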
+    if isinstance(np_object, np.random.RandomState):
+        return torch.Generator(device=device).manual_seed(int(np_object.get_state()[1][0]))
+    elif isinstance(np_object, np.random.Generator):
+        return torch.Generator(device=device).manual_seed(int(np_object.bit_generator.state[1][0]))
+    elif isinstance(np_object, list) and isinstance(np_object[0], (np.random.RandomState, np.random.Generator)):
+        return [np_to_pt_generators(a, device) for a in np_object]
+    elif isinstance(np_object, dict) and isinstance(
+        next(iter(np_object.values())), (np.random.RandomState, np.random.Generator)
+    ):
+        return {k: np_to_pt_generators(v, device) for k, v in np_object.items()}
+    else:
+        return np_object
+
+
+def _raise_invalid_batch_size(
+    expected_batch_size: int, batch_size: int, num_images_per_prompt: int, guidance_scale: float
+):
+    current_batch_size = batch_size * num_images_per_prompt * (1 if guidance_scale <= 1 else 2)
+
+    if expected_batch_size != current_batch_size:
+        msg = ""
+        if guidance_scale is not None and guidance_scale <= 1:
+            msg = f"`guidance_scale` was set to {guidance_scale}, static shapes are currently only supported for `guidance_scale` > 1 "
+
+        raise ValueError(
+            "The model was statically reshaped and the pipeline inputs do not match the expected shapes. "
+            f"The `batch_size`, `num_images_per_prompt` and `guidance_scale` were respectively set to {batch_size}, {num_images_per_prompt} and {guidance_scale}. "
+            f"The static model expects an input of size equal to {expected_batch_size} and got the following value instead : {current_batch_size}. "
+            f"To fix this, please either provide a different inputs to your model so that `batch_size` * `num_images_per_prompt` * 2 is equal to {expected_batch_size} "
+            "or reshape it again accordingly using the `.reshape()` method by setting `batch_size` to -1. " + msg
+        )
+
+
 def get_export_transformers_version(model, config):
     version_str = None
 
@@ -228,3 +278,275 @@ def model_has_dynamic_inputs(model):
         if is_dynamic:
             return is_dynamic
     return is_dynamic
+
+
+# adapted from https://github.com/python/cpython/blob/3.12/Lib/shutil.py for compatibility with python<3.10
+def _rmtree(path, ignore_errors=False, onerror=None, *, onexc=None, dir_fd=None):
+    """Recursively delete a directory tree.
+
+    If dir_fd is not None, it should be a file descriptor open to a directory;
+    path will then be relative to that directory.
+    dir_fd may not be implemented on your platform.
+    If it is unavailable, using it will raise a NotImplementedError.
+
+    If ignore_errors is set, errors are ignored; otherwise, if onexc or
+    onerror is set, it is called to handle the error with arguments (func,
+    path, exc_info) where func is platform and implementation dependent;
+    path is the argument to that function that caused it to fail; and
+    the value of exc_info describes the exception. For onexc it is the
+    exception instance, and for onerror it is a tuple as returned by
+    sys.exc_info().  If ignore_errors is false and both onexc and
+    onerror are None, the exception is reraised.
+
+    onerror is deprecated and only remains for backwards compatibility.
+    If both onerror and onexc are set, onerror is ignored and onexc is used.
+    """
+    _use_fd_functions = (
+        {os.open, os.stat, os.unlink, os.rmdir} <= os.supports_dir_fd
+        and os.scandir in os.supports_fd
+        and os.stat in os.supports_follow_symlinks
+    )
+
+    if hasattr(os.stat_result, "st_file_attributes"):
+
+        def _rmtree_islink(path):
+            try:
+                st = os.lstat(path)
+                return stat.S_ISLNK(st.st_mode) or (
+                    st.st_file_attributes & stat.FILE_ATTRIBUTE_REPARSE_POINT
+                    and st.st_reparse_tag == stat.IO_REPARSE_TAG_MOUNT_POINT
+                )
+            except OSError:
+                return False
+
+    else:
+
+        def _rmtree_islink(path):
+            return os.path.islink(path)
+
+    def _rmtree_safe_fd(stack, onexc):
+        # Each stack item has four elements:
+        # * func: The first operation to perform: os.lstat, os.close or os.rmdir.
+        #   Walking a directory starts with an os.lstat() to detect symlinks; in
+        #   this case, func is updated before subsequent operations and passed to
+        #   onexc() if an error occurs.
+        # * dirfd: Open file descriptor, or None if we're processing the top-level
+        #   directory given to rmtree() and the user didn't supply dir_fd.
+        # * path: Path of file to operate upon. This is passed to onexc() if an
+        #   error occurs.
+        # * orig_entry: os.DirEntry, or None if we're processing the top-level
+        #   directory given to rmtree(). We used the cached stat() of the entry to
+        #   save a call to os.lstat() when walking subdirectories.
+        func, dirfd, path, orig_entry = stack.pop()
+        name = path if orig_entry is None else orig_entry.name
+        try:
+            if func is os.close:
+                os.close(dirfd)
+                return
+            if func is os.rmdir:
+                os.rmdir(name, dir_fd=dirfd)
+                return
+
+            # Note: To guard against symlink races, we use the standard
+            # lstat()/open()/fstat() trick.
+            assert func is os.lstat
+            if orig_entry is None:
+                orig_st = os.lstat(name, dir_fd=dirfd)
+            else:
+                orig_st = orig_entry.stat(follow_symlinks=False)
+
+            func = os.open  # For error reporting.
+            topfd = os.open(name, os.O_RDONLY | os.O_NONBLOCK, dir_fd=dirfd)
+
+            func = os.path.islink  # For error reporting.
+            try:
+                if not os.path.samestat(orig_st, os.fstat(topfd)):
+                    # Symlinks to directories are forbidden, see GH-46010.
+                    raise OSError("Cannot call rmtree on a symbolic link")
+                stack.append((os.rmdir, dirfd, path, orig_entry))
+            finally:
+                stack.append((os.close, topfd, path, orig_entry))
+
+            func = os.scandir  # For error reporting.
+            with os.scandir(topfd) as scandir_it:
+                entries = list(scandir_it)
+            for entry in entries:
+                fullname = os.path.join(path, entry.name)
+                try:
+                    if entry.is_dir(follow_symlinks=False):
+                        # Traverse into sub-directory.
+                        stack.append((os.lstat, topfd, fullname, entry))
+                        continue
+                except OSError:
+                    pass
+                try:
+                    os.unlink(entry.name, dir_fd=topfd)
+                except OSError as err:
+                    onexc(os.unlink, fullname, err)
+        except OSError as err:
+            err.filename = path
+            onexc(func, path, err)
+
+    def _rmtree_unsafe(path, onexc):
+        def onerror(err):
+            onexc(os.scandir, err.filename, err)
+
+        results = os.walk(path, topdown=False, onerror=onerror, followlinks=hasattr(os, "_walk_symlinks_as_files"))
+        for dirpath, dirnames, filenames in results:
+            for name in dirnames:
+                fullname = os.path.join(dirpath, name)
+                try:
+                    os.rmdir(fullname)
+                except OSError as err:
+                    onexc(os.rmdir, fullname, err)
+            for name in filenames:
+                fullname = os.path.join(dirpath, name)
+                try:
+                    os.unlink(fullname)
+                except OSError as err:
+                    onexc(os.unlink, fullname, err)
+        try:
+            os.rmdir(path)
+        except OSError as err:
+            onexc(os.rmdir, path, err)
+
+    if ignore_errors:
+
+        def onexc(*args):
+            pass
+
+    elif onerror is None and onexc is None:
+
+        def onexc(*args):
+            raise
+
+    elif onexc is None:
+        if onerror is None:
+
+            def onexc(*args):
+                raise
+
+        else:
+            # delegate to onerror
+            def onexc(*args):
+                func, path, exc = args
+                if exc is None:
+                    exc_info = None, None, None
+                else:
+                    exc_info = type(exc), exc, exc.__traceback__
+                return onerror(func, path, exc_info)
+
+    if _use_fd_functions:
+        # While the unsafe rmtree works fine on bytes, the fd based does not.
+        if isinstance(path, bytes):
+            path = os.fsdecode(path)
+        stack = [(os.lstat, dir_fd, path, None)]
+        try:
+            while stack:
+                _rmtree_safe_fd(stack, onexc)
+        finally:
+            # Close any file descriptors still on the stack.
+            while stack:
+                func, fd, path, entry = stack.pop()
+                if func is not os.close:
+                    continue
+                try:
+                    os.close(fd)
+                except OSError as err:
+                    onexc(os.close, path, err)
+    else:
+        if dir_fd is not None:
+            raise NotImplementedError("dir_fd unavailable on this platform")
+        try:
+            if _rmtree_islink(path):
+                # symlinks to directories are forbidden, see bug #1669
+                raise OSError("Cannot call rmtree on a symbolic link")
+        except OSError as err:
+            onexc(os.path.islink, path, err)
+            # can't continue even if onexc hook returns
+            return
+        return _rmtree_unsafe(path, onexc)
+
+
+# copied from https://github.com/python/cpython/blob/3.12/Lib/tempfile.py
+# to backport behaviour that is only available on Python 3.10+ to older Python versions
+class TemporaryDirectory(OrigTemporaryDirectory):
+    def __init__(self, suffix=None, prefix=None, dir=None, ignore_cleanup_errors=True, *, delete=True):
+        super().__init__(suffix=suffix, prefix=prefix, dir=dir)
+        self._ignore_cleanup_errors = ignore_cleanup_errors
+        self._delete = delete
+        self._finalizer = weakref.finalize(
+            self,
+            self._cleanup,
+            self.name,
+            warn_message="Implicitly cleaning up {!r}".format(self),
+            ignore_errors=self._ignore_cleanup_errors,
+            delete=self._delete,
+        )
+
+    @classmethod
+    def _cleanup(cls, name, warn_message, ignore_errors=False, delete=True):
+        if delete:
+            cls._rmtree(name, ignore_errors=ignore_errors)
+            warnings.warn(warn_message, ResourceWarning)
+
+    @classmethod
+    def _rmtree(cls, name, ignore_errors=False, repeated=False):
+        def _dont_follow_symlinks(func, path, *args):
+            # Pass follow_symlinks=False, unless not supported on this platform.
+            if func in os.supports_follow_symlinks:
+                func(path, *args, follow_symlinks=False)
+            elif os.name == "nt" or not os.path.islink(path):
+                func(path, *args)
+
+        def _resetperms(path):
+            try:
+                chflags = os.chflags
+            except AttributeError:
+                pass
+            else:
+                _dont_follow_symlinks(chflags, path, 0)
+            _dont_follow_symlinks(os.chmod, path, 0o700)
+
+        def onexc(func, path, exc):
+            if isinstance(exc, PermissionError):
+                if repeated and path == name:
+                    if ignore_errors:
+                        return
+                    raise
+
+                try:
+                    if path != name:
+                        _resetperms(os.path.dirname(path))
+                    _resetperms(path)
+
+                    try:
+                        os.unlink(path)
+                    except IsADirectoryError:
+                        cls._rmtree(path, ignore_errors=ignore_errors)
+                    except PermissionError:
+                        # The PermissionError handler was originally added for
+                        # FreeBSD in directories, but it seems that it is raised
+                        # on Windows too.
+                        # bpo-43153: Calling _rmtree again may
+                        # raise NotADirectoryError and mask the PermissionError.
+                        # So we must re-raise the current PermissionError if
+                        # path is not a directory.
+                        if not os.path.isdir(path) or os.path.isjunction(path):
+                            if ignore_errors:
+                                return
+                            raise
+                        cls._rmtree(path, ignore_errors=ignore_errors, repeated=(path == name))
+                except FileNotFoundError:
+                    pass
+            elif isinstance(exc, FileNotFoundError):
+                pass
+            else:
+                if not ignore_errors:
+                    raise
+
+        _rmtree(name, onexc=onexc)
+
+    def cleanup(self):
+        if self._finalizer.detach() or os.path.exists(self.name):
+            self._rmtree(self.name, ignore_errors=self._ignore_cleanup_errors)
diff --git a/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py b/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py
index 78016ea71c..38aea6c1f1 100644
--- a/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py
+++ b/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py
@@ -70,6 +70,17 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["openvino", "diffusers"])
 
 
+class OVStableDiffusionXLInpaintPipeline(metaclass=DummyObject):
+    _backends = ["openvino", "diffusers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["openvino", "diffusers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["openvino", "diffusers"])
+
+
 class OVLatentConsistencyModelPipeline(metaclass=DummyObject):
     _backends = ["openvino", "diffusers"]
 
@@ -79,3 +90,102 @@ def __init__(self, *args, **kwargs):
     @classmethod
     def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["openvino", "diffusers"])
+
+
+class OVLatentConsistencyModelImg2ImgPipeline(metaclass=DummyObject):
+    _backends = ["openvino", "diffusers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["openvino", "diffusers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["openvino", "diffusers"])
+
+
+class OVDiffusionPipeline(metaclass=DummyObject):
+    _backends = ["openvino", "diffusers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["openvino", "diffusers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["openvino", "diffusers"])
+
+
+class OVPipelineForText2Image(metaclass=DummyObject):
+    _backends = ["openvino", "diffusers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["openvino", "diffusers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["openvino", "diffusers"])
+
+
+class OVPipelineForImage2Image(metaclass=DummyObject):
+    _backends = ["openvino", "diffusers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["openvino", "diffusers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["openvino", "diffusers"])
+
+
+class OVPipelineForInpainting(metaclass=DummyObject):
+    _backends = ["openvino", "diffusers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["openvino", "diffusers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["openvino", "diffusers"])
+
+
+class OVStableDiffusion3Img2ImgPipeline(metaclass=DummyObject):
+    _backends = ["openvino", "diffusers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["openvino", "diffusers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["openvino", "diffusers"])
+
+
+class OVStableDiffusion3Pipeline(metaclass=DummyObject):
+    _backends = ["openvino", "diffusers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["openvino", "diffusers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["openvino", "diffusers"])
+
+
+class OVStableDiffusion3InpaintPipeline(metaclass=DummyObject):
+    _backends = ["openvino", "diffusers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["openvino", "diffusers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["openvino", "diffusers"])
+
+
+class OVFluxPipeline(metaclass=DummyObject):
+    _backends = ["openvino", "diffusers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["openvino", "diffusers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["openvino", "diffusers"])
diff --git a/optimum/intel/utils/import_utils.py b/optimum/intel/utils/import_utils.py
index d231a6befa..6fa6c590b5 100644
--- a/optimum/intel/utils/import_utils.py
+++ b/optimum/intel/utils/import_utils.py
@@ -43,6 +43,13 @@
     except importlib_metadata.PackageNotFoundError:
         _transformers_available = False
 
+_tokenizers_available = importlib.util.find_spec("tokenizers") is not None
+_tokenizers_version = "N/A"
+if _tokenizers_available:
+    try:
+        _tokenizers_version = importlib_metadata.version("tokenizers")
+    except importlib_metadata.PackageNotFoundError:
+        _tokenizers_available = False
 
 _torch_available = importlib.util.find_spec("torch") is not None
 _torch_version = "N/A"
@@ -181,6 +188,10 @@ def is_transformers_available():
     return _transformers_available
 
 
+def is_tokenizers_available():
+    return _tokenizers_available
+
+
 def is_neural_compressor_available():
     return _neural_compressor_available
 
@@ -340,6 +351,15 @@ def is_transformers_version(operation: str, version: str):
     return compare_versions(parse(_transformers_version), operation, version)
 
 
+def is_tokenizers_version(operation: str, version: str):
+    """
+    Compare the current Tokenizers version to a given reference with an operation.
+    """
+    if not _tokenizers_available:
+        return False
+    return compare_versions(parse(_tokenizers_version), operation, version)
+
+
 def is_optimum_version(operation: str, version: str):
     return compare_versions(parse(_optimum_version), operation, version)
 
@@ -362,6 +382,24 @@ def is_openvino_version(operation: str, version: str):
     return compare_versions(parse(_openvino_version), operation, version)
 
 
+def is_openvino_tokenizers_version(operation: str, version: str):
+    if not is_openvino_available():
+        return False
+    if not is_openvino_tokenizers_available():
+        return False
+    import openvino_tokenizers
+
+    tokenizers_version = openvino_tokenizers.__version__
+
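+    # dev builds may report the placeholder version "0.0.0.0"; fall back to the installed package metadata in that case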
+    if tokenizers_version == "0.0.0.0":
+        try:
+            tokenizers_version = importlib_metadata.version("openvino_tokenizers") or tokenizers_version
+        except importlib_metadata.PackageNotFoundError:
+            pass
+
+    return compare_versions(parse(tokenizers_version), operation, version)
+
+
 def is_diffusers_version(operation: str, version: str):
     """
     Compare the current diffusers version to a given reference with an operation.
diff --git a/optimum/intel/utils/modeling_utils.py b/optimum/intel/utils/modeling_utils.py
index a05efc46c7..a39957bbf7 100644
--- a/optimum/intel/utils/modeling_utils.py
+++ b/optimum/intel/utils/modeling_utils.py
@@ -123,17 +123,20 @@ def _find_files_matching_pattern(
         str(model_name_or_path), subfolder=subfolder, revision=revision, token=token
     )
     if library_name == "diffusers":
-        subfolder = os.path.join(subfolder, "unet")
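+        # diffusers checkpoints keep weights under either a unet/ or a transformer/ subfolder (e.g. SD3 and Flux use transformer/)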
+        subfolders = [os.path.join(subfolder, "unet"), os.path.join(subfolder, "transformer")]
     else:
-        subfolder = subfolder or "."
+        subfolders = [subfolder or "."]
 
     if model_path.is_dir():
-        glob_pattern = subfolder + "/*"
-        files = model_path.glob(glob_pattern)
-        files = [p for p in files if re.search(pattern, str(p))]
+        files = []
+        for subfolder in subfolders:
+            glob_pattern = subfolder + "/*"
+            files_ = model_path.glob(glob_pattern)
+            files_ = [p for p in files_ if re.search(pattern, str(p))]
+            files.extend(files_)
     else:
         repo_files = map(Path, HfApi().list_repo_files(model_name_or_path, revision=revision, token=token))
-        files = [Path(p) for p in repo_files if re.match(pattern, str(p)) and str(p.parent) == subfolder]
+        files = [Path(p) for p in repo_files if re.match(pattern, str(p)) and str(p.parent) in subfolders]
 
     return files
 
diff --git a/optimum/intel/version.py b/optimum/intel/version.py
index e118ea7131..16bf124e0e 100644
--- a/optimum/intel/version.py
+++ b/optimum/intel/version.py
@@ -12,4 +12,4 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
-__version__ = "1.20.0.dev0"
+__version__ = "1.21.0.dev0"
diff --git a/setup.py b/setup.py
index 18993b31a6..fa4d94e507 100644
--- a/setup.py
+++ b/setup.py
@@ -55,19 +55,16 @@
     "tiktoken",
     "sentence-transformers",
     "open_clip_torch>=2.26.1",
+    "peft",
 ]
 
 QUALITY_REQUIRE = ["black~=23.1", "ruff==0.4.4"]
 
 EXTRAS_REQUIRE = {
-    "neural-compressor": ["neural-compressor[pt]>3.0", "accelerate"],
-    "openvino": [
-        "openvino==2024.4.1.dev20240926",
-        "nncf>=2.11.0",
-        "openvino-tokenizers[transformers]==2024.4.1.0.dev20240926",
-    ],
     "nncf": ["nncf>=2.11.0"],
     "ipex": ["intel-extension-for-pytorch", "transformers>=4.45,<4.46"],
+    "openvino": ["nncf>=2.11.0", "openvino==2024.5.0", "openvino-tokenizers==2024.5.0"],
+    "neural-compressor": ["neural-compressor[pt]>3.0", "accelerate", "transformers<4.46"],
     "diffusers": ["diffusers"],
     "quality": QUALITY_REQUIRE,
     "tests": TESTS_REQUIRE,
@@ -88,7 +85,6 @@
         "Intended Audience :: Education",
         "Intended Audience :: Science/Research",
         "Operating System :: OS Independent",
-        "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",
         "Programming Language :: Python :: 3.11",
diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py
index dc919ec5a2..f74675ddd6 100644
--- a/tests/ipex/test_modeling.py
+++ b/tests/ipex/test_modeling.py
@@ -46,7 +46,7 @@
 )
 from optimum.intel.utils.import_utils import is_ipex_version
 from optimum.utils.testing_utils import grid_parameters
-from utils_tests import MODEL_NAMES
+from utils_tests import MODEL_NAMES, IS_XPU
 
 
 SEED = 42
@@ -80,11 +80,12 @@ def test_compare_to_transformers(self, model_arch):
         model_id = MODEL_NAMES[model_arch]
         set_seed(SEED)
         ipex_model = self.IPEX_MODEL_CLASS.from_pretrained(model_id, export=True)
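+        # keep the reference transformers model and the tokenized inputs on the same device as the IPEX model (CPU or XPU)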
+        device = ipex_model.device
         self.assertIsInstance(ipex_model.config, PretrainedConfig)
-        transformers_model = self.IPEX_MODEL_CLASS.auto_model_class.from_pretrained(model_id)
+        transformers_model = self.IPEX_MODEL_CLASS.auto_model_class.from_pretrained(model_id).to(device)
         tokenizer = AutoTokenizer.from_pretrained(model_id)
         inputs = "This is a sample input"
-        tokens = tokenizer(inputs, return_tensors="pt")
+        tokens = tokenizer(inputs, return_tensors="pt").to(device)
         with torch.no_grad():
             transformers_outputs = transformers_model(**tokens)
         outputs = ipex_model(**tokens)
@@ -144,11 +145,12 @@ def test_compare_to_transformers(self, model_arch):
         model_id = MODEL_NAMES[model_arch]
         set_seed(SEED)
         ipex_model = IPEXModelForQuestionAnswering.from_pretrained(model_id, export=True)
+        device = ipex_model.device
         self.assertIsInstance(ipex_model.config, PretrainedConfig)
-        transformers_model = AutoModelForQuestionAnswering.from_pretrained(model_id)
+        transformers_model = AutoModelForQuestionAnswering.from_pretrained(model_id).to(device)
         tokenizer = AutoTokenizer.from_pretrained(model_id)
         inputs = "This is a sample input"
-        tokens = tokenizer(inputs, return_tensors="pt")
+        tokens = tokenizer(inputs, return_tensors="pt").to(device)
         with torch.no_grad():
             transformers_outputs = transformers_model(**tokens)
         outputs = ipex_model(**tokens)
@@ -201,14 +203,14 @@ class IPEXModelForCausalLMTest(unittest.TestCase):
         "gpt_neo",
         "gpt_neox",
         "mistral",
-        "llama",
+        # "llama",
         "llama2",
         # "phi",
-        "distilgpt2",
+        # "distilgpt2",
         "mpt",
         "opt",
     )
-    IPEX_PATCHED_SUPPORTED_ARCHITECTURES = ("llama2", "distilgpt2", "falcon")
+    IPEX_PATCHED_SUPPORTED_ARCHITECTURES = ("llama2", "falcon", "gpt2")
     GENERATION_LENGTH = 100
     SPEEDUP_CACHE = 1.0
 
@@ -216,7 +218,11 @@ class IPEXModelForCausalLMTest(unittest.TestCase):
     def test_compare_to_transformers(self, model_arch):
         model_id = MODEL_NAMES[model_arch]
         set_seed(SEED)
-        ipex_model = IPEXModelForCausalLM.from_pretrained(model_id, export=True)
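+        # run in half precision on XPU devices, full precision on CPU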
+        dtype = torch.float32
+        if IS_XPU:
+            dtype = torch.float16
+        ipex_model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, torch_dtype=dtype)
+        device = ipex_model.device
         self.assertIsInstance(ipex_model.config, PretrainedConfig)
         self.assertTrue(ipex_model.use_cache)
         tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -224,20 +230,20 @@ def test_compare_to_transformers(self, model_arch):
             "This is a sample",
             return_tensors="pt",
             return_token_type_ids=False if model_arch in ("llama", "llama2") else None,
-        )
+        ).to(device)
         inputs = ipex_model.prepare_inputs_for_generation(**tokens)
         outputs = ipex_model(**inputs)
 
         self.assertIsInstance(outputs.logits, torch.Tensor)
 
-        transformers_model = AutoModelForCausalLM.from_pretrained(model_id)
+        transformers_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype).to(device)
         with torch.no_grad():
             transformers_outputs = transformers_model(**tokens)
 
         # Test re-load model
         with tempfile.TemporaryDirectory() as tmpdirname:
             ipex_model.save_pretrained(tmpdirname)
-            loaded_model = self.IPEX_MODEL_CLASS.from_pretrained(tmpdirname)
+            loaded_model = self.IPEX_MODEL_CLASS.from_pretrained(tmpdirname, torch_dtype=dtype)
             loaded_model_outputs = loaded_model(**inputs)
 
         # Test init method
@@ -252,11 +258,14 @@ def test_compare_to_transformers(self, model_arch):
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     def test_pipeline(self, model_arch):
+        dtype = torch.float32
+        if IS_XPU:
+            dtype = torch.float16
         model_id = MODEL_NAMES[model_arch]
         tokenizer = AutoTokenizer.from_pretrained(model_id)
-        model = IPEXModelForCausalLM.from_pretrained(model_id, export=True)
+        model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, torch_dtype=dtype)
         model.config.encoder_no_repeat_ngram_size = 0
-        model.to("cpu")
+        # model.to("cpu")
         pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
         outputs = pipe("This is a sample", max_new_tokens=10)
         self.assertEqual(pipe.device, model.device)
@@ -264,14 +273,18 @@ def test_pipeline(self, model_arch):
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     def test_assisted_decoding(self, model_arch):
-        # Patched models are not support assisted decoding if ipex < 2.5.
-        if model_arch in self.IPEX_PATCHED_SUPPORTED_ARCHITECTURES and is_ipex_version("<", "2.4.0"):
+        # assisted decoding does not support the static cache used by patched models yet
+        if model_arch in self.IPEX_PATCHED_SUPPORTED_ARCHITECTURES:
             return
         model_id = MODEL_NAMES[model_arch]
+        dtype = torch.float32
+        if IS_XPU:
+            dtype = torch.float16
         tokenizer = AutoTokenizer.from_pretrained(model_id)
-        ipex_model = IPEXModelForCausalLM.from_pretrained(model_id, export=True)
-        transformers_model = AutoModelForCausalLM.from_pretrained(model_id)
-        tokens = tokenizer("This is a sample input", return_tensors="pt")
+        ipex_model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, torch_dtype=dtype)
+        device = ipex_model.device
+        transformers_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype).to(device)
+        tokens = tokenizer("This is a sample input", return_tensors="pt").to(device)
         ipex_output = ipex_model.generate(**tokens, do_sample=False, max_new_tokens=4)
         ipex_output_assisted = ipex_model.generate(
             **tokens, do_sample=False, assistant_model=transformers_model, max_new_tokens=4
@@ -299,8 +312,12 @@ def test_assisted_decoding(self, model_arch):
     def test_ipex_patching_beam_search(self, test_name, model_arch, use_cache):
         model_id = MODEL_NAMES[model_arch]
         set_seed(SEED)
-        model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, use_cache=use_cache)
-        transformers_model = AutoModelForCausalLM.from_pretrained(model_id)
+        dtype = torch.float32
+        if IS_XPU:
+            dtype = torch.float16
+        model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, use_cache=use_cache, torch_dtype=dtype)
+        device = model.device
+        transformers_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype).to(device)
         self.assertEqual(model.use_cache, use_cache)
         tokenizer = AutoTokenizer.from_pretrained(model_id)
         tokenizer.pad_token = tokenizer.eos_token
@@ -316,7 +333,7 @@ def test_ipex_patching_beam_search(self, test_name, model_arch, use_cache):
             ),
         )
         for text in texts:
-            tokens = tokenizer(text, padding=True, return_tensors="pt")
+            tokens = tokenizer(text, padding=True, return_tensors="pt").to(device)
             for generation_config in generation_configs:
                 outputs = model.generate(**tokens, generation_config=generation_config)
                 transformers_outputs = transformers_model.generate(**tokens, generation_config=generation_config)
@@ -325,18 +342,21 @@ def test_ipex_patching_beam_search(self, test_name, model_arch, use_cache):
 
     @unittest.skipIf(is_ipex_version("<", "2.3.0"), reason="Only ipex version > 2.3.0 supports ipex model patching")
     def test_compare_with_and_without_past_key_values(self):
-        model_id = "Jiqing/tiny_random_llama2"
+        model_id = "Intel/tiny_random_llama2"
+        dtype = torch.float32
+        if IS_XPU:
+            dtype = torch.float16
+        model_with_pkv = IPEXModelForCausalLM.from_pretrained(model_id, use_cache=True, torch_dtype=dtype)
+        device = model_with_pkv.device
         tokenizer = AutoTokenizer.from_pretrained(model_id)
-        tokens = tokenizer("This is a sample input", return_tensors="pt")
-
-        model_with_pkv = IPEXModelForCausalLM.from_pretrained(model_id, use_cache=True)
+        tokens = tokenizer("This is a sample input", return_tensors="pt").to(device)
         # Warmup
         model_with_pkv.generate(**tokens)
         with Timer() as with_pkv_timer:
             outputs_model_with_pkv = model_with_pkv.generate(
                 **tokens, min_new_tokens=self.GENERATION_LENGTH, max_new_tokens=self.GENERATION_LENGTH, num_beams=1
             )
-        model_without_pkv = IPEXModelForCausalLM.from_pretrained(model_id, use_cache=False)
+        model_without_pkv = IPEXModelForCausalLM.from_pretrained(model_id, use_cache=False, torch_dtype=dtype)
         # Warmup
         model_without_pkv.generate(**tokens)
         with Timer() as without_pkv_timer:
@@ -366,10 +386,11 @@ def _generate_random_audio_data(self):
     def test_compare_to_transformers(self, model_arch):
         model_id = MODEL_NAMES[model_arch]
         ipex_model = self.IPEX_MODEL_CLASS.from_pretrained(model_id, export=True)
+        device = ipex_model.device
         self.assertIsInstance(ipex_model.config, PretrainedConfig)
-        transformers_model = self.IPEX_MODEL_CLASS.auto_model_class.from_pretrained(model_id)
+        transformers_model = self.IPEX_MODEL_CLASS.auto_model_class.from_pretrained(model_id).to(device)
         preprocessor = AutoFeatureExtractor.from_pretrained(model_id)
-        inputs = preprocessor(self._generate_random_audio_data(), return_tensors="pt")
+        inputs = preprocessor(self._generate_random_audio_data(), return_tensors="pt").to(device)
         with torch.no_grad():
             transformers_outputs = transformers_model(**inputs)
         outputs = ipex_model(**inputs)
@@ -417,12 +438,13 @@ def test_compare_to_transformers(self, model_arch):
         model_id = MODEL_NAMES[model_arch]
         set_seed(SEED)
         ipex_model = self.IPEX_MODEL_CLASS.from_pretrained(model_id, export=True)
+        device = ipex_model.device
         self.assertIsInstance(ipex_model.config, PretrainedConfig)
-        transformers_model = self.IPEX_MODEL_CLASS.auto_model_class.from_pretrained(model_id)
+        transformers_model = self.IPEX_MODEL_CLASS.auto_model_class.from_pretrained(model_id).to(device)
         preprocessor = AutoFeatureExtractor.from_pretrained(model_id)
         url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         image = Image.open(requests.get(url, stream=True).raw)
-        inputs = preprocessor(images=image, return_tensors="pt")
+        inputs = preprocessor(images=image, return_tensors="pt").to(device)
         with torch.no_grad():
             transformers_outputs = transformers_model(**inputs)
         outputs = ipex_model(**inputs)
@@ -440,7 +462,7 @@ def test_compare_to_transformers(self, model_arch):
         self.assertIn("logits", outputs)
         # Compare tensor outputs
         self.assertTrue(torch.allclose(outputs.logits, transformers_outputs.logits, atol=1e-4))
-        self.assertTrue(torch.equal(outputs.logits, loaded_model_outputs.logits))
+        self.assertTrue(torch.allclose(outputs.logits, loaded_model_outputs.logits, atol=1e-4))
         self.assertTrue(torch.allclose(init_model_outputs.logits, transformers_outputs.logits, atol=1e-4))
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
diff --git a/tests/ipex/test_pipelines.py b/tests/ipex/test_pipelines.py
index 696f5c9c20..4580303469 100644
--- a/tests/ipex/test_pipelines.py
+++ b/tests/ipex/test_pipelines.py
@@ -20,7 +20,7 @@
 from parameterized import parameterized
 from transformers import AutoTokenizer
 from transformers.pipelines import pipeline as transformers_pipeline
-from utils_tests import MODEL_NAMES
+from utils_tests import IS_XPU, MODEL_NAMES
 
 from optimum.intel.ipex.modeling_base import (
     IPEXModelForAudioClassification,
@@ -56,7 +56,6 @@ class PipelinesIntegrationTest(unittest.TestCase):
         "gpt2",
         "gpt_neo",
         "gpt_neox",
-        "llama",
         "llama2",
         "mistral",
         "mpt",
@@ -130,8 +129,11 @@ def test_fill_mask_pipeline_inference(self, model_arch):
     @parameterized.expand(TEXT_GENERATION_SUPPORTED_ARCHITECTURES)
     def test_text_generation_pipeline_inference(self, model_arch):
         model_id = MODEL_NAMES[model_arch]
-        transformers_generator = transformers_pipeline("text-generation", model_id)
-        ipex_generator = ipex_pipeline("text-generation", model_id, accelerator="ipex")
+        dtype = torch.float32
+        if IS_XPU:
+            dtype = torch.float16
+        transformers_generator = transformers_pipeline("text-generation", model_id, torch_dtype=dtype)
+        ipex_generator = ipex_pipeline("text-generation", model_id, accelerator="ipex", torch_dtype=dtype)
         inputs = "Describe a real-world application of AI."
         with torch.inference_mode():
             transformers_output = transformers_generator(inputs, max_new_tokens=10)
diff --git a/tests/ipex/utils_tests.py b/tests/ipex/utils_tests.py
index 78bdcd7ec6..a16f91dc04 100644
--- a/tests/ipex/utils_tests.py
+++ b/tests/ipex/utils_tests.py
@@ -11,8 +11,11 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
+from transformers import is_torch_xpu_available
 
 
+IS_XPU = is_torch_xpu_available(check_device=True)
+
 MODEL_NAMES = {
     "albert": "hf-internal-testing/tiny-random-albert",
     "beit": "hf-internal-testing/tiny-random-BeitForImageClassification",
@@ -28,15 +31,15 @@
     "distilgpt2": "Jiqing/tiny_random_distilgpt2",
     "electra": "hf-internal-testing/tiny-random-electra",
     "flaubert": "hf-internal-testing/tiny-random-flaubert",
-    "falcon": "Jiqing/tiny_random_falcon",
+    "falcon": "Intel/tiny_random_falcon",
     "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel",
-    "gpt2": "hf-internal-testing/tiny-random-gpt2",
+    "gpt2": "Intel/tiny_random_gpt2",
     "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel",
     "gpt_neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM",
     "gptj": "hf-internal-testing/tiny-random-GPTJModel",
     "levit": "hf-internal-testing/tiny-random-LevitModel",
     "llama": "fxmarty/tiny-llama-fast-tokenizer",
-    "llama2": "Jiqing/tiny_random_llama2",
+    "llama2": "Intel/tiny_random_llama2",
     "marian": "sshleifer/tiny-marian-en-de",
     "mbart": "hf-internal-testing/tiny-random-mbart",
     "mistral": "echarlaix/tiny-random-mistral",
diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py
index 6248d3eda6..2baeba9a42 100644
--- a/tests/openvino/test_diffusion.py
+++ b/tests/openvino/test_diffusion.py
@@ -12,52 +12,45 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
-import random
-import tempfile
 import unittest
-from typing import Dict
+from pathlib import Path
 
 import numpy as np
-import PIL
 import pytest
 import torch
 from diffusers import (
-    StableDiffusionPipeline,
-    StableDiffusionXLImg2ImgPipeline,
-    StableDiffusionXLPipeline,
+    AutoPipelineForImage2Image,
+    AutoPipelineForInpainting,
+    AutoPipelineForText2Image,
+    DiffusionPipeline,
 )
+from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
 from diffusers.utils import load_image
-from diffusers.utils.testing_utils import floats_tensor
-from openvino.runtime.ie_api import CompiledModel
 from parameterized import parameterized
 from transformers.testing_utils import slow
-from utils_tests import MODEL_NAMES
-
-from optimum.intel import (
-    OVLatentConsistencyModelPipeline,
-    OVStableDiffusionImg2ImgPipeline,
-    OVStableDiffusionInpaintPipeline,
-    OVStableDiffusionPipeline,
-    OVStableDiffusionXLImg2ImgPipeline,
-    OVStableDiffusionXLPipeline,
-)
-from optimum.intel.openvino.modeling_diffusion import (
-    OVModelTextEncoder,
-    OVModelUnet,
-    OVModelVaeDecoder,
-    OVModelVaeEncoder,
-)
-from optimum.intel.utils.import_utils import is_diffusers_version
-from optimum.utils.import_utils import is_onnxruntime_available
+from utils_tests import MODEL_NAMES, SEED
 
-
-F32_CONFIG = {"INFERENCE_PRECISION_HINT": "f32"}
+from optimum.intel.openvino import (
+    OVDiffusionPipeline,
+    OVPipelineForImage2Image,
+    OVPipelineForInpainting,
+    OVPipelineForText2Image,
+)
+from optimum.intel.openvino.utils import TemporaryDirectory
+from optimum.intel.utils.import_utils import is_transformers_version
+from optimum.utils.testing_utils import require_diffusers
 
 
-SEED = 0
+def get_generator(framework, seed):
+    if framework == "np":
+        return np.random.RandomState(seed)
+    elif framework == "pt":
+        return torch.Generator().manual_seed(seed)
+    else:
+        raise ValueError(f"Unknown framework: {framework}")
 
 
-def _generate_inputs(batch_size=1):
+def _generate_prompts(batch_size=1):
     inputs = {
         "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size,
         "num_inference_steps": 3,
@@ -67,7 +60,7 @@ def _generate_inputs(batch_size=1):
     return inputs
 
 
-def _create_image(height=128, width=128, batch_size=1, channel=3, input_type="pil"):
+def _generate_images(height=128, width=128, batch_size=1, channel=3, input_type="pil"):
     if input_type == "pil":
         image = load_image(
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
@@ -81,537 +74,779 @@ def _create_image(height=128, width=128, batch_size=1, channel=3, input_type="pi
     return [image] * batch_size
 
 
-def to_np(image):
-    if isinstance(image[0], PIL.Image.Image):
-        return np.stack([np.array(i) for i in image], axis=0)
-    elif isinstance(image, torch.Tensor):
-        return image.cpu().numpy().transpose(0, 2, 3, 1)
-    return image
+class OVPipelineForText2ImageTest(unittest.TestCase):
+    SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"]
+    NEGATIVE_PROMPT_SUPPORT_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"]
+    if is_transformers_version(">=", "4.40.0"):
+        SUPPORTED_ARCHITECTURES.extend(["stable-diffusion-3", "flux"])
+        NEGATIVE_PROMPT_SUPPORT_ARCHITECTURES.append("stable-diffusion-3")
+    CALLBACK_SUPPORT_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"]
 
+    OVMODEL_CLASS = OVPipelineForText2Image
+    AUTOMODEL_CLASS = AutoPipelineForText2Image
 
-class OVStableDiffusionPipelineBaseTest(unittest.TestCase):
-    SUPPORTED_ARCHITECTURES = ("stable-diffusion",)
-    MODEL_CLASS = OVStableDiffusionPipeline
     TASK = "text-to-image"
 
+    def generate_inputs(self, height=128, width=128, batch_size=1):
+        inputs = _generate_prompts(batch_size=batch_size)
+
+        inputs["height"] = height
+        inputs["width"] = width
+
+        return inputs
+
+    @require_diffusers
+    def test_load_vanilla_model_which_is_not_supported(self):
+        with self.assertRaises(Exception) as context:
+            _ = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES["bert"], export=True)
+
+        self.assertIn(f"does not appear to have a file named {self.OVMODEL_CLASS.config_name}", str(context.exception))
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_ov_pipeline_class_dispatch(self, model_arch: str):
+        auto_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+        ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+
+        self.assertEqual(ov_pipeline.auto_model_class, auto_pipeline.__class__)
+
+        auto_pipeline = DiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch])
+        ov_pipeline = OVDiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch])
+
+        self.assertEqual(ov_pipeline.auto_model_class, auto_pipeline.__class__)
+
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
     def test_num_images_per_prompt(self, model_arch: str):
-        model_id = MODEL_NAMES[model_arch]
-        pipeline = self.MODEL_CLASS.from_pretrained(model_id, compile=False)
-        pipeline.to("cpu")
-        pipeline.compile()
-        self.assertEqual(pipeline.vae_scale_factor, 2)
-        self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4)
-        self.assertEqual(pipeline.unet.config["in_channels"], 4)
-        batch_size, height = 2, 128
-        for width in [64, 128]:
-            inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
-            for num_images in [1, 3]:
-                outputs = pipeline(**inputs, num_images_per_prompt=num_images).images
-                self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3))
+        pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+
+        for batch_size in [1, 3]:
+            for height in [64, 128]:
+                for width in [64, 128]:
+                    for num_images_per_prompt in [1, 3]:
+                        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+                        outputs = pipeline(**inputs, num_images_per_prompt=num_images_per_prompt).images
+                        self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, height, width, 3))
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_compare_to_diffusers_pipeline(self, model_arch: str):
+        height, width, batch_size = 128, 128, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], text_encoder_3=None)
+        diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], text_encoder_3=None)
+
+        for output_type in ["latent", "np", "pt"]:
+            inputs["output_type"] = output_type
+
+            ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images
+            diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images
+
+            np.testing.assert_allclose(ov_output, diffusers_output, atol=6e-3, rtol=1e-2)
+
+    @parameterized.expand(CALLBACK_SUPPORT_ARCHITECTURES)
+    @require_diffusers
+    def test_callback(self, model_arch: str):
+        height, width, batch_size = 64, 128, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        class Callback:
+            def __init__(self):
+                self.has_been_called = False
+                self.number_of_steps = 0
+
+            def __call__(self, *args, **kwargs) -> None:
+                self.has_been_called = True
+                self.number_of_steps += 1
+
+        ov_callback = Callback()
+        auto_callback = Callback()
+
+        ov_pipe = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+        auto_pipe = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+
+        # callback_steps=1 to trigger callback every step
+        ov_pipe(**inputs, callback=ov_callback, callback_steps=1)
+        auto_pipe(**inputs, callback=auto_callback, callback_steps=1)
+
+        self.assertTrue(ov_callback.has_been_called)
+        self.assertTrue(auto_callback.has_been_called)
+        self.assertEqual(auto_callback.number_of_steps, ov_callback.number_of_steps)
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_shape(self, model_arch: str):
+        pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+
+        height, width, batch_size = 128, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        for output_type in ["pil", "np", "pt", "latent"]:
+            inputs["output_type"] = output_type
+            outputs = pipeline(**inputs).images
+            if output_type == "pil":
+                self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width))
+            elif output_type == "np":
+                self.assertEqual(outputs.shape, (batch_size, height, width, 3))
+            elif output_type == "pt":
+                self.assertEqual(outputs.shape, (batch_size, 3, height, width))
+            else:
+                if model_arch != "flux":
+                    out_channels = (
+                        pipeline.unet.config.out_channels
+                        if pipeline.unet is not None
+                        else pipeline.transformer.config.out_channels
+                    )
+                    self.assertEqual(
+                        outputs.shape,
+                        (
+                            batch_size,
+                            out_channels,
+                            height // pipeline.vae_scale_factor,
+                            width // pipeline.vae_scale_factor,
+                        ),
+                    )
+                else:
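+                    # Flux returns packed latents: one row per packed spatial position instead of a 4D feature map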
+                    packed_height = height // pipeline.vae_scale_factor
+                    packed_width = width // pipeline.vae_scale_factor
+                    channels = pipeline.transformer.config.in_channels
+                    self.assertEqual(outputs.shape, (batch_size, packed_height * packed_width, channels))
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_image_reproducibility(self, model_arch: str):
+        pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+
+        height, width, batch_size = 64, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        for generator_framework in ["np", "pt"]:
+            ov_outputs_1 = pipeline(**inputs, generator=get_generator(generator_framework, SEED))
+            ov_outputs_2 = pipeline(**inputs, generator=get_generator(generator_framework, SEED))
+            ov_outputs_3 = pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1))
+
+            self.assertFalse(np.array_equal(ov_outputs_1.images[0], ov_outputs_3.images[0]))
+            np.testing.assert_allclose(ov_outputs_1.images[0], ov_outputs_2.images[0], atol=1e-4, rtol=1e-2)
+
+    @parameterized.expand(NEGATIVE_PROMPT_SUPPORT_ARCHITECTURES)
+    def test_negative_prompt(self, model_arch: str):
+        height, width, batch_size = 64, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        negative_prompt = ["This is a negative prompt"]
+        pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+
+        images_1 = pipeline(**inputs, negative_prompt=negative_prompt, generator=get_generator("pt", SEED)).images
+        prompt = inputs.pop("prompt")
+
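+        # encoding the prompts manually and passing the embeddings must reproduce the images generated from the raw prompts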
+        if model_arch == "stable-diffusion-xl":
+            (
+                inputs["prompt_embeds"],
+                inputs["negative_prompt_embeds"],
+                inputs["pooled_prompt_embeds"],
+                inputs["negative_pooled_prompt_embeds"],
+            ) = pipeline.encode_prompt(
+                prompt=prompt,
+                num_images_per_prompt=1,
+                device=torch.device("cpu"),
+                do_classifier_free_guidance=True,
+                negative_prompt=negative_prompt,
+            )
+        elif model_arch == "stable-diffusion-3":
+            (
+                inputs["prompt_embeds"],
+                inputs["negative_prompt_embeds"],
+                inputs["pooled_prompt_embeds"],
+                inputs["negative_pooled_prompt_embeds"],
+            ) = pipeline.encode_prompt(
+                prompt=prompt,
+                prompt_2=None,
+                prompt_3=None,
+                num_images_per_prompt=1,
+                device=torch.device("cpu"),
+                do_classifier_free_guidance=True,
+                negative_prompt=negative_prompt,
+            )
+
+        else:
+            inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = pipeline.encode_prompt(
+                prompt=prompt,
+                num_images_per_prompt=1,
+                device=torch.device("cpu"),
+                do_classifier_free_guidance=True,
+                negative_prompt=negative_prompt,
+            )
+
+        images_2 = pipeline(**inputs, generator=get_generator("pt", SEED)).images
+
+        np.testing.assert_allclose(images_1, images_2, atol=1e-4, rtol=1e-2)
+
+    @parameterized.expand(["stable-diffusion", "latent-consistency"])
+    @require_diffusers
+    def test_safety_checker(self, model_arch: str):
+        safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker")
+
+        pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], safety_checker=safety_checker)
+        ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], safety_checker=safety_checker)
+
+        self.assertIsInstance(pipeline.safety_checker, StableDiffusionSafetyChecker)
+        self.assertIsInstance(ov_pipeline.safety_checker, StableDiffusionSafetyChecker)
+
+        height, width, batch_size = 32, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED))
+        diffusers_output = pipeline(**inputs, generator=get_generator("pt", SEED))
+
+        ov_nsfw_content_detected = ov_output.nsfw_content_detected
+        diffusers_nsfw_content_detected = diffusers_output.nsfw_content_detected
+
+        self.assertIsNotNone(ov_nsfw_content_detected)
+        self.assertIsNotNone(diffusers_nsfw_content_detected)
+        self.assertEqual(ov_nsfw_content_detected, diffusers_nsfw_content_detected)
+
+        ov_images = ov_output.images
+        diffusers_images = diffusers_output.images
+
+        np.testing.assert_allclose(ov_images, diffusers_images, atol=1e-4, rtol=1e-2)
+
+    @require_diffusers
+    def test_load_and_save_pipeline_with_safety_checker(self):
+        model_id = "katuni4ka/tiny-random-stable-diffusion-with-safety-checker"
+        ov_pipeline = self.OVMODEL_CLASS.from_pretrained(model_id)
+        self.assertIsNotNone(ov_pipeline.safety_checker)
+        self.assertIsInstance(ov_pipeline.safety_checker, StableDiffusionSafetyChecker)
+        with TemporaryDirectory() as tmpdirname:
+            ov_pipeline.save_pretrained(tmpdirname)
+            for subdir in [
+                "text_encoder",
+                "tokenizer",
+                "unet",
+                "vae_encoder",
+                "vae_decoder",
+                "scheduler",
+                "feature_extractor",
+            ]:
+                subdir_path = Path(tmpdirname) / subdir
+                self.assertTrue(subdir_path.is_dir())
+            loaded_pipeline = self.OVMODEL_CLASS.from_pretrained(tmpdirname)
+            self.assertIsNotNone(loaded_pipeline.safety_checker)
+            self.assertIsInstance(loaded_pipeline.safety_checker, StableDiffusionSafetyChecker)
+            del loaded_pipeline
+        del ov_pipeline
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    def test_height_width_properties(self, model_arch: str):
+        batch_size, height, width, num_images_per_prompt = 2, 128, 64, 4
+        ov_pipeline = self.OVMODEL_CLASS.from_pretrained(
+            MODEL_NAMES[model_arch], export=True, compile=False, dynamic_shapes=True
+        )
+
+        self.assertTrue(ov_pipeline.is_dynamic)
+        self.assertEqual(ov_pipeline.batch_size, -1)
+        self.assertEqual(ov_pipeline.height, -1)
+        self.assertEqual(ov_pipeline.width, -1)
+
+        ov_pipeline.reshape(
+            batch_size=batch_size, height=height, width=width, num_images_per_prompt=num_images_per_prompt
+        )
+
+        self.assertFalse(ov_pipeline.is_dynamic)
+        expected_batch = batch_size * num_images_per_prompt
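+        # pipelines whose unet lacks a timestep_cond input (non-LCM) or whose transformer lacks a txt_ids input (non-Flux) double the batch for classifier-free guidance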
+        if (
+            ov_pipeline.unet is not None
+            and "timestep_cond" not in {inputs.get_any_name() for inputs in ov_pipeline.unet.model.inputs}
+        ) or (
+            ov_pipeline.transformer is not None
+            and "txt_ids" not in {inputs.get_any_name() for inputs in ov_pipeline.transformer.model.inputs}
+        ):
+            expected_batch *= 2
+        self.assertEqual(
+            ov_pipeline.batch_size,
+            expected_batch,
+        )
+        self.assertEqual(ov_pipeline.height, height)
+        self.assertEqual(ov_pipeline.width, width)
+
     @pytest.mark.run_slow
     @slow
+    @require_diffusers
+    def test_textual_inversion(self):
+        # for now we only test for stable-diffusion
+        # this is very slow and costly to run right now
+
+        model_id = "runwayml/stable-diffusion-v1-5"
+        ti_id = "sd-concepts-library/cat-toy"
+
+        inputs = self.generate_inputs()
+        inputs["prompt"] = "A  backpack"
+
+        diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(model_id, safety_checker=None)
+        diffusers_pipeline.load_textual_inversion(ti_id)
+
+        ov_pipeline = self.OVMODEL_CLASS.from_pretrained(model_id, compile=False, safety_checker=None)
+        ov_pipeline.load_textual_inversion(ti_id)
+
+        diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images
+        ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images
+
+        np.testing.assert_allclose(ov_output, diffusers_output, atol=1e-4, rtol=1e-2)
+
+
+class OVPipelineForImage2ImageTest(unittest.TestCase):
+    SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"]
+    if is_transformers_version(">=", "4.40.0"):
+        SUPPORTED_ARCHITECTURES.append("stable-diffusion-3")
+
+    AUTOMODEL_CLASS = AutoPipelineForImage2Image
+    OVMODEL_CLASS = OVPipelineForImage2Image
+
+    TASK = "image-to-image"
+
+    def generate_inputs(self, height=128, width=128, batch_size=1, channel=3, input_type="pil"):
+        inputs = _generate_prompts(batch_size=batch_size)
+
+        inputs["image"] = _generate_images(
+            height=height, width=width, batch_size=batch_size, channel=channel, input_type=input_type
+        )
+
+        inputs["strength"] = 0.75
+
+        return inputs
+
+    @require_diffusers
+    def test_load_vanilla_model_which_is_not_supported(self):
+        with self.assertRaises(Exception) as context:
+            _ = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES["bert"], export=True)
+
+        self.assertIn(f"does not appear to have a file named {self.OVMODEL_CLASS.config_name}", str(context.exception))
+
+    @parameterized.expand(list(SUPPORTED_ARCHITECTURES))
+    @require_diffusers
+    def test_ov_pipeline_class_dispatch(self, model_arch: str):
+        auto_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+        ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+
+        self.assertEqual(ov_pipeline.auto_model_class, auto_pipeline.__class__)
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_num_images_per_prompt(self, model_arch: str):
+        pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+
+        for batch_size in [1, 3]:
+            for height in [64, 128]:
+                for width in [64, 128]:
+                    for num_images_per_prompt in [1, 3]:
+                        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+                        outputs = pipeline(**inputs, num_images_per_prompt=num_images_per_prompt).images
+                        self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, height, width, 3))
+
+    @parameterized.expand(["stable-diffusion", "stable-diffusion-xl", "latent-consistency"])
+    @require_diffusers
     def test_callback(self, model_arch: str):
-        MODEL_NAMES[model_arch]
+        height, width, batch_size = 32, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        class Callback:
+            def __init__(self):
+                self.has_been_called = False
+                self.number_of_steps = 0
+
+            def __call__(self, *args, **kwargs) -> None:
+                self.has_been_called = True
+                self.number_of_steps += 1
+
+        ov_pipe = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+        auto_pipe = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
 
-        def callback_fn(step: int, timestep: int, latents: np.ndarray) -> None:
-            callback_fn.has_been_called = True
-            callback_fn.number_of_steps += 1
+        ov_callback = Callback()
+        auto_callback = Callback()
+        # callback_steps=1 to trigger callback every step
+        ov_pipe(**inputs, callback=ov_callback, callback_steps=1)
+        auto_pipe(**inputs, callback=auto_callback, callback_steps=1)
 
-        pipeline = self.MODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True)
-        callback_fn.has_been_called = False
-        callback_fn.number_of_steps = 0
-        inputs = self.generate_inputs(height=64, width=64)
-        pipeline(**inputs, callback=callback_fn, callback_steps=1)
-        self.assertTrue(callback_fn.has_been_called)
-        self.assertEqual(callback_fn.number_of_steps, inputs["num_inference_steps"])
+        self.assertTrue(ov_callback.has_been_called)
+        self.assertEqual(ov_callback.number_of_steps, auto_callback.number_of_steps)
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
     def test_shape(self, model_arch: str):
+        pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+
         height, width, batch_size = 128, 64, 1
-        pipeline = self.MODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True)
 
-        if self.TASK == "image-to-image":
-            input_types = ["np", "pil", "pt"]
-        elif self.TASK == "text-to-image":
-            input_types = ["np"]
-        else:
-            input_types = ["pil"]
+        for input_type in ["pil", "np", "pt"]:
+            inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type)
 
-        for input_type in input_types:
-            if self.TASK == "image-to-image":
-                inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type)
-            else:
-                inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
-            for output_type in ["np", "pil", "latent"]:
+            for output_type in ["pil", "np", "pt", "latent"]:
                 inputs["output_type"] = output_type
                 outputs = pipeline(**inputs).images
                 if output_type == "pil":
                     self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width))
                 elif output_type == "np":
                     self.assertEqual(outputs.shape, (batch_size, height, width, 3))
+                elif output_type == "pt":
+                    self.assertEqual(outputs.shape, (batch_size, 3, height, width))
                 else:
+                    out_channels = (
+                        pipeline.unet.config.out_channels
+                        if pipeline.unet is not None
+                        else pipeline.transformer.config.out_channels
+                    )
                     self.assertEqual(
                         outputs.shape,
-                        (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor),
+                        (
+                            batch_size,
+                            out_channels,
+                            height // pipeline.vae_scale_factor,
+                            width // pipeline.vae_scale_factor,
+                        ),
                     )
 
-    def generate_inputs(self, height=128, width=128, batch_size=1):
-        inputs = _generate_inputs(batch_size)
-        inputs["height"] = height
-        inputs["width"] = width
-        return inputs
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_compare_to_diffusers_pipeline(self, model_arch: str):
+        height, width, batch_size = 128, 128, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
 
+        diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], text_encoder_3=None)
+        ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], text_encoder_3=None)
 
-class OVStableDiffusionImg2ImgPipelineTest(OVStableDiffusionPipelineBaseTest):
-    SUPPORTED_ARCHITECTURES = ("stable-diffusion",)
-    MODEL_CLASS = OVStableDiffusionImg2ImgPipeline
-    TASK = "image-to-image"
+        for output_type in ["latent", "np", "pt"]:
+            inputs["output_type"] = output_type
+
+            ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images
+            diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images
+
+            np.testing.assert_allclose(ov_output, diffusers_output, atol=6e-3, rtol=1e-2)
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
-    def test_compare_diffusers_pipeline(self, model_arch: str):
-        model_id = MODEL_NAMES[model_arch]
-        pipeline = self.MODEL_CLASS.from_pretrained(model_id, export=True, ov_config=F32_CONFIG)
-        height, width, batch_size = 128, 128, 1
+    @require_diffusers
+    def test_image_reproducibility(self, model_arch: str):
+        pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+
+        height, width, batch_size = 64, 64, 1
         inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
-        inputs["prompt"] = "A painting of a squirrel eating a burger"
-        inputs["image"] = floats_tensor((batch_size, 3, height, width), rng=random.Random(SEED))
-        output = pipeline(**inputs, generator=np.random.RandomState(SEED)).images[0, -3:, -3:, -1].flatten()
-        # https://github.com/huggingface/diffusers/blob/v0.17.1/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py#L71
-        expected_slice = np.array([0.66964, 0.61614, 0.48283, 0.57811, 0.55551, 0.55392, 0.53045, 0.41177, 0.46099])
-        self.assertTrue(
-            np.allclose(output, expected_slice, atol=1e-1),
-            msg=f"Max difference: {np.abs(output - expected_slice).max()}. Actual value: {output}",
-        )
 
-    @parameterized.expand(SUPPORTED_ARCHITECTURES)
-    @pytest.mark.run_slow
-    @slow
-    def test_num_images_per_prompt_static_model(self, model_arch: str):
-        model_id = MODEL_NAMES[model_arch]
-        pipeline = self.MODEL_CLASS.from_pretrained(model_id, export=True, compile=False, dynamic_shapes=False)
-        batch_size, num_images, height, width = 2, 3, 128, 64
-        pipeline.half()
-        pipeline.reshape(batch_size=batch_size, height=height, width=width, num_images_per_prompt=num_images)
-        for _height in [height, height + 16]:
-            inputs = self.generate_inputs(height=_height, width=width, batch_size=batch_size)
-            outputs = pipeline(**inputs, num_images_per_prompt=num_images).images
-            self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3))
-
-    def generate_inputs(self, height=128, width=128, batch_size=1, input_type="np"):
-        inputs = _generate_inputs(batch_size)
-        inputs["image"] = _create_image(height=height, width=width, batch_size=batch_size, input_type=input_type)
-        inputs["strength"] = 0.75
-        return inputs
+        for generator_framework in ["np", "pt"]:
+            ov_outputs_1 = pipeline(**inputs, generator=get_generator(generator_framework, SEED))
+            ov_outputs_2 = pipeline(**inputs, generator=get_generator(generator_framework, SEED))
+            ov_outputs_3 = pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1))
 
+            self.assertFalse(np.array_equal(ov_outputs_1.images[0], ov_outputs_3.images[0]))
+            np.testing.assert_allclose(ov_outputs_1.images[0], ov_outputs_2.images[0], atol=1e-4, rtol=1e-2)
 
-class OVStableDiffusionPipelineTest(unittest.TestCase):
-    SUPPORTED_ARCHITECTURES = ("stable-diffusion",)
-    MODEL_CLASS = OVStableDiffusionPipeline
-    TASK = "text-to-image"
+    @parameterized.expand(["stable-diffusion", "latent-consistency"])
+    @require_diffusers
+    def test_safety_checker(self, model_arch: str):
+        safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker")
+
+        pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], safety_checker=safety_checker)
+        ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], safety_checker=safety_checker)
+
+        self.assertIsInstance(pipeline.safety_checker, StableDiffusionSafetyChecker)
+        self.assertIsInstance(ov_pipeline.safety_checker, StableDiffusionSafetyChecker)
+
+        height, width, batch_size = 32, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+
+        ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED))
+        diffusers_output = pipeline(**inputs, generator=get_generator("pt", SEED))
+
+        ov_nsfw_content_detected = ov_output.nsfw_content_detected
+        diffusers_nsfw_content_detected = diffusers_output.nsfw_content_detected
+
+        self.assertIsNotNone(ov_nsfw_content_detected)
+        self.assertIsNotNone(diffusers_nsfw_content_detected)
+        self.assertEqual(ov_nsfw_content_detected, diffusers_nsfw_content_detected)
+
+        ov_images = ov_output.images
+        diffusers_images = diffusers_output.images
+
+        np.testing.assert_allclose(ov_images, diffusers_images, atol=1e-4, rtol=1e-2)
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
-    def test_compare_to_diffusers(self, model_arch: str):
-        model_id = MODEL_NAMES[model_arch]
-        ov_pipeline = self.MODEL_CLASS.from_pretrained(model_id, export=True, ov_config=F32_CONFIG)
-        self.assertIsInstance(ov_pipeline.text_encoder, OVModelTextEncoder)
-        self.assertIsInstance(ov_pipeline.vae_encoder, OVModelVaeEncoder)
-        self.assertIsInstance(ov_pipeline.vae_decoder, OVModelVaeDecoder)
-        self.assertIsInstance(ov_pipeline.unet, OVModelUnet)
-        self.assertIsInstance(ov_pipeline.config, Dict)
-
-        pipeline = StableDiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch])
-        pipeline.safety_checker = None
-        batch_size, num_images_per_prompt, height, width = 1, 2, 64, 64
-
-        latents = ov_pipeline.prepare_latents(
-            batch_size * num_images_per_prompt,
-            ov_pipeline.unet.config["in_channels"],
-            height,
-            width,
-            dtype=np.float32,
-            generator=np.random.RandomState(0),
+    def test_height_width_properties(self, model_arch: str):
+        batch_size, height, width, num_images_per_prompt = 2, 128, 64, 4
+        ov_pipeline = self.OVMODEL_CLASS.from_pretrained(
+            MODEL_NAMES[model_arch], export=True, compile=False, dynamic_shapes=True
         )
 
-        kwargs = {
-            "prompt": "sailing ship in storm by Leonardo da Vinci",
-            "num_inference_steps": 1,
-            "num_images_per_prompt": num_images_per_prompt,
-            "height": height,
-            "width": width,
-            "guidance_rescale": 0.1,
-        }
-
-        for output_type in ["latent", "np"]:
-            ov_outputs = ov_pipeline(latents=latents, output_type=output_type, **kwargs).images
-            self.assertIsInstance(ov_outputs, np.ndarray)
-            with torch.no_grad():
-                outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images
-            # Compare model outputs
-            self.assertTrue(np.allclose(ov_outputs, outputs, atol=1e-4))
-
-        # Compare model devices
-        self.assertEqual(pipeline.device, ov_pipeline.device)
+        self.assertTrue(ov_pipeline.is_dynamic)
+        self.assertEqual(ov_pipeline.batch_size, -1)
+        self.assertEqual(ov_pipeline.height, -1)
+        self.assertEqual(ov_pipeline.width, -1)
 
-    @parameterized.expand(SUPPORTED_ARCHITECTURES)
-    def test_image_reproducibility(self, model_arch: str):
-        model_id = MODEL_NAMES[model_arch]
-        pipeline = self.MODEL_CLASS.from_pretrained(model_id, export=True)
-        inputs = _generate_inputs()
-        height, width = 64, 64
-        ov_outputs_1 = pipeline(**inputs, height=height, width=width, generator=np.random.RandomState(SEED))
-        ov_outputs_2 = pipeline(**inputs, height=height, width=width, generator=np.random.RandomState(SEED))
-        ov_outputs_3 = pipeline(**inputs, height=height, width=width)
-        # Compare model outputs
-        self.assertTrue(np.array_equal(ov_outputs_1.images[0], ov_outputs_2.images[0]))
-        self.assertFalse(np.array_equal(ov_outputs_1.images[0], ov_outputs_3.images[0]))
+        ov_pipeline.reshape(
+            batch_size=batch_size, height=height, width=width, num_images_per_prompt=num_images_per_prompt
+        )
+
+        self.assertFalse(ov_pipeline.is_dynamic)
+        expected_batch = batch_size * num_images_per_prompt
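+        # pipelines without a unet (transformer-based) or whose unet has no `timestep_cond` input
+        # (i.e. not guidance-distilled like LCM) run classifier-free guidance, which duplicates the
+        # batch, hence the doubling below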
+        if ov_pipeline.unet is None or "timestep_cond" not in {
+            inputs.get_any_name() for inputs in ov_pipeline.unet.model.inputs
+        }:
+            expected_batch *= 2
+        self.assertEqual(ov_pipeline.batch_size, expected_batch)
+        self.assertEqual(ov_pipeline.height, height)
+        self.assertEqual(ov_pipeline.width, width)
 
-    @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @pytest.mark.run_slow
     @slow
-    def test_num_images_per_prompt_static_model(self, model_arch: str):
-        model_id = MODEL_NAMES[model_arch]
-        pipeline = self.MODEL_CLASS.from_pretrained(model_id, export=True, compile=False)
-        batch_size, num_images, height, width = 3, 4, 128, 64
-        pipeline.half()
-        pipeline.reshape(batch_size=batch_size, height=height, width=width, num_images_per_prompt=num_images)
-        self.assertFalse(pipeline.is_dynamic)
-        pipeline.compile()
-        # Verify output shapes requirements not matching the static model doesn't impact the final outputs
-        for _height in [height, height + 16]:
-            inputs = _generate_inputs(batch_size)
-            outputs = pipeline(**inputs, num_images_per_prompt=num_images, height=_height, width=width).images
-            self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3))
+    @require_diffusers
+    def test_textual_inversion(self):
+        # for now we only test for stable-diffusion
+        # this is very slow and costly to run right now
+
+        model_id = "runwayml/stable-diffusion-v1-5"
+        ti_id = "sd-concepts-library/cat-toy"
+
+        inputs = self.generate_inputs()
+        inputs["prompt"] = "A  backpack"
+
+        diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(model_id, safety_checker=None)
+        diffusers_pipeline.load_textual_inversion(ti_id)
+
+        ov_pipeline = self.OVMODEL_CLASS.from_pretrained(model_id, compile=False, safety_checker=None)
+        ov_pipeline.load_textual_inversion(ti_id)
+
+        diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images
+        ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images
+
+        np.testing.assert_allclose(ov_output, diffusers_output, atol=1e-4, rtol=1e-2)
+
+
+class OVPipelineForInpaintingTest(unittest.TestCase):
+    SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl"]
+
+    if is_transformers_version(">=", "4.40.0"):
+        SUPPORTED_ARCHITECTURES.append("stable-diffusion-3")
+
+    AUTOMODEL_CLASS = AutoPipelineForInpainting
+    OVMODEL_CLASS = OVPipelineForInpainting
+
+    TASK = "inpainting"
+
+    def generate_inputs(self, height=128, width=128, batch_size=1, channel=3, input_type="pil"):
+        inputs = _generate_prompts(batch_size=batch_size)
+
+        inputs["image"] = _generate_images(
+            height=height, width=width, batch_size=batch_size, channel=channel, input_type=input_type
+        )
+        inputs["mask_image"] = _generate_images(
+            height=height, width=width, batch_size=batch_size, channel=1, input_type=input_type
+        )
+
+        inputs["strength"] = 0.75
+        inputs["height"] = height
+        inputs["width"] = width
+
+        return inputs
+
+    @require_diffusers
+    def test_load_vanilla_model_which_is_not_supported(self):
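+        # bert is a transformers-only checkpoint without a diffusers pipeline config, so loading it
+        # through the pipeline class is expected to fail with the error asserted below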
+        with self.assertRaises(Exception) as context:
+            _ = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES["bert"], export=True)
+
+        self.assertIn(f"does not appear to have a file named {self.OVMODEL_CLASS.config_name}", str(context.exception))
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
-    def test_height_width_properties(self, model_arch: str):
-        model_id = MODEL_NAMES[model_arch]
-        batch_size, num_images, height, width = 2, 4, 128, 64
-        pipeline = self.MODEL_CLASS.from_pretrained(model_id, export=True, compile=False, dynamic_shapes=True)
-        self.assertTrue(pipeline.is_dynamic)
-        self.assertEqual(pipeline.height, -1)
-        self.assertEqual(pipeline.width, -1)
-        pipeline.reshape(batch_size=batch_size, height=height, width=width, num_images_per_prompt=num_images)
-        self.assertFalse(pipeline.is_dynamic)
-        self.assertEqual(pipeline.height, height)
-        self.assertEqual(pipeline.width, width)
-
-
-class OVStableDiffusionInpaintPipelineTest(OVStableDiffusionPipelineBaseTest):
-    SUPPORTED_ARCHITECTURES = ("stable-diffusion",)
-    MODEL_CLASS = OVStableDiffusionInpaintPipeline
-    TASK = "inpaint"
+    @require_diffusers
+    def test_ov_pipeline_class_dispatch(self, model_arch: str):
+        auto_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+        ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+
+        self.assertEqual(ov_pipeline.auto_model_class, auto_pipeline.__class__)
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
-    @unittest.skipIf(not is_onnxruntime_available(), "this test requires onnxruntime")
-    def test_compare_diffusers_pipeline(self, model_arch: str):
-        from optimum.onnxruntime import ORTStableDiffusionInpaintPipeline
-
-        model_id = MODEL_NAMES[model_arch]
-        pipeline = self.MODEL_CLASS.from_pretrained(model_id, export=True, ov_config=F32_CONFIG)
-        batch_size, num_images, height, width = 1, 1, 64, 64
-        latents = pipeline.prepare_latents(
-            batch_size * num_images,
-            pipeline.unet.config["in_channels"],
-            height,
-            width,
-            dtype=np.float32,
-            generator=np.random.RandomState(0),
-        )
-        inputs = self.generate_inputs(height=height, width=width)
+    @require_diffusers
+    def test_num_images_per_prompt(self, model_arch: str):
+        pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+
+        for batch_size in [1, 3]:
+            for height in [64, 128]:
+                for width in [64, 128]:
+                    for num_images_per_prompt in [1, 3]:
+                        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+                        outputs = pipeline(**inputs, num_images_per_prompt=num_images_per_prompt).images
+                        self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, height, width, 3))
+
+    @parameterized.expand(["stable-diffusion", "stable-diffusion-xl"])
+    @require_diffusers
+    def test_callback(self, model_arch: str):
+        height, width, batch_size = 32, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
 
-        inputs["image"] = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/in_paint/overture-creations-5sI6fQgYIuo.png"
-        ).resize((width, height))
+        class Callback:
+            def __init__(self):
+                self.has_been_called = False
+                self.number_of_steps = 0
 
-        inputs["mask_image"] = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
-        ).resize((width, height))
+            def __call__(self, *args, **kwargs) -> None:
+                self.has_been_called = True
+                self.number_of_steps += 1
 
-        outputs = pipeline(**inputs, latents=latents).images
-        self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3))
+        ov_pipe = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+        auto_pipe = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
 
-        ort_pipeline = ORTStableDiffusionInpaintPipeline.from_pretrained(model_id, export=True)
-        ort_outputs = ort_pipeline(**inputs, latents=latents).images
-        self.assertTrue(np.allclose(outputs, ort_outputs, atol=1e-1))
+        ov_callback = Callback()
+        auto_callback = Callback()
+        # callback_steps=1 to trigger callback every step
+        ov_pipe(**inputs, callback=ov_callback, callback_steps=1)
+        auto_pipe(**inputs, callback=auto_callback, callback_steps=1)
 
-        expected_slice = np.array([0.4692, 0.5260, 0.4005, 0.3609, 0.3259, 0.4676, 0.5593, 0.4728, 0.4411])
-        self.assertTrue(np.allclose(outputs[0, -3:, -3:, -1].flatten(), expected_slice, atol=1e-1))
+        self.assertTrue(ov_callback.has_been_called)
+        self.assertEqual(ov_callback.number_of_steps, auto_callback.number_of_steps)
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
-    @pytest.mark.run_slow
-    @slow
-    def test_num_images_per_prompt_static_model(self, model_arch: str):
-        model_id = MODEL_NAMES[model_arch]
-        pipeline = self.MODEL_CLASS.from_pretrained(model_id, export=True, compile=False, dynamic_shapes=False)
-        batch_size, num_images, height, width = 1, 3, 128, 64
-        pipeline.half()
-        pipeline.reshape(batch_size=batch_size, height=height, width=width, num_images_per_prompt=num_images)
-        for _height in [height, height + 16]:
-            inputs = self.generate_inputs(height=_height, width=width, batch_size=batch_size)
-            outputs = pipeline(**inputs, num_images_per_prompt=num_images).images
-            self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3))
+    @require_diffusers
+    def test_shape(self, model_arch: str):
+        pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
 
-    def generate_inputs(self, height=128, width=128, batch_size=1):
-        inputs = super(OVStableDiffusionInpaintPipelineTest, self).generate_inputs(height, width, batch_size)
-        inputs["image"] = _create_image(height=height, width=width, batch_size=1, input_type="pil")[0]
-        inputs["mask_image"] = _create_image(height=height, width=width, batch_size=1, input_type="pil")[0]
-        return inputs
+        height, width, batch_size = 128, 64, 1
 
+        for input_type in ["pil", "np", "pt"]:
+            inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type)
 
-class OVStableDiffusionXLPipelineTest(unittest.TestCase):
-    SUPPORTED_ARCHITECTURES = ("stable-diffusion-xl",)
-    MODEL_CLASS = OVStableDiffusionXLPipeline
-    PT_MODEL_CLASS = StableDiffusionXLPipeline
-    TASK = "text-to-image"
+            for output_type in ["pil", "np", "pt", "latent"]:
+                inputs["output_type"] = output_type
+                outputs = pipeline(**inputs).images
+                if output_type == "pil":
+                    self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width))
+                elif output_type == "np":
+                    self.assertEqual(outputs.shape, (batch_size, height, width, 3))
+                elif output_type == "pt":
+                    self.assertEqual(outputs.shape, (batch_size, 3, height, width))
+                else:
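+                    # "latent" outputs keep the VAE-compressed resolution: channels come from the
+                    # unet/transformer config, spatial dims are divided by the vae scale factor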
+                    out_channels = (
+                        pipeline.unet.config.out_channels
+                        if pipeline.unet is not None
+                        else pipeline.transformer.config.out_channels
+                    )
+                    self.assertEqual(
+                        outputs.shape,
+                        (
+                            batch_size,
+                            out_channels,
+                            height // pipeline.vae_scale_factor,
+                            width // pipeline.vae_scale_factor,
+                        ),
+                    )
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
-    def test_compare_to_diffusers(self, model_arch: str):
-        ov_pipeline = self.MODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True, ov_config=F32_CONFIG)
-        self.assertIsInstance(ov_pipeline.text_encoder, OVModelTextEncoder)
-        self.assertIsInstance(ov_pipeline.text_encoder_2, OVModelTextEncoder)
-        self.assertIsInstance(ov_pipeline.vae_encoder, OVModelVaeEncoder)
-        self.assertIsInstance(ov_pipeline.vae_decoder, OVModelVaeDecoder)
-        self.assertIsInstance(ov_pipeline.unet, OVModelUnet)
-        self.assertIsInstance(ov_pipeline.config, Dict)
-
-        pipeline = self.PT_MODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
-        batch_size, num_images_per_prompt, height, width = 2, 3, 64, 128
-        latents = ov_pipeline.prepare_latents(
-            batch_size * num_images_per_prompt,
-            ov_pipeline.unet.config["in_channels"],
-            height,
-            width,
-            dtype=np.float32,
-            generator=np.random.RandomState(0),
-        )
+    @require_diffusers
+    def test_compare_to_diffusers_pipeline(self, model_arch: str):
+        ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
+        diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
 
-        kwargs = {
-            "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size,
-            "num_inference_steps": 1,
-            "num_images_per_prompt": num_images_per_prompt,
-            "height": height,
-            "width": width,
-            "guidance_rescale": 0.1,
-        }
+        height, width, batch_size = 64, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
 
-        for output_type in ["latent", "np"]:
-            ov_outputs = ov_pipeline(latents=latents, output_type=output_type, **kwargs).images
+        for output_type in ["latent", "np", "pt"]:
+            inputs["output_type"] = output_type
 
-            self.assertIsInstance(ov_outputs, np.ndarray)
-            with torch.no_grad():
-                outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images
+            ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images
+            diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images
 
-            # Compare model outputs
-            self.assertTrue(np.allclose(ov_outputs, outputs, atol=1e-4))
-        # Compare model devices
-        self.assertEqual(pipeline.device, ov_pipeline.device)
+            np.testing.assert_allclose(ov_output, diffusers_output, atol=6e-3, rtol=1e-2)
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
     def test_image_reproducibility(self, model_arch: str):
-        pipeline = self.MODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True)
-
-        # Verify every subcomponent is compiled by default
-        for component in {"unet", "vae_encoder", "vae_decoder", "text_encoder", "text_encoder_2"}:
-            self.assertIsInstance(getattr(pipeline, component).request, CompiledModel)
-
-        batch_size, num_images_per_prompt, height, width = 2, 3, 64, 128
-        inputs = _generate_inputs(batch_size)
-        ov_outputs_1 = pipeline(
-            **inputs,
-            height=height,
-            width=width,
-            num_images_per_prompt=num_images_per_prompt,
-            generator=np.random.RandomState(SEED),
-        )
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            pipeline.save_pretrained(tmp_dir)
-            pipeline = self.MODEL_CLASS.from_pretrained(tmp_dir)
-        ov_outputs_2 = pipeline(
-            **inputs,
-            height=height,
-            width=width,
-            num_images_per_prompt=num_images_per_prompt,
-            generator=np.random.RandomState(SEED),
-        )
-        ov_outputs_3 = pipeline(**inputs, height=height, width=width, num_images_per_prompt=num_images_per_prompt)
-        self.assertTrue(np.array_equal(ov_outputs_1.images[0], ov_outputs_2.images[0]))
-        self.assertFalse(np.array_equal(ov_outputs_1.images[0], ov_outputs_3.images[0]))
+        pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch])
 
-    @parameterized.expand(SUPPORTED_ARCHITECTURES)
-    @pytest.mark.run_slow
-    @slow
-    def test_num_images_per_prompt_static_model(self, model_arch: str):
-        model_id = MODEL_NAMES[model_arch]
-        pipeline = self.MODEL_CLASS.from_pretrained(model_id, export=True, compile=False)
-        batch_size, num_images, height, width = 3, 4, 128, 64
-        pipeline.half()
-        pipeline.reshape(batch_size=batch_size, height=height, width=width, num_images_per_prompt=num_images)
-        self.assertFalse(pipeline.is_dynamic)
-        pipeline.compile()
-
-        for _height in [height, height + 16]:
-            inputs = _generate_inputs(batch_size)
-            outputs = pipeline(**inputs, num_images_per_prompt=num_images, height=_height, width=width).images
-            self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3))
-
-
-class OVStableDiffusionXLImg2ImgPipelineTest(unittest.TestCase):
-    SUPPORTED_ARCHITECTURES = ("stable-diffusion-xl", "stable-diffusion-xl-refiner")
-    MODEL_CLASS = OVStableDiffusionXLImg2ImgPipeline
-    PT_MODEL_CLASS = StableDiffusionXLImg2ImgPipeline
-    TASK = "image-to-image"
+        height, width, batch_size = 64, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
 
-    def test_inference(self):
-        model_id = "hf-internal-testing/tiny-stable-diffusion-xl-pipe"
-        pipeline = self.MODEL_CLASS.from_pretrained(model_id, ov_config=F32_CONFIG)
+        for generator_framework in ["np", "pt"]:
+            ov_outputs_1 = pipeline(**inputs, generator=get_generator(generator_framework, SEED))
+            ov_outputs_2 = pipeline(**inputs, generator=get_generator(generator_framework, SEED))
+            ov_outputs_3 = pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1))
+
+            self.assertFalse(np.array_equal(ov_outputs_1.images[0], ov_outputs_3.images[0]))
+            np.testing.assert_allclose(ov_outputs_1.images[0], ov_outputs_2.images[0], atol=1e-4, rtol=1e-2)
+
+    @parameterized.expand(["stable-diffusion"])
+    @require_diffusers
+    def test_safety_checker(self, model_arch: str):
+        safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker")
 
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            pipeline.save_pretrained(tmp_dir)
-            pipeline = self.MODEL_CLASS.from_pretrained(tmp_dir, ov_config=F32_CONFIG)
+        pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], safety_checker=safety_checker)
+        ov_pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], safety_checker=safety_checker)
 
-        batch_size, height, width = 1, 128, 128
+        self.assertIsInstance(pipeline.safety_checker, StableDiffusionSafetyChecker)
+        self.assertIsInstance(ov_pipeline.safety_checker, StableDiffusionSafetyChecker)
+
+        height, width, batch_size = 32, 64, 1
         inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
-        inputs["image"] = floats_tensor((batch_size, 3, height, width), rng=random.Random(SEED))
-        output = pipeline(**inputs, generator=np.random.RandomState(SEED)).images[0, -3:, -3:, -1]
-        expected_slice = np.array([0.5747, 0.5182, 0.4857, 0.5295, 0.5106, 0.5520, 0.4814, 0.4289, 0.4868])
-        self.assertTrue(np.allclose(output.flatten(), expected_slice, atol=1e-3))
 
-    @parameterized.expand(SUPPORTED_ARCHITECTURES)
-    @pytest.mark.run_slow
-    @slow
-    def test_num_images_per_prompt_static_model(self, model_arch: str):
-        model_id = MODEL_NAMES[model_arch]
-        pipeline = self.MODEL_CLASS.from_pretrained(model_id, export=True, compile=False, dynamic_shapes=False)
-        batch_size, num_images, height, width = 2, 3, 128, 64
-        pipeline.half()
-        pipeline.reshape(batch_size=batch_size, height=height, width=width, num_images_per_prompt=num_images)
-        for _height in [height, height + 16]:
-            inputs = self.generate_inputs(height=_height, width=width, batch_size=batch_size)
-            outputs = pipeline(**inputs, num_images_per_prompt=num_images).images
-            self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3))
-
-    def generate_inputs(self, height=128, width=128, batch_size=1, input_type="np"):
-        inputs = _generate_inputs(batch_size)
-        inputs["image"] = _create_image(height=height, width=width, batch_size=batch_size, input_type=input_type)
-        inputs["strength"] = 0.75
-        return inputs
+        ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED))
+        diffusers_output = pipeline(**inputs, generator=get_generator("pt", SEED))
 
+        ov_nsfw_content_detected = ov_output.nsfw_content_detected
+        diffusers_nsfw_content_detected = diffusers_output.nsfw_content_detected
 
-class OVLatentConsistencyModelPipelineTest(unittest.TestCase):
-    SUPPORTED_ARCHITECTURES = ("latent-consistency",)
-    MODEL_CLASS = OVLatentConsistencyModelPipeline
-    TASK = "text-to-image"
+        self.assertTrue(ov_nsfw_content_detected is not None)
+        self.assertTrue(diffusers_nsfw_content_detected is not None)
+        self.assertEqual(ov_nsfw_content_detected, diffusers_nsfw_content_detected)
+
+        ov_images = ov_output.images
+        diffusers_images = diffusers_output.images
+
+        np.testing.assert_allclose(ov_images, diffusers_images, atol=1e-4, rtol=1e-2)
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
-    @unittest.skipIf(is_diffusers_version("<=", "0.21.4"), "not supported with this diffusers version")
-    def test_compare_to_diffusers(self, model_arch: str):
-        ov_pipeline = self.MODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True, ov_config=F32_CONFIG)
-        self.assertIsInstance(ov_pipeline.text_encoder, OVModelTextEncoder)
-        self.assertIsInstance(ov_pipeline.vae_encoder, OVModelVaeEncoder)
-        self.assertIsInstance(ov_pipeline.vae_decoder, OVModelVaeDecoder)
-        self.assertIsInstance(ov_pipeline.unet, OVModelUnet)
-        self.assertIsInstance(ov_pipeline.config, Dict)
-
-        from diffusers import LatentConsistencyModelPipeline
-
-        pipeline = LatentConsistencyModelPipeline.from_pretrained(MODEL_NAMES[model_arch])
-        batch_size, num_images_per_prompt, height, width = 2, 3, 64, 128
-        latents = ov_pipeline.prepare_latents(
-            batch_size * num_images_per_prompt,
-            ov_pipeline.unet.config["in_channels"],
-            height,
-            width,
-            dtype=np.float32,
-            generator=np.random.RandomState(0),
+    def test_height_width_properties(self, model_arch: str):
+        batch_size, height, width, num_images_per_prompt = 2, 128, 64, 4
+        ov_pipeline = self.OVMODEL_CLASS.from_pretrained(
+            MODEL_NAMES[model_arch], export=True, compile=False, dynamic_shapes=True
         )
 
-        kwargs = {
-            "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size,
-            "num_inference_steps": 1,
-            "num_images_per_prompt": num_images_per_prompt,
-            "height": height,
-            "width": width,
-            "guidance_scale": 8.5,
-        }
-
-        for output_type in ["latent", "np"]:
-            ov_outputs = ov_pipeline(latents=latents, output_type=output_type, **kwargs).images
-            self.assertIsInstance(ov_outputs, np.ndarray)
-            with torch.no_grad():
-                outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images
-
-            # Compare model outputs
-            self.assertTrue(np.allclose(ov_outputs, outputs, atol=1e-4))
-        # Compare model devices
-        self.assertEqual(pipeline.device, ov_pipeline.device)
+        self.assertTrue(ov_pipeline.is_dynamic)
+        self.assertEqual(ov_pipeline.batch_size, -1)
+        self.assertEqual(ov_pipeline.height, -1)
+        self.assertEqual(ov_pipeline.width, -1)
+
+        ov_pipeline.reshape(
+            batch_size=batch_size, height=height, width=width, num_images_per_prompt=num_images_per_prompt
+        )
+
+        self.assertFalse(ov_pipeline.is_dynamic)
+        expected_batch = batch_size * num_images_per_prompt
+        if ov_pipeline.unet is None or "timestep_cond" not in {
+            inputs.get_any_name() for inputs in ov_pipeline.unet.model.inputs
+        }:
+            expected_batch *= 2
+        self.assertEqual(ov_pipeline.batch_size, expected_batch)
+        self.assertEqual(ov_pipeline.height, height)
+        self.assertEqual(ov_pipeline.width, width)
 
-    @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @pytest.mark.run_slow
     @slow
-    @unittest.skipIf(is_diffusers_version("<=", "0.21.4"), "not supported with this diffusers version")
-    def test_num_images_per_prompt_static_model(self, model_arch: str):
-        model_id = MODEL_NAMES[model_arch]
-        pipeline = self.MODEL_CLASS.from_pretrained(model_id, export=True, compile=False, dynamic_shapes=False)
-        batch_size, num_images, height, width = 3, 4, 128, 64
-        pipeline.half()
-        pipeline.reshape(batch_size=batch_size, height=height, width=width, num_images_per_prompt=num_images)
-        self.assertFalse(pipeline.is_dynamic)
-        pipeline.compile()
-
-        for _height in [height, height + 16]:
-            inputs = _generate_inputs(batch_size)
-            outputs = pipeline(**inputs, num_images_per_prompt=num_images, height=_height, width=width).images
-            self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3))
+    @require_diffusers
+    def test_textual_inversion(self):
+        # for now we only test for stable-diffusion
+        # this is very slow and costly to run right now
 
-    @parameterized.expand(SUPPORTED_ARCHITECTURES)
-    @unittest.skipIf(is_diffusers_version("<=", "0.21.4"), "not supported with this diffusers version")
-    def test_safety_checker(self, model_arch: str):
-        ov_pipeline = self.MODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True, ov_config=F32_CONFIG)
-        self.assertIsInstance(ov_pipeline.text_encoder, OVModelTextEncoder)
-        self.assertIsInstance(ov_pipeline.vae_encoder, OVModelVaeEncoder)
-        self.assertIsInstance(ov_pipeline.vae_decoder, OVModelVaeDecoder)
-        self.assertIsInstance(ov_pipeline.unet, OVModelUnet)
-        self.assertIsInstance(ov_pipeline.config, Dict)
+        model_id = "runwayml/stable-diffusion-v1-5"
+        ti_id = "sd-concepts-library/cat-toy"
 
-        from diffusers import LatentConsistencyModelPipeline
-        from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
+        inputs = self.generate_inputs()
+        inputs["prompt"] = "A  backpack"
 
-        safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker")
-        pipeline = LatentConsistencyModelPipeline.from_pretrained(
-            MODEL_NAMES[model_arch], safety_checker=safety_checker
-        )
+        diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(model_id, safety_checker=None)
+        diffusers_pipeline.load_textual_inversion(ti_id)
 
-        batch_size, num_images_per_prompt, height, width = 2, 3, 64, 128
-        latents = ov_pipeline.prepare_latents(
-            batch_size * num_images_per_prompt,
-            ov_pipeline.unet.config["in_channels"],
-            height,
-            width,
-            dtype=np.float32,
-            generator=np.random.RandomState(0),
-        )
+        ov_pipeline = self.OVMODEL_CLASS.from_pretrained(model_id, compile=False, safety_checker=None)
+        ov_pipeline.load_textual_inversion(ti_id)
+
+        diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images
+        ov_output = ov_pipeline(**inputs, generator=get_generator("pt", SEED)).images
 
-        kwargs = {
-            "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size,
-            "num_inference_steps": 1,
-            "num_images_per_prompt": num_images_per_prompt,
-            "height": height,
-            "width": width,
-            "guidance_scale": 8.5,
-        }
-
-        for output_type in ["latent", "np"]:
-            ov_outputs = ov_pipeline(latents=latents, output_type=output_type, **kwargs).images
-            self.assertIsInstance(ov_outputs, np.ndarray)
-            with torch.no_grad():
-                outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images
-
-            # Compare model outputs
-            self.assertTrue(np.allclose(ov_outputs, outputs, atol=1e-4))
-        # Compare model devices
-        self.assertEqual(pipeline.device, ov_pipeline.device)
+        np.testing.assert_allclose(ov_output, diffusers_output, atol=1e-4, rtol=1e-2)
diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py
index d48e86fe27..4c42f8a337 100644
--- a/tests/openvino/test_export.py
+++ b/tests/openvino/test_export.py
@@ -15,7 +15,6 @@
 
 import unittest
 from pathlib import Path
-from tempfile import TemporaryDirectory
 
 import torch
 from parameterized import parameterized
@@ -28,6 +27,7 @@
 from optimum.exporters.openvino import export_from_model, main_export
 from optimum.exporters.tasks import TasksManager
 from optimum.intel import (
+    OVFluxPipeline,
     OVLatentConsistencyModelPipeline,
     OVModelForAudioClassification,
     OVModelForCausalLM,
@@ -41,12 +41,16 @@
     OVModelForSequenceClassification,
     OVModelForSpeechSeq2Seq,
     OVModelForTokenClassification,
+    OVModelForVisualCausalLM,
+    OVStableDiffusion3Pipeline,
     OVStableDiffusionPipeline,
     OVStableDiffusionXLImg2ImgPipeline,
     OVStableDiffusionXLPipeline,
 )
 from optimum.intel.openvino.modeling_base import OVBaseModel
-from optimum.intel.utils.import_utils import _transformers_version
+from optimum.intel.openvino.modeling_visual_language import MODEL_TYPE_TO_CLS_MAPPING
+from optimum.intel.openvino.utils import TemporaryDirectory
+from optimum.intel.utils.import_utils import _transformers_version, is_transformers_version
 from optimum.utils.save_utils import maybe_load_preprocessors
 
 
@@ -68,9 +72,13 @@ class ExportModelTest(unittest.TestCase):
         "stable-diffusion-xl": OVStableDiffusionXLPipeline,
         "stable-diffusion-xl-refiner": OVStableDiffusionXLImg2ImgPipeline,
         "latent-consistency": OVLatentConsistencyModelPipeline,
+        "llava": OVModelForVisualCausalLM,
     }
 
-    GENERATIVE_MODELS = ("pix2struct", "t5", "bart", "gpt2", "whisper")
+    if is_transformers_version(">=", "4.45"):
+        SUPPORTED_ARCHITECTURES.update({"stable-diffusion-3": OVStableDiffusion3Pipeline, "flux": OVFluxPipeline})
+
+    GENERATIVE_MODELS = ("pix2struct", "t5", "bart", "gpt2", "whisper", "llava")
 
     def _openvino_export(
         self,
@@ -88,6 +96,10 @@ def _openvino_export(
             model_class = TasksManager.get_model_class_for_task(task, library=library_name)
             model = model_class(f"hf_hub:{model_name}", pretrained=True, exportable=True)
             TasksManager.standardize_model_attributes(model_name, model, library_name=library_name)
+        elif model_type == "llava":
+            model = MODEL_TYPE_TO_CLS_MAPPING[model_type].auto_model_class.from_pretrained(
+                model_name, **loading_kwargs
+            )
         else:
             model = auto_model.auto_model_class.from_pretrained(model_name, **loading_kwargs)
 
@@ -119,6 +131,15 @@ def _openvino_export(
                     self.assertEqual(
                         ov_model.model.get_rt_info()["optimum"]["transformers_version"], _transformers_version
                     )
+                    self.assertTrue(ov_model.model.has_rt_info(["runtime_options", "ACTIVATIONS_SCALE_FACTOR"]))
+
+                if library_name == "diffusers":
+                    self.assertTrue(
+                        ov_model.vae_encoder.model.has_rt_info(["runtime_options", "ACTIVATIONS_SCALE_FACTOR"])
+                    )
+                    self.assertTrue(
+                        ov_model.vae_decoder.model.has_rt_info(["runtime_options", "ACTIVATIONS_SCALE_FACTOR"])
+                    )
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     def test_export(self, model_type: str):
@@ -130,8 +151,12 @@ def test_export_with_custom_gen_config(self, model_type):
         task = auto_model.export_feature
         model_name = MODEL_NAMES[model_type]
         loading_kwargs = {"attn_implementation": "eager"} if model_type in SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED else {}
-
-        model = auto_model.auto_model_class.from_pretrained(model_name, **loading_kwargs)
+        if model_type == "llava":
+            model = MODEL_TYPE_TO_CLS_MAPPING[model_type].auto_model_class.from_pretrained(
+                model_name, **loading_kwargs
+            )
+        else:
+            model = auto_model.auto_model_class.from_pretrained(model_name, **loading_kwargs)
 
         model.generation_config.top_k = 42
         model.generation_config.do_sample = True
diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
index 0cd19a2d41..67511bb845 100644
--- a/tests/openvino/test_exporters_cli.py
+++ b/tests/openvino/test_exporters_cli.py
@@ -14,7 +14,6 @@
 import subprocess
 import unittest
 from pathlib import Path
-from tempfile import TemporaryDirectory
 
 from parameterized import parameterized
 from transformers import AutoModelForCausalLM
@@ -26,6 +25,7 @@
 
 from optimum.exporters.openvino.__main__ import main_export
 from optimum.intel import (  # noqa
+    OVFluxPipeline,
     OVLatentConsistencyModelPipeline,
     OVModelForAudioClassification,
     OVModelForCausalLM,
@@ -36,18 +36,23 @@
     OVModelForSeq2SeqLM,
     OVModelForSequenceClassification,
     OVModelForTokenClassification,
+    OVModelForVisualCausalLM,
     OVModelOpenCLIPForZeroShotImageClassification,
     OVModelOpenCLIPText,
     OVModelOpenCLIPVisual,
     OVSentenceTransformer,
+    OVStableDiffusion3Pipeline,
     OVStableDiffusionPipeline,
     OVStableDiffusionXLPipeline,
 )
 from optimum.intel.openvino.configuration import _DEFAULT_4BIT_CONFIGS
-from optimum.intel.openvino.utils import _HEAD_TO_AUTOMODELS
+from optimum.intel.openvino.utils import _HEAD_TO_AUTOMODELS, TemporaryDirectory
 from optimum.intel.utils.import_utils import (
     compare_versions,
     is_openvino_tokenizers_available,
+    is_openvino_version,
+    is_tokenizers_version,
+    is_transformers_version,
 )
 
 
@@ -56,7 +61,7 @@ class OVCLIExportTestCase(unittest.TestCase):
     Integration tests ensuring supported models are correctly exported.
     """
 
-    SUPPORTED_ARCHITECTURES = (
+    SUPPORTED_ARCHITECTURES = [
         ("text-generation", "gpt2"),
         ("text-generation-with-past", "gpt2"),
         ("text2text-generation", "t5"),
@@ -71,31 +76,41 @@ class OVCLIExportTestCase(unittest.TestCase):
         ("text-to-image", "stable-diffusion"),
         ("text-to-image", "stable-diffusion-xl"),
         ("image-to-image", "stable-diffusion-xl-refiner"),
-    )
+    ]
+
+    if is_transformers_version(">=", "4.45"):
+        SUPPORTED_ARCHITECTURES.extend([("text-to-image", "stable-diffusion-3"), ("text-to-image", "flux")])
     EXPECTED_NUMBER_OF_TOKENIZER_MODELS = {
-        "gpt2": 2,
+        "gpt2": 2 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0,
         "t5": 0,  # no .model file in the repository
         "albert": 0,  # not supported yet
         "distilbert": 1,  # no detokenizer
-        "roberta": 2,
+        "roberta": 2 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0,
         "vit": 0,  # no tokenizer for image model
         "wav2vec2": 0,  # no tokenizer
         "bert": 1,  # no detokenizer
-        "blenderbot": 2,
-        "stable-diffusion": 2,
-        "stable-diffusion-xl": 4,
+        "blenderbot": 2 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0,
+        "stable-diffusion": 2 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0,
+        "stable-diffusion-xl": 4 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0,
+        "stable-diffusion-3": 6 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 2,
+        "flux": 4 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0,
+        "llava": 2 if is_tokenizers_version("<", "0.20") or is_openvino_version(">=", "2024.5") else 0,
     }
 
-    SUPPORTED_SD_HYBRID_ARCHITECTURES = (
+    SUPPORTED_SD_HYBRID_ARCHITECTURES = [
         ("stable-diffusion", 72, 195),
         ("stable-diffusion-xl", 84, 331),
         ("latent-consistency", 50, 135),
-    )
+    ]
 
-    TEST_4BIT_CONFIGURATONS = [
+    if is_transformers_version(">=", "4.45"):
+        SUPPORTED_SD_HYBRID_ARCHITECTURES.append(("stable-diffusion-3", 9, 65))
+
+    TEST_4BIT_CONFIGURATIONS = [
         ("text-generation-with-past", "opt125m", "int4 --sym --group-size 128", {"int8": 4, "int4": 72}),
         ("text-generation-with-past", "opt125m", "int4 --group-size 64", {"int8": 4, "int4": 144}),
         ("text-generation-with-past", "opt125m", "mxfp4", {"int8": 4, "f4e2m1": 72, "f8e8m0": 72}),
+        ("text-generation-with-past", "opt125m", "nf4", {"int8": 4, "nf4": 72}),
         ("text-generation-with-past", "llama_awq", "int4 --ratio 1.0 --sym --group-size 8 --all-layers", {"int4": 16}),
         (
             "text-generation-with-past",
@@ -118,6 +133,46 @@ class OVCLIExportTestCase(unittest.TestCase):
         ),
     ]
 
+    if is_transformers_version(">=", "4.40.0"):
+        TEST_4BIT_CONFIGURATIONS.extend(
+            [
+                (
+                    "image-text-to-text",
+                    "llava_next",
+                    'int4 --group-size 16 --ratio 0.9 --sensitivity-metric "mean_activation_magnitude" '
+                    "--dataset contextual --num-samples 1",
+                    {"int8": 8, "int4": 22},
+                ),
+                (
+                    "image-text-to-text",
+                    "nanollava",
+                    'int4 --group-size 8 --ratio 0.9 --sensitivity-metric "mean_activation_variance" '
+                    "--dataset contextual --num-samples 1 --trust-remote-code",
+                    {"int8": 12, "int4": 18},
+                ),
+            ]
+        )
+
+    if is_transformers_version(">=", "4.45.0"):
+        TEST_4BIT_CONFIGURATIONS.extend(
+            [
+                (
+                    "image-text-to-text",
+                    "internvl2",
+                    'int4 --group-size 4 --ratio 0.9 --sensitivity-metric "hessian_input_activation" '
+                    "--dataset contextual --num-samples 1 --trust-remote-code",
+                    {"int8": 6, "int4": 24},
+                ),
+                (
+                    "image-text-to-text",
+                    "phi3_v",
+                    'int4 --group-size 4 --ratio 0.9 --sensitivity-metric "mean_activation_magnitude" '
+                    "--dataset contextual --num-samples 1 --trust-remote-code",
+                    {"int8": 4, "int4": 14},
+                ),
+            ]
+        )
+
     def _openvino_export(self, model_name: str, task: str):
         with TemporaryDirectory() as tmpdir:
             main_export(
@@ -153,7 +208,7 @@ def test_exporters_cli(self, task: str, model_type: str):
     def test_exporters_cli_tokenizers(self, task: str, model_type: str):
         with TemporaryDirectory() as tmpdir:
             output = subprocess.check_output(
-                f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} {tmpdir}",
+                f"TRANSFORMERS_VERBOSITY=debug optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} {tmpdir}",
                 shell=True,
                 stderr=subprocess.STDOUT,
             ).decode()
@@ -208,9 +263,11 @@ def test_exporters_cli_int8(self, task: str, model_type: str):
                 models = [model.encoder, model.decoder]
                 if task.endswith("with-past"):
                     models.append(model.decoder_with_past)
-            elif model_type.startswith("stable-diffusion"):
-                models = [model.unet, model.vae_encoder, model.vae_decoder]
+            elif model_type.startswith("stable-diffusion") or model_type.startswith("flux"):
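+                # stable-diffusion-3 and flux pipelines expose a transformer instead of a unet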
+                models = [model.unet or model.transformer, model.vae_encoder, model.vae_decoder]
                 models.append(model.text_encoder if model_type == "stable-diffusion" else model.text_encoder_2)
+            elif task.startswith("image-text-to-text"):
+                models = [model.language_model, model.vision_embeddings]
             else:
                 models = [model]
 
@@ -228,12 +285,14 @@ def test_exporters_cli_hybrid_quantization(self, model_type: str, exp_num_fq: in
                 check=True,
             )
             model = eval(_HEAD_TO_AUTOMODELS[model_type.replace("-refiner", "")]).from_pretrained(tmpdir)
-            num_fq, num_weight_nodes = get_num_quantized_nodes(model.unet)
+            num_fq, num_weight_nodes = get_num_quantized_nodes(
+                model.unet if model.unet is not None else model.transformer
+            )
             self.assertEqual(exp_num_int8, num_weight_nodes["int8"])
             self.assertEqual(exp_num_fq, num_fq)
 
-    @parameterized.expand(TEST_4BIT_CONFIGURATONS)
-    def test_exporters_cli_int4(self, task: str, model_type: str, option: str, expected_num_weight_nodes: dict):
+    @parameterized.expand(TEST_4BIT_CONFIGURATIONS)
+    def test_exporters_cli_4bit(self, task: str, model_type: str, option: str, expected_num_weight_nodes: dict):
         with TemporaryDirectory() as tmpdir:
             result = subprocess.run(
                 f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} --weight-format {option} {tmpdir}",
@@ -242,13 +301,17 @@ def test_exporters_cli_int4(self, task: str, model_type: str, option: str, expec
                 capture_output=True,
             )
             model_kwargs = {"use_cache": task.endswith("with-past")} if "generation" in task else {}
+            if "--trust-remote-code" in option:
+                model_kwargs["trust_remote_code"] = True
             model = eval(
                 _HEAD_TO_AUTOMODELS[task.replace("-with-past", "")]
                 if task.replace("-with-past", "") in _HEAD_TO_AUTOMODELS
                 else _HEAD_TO_AUTOMODELS[model_type.replace("-refiner", "")]
             ).from_pretrained(tmpdir, **model_kwargs)
 
-            _, num_weight_nodes = get_num_quantized_nodes(model)
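+            # for image-text-to-text models only the language sub-model is checked for 4-bit weights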
+            ov_model = model.lm_model if task == "image-text-to-text" else model.model
+
+            _, num_weight_nodes = get_num_quantized_nodes(ov_model)
             expected_num_weight_nodes.update({k: 0 for k in set(num_weight_nodes) - set(expected_num_weight_nodes)})
             self.assertEqual(expected_num_weight_nodes, num_weight_nodes)
             self.assertTrue("--awq" not in option or b"Applying AWQ" in result.stdout)
@@ -346,3 +409,20 @@ def test_exporters_cli_open_clip(self):
             model = eval(_HEAD_TO_AUTOMODELS["open_clip"]).from_pretrained(tmpdir, compile=False)
             self.assertTrue("text_features" in model.text_model.output_names)
             self.assertTrue("image_features" in model.visual_model.output_names)
+
+    def test_export_openvino_with_missed_weight_format(self):
+        # Test that exception is raised when some compression parameter is given, but weight format is not.
+        with TemporaryDirectory() as tmpdir:
+            with self.assertRaises(subprocess.CalledProcessError) as exc_info:
+                subprocess.run(
+                    f"optimum-cli export openvino --model {MODEL_NAMES['gpt2']} --task text-generation --sym {tmpdir}",
+                    shell=True,
+                    check=True,
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.PIPE,
+                    text=True,
+                )
+            self.assertIn(
+                "Some compression parameters are provided, but the weight format is not specified.",
+                str(exc_info.exception.stderr),
+            )
diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
index ea72453443..f7f677bf8c 100644
--- a/tests/openvino/test_modeling.py
+++ b/tests/openvino/test_modeling.py
@@ -12,6 +12,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
+import copy
 import gc
 import os
 import tempfile
@@ -32,6 +33,7 @@
 from huggingface_hub import HfApi
 from parameterized import parameterized
 from PIL import Image
+from sentence_transformers import SentenceTransformer
 from transformers import (
     AutoConfig,
     AutoFeatureExtractor,
@@ -50,6 +52,7 @@
     AutoModelForSpeechSeq2Seq,
     AutoModelForTokenClassification,
     AutoModelForVision2Seq,
+    AutoProcessor,
     AutoTokenizer,
     GenerationConfig,
     Pix2StructForConditionalGeneration,
@@ -59,7 +62,7 @@
 )
 from transformers.onnx.utils import get_preprocessor
 from transformers.testing_utils import slow
-from utils_tests import MODEL_NAMES
+from utils_tests import MODEL_NAMES, TEST_IMAGE_URL
 
 from optimum.exporters.openvino.model_patcher import patch_update_causal_mask
 from optimum.intel import (
@@ -91,10 +94,14 @@
 from optimum.intel.openvino.modeling_visual_language import (
     MODEL_PARTS_CLS_MAPPING,
     MODEL_TYPE_TO_CLS_MAPPING,
-    OVModelWithEmbedForCausalLM,
-    OVVisionEmbedding,
 )
-from optimum.intel.openvino.utils import _print_compiled_model_properties
+from optimum.intel.openvino.utils import (
+    OV_LANGUAGE_MODEL_NAME,
+    OV_TEXT_EMBEDDINGS_MODEL_NAME,
+    OV_VISION_EMBEDDINGS_MODEL_NAME,
+    TemporaryDirectory,
+    _print_compiled_model_properties,
+)
 from optimum.intel.pipelines import pipeline as optimum_pipeline
 from optimum.intel.utils.import_utils import is_openvino_version, is_transformers_version
 from optimum.intel.utils.modeling_utils import _find_files_matching_pattern
@@ -133,6 +140,7 @@ def __init__(self, *args, **kwargs):
         self.OV_DECODER_MODEL_ID = "helenai/gpt2-ov"
         self.OV_SEQ2SEQ_MODEL_ID = "echarlaix/t5-small-openvino"
         self.OV_DIFFUSION_MODEL_ID = "hf-internal-testing/tiny-stable-diffusion-openvino"
+        self.OV_VLM_MODEL_ID = "katuni4ka/tiny-random-llava-ov"
 
     def test_load_from_hub_and_save_model(self):
         tokenizer = AutoTokenizer.from_pretrained(self.OV_MODEL_ID)
@@ -171,7 +179,7 @@ def test_load_from_hub_and_save_model(self):
         self.assertTrue(torch.equal(loaded_model_outputs.logits, outputs.logits))
         del compile_only_model
 
-        with tempfile.TemporaryDirectory() as tmpdirname:
+        with TemporaryDirectory() as tmpdirname:
             loaded_model.save_pretrained(tmpdirname)
             folder_contents = os.listdir(tmpdirname)
             self.assertTrue(OV_XML_FILE_NAME in folder_contents)
@@ -200,7 +208,7 @@ def test_load_from_hub_and_save_decoder_model(self, use_cache):
         self.assertEqual(loaded_model.request.get_compiled_model().get_property("PERFORMANCE_HINT"), "LATENCY")
         loaded_model_outputs = loaded_model(**tokens)
 
-        with tempfile.TemporaryDirectory() as tmpdirname:
+        with TemporaryDirectory() as tmpdirname:
             loaded_model.save_pretrained(tmpdirname)
             folder_contents = os.listdir(tmpdirname)
             self.assertTrue(OV_XML_FILE_NAME in folder_contents)
@@ -221,6 +229,76 @@ def test_load_from_hub_and_save_decoder_model(self, use_cache):
         del model
         gc.collect()
 
+    @unittest.skipIf(
+        is_transformers_version("<", "4.45"),
+        "model tokenizer exported with tokenizers 0.20 is not compatible with old transformers",
+    )
+    def test_load_from_hub_and_save_visual_language_model(self):
+        model_id = self.OV_VLM_MODEL_ID
+        processor = get_preprocessor(model_id)
+        prompt = "\n What is shown in this image?"
+        image = Image.open(
+            requests.get(
+                TEST_IMAGE_URL,
+                stream=True,
+            ).raw
+        )
+        loaded_model = OVModelForVisualCausalLM.from_pretrained(model_id)
+        self.assertIsInstance(loaded_model, MODEL_TYPE_TO_CLS_MAPPING[loaded_model.config.model_type])
+        for component_name, component in loaded_model.components.items():
+            self.assertIsInstance(component, MODEL_PARTS_CLS_MAPPING[component_name])
+        self.assertIsInstance(loaded_model.config, PretrainedConfig)
+        # Test that PERFORMANCE_HINT is set to LATENCY by default
+        self.assertEqual(loaded_model.ov_config.get("PERFORMANCE_HINT"), "LATENCY")
+
+        for component_name, component in loaded_model.components.items():
+            self.assertIsInstance(component.model, ov.Model)
+            if component_name == "language_model":
+                self.assertEqual(component.request.get_compiled_model().get_property("PERFORMANCE_HINT"), "LATENCY")
+                self.assertIsInstance(component.text_emb_model, ov.Model)
+                self.assertEqual(component.text_emb_request.get_property("PERFORMANCE_HINT"), "LATENCY")
+            else:
+                self.assertEqual(component.request.get_property("PERFORMANCE_HINT"), "LATENCY")
+
+        inputs = processor(images=image, text=prompt, return_tensors="pt")
+        set_seed(SEED)
+        loaded_model_outputs = loaded_model(**inputs)
+
+        with TemporaryDirectory() as tmpdirname:
+            loaded_model.save_pretrained(tmpdirname)
+            folder_contents = os.listdir(tmpdirname)
+            model_files = [
+                OV_LANGUAGE_MODEL_NAME,
+                OV_TEXT_EMBEDDINGS_MODEL_NAME,
+                OV_VISION_EMBEDDINGS_MODEL_NAME,
+            ]
+            model_files += [f"openvino_{part}_model.xml" for part in loaded_model.additional_parts]
+            for xml_file_name in model_files:
+                self.assertTrue(xml_file_name in folder_contents)
+                self.assertTrue(xml_file_name.replace(".xml", ".bin") in folder_contents)
+            model = OVModelForVisualCausalLM.from_pretrained(tmpdirname)
+            compile_only_model = OVModelForVisualCausalLM.from_pretrained(tmpdirname, compile_only=True)
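+            # with compile_only=True, submodels are loaded directly as compiled models (checked below)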
+            for _, submodel in compile_only_model.submodels.items():
+                self.assertIsInstance(submodel, ov.runtime.CompiledModel)
+            for component_name, component in compile_only_model.components.items():
+                self.assertIsInstance(component.model, ov.runtime.CompiledModel)
+                if component_name == "language_model":
+                    self.assertIsInstance(component.request, ov.runtime.InferRequest)
+                    self.assertIsInstance(component.text_emb_model, ov.runtime.CompiledModel)
+                    self.assertIsInstance(component.text_emb_request, ov.runtime.CompiledModel)
+                else:
+                    self.assertIsInstance(component.request, ov.runtime.CompiledModel)
+
+            outputs = compile_only_model(**inputs)
+            self.assertTrue(torch.equal(loaded_model_outputs.logits, outputs.logits))
+            del compile_only_model
+
+        outputs = model(**inputs)
+        self.assertTrue(torch.equal(loaded_model_outputs.logits, outputs.logits))
+        del loaded_model
+        del model
+        gc.collect()
+
     def test_load_from_hub_and_save_seq2seq_model(self):
         tokenizer = AutoTokenizer.from_pretrained(self.OV_SEQ2SEQ_MODEL_ID)
         tokens = tokenizer("This is a sample input", return_tensors="pt")
@@ -234,7 +312,7 @@ def test_load_from_hub_and_save_seq2seq_model(self):
 
         loaded_model_outputs = loaded_model.generate(**tokens)
 
-        with tempfile.TemporaryDirectory() as tmpdirname:
+        with TemporaryDirectory() as tmpdirname:
             loaded_model.save_pretrained(tmpdirname)
             folder_contents = os.listdir(tmpdirname)
             self.assertTrue(OV_ENCODER_NAME in folder_contents)
@@ -272,9 +350,13 @@ def test_load_from_hub_and_save_stable_diffusion_model(self):
             "num_inference_steps": 2,
             "output_type": "np",
         }
-        pipeline_outputs = loaded_pipeline(**inputs, generator=np.random.RandomState(SEED)).images
+
+        np.random.seed(0)
+        torch.manual_seed(0)
+        pipeline_outputs = loaded_pipeline(**inputs).images
         self.assertEqual(pipeline_outputs.shape, (batch_size, height, width, 3))
-        with tempfile.TemporaryDirectory() as tmpdirname:
+
+        with TemporaryDirectory() as tmpdirname:
             loaded_pipeline.save_pretrained(tmpdirname)
             pipeline = OVStableDiffusionPipeline.from_pretrained(tmpdirname)
             folder_contents = os.listdir(tmpdirname)
@@ -294,12 +376,17 @@ def test_load_from_hub_and_save_stable_diffusion_model(self):
             self.assertIsInstance(compile_only_pipeline.text_encoder.model, ov.runtime.CompiledModel)
             self.assertIsInstance(compile_only_pipeline.vae_encoder.model, ov.runtime.CompiledModel)
             self.assertIsInstance(compile_only_pipeline.vae_decoder.model, ov.runtime.CompiledModel)
-            outputs = compile_only_pipeline(**inputs, generator=np.random.RandomState(SEED)).images
-            self.assertTrue(np.array_equal(pipeline_outputs, outputs))
+
+            np.random.seed(0)
+            torch.manual_seed(0)
+            outputs = compile_only_pipeline(**inputs).images
+            np.testing.assert_allclose(pipeline_outputs, outputs, atol=1e-4, rtol=1e-4)
             del compile_only_pipeline
 
-        outputs = pipeline(**inputs, generator=np.random.RandomState(SEED)).images
-        self.assertTrue(np.array_equal(pipeline_outputs, outputs))
+        np.random.seed(0)
+        torch.manual_seed(0)
+        outputs = pipeline(**inputs).images
+        np.testing.assert_allclose(pipeline_outputs, outputs, atol=1e-4, rtol=1e-4)
         del pipeline
         gc.collect()
 
@@ -315,10 +402,24 @@ def test_load_model_from_hub_private_with_token(self):
         self.assertIsInstance(model.config, PretrainedConfig)
         self.assertTrue(model.stateful)
 
+    @parameterized.expand(("", "openvino"))
+    def test_loading_with_config_in_root(self, subfolder):
+        # config.json file in the root directory and not in the subfolder
+        model_id = "sentence-transformers-testing/stsb-bert-tiny-openvino"
+        export = subfolder == ""
+        # hub model
+        OVModelForFeatureExtraction.from_pretrained(model_id, subfolder=subfolder, export=export)
+        # local model
+        api = HfApi()
+        with TemporaryDirectory() as tmpdirname:
+            local_dir = Path(tmpdirname) / "model"
+            api.snapshot_download(repo_id=model_id, local_dir=local_dir)
+            OVModelForFeatureExtraction.from_pretrained(local_dir, subfolder=subfolder, export=export)
+
     def test_infer_export_when_loading(self):
         model_id = MODEL_NAMES["phi"]
         model = AutoModelForCausalLM.from_pretrained(model_id)
-        with tempfile.TemporaryDirectory() as tmpdirname:
+        with TemporaryDirectory() as tmpdirname:
             model.save_pretrained(Path(tmpdirname) / "original")
             # Load original model and convert
             model = OVModelForCausalLM.from_pretrained(Path(tmpdirname) / "original")
@@ -330,7 +431,7 @@ def test_infer_export_when_loading(self):
 
     def test_find_files_matching_pattern(self):
         model_id = "echarlaix/tiny-random-PhiForCausalLM"
-        pattern = r"(.*)?openvino(.*)?\_model.xml"
+        pattern = r"(.*)?openvino(.*)?\_model(.*)?.xml$"
         # hub model
         for revision in ("main", "ov", "itrex"):
             ov_files = _find_files_matching_pattern(
@@ -340,7 +441,7 @@ def test_find_files_matching_pattern(self):
 
         # local model
         api = HfApi()
-        with tempfile.TemporaryDirectory() as tmpdirname:
+        with TemporaryDirectory() as tmpdirname:
             for revision in ("main", "ov", "itrex"):
                 local_dir = Path(tmpdirname) / revision
                 api.snapshot_download(repo_id=model_id, local_dir=local_dir, revision=revision)
@@ -351,7 +452,7 @@ def test_find_files_matching_pattern(self):
 
     @parameterized.expand(("stable-diffusion", "stable-diffusion-openvino"))
     def test_find_files_matching_pattern_sd(self, model_arch):
-        pattern = r"(.*)?openvino(.*)?\_model.xml"
+        pattern = r"(.*)?openvino(.*)?\_model(.*)?.xml$"
         model_id = MODEL_NAMES[model_arch]
         # hub model
         ov_files = _find_files_matching_pattern(model_id, pattern=pattern)
@@ -359,12 +460,47 @@ def test_find_files_matching_pattern_sd(self, model_arch):
 
         # local model
         api = HfApi()
-        with tempfile.TemporaryDirectory() as tmpdirname:
+        with TemporaryDirectory() as tmpdirname:
             local_dir = Path(tmpdirname) / "model"
             api.snapshot_download(repo_id=model_id, local_dir=local_dir)
             ov_files = _find_files_matching_pattern(local_dir, pattern=pattern)
             self.assertTrue(len(ov_files) > 0 if "openvino" in model_id else len(ov_files) == 0)
 
+    @parameterized.expand(("", "openvino"))
+    def test_find_files_matching_pattern_with_config_in_root(self, subfolder):
+        # Notably, the model has a config.json file in the root directory and not in the subfolder
+        model_id = "sentence-transformers-testing/stsb-bert-tiny-openvino"
+        pattern = r"(.*)?openvino(.*)?\_model(.*)?.xml$"
+        # hub model
+        ov_files = _find_files_matching_pattern(model_id, pattern=pattern, subfolder=subfolder)
+        self.assertTrue(len(ov_files) == 1 if subfolder == "openvino" else len(ov_files) == 0)
+
+        # local model
+        api = HfApi()
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            local_dir = Path(tmpdirname) / "model"
+            api.snapshot_download(repo_id=model_id, local_dir=local_dir)
+            ov_files = _find_files_matching_pattern(local_dir, pattern=pattern, subfolder=subfolder)
+            self.assertTrue(len(ov_files) == 1 if subfolder == "openvino" else len(ov_files) == 0)
+
+    def test_find_files_matching_pattern_with_quantized_ov_model(self):
+        # This model only has "openvino/openvino_model_qint8_quantized.xml" and "openvino/openvino_model_qint8_quantized.bin"
+        # We want to ensure that this model is found, so the `export` isn't forced to True
+        model_id = "sentence-transformers-testing/stsb-bert-tiny-openvino-quantized-only"
+        subfolder = "openvino"
+        pattern = r"(.*)?openvino(.*)?\_model(.*)?.xml$"
+        # hub model
+        ov_files = _find_files_matching_pattern(model_id, pattern=pattern, subfolder=subfolder)
+        self.assertTrue(len(ov_files) == 1)
+
+        # local model
+        api = HfApi()
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            local_dir = Path(tmpdirname) / "model"
+            api.snapshot_download(repo_id=model_id, local_dir=local_dir)
+            ov_files = _find_files_matching_pattern(local_dir, pattern=pattern, subfolder=subfolder)
+            self.assertTrue(len(ov_files) == 1)
+
 
 class PipelineTest(unittest.TestCase):
     def test_load_model_from_hub(self):
@@ -376,7 +512,7 @@ def test_load_model_from_hub(self):
         self.assertIsInstance(ov_exported_pipe.model, OVBaseModel)
         self.assertIsInstance(ov_pipe.model, OVBaseModel)
 
-        with tempfile.TemporaryDirectory() as tmpdirname:
+        with TemporaryDirectory() as tmpdirname:
             ov_exported_pipe.save_pretrained(tmpdirname)
             folder_contents = os.listdir(tmpdirname)
             self.assertTrue(OV_XML_FILE_NAME in folder_contents)
@@ -396,7 +532,7 @@ def test_seq2seq_load_from_hub(self):
         self.assertIsInstance(ov_exported_pipe.model, OVBaseModel)
         self.assertIsInstance(ov_pipe.model, OVBaseModel)
 
-        with tempfile.TemporaryDirectory() as tmpdirname:
+        with TemporaryDirectory() as tmpdirname:
             ov_exported_pipe.save_pretrained(tmpdirname)
             folder_contents = os.listdir(tmpdirname)
             self.assertTrue(OV_DECODER_WITH_PAST_NAME in folder_contents)
@@ -712,7 +848,7 @@ def test_sentence_transformers_pipeline(self, model_arch):
         from Sentence Transformers then an appropriate exception raises.
         """
         model_id = MODEL_NAMES[model_arch]
-        with tempfile.TemporaryDirectory() as tmp_dir:
+        with TemporaryDirectory() as tmp_dir:
             save_dir = str(tmp_dir)
             OVSentenceTransformer.from_pretrained(model_id, export=True).save_pretrained(save_dir)
             with self.assertRaises(Exception) as context:
@@ -820,12 +956,15 @@ def test_compare_to_transformers(self, model_arch):
         if model_arch in self.REMOTE_CODE_MODELS:
             model_kwargs = {"trust_remote_code": True}
 
+        # starting from transformers 4.45.0 gemma2 uses eager attention by default, while the OV model uses sdpa
+        if model_arch == "gemma2" and is_transformers_version(">=", "4.45.0"):
+            model_kwargs["attn_implementation"] = "sdpa"
+
         ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True, ov_config=F32_CONFIG, **model_kwargs)
         self.assertIsInstance(ov_model.config, PretrainedConfig)
         self.assertTrue(ov_model.use_cache)
         tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
         tokens = tokenizer("This is a sample output", return_tensors="pt")
-        tokens.pop("token_type_ids", None)
 
         ov_outputs = ov_model(**tokens)
         self.assertTrue("logits" in ov_outputs)
@@ -862,7 +1001,6 @@ def test_compare_to_transformers(self, model_arch):
         # Compare batched generation
         tokenizer.padding_side = "left"
         tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True)
-        tokens.pop("token_type_ids", None)
         ov_model.generation_config.eos_token_id = None
         transformers_model.generation_config.eos_token_id = None
         ov_model.config.eos_token_id = None
@@ -886,7 +1024,10 @@ def test_compare_to_transformers(self, model_arch):
 
             additional_inputs = {"past_key_values": DynamicCache()}
         transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config, **additional_inputs)
-        self.assertTrue(torch.allclose(ov_outputs, transformers_outputs))
+        self.assertTrue(
+            torch.allclose(ov_outputs, transformers_outputs),
+            "OV output {ov_outputs}\nTransformers output  {transformers_output}",
+        )
 
         del transformers_model
         del ov_model
@@ -1051,6 +1192,11 @@ def test_beam_search(self, model_arch):
                 "config": AutoConfig.from_pretrained(model_id, trust_remote_code=True),
                 "trust_remote_code": True,
             }
+
+        # starting from transformers 4.45.0 gemma2 uses eager attention by default, while the OV model uses sdpa
+        if model_arch == "gemma2" and is_transformers_version(">=", "4.45.0"):
+            model_kwargs["attn_implementation"] = "sdpa"
+
         # Qwen tokenizer does not support padding; the chatglm and glm4 testing models produce nan values that are incompatible with beam search
         if model_arch in ["qwen", "chatglm", "glm4"]:
             return
@@ -1126,7 +1272,6 @@ def test_beam_search(self, model_arch):
             from transformers.cache_utils import DynamicCache
         tokenizer.pad_token_id = tokenizer.eos_token_id
         tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True)
-        tokens.pop("token_type_ids", None)
         ov_model_stateful.generation_config.eos_token_id = None
         ov_model_stateless.generation_config.eos_token_id = None
         transformers_model.generation_config.eos_token_id = None
@@ -1281,7 +1426,7 @@ def test_compare_to_transformers(self, model_arch):
         set_seed(SEED)
         transformers_model = AutoModelForImageClassification.from_pretrained(model_id)
         preprocessor = AutoFeatureExtractor.from_pretrained(model_id)
-        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        url = TEST_IMAGE_URL
         image = Image.open(requests.get(url, stream=True).raw)
         inputs = preprocessor(images=image, return_tensors="pt")
         with torch.no_grad():
@@ -1307,7 +1452,7 @@ def test_pipeline(self, model_arch):
         model.eval()
         preprocessor = AutoFeatureExtractor.from_pretrained(model_id)
         pipe = pipeline("image-classification", model=model, feature_extractor=preprocessor)
-        inputs = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        inputs = TEST_IMAGE_URL
         outputs = pipe(inputs)
         self.assertEqual(pipe.device, model.device)
         self.assertGreaterEqual(outputs[0]["score"], 0.0)
@@ -1328,7 +1473,7 @@ def test_compare_to_timm(self, model_id):
         self.assertIsInstance(ov_model.config, PretrainedConfig)
         timm_model = timm.create_model(model_id, pretrained=True)
         preprocessor = TimmImageProcessor.from_pretrained(model_id)
-        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        url = TEST_IMAGE_URL
         image = Image.open(requests.get(url, stream=True).raw)
         inputs = preprocessor(images=image, return_tensors="pt")
         with torch.no_grad():
@@ -1346,7 +1491,7 @@ def test_compare_to_timm(self, model_id):
     @parameterized.expand(TIMM_MODELS)
     def test_timm_save_and_infer(self, model_id):
         ov_model = OVModelForImageClassification.from_pretrained(model_id, export=True)
-        with tempfile.TemporaryDirectory() as tmpdirname:
+        with TemporaryDirectory() as tmpdirname:
             model_save_path = os.path.join(tmpdirname, "timm_ov_model")
             ov_model.save_pretrained(model_save_path)
             model = OVModelForImageClassification.from_pretrained(model_save_path)
@@ -1824,17 +1969,18 @@ def test_compare_with_and_without_past_key_values(self):
 
 
 class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
-    SUPPORTED_ARCHITECTURES = [
-        "llava",
-    ]
+    SUPPORTED_ARCHITECTURES = ["llava"]
 
     if is_transformers_version(">=", "4.40.0"):
-        SUPPORTED_ARCHITECTURES += ["llava_next"]
+        SUPPORTED_ARCHITECTURES += ["llava_next", "nanollava"]
+    if is_transformers_version(">=", "4.45.0"):
+        SUPPORTED_ARCHITECTURES += ["minicpmv", "internvl2", "phi3_v"]
     TASK = "image-text-to-text"
+    REMOTE_CODE_MODELS = ["internvl2", "minicpmv", "nanollava", "phi3_v"]
 
     IMAGE = Image.open(
         requests.get(
-            "http://images.cocodataset.org/val2017/000000039769.jpg",
+            TEST_IMAGE_URL,
             stream=True,
         ).raw
     )
@@ -1848,29 +1994,130 @@ def get_transformer_model_class(self, model_arch):
             from transformers import LlavaNextForConditionalGeneration
 
             return LlavaNextForConditionalGeneration
-        return None
+        return AutoModelForCausalLM
+
+    def _check_device_and_request(self, ov_model, expected_device, has_request):
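+        # helper asserting that every pipeline component reports the expected device and
+        # that inference requests are initialized only when has_request is True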
+        request_check_fn = self.assertFalse if has_request else self.assertTrue
+        self.assertEqual(ov_model._device, expected_device)
+        for component_name, component in ov_model.components.items():
+            if component_name == "language_model":
+                request_check_fn(component.text_emb_request is None)
+            self.assertEqual(component._device, expected_device)
+            request_check_fn(component.request is None)
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     def test_compare_to_transformers(self, model_arch):
+        prompt = "What is shown in this image?"
+        model_id = MODEL_NAMES[model_arch]
+        set_seed(SEED)
+        transformers_model = self.get_transformer_model_class(model_arch).from_pretrained(
+            model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+        )
+        transformers_model.eval()
+        if "internvl2" in model_arch:
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+            )
+            img_context_token_id = tokenizer.convert_tokens_to_ids("")
+            transformers_model.img_context_token_id = img_context_token_id
+        if "nanollava" in model_arch:
+            transformers_model.get_vision_tower().load_model()
+        preprocessors = self.get_preprocessors(model_arch)
+        set_seed(SEED)
+        ov_model = OVModelForVisualCausalLM.from_pretrained(
+            model_id, export=True, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS, compile=False
+        )
+        self.assertIsInstance(ov_model, MODEL_TYPE_TO_CLS_MAPPING[ov_model.config.model_type])
+        for component_name, component in ov_model.components.items():
+            self.assertIsInstance(component, MODEL_PARTS_CLS_MAPPING[component_name])
+        self.assertIsInstance(ov_model.config, PretrainedConfig)
+        inputs = ov_model.preprocess_inputs(**preprocessors, text=prompt, image=self.IMAGE.resize((600, 600)))
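+        # keep an independent copy since the transformers inputs are adjusted for some models below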
+        transformers_inputs = copy.deepcopy(inputs)
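+        # exercise the device / request lifecycle: move to AUTO (no compiled requests yet),
+        # then compile on CPU and finally clear the requests again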
+        test_device = "AUTO"
+        ov_model.to(test_device)
+        self._check_device_and_request(ov_model, test_device, False)
+        test_device = "CPU"
+        ov_model.to(test_device)
+        ov_model.compile()
+        self._check_device_and_request(ov_model, test_device, True)
+        ov_model.clear_requests()
+        self._check_device_and_request(ov_model, test_device, False)
+
+        # nanollava expects the pixel_values input to be named "images"
+        if model_arch == "nanollava":
+            pixel_values = transformers_inputs.pop("pixel_values", None)
+            transformers_inputs["images"] = pixel_values
+        # the PyTorch minicpmv model is not designed to be called via forward
+        if model_arch not in ["minicpmv", "internvl2"]:
+            set_seed(SEED)
+            ov_outputs = ov_model(**inputs)
+            set_seed(SEED)
+            with torch.no_grad():
+                transformers_outputs = transformers_model(**transformers_inputs)
+            self.assertTrue(
+                torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4),
+                f"Max abs diff {(torch.abs(ov_outputs.logits - transformers_outputs.logits).max())}",
+            )
+
+        ov_model.generation_config.eos_token_id = None
+        transformers_model.generation_config.eos_token_id = None
+        ov_model.config.eos_token_id = None
+        transformers_model.config.eos_token_id = None
+        gen_config = GenerationConfig(
+            max_new_tokens=30,
+            min_new_tokens=30,
+            do_sample=False,
+            eos_token_id=None,
+        )
+        set_seed(SEED)
+        ov_outputs = ov_model.generate(**inputs, generation_config=gen_config)
+        set_seed(SEED)
+        with torch.no_grad():
+            transformers_outputs = transformers_model.generate(**transformers_inputs, generation_config=gen_config)
+
+        # the original minicpmv and internvl implementations always skip input tokens in generation results, while the transformers-based approach keeps them
+        if model_arch in ["minicpmv", "internvl2"]:
+            ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :]
+        self.assertTrue(
+            torch.equal(ov_outputs, transformers_outputs),
+            f"generation config : {gen_config}, transformers output {transformers_outputs}, ov_model output {ov_outputs}",
+        )
+        del transformers_model
+        del ov_model
+
+        gc.collect()
+
+    @parameterized.expand(["llava", "llava_next"])
+    @unittest.skipIf(
+        is_transformers_version("<", "4.45.0"), reason="New preprocessing available only in transformers >= 4.45"
+    )
+    def test_llava_with_new_preprocessing(self, model_arch):
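+        # with transformers >= 4.45 the processor expands the image placeholder into
+        # config.image_seq_length image tokens when patch_size and
+        # vision_feature_select_strategy are provided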
         prompt = "\n What is shown in this image?"
         model_id = MODEL_NAMES[model_arch]
-        processor = get_preprocessor(model_id)
+        config = AutoConfig.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
+        processor = AutoProcessor.from_pretrained(
+            model_id,
+            patch_size=config.vision_config.patch_size,
+            vision_feature_select_strategy=config.vision_feature_select_strategy,
+            trust_remote_code=model_arch in self.REMOTE_CODE_MODELS,
+        )
         transformers_model = self.get_transformer_model_class(model_arch).from_pretrained(model_id)
+        ov_model = OVModelForVisualCausalLM.from_pretrained(
+            model_id, export=True, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+        )
+        self.assertTrue(ov_model._support_new_processing)
+        self.assertTrue(processor.patch_size is not None)
+        self.assertTrue(processor.vision_feature_select_strategy is not None)
         inputs = processor(images=self.IMAGE, text=prompt, return_tensors="pt")
+        self.assertTrue(
+            (inputs.input_ids == ov_model.config.image_token_index).sum(1).max() >= ov_model.config.image_seq_length
+        )
         set_seed(SEED)
         with torch.no_grad():
             transformers_outputs = transformers_model(**inputs)
-        ov_model = OVModelForVisualCausalLM.from_pretrained(model_id, export=True)
-        self.assertIsInstance(ov_model, MODEL_TYPE_TO_CLS_MAPPING[ov_model.config.model_type])
-        self.assertIsInstance(ov_model.vision_embeddings, OVVisionEmbedding)
-        self.assertIsInstance(ov_model.language_model, OVModelWithEmbedForCausalLM)
-        for additional_part in ov_model.additional_parts:
-            self.assertTrue(hasattr(ov_model, additional_part))
-            self.assertIsInstance(getattr(ov_model, additional_part), MODEL_PARTS_CLS_MAPPING[additional_part])
-        self.assertIsInstance(ov_model.config, PretrainedConfig)
+        set_seed(SEED)
         ov_outputs = ov_model(**inputs)
         self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, atol=1e-4))
-
         ov_model.generation_config.eos_token_id = None
         transformers_model.generation_config.eos_token_id = None
         ov_model.config.eos_token_id = None
@@ -1885,39 +2132,82 @@ def test_compare_to_transformers(self, model_arch):
         set_seed(SEED)
         ov_outputs = ov_model.generate(**inputs, generation_config=gen_config)
         set_seed(SEED)
-        transformers_outputs = transformers_model.generate(**inputs, generation_config=gen_config)
+        with torch.no_grad():
+            transformers_outputs = transformers_model.generate(**inputs, generation_config=gen_config)
         self.assertTrue(
             torch.equal(ov_outputs, transformers_outputs),
             f"generation config : {gen_config}, transformers output {transformers_outputs}, ov_model output {ov_outputs}",
         )
 
-        del transformers_model
         del ov_model
-
+        del transformers_model
         gc.collect()
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     def test_generate_utils(self, model_arch):
         model_id = MODEL_NAMES[model_arch]
-        model = OVModelForVisualCausalLM.from_pretrained(model_id, export=True)
-        preprocessor = get_preprocessor(model_id)
-        question = "\nDescribe image"
-        inputs = preprocessor(images=self.IMAGE, text=question, return_tensors="pt")
+        model = OVModelForVisualCausalLM.from_pretrained(
+            model_id, export=True, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+        )
 
+        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
+        question = "Describe image"
+        preprocessors = self.get_preprocessors(model_arch)
+        inputs = model.preprocess_inputs(**preprocessors, text=question, image=self.IMAGE.resize((600, 600)))
         # General case
         outputs = model.generate(**inputs, max_new_tokens=10)
-        outputs = preprocessor.batch_decode(outputs, skip_special_tokens=True)
+        outputs = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
         self.assertIsInstance(outputs[0], str)
 
+        # No input image case
         question = "Hi, how are you?"
-        inputs = preprocessor(images=None, text=question, return_tensors="pt")
+        inputs = model.preprocess_inputs(**preprocessors, text=question, image=None)
         outputs = model.generate(**inputs, max_new_tokens=10)
-        outputs = preprocessor.batch_decode(outputs, skip_special_tokens=True)
+        # filter out the original prompt because it may contain tokens outside the tokenizer vocabulary, e.g. the nanollava text separator = -200
+        outputs = outputs[:, inputs["input_ids"].shape[1] :]
+        outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
         self.assertIsInstance(outputs[0], str)
         del model
 
         gc.collect()
 
+    def get_preprocessors(self, model_arch):
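+        # nanollava loads its image processor from the vision tower repo and internvl2
+        # exposes no AutoProcessor, so both need model-specific preprocessor handling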
+        model_id = MODEL_NAMES[model_arch]
+        if model_arch == "nanollava":
+            config = AutoConfig.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
+            processor = AutoProcessor.from_pretrained(
+                config.mm_vision_tower, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+            )
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+            )
+            preprocessors = {"processor": processor, "tokenizer": tokenizer}
+        elif model_arch == "internvl2":
+            config = AutoConfig.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+            )
+            preprocessors = {"processor": None, "tokenizer": tokenizer, "config": config}
+        else:
+            processor = AutoProcessor.from_pretrained(
+                model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+            )
+            preprocessors = {"processor": processor, "tokenizer": None}
+        return preprocessors
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    def test_model_can_be_loaded_after_saving(self, model_arch):
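+        # round trip: export, save_pretrained, then reload from the local directory without compiling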
+        model_id = MODEL_NAMES[model_arch]
+        with TemporaryDirectory() as save_dir:
+            ov_model = OVModelForVisualCausalLM.from_pretrained(
+                model_id, compile=False, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+            )
+            ov_model.save_pretrained(save_dir)
+            ov_restored_model = OVModelForVisualCausalLM.from_pretrained(
+                save_dir, compile=False, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+            )
+            self.assertIsInstance(ov_restored_model, type(ov_model))
+
 
 class OVModelForSpeechSeq2SeqIntegrationTest(unittest.TestCase):
     SUPPORTED_ARCHITECTURES = ("whisper",)
@@ -1995,7 +2285,7 @@ class OVModelForVision2SeqIntegrationTest(unittest.TestCase):
     SPEEDUP_CACHE = 1.1
 
     def _get_sample_image(self):
-        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        url = TEST_IMAGE_URL
         image = Image.open(requests.get(url, stream=True).raw)
         return image
 
@@ -2102,7 +2392,7 @@ class OVModelForCustomTasksIntegrationTest(unittest.TestCase):
     SUPPORTED_ARCHITECTURES_WITH_HIDDEN_STATES = ["vit-with-hidden-states"]
 
     def _get_sample_image(self):
-        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        url = TEST_IMAGE_URL
         image = Image.open(requests.get(url, stream=True).raw)
         return image
 
@@ -2186,7 +2476,7 @@ class OVModelForOpenCLIPZeroShortImageClassificationTest(unittest.TestCase):
     OV_MODEL_ID_IR = MODEL_NAMES["open-clip-ov"]
 
     def _get_sample_image(self):
-        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        url = TEST_IMAGE_URL
         image = Image.open(requests.get(url, stream=True).raw)
         return image
 
@@ -2215,7 +2505,7 @@ def test_load_from_hub_and_save_model(self):
 
         loaded_model_outputs = loaded_model(tokens, processed_image)
 
-        with tempfile.TemporaryDirectory() as tmpdirname:
+        with TemporaryDirectory() as tmpdirname:
             loaded_model.save_pretrained(tmpdirname)
             folder_contents = os.listdir(tmpdirname)
             self.assertTrue(loaded_model.text_model._xml_model_name in folder_contents)
@@ -2317,3 +2607,36 @@ def test_functions(self):
 
         del model
         gc.collect()
+
+
+class OVModelForSTFeatureExtractionIntegrationTest(unittest.TestCase):
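+    # compares OVSentenceTransformer embeddings against the reference SentenceTransformer
+    # implementation and checks the save / reload round trip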
+    SUPPORTED_ARCHITECTURES = ("st-bert", "st-mpnet")
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    def test_compare_to_transformers(self, model_arch):
+        model_id = MODEL_NAMES[model_arch]
+        set_seed(SEED)
+        ov_model = OVSentenceTransformer.from_pretrained(model_id, export=True, ov_config=F32_CONFIG)
+        self.assertIsInstance(ov_model.config, PretrainedConfig)
+        self.assertTrue(hasattr(ov_model, "encode"))
+        st_model = SentenceTransformer(model_id)
+        sentences = ["This is an example sentence", "Each sentence is converted"]
+        st_embeddings = st_model.encode(sentences)
+        ov_embeddings = ov_model.encode(sentences)
+        # Compare tensor outputs
+        self.assertTrue(np.allclose(ov_embeddings, st_embeddings, atol=1e-4))
+        del st_embeddings
+        del ov_model
+        gc.collect()
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    def test_sentence_transformers_save_and_infer(self, model_arch):
+        model_id = MODEL_NAMES[model_arch]
+        ov_model = OVSentenceTransformer.from_pretrained(model_id, export=True, ov_config=F32_CONFIG)
+        with TemporaryDirectory() as tmpdirname:
+            model_save_path = os.path.join(tmpdirname, "sentence_transformers_ov_model")
+            ov_model.save_pretrained(model_save_path)
+            model = OVSentenceTransformer.from_pretrained(model_save_path)
+            sentences = ["This is an example sentence", "Each sentence is converted"]
+            model.encode(sentences)
+        gc.collect()
diff --git a/tests/openvino/test_modeling_sentence_transformers.py b/tests/openvino/test_modeling_sentence_transformers.py
deleted file mode 100644
index acda045123..0000000000
--- a/tests/openvino/test_modeling_sentence_transformers.py
+++ /dev/null
@@ -1,74 +0,0 @@
-#  Copyright 2021 The HuggingFace Team. All rights reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import gc
-import os
-import tempfile
-import unittest
-
-import numpy as np
-from parameterized import parameterized
-from sentence_transformers import SentenceTransformer
-from transformers import (
-    PretrainedConfig,
-    set_seed,
-)
-
-from optimum.intel import OVSentenceTransformer
-
-
-SEED = 42
-
-F32_CONFIG = {"INFERENCE_PRECISION_HINT": "f32"}
-
-MODEL_NAMES = {
-    "bert": "sentence-transformers/all-MiniLM-L6-v2",
-    "mpnet": "sentence-transformers/all-mpnet-base-v2",
-}
-
-
-class OVModelForSTFeatureExtractionIntegrationTest(unittest.TestCase):
-    SUPPORTED_ARCHITECTURES = (
-        "bert",
-        "mpnet",
-    )
-
-    @parameterized.expand(SUPPORTED_ARCHITECTURES)
-    def test_compare_to_transformers(self, model_arch):
-        model_id = MODEL_NAMES[model_arch]
-        set_seed(SEED)
-        ov_model = OVSentenceTransformer.from_pretrained(model_id, export=True, ov_config=F32_CONFIG)
-        self.assertIsInstance(ov_model.config, PretrainedConfig)
-        self.assertTrue(hasattr(ov_model, "encode"))
-        st_model = SentenceTransformer(model_id)
-        sentences = ["This is an example sentence", "Each sentence is converted"]
-        st_embeddings = st_model.encode(sentences)
-        ov_embeddings = ov_model.encode(sentences)
-        # Compare tensor outputs
-        self.assertTrue(np.allclose(ov_embeddings, st_embeddings, atol=1e-4))
-        del st_embeddings
-        del ov_model
-        gc.collect()
-
-    @parameterized.expand(SUPPORTED_ARCHITECTURES)
-    def test_sentence_transformers_save_and_infer(self, model_arch):
-        model_id = MODEL_NAMES[model_arch]
-        ov_model = OVSentenceTransformer.from_pretrained(model_id, export=True, ov_config=F32_CONFIG)
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            model_save_path = os.path.join(tmpdirname, "sentence_transformers_ov_model")
-            ov_model.save_pretrained(model_save_path)
-            model = OVSentenceTransformer.from_pretrained(model_save_path)
-            sentences = ["This is an example sentence", "Each sentence is converted"]
-            model.encode(sentences)
-        gc.collect()
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index 719509738f..2869acf834 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -17,7 +17,6 @@
 
 import itertools
 import logging
-import tempfile
 import unittest
 from collections import defaultdict
 from enum import Enum
@@ -57,12 +56,14 @@
     OVModelForSpeechSeq2Seq,
     OVStableDiffusionPipeline,
     OVStableDiffusionXLPipeline,
+    OVStableDiffusion3Pipeline,
     OVQuantizer,
     OVTrainer,
     OVQuantizationConfig,
     OVWeightQuantizationConfig,
     OVDynamicQuantizationConfig,
     OVModelOpenCLIPForZeroShotImageClassification,
+    OVModelForVisualCausalLM,
 )
 from optimum.intel.openvino.configuration import (
     OVQuantizationMethod,
@@ -70,6 +71,7 @@
     _DEFAULT_4BIT_CONFIGS,
     _DEFAULT_4BIT_CONFIG,
 )
+from optimum.intel.openvino.utils import TemporaryDirectory
 from copy import deepcopy
 
 from optimum.intel.openvino.quantization import InferRequestWrapper
@@ -102,7 +104,7 @@ def test_automodel_static_quantization(self, model_cls, model_name, expected_fak
         def preprocess_function(examples, tokenizer):
             return tokenizer(examples[column_name], padding="max_length", max_length=128, truncation=True)
 
-        with tempfile.TemporaryDirectory() as tmp_dir:
+        with TemporaryDirectory() as tmp_dir:
             transformers_model = model_cls.auto_model_class.from_pretrained(model_id)
             tokenizer = AutoTokenizer.from_pretrained(model_id)
             if tokenizer.pad_token is None:
@@ -146,7 +148,7 @@ def test_ovmodel_static_quantization(self, model_cls, model_name, expected_fake_
         def preprocess_function(examples, tokenizer):
             return tokenizer(examples[column_name], padding="max_length", max_length=128, truncation=True)
 
-        with tempfile.TemporaryDirectory() as tmp_dir:
+        with TemporaryDirectory() as tmp_dir:
             ov_model = model_cls.from_pretrained(model_id, export=True)
             tokenizer = AutoTokenizer.from_pretrained(model_id)
             if tokenizer.pad_token is None:
@@ -189,17 +191,32 @@ class OVWeightCompressionTest(unittest.TestCase):
     SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTOCOMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 0, 74),)
     SUPPORTED_ARCHITECTURES_STATEFUL_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS = ((OVModelForCausalLM, "gpt2", 44, 44),)
 
-    LOAD_IN_4_BITS_SCOPE = (
-        (OVModelForCausalLM, "gpt2", dict(bits=4, sym=False, group_size=-1, ratio=0.8), {"int4": 30, "int8": 14}),
+    LOAD_IN_4_BITS_SCOPE = [
+        (
+            OVModelForCausalLM,  # model cls
+            "gpt2",  # model name
+            False,  # trust remote code
+            dict(bits=4, sym=False, group_size=-1, ratio=0.8),  # quantization config
+            {"int4": 30, "int8": 14},  # reference number of low-precision nodes
+        ),
         (
             OVModelForCausalLM,
             "gpt2",
+            False,
             dict(bits=4, weight_format="mxfp4", group_size=32),
             {"f4e2m1": 20, "f8e8m0": 20, "int8": 4},
         ),
         (
             OVModelForCausalLM,
             "gpt2",
+            False,
+            dict(bits=4, weight_format="nf4", group_size=32),
+            {"nf4": 20, "int8": 4},
+        ),
+        (
+            OVModelForCausalLM,
+            "gpt2",
+            False,
             dict(
                 bits=4,
                 sym=False,
@@ -211,12 +228,14 @@ class OVWeightCompressionTest(unittest.TestCase):
         (
             OVModelForCausalLM,
             "gpt2",
+            False,
             dict(bits=4, sym=False, group_size=-1, ratio=0.8, all_layers=True),
             {"int4": 26, "int8": 18},
         ),
         (
             OVModelForCausalLM,
             "opt",
+            False,
             dict(
                 bits=4,
                 sym=True,
@@ -230,6 +249,7 @@ class OVWeightCompressionTest(unittest.TestCase):
         (
             OVModelForCausalLM,
             "opt",
+            False,
             dict(
                 bits=4,
                 sym=True,
@@ -238,11 +258,12 @@ class OVWeightCompressionTest(unittest.TestCase):
                 sensitivity_metric="mean_activation_magnitude",
                 dataset=["one two, " * i for i in range(10)],
             ),
-            {"int4": 25, "int8": 14},
+            {"int4": 24, "int8": 16},
         ),
         (
             OVModelForCausalLM,
             "llama_awq",
+            False,
             dict(
                 bits=4,
                 sym=True,
@@ -258,6 +279,7 @@ class OVWeightCompressionTest(unittest.TestCase):
         (
             OVModelForCausalLM,
             "llama_awq",
+            False,
             dict(
                 bits=4,
                 sym=True,
@@ -272,6 +294,7 @@ class OVWeightCompressionTest(unittest.TestCase):
         (
             OVModelForCausalLM,
             "llama_awq",
+            False,
             dict(
                 bits=4,
                 sym=True,
@@ -283,28 +306,132 @@ class OVWeightCompressionTest(unittest.TestCase):
             ),
             {"int4": 12, "int8": 8},
         ),
-    )
+    ]
+
+    if is_transformers_version(">=", "4.40.0"):
+        LOAD_IN_4_BITS_SCOPE.extend(
+            [
+                (
+                    OVModelForVisualCausalLM,
+                    "llava_next",
+                    False,
+                    dict(
+                        bits=4,
+                        group_size=16,
+                        dataset="contextual",
+                        ratio=0.8,
+                        sensitivity_metric="hessian_input_activation",
+                        num_samples=1,
+                        processor=MODEL_NAMES["llava_next"],
+                    ),
+                    {"int4": 24, "int8": 6},
+                ),
+                (
+                    OVModelForVisualCausalLM,
+                    "nanollava",
+                    True,
+                    dict(
+                        bits=4,
+                        group_size=8,
+                        dataset="contextual",
+                        ratio=0.8,
+                        sensitivity_metric="mean_activation_magnitude",
+                        num_samples=1,
+                        processor=MODEL_NAMES["nanollava_vision_tower"],
+                        tokenizer=MODEL_NAMES["nanollava"],
+                        trust_remote_code=True,
+                    ),
+                    {"int4": 16, "int8": 14},
+                ),
+            ]
+        )
 
-    SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION = (
-        (OVModelForCausalLM, "gpt2"),
-        (OVModelForMaskedLM, "bert"),
-        (OVModelForTokenClassification, "roberta"),
-        (OVModelForImageClassification, "vit"),
-        (OVModelForSeq2SeqLM, "t5"),
-        (OVModelForSequenceClassification, "albert"),
-        (OVModelForQuestionAnswering, "distilbert"),
-        (OVModelForAudioClassification, "wav2vec2"),
-        (OVModelForFeatureExtraction, "blenderbot"),
-        (OVStableDiffusionPipeline, "stable-diffusion"),
-        (OVStableDiffusionXLPipeline, "stable-diffusion-xl"),
-        (OVModelOpenCLIPForZeroShotImageClassification, "open-clip"),
-    )
+    if is_transformers_version(">=", "4.45.0"):
+        LOAD_IN_4_BITS_SCOPE.extend(
+            [
+                (
+                    OVModelForVisualCausalLM,
+                    "minicpmv",
+                    True,
+                    dict(
+                        bits=4,
+                        group_size=16,
+                        dataset="contextual",
+                        ratio=0.8,
+                        sensitivity_metric="mean_activation_magnitude",
+                        num_samples=1,
+                        processor=MODEL_NAMES["minicpmv"],
+                        trust_remote_code=True,
+                    ),
+                    {"int4": 22, "int8": 8},
+                ),
+                (
+                    OVModelForVisualCausalLM,
+                    "internvl2",
+                    True,
+                    dict(
+                        bits=4,
+                        group_size=4,
+                        dataset="contextual",
+                        ratio=0.8,
+                        sensitivity_metric="mean_activation_magnitude",
+                        num_samples=1,
+                        trust_remote_code=True,
+                    ),
+                    {"int4": 22, "int8": 8},
+                ),
+                (
+                    OVModelForVisualCausalLM,
+                    "phi3_v",
+                    True,
+                    dict(
+                        bits=4,
+                        group_size=16,
+                        dataset="contextual",
+                        ratio=0.8,
+                        sensitivity_metric="mean_activation_magnitude",
+                        num_samples=1,
+                        trust_remote_code=True,
+                    ),
+                    {"int4": 14, "int8": 4},
+                ),
+            ]
+        )
 
-    SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION = (
+    SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION = [
+        (OVModelForCausalLM, "gpt2", False),
+        (OVModelForMaskedLM, "bert", False),
+        (OVModelForTokenClassification, "roberta", False),
+        (OVModelForImageClassification, "vit", False),
+        (OVModelForSeq2SeqLM, "t5", False),
+        (OVModelForSequenceClassification, "albert", False),
+        (OVModelForQuestionAnswering, "distilbert", False),
+        (OVModelForAudioClassification, "wav2vec2", False),
+        (OVModelForFeatureExtraction, "blenderbot", False),
+        (OVStableDiffusionPipeline, "stable-diffusion", False),
+        (OVStableDiffusionXLPipeline, "stable-diffusion-xl", False),
+        (OVModelOpenCLIPForZeroShotImageClassification, "open-clip", False),
+        (OVModelForVisualCausalLM, "llava", False),
+    ]
+
+    if is_transformers_version(">=", "4.40.0"):
+        SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "nanollava", True))
+
+    if is_transformers_version(">=", "4.45.0"):
+        SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "minicpmv", True))
+
+    SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION = [
         (OVStableDiffusionPipeline, "stable-diffusion", 72, 195),
         (OVStableDiffusionXLPipeline, "stable-diffusion-xl", 84, 331),
         (OVLatentConsistencyModelPipeline, "latent-consistency", 50, 135),
-    )
+    ]
+
+    if is_transformers_version(">=", "4.45.0"):
+        SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION.extend(
+            [
+                (OVStableDiffusion3Pipeline, "stable-diffusion-3", 9, 65),
+            ]
+        )
 
     IS_SUPPORT_STATEFUL = is_openvino_version(">=", "2023.3")
 
@@ -315,7 +442,7 @@ def test_automodel_weight_compression(self, model_cls, model_name, expected_pt_i
         task = model_cls.export_feature
         model_id = MODEL_NAMES[model_name]
 
-        with tempfile.TemporaryDirectory() as tmp_dir:
+        with TemporaryDirectory() as tmp_dir:
             transformers_model = model_cls.auto_model_class.from_pretrained(model_id)
             tokenizer = AutoTokenizer.from_pretrained(model_id)
             if tokenizer.pad_token is None:
@@ -346,7 +473,7 @@ def test_ovmodel_8bit_weight_compression(self, model_cls, model_name, expected_p
         task = model_cls.export_feature
         model_id = MODEL_NAMES[model_name]
 
-        with tempfile.TemporaryDirectory() as tmp_dir:
+        with TemporaryDirectory() as tmp_dir:
             transformers_model = model_cls.from_pretrained(model_id, export=True)
             tokenizer = AutoTokenizer.from_pretrained(model_id)
             if tokenizer.pad_token is None:
@@ -371,7 +498,7 @@ def test_ovmodel_8bit_weight_compression(self, model_cls, model_name, expected_p
     def test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_int8, expected_int4):
         task = model_cls.export_feature
         model_id = MODEL_NAMES[model_name]
-        with tempfile.TemporaryDirectory() as tmp_dir:
+        with TemporaryDirectory() as tmp_dir:
             transformers_model = model_cls.from_pretrained(model_id, export=True, stateful=False)
             tokenizer = AutoTokenizer.from_pretrained(model_id)
             if tokenizer.pad_token is None:
@@ -398,7 +525,7 @@ def test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_i
     def test_ovmodel_8bit_weight_compression_stateful(self, model_cls, model_name, expected_pt_int8, expected_ov_int8):
         task = model_cls.export_feature
         model_id = MODEL_NAMES[model_name]
-        with tempfile.TemporaryDirectory() as tmp_dir:
+        with TemporaryDirectory() as tmp_dir:
             transformers_model = model_cls.from_pretrained(model_id, export=True, stateful=True)
             tokenizer = AutoTokenizer.from_pretrained(model_id)
             if tokenizer.pad_token is None:
@@ -420,8 +547,14 @@ def test_ovmodel_8bit_weight_compression_stateful(self, model_cls, model_name, e
             self.assertEqual(OVWeightQuantizationConfig().to_dict(), loaded_config.quantization_config.to_dict())
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION)
-    def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type):
-        model = model_cls.from_pretrained(MODEL_NAMES[model_type], export=True, load_in_8bit=True, stateful=False)
+    def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type, trust_remote_code):
+        model = model_cls.from_pretrained(
+            MODEL_NAMES[model_type],
+            export=True,
+            load_in_8bit=True,
+            stateful=False,
+            trust_remote_code=trust_remote_code,
+        )
 
         if model_type == "open-clip":
             self.assertEqual(model.text_model._openvino_config.quantization_config.bits, 8)
@@ -439,6 +572,9 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type):
             models.append(model.text_encoder if model_type == "stable-diffusion" else model.text_encoder_2)
         elif model_type == "open-clip":
             models = [model.text_model, model.visual_model]
+        elif model.export_feature == "image-text-to-text":
+            models = [model.lm_model, model.vision_embeddings_model, model.text_embeddings_model]
+            models += [getattr(model, part) for part in model.additional_parts]
         else:
             models = [model]
 
@@ -451,10 +587,12 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type):
     def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_num_fake_quantize, expected_ov_int8):
         model_id = MODEL_NAMES[model_type]
         quantization_config = OVWeightQuantizationConfig(bits=8, dataset="conceptual_captions", num_samples=2)
-        with tempfile.TemporaryDirectory() as tmp_dir:
+        with TemporaryDirectory() as tmp_dir:
             model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config)
 
-            num_fake_quantize, num_weight_nodes = get_num_quantized_nodes(model.unet)
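+            # SD3 pipelines have no unet; their diffusion backbone is exposed as `transformer`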
+            num_fake_quantize, num_weight_nodes = get_num_quantized_nodes(
+                model.unet if model.unet is not None else model.transformer
+            )
             self.assertEqual(expected_num_fake_quantize, num_fake_quantize)
             self.assertEqual(expected_ov_int8, num_weight_nodes["int8"])
             self.assertEqual(0, num_weight_nodes["int4"])
@@ -468,7 +606,9 @@ def test_stable_diffusion_with_weight_compression(self):
 
         quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config))
 
-        num_fake_quantize, num_weight_nodes = get_num_quantized_nodes(int8_pipe.unet)
+        num_fake_quantize, num_weight_nodes = get_num_quantized_nodes(
+            int8_pipe.unet if int8_pipe.unet is not None else int8_pipe.transformer
+        )
         self.assertEqual(0, num_fake_quantize)
         self.assertEqual(242, num_weight_nodes["int8"])
         self.assertEqual(0, num_weight_nodes["int4"])
@@ -487,7 +627,9 @@ def test_ovmodel_hybrid_quantization_with_custom_dataset(
         self.assertEqual(quantization_config.quant_method, OVQuantizationMethod.HYBRID)
 
         quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config), calibration_dataset=dataset)
-        num_fake_quantize, num_weight_nodes = get_num_quantized_nodes(model.unet)
+        num_fake_quantize, num_weight_nodes = get_num_quantized_nodes(
+            model.unet if model.unet is not None else model.transformer
+        )
         self.assertEqual(expected_num_fake_quantize, num_fake_quantize)
         self.assertEqual(expected_ov_int8, num_weight_nodes["int8"])
         self.assertEqual(0, num_weight_nodes["int4"])
@@ -497,7 +639,7 @@ def test_ovmodel_hybrid_quantization_with_custom_dataset(
         "optimum.intel.openvino.configuration._DEFAULT_4BIT_CONFIGS", {"facebook/opt-125m": DEFAULT_INT4_CONFIG}
     )
     def test_ovmodel_4bit_auto_compression(self, model_cls, model_type, expected_ov_int8, expected_ov_int4):
-        with tempfile.TemporaryDirectory() as tmp_dir:
+        with TemporaryDirectory() as tmp_dir:
             model_id = MODEL_NAMES[model_type]
             model = model_cls.from_pretrained(model_id, export=True, quantization_config={"bits": 4})
             tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -518,26 +660,26 @@ def test_ovmodel_4bit_auto_compression(self, model_cls, model_type, expected_ov_
 
     @parameterized.expand(LOAD_IN_4_BITS_SCOPE)
     def test_ovmodel_4bit_auto_compression_with_config(
-        self, model_cls, model_name, quantization_config, expected_num_weight_nodes
+        self, model_cls, model_name, trust_remote_code, quantization_config, expected_num_weight_nodes
     ):
         model_id = MODEL_NAMES[model_name]
-        with tempfile.TemporaryDirectory() as tmp_dir:
+        with TemporaryDirectory() as tmp_dir:
             quantization_config = OVWeightQuantizationConfig.from_dict(quantization_config)
-            model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config)
+            model = model_cls.from_pretrained(
+                model_id, export=True, quantization_config=quantization_config, trust_remote_code=trust_remote_code
+            )
             if quantization_config.quant_method.lower() == "awq":
                 # TODO: Check that AWQ was actually applied
                 pass
 
-            tokenizer = AutoTokenizer.from_pretrained(model_id)
-            if tokenizer.pad_token is None:
-                tokenizer.pad_token = tokenizer.eos_token
+            ov_model = model.lm_model if model_cls == OVModelForVisualCausalLM else model.model
 
-            _, num_weight_nodes = get_num_quantized_nodes(model)
+            _, num_weight_nodes = get_num_quantized_nodes(ov_model)
             expected_num_weight_nodes.update({k: 0 for k in set(num_weight_nodes) - set(expected_num_weight_nodes)})
             self.assertEqual(expected_num_weight_nodes, num_weight_nodes)
             model.save_pretrained(tmp_dir)
 
-            wc_rt_info = model.model.get_rt_info()["nncf"]["weight_compression"]
+            wc_rt_info = ov_model.get_rt_info()["nncf"]["weight_compression"]
             self.assertEqual(quantization_config.quant_method.lower() == "awq", wc_rt_info["awq"].value == "True")
             self.assertEqual(
                 quantization_config.scale_estimation or False, wc_rt_info["scale_estimation"].value == "True"
@@ -559,8 +701,10 @@ def test_ovmodel_stateful_load_with_compressed_weights(self, model_cls, model_ty
         self.assertEqual(expected_ov_int8, num_weight_nodes["int8"])
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION)
-    def test_ovmodel_load_with_uncompressed_weights(self, model_cls, model_type):
-        model = model_cls.from_pretrained(MODEL_NAMES[model_type], export=True, load_in_8bit=False)
+    def test_ovmodel_load_with_uncompressed_weights(self, model_cls, model_type, trust_remote_code):
+        model = model_cls.from_pretrained(
+            MODEL_NAMES[model_type], export=True, load_in_8bit=False, trust_remote_code=trust_remote_code
+        )
         if model.export_feature.startswith("text2text-generation"):
             models = [model.encoder, model.decoder, model.decoder_with_past]
         elif model.export_feature == "text-to-image":
@@ -568,6 +712,9 @@ def test_ovmodel_load_with_uncompressed_weights(self, model_cls, model_type):
             models.append(model.text_encoder if model_type == "stable-diffusion" else model.text_encoder_2)
         elif model_type == "open-clip":
             models = [model.text_model, model.visual_model]
+        elif model.export_feature == "image-text-to-text":
+            models = [model.lm_model, model.vision_embeddings_model, model.text_embeddings_model]
+            models += [getattr(model, part) for part in model.additional_parts]
         else:
             models = [model]
 
@@ -655,23 +802,23 @@ def main_export_not_in_stacktrace(*args, **kwargs):
 
     @parameterized.expand(LOAD_IN_4_BITS_SCOPE)
     def test_ovmodel_4bit_dynamic_with_config(
-        self, model_cls, model_name, quantization_config, expected_num_weight_nodes
+        self, model_cls, model_name, trust_remote_code, quantization_config, expected_num_weight_nodes
     ):
         model_id = MODEL_NAMES[model_name]
-        with tempfile.TemporaryDirectory() as tmp_dir:
+        with TemporaryDirectory() as tmp_dir:
             group_size = quantization_config.pop("group_size", 32)
             quantization_config = OVDynamicQuantizationConfig(
                 weights_group_size=group_size, activations_group_size=group_size, **quantization_config
             )
-            model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config)
+            model = model_cls.from_pretrained(
+                model_id, export=True, quantization_config=quantization_config, trust_remote_code=trust_remote_code
+            )
             self.assertEqual(model.ov_config["DYNAMIC_QUANTIZATION_GROUP_SIZE"], str(group_size))
             self.assertEqual(model.ov_config["KV_CACHE_PRECISION"], "u8")
 
-            tokenizer = AutoTokenizer.from_pretrained(model_id)
-            if tokenizer.pad_token is None:
-                tokenizer.pad_token = tokenizer.eos_token
+            ov_model = model.lm_model if model_cls == OVModelForVisualCausalLM else model.model
 
-            _, num_weight_nodes = get_num_quantized_nodes(model)
+            _, num_weight_nodes = get_num_quantized_nodes(ov_model)
             expected_num_weight_nodes.update({k: 0 for k in set(num_weight_nodes) - set(expected_num_weight_nodes)})
             self.assertEqual(expected_num_weight_nodes, num_weight_nodes)
             model.save_pretrained(tmp_dir)
@@ -693,7 +840,7 @@ def preprocess_function(examples, tokenizer):
                 examples["question"], examples["context"], padding="max_length", max_length=64, truncation=True
             )
 
-        with tempfile.TemporaryDirectory() as tmp_dir:
+        with TemporaryDirectory() as tmp_dir:
             transformers_model = AutoModelForQuestionAnswering.from_pretrained(model_name)
             tokenizer = AutoTokenizer.from_pretrained(model_name)
             quantizer = OVQuantizer.from_pretrained(transformers_model)
@@ -734,7 +881,7 @@ def preprocess_function(examples, tokenizer):
                 examples["question"], examples["context"], padding="max_length", max_length=64, truncation=True
             )
 
-        with tempfile.TemporaryDirectory() as tmp_dir:
+        with TemporaryDirectory() as tmp_dir:
             transformers_model = OVModelForQuestionAnswering.from_pretrained(model_name, export=True)
             tokenizer = AutoTokenizer.from_pretrained(model_name)
             quantizer = OVQuantizer.from_pretrained(transformers_model)
@@ -768,9 +915,12 @@ def preprocess_function(examples, tokenizer):
 
 
 class OVTrainerTest(unittest.TestCase):
-    SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = (("albert", 64, 39),)
+    SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = (("albert", 63, 39),)
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS)
+    @unittest.skipIf(
+        is_transformers_version(">=", "4.46"), reason="OVTrainer is not compatible with transformers>=v4.46"
+    )
     def test_aware_training_quantization(self, model_name, expected_fake_quantize, expected_int8):
         model_id = MODEL_NAMES[model_name]
         model = AutoModelForSequenceClassification.from_pretrained(model_id, attn_implementation="eager")
@@ -787,7 +937,7 @@ def test_aware_training_quantization(self, model_name, expected_fake_quantize, e
         def compute_metrics(p):
             return metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids)
 
-        with tempfile.TemporaryDirectory() as tmp_dir:
+        with TemporaryDirectory() as tmp_dir:
             trainer = OVTrainer(
                 model=model,
                 ov_config=ov_config,
@@ -916,7 +1066,7 @@ def get_default_configurations() -> dict:
     @parameterized.expand(QUANTIZATION_CONFIGS)
     def test_config_serialization(self, quantization_config: OVQuantizationConfigBase):
         ov_config = OVConfig(quantization_config=quantization_config)
-        with tempfile.TemporaryDirectory() as tmp_dir:
+        with TemporaryDirectory() as tmp_dir:
             ov_config.save_pretrained(tmp_dir)
             loaded_ov_config = OVConfig.from_pretrained(tmp_dir)
 
diff --git a/tests/openvino/test_training.py b/tests/openvino/test_training.py
index 014b802688..76f7ec3197 100644
--- a/tests/openvino/test_training.py
+++ b/tests/openvino/test_training.py
@@ -12,6 +12,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
+import os
 import random
 import re
 import shutil
@@ -89,6 +90,10 @@ def initialize_movement_sparsifier_parameters_by_sparsity(
                 operand.bias_importance.copy_(bias_init_tensor)
 
 
+def is_windows():
+    return os.name == "nt"
+
+
 def is_avx_vnni_supported() -> bool:
     return any(re.search("avx.*vnni", flag.lower()) is not None for flag in cpuinfo.get_cpu_info()["flags"])
 
@@ -470,7 +475,10 @@ class OVTrainerTextClassificationTrainingTest(OVTrainerBaseTrainingTest):
     task = "sequence-classification"
 
     @parameterized.expand(OVTRAINER_TEXT_CLASSIFICATION_TEST_DESCRIPTORS.items())
-    @unittest.skipIf(is_transformers_version("<", "4.41.0"), reason="Mismatch in expected fake quantized op")
+    @unittest.skipIf(
+        is_transformers_version("<", "4.41") or is_transformers_version(">=", "4.46"),
+        reason="Mismatch in expected fake quantized op and incompatible with transformers v4.46",
+    )
     def test_training(self, _, desc: OVTrainerTestDescriptor):
         self.run_ovtrainer_training_checks(desc)
 
@@ -614,6 +622,7 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel):
 # TODO : can be moved to MODEL_NAMES["swin-window"] after transformers v4.42.3
 
 
+@unittest.skipIf(is_windows(), reason="Fails on Windows")
 class OVTrainerImageClassificationTrainingTest(OVTrainerBaseTrainingTest):
     ovmodel_cls = OVModelForImageClassification
     task = "image-classification"
@@ -621,7 +630,10 @@ class OVTrainerImageClassificationTrainingTest(OVTrainerBaseTrainingTest):
     @parameterized.expand(OVTRAINER_IMAGE_CLASSIFICATION_TEST_DESCRIPTORS.items())
     @pytest.mark.run_slow
     @slow
-    @unittest.skipIf(is_transformers_version("<", "4.41.0"), reason="Mismatch in expected fake quantized op")
+    @unittest.skipIf(
+        is_transformers_version("<", "4.41") or is_transformers_version(">=", "4.46"),
+        reason="Fake quantize op mismatch for transformers<v4.41; OVTrainer incompatible with transformers>=v4.46",
+    )
     def test_training(self, _, desc: OVTrainerTestDescriptor):
         self.run_ovtrainer_training_checks(desc)
 
@@ -794,6 +806,7 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel):
 }
 
 
+@unittest.skipIf(is_windows(), reason="Fails on Windows")
 class OVTrainerAudioClassificationTrainingTest(OVTrainerBaseTrainingTest):
     ovmodel_cls = OVModelForAudioClassification
     task = "audio-classification"
@@ -801,6 +814,9 @@ class OVTrainerAudioClassificationTrainingTest(OVTrainerBaseTrainingTest):
     @parameterized.expand(OVTRAINER_AUDIO_CLASSIFICATION_TEST_DESCRIPTORS.items())
     @pytest.mark.run_slow
     @slow
+    @unittest.skipIf(
+        is_transformers_version(">=", "4.46"), reason="OVTrainer is not compatible with transformers>=v4.46"
+    )
     def test_training(self, _, desc: OVTrainerTestDescriptor):
         self.run_ovtrainer_training_checks(desc)
 
diff --git a/tests/openvino/test_training_examples.py b/tests/openvino/test_training_examples.py
index 8a33ba42e0..023f9df7b8 100644
--- a/tests/openvino/test_training_examples.py
+++ b/tests/openvino/test_training_examples.py
@@ -15,7 +15,6 @@
 import os
 import subprocess
 import sys
-import tempfile
 import unittest
 from dataclasses import dataclass
 from pathlib import Path
@@ -25,7 +24,7 @@
 import torch.cuda
 from parameterized import parameterized
 
-from optimum.intel.openvino.utils import OV_XML_FILE_NAME
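+# TemporaryDirectory here is optimum-intel's own helper (presumably a drop-in replacement for
+# tempfile.TemporaryDirectory with more robust cleanup, e.g. on Windows).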
+from optimum.intel.openvino.utils import OV_XML_FILE_NAME, TemporaryDirectory
 
 
 PROJECT_ROOT = Path(__file__).parents[2]
@@ -148,7 +147,7 @@ def test_single_card_training(self, _, desc: TrainingExampleDescriptor):
             self.skipTest("No enough cuda devices.")
 
         self.env[CUDA_VISIBLE_DEVICES] = str(self.available_cuda_device_ids[0])
-        with tempfile.TemporaryDirectory() as output_dir:
+        with TemporaryDirectory() as output_dir:
             args = ["torchrun", "--nproc_per_node=1", desc.filename, *desc.get_args_with_output_dir(output_dir)]
             proc = subprocess.Popen(
                 args=args,
@@ -165,7 +164,7 @@ def test_data_parallel_training(self, _, desc: TrainingExampleDescriptor):
             self.skipTest("No enough cuda devices.")
 
         self.env[CUDA_VISIBLE_DEVICES] = ",".join(map(str, self.available_cuda_device_ids[:2]))
-        with tempfile.TemporaryDirectory() as output_dir:
+        with TemporaryDirectory() as output_dir:
             args = [sys.executable, desc.filename, *desc.get_args_with_output_dir(output_dir)]
             proc = subprocess.Popen(
                 args=args,
@@ -182,7 +181,7 @@ def test_distributed_data_parallel_training(self, _, desc: TrainingExampleDescri
             self.skipTest("No enough cuda devices.")
 
         self.env[CUDA_VISIBLE_DEVICES] = ",".join(map(str, self.available_cuda_device_ids[:2]))
-        with tempfile.TemporaryDirectory() as output_dir:
+        with TemporaryDirectory() as output_dir:
             args = [
                 "torchrun",
                 "--rdzv_backend=c10d",
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index d7eea01dba..b646b5b52a 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -13,6 +13,7 @@
 #  limitations under the License.
 
 import numpy as np
+import openvino as ov
 import torch
 
 
@@ -59,6 +60,7 @@
     "falcon": "fxmarty/really-tiny-falcon-testing",
     "falcon-40b": "katuni4ka/tiny-random-falcon-40b",
     "flaubert": "hf-internal-testing/tiny-random-flaubert",
+    "flux": "katuni4ka/tiny-random-flux",
     "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel",
     "gpt2": "hf-internal-testing/tiny-random-gpt2",
     "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel",
@@ -69,13 +71,14 @@
     "ibert": "hf-internal-testing/tiny-random-ibert",
     "internlm": "katuni4ka/tiny-random-internlm",
     "internlm2": "katuni4ka/tiny-random-internlm2",
+    "internvl2": "katuni4ka/tiny-random-internvl2",
     "jais": "katuni4ka/tiny-random-jais",
     "levit": "hf-internal-testing/tiny-random-LevitModel",
     "longt5": "hf-internal-testing/tiny-random-longt5",
     "llama": "HuggingFaceM4/tiny-random-LlamaForCausalLM",
     "llama_awq": "HuggingFaceH4/tiny-random-LlamaForCausalLM",
     "llama_gptq": "hf-internal-testing/TinyLlama-1.1B-Chat-v0.3-GPTQ",
-    "llava": "trl-internal-testing/tiny-random-LlavaForConditionalGeneration",
+    "llava": "katuni4ka/tiny-random-llava",
     "llava_next": "katuni4ka/tiny-random-llava-next",
     "m2m_100": "hf-internal-testing/tiny-random-m2m_100",
     "opt": "hf-internal-testing/tiny-random-OPTModel",
@@ -83,6 +86,7 @@
     "marian": "sshleifer/tiny-marian-en-de",
     "mbart": "hf-internal-testing/tiny-random-mbart",
     "minicpm": "katuni4ka/tiny-random-minicpm",
+    "minicpmv": "katuni4ka/tiny-random-minicpmv-2_6",
     "mistral": "echarlaix/tiny-random-mistral",
     "mistral-nemo": "katuni4ka/tiny-random-mistral-nemo",
     "mixtral": "TitanML/tiny-mixtral",
@@ -93,6 +97,8 @@
     "mpt": "hf-internal-testing/tiny-random-MptForCausalLM",
     "mpnet": "hf-internal-testing/tiny-random-MPNetModel",
     "mt5": "stas/mt5-tiny-random",
+    "nanollava": "katuni4ka/tiny-random-nanollava",
+    "nanollava_vision_tower": "katuni4ka/tiny-random-siglip",
     "nystromformer": "hf-internal-testing/tiny-random-NystromformerModel",
     "olmo": "katuni4ka/tiny-random-olmo-hf",
     "orion": "katuni4ka/tiny-random-orion",
@@ -103,6 +109,7 @@
     "pix2struct": "fxmarty/pix2struct-tiny-random",
     "phi": "echarlaix/tiny-random-PhiForCausalLM",
     "phi3": "Xenova/tiny-random-Phi3ForCausalLM",
+    "phi3_v": "katuni4ka/tiny-random-phi3-vision",
     "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel",
     "qwen": "katuni4ka/tiny-random-qwen",
     "qwen2": "fxmarty/tiny-dummy-qwen2",
@@ -118,6 +125,7 @@
     "stable-diffusion-openvino": "hf-internal-testing/tiny-stable-diffusion-openvino",
     "stable-diffusion-xl": "echarlaix/tiny-random-stable-diffusion-xl",
     "stable-diffusion-xl-refiner": "echarlaix/tiny-random-stable-diffusion-xl-refiner",
+    "stable-diffusion-3": "yujiepan/stable-diffusion-3-tiny-random",
     "stablelm": "hf-internal-testing/tiny-random-StableLmForCausalLM",
     "starcoder2": "hf-internal-testing/tiny-random-Starcoder2ForCausalLM",
     "latent-consistency": "echarlaix/tiny-random-latent-consistency",
@@ -146,6 +154,8 @@
     "glm4": "katuni4ka/tiny-random-glm4",
     "open-clip": "hf-internal-testing/tiny-open-clip-model",
     "open-clip-ov": "zofinka/tiny-open-clip-model",
+    "st-bert": "sentence-transformers/all-MiniLM-L6-v2",
+    "st-mpnet": "sentence-transformers/all-mpnet-base-v2",
 }
 
 
@@ -170,18 +180,28 @@
     "stable-diffusion-xl": (366, 34, 42, 66),
     "stable-diffusion-xl-refiner": (366, 34, 42, 66),
     "open-clip": (20, 28),
+    "stable-diffusion-3": (66, 42, 58, 30),
+    "flux": (56, 24, 28, 64),
+    "llava": (30, 9, 1),
+    "llava_next": (30, 9, 1),
+    "minicpmv": (30, 26, 1, 6),
+    "nanollava": (30, 15, 1),
 }
 
+TEST_IMAGE_URL = "http://images.cocodataset.org/val2017/000000039769.jpg"
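+# TEST_IMAGE_URL: COCO val2017 sample, presumably shared by the vision/multimodal model tests as a default input.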
 
-def get_num_quantized_nodes(ov_model):
+
+def get_num_quantized_nodes(model):
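+    # Counts FakeQuantize ops and low-precision weight constants; `model` may be either a raw
+    # openvino.Model or a wrapper (e.g. an OVModel) exposing the graph via `.model`.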
     num_fake_quantize = 0
     num_weight_nodes = {
         "int8": 0,
         "int4": 0,
         "f4e2m1": 0,
         "f8e8m0": 0,
+        "nf4": 0,
     }
-    for elem in ov_model.model.get_ops():
+    ov_model = model if isinstance(model, ov.Model) else model.model
+    for elem in ov_model.get_ops():
         if "FakeQuantize" in elem.name:
             num_fake_quantize += 1
         for i in range(elem.get_output_size()):
@@ -194,4 +214,6 @@ def get_num_quantized_nodes(ov_model):
                 num_weight_nodes["f4e2m1"] += 1
             if type_name == "f8e8m0":
                 num_weight_nodes["f8e8m0"] += 1
+            if type_name == "nf4":
+                num_weight_nodes["nf4"] += 1
     return num_fake_quantize, num_weight_nodes