Port fixes from master to 2024.5.1 / 2024.6.0 #1239

Merged
10 changes: 5 additions & 5 deletions .github/workflows/linux.yml
@@ -259,19 +259,19 @@ jobs:
run: |
source ${OV_INSTALL_DIR}/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager
- python -m pytest ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
+ python -m pytest -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
env:
PYTHONPATH: "./build/:$PYTHONPATH"

- name: Test bindings (wheel)
run: |
source ${OV_INSTALL_DIR}/setupvars.sh
python -m pip install . --verbose --find-links ${OV_INSTALL_DIR}/wheels
- python -m pytest ./tests/python_tests --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template"
+ python -m pytest -v ./tests/python_tests --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template"

- run: >
source ${OV_INSTALL_DIR}/setupvars.sh
- && python -m pytest ./tests/python_tests/test_vlm_api.py
+ && python -m pytest -v ./tests/python_tests/test_vlm_api.py

genai_python_lib_whisper:
name: OpenVINO genai extension whisper tests (cmake + wheel)
@@ -350,15 +350,15 @@ jobs:
run: |
source ${OV_INSTALL_DIR}/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager
- python -m pytest ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
+ python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
env:
PYTHONPATH: "./build/:$PYTHONPATH"

- name: Test bindings (wheel)
run: |
source ${OV_INSTALL_DIR}/setupvars.sh
python -m pip install . --verbose --find-links ${OV_INSTALL_DIR}/wheels
- python -m pytest ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke"
+ python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke"

genai_package:
name: OpenVINO genai extension (install to OpenVINO package)
10 changes: 8 additions & 2 deletions .github/workflows/llm_bench-python.yml
@@ -82,6 +82,12 @@ jobs:
run: |
wget -O ./ov_models/soulcard.safetensors https://civitai.com/api/download/models/72591
python ./tools/llm_bench/benchmark.py -m ./ov_models/dreamlike-art-dreamlike-anime-1.0/FP16/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --genai --lora ./ov_models/soulcard.safetensors --lora_alphas 0.7
+ - name: Test TinyLlama-1.1B-Chat-v1.0 in Speculative Decoding mode on Linux
+ run: |
+ optimum-cli export openvino --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --trust-remote-code --weight-format fp16 ov_models/TinyLlama-1.1B-Chat-v1.0/FP16
+ optimum-cli export openvino --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --trust-remote-code --weight-format int8 ov_models/TinyLlama-1.1B-Chat-v1.0/INT8
+ python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --genai --assistant_confidence_threshold 0.4
+ python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --genai --num_assistant_tokens 5
- name: Test whisper-tiny on Linux
run: |
GIT_LFS_SKIP_SMUDGE=1 git clone --depth 1 --branch main --single-branch https://huggingface.co/datasets/facebook/multilingual_librispeech
@@ -99,7 +105,7 @@ jobs:
pip install git+https://github.com/huggingface/optimum.git
GIT_CLONE_PROTECTION_ACTIVE=false pip install ${{ env.WWB_PATH }}
python -m pip install -U --pre openvino openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --force-reinstall
- python -m pytest tools/who_what_benchmark/tests
+ python -m pytest -v tools/who_what_benchmark/tests
stateful:
runs-on: ubuntu-20.04
steps:
@@ -121,4 +127,4 @@ jobs:
GIT_CLONE_PROTECTION_ACTIVE=false pip install tools/who_what_benchmark/
pip install pytest
python -m pip install -U --pre openvino openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --force-reinstall
- python -m pytest tools/who_what_benchmark/tests
+ python -m pytest -v tools/who_what_benchmark/tests
8 changes: 4 additions & 4 deletions .github/workflows/mac.yml
@@ -225,7 +225,7 @@ jobs:
run: |
source ${OV_INSTALL_DIR}/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager
- python -m pytest ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
+ python -m pytest -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
env:
PYTHONPATH: "./build/:$PYTHONPATH"

@@ -234,7 +234,7 @@ jobs:
source ${OV_INSTALL_DIR}/setupvars.sh
python -m pip install . --verbose --find-links ${OV_INSTALL_DIR}/wheels
python -c "from openvino_genai import LLMPipeline"
- python -m pytest ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template"
+ python -m pytest -v ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template"

genai_python_lib_whisper:
name: OpenVINO genai extension whisper tests (cmake + wheel)
@@ -289,7 +289,7 @@ jobs:
run: |
source ${OV_INSTALL_DIR}/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager
- python -m pytest ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
+ python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
env:
PYTHONPATH: "./build/:$PYTHONPATH"

@@ -298,7 +298,7 @@ jobs:
source ${OV_INSTALL_DIR}/setupvars.sh
python -m pip install . --verbose --find-links ${OV_INSTALL_DIR}/wheels
python -c "from openvino_genai import LLMPipeline"
- python -m pytest ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke"
+ python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke"

genai_package:
name: OpenVINO genai extension (install to OpenVINO package)
10 changes: 5 additions & 5 deletions .github/workflows/windows.yml
@@ -236,15 +236,15 @@ jobs:
run: |
. "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels --upgrade-strategy eager
- python -m pytest ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
+ python -m pytest -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
env:
PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.

- name: Test bindings (wheel)
run: |
. "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
python -m pip install . --verbose
- python -m pytest ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template"
+ python -m pytest -v ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template"

genai_python_lib_whisper:
name: OpenVINO genai extension whisper tests (cmake + wheel)
@@ -300,15 +300,15 @@ jobs:
run: |
. "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels --upgrade-strategy eager
- python -m pytest ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
+ python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
env:
PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.

- name: Test bindings (wheel)
run: |
. "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
python -m pip install . --verbose
- python -m pytest ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke"
+ python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke"

genai_python_lib_vlm:
name: OpenVINO genai VLM tests (cmake + wheel)
@@ -364,7 +364,7 @@ jobs:
run: |
. "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels --upgrade-strategy eager
- python -m pytest ./tests/python_tests/test_vlm_api.py
+ python -m pytest -v ./tests/python_tests/test_vlm_api.py
env:
PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.

1 change: 1 addition & 0 deletions .gitignore
@@ -24,6 +24,7 @@ temp/
.repo/
CMakeLists.txt.user
CMakeUserPresets.json
+ .env

*.project
*.cproject
9 changes: 8 additions & 1 deletion README.md
@@ -108,7 +108,11 @@ For more examples check out our [LLM Inference Guide](https://docs.openvino.ai/2
### Converting and compressing the model from Hugging Face library

```sh
- optimum-cli export openvino --model openbmb/MiniCPM-V-2_6 --trust-remote-code MiniCPM-V-2_6
+ # (Basic) Download and convert the MiniCPM-V-2_6 model to OpenVINO
+ optimum-cli export openvino --model openbmb/MiniCPM-V-2_6 --trust-remote-code --weight-format fp16 MiniCPM-V-2_6

+ # (Recommended) Same as above, but with compression: the language model is compressed to int4, other model components to int8
+ optimum-cli export openvino --model openbmb/MiniCPM-V-2_6 --trust-remote-code --weight-format int4 MiniCPM-V-2_6
```

### Run generation using VLMPipeline API in Python
@@ -159,6 +163,9 @@ For more examples check out our [LLM Inference Guide](https://docs.openvino.ai/2
```sh
#Download and convert to OpenVINO dreamlike-anime-1.0 model
optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion --weight-format fp16 dreamlike_anime_1_0_ov/FP16

+ # You can also use INT8 hybrid quantization to further optimize the model and reduce inference latency
+ optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion --weight-format int8 --dataset conceptual_captions dreamlike_anime_1_0_ov/INT8
```

### Run generation using Text2Image API in Python
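
Either export above can be loaded directly by the Text2Image API; the INT8 hybrid-quantized variant simply trades some fidelity for lower latency. A minimal usage sketch along the lines of the GenAI text2image samples (argument names assumed, not verified against this branch):

```python
import openvino_genai as ov_genai
from PIL import Image

# Point at either dreamlike_anime_1_0_ov/FP16 or .../INT8 from the commands above.
pipe = ov_genai.Text2ImagePipeline("dreamlike_anime_1_0_ov/INT8", "CPU")
image_tensor = pipe.generate(
    "anime portrait of a girl under cherry blossoms",  # illustrative prompt
    width=512,
    height=512,
    num_inference_steps=20,
)
Image.fromarray(image_tensor.data[0]).save("image.png")
```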
4 changes: 2 additions & 2 deletions samples/python/text2image/README.md
@@ -63,6 +63,6 @@ With adapter | Without adapter
![](./lora.bmp) | ![](./baseline.bmp)


- # Fuse LoRA adapters into model weights
+ ## Fuse LoRA adapters into model weights

To maximize inference performance using a LoRA adapter, refer to `lora_fuse.py`, which demonstrates fusing the adapter into the model weights. This approach achieves the same performance as the base model without a LoRA adapter but removes the flexibility to switch adapters between generate calls. This mode is ideal when performing multiple generations with the same LoRA adapters and blending alpha parameters, and when model recompilation on adapter changes is feasible. The example outputs the resulting image as `lora.bmp`.
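
For readers who want to try the fused mode, a hedged sketch of the flow `lora_fuse.py` describes; the `AdapterConfig` constructor and `MODE_FUSE` enum spelling are assumptions carried over from the C++ API, so treat this as pseudocode rather than the sample itself:

```python
import openvino_genai as ov_genai
from PIL import Image

# Fusing bakes the adapter into the weights at compile time: inference runs
# at base-model speed, but changing adapters later means recompiling.
adapter = ov_genai.Adapter("soulcard.safetensors")
adapters = ov_genai.AdapterConfig(adapter, 0.7, ov_genai.AdapterConfig.Mode.MODE_FUSE)

pipe = ov_genai.Text2ImagePipeline("dreamlike_anime_1_0_ov/FP16", "CPU", adapters=adapters)
image_tensor = pipe.generate("curly-haired unicorn in the forest", width=512, height=512)
Image.fromarray(image_tensor.data[0]).save("lora.bmp")
```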
17 changes: 14 additions & 3 deletions src/cpp/src/continuous_batching_impl.cpp
@@ -17,10 +17,11 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl(
const std::string& device,
const ov::AnyMap& properties) {
m_tokenizer = tokenizer;
+ m_generation_config = utils::from_config_json_if_exists(models_path);

ov::Core core;

- auto [core_properties, compile_properties] = ov::genai::utils::split_core_complile_config(properties);
+ auto [core_properties, compile_properties] = utils::split_core_complile_config(properties);
core.set_property(core_properties);

// The model can be compiled for GPU as well
@@ -74,15 +75,22 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::init(
m_model_runner = std::make_shared<ModelRunner>(infer_request, m_scheduler->get_block_size(), device_config.get_num_layers(), is_use_cache_eviction);
m_sampler = std::make_shared<Sampler>(m_tokenizer);
m_sampler->set_seed(m_generation_config.rng_seed);

+ // If eos_token_id was not provided, take the value from the tokenizer
+ if (m_generation_config.eos_token_id == -1)
+     m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id());
};


GenerationHandle
ContinuousBatchingPipeline::ContinuousBatchingImpl::add_request(uint64_t request_id,
const ov::Tensor& input_ids,
ov::genai::GenerationConfig sampling_params) {
- sampling_params.set_eos_token_id(m_tokenizer.get_eos_token_id());
+ // If eos_token_id was not provided, take the value from the default m_generation_config
+ if (sampling_params.eos_token_id == -1)
+     sampling_params.set_eos_token_id(m_generation_config.eos_token_id);
sampling_params.validate();

SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(request_id, input_ids,
sampling_params,
m_scheduler->get_block_size(),
@@ -262,6 +270,9 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<o
}
}, streamer);

+ OPENVINO_ASSERT(streamer_ptr == nullptr || input_ids.size() == 1 && (sampling_params[0].is_greedy_decoding() || sampling_params[0].is_multinomial()),
+     "Currently streaming is possible only with batch size=1 and only for greedy or multinomial decoding");

std::vector<GenerationHandle> generations;
for (size_t request_id = 0; request_id < input_ids.size(); ++request_id) {
OPENVINO_ASSERT(1 == input_ids[request_id].get_shape().at(0), "Use multiple tensors to pass a batch.");
@@ -283,7 +294,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<o
m_requests.clear();
};

- bool continue_generation = true, step_throws_exception = false;
+ bool continue_generation = true;
while (has_non_finished_requests() && continue_generation) {
try {
step();
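
Two behavioral changes fall out of the hunks above: a request whose `GenerationConfig` leaves `eos_token_id` at -1 now inherits the pipeline default (itself seeded from `generation_config.json` if present, otherwise the tokenizer), and streaming is asserted to work only with batch size 1 under greedy or multinomial sampling. A rough Python illustration of the first point (constructor and result-field names assumed from the GenAI bindings):

```python
import openvino_genai as ov_genai

pipe = ov_genai.ContinuousBatchingPipeline(
    "ov_models/TinyLlama-1.1B-Chat-v1.0/FP16",  # path reused from the bench step above
    ov_genai.SchedulerConfig(),
    "CPU",
)

config = ov_genai.GenerationConfig()
config.max_new_tokens = 64
# eos_token_id is deliberately left at -1: add_request back-fills it from
# the pipeline's default generation config rather than the raw tokenizer.

results = pipe.generate(["Why is the Sun yellow?"], [config])
print(results[0].m_generation_ids[0])
```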
2 changes: 1 addition & 1 deletion src/cpp/src/continuous_batching_impl_interface.hpp
@@ -17,7 +17,7 @@ class ContinuousBatchingPipeline::ImplInterface {
Tokenizer m_tokenizer;

// TODO (mzegla): GenerationConfig is request specific object
- // and pipeline only uses default rng_seed.
+ // and pipeline only uses default rng_seed and some special tokens.
ov::genai::GenerationConfig m_generation_config;

PipelineMetrics m_pipeline_metrics;
5 changes: 2 additions & 3 deletions src/cpp/src/generation_config.cpp
@@ -171,9 +171,9 @@ void GenerationConfig::validate() const {
}
if (is_speculative_decoding()) {
if (assistant_confidence_threshold != 0.f) {
- OPENVINO_ASSERT(num_assistant_tokens == 0, "Parameters `assistant_confidence_threshold` and `num_assistant_tokens` are mutually excluded in `GenerationConfig`");
+ OPENVINO_ASSERT(num_assistant_tokens == 0, "Parameters `assistant_confidence_threshold` and `num_assistant_tokens` are mutually exclusive in `GenerationConfig`");
} else {
- OPENVINO_ASSERT(num_assistant_tokens > 0, "Parameters `assistant_confidence_threshold` and `num_assistant_tokens` are mutually excluded in `GenerationConfig`");
+ OPENVINO_ASSERT(num_assistant_tokens > 0, "Parameters `assistant_confidence_threshold` and `num_assistant_tokens` are mutually exclusive in `GenerationConfig`");
};
}
}
@@ -208,6 +208,5 @@ GenerationConfig multinomial() {
return multinomial_config;
}


} // namespace genai
} // namespace ov
21 changes: 21 additions & 0 deletions src/cpp/src/image_generation/diffusion_pipeline.hpp
@@ -36,6 +36,27 @@ const std::string get_class_name(const std::filesystem::path& root_dir) {
return data["_class_name"].get<std::string>();
}

+ ov::Tensor get_guidance_scale_embedding(float guidance_scale, uint32_t embedding_dim) {
+     float w = guidance_scale * 1000;
+     uint32_t half_dim = embedding_dim / 2;
+     float emb = std::log(10000) / (half_dim - 1);
+
+     ov::Shape embedding_shape = {1, embedding_dim};
+     ov::Tensor w_embedding(ov::element::f32, embedding_shape);
+     float* w_embedding_data = w_embedding.data<float>();
+
+     for (size_t i = 0; i < half_dim; ++i) {
+         float temp = std::exp((i * (-emb))) * w;
+         w_embedding_data[i] = std::sin(temp);
+         w_embedding_data[i + half_dim] = std::cos(temp);
+     }
+
+     if (embedding_dim % 2 == 1)
+         w_embedding_data[embedding_dim - 1] = 0;
+
+     return w_embedding;
+ }

} // namespace


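
The `get_guidance_scale_embedding` helper added above computes the sinusoidal guidance-scale embedding that LCM-style UNets take in place of classifier-free guidance. A NumPy restatement of the same arithmetic, useful for cross-checking the C++ against diffusers (illustrative, not part of the PR):

```python
import numpy as np

def get_guidance_scale_embedding(guidance_scale: float, embedding_dim: int) -> np.ndarray:
    w = guidance_scale * 1000.0
    half_dim = embedding_dim // 2
    emb = np.log(10000.0) / (half_dim - 1)
    # Geometrically spaced frequencies, scaled by the guidance weight.
    freqs = np.exp(np.arange(half_dim) * -emb) * w
    embedding = np.concatenate([np.sin(freqs), np.cos(freqs)])
    if embedding_dim % 2 == 1:
        embedding = np.append(embedding, 0.0)  # zero-fill the odd final slot
    return embedding[np.newaxis, :].astype(np.float32)  # shape (1, embedding_dim)
```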
2 changes: 1 addition & 1 deletion src/cpp/src/image_generation/schedulers/lcm.cpp
@@ -99,7 +99,7 @@ void LCMScheduler::set_timesteps(size_t num_inference_steps, float strength) {
assert(skipping_step >= 1 && "The combination of `original_steps x strength` is smaller than `num_inference_steps`");

// LCM Inference Steps Schedule
- std::reverse(lcm_origin_timesteps.begin(),lcm_origin_timesteps.end());
+ std::reverse(lcm_origin_timesteps.begin(), lcm_origin_timesteps.end());

using numpy_utils::linspace;
// v1. based on https://github.com/huggingface/diffusers/blame/2a7f43a73bda387385a47a15d7b6fe9be9c65eb2/src/diffusers/schedulers/scheduling_lcm.py#L387