Port fixes from master to 2024.5.1 / 2024.6.0 #1239

Merged
10 changes: 5 additions & 5 deletions .github/workflows/linux.yml
@@ -259,19 +259,19 @@ jobs:
run: |
source ${OV_INSTALL_DIR}/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager
- python -m pytest ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
+ python -m pytest -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
env:
PYTHONPATH: "./build/:$PYTHONPATH"

- name: Test bindings (wheel)
run: |
source ${OV_INSTALL_DIR}/setupvars.sh
python -m pip install . --verbose --find-links ${OV_INSTALL_DIR}/wheels
- python -m pytest ./tests/python_tests --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template"
+ python -m pytest -v ./tests/python_tests --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template"

- run: >
source ${OV_INSTALL_DIR}/setupvars.sh
- && python -m pytest ./tests/python_tests/test_vlm_api.py
+ && python -m pytest -v ./tests/python_tests/test_vlm_api.py

genai_python_lib_whisper:
name: OpenVINO genai extension whisper tests (cmake + wheel)
@@ -350,15 +350,15 @@ jobs:
run: |
source ${OV_INSTALL_DIR}/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager
- python -m pytest ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
+ python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
env:
PYTHONPATH: "./build/:$PYTHONPATH"

- name: Test bindings (wheel)
run: |
source ${OV_INSTALL_DIR}/setupvars.sh
python -m pip install . --verbose --find-links ${OV_INSTALL_DIR}/wheels
- python -m pytest ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke"
+ python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke"

genai_package:
name: OpenVINO genai extension (install to OpenVINO package)
10 changes: 8 additions & 2 deletions .github/workflows/llm_bench-python.yml
@@ -82,6 +82,12 @@ jobs:
run: |
wget -O ./ov_models/soulcard.safetensors https://civitai.com/api/download/models/72591
python ./tools/llm_bench/benchmark.py -m ./ov_models/dreamlike-art-dreamlike-anime-1.0/FP16/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --genai --lora ./ov_models/soulcard.safetensors --lora_alphas 0.7
+ - name: Test TinyLlama-1.1B-Chat-v1.0 in Speculative Decoding mode on Linux
+ run: |
+ optimum-cli export openvino --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --trust-remote-code --weight-format fp16 ov_models/TinyLlama-1.1B-Chat-v1.0/FP16
+ optimum-cli export openvino --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --trust-remote-code --weight-format int8 ov_models/TinyLlama-1.1B-Chat-v1.0/INT8
+ python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --genai --assistant_confidence_threshold 0.4
+ python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --genai --num_assistant_tokens 5
- name: Test whisper-tiny on Linux
run: |
GIT_LFS_SKIP_SMUDGE=1 git clone --depth 1 --branch main --single-branch https://huggingface.co/datasets/facebook/multilingual_librispeech
@@ -99,7 +105,7 @@ jobs:
pip install git+https://github.com/huggingface/optimum.git
GIT_CLONE_PROTECTION_ACTIVE=false pip install ${{ env.WWB_PATH }}
python -m pip install -U --pre openvino openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --force-reinstall
- python -m pytest tools/who_what_benchmark/tests
+ python -m pytest -v tools/who_what_benchmark/tests
stateful:
runs-on: ubuntu-20.04
steps:
@@ -121,4 +127,4 @@ jobs:
GIT_CLONE_PROTECTION_ACTIVE=false pip install tools/who_what_benchmark/
pip install pytest
python -m pip install -U --pre openvino openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --force-reinstall
- python -m pytest tools/who_what_benchmark/tests
+ python -m pytest -v tools/who_what_benchmark/tests
8 changes: 4 additions & 4 deletions .github/workflows/mac.yml
@@ -225,7 +225,7 @@ jobs:
run: |
source ${OV_INSTALL_DIR}/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager
- python -m pytest ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
+ python -m pytest -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
env:
PYTHONPATH: "./build/:$PYTHONPATH"

@@ -234,7 +234,7 @@ jobs:
source ${OV_INSTALL_DIR}/setupvars.sh
python -m pip install . --verbose --find-links ${OV_INSTALL_DIR}/wheels
python -c "from openvino_genai import LLMPipeline"
- python -m pytest ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template"
+ python -m pytest -v ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template"

genai_python_lib_whisper:
name: OpenVINO genai extension whisper tests (cmake + wheel)
@@ -289,7 +289,7 @@ jobs:
run: |
source ${OV_INSTALL_DIR}/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager
- python -m pytest ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
+ python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
env:
PYTHONPATH: "./build/:$PYTHONPATH"

@@ -298,7 +298,7 @@ jobs:
source ${OV_INSTALL_DIR}/setupvars.sh
python -m pip install . --verbose --find-links ${OV_INSTALL_DIR}/wheels
python -c "from openvino_genai import LLMPipeline"
- python -m pytest ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke"
+ python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke"

genai_package:
name: OpenVINO genai extension (install to OpenVINO package)
10 changes: 5 additions & 5 deletions .github/workflows/windows.yml
@@ -236,15 +236,15 @@ jobs:
run: |
. "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels --upgrade-strategy eager
- python -m pytest ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
+ python -m pytest -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
env:
PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.

- name: Test bindings (wheel)
run: |
. "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
python -m pip install . --verbose
- python -m pytest ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template"
+ python -m pytest -v ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template"

genai_python_lib_whisper:
name: OpenVINO genai extension whisper tests (cmake + wheel)
@@ -300,15 +300,15 @@ jobs:
run: |
. "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels --upgrade-strategy eager
- python -m pytest ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
+ python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
env:
PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.

- name: Test bindings (wheel)
run: |
. "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
python -m pip install . --verbose
- python -m pytest ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke"
+ python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke"

genai_python_lib_vlm:
name: OpenVINO genai VLM tests (cmake + wheel)
@@ -364,7 +364,7 @@ jobs:
run: |
. "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels --upgrade-strategy eager
- python -m pytest ./tests/python_tests/test_vlm_api.py
+ python -m pytest -v ./tests/python_tests/test_vlm_api.py
env:
PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.

1 change: 1 addition & 0 deletions .gitignore
@@ -24,6 +24,7 @@ temp/
.repo/
CMakeLists.txt.user
CMakeUserPresets.json
+ .env

*.project
*.cproject
9 changes: 8 additions & 1 deletion README.md
@@ -108,7 +108,11 @@ For more examples check out our [LLM Inference Guide](https://docs.openvino.ai/2
### Converting and compressing the model from Hugging Face library

```sh
- optimum-cli export openvino --model openbmb/MiniCPM-V-2_6 --trust-remote-code MiniCPM-V-2_6
+ # (Basic) Download and convert the MiniCPM-V-2_6 model to OpenVINO
+ optimum-cli export openvino --model openbmb/MiniCPM-V-2_6 --trust-remote-code --weight-format fp16 MiniCPM-V-2_6

+ # (Recommended) Same as above, but with compression: the language model is compressed to int4, other model components to int8
+ optimum-cli export openvino --model openbmb/MiniCPM-V-2_6 --trust-remote-code --weight-format int4 MiniCPM-V-2_6
```

### Run generation using VLMPipeline API in Python
@@ -159,6 +163,9 @@ For more examples check out our [LLM Inference Guide](https://docs.openvino.ai/2
```sh
#Download and convert to OpenVINO dreamlike-anime-1.0 model
optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion --weight-format fp16 dreamlike_anime_1_0_ov/FP16

+ # You can also use INT8 hybrid quantization to further optimize the model and reduce inference latency
+ optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion --weight-format int8 --dataset conceptual_captions dreamlike_anime_1_0_ov/INT8
```

### Run generation using Text2Image API in Python
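
Either export above can be loaded directly by the Text2Image API; the INT8 hybrid-quantized variant simply trades some fidelity for lower latency. A minimal usage sketch along the lines of the GenAI text2image samples (argument names assumed, not verified against this branch):

```python
import openvino_genai as ov_genai
from PIL import Image

# Point at either dreamlike_anime_1_0_ov/FP16 or .../INT8 from the commands above.
pipe = ov_genai.Text2ImagePipeline("dreamlike_anime_1_0_ov/INT8", "CPU")
image_tensor = pipe.generate(
    "anime portrait of a girl under cherry blossoms",  # illustrative prompt
    width=512,
    height=512,
    num_inference_steps=20,
)
Image.fromarray(image_tensor.data[0]).save("image.png")
```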
4 changes: 2 additions & 2 deletions samples/python/text2image/README.md
@@ -63,6 +63,6 @@ With adapter | Without adapter
![](./lora.bmp) | ![](./baseline.bmp)


- # Fuse LoRA adapters into model weights
+ ## Fuse LoRA adapters into model weights

To maximize inference performance using a LoRA adapter, refer to `lora_fuse.py`, which demonstrates fusing the adapter into the model weights. This approach achieves the same performance as the base model without a LoRA adapter but removes the flexibility to switch adapters between generate calls. This mode is ideal when performing multiple generations with the same LoRA adapters and blending alpha parameters, and when model recompilation on adapter changes is feasible. The example outputs the resulting image as `lora.bmp`.
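
For readers who want to try the fused mode, a hedged sketch of the flow `lora_fuse.py` describes; the `AdapterConfig` constructor and `MODE_FUSE` enum spelling are assumptions carried over from the C++ API, so treat this as pseudocode rather than the sample itself:

```python
import openvino_genai as ov_genai
from PIL import Image

# Fusing bakes the adapter into the weights at compile time: inference runs
# at base-model speed, but changing adapters later means recompiling.
adapter = ov_genai.Adapter("soulcard.safetensors")
adapters = ov_genai.AdapterConfig(adapter, 0.7, ov_genai.AdapterConfig.Mode.MODE_FUSE)

pipe = ov_genai.Text2ImagePipeline("dreamlike_anime_1_0_ov/FP16", "CPU", adapters=adapters)
image_tensor = pipe.generate("curly-haired unicorn in the forest", width=512, height=512)
Image.fromarray(image_tensor.data[0]).save("lora.bmp")
```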
17 changes: 14 additions & 3 deletions src/cpp/src/continuous_batching_impl.cpp
@@ -17,10 +17,11 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl(
const std::string& device,
const ov::AnyMap& properties) {
m_tokenizer = tokenizer;
+ m_generation_config = utils::from_config_json_if_exists(models_path);

ov::Core core;

- auto [core_properties, compile_properties] = ov::genai::utils::split_core_complile_config(properties);
+ auto [core_properties, compile_properties] = utils::split_core_complile_config(properties);
core.set_property(core_properties);

// The model can be compiled for GPU as well
@@ -74,15 +75,22 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::init(
m_model_runner = std::make_shared<ModelRunner>(infer_request, m_scheduler->get_block_size(), device_config.get_num_layers(), is_use_cache_eviction);
m_sampler = std::make_shared<Sampler>(m_tokenizer);
m_sampler->set_seed(m_generation_config.rng_seed);

+ // If eos_token_id was not provided, take the value from the tokenizer
+ if (m_generation_config.eos_token_id == -1)
+     m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id());
};


GenerationHandle
ContinuousBatchingPipeline::ContinuousBatchingImpl::add_request(uint64_t request_id,
const ov::Tensor& input_ids,
ov::genai::GenerationConfig sampling_params) {
- sampling_params.set_eos_token_id(m_tokenizer.get_eos_token_id());
+ // If eos_token_id was not provided, take the value from the default m_generation_config
+ if (sampling_params.eos_token_id == -1)
+     sampling_params.set_eos_token_id(m_generation_config.eos_token_id);
sampling_params.validate();

SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(request_id, input_ids,
sampling_params,
m_scheduler->get_block_size(),
@@ -262,6 +270,9 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<o
}
}, streamer);

+ OPENVINO_ASSERT(streamer_ptr == nullptr || input_ids.size() == 1 && (sampling_params[0].is_greedy_decoding() || sampling_params[0].is_multinomial()),
+     "Currently streaming is possible only with batch size=1 and only for greedy or multinomial decoding");

std::vector<GenerationHandle> generations;
for (size_t request_id = 0; request_id < input_ids.size(); ++request_id) {
OPENVINO_ASSERT(1 == input_ids[request_id].get_shape().at(0), "Use multiple tensors to pass a batch.");
@@ -283,7 +294,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<o
m_requests.clear();
};

- bool continue_generation = true, step_throws_exception = false;
+ bool continue_generation = true;
while (has_non_finished_requests() && continue_generation) {
try {
step();
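
Two behavioral changes fall out of the hunks above: a request whose `GenerationConfig` leaves `eos_token_id` at -1 now inherits the pipeline default (itself seeded from `generation_config.json` if present, otherwise the tokenizer), and streaming is asserted to work only with batch size 1 under greedy or multinomial sampling. A rough Python illustration of the first point (constructor and result-field names assumed from the GenAI bindings):

```python
import openvino_genai as ov_genai

pipe = ov_genai.ContinuousBatchingPipeline(
    "ov_models/TinyLlama-1.1B-Chat-v1.0/FP16",  # path reused from the bench step above
    ov_genai.SchedulerConfig(),
    "CPU",
)

config = ov_genai.GenerationConfig()
config.max_new_tokens = 64
# eos_token_id is deliberately left at -1: add_request back-fills it from
# the pipeline's default generation config rather than the raw tokenizer.

results = pipe.generate(["Why is the Sun yellow?"], [config])
print(results[0].m_generation_ids[0])
```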
2 changes: 1 addition & 1 deletion src/cpp/src/continuous_batching_impl_interface.hpp
@@ -17,7 +17,7 @@ class ContinuousBatchingPipeline::ImplInterface {
Tokenizer m_tokenizer;

// TODO (mzegla): GenerationConfig is request specific object
- // and pipeline only uses default rng_seed.
+ // and pipeline only uses default rng_seed and some special tokens.
ov::genai::GenerationConfig m_generation_config;

PipelineMetrics m_pipeline_metrics;
5 changes: 2 additions & 3 deletions src/cpp/src/generation_config.cpp
@@ -171,9 +171,9 @@ void GenerationConfig::validate() const {
}
if (is_speculative_decoding()) {
if (assistant_confidence_threshold != 0.f) {
- OPENVINO_ASSERT(num_assistant_tokens == 0, "Parameters `assistant_confidence_threshold` and `num_assistant_tokens` are mutually excluded in `GenerationConfig`");
+ OPENVINO_ASSERT(num_assistant_tokens == 0, "Parameters `assistant_confidence_threshold` and `num_assistant_tokens` are mutually exclusive in `GenerationConfig`");
} else {
- OPENVINO_ASSERT(num_assistant_tokens > 0, "Parameters `assistant_confidence_threshold` and `num_assistant_tokens` are mutually excluded in `GenerationConfig`");
+ OPENVINO_ASSERT(num_assistant_tokens > 0, "Parameters `assistant_confidence_threshold` and `num_assistant_tokens` are mutually exclusive in `GenerationConfig`");
};
}
}
@@ -208,6 +208,5 @@ GenerationConfig multinomial() {
return multinomial_config;
}


} // namespace genai
} // namespace ov
21 changes: 21 additions & 0 deletions src/cpp/src/image_generation/diffusion_pipeline.hpp
@@ -36,6 +36,27 @@ const std::string get_class_name(const std::filesystem::path& root_dir) {
return data["_class_name"].get<std::string>();
}

+ ov::Tensor get_guidance_scale_embedding(float guidance_scale, uint32_t embedding_dim) {
+     float w = guidance_scale * 1000;
+     uint32_t half_dim = embedding_dim / 2;
+     float emb = std::log(10000) / (half_dim - 1);
+
+     ov::Shape embedding_shape = {1, embedding_dim};
+     ov::Tensor w_embedding(ov::element::f32, embedding_shape);
+     float* w_embedding_data = w_embedding.data<float>();
+
+     for (size_t i = 0; i < half_dim; ++i) {
+         float temp = std::exp((i * (-emb))) * w;
+         w_embedding_data[i] = std::sin(temp);
+         w_embedding_data[i + half_dim] = std::cos(temp);
+     }
+
+     if (embedding_dim % 2 == 1)
+         w_embedding_data[embedding_dim - 1] = 0;
+
+     return w_embedding;
+ }

} // namespace


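
The `get_guidance_scale_embedding` helper added above computes the sinusoidal guidance-scale embedding that LCM-style UNets take in place of classifier-free guidance. A NumPy restatement of the same arithmetic, useful for cross-checking the C++ against diffusers (illustrative, not part of the PR):

```python
import numpy as np

def get_guidance_scale_embedding(guidance_scale: float, embedding_dim: int) -> np.ndarray:
    w = guidance_scale * 1000.0
    half_dim = embedding_dim // 2
    emb = np.log(10000.0) / (half_dim - 1)
    # Geometrically spaced frequencies, scaled by the guidance weight.
    freqs = np.exp(np.arange(half_dim) * -emb) * w
    embedding = np.concatenate([np.sin(freqs), np.cos(freqs)])
    if embedding_dim % 2 == 1:
        embedding = np.append(embedding, 0.0)  # zero-fill the odd final slot
    return embedding[np.newaxis, :].astype(np.float32)  # shape (1, embedding_dim)
```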
2 changes: 1 addition & 1 deletion src/cpp/src/image_generation/schedulers/lcm.cpp
@@ -99,7 +99,7 @@ void LCMScheduler::set_timesteps(size_t num_inference_steps, float strength) {
assert(skipping_step >= 1 && "The combination of `original_steps x strength` is smaller than `num_inference_steps`");

// LCM Inference Steps Schedule
- std::reverse(lcm_origin_timesteps.begin(),lcm_origin_timesteps.end());
+ std::reverse(lcm_origin_timesteps.begin(), lcm_origin_timesteps.end());

using numpy_utils::linspace;
// v1. based on https://github.com/huggingface/diffusers/blame/2a7f43a73bda387385a47a15d7b6fe9be9c65eb2/src/diffusers/schedulers/scheduling_lcm.py#L387