From cea5cdc71124a6ed1298789729d42179acee2bee Mon Sep 17 00:00:00 2001 From: Damian Kalinowski Date: Tue, 10 Mar 2026 11:55:59 +0100 Subject: [PATCH 1/3] save --- .../continuous_batching/llm_executor.hpp | 2 +- .../continuous_batching/servable.hpp | 2 ++ .../servable_initializer.cpp | 27 +++++++++++++++++++ src/llm/llm_calculator.proto | 7 +++++ 4 files changed, 37 insertions(+), 1 deletion(-) diff --git a/src/llm/language_model/continuous_batching/llm_executor.hpp b/src/llm/language_model/continuous_batching/llm_executor.hpp index ec7c7d517b..e4035da01f 100644 --- a/src/llm/language_model/continuous_batching/llm_executor.hpp +++ b/src/llm/language_model/continuous_batching/llm_executor.hpp @@ -102,7 +102,7 @@ struct LLMExecutor { void printMetrics() { ov::genai::PipelineMetrics metrics = pipe->get_metrics(); SPDLOG_LOGGER_INFO(llm_executor_logger, "All requests: {}; Scheduled requests: {}; Cache {};", - metrics.requests, metrics.scheduled_requests, formatCacheInfo(metrics.cache_usage, metrics.kv_cache_size_in_bytes, this->isDynamicKVCache)); + metrics.requests, metrics.scheduled_requests, metrics.cache_usage); } }; #pragma GCC diagnostic pop diff --git a/src/llm/language_model/continuous_batching/servable.hpp b/src/llm/language_model/continuous_batching/servable.hpp index def574e009..9b1cc9daf2 100644 --- a/src/llm/language_model/continuous_batching/servable.hpp +++ b/src/llm/language_model/continuous_batching/servable.hpp @@ -18,6 +18,7 @@ #include #include +#include #include "../../servable.hpp" #include "src/llm/llm_calculator.pb.h" @@ -31,6 +32,7 @@ struct ContinuousBatchingServableExecutionContext : public GenAiServableExecutio }; struct ContinuousBatchingServableProperties : public GenAiServableProperties { + ov::genai::AdapterConfig adapterConfig; ov::genai::SchedulerConfig schedulerConfig; std::shared_ptr pipeline; std::shared_ptr llmExecutorWrapper; diff --git a/src/llm/language_model/continuous_batching/servable_initializer.cpp b/src/llm/language_model/continuous_batching/servable_initializer.cpp index 27f4f51aee..73c8cfed83 100644 --- a/src/llm/language_model/continuous_batching/servable_initializer.cpp +++ b/src/llm/language_model/continuous_batching/servable_initializer.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -198,6 +199,32 @@ Status ContinuousBatchingServableInitializer::initialize(std::shared_ptr 0) { + SPDLOG_INFO("LoRA adapters will be applied to the model. Number of adapters: {}", nodeOptions.lora_adapter_size()); + for (int i = 0; i < nodeOptions.lora_adapter_size(); ++i) { + SPDLOG_INFO("Processing LoRA adapter number {} with model path: {} alpha: {}", i, nodeOptions.lora_adapter(i).model_path(), nodeOptions.lora_adapter(i).alpha()); + const auto& loraAdapterOption = nodeOptions.lora_adapter(i); + auto fsLoraPath = std::filesystem::path(loraAdapterOption.model_path()); + std::string loraPath; + if (fsLoraPath.is_relative()) { + loraPath = (std::filesystem::path(graphPath) / fsLoraPath).string(); + } else { + loraPath = fsLoraPath.string(); + } + try { + ov::genai::Adapter adapter(loraPath); + properties->adapterConfig.add(adapter, loraAdapterOption.alpha()); + } catch (const std::exception& e) { + SPDLOG_ERROR("Error during LoRA adapter initialization for model_path: {} exception: {}", loraPath, e.what()); + return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED; + } catch (...) { + SPDLOG_ERROR("Error during LoRA adapter initialization for model_path: {}", loraPath); + return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED; + } + } + properties->pluginConfig.insert(ov::genai::adapters(properties->adapterConfig)); + } + status = JsonParser::parsePluginConfig(nodeOptions.plugin_config(), properties->pluginConfig); if (!status.ok()) { SPDLOG_ERROR("Error during llm node plugin_config option parsing to JSON: {}", nodeOptions.plugin_config()); diff --git a/src/llm/llm_calculator.proto b/src/llm/llm_calculator.proto index c8edacf88e..d347a35264 100644 --- a/src/llm/llm_calculator.proto +++ b/src/llm/llm_calculator.proto @@ -26,6 +26,11 @@ message LLMCalculatorOptions { optional LLMCalculatorOptions ext = 113473750; } + message LoraAdapter { + required string model_path = 1; + optional float alpha = 2 [default = 1]; + } + message KVCrushConfig { enum AnchorPointMode { RANDOM = 0; @@ -135,4 +140,6 @@ message LLMCalculatorOptions { optional bool enable_tool_guided_generation = 23 [default = false]; optional SparseAttentionConfig sparse_attention_config = 24; + + repeated LoraAdapter lora_adapter = 25; } From 1f05705cf52e74fdacbfae0b91f943f2229a8b00 Mon Sep 17 00:00:00 2001 From: Damian Kalinowski Date: Tue, 10 Mar 2026 14:18:24 +0100 Subject: [PATCH 2/3] save --- src/llm/apis/openai_completions.cpp | 10 +++++++ src/llm/apis/openai_request.hpp | 3 ++ .../continuous_batching/servable.hpp | 2 -- .../servable_initializer.cpp | 5 ++++ src/llm/language_model/legacy/servable.cpp | 6 ++++ .../legacy/servable_initializer.cpp | 30 +++++++++++++++++++ src/llm/llm_calculator.proto | 1 + src/llm/servable.cpp | 21 +++++++++++++ src/llm/servable.hpp | 11 +++++++ .../visual_language_model/legacy/servable.cpp | 6 ++++ .../legacy/servable_initializer.cpp | 30 +++++++++++++++++++ 11 files changed, 123 insertions(+), 2 deletions(-) diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp index 6898b51604..b24b545b33 100644 --- a/src/llm/apis/openai_completions.cpp +++ b/src/llm/apis/openai_completions.cpp @@ -700,6 +700,16 @@ absl::Status OpenAIChatCompletionsHandler::parseCommonPart(std::optionalvalue.IsNull()) { + if (!it->value.IsString()) + return absl::InvalidArgumentError("lora_adapter is not a string"); + SPDLOG_ERROR("Found lora adapter in request => {}", it->value.GetString()); + request.loraAdapter = it->value.GetString(); + } + // ignore_eos: bool; optional - defaults to false // Extension, unsupported by OpenAI API, however supported by vLLM and CB lib it = doc.FindMember("ignore_eos"); diff --git a/src/llm/apis/openai_request.hpp b/src/llm/apis/openai_request.hpp index de355c12a1..d4add423a3 100644 --- a/src/llm/apis/openai_request.hpp +++ b/src/llm/apis/openai_request.hpp @@ -73,6 +73,9 @@ struct OpenAIChatCompletionsRequest { std::optional maxModelLength; + // LoRA adapter selection + std::optional loraAdapter{std::nullopt}; + // Guided generation specific // String representation of response format object std::optional responseFormat{std::nullopt}; diff --git a/src/llm/language_model/continuous_batching/servable.hpp b/src/llm/language_model/continuous_batching/servable.hpp index 9b1cc9daf2..def574e009 100644 --- a/src/llm/language_model/continuous_batching/servable.hpp +++ b/src/llm/language_model/continuous_batching/servable.hpp @@ -18,7 +18,6 @@ #include #include -#include #include "../../servable.hpp" #include "src/llm/llm_calculator.pb.h" @@ -32,7 +31,6 @@ struct ContinuousBatchingServableExecutionContext : public GenAiServableExecutio }; struct ContinuousBatchingServableProperties : public GenAiServableProperties { - ov::genai::AdapterConfig adapterConfig; ov::genai::SchedulerConfig schedulerConfig; std::shared_ptr pipeline; std::shared_ptr llmExecutorWrapper; diff --git a/src/llm/language_model/continuous_batching/servable_initializer.cpp b/src/llm/language_model/continuous_batching/servable_initializer.cpp index 73c8cfed83..6857b1ac56 100644 --- a/src/llm/language_model/continuous_batching/servable_initializer.cpp +++ b/src/llm/language_model/continuous_batching/servable_initializer.cpp @@ -214,6 +214,11 @@ Status ContinuousBatchingServableInitializer::initialize(std::shared_ptradapterConfig.add(adapter, loraAdapterOption.alpha()); + std::string adapterName = loraAdapterOption.has_name() + ? loraAdapterOption.name() + : std::filesystem::path(loraPath).stem().string(); + properties->adaptersByName.emplace(adapterName, adapter); + SPDLOG_INFO("Registered LoRA adapter '{}' from path: {}", adapterName, loraPath); } catch (const std::exception& e) { SPDLOG_ERROR("Error during LoRA adapter initialization for model_path: {} exception: {}", loraPath, e.what()); return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED; diff --git a/src/llm/language_model/legacy/servable.cpp b/src/llm/language_model/legacy/servable.cpp index b3420553a2..f6f02f95cc 100644 --- a/src/llm/language_model/legacy/servable.cpp +++ b/src/llm/language_model/legacy/servable.cpp @@ -120,6 +120,12 @@ absl::Status LegacyServable::parseRequest(std::shared_ptrgenerationConfigBuilder->unsetStructuredOutputConfig(); } + + auto adapterStatus = applyLoraAdapter(executionContext); + if (!adapterStatus.ok()) { + return adapterStatus; + } + return absl::OkStatus(); } diff --git a/src/llm/language_model/legacy/servable_initializer.cpp b/src/llm/language_model/legacy/servable_initializer.cpp index 4ee7d4820a..e2abc79186 100644 --- a/src/llm/language_model/legacy/servable_initializer.cpp +++ b/src/llm/language_model/legacy/servable_initializer.cpp @@ -19,6 +19,7 @@ #include #include "openvino/genai/llm_pipeline.hpp" +#include #include #include @@ -76,6 +77,35 @@ Status LegacyServableInitializer::initialize(std::shared_ptr& ser return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED; } + if (nodeOptions.lora_adapter_size() > 0) { + for (int i = 0; i < nodeOptions.lora_adapter_size(); ++i) { + const auto& loraAdapterOption = nodeOptions.lora_adapter(i); + auto fsLoraPath = std::filesystem::path(loraAdapterOption.model_path()); + std::string loraPath; + if (fsLoraPath.is_relative()) { + loraPath = (std::filesystem::path(graphPath) / fsLoraPath).string(); + } else { + loraPath = fsLoraPath.string(); + } + try { + ov::genai::Adapter adapter(loraPath); + properties->adapterConfig.add(adapter, loraAdapterOption.alpha()); + std::string adapterName = loraAdapterOption.has_name() + ? loraAdapterOption.name() + : std::filesystem::path(loraPath).stem().string(); + properties->adaptersByName.emplace(adapterName, adapter); + SPDLOG_INFO("Registered LoRA adapter '{}' from path: {}", adapterName, loraPath); + } catch (const std::exception& e) { + SPDLOG_ERROR("Error during LoRA adapter initialization for model_path: {} exception: {}", loraPath, e.what()); + return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED; + } catch (...) { + SPDLOG_ERROR("Error during LoRA adapter initialization for model_path: {}", loraPath); + return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED; + } + } + properties->pluginConfig.insert(ov::genai::adapters(properties->adapterConfig)); + } + status = JsonParser::parsePluginConfig(nodeOptions.plugin_config(), properties->pluginConfig); if (!status.ok()) { SPDLOG_ERROR("Error during llm node plugin_config option parsing to JSON: {}", nodeOptions.plugin_config()); diff --git a/src/llm/llm_calculator.proto b/src/llm/llm_calculator.proto index d347a35264..1fc60b3825 100644 --- a/src/llm/llm_calculator.proto +++ b/src/llm/llm_calculator.proto @@ -29,6 +29,7 @@ message LLMCalculatorOptions { message LoraAdapter { required string model_path = 1; optional float alpha = 2 [default = 1]; + optional string name = 3; } message KVCrushConfig { diff --git a/src/llm/servable.cpp b/src/llm/servable.cpp index 6d9810ae5f..fb18d79d63 100644 --- a/src/llm/servable.cpp +++ b/src/llm/servable.cpp @@ -153,6 +153,27 @@ absl::Status GenAiServable::parseRequest(std::shared_ptrgenerationConfigBuilder->unsetStructuredOutputConfig(); } + auto adapterStatus = applyLoraAdapter(executionContext); + if (!adapterStatus.ok()) { + return adapterStatus; + } + + return absl::OkStatus(); +} + +absl::Status GenAiServable::applyLoraAdapter(std::shared_ptr& executionContext) { + const auto& request = executionContext->apiHandler->getRequest(); + if (request.loraAdapter.has_value()) { + auto props = getProperties(); + auto it = props->adaptersByName.find(request.loraAdapter.value()); + if (it == props->adaptersByName.end()) { + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Unknown LoRA adapter requested: {}", request.loraAdapter.value()); + return absl::InvalidArgumentError("Unknown LoRA adapter: " + request.loraAdapter.value()); + } + float alpha = props->adapterConfig.get_alpha(it->second); + executionContext->generationConfigBuilder->getConfig().adapters = + ov::genai::AdapterConfig(it->second, alpha); + } return absl::OkStatus(); } diff --git a/src/llm/servable.hpp b/src/llm/servable.hpp index e4a5dd5ee2..314711eed2 100644 --- a/src/llm/servable.hpp +++ b/src/llm/servable.hpp @@ -20,6 +20,8 @@ #include #include +#include + #pragma warning(push) #pragma warning(disable : 4251 4005 4309 6001 6385 6386 6326 6011 4005 4456 6246) #pragma GCC diagnostic push @@ -111,6 +113,9 @@ struct GenAiServableProperties { ov::genai::Tokenizer tokenizer; // Specific pipeline properties bool eagle3Mode = false; + // LoRA adapter support + ov::genai::AdapterConfig adapterConfig; + std::unordered_map adaptersByName; #if (PYTHON_DISABLE == 0) PyJinjaTemplateProcessor templateProcessor; @@ -157,6 +162,12 @@ class GenAiServable { */ virtual absl::Status parseRequest(std::shared_ptr& executionContext); +protected: + // Sets per-request LoRA adapter on generationConfigBuilder if lora_adapter is specified in the request + absl::Status applyLoraAdapter(std::shared_ptr& executionContext); + +public: + /* prepareInputs method implementation MUST fill executionContext inputIds field. Base implementation applies chat template to the payload body and encodes it with tokenizer. diff --git a/src/llm/visual_language_model/legacy/servable.cpp b/src/llm/visual_language_model/legacy/servable.cpp index 2834072410..4c6a9c3c35 100644 --- a/src/llm/visual_language_model/legacy/servable.cpp +++ b/src/llm/visual_language_model/legacy/servable.cpp @@ -130,6 +130,12 @@ absl::Status VisualLanguageModelLegacyServable::parseRequest(std::shared_ptrgenerationConfigBuilder->unsetStructuredOutputConfig(); } + + auto adapterStatus = applyLoraAdapter(executionContext); + if (!adapterStatus.ok()) { + return adapterStatus; + } + return absl::OkStatus(); } diff --git a/src/llm/visual_language_model/legacy/servable_initializer.cpp b/src/llm/visual_language_model/legacy/servable_initializer.cpp index ec8bfd327a..b1849135fa 100644 --- a/src/llm/visual_language_model/legacy/servable_initializer.cpp +++ b/src/llm/visual_language_model/legacy/servable_initializer.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -75,6 +76,35 @@ Status VisualLanguageModelLegacyServableInitializer::initialize(std::shared_ptr< return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED; } + if (nodeOptions.lora_adapter_size() > 0) { + for (int i = 0; i < nodeOptions.lora_adapter_size(); ++i) { + const auto& loraAdapterOption = nodeOptions.lora_adapter(i); + auto fsLoraPath = std::filesystem::path(loraAdapterOption.model_path()); + std::string loraPath; + if (fsLoraPath.is_relative()) { + loraPath = (std::filesystem::path(graphPath) / fsLoraPath).string(); + } else { + loraPath = fsLoraPath.string(); + } + try { + ov::genai::Adapter adapter(loraPath); + properties->adapterConfig.add(adapter, loraAdapterOption.alpha()); + std::string adapterName = loraAdapterOption.has_name() + ? loraAdapterOption.name() + : std::filesystem::path(loraPath).stem().string(); + properties->adaptersByName.emplace(adapterName, adapter); + SPDLOG_INFO("Registered LoRA adapter '{}' from path: {}", adapterName, loraPath); + } catch (const std::exception& e) { + SPDLOG_ERROR("Error during LoRA adapter initialization for model_path: {} exception: {}", loraPath, e.what()); + return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED; + } catch (...) { + SPDLOG_ERROR("Error during LoRA adapter initialization for model_path: {}", loraPath); + return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED; + } + } + properties->pluginConfig.insert(ov::genai::adapters(properties->adapterConfig)); + } + status = JsonParser::parsePluginConfig(nodeOptions.plugin_config(), properties->pluginConfig); if (!status.ok()) { SPDLOG_ERROR("Error during llm node plugin_config option parsing to JSON: {}", nodeOptions.plugin_config()); From cca7428505673474e91217adcd2946ce1b515ef5 Mon Sep 17 00:00:00 2001 From: Damian Kalinowski Date: Wed, 11 Mar 2026 16:53:32 +0100 Subject: [PATCH 3/3] hardcoded, but changed at runtime --- src/llm/servable.cpp | 5 ++-- .../legacy/legacy_executor.cpp | 25 ++++++++++++++++++- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/src/llm/servable.cpp b/src/llm/servable.cpp index fb18d79d63..7d8fb0066f 100644 --- a/src/llm/servable.cpp +++ b/src/llm/servable.cpp @@ -170,9 +170,10 @@ absl::Status GenAiServable::applyLoraAdapter(std::shared_ptradapterConfig.get_alpha(it->second); + //float alpha = props->adapterConfig.get_alpha(it->second); executionContext->generationConfigBuilder->getConfig().adapters = - ov::genai::AdapterConfig(it->second, alpha); + ov::genai::AdapterConfig(it->second, 0.5);//alpha); + SPDLOG_INFO("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"); } return absl::OkStatus(); } diff --git a/src/llm/visual_language_model/legacy/legacy_executor.cpp b/src/llm/visual_language_model/legacy/legacy_executor.cpp index a21c799cec..854bd925c8 100644 --- a/src/llm/visual_language_model/legacy/legacy_executor.cpp +++ b/src/llm/visual_language_model/legacy/legacy_executor.cpp @@ -40,7 +40,30 @@ void VisualLanguageModelLegacyExecutor::processRequest() { } else { SPDLOG_LOGGER_TRACE(llm_executor_logger, "Generation started"); try { - requestExecutionContext->results = pipe->generate(requestExecutionContext->inputText, requestExecutionContext->inputImages, requestExecutionContext->generationConfigBuilder->getConfig(), requestExecutionContext->textStreamer); + /* + absl::Status GenAiServable::applyLoraAdapter(std::shared_ptr& executionContext) { + const auto& request = executionContext->apiHandler->getRequest(); + if (request.loraAdapter.has_value()) { + auto props = getProperties(); + auto it = props->adaptersByName.find(request.loraAdapter.value()); + if (it == props->adaptersByName.end()) { + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Unknown LoRA adapter requested: {}", request.loraAdapter.value()); + return absl::InvalidArgumentError("Unknown LoRA adapter: " + request.loraAdapter.value()); + } + float alpha = props->adapterConfig.get_alpha(it->second); + executionContext->generationConfigBuilder->getConfig().adapters = + ov::genai::AdapterConfig(it->second, alpha); + } + return absl::OkStatus(); +} + */ + + + requestExecutionContext->results = pipe->generate( + requestExecutionContext->inputText, + requestExecutionContext->inputImages, + requestExecutionContext->generationConfigBuilder->getConfig(), + requestExecutionContext->textStreamer); } catch (std::exception& e) { requestExecutionContext->success = false; SPDLOG_LOGGER_ERROR(llm_executor_logger, "VLM pipeline generation failed: {}.", e.what());