Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions src/llm/apis/openai_completions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -700,6 +700,16 @@ absl::Status OpenAIChatCompletionsHandler::parseCommonPart(std::optional<uint32_
return absl::InvalidArgumentError("model missing in request");
}

// lora_adapter: string; optional
// Extension, select a LoRA adapter to use for this request by name
it = doc.FindMember("lora_adapter");
if (it != doc.MemberEnd() && !it->value.IsNull()) {
    if (!it->value.IsString())
        return absl::InvalidArgumentError("lora_adapter is not a string");
    // Trace at debug level; a valid adapter selection is not an error condition.
    SPDLOG_DEBUG("Found LoRA adapter in request: {}", it->value.GetString());
    request.loraAdapter = it->value.GetString();
}

// ignore_eos: bool; optional - defaults to false
// Extension, unsupported by OpenAI API, however supported by vLLM and CB lib
it = doc.FindMember("ignore_eos");
Expand Down
3 changes: 3 additions & 0 deletions src/llm/apis/openai_request.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@ struct OpenAIChatCompletionsRequest {

std::optional<uint32_t> maxModelLength;

// LoRA adapter selection
std::optional<std::string> loraAdapter{std::nullopt};

// Guided generation specific
// String representation of response format object
std::optional<std::string> responseFormat{std::nullopt};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ struct LLMExecutor {
// Logs pipeline-level metrics for the continuous-batching pipeline:
// total requests, currently scheduled requests, and KV-cache usage.
void printMetrics() {
    ov::genai::PipelineMetrics metrics = pipe->get_metrics();
    SPDLOG_LOGGER_INFO(llm_executor_logger, "All requests: {}; Scheduled requests: {}; Cache {};",
        metrics.requests, metrics.scheduled_requests, metrics.cache_usage);
}
};
#pragma GCC diagnostic pop
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <string>
#include <vector>
#include <openvino/genai/cache_eviction.hpp>
#include <openvino/genai/lora_adapter.hpp>
#include <openvino/genai/sparse_attention.hpp>
#include <openvino/genai/continuous_batching_pipeline.hpp>
#include <openvino/openvino.hpp>
Expand Down Expand Up @@ -198,6 +199,37 @@ Status ContinuousBatchingServableInitializer::initialize(std::shared_ptr<GenAiSe
return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED;
}

if (nodeOptions.lora_adapter_size() > 0) {
SPDLOG_INFO("LoRA adapters will be applied to the model. Number of adapters: {}", nodeOptions.lora_adapter_size());
for (int i = 0; i < nodeOptions.lora_adapter_size(); ++i) {
SPDLOG_INFO("Processing LoRA adapter number {} with model path: {} alpha: {}", i, nodeOptions.lora_adapter(i).model_path(), nodeOptions.lora_adapter(i).alpha());
const auto& loraAdapterOption = nodeOptions.lora_adapter(i);
auto fsLoraPath = std::filesystem::path(loraAdapterOption.model_path());
std::string loraPath;
if (fsLoraPath.is_relative()) {
loraPath = (std::filesystem::path(graphPath) / fsLoraPath).string();
} else {
loraPath = fsLoraPath.string();
}
try {
ov::genai::Adapter adapter(loraPath);
properties->adapterConfig.add(adapter, loraAdapterOption.alpha());
std::string adapterName = loraAdapterOption.has_name()
? loraAdapterOption.name()
: std::filesystem::path(loraPath).stem().string();
properties->adaptersByName.emplace(adapterName, adapter);
SPDLOG_INFO("Registered LoRA adapter '{}' from path: {}", adapterName, loraPath);
} catch (const std::exception& e) {
SPDLOG_ERROR("Error during LoRA adapter initialization for model_path: {} exception: {}", loraPath, e.what());
return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED;
} catch (...) {
SPDLOG_ERROR("Error during LoRA adapter initialization for model_path: {}", loraPath);
return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED;
}
}
properties->pluginConfig.insert(ov::genai::adapters(properties->adapterConfig));
}

status = JsonParser::parsePluginConfig(nodeOptions.plugin_config(), properties->pluginConfig);
if (!status.ok()) {
SPDLOG_ERROR("Error during llm node plugin_config option parsing to JSON: {}", nodeOptions.plugin_config());
Expand Down
6 changes: 6 additions & 0 deletions src/llm/language_model/legacy/servable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,12 @@ absl::Status LegacyServable::parseRequest(std::shared_ptr<GenAiServableExecution
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Tool guided generation will not be applied due to JSON schema validation failure: {}", e.what());
legacyExecutionContext->generationConfigBuilder->unsetStructuredOutputConfig();
}

auto adapterStatus = applyLoraAdapter(executionContext);
if (!adapterStatus.ok()) {
return adapterStatus;
}

return absl::OkStatus();
}

Expand Down
30 changes: 30 additions & 0 deletions src/llm/language_model/legacy/servable_initializer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <vector>

#include "openvino/genai/llm_pipeline.hpp"
#include <openvino/genai/lora_adapter.hpp>
#include <openvino/openvino.hpp>
#include <spdlog/spdlog.h>

Expand Down Expand Up @@ -76,6 +77,35 @@ Status LegacyServableInitializer::initialize(std::shared_ptr<GenAiServable>& ser
return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED;
}

if (nodeOptions.lora_adapter_size() > 0) {
for (int i = 0; i < nodeOptions.lora_adapter_size(); ++i) {
const auto& loraAdapterOption = nodeOptions.lora_adapter(i);
auto fsLoraPath = std::filesystem::path(loraAdapterOption.model_path());
std::string loraPath;
if (fsLoraPath.is_relative()) {
loraPath = (std::filesystem::path(graphPath) / fsLoraPath).string();
} else {
loraPath = fsLoraPath.string();
}
try {
ov::genai::Adapter adapter(loraPath);
properties->adapterConfig.add(adapter, loraAdapterOption.alpha());
std::string adapterName = loraAdapterOption.has_name()
? loraAdapterOption.name()
: std::filesystem::path(loraPath).stem().string();
properties->adaptersByName.emplace(adapterName, adapter);
SPDLOG_INFO("Registered LoRA adapter '{}' from path: {}", adapterName, loraPath);
} catch (const std::exception& e) {
SPDLOG_ERROR("Error during LoRA adapter initialization for model_path: {} exception: {}", loraPath, e.what());
return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED;
} catch (...) {
SPDLOG_ERROR("Error during LoRA adapter initialization for model_path: {}", loraPath);
return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED;
}
}
properties->pluginConfig.insert(ov::genai::adapters(properties->adapterConfig));
}

status = JsonParser::parsePluginConfig(nodeOptions.plugin_config(), properties->pluginConfig);
if (!status.ok()) {
SPDLOG_ERROR("Error during llm node plugin_config option parsing to JSON: {}", nodeOptions.plugin_config());
Expand Down
8 changes: 8 additions & 0 deletions src/llm/llm_calculator.proto
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,12 @@ message LLMCalculatorOptions {
optional LLMCalculatorOptions ext = 113473750;
}

// Configuration of a single LoRA adapter to load for the servable.
message LoraAdapter {
// Path to the adapter model. Relative paths are resolved against the graph directory.
required string model_path = 1;
// Blending coefficient applied to the adapter weights when it is registered.
optional float alpha = 2 [default = 1];
// Name used to select this adapter per request via the "lora_adapter" field;
// when omitted, the file stem of model_path is used as the name.
optional string name = 3;
}

message KVCrushConfig {
enum AnchorPointMode {
RANDOM = 0;
Expand Down Expand Up @@ -135,4 +141,6 @@ message LLMCalculatorOptions {
optional bool enable_tool_guided_generation = 23 [default = false];

optional SparseAttentionConfig sparse_attention_config = 24;

repeated LoraAdapter lora_adapter = 25;
}
22 changes: 22 additions & 0 deletions src/llm/servable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,28 @@ absl::Status GenAiServable::parseRequest(std::shared_ptr<GenAiServableExecutionC
executionContext->generationConfigBuilder->unsetStructuredOutputConfig();
}

auto adapterStatus = applyLoraAdapter(executionContext);
if (!adapterStatus.ok()) {
return adapterStatus;
}

return absl::OkStatus();
}

// Applies the per-request LoRA adapter (request field "lora_adapter"), if one was requested.
// Looks the adapter up by name in the servable properties and attaches it — with the alpha
// configured at initialization time — to this request's generation config.
// Returns InvalidArgumentError when the requested adapter name is not registered;
// OkStatus when no adapter was requested or it was applied successfully.
absl::Status GenAiServable::applyLoraAdapter(std::shared_ptr<GenAiServableExecutionContext>& executionContext) {
    const auto& request = executionContext->apiHandler->getRequest();
    if (!request.loraAdapter.has_value()) {
        return absl::OkStatus();  // no adapter requested; nothing to do
    }
    auto props = getProperties();
    auto it = props->adaptersByName.find(request.loraAdapter.value());
    if (it == props->adaptersByName.end()) {
        SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Unknown LoRA adapter requested: {}", request.loraAdapter.value());
        return absl::InvalidArgumentError("Unknown LoRA adapter: " + request.loraAdapter.value());
    }
    // Use the alpha registered for this adapter instead of a hard-coded value.
    float alpha = props->adapterConfig.get_alpha(it->second);
    executionContext->generationConfigBuilder->getConfig().adapters =
        ov::genai::AdapterConfig(it->second, alpha);
    SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Applied LoRA adapter: {} with alpha: {}", request.loraAdapter.value(), alpha);
    return absl::OkStatus();
}

Expand Down
11 changes: 11 additions & 0 deletions src/llm/servable.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
#include <unordered_map>
#include <vector>

#include <openvino/genai/lora_adapter.hpp>

#pragma warning(push)
#pragma warning(disable : 4251 4005 4309 6001 6385 6386 6326 6011 4005 4456 6246)
#pragma GCC diagnostic push
Expand Down Expand Up @@ -111,6 +113,9 @@ struct GenAiServableProperties {
ov::genai::Tokenizer tokenizer;
// Specific pipeline properties
bool eagle3Mode = false;
// LoRA adapter support
ov::genai::AdapterConfig adapterConfig;
std::unordered_map<std::string, ov::genai::Adapter> adaptersByName;

#if (PYTHON_DISABLE == 0)
PyJinjaTemplateProcessor templateProcessor;
Expand Down Expand Up @@ -157,6 +162,12 @@ class GenAiServable {
*/
virtual absl::Status parseRequest(std::shared_ptr<GenAiServableExecutionContext>& executionContext);

protected:
// Sets per-request LoRA adapter on generationConfigBuilder if lora_adapter is specified in the request
absl::Status applyLoraAdapter(std::shared_ptr<GenAiServableExecutionContext>& executionContext);

public:

/*
prepareInputs method implementation MUST fill executionContext inputIds field.
Base implementation applies chat template to the payload body and encodes it with tokenizer.
Expand Down
25 changes: 24 additions & 1 deletion src/llm/visual_language_model/legacy/legacy_executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,30 @@ void VisualLanguageModelLegacyExecutor::processRequest() {
} else {
SPDLOG_LOGGER_TRACE(llm_executor_logger, "Generation started");
try {
requestExecutionContext->results = pipe->generate(requestExecutionContext->inputText, requestExecutionContext->inputImages, requestExecutionContext->generationConfigBuilder->getConfig(), requestExecutionContext->textStreamer);
// Per-request LoRA adapter selection (if any) has already been attached to the
// generation config during request parsing.
requestExecutionContext->results = pipe->generate(
    requestExecutionContext->inputText,
    requestExecutionContext->inputImages,
    requestExecutionContext->generationConfigBuilder->getConfig(),
    requestExecutionContext->textStreamer);
} catch (std::exception& e) {
requestExecutionContext->success = false;
SPDLOG_LOGGER_ERROR(llm_executor_logger, "VLM pipeline generation failed: {}.", e.what());
Expand Down
6 changes: 6 additions & 0 deletions src/llm/visual_language_model/legacy/servable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,12 @@ absl::Status VisualLanguageModelLegacyServable::parseRequest(std::shared_ptr<Gen
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Tool guided generation will not be applied due to JSON schema validation failure: {}", e.what());
legacyExecutionContext->generationConfigBuilder->unsetStructuredOutputConfig();
}

auto adapterStatus = applyLoraAdapter(executionContext);
if (!adapterStatus.ok()) {
return adapterStatus;
}

return absl::OkStatus();
}

Expand Down
30 changes: 30 additions & 0 deletions src/llm/visual_language_model/legacy/servable_initializer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <string>
#include <vector>

#include <openvino/genai/lora_adapter.hpp>
#include <openvino/genai/visual_language/pipeline.hpp>
#include <openvino/openvino.hpp>
#include <spdlog/spdlog.h>
Expand Down Expand Up @@ -75,6 +76,35 @@ Status VisualLanguageModelLegacyServableInitializer::initialize(std::shared_ptr<
return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED;
}

if (nodeOptions.lora_adapter_size() > 0) {
for (int i = 0; i < nodeOptions.lora_adapter_size(); ++i) {
const auto& loraAdapterOption = nodeOptions.lora_adapter(i);
auto fsLoraPath = std::filesystem::path(loraAdapterOption.model_path());
std::string loraPath;
if (fsLoraPath.is_relative()) {
loraPath = (std::filesystem::path(graphPath) / fsLoraPath).string();
} else {
loraPath = fsLoraPath.string();
}
try {
ov::genai::Adapter adapter(loraPath);
properties->adapterConfig.add(adapter, loraAdapterOption.alpha());
std::string adapterName = loraAdapterOption.has_name()
? loraAdapterOption.name()
: std::filesystem::path(loraPath).stem().string();
properties->adaptersByName.emplace(adapterName, adapter);
SPDLOG_INFO("Registered LoRA adapter '{}' from path: {}", adapterName, loraPath);
} catch (const std::exception& e) {
SPDLOG_ERROR("Error during LoRA adapter initialization for model_path: {} exception: {}", loraPath, e.what());
return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED;
} catch (...) {
SPDLOG_ERROR("Error during LoRA adapter initialization for model_path: {}", loraPath);
return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED;
}
}
properties->pluginConfig.insert(ov::genai::adapters(properties->adapterConfig));
}

status = JsonParser::parsePluginConfig(nodeOptions.plugin_config(), properties->pluginConfig);
if (!status.ok()) {
SPDLOG_ERROR("Error during llm node plugin_config option parsing to JSON: {}", nodeOptions.plugin_config());
Expand Down