From e0c377fb6807ea0bbc7aecae23a0544ac064ff42 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 22 Jan 2026 12:20:50 +0800 Subject: [PATCH 01/10] Fix llama-bench -p -n where p<=256 --- ggml/src/ggml-openvino/utils.cpp | 12 +++++------- ggml/src/ggml-openvino/utils.h | 9 +++------ 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index f7d62588c87..2d30eef941f 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -768,14 +768,12 @@ graph_key compute_graph_key(ggml_cgraph * cgraph) { graph_key key; key.n_nodes = cgraph->n_nodes; - if (cgraph->n_nodes > 0) { - key.first_node_name = std::string(cgraph->nodes[0]->name); - key.last_node_name = std::string(cgraph->nodes[cgraph->n_nodes - 1]->name); - } else { - key.first_node_name = ""; - key.last_node_name = ""; + for (int i = 0; i < cgraph->n_nodes; ++i) { + const auto * node = cgraph->nodes[i]; + if (node->op == GGML_OP_SET_ROWS && strncmp(node->src[2]->name, "cache_k_l0", 10) == 0) { + key.cache_k_l0 = node->src[2]; + } } - return key; } diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 47bf2d4ff17..72ef904f741 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -8,20 +8,17 @@ struct graph_key { size_t n_nodes; - std::string first_node_name; - std::string last_node_name; + void * cache_k_l0; bool operator==(const graph_key & other) const { - return n_nodes == other.n_nodes && first_node_name == other.first_node_name && - last_node_name == other.last_node_name; + return n_nodes == other.n_nodes && cache_k_l0 == other.cache_k_l0; } }; struct graph_key_hash { size_t operator()(const graph_key & key) const { size_t h = std::hash{}(key.n_nodes); - h ^= std::hash{}(key.first_node_name) + 0x9e3779b9 + (h << 6) + (h >> 2); - h ^= std::hash{}(key.last_node_name) + 0x9e3779b9 + (h << 6) + (h >> 2); + h ^= std::hash{}(key.cache_k_l0) + 0x9e3779b9 + (h << 6) + (h >> 2); return h; } }; From ff9bb1ab144343972e22e48d3d070857e9c50713 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 22 Jan 2026 15:52:10 +0800 Subject: [PATCH 02/10] Fix --direct-io 0 --- ggml/src/ggml-openvino/ggml-openvino.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index de986ea42d6..06bff5a2b77 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -943,7 +943,7 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con } static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - return ggml_backend_buft_is_openvino(buft) || ggml_backend_buft_is_openvino_host(buft); + return ggml_backend_buft_is_openvino(buft) || ggml_backend_buft_is_host(buft); GGML_UNUSED(dev); } From cd067dcbfedbbcdcd0493a4eebc739a6570cc24f Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Sat, 24 Jan 2026 17:16:06 +0800 Subject: [PATCH 03/10] Don't put kvcache on GPU in stateful mode --- ggml/src/ggml-openvino/ggml-openvino.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 06bff5a2b77..8d6a0dbf335 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -140,7 +140,7 @@ static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_bu // Put kvcache on device memory for GPU (NPU memory is too small even for kvcache) if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY && strncmp(tensor->name, "cache_", 6) == 0 && !ctx->is_remote && - ggml_openvino_get_device_name() == "GPU") { + ggml_openvino_get_device_name() == "GPU" && !getenv("GGML_OPENVINO_STATEFUL_EXECUTION")) { GGML_ASSERT(ctx->tensor_extras.empty()); auto device = ctx->device; auto size = ctx->size; From e480d5bf00ece985bb8e3bc6bdb7bdb32d14f481 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 23 Jan 2026 15:49:01 +0800 Subject: [PATCH 04/10] Remove hardcode names --- ggml/src/ggml-openvino/ggml-decoder.cpp | 63 +++++++++++++------------ ggml/src/ggml-openvino/ggml-decoder.h | 8 ++-- ggml/src/ggml-openvino/utils.cpp | 4 +- 3 files changed, 39 insertions(+), 36 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index b8fe6358c8d..01e2c2ff193 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -169,9 +169,11 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { // TODO: The shape modification for stateful model below is not validated for all supported models yet. More generic solution might be needed // to enable additional cases. Ideally, this could be removed from decoder and done as part of a transformation later. auto stateless_kv_shape = get_graph_input_shape(node, src); - assert(stateless_kv_shape.size() == 4 && stateless_kv_shape[0] == 1 && stateless_kv_shape[1] == 1 - && stateless_kv_shape[2].is_dynamic() && stateless_kv_shape[3] == (m_model_params.n_heads_kv*m_model_params.head_size)); - stateful_kv_shape = {stateless_kv_shape[0], ov::Dimension::dynamic(), m_model_params.n_heads_kv, m_model_params.head_size}; + assert(stateless_kv_shape.size() == 4 && stateless_kv_shape[0] == 1 && + stateless_kv_shape[1] == 1 && stateless_kv_shape[2].is_dynamic() && + stateless_kv_shape[3] == (m_model_params.n_heads_kv * m_model_params.head_size)); + stateful_kv_shape = {stateless_kv_shape[0], ov::Dimension::dynamic(), + m_model_params.n_heads_kv, m_model_params.head_size}; } } } @@ -180,9 +182,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { } m_inputs[src_name] = src; assert(stateful_kv_shape.rank().is_static()); - ov::PartialShape param_shape = (stateful_kv_shape.rank().get_length() != 0) - ? stateful_kv_shape - : get_graph_input_shape(node, src); + ov::PartialShape param_shape = + (stateful_kv_shape.rank().get_length() != 0) ? stateful_kv_shape : get_graph_input_shape(node, src); auto param_node = std::make_shared(get_ov_type(src), param_shape); param_node->set_friendly_name(src_name); param_node->output(0).get_tensor().set_names({src_name}); @@ -197,7 +198,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { static std::set debug_output_names = {}; // Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph if (node->op == GGML_OP_SET_ROWS || node->flags & GGML_TENSOR_FLAG_OUTPUT || - node_output_name.find("output") != std::string::npos || debug_output_names.count(node_output_name)) { + debug_output_names.count(node_output_name)) { if (m_model_outputs.find(node_output_name) == m_model_outputs.end()) { m_model_outputs[node_output_name] = node_output; } @@ -312,6 +313,11 @@ std::pair GgmlOvDecoder::compute_llm_params(ggml_cgr auto * node = cgraph->nodes[i]; std::string name = std::string(node->name); if (node->op == GGML_OP_FLASH_ATTN_EXT) { + model_params.n_heads = node->src[0]->ne[2]; + model_params.n_heads_kv = node->src[1]->ne[2]; + model_params.head_size = node->src[0]->ne[0]; + compute_params.input_len = node->src[0]->ne[1]; + auto * cache_k_perm = node->src[1]; if (cache_k_perm->op == GGML_OP_CPY) { cache_k_perm = cache_k_perm->src[0]; @@ -324,9 +330,8 @@ std::pair GgmlOvDecoder::compute_llm_params(ggml_cgr int layer = extract_layer_from_name(cache_k->name); auto * mask = node->src[3]; std::string mask_name(mask->name); - assert(mask_name.find("self_kq_mask") == 0); - if (std::string(node->src[3]->name).find("swa") != std::string::npos) { + if (mask_name.find("swa") != std::string::npos) { model_params.swa_layers.push_back(layer); model_params.ctx_per_seq_swa = cache_k->ne[1]; } else { @@ -351,25 +356,18 @@ std::pair GgmlOvDecoder::compute_llm_params(ggml_cgr compute_params.attention_size_swa = model_params.ctx_per_seq_swa; compute_params.token_len_per_seq = 1; } - - } else if (node->op == GGML_OP_ROPE) { - if (name.find("Qcur-0") == 0 || std::string(node->src[0]->name).find("Qcur-0") == 0) { - model_params.head_size = node->ne[0]; - model_params.n_heads = node->ne[1]; - model_params.rope_params = node->op_params; - auto * inp_pos = node->src[1]; - compute_params.input_len = inp_pos->ne[0]; - } else if (name.find("Kcur-0") == 0 || std::string(node->src[0]->name).find("Kcur-0") == 0) { - model_params.n_heads_kv = node->ne[1]; - } - } else if (node->op == GGML_OP_GET_ROWS && std::string(node->src[1]->name) == "inp_out_ids") { - // for static case, output_len is always 1 except for llama-perplexity - compute_params.output_len = node->src[1]->ne[0]; - if (is_static && compute_params.output_len == 0) { - compute_params.output_len = 1; - } + break; + } + if (node->op == GGML_OP_ROPE) { + model_params.rope_params = node->op_params; } } + auto * output_tensor = cgraph->nodes[cgraph->n_nodes - 1]; + compute_params.output_len = output_tensor->ne[1]; + // for NPU, output_len is always 1 except for llama-perplexity + if (is_static && compute_params.output_len == 0) { + compute_params.output_len = 1; + } model_params.ctx = model_params.ctx_per_seq * model_params.n_seq; model_params.ctx_swa = model_params.ctx_per_seq_swa * model_params.n_seq; return {model_params, compute_params}; @@ -385,14 +383,17 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co auto name = std::string(input->name); ov::PartialShape input_shape; - if (name == "inp_tokens" || name == "inp_pos") { + if ((op->op == GGML_OP_GET_ROWS && op->src[0]->op == GGML_OP_NONE) || op->op == GGML_OP_ROPE) { + // tokens or positions int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1; input_shape = ov::PartialShape{1, 1, 1, len}; - } else if (name == "inp_out_ids") { + } else if (op->op == GGML_OP_GET_ROWS) { + // output index input_shape = ov::PartialShape{1, 1, 1, m_is_static ? m_compute_params.output_len : -1}; - } else if (name.find("self_kq_mask") == 0) { + } else if (op->op == GGML_OP_CPY || op->op == GGML_OP_FLASH_ATTN_EXT) { + // mask if (m_is_static) { input_shape = ov::PartialShape{1, 1, m_is_prefill ? m_prefill_chunk_size : 1, m_model_params.ctx}; } else if (m_is_stateful) { @@ -401,7 +402,8 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co input_shape = ov::PartialShape{-1, 1, -1, -1}; } - } else if (name.find("cache_") == 0) { + } else if (op && op->op == GGML_OP_SET_ROWS && op->src[2] == input) { + // kvcache input_shape = ov::PartialShape{get_shape(input)}; if (!m_is_static) { // do not fix ctx size to make llama-bench work @@ -409,6 +411,7 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co } } else if (op && op->op == GGML_OP_SET_ROWS && op->src[1] == input) { + // kv update index int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1; input_shape = ov::PartialShape{1, 1, 1, len}; diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 4afec272e1a..c0d18b7512e 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -16,7 +16,7 @@ struct ModelParams { int ctx_swa = -1; int ctx_per_seq = -1; int ctx_per_seq_swa = -1; - int n_seq = -1; + int n_seq = 1; int n_heads = -1; int n_heads_kv = -1; int head_size = -1; @@ -37,14 +37,14 @@ struct ModelParams { }; struct ComputeParams { - int n_seq_active = -1; - int seq_active_start = -1; + int n_seq_active = 1; + int seq_active_start = 0; int attention_size = -1; int attention_size_swa = -1; int input_len = -1; int token_len_per_seq = -1; int past_kv_len = -1; - int output_len = -1; + int output_len = 1; }; class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 2d30eef941f..8c3717472b4 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -614,10 +614,10 @@ ov::Tensor get_ov_output_tensor(std::shared_ptr ggml_decoder, con auto output_type = ggml_decoder->get_ov_type(ggml_tensor); auto output_shape = ggml_decoder->get_shape(ggml_tensor); - if (ggml_decoder->is_static() && result_name == "result_output" && output_shape[2] == 0) { + if (ggml_decoder->is_static() && output_shape[2] == 0) { output_shape[2] = 1; } - if (ggml_decoder->is_stateful() && result_name == "result_output") { + if (ggml_decoder->is_stateful() && ggml_tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { std::vector output_shape_3d; for (size_t i=1; i Date: Fri, 23 Jan 2026 15:49:36 +0800 Subject: [PATCH 05/10] Fix stateful shapes --- .../ggml-openvino/openvino/op/glu_geglu.cpp | 2 +- .../ggml-openvino/openvino/op/glu_swiglu.cpp | 2 +- ggml/src/ggml-openvino/openvino/op/rope.cpp | 22 +++++-------------- ggml/src/ggml-openvino/openvino/utils.cpp | 2 +- ggml/src/ggml-openvino/utils.cpp | 2 ++ 5 files changed, 11 insertions(+), 19 deletions(-) diff --git a/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp b/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp index ad5cd3f6ba5..8be9e8deb06 100644 --- a/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +++ b/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp @@ -26,7 +26,7 @@ OutputVector translate_glu_geglu(const NodeContext & context) { src1 = context.get_input(1); } else { auto combined = context.get_input(0); - auto split_axis = ov::op::v0::Constant::create(ov::element::i64, {}, {3}); + auto split_axis = ov::op::v0::Constant::create(ov::element::i64, {}, {-1}); auto split = std::make_shared(combined, split_axis, 2); src0 = split->output(0); src1 = split->output(1); diff --git a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp index 2b7f13629f2..6e0b85517e6 100644 --- a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +++ b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp @@ -26,7 +26,7 @@ OutputVector translate_glu_swiglu(const NodeContext & context) { src1 = context.get_input(1); } else { auto combined = context.get_input(0); - auto split_axis = ov::op::v0::Constant::create(ov::element::i64, {}, {3}); + auto split_axis = ov::op::v0::Constant::create(ov::element::i64, {}, {-1}); auto split = std::make_shared(combined, split_axis, 2); src0 = split->output(0); src1 = split->output(1); diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp index 01bc46131e1..44e3368217e 100644 --- a/ggml/src/ggml-openvino/openvino/op/rope.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -70,22 +70,16 @@ OutputVector translate_rope(const NodeContext & context) { constexpr int ROPE_TYPE_NORM = 0; if (mode == ROPE_TYPE_NORM) { + auto neg_one = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1}); auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[3]}); Output even_slice; Output odd_slice; - int32_t unsqueeze_dim = 4; - if (context.is_stateful()) { - unsqueeze_dim = 3; - even_slice = std::make_shared(data_node, zero, end, two, two); - odd_slice = std::make_shared(data_node, one, end, two, two); - } else { - auto three = ov::op::v0::Constant::create(ov::element::i64, {1}, {3}); - even_slice = std::make_shared(data_node, zero, end, two, three); - odd_slice = std::make_shared(data_node, one, end, two, three); - } + int32_t unsqueeze_dim = context.is_stateful() ? 3 : 4; + even_slice = std::make_shared(data_node, zero, end, two, neg_one); + odd_slice = std::make_shared(data_node, one, end, two, neg_one); Output first_half = std::make_shared(std::make_shared(even_slice, cos_theta_node), @@ -105,7 +99,7 @@ OutputVector translate_rope(const NodeContext & context) { res = std::make_shared(stack, data_shape, false); } else if (mode == ROPE_TYPE_NEOX) { auto data_split = std::make_shared( - data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {3}), 2); + data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {-1}), 2); Output slice_data_node_0 = data_split->outputs()[0]; Output slice_data_node_1 = data_split->outputs()[1]; @@ -117,11 +111,7 @@ OutputVector translate_rope(const NodeContext & context) { std::make_shared(slice_data_node_0, sin_theta_node), std::make_shared(slice_data_node_1, cos_theta_node)); - int32_t concat_dim = 3; - if (context.is_stateful()) { - concat_dim = 2; - } - res = std::make_shared(ov::OutputVector{first_half_node, second_half_node}, concat_dim); + res = std::make_shared(ov::OutputVector{first_half_node, second_half_node}, -1); } return rename_outputs_with_suffix({res}, context.get_name()); diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp index b7553f99c86..a0215b97b11 100644 --- a/ggml/src/ggml-openvino/openvino/utils.cpp +++ b/ggml/src/ggml-openvino/openvino/utils.cpp @@ -216,7 +216,7 @@ ov::Output process_view_input(const NodeContext & context, int input_i auto begin = ov::op::v0::Constant::create(ov::element::i64, {1}, {split_addr}); auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {slice_end}); auto stride = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - auto axes = ov::op::v0::Constant::create(ov::element::i64, {1}, {3}); + auto axes = ov::op::v0::Constant::create(ov::element::i64, {1}, {context.is_stateful() ? 2 : 3}); auto sliced = std::make_shared(input, begin, end, stride, axes); return sliced; } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 8c3717472b4..edf42cd9854 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -497,6 +497,7 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, cons ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr ggml_decoder, const std::string & param_name) { + // NPU decoding stage const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(param_name); const auto * op = ggml_decoder->get_tensor_used_op(ggml_tensor); @@ -540,6 +541,7 @@ ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr ggml ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr ggml_decoder, const std::string & param_name, int chunk_index) { + // NPU prompt processing stage const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(param_name); const auto * op = ggml_decoder->get_tensor_used_op(ggml_tensor); From 4a8fd24e32089ba84e57bb92ff3a23ae1a067894 Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Wed, 21 Jan 2026 15:17:11 -0800 Subject: [PATCH 06/10] Simplification for stateful and update output shape processing --- ggml/src/ggml-openvino/ggml-decoder.cpp | 18 ++++----- ggml/src/ggml-openvino/ggml-decoder.h | 2 +- .../openvino/translate_session.cpp | 25 ++++++++++++ ggml/src/ggml-openvino/utils.cpp | 39 ++++++++----------- 4 files changed, 52 insertions(+), 32 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 01e2c2ff193..2f97af0a3ed 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -56,11 +56,11 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, m_model_params(model_params), m_compute_params(compute_params) { if (auto * env = getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); env && std::string(env) != "0") { - #ifdef _WIN32 - _putenv_s("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS", ""); - #else - unsetenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); - #endif +#ifdef _WIN32 + _putenv_s("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS", ""); +#else + unsetenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); +#endif print_tensor_address_map(cgraph); } @@ -106,8 +106,7 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map(get_ov_type(src_node), ov::Shape(get_shape(src_node))); + auto param_node = std::make_shared(get_ov_type(src_node), get_shape(src_node)); param_node->set_friendly_name(src_name); param_node->output(0).get_tensor().set_names({src_name}); m_model_inputs[src_name] = param_node; @@ -163,7 +162,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { // GGML_BACKEND_BUFFER_USAGE_ANY are kv caches if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) { assert(src_name.find("cache_k") == 0 || src_name.find("cache_v") == 0); - if (auto it = std::find(m_model_params.kv_names.begin(), m_model_params.kv_names.end(), src_name); it == m_model_params.kv_names.end()) { + if (auto it = std::find(m_model_params.kv_names.begin(), m_model_params.kv_names.end(), src_name); + it == m_model_params.kv_names.end()) { m_model_params.kv_names.push_back(src_name); if (is_stateful()) { // TODO: The shape modification for stateful model below is not validated for all supported models yet. More generic solution might be needed @@ -719,7 +719,7 @@ void print_tensor_address_map(const ggml_cgraph * cgraph) { } } -std::vector GgmlOvDecoder::get_shape(const ggml_tensor * tensor) { +ov::Shape GgmlOvDecoder::get_shape(const ggml_tensor * tensor) { std::vector shape; for (int i = GGML_MAX_DIMS - 1; i >= 0; --i) { shape.push_back(static_cast(tensor->ne[i])); diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index c0d18b7512e..f69d1878800 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -207,7 +207,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { bool m_is_prefill = false; int m_prefill_chunk_size = 0; - static std::vector get_shape(const ggml_tensor * tensor); + static ov::Shape get_shape(const ggml_tensor * tensor); static std::vector get_stride(const ggml_tensor * tensor); static ov::element::Type get_ov_type(const ggml_tensor * tensor); static std::string compute_op_type(const ggml_tensor * node); diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index adb3025d175..b7e7b58531f 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -29,8 +29,10 @@ #include #include #include +#include #include #include +#include namespace ov { namespace frontend { @@ -252,6 +254,29 @@ std::shared_ptr TranslateSession::apply_transformations(std::shared_ptr(); } manager.run_passes(model); + if (ggml_model_decoder->is_stateful()) { + auto output_names = ggml_model_decoder->get_model_output_names(); + std::map model_output_indexes; + for (size_t i=0; iget_output_size(); i++) { + auto output_friendly_name = model->output(i).get_node_shared_ptr()->get_friendly_name(); + auto output_id = model_output_indexes[output_friendly_name]; + auto model_output_shape = model->output(i).get_partial_shape(); + auto decoder_output_shape = ggml_model_decoder->get_output_shape(output_id); + if (model_output_shape.rank().is_static() && decoder_output_shape.rank().is_static() + && model_output_shape.rank().get_length() + 1 == decoder_output_shape.rank().get_length() + && decoder_output_shape[0].is_static() && decoder_output_shape[0].get_length() == 1) { + ppp.output(i).postprocess().custom([](const ov::Output& node) { + auto axes = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{1}, {0}); + return std::make_shared(node, axes); + }); + } + } + model = ppp.build(); + } } return model; } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index edf42cd9854..0c5a520b251 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -103,10 +103,12 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin ggml_decoder->add_extra_inputs(); infer_request = infer_request_cache[key]; - auto * inp_pos = get_inp_pos_tensor(cgraph); - int32_t * pos_data = (int32_t *) inp_pos->data; - if (pos_data[0] == 0) { - infer_request->reset_state(); + if (stateful) { + const auto * inp_pos = get_inp_pos_tensor(cgraph); + int32_t * pos_data = (int32_t *) inp_pos->data; + if (pos_data[0] == 0) { + infer_request->reset_state(); + } } decoder_end_time = ggml_time_us(); @@ -118,7 +120,8 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin std::shared_ptr model; auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); - ggml_decoder = std::make_shared(cgraph, m_params, c_params, model_weights, is_static, stateful); + ggml_decoder = + std::make_shared(cgraph, m_params, c_params, model_weights, is_static, stateful); decoder_end_time = ggml_time_us(); auto input_model = std::make_shared(ggml_decoder); @@ -351,7 +354,9 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { } for (size_t i = 0; i < ov_output_names.size(); i++) { - auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]); + auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names[i]); + ov::Tensor output_tensor(infer_request->get_output_tensor(i).get_element_type(), + infer_request->get_output_tensor(i).get_shape(), ggml_tensor->data); infer_request->set_output_tensor(i, output_tensor); } @@ -378,7 +383,9 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { } for (size_t i = 0; i < ov_output_names.size(); i++) { - auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]); + auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names[i]); + ov::Tensor output_tensor(infer_request->get_output_tensor(i).get_element_type(), + infer_request->get_output_tensor(i).get_shape(), ggml_tensor->data); infer_request->set_output_tensor(i, output_tensor); } @@ -478,7 +485,7 @@ ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, // This case is added to make test-backend-ops work input_shape = ggml_decoder->get_shape(ggml_tensor->view_src); } else { - input_shape = ggml_decoder->get_shape(ggml_tensor); + input_shape = ggml_decoder->get_shape(ggml_tensor); } auto input_tensor = ov::Tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape, input_data); return input_tensor; @@ -616,20 +623,8 @@ ov::Tensor get_ov_output_tensor(std::shared_ptr ggml_decoder, con auto output_type = ggml_decoder->get_ov_type(ggml_tensor); auto output_shape = ggml_decoder->get_shape(ggml_tensor); - if (ggml_decoder->is_static() && output_shape[2] == 0) { - output_shape[2] = 1; - } - if (ggml_decoder->is_stateful() && ggml_tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { - std::vector output_shape_3d; - for (size_t i=1; idata); - return output_tensor; - } else { - ov::Tensor output_tensor(output_type, output_shape, ggml_tensor->data); - return output_tensor; - } + ov::Tensor output_tensor(output_type, output_shape, ggml_tensor->data); + return output_tensor; } size_t checksum(const void * data, size_t size) { From cdf724a21a664b6725f8469ce14d0f1f958734d1 Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Mon, 2 Feb 2026 14:27:35 -0800 Subject: [PATCH 07/10] stateful masking fix --- .../openvino/translate_session.cpp | 11 +++++++---- ggml/src/ggml-openvino/utils.cpp | 17 ++++++++++++++++- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index b7e7b58531f..5057d1fa86e 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -89,12 +90,14 @@ void add_sliced_mask(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) { auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0}); auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1}); auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto two_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); + auto three_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {3}); + auto neg_one_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1}); auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {-2,-1}); auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr(); - auto shape_of_inp_pos = std::make_shared(inp_pos); - auto gather_inp_pos = std::make_shared(shape_of_inp_pos, two_1d, zero_1d); - auto stop = std::make_shared(ov::OutputVector{token_len_per_seq, gather_inp_pos}, 0); + auto gather_inp_pos = std::make_shared(inp_pos, neg_one_1d, three_1d); + auto reshaped_inp_pos = std::make_shared(gather_inp_pos, ov::op::v0::Constant::create(ov::element::i64, {1}, {1}), false); + auto inp_pos_incremented = std::make_shared(reshaped_inp_pos, ov::op::v0::Constant::create(ov::element::i32, ov::Shape{1}, {1})); + auto stop = std::make_shared(ov::OutputVector{token_len_per_seq, std::make_shared(inp_pos_incremented, token_len_per_seq)}, 0); mask_sliced = std::make_shared(mask, zero_2d, stop, one_2d, axes); mask_sliced = std::make_shared(mask_sliced, ov::element::f16); diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 0c5a520b251..a438d6b7e18 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -57,6 +57,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin auto & core = ov_singleton_core(); const auto & config = ggml_openvino_get_compile_config(); static auto is_static = false; + static size_t stateful_kv_size = 0; // if (is_naive(cgraph)) { // return naive_compute(cgraph, core, device, config); @@ -106,9 +107,23 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin if (stateful) { const auto * inp_pos = get_inp_pos_tensor(cgraph); int32_t * pos_data = (int32_t *) inp_pos->data; + auto pos_shape = ggml_decoder->get_shape(inp_pos); if (pos_data[0] == 0) { infer_request->reset_state(); - } + stateful_kv_size = pos_shape[3]; + } else if (stateful_kv_size == pos_data[0]) { + stateful_kv_size += pos_shape[3]; + } else { + auto states = infer_request->query_state(); + for (auto state : states) { + auto state_tensor = state.get_state(); + ov::Coordinate begin = {0, 0, 0, 0}; + ov::Coordinate end = {state_tensor.get_shape()[0], pos_data[0], state_tensor.get_shape()[2], state_tensor.get_shape()[3]}; + ov::Tensor new_state_tensor(state_tensor, begin, end); + state.set_state(new_state_tensor); + } + stateful_kv_size = pos_data[0] + 1; + } } decoder_end_time = ggml_time_us(); From 5e6142f0c877f704ae8bdbe7a1e9ea3f663afbde Mon Sep 17 00:00:00 2001 From: Cavus Date: Tue, 10 Feb 2026 16:01:21 -0800 Subject: [PATCH 08/10] type casting fixes --- ggml/src/ggml-openvino/ggml-quants.cpp | 4 ++-- ggml/src/ggml-openvino/utils.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp index 8946b73a561..762f38ddce7 100644 --- a/ggml/src/ggml-openvino/ggml-quants.cpp +++ b/ggml/src/ggml-openvino/ggml-quants.cpp @@ -658,7 +658,7 @@ std::shared_ptr process_weight_tensor(const ggml_tensor * tensor, cons } else { // Requant to quantized format (Q4_0_128, Q8_0_32, etc.) ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8; - ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block}; + ov::Shape scale_shape = {node_shape[0], node_shape[1] / static_cast(layout.weights_per_block)}; // For symmetric quantization, biases are a single value instead of per-block ov::Shape bias_shape = layout.is_symmetric ? ov::Shape{} : scale_shape; @@ -680,7 +680,7 @@ std::shared_ptr process_weight_tensor(const ggml_tensor * tensor, cons } else { // Normal extraction path (no requant) ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8; - ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block}; + ov::Shape scale_shape = {node_shape[0], node_shape[1] / static_cast(layout.weights_per_block)}; // For symmetric quantization, biases are a single value instead of per-block ov::Shape bias_shape = layout.is_symmetric ? ov::Shape{} : scale_shape; diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index a438d6b7e18..8173a25955f 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -118,7 +118,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin for (auto state : states) { auto state_tensor = state.get_state(); ov::Coordinate begin = {0, 0, 0, 0}; - ov::Coordinate end = {state_tensor.get_shape()[0], pos_data[0], state_tensor.get_shape()[2], state_tensor.get_shape()[3]}; + ov::Coordinate end = {state_tensor.get_shape()[0], static_cast(pos_data[0]), state_tensor.get_shape()[2], state_tensor.get_shape()[3]}; ov::Tensor new_state_tensor(state_tensor, begin, end); state.set_state(new_state_tensor); } From 8f5ee213ae684584a0409591d8767a9e3797f32b Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Wed, 11 Feb 2026 15:32:12 -0800 Subject: [PATCH 09/10] Fix after rebase --- ggml/src/ggml-openvino/ggml-decoder.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 35963bc9f84..4b9429740c8 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -382,12 +382,6 @@ std::pair GgmlOvDecoder::compute_llm_params(ggml_cgr if (is_static && compute_params.output_len == 0) { compute_params.output_len = 1; } - auto * output_tensor = cgraph->nodes[cgraph->n_nodes - 1]; - compute_params.output_len = output_tensor->ne[1]; - // for NPU, output_len is always 1 except for llama-perplexity - if (is_static && compute_params.output_len == 0) { - compute_params.output_len = 1; - } model_params.ctx = model_params.ctx_per_seq * model_params.n_seq; model_params.ctx_swa = model_params.ctx_per_seq_swa * model_params.n_seq; return {model_params, compute_params}; From 9606b1f86b4e369665bafd42570cb0f9bd617017 Mon Sep 17 00:00:00 2001 From: Mustafa Cavus Date: Wed, 11 Feb 2026 15:34:40 -0800 Subject: [PATCH 10/10] Fix after rebase --- ggml/src/ggml-openvino/utils.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index ff0baa3ea27..7c403b7d890 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -20,7 +20,8 @@ struct graph_key { } bool operator==(const graph_key & other) const { - return n_nodes == other.n_nodes && cache_k_l0 == other.cache_k_l0; + return n_nodes == other.n_nodes && first_node_name == other.first_node_name && + last_node_name == other.last_node_name; } };