Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
2b6d2da
Add interface is_model_splitted() to check the c-graph is splited or not
zhaixuejun1993 Mar 6, 2026
813fe5f
Add member func named is_splited_model()
zhaixuejun1993 Mar 6, 2026
eb5dc53
Fix error in test ops
zhaixuejun1993 Mar 16, 2026
a528765
Add fun description
zhaixuejun1993 Mar 16, 2026
fbc3128
Infer and propagate dynamic-dimension indices for all tensors in the …
zhaixuejun1993 Mar 17, 2026
c397b1c
Thread safety per request only
cavusmustafa Mar 17, 2026
37f6bca
Merge branch 'dev_backend_openvino' into xuejun/ov-bk-add-func-is-spl…
zhaixuejun1993 Mar 18, 2026
be67f32
Merge pull request #71 from zhaixuejun1993/xuejun/ov-bk-add-func-is-s…
zhaixuejun1993 Mar 19, 2026
07029c1
Only do this for fallback sub graph
zhaixuejun1993 Mar 19, 2026
f4b663e
Merge pull request #76 from zhaixuejun1993/xuejun/fix_llama_cli-issue
zhaixuejun1993 Mar 19, 2026
b185b49
Use i4/i8 directly for symmetric quant
wine99 Mar 19, 2026
ee7c9f3
Use weightless caching
wine99 Mar 19, 2026
8930726
Add WeightlessCacheAttribute to reduce NPU memory usage
wine99 Mar 19, 2026
c13ca29
Move dynamic dims compute in graph missmatch
zhaixuejun1993 Mar 23, 2026
bb0028a
ggml-openvino: fix tensor data handling for PERMUTE/VIEW ops in split…
zhaixuejun1993 Mar 19, 2026
5c1ec64
ggml-openvino:add comments
zhaixuejun1993 Mar 19, 2026
ad8605e
ggml-openvino: override VIEW op_case to 0 for split model inputs
zhaixuejun1993 Mar 19, 2026
dc7ff7f
openvino backend: Handle unsupported VIEW shape-mismatch in OpenVINO …
zhaixuejun1993 Mar 19, 2026
b627d58
Fix sticky stateful config
wine99 Mar 19, 2026
2d032d8
Enable additional mul_mat tests and add tensor data saving function (…
zhaixuejun1993 Mar 23, 2026
6ce5e7a
Fix ROPE yarn case
wine99 Mar 24, 2026
ca1bd05
ggml-openvino: fix CONT/TRANSPOSE mapping and improve dynamic-dimensi…
zhaixuejun1993 Mar 26, 2026
6f0b803
OpenVINO: add NORM/TANH support and rework SOFT_MAX translation
zhaixuejun1993 Mar 28, 2026
0696172
ggml-openvino: extend VIEW handling
zhaixuejun1993 Mar 30, 2026
2ac36d6
Enable -fa off (#118)
wine99 Apr 2, 2026
f5a979e
added gelu support
cavusmustafa Apr 6, 2026
246edca
gelu support
cavusmustafa Apr 6, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
420 changes: 378 additions & 42 deletions ggml/src/ggml-openvino/ggml-decoder.cpp

Large diffs are not rendered by default.

31 changes: 23 additions & 8 deletions ggml/src/ggml-openvino/ggml-decoder.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#pragma once

#include "ggml-quants.h"
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml.h"
#include "openvino/decoder.h"

Expand All @@ -9,7 +10,6 @@
#include <map>
#include <memory>
#include <openvino/core/partial_shape.hpp>
#include <optional>
#include <vector>

struct ModelParams {
Expand Down Expand Up @@ -69,6 +69,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
std::map<std::string, std::shared_ptr<ov::Node>> & model_weights,
bool is_static,
bool is_stateful = false,
bool model_is_splitted = false,
bool is_prefill = false,
int prefill_chunk_size = 256);

Expand Down Expand Up @@ -106,10 +107,14 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {

virtual ov::element::Type get_output_type(int node_idx) const override;

virtual std::vector<size_t> get_output_stride(int node_idx) const override;

virtual int32_t * get_input_op_params(int node_idx, const std::string & name) const override;

virtual int32_t * get_output_op_params(int node_idx) const override;

virtual size_t get_output_op_offset(int node_idx) const override;

virtual std::vector<std::string> get_output_names(int node_idx) const override;

virtual const std::string & get_op_type() const override;
Expand All @@ -120,6 +125,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {

virtual const std::string & get_op_name(int node_idx) const override;

virtual int32_t get_op_dynamic_dim(int node_idx) const override;

virtual void visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>, int node_idx)> node_visitor) const override;

ggml_tensor * get_input_ggml_tensor(const std::string & name) const { return m_inputs.at(name); }
Expand Down Expand Up @@ -175,7 +182,11 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {

virtual bool is_stateful() const override { return m_is_stateful; }

ov::PartialShape get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const;
virtual bool is_splited_model() const override {
return m_model_is_splitted;
}

ov::PartialShape get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input, int dynamic_dim_index=-1) const;

static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename);

Expand Down Expand Up @@ -205,6 +216,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
bool m_is_prefill = false;
bool m_naive = false;
int m_prefill_chunk_size = 0;
bool m_model_is_splitted = false; // whether this cgraph is a sub-graph produced by splitting the model

static ov::Shape get_shape(const ggml_tensor * tensor);
static std::vector<size_t> get_stride(const ggml_tensor * tensor);
Expand All @@ -227,15 +239,17 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
}

inline static bool is_inp_mask(const ggml_tensor * tensor, const ggml_tensor * op) {
return op->op == GGML_OP_CPY || (op->op == GGML_OP_FLASH_ATTN_EXT && tensor == op->src[3]);
return op->op == GGML_OP_CPY || (op->op == GGML_OP_FLASH_ATTN_EXT && tensor == op->src[3]) ||
(op->op == GGML_OP_SOFT_MAX && tensor == op->src[1]);
}

inline static bool is_rope_freqs_weight(const ggml_tensor * tensor, const ggml_tensor * op) {
return op->op == GGML_OP_ROPE && tensor == op->src[2];
}

inline static bool is_kvcache(const ggml_tensor * tensor, const ggml_tensor * op) {
return op->op == GGML_OP_SET_ROWS && op->src[2] == tensor;
return (op->op == GGML_OP_SET_ROWS && op->src[2] == tensor) ||
tensor->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY;
}

inline static bool is_kv_idx(const ggml_tensor * tensor, const ggml_tensor * op) {
Expand All @@ -256,9 +270,6 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
if (is_inp_emb(tensor, op)) {
return "embd";
}
if (is_output_idx(tensor, op)) {
return "inp_out_ids";
}
if (is_inp_mask(tensor, op)) {
return std::string(tensor->name).find("swa") == std::string::npos ? "self_kq_mask" : "self_kq_mask_swa";
}
Expand All @@ -272,6 +283,9 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
void compute_model_inputs();
void compute_model_outputs();

// Infer and propagate dynamic-dimension indices for all tensors in the GGML graph.
void compute_node_dynamic_dims();

void validate_cgraph() const;

ggml_cgraph * m_cgraph = nullptr;
Expand All @@ -284,6 +298,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
std::map<std::string, ggml_tensor *> m_model_outputs;
std::vector<std::string> m_model_output_names;
std::vector<NodeInfo> m_node_info_list;
std::map<ggml_tensor *, int> m_node_dynamic_dims;

ModelParams m_model_params;
ComputeParams m_compute_params;
Expand Down
29 changes: 18 additions & 11 deletions ggml/src/ggml-openvino/ggml-openvino-extra.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <cstring>
#include <openvino/runtime/intel_gpu/ocl/ocl.hpp>
#include <openvino/runtime/intel_npu/level_zero/level_zero.hpp>
#include <openvino/runtime/properties.hpp>
#include <optional>

ov::Core & ov_singleton_core() {
Expand Down Expand Up @@ -42,11 +43,13 @@ void ggml_openvino_device_config::init() {
{"NPUW_DQ", "YES" },
{"NPUW_DQ_FULL", "NO" },
};
if (cache_dir) {
if (cache_dir && strlen(cache_dir) > 0) {
compile_config["NPUW_CACHE_DIR"] = cache_dir;
compile_config.insert(ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE));
}
} else if (cache_dir) {
ov_singleton_core().set_property(ov::cache_dir(cache_dir));
} else if (cache_dir && strlen(cache_dir) > 0) {
compile_config.insert(ov::cache_dir(cache_dir));
compile_config.insert(ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE));
}

// Initialize remote context with queue sharing for GPU
Expand Down Expand Up @@ -259,10 +262,12 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
int64_t n_blocks = n_elements / layout.weights_per_block;
layout.scales_size = n_blocks * sizeof(uint16_t);
// For symmetric quantization, we only need one zp value (not one per block)
// Zero points are stored in U4 or U8 format matching the weight type
size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks;
layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements;
// For symmetric quantization, no zp needed (weights stored as signed)
if (layout.is_symmetric) {
layout.zp_size = 0;
} else {
layout.zp_size = layout.is_u4 ? ((n_blocks + 1) / 2) : n_blocks;
}

layout.weights_offset = 0;
layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
Expand Down Expand Up @@ -313,10 +318,12 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
// Scales: F16 per block
int64_t n_blocks = n_elements / layout.weights_per_block;
layout.scales_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes
// Zero points: U4 or U8 matching weight type
// For symmetric quantization, we only need one zp value (not one per block)
size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks;
layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements;
// For symmetric quantization, no zp needed (weights stored as signed)
if (layout.is_symmetric) {
layout.zp_size = 0;
} else {
layout.zp_size = layout.is_u4 ? ((n_blocks + 1) / 2) : n_blocks;
}

// Layout in buffer: [weights | scales | zp] with alignment
layout.weights_offset = 0;
Expand Down
46 changes: 20 additions & 26 deletions ggml/src/ggml-openvino/ggml-openvino.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,13 +143,18 @@ static void * ggml_backend_openvino_buffer_get_base(ggml_backend_buffer_t buffer
return ctx->data;
}

static bool is_stateful_enabled() {
static const auto * stateful = getenv("GGML_OPENVINO_STATEFUL_EXECUTION");
return stateful && strcmp(stateful, "1") == 0;
}

static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
// GGML_LOG_DEBUG("%s: buffer usage=%d, tensor name=%s\n", __func__, buffer->usage, tensor->name);
ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;

// Put kvcache on device memory for GPU (NPU memory is too small even for kvcache)
if (strncmp(tensor->name, "cache_", 6) == 0 && !ctx->is_remote && ggml_openvino_get_device_name() == "GPU" &&
!getenv("GGML_OPENVINO_STATEFUL_EXECUTION")) {
!is_stateful_enabled()) {
GGML_ASSERT(ctx->tensor_extras.empty());
auto device = ctx->device;
auto size = ctx->size;
Expand Down Expand Up @@ -664,7 +669,7 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device) {

std::shared_ptr<ov_runtime_context> r_ctx = std::static_pointer_cast<ov_runtime_context>(ctx->runtime_context);
r_ctx->device = ggml_openvino_get_device_name();
r_ctx->stateful = getenv("GGML_OPENVINO_STATEFUL_EXECUTION") && !ggml_openvino_is_npu();
r_ctx->stateful = is_stateful_enabled() && !ggml_openvino_is_npu();

ggml_backend_t openvino_backend = new ggml_backend{
/* .guid = */ ggml_backend_openvino_guid(),
Expand Down Expand Up @@ -800,15 +805,6 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
// GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with sinks\n");
return true;
}
float scale = 1.0f;
float max_bias = 0.0f;
const auto * op_params = op->op_params;
memcpy(&scale, (const float *) op_params + 0, sizeof(float));
memcpy(&max_bias, (const float *) op_params + 1, sizeof(float));
if (max_bias > 0) {
// GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with max_bias > 0\n");
return true;
}
break;
}
case GGML_OP_FLASH_ATTN_EXT: {
Expand Down Expand Up @@ -860,9 +856,6 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
if (op->src[0]->ne[3] != op->src[1]->ne[3] && op->src[0]->ne[3] != 1 && op->src[1]->ne[3] != 1) {
return true;
}
if (op->src[0]->op == GGML_OP_PERMUTE || op->src[1]->op == GGML_OP_PERMUTE) {
return true;
}
if (ggml_is_quantized(op->src[0]->type) && op->src[0]->ne[1] == 1) {
// MUL_MAT(type_a=q4_0,type_b=f32,m=1,n=2048,k=8192,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1)
// triggers a bug in ov matmul_shape_inference.hpp
Expand Down Expand Up @@ -890,14 +883,6 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
// GGML_LOG_WARN("OpenVINO backend does not support ROPE with type %s\n", ggml_type_name(op->type));
return true;
}
float freq_scale;
float ext_factor;
memcpy(&freq_scale, op_params + 6, sizeof(float));
memcpy(&ext_factor, op_params + 7, sizeof(float));
if (ext_factor != 0.0f) {
// GGML_LOG_WARN("OpenVINO backend does not support ROPE with ext_factor %f != 0.0f\n", ext_factor);
return true;
}
if (op->src[0]->op == GGML_OP_VIEW) {
if (op->src[0]->view_src->ne[1] != op->src[0]->ne[2]) {
// GGML_LOG_WARN(
Expand All @@ -909,6 +894,14 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
}
break;
}
case GGML_OP_TRANSPOSE: {
// BF16 tensors are not supported by the OpenVINO TRANSPOSE mapping; report the op as unsupported.
if (op->type == GGML_TYPE_BF16) {
// GGML_LOG_WARN("OpenVINO backend does not support TRANSPOSE with BF16 type\n");
return true;
}
break;
}
default:
break;
}
Expand All @@ -930,13 +923,14 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
GGML_TYPE_Q5_K, GGML_TYPE_Q8_0, GGML_TYPE_Q6_K};

static const std::set<ggml_op> supported_ops{GGML_OP_NONE, GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT, GGML_OP_VIEW,
/*GGML_OP_CONT,*/ GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE,
GGML_OP_GET_ROWS, GGML_OP_ROPE, GGML_OP_RMS_NORM, GGML_OP_SCALE,
// softmax is not updated due to replaced by flash_attn_ext
// GGML_OP_SOFT_MAX,
GGML_OP_CONT, GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE,
GGML_OP_GET_ROWS, GGML_OP_ROPE, GGML_OP_RMS_NORM, GGML_OP_SCALE, GGML_OP_NORM,
GGML_OP_SOFT_MAX,
GGML_OP_SET_ROWS, GGML_OP_FLASH_ATTN_EXT, GGML_OP_CPY};
static const std::set<ggml_unary_op> supported_unary_ops{
GGML_UNARY_OP_GELU,
GGML_UNARY_OP_SILU,
GGML_UNARY_OP_TANH,
};
static const std::set<ggml_glu_op> supported_glu_ops{
GGML_GLU_OP_SWIGLU,
Expand Down
Loading