diff --git a/Makefile b/Makefile index 9befa1bc8dc..24b43e945bf 100644 --- a/Makefile +++ b/Makefile @@ -686,10 +686,6 @@ SDCPP_FLAGS := -I./vendor/nlohmann # sd.cpp objects sdcpp_default.o: otherarch/sdcpp/sdtype_adapter.cpp $(SDCPP_COMMON_SOURCES) $(CXX) $(CXXFLAGS) $(SDCPP_FLAGS) -c $< -o $@ -sdcpp_cublas.o: otherarch/sdcpp/sdtype_adapter.cpp $(SDCPP_COMMON_SOURCES) - $(CXX) $(CXXFLAGS) $(SDCPP_FLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@ -sdcpp_vulkan.o: otherarch/sdcpp/sdtype_adapter.cpp $(SDCPP_COMMON_SOURCES) - $(CXX) $(CXXFLAGS) $(SDCPP_FLAGS) $(VULKAN_FLAGS) -c $< -o $@ #whisper objects @@ -873,7 +869,7 @@ koboldcpp_noavx2: endif ifdef CUBLAS_BUILD -koboldcpp_cublas: ggml_v4_cublas.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o tts_default.o music_default.o embeddings_default.o llavaclip_cublas.o llava.o ggml-backend.o ggml-backend-meta.o ggml-backend-reg_cublas.o ggml-repack.o $(CUBLAS_OBJS) $(OBJS_FULL) $(OBJS) +koboldcpp_cublas: ggml_v4_cublas.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_default.o whispercpp_cublas.o tts_default.o music_default.o embeddings_default.o llavaclip_cublas.o llava.o ggml-backend.o ggml-backend-meta.o ggml-backend-reg_cublas.o ggml-repack.o $(CUBLAS_OBJS) $(OBJS_FULL) $(OBJS) $(CUBLAS_BUILD) else koboldcpp_cublas: @@ -881,7 +877,7 @@ koboldcpp_cublas: endif ifdef HIPBLAS_BUILD -koboldcpp_hipblas: ggml_v4_cublas.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_cublas.o whispercpp_cublas.o tts_default.o music_default.o embeddings_default.o llavaclip_cublas.o llava.o ggml-backend.o ggml-backend-meta.o ggml-backend-reg_cublas.o ggml-repack.o $(HIP_OBJS) $(OBJS_FULL) $(OBJS) +koboldcpp_hipblas: ggml_v4_cublas.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o ggml_v3_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o gpttype_adapter_cublas.o sdcpp_default.o whispercpp_cublas.o tts_default.o music_default.o embeddings_default.o llavaclip_cublas.o llava.o ggml-backend.o ggml-backend-meta.o ggml-backend-reg_cublas.o ggml-repack.o $(HIP_OBJS) $(OBJS_FULL) $(OBJS) $(HIPBLAS_BUILD) else koboldcpp_hipblas: @@ -889,12 +885,12 @@ koboldcpp_hipblas: endif ifdef VULKAN_BUILD -koboldcpp_vulkan: ggml_v4_vulkan.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter_vulkan.o ggml-vulkan.o ggml-vulkan-shaders.o sdcpp_vulkan.o whispercpp_vulkan.o tts_default.o music_default.o embeddings_default.o llavaclip_vulkan.o llava.o ggml-backend.o ggml-backend-meta.o ggml-backend-reg_vulkan.o ggml-repack.o $(OBJS_FULL) $(OBJS) +koboldcpp_vulkan: ggml_v4_vulkan.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter_vulkan.o ggml-vulkan.o ggml-vulkan-shaders.o sdcpp_default.o whispercpp_vulkan.o tts_default.o music_default.o embeddings_default.o llavaclip_vulkan.o llava.o ggml-backend.o ggml-backend-meta.o ggml-backend-reg_vulkan.o ggml-repack.o $(OBJS_FULL) $(OBJS) $(VULKAN_BUILD) ifdef NOAVX2_BUILD -koboldcpp_vulkan_noavx2: ggml_v4_vulkan_noavx2.o ggml-cpu_v4_noavx2.o ggml-ops-noavx2.o ggml-vec-noavx2.o ggml-binops.o ggml-unops.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_vulkan_noavx2.o ggml-vulkan-noext.o ggml-vulkan-shaders-noext.o sdcpp_vulkan.o whispercpp_vulkan.o tts_default.o music_default.o embeddings_default.o llavaclip_vulkan.o llava.o ggml-backend.o ggml-backend-meta.o ggml-backend-reg_vulkan.o ggml-repack.o $(OBJS_SIMPLE) $(OBJS) +koboldcpp_vulkan_noavx2: ggml_v4_vulkan_noavx2.o ggml-cpu_v4_noavx2.o ggml-ops-noavx2.o ggml-vec-noavx2.o ggml-binops.o ggml-unops.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_vulkan_noavx2.o ggml-vulkan-noext.o ggml-vulkan-shaders-noext.o sdcpp_default.o whispercpp_vulkan.o tts_default.o music_default.o embeddings_default.o llavaclip_vulkan.o llava.o ggml-backend.o ggml-backend-meta.o ggml-backend-reg_vulkan.o ggml-repack.o $(OBJS_SIMPLE) $(OBJS) $(VULKAN_BUILD) -koboldcpp_vulkan_failsafe: ggml_v4_vulkan_failsafe.o ggml-cpu_v4_failsafe.o ggml-ops-failsafe.o ggml-vec-failsafe.o ggml-binops.o ggml-unops.o ggml_v3_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o gpttype_adapter_vulkan_noavx2.o ggml-vulkan-noext.o ggml-vulkan-shaders-noext.o sdcpp_vulkan.o whispercpp_vulkan.o tts_default.o music_default.o embeddings_default.o llavaclip_vulkan.o llava.o ggml-backend.o ggml-backend-meta.o ggml-backend-reg_vulkan.o ggml-repack.o $(OBJS_SIMPLER) $(OBJS) +koboldcpp_vulkan_failsafe: ggml_v4_vulkan_failsafe.o ggml-cpu_v4_failsafe.o ggml-ops-failsafe.o ggml-vec-failsafe.o ggml-binops.o ggml-unops.o ggml_v3_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o gpttype_adapter_vulkan_noavx2.o ggml-vulkan-noext.o ggml-vulkan-shaders-noext.o sdcpp_default.o whispercpp_vulkan.o tts_default.o music_default.o embeddings_default.o llavaclip_vulkan.o llava.o ggml-backend.o ggml-backend-meta.o ggml-backend-reg_vulkan.o ggml-repack.o $(OBJS_SIMPLER) $(OBJS) $(VULKAN_BUILD) else koboldcpp_vulkan_noavx2: diff --git a/otherarch/sdcpp/common_block.hpp b/otherarch/sdcpp/common_block.hpp index 112a4d7a1e4..e6c0b06bd0c 100644 --- a/otherarch/sdcpp/common_block.hpp +++ b/otherarch/sdcpp/common_block.hpp @@ -1,7 +1,9 @@ #ifndef __COMMON_BLOCK_HPP__ #define __COMMON_BLOCK_HPP__ +#include "ggml-backend.h" #include "ggml_extend.hpp" +#include "util.h" class DownSampleBlock : public GGMLBlock { protected: @@ -248,9 +250,6 @@ class FeedForward : public GGMLBlock { float scale = 1.f; if (precision_fix) { scale = 1.f / 128.f; -#ifdef SD_USE_VULKAN - force_prec_f32 = true; -#endif } // The purpose of the scale here is to prevent NaN issues in certain situations. // For example, when using Vulkan without enabling force_prec_f32, @@ -264,6 +263,9 @@ class FeedForward : public GGMLBlock { auto net_0 = std::dynamic_pointer_cast(blocks["net.0"]); auto net_2 = std::dynamic_pointer_cast(blocks["net.2"]); + if (sd_backend_is(ctx->backend, "Vulkan")) { + net_2->set_force_prec_f32(true); + } x = net_0->forward(ctx, x); // [ne3, ne2, ne1, inner_dim] x = net_2->forward(ctx, x); // [ne3, ne2, ne1, dim_out] diff --git a/otherarch/sdcpp/ggml_extend.hpp b/otherarch/sdcpp/ggml_extend.hpp index b559f58bbf3..00e4e5292d3 100644 --- a/otherarch/sdcpp/ggml_extend.hpp +++ b/otherarch/sdcpp/ggml_extend.hpp @@ -24,32 +24,12 @@ #include "ggml-alloc.h" #include "ggml-backend.h" -#include "ggml-cpu.h" #include "ggml.h" +#include "ggml_extend_backend.hpp" #include "model.h" #include "tensor.hpp" -#ifdef SD_USE_CUDA -#include "ggml-cuda.h" -#endif - -#ifdef SD_USE_METAL -#include "ggml-metal.h" -#endif - -#ifdef SD_USE_VULKAN -#include "ggml-vulkan.h" -#endif - -#ifdef SD_USE_OPENCL -#include "ggml-opencl.h" -#endif - -#ifdef SD_USE_SYCL -#include "ggml-sycl.h" -#endif - #include "rng.hpp" #include "tensor_ggml.hpp" #include "util.h" @@ -91,6 +71,48 @@ __STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const cha } } +__STATIC_INLINE__ bool backend_name_exists(std::string name) { + ggml_backend_load_all_once(); + const size_t device_count = ggml_backend_dev_count(); + for (size_t i = 0; i < device_count; ++i) { + if (name == ggml_backend_dev_name(ggml_backend_dev_get(i))) { + return true; + } + } + return false; +} + +__STATIC_INLINE__ std::string sanitize_backend_name(std::string name) { + if (name == "" || backend_name_exists(name)) { + return name; + } else { + LOG_WARN("Backend %s not found, using default backend", name.c_str()); + return ""; + } +} + +__STATIC_INLINE__ std::string get_default_backend_name() { + ggml_backend_load_all_once(); + // should pick the same backend as ggml_backend_init_best + ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU); + dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU); + dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + if (dev == nullptr) { + return ""; + } + return ggml_backend_dev_name(dev); +} + +__STATIC_INLINE__ ggml_backend_t init_named_backend(std::string name = "") { + ggml_backend_load_all_once(); + LOG_DEBUG("Initializing backend: %s", name.c_str()); + if (name.empty()) { + return ggml_backend_init_best(); + } else { + return ggml_backend_init_by_name(name.c_str(), nullptr); + } +} + static_assert(GGML_MAX_NAME >= 128, "GGML_MAX_NAME must be at least 128"); // n-mode tensor-matrix product @@ -1286,25 +1308,25 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_ones_like(ggml_context* ctx, return ggml_ext_ones(ctx, x->ne[0], x->ne[1], x->ne[2], x->ne[3]); } -__STATIC_INLINE__ ggml_tensor* ggml_ext_cast_f32(ggml_context* ctx, ggml_tensor* a) { -#ifdef SD_USE_VULKAN - auto zero_index = ggml_get_tensor(ctx, "ggml_runner_build_in_tensor:zero_int"); - auto out = ggml_reshape_1d(ctx, a, ggml_nelements(a)); - out = ggml_get_rows(ctx, out, zero_index); - out = ggml_reshape(ctx, out, a); - // auto out = ggml_cast(ctx, a, GGML_TYPE_F32); - return out; -#else - auto out = ggml_reshape_2d(ctx, a, 1, ggml_nelements(a)); - ggml_tensor* one = ggml_ext_ones(ctx, 1, 1, 1, 1); // [1,] - if (ggml_is_transposed(out)) { - out = ggml_mul_mat(ctx, one, out); +__STATIC_INLINE__ ggml_tensor* ggml_ext_cast_f32(ggml_context* ctx, ggml_backend_t backend, ggml_tensor* a) { + if (sd_backend_is(backend, "Vulkan")) { + auto zero_index = ggml_get_tensor(ctx, "ggml_runner_build_in_tensor:zero_int"); + auto out = ggml_reshape_1d(ctx, a, ggml_nelements(a)); + out = ggml_get_rows(ctx, out, zero_index); + out = ggml_reshape(ctx, out, a); + // auto out = ggml_cast(ctx, a, GGML_TYPE_F32); + return out; } else { - out = ggml_mul_mat(ctx, out, one); + auto out = ggml_reshape_2d(ctx, a, 1, ggml_nelements(a)); + ggml_tensor* one = ggml_ext_ones(ctx, 1, 1, 1, 1); // [1,] + if (ggml_is_transposed(out)) { + out = ggml_mul_mat(ctx, one, out); + } else { + out = ggml_mul_mat(ctx, out, one); + } + out = ggml_reshape(ctx, out, a); + return out; } - out = ggml_reshape(ctx, out, a); -#endif - return out; } // q: [N, L_q, C(n_head*d_head)] or [N*n_head, L_q, d_head] @@ -1496,16 +1518,14 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_group_norm(ggml_context* ctx, } __STATIC_INLINE__ void ggml_ext_backend_tensor_get_and_sync(ggml_backend_t backend, const ggml_tensor* tensor, void* data, size_t offset, size_t size) { -#if defined(SD_USE_CUDA) || defined(SD_USE_SYCL) - if (!ggml_backend_is_cpu(backend)) { + if ((sd_backend_is(backend, "ROCm") || sd_backend_is(backend, "CUDA") || sd_backend_is(backend, "SYCL")) && + !ggml_backend_is_cpu(backend)) { ggml_backend_tensor_get_async(backend, tensor, data, offset, size); ggml_backend_synchronize(backend); - } else { - ggml_backend_tensor_get(tensor, data, offset, size); + return; } -#else + ggml_backend_tensor_get(tensor, data, offset, size); -#endif } __STATIC_INLINE__ float ggml_ext_backend_tensor_get_f32(ggml_tensor* tensor) { @@ -1664,14 +1684,15 @@ struct WeightAdapter { float scale = 1.f; } conv2d; }; - virtual ggml_tensor* patch_weight(ggml_context* ctx, ggml_tensor* weight, const std::string& weight_name) = 0; + virtual ggml_tensor* patch_weight(ggml_context* ctx, ggml_backend_t backend, ggml_tensor* weight, const std::string& weight_name) = 0; virtual ggml_tensor* forward_with_lora(ggml_context* ctx, + ggml_backend_t backend, ggml_tensor* x, ggml_tensor* w, ggml_tensor* b, const std::string& prefix, - ForwardParams forward_params) = 0; - virtual size_t get_extra_graph_size() = 0; + ForwardParams forward_params) = 0; + virtual size_t get_extra_graph_size() = 0; }; struct GGMLRunnerContext { @@ -2192,6 +2213,14 @@ struct GGMLRunner { void set_weight_adapter(const std::shared_ptr& adapter) { weight_adapter = adapter; } + + ggml_backend_t get_runtime_backend() { + return runtime_backend; + } + + ggml_backend_t get_params_backend() { + return params_backend; + } }; class GGMLBlock { @@ -2336,6 +2365,14 @@ class Linear : public UnaryBlock { force_prec_f32(force_prec_f32), scale(scale) {} + void set_scale(float scale_) { + scale = scale_; + } + + void set_force_prec_f32(bool force_prec_f32_) { + force_prec_f32 = force_prec_f32_; + } + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { ggml_tensor* w = params["weight"]; ggml_tensor* b = nullptr; @@ -2347,7 +2384,7 @@ class Linear : public UnaryBlock { forward_params.op_type = WeightAdapter::ForwardParams::op_type_t::OP_LINEAR; forward_params.linear.force_prec_f32 = force_prec_f32; forward_params.linear.scale = scale; - return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, x, w, b, prefix, forward_params); + return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, ctx->backend, x, w, b, prefix, forward_params); } return ggml_ext_linear(ctx->ggml_ctx, x, w, b, force_prec_f32, scale); } @@ -2463,7 +2500,7 @@ class Conv2d : public UnaryBlock { forward_params.conv2d.circular_x = ctx->circular_x_enabled; forward_params.conv2d.circular_y = ctx->circular_y_enabled; forward_params.conv2d.scale = scale; - return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, x, w, b, prefix, forward_params); + return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, ctx->backend, x, w, b, prefix, forward_params); } return ggml_ext_conv_2d(ctx->ggml_ctx, x, @@ -2527,7 +2564,7 @@ class Conv3d : public UnaryBlock { ggml_tensor* w = params["weight"]; ggml_tensor* b = nullptr; if (ctx->weight_adapter) { - w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight"); + w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, w, prefix + "weight"); if (w->type != GGML_TYPE_F16) { w = ggml_cast(ctx->ggml_ctx, w, GGML_TYPE_F16); } @@ -2535,7 +2572,7 @@ class Conv3d : public UnaryBlock { if (bias) { b = params["bias"]; if (ctx->weight_adapter) { - b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, b, prefix + "bias"); + b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, b, prefix + "bias"); } } return ggml_ext_conv_3d(ctx->ggml_ctx, x, w, b, in_channels, @@ -2582,12 +2619,12 @@ class LayerNorm : public UnaryBlock { if (elementwise_affine) { w = params["weight"]; if (ctx->weight_adapter) { - w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight"); + w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, w, prefix + "weight"); } if (bias) { b = params["bias"]; if (ctx->weight_adapter) { - b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, b, prefix + "bias"); + b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, b, prefix + "bias"); } } } @@ -2630,8 +2667,8 @@ class GroupNorm : public GGMLBlock { w = params["weight"]; b = params["bias"]; if (ctx->weight_adapter) { - w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight"); - b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, b, prefix + "bias"); + w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, w, prefix + "weight"); + b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, b, prefix + "bias"); } } return ggml_ext_group_norm(ctx->ggml_ctx, x, w, b, num_groups); @@ -2665,7 +2702,7 @@ class RMSNorm : public UnaryBlock { ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { ggml_tensor* w = params["weight"]; if (ctx->weight_adapter) { - w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight"); + w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, w, prefix + "weight"); } x = ggml_rms_norm(ctx->ggml_ctx, x, eps); x = ggml_mul_inplace(ctx->ggml_ctx, x, w); @@ -2748,6 +2785,7 @@ class MultiheadAttention : public GGMLBlock { __STATIC_INLINE__ ggml_tensor* ggml_ext_lokr_forward( ggml_context* ctx, + ggml_backend_t backend, ggml_tensor* h, // Input: [q, batch] or [W, H, q, batch] ggml_tensor* w1, // Outer C (Full rank) ggml_tensor* w1a, // Outer A (Low rank part 1) @@ -2768,7 +2806,7 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_lokr_forward( int vq = q_actual / uq; int vp = (w2 != nullptr) ? (is_conv ? (int)w2->ne[3] : (int)w2->ne[1]) - : (int)w2a->ne[1]; + : (int)w2a->ne[1]; GGML_ASSERT(q_actual == (uq * vq) && "Input dimension mismatch for LoKR split"); ggml_tensor* hb; @@ -2778,29 +2816,29 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_lokr_forward( int merge_batch_uq = batch; int merge_batch_vp = batch; -#if SD_USE_VULKAN - if (batch > 1) { - // no access to backend here, worst case is slightly worse perfs for other backends when built alongside Vulkan backend - int max_batch = 65535; - int max_batch_uq = max_batch / uq; - merge_batch_uq = 1; - for (int i = max_batch_uq; i > 0; i--) { - if (batch % i == 0) { - merge_batch_uq = i; - break; + if (sd_backend_is(backend, "Vulkan")) { + if (batch > 1) { + // no access to backend here, worst case is slightly worse perfs for other backends when built alongside Vulkan backend + int max_batch = 65535; + int max_batch_uq = max_batch / uq; + merge_batch_uq = 1; + for (int i = max_batch_uq; i > 0; i--) { + if (batch % i == 0) { + merge_batch_uq = i; + break; + } } - } - int max_batch_vp = max_batch / vp; - merge_batch_vp = 1; - for (int i = max_batch_vp; i > 0; i--) { - if (batch % i == 0) { - merge_batch_vp = i; - break; + int max_batch_vp = max_batch / vp; + merge_batch_vp = 1; + for (int i = max_batch_vp; i > 0; i--) { + if (batch % i == 0) { + merge_batch_vp = i; + break; + } } } } -#endif ggml_tensor* h_split = ggml_reshape_3d(ctx, h, vq, uq * merge_batch_uq, batch / merge_batch_uq); if (w2 != nullptr) { diff --git a/otherarch/sdcpp/ggml_extend_backend.hpp b/otherarch/sdcpp/ggml_extend_backend.hpp new file mode 100644 index 00000000000..50158c883bf --- /dev/null +++ b/otherarch/sdcpp/ggml_extend_backend.hpp @@ -0,0 +1,298 @@ +#ifndef __GGML_EXTEND_BACKEND_HPP__ +#define __GGML_EXTEND_BACKEND_HPP__ + +#include +#include + +#include "ggml-backend.h" +#include "ggml.h" + +#ifndef __STATIC_INLINE__ +#define __STATIC_INLINE__ static inline +#endif + +inline void ggml_backend_load_all_once() { + // If the registry already has devices and the CPU backend is present, + // assume either static registration or explicit host-side preloading has + // completed and avoid rescanning the default paths. + if (ggml_backend_dev_count() > 0 && ggml_backend_reg_by_name("CPU") != nullptr) { + return; + } + // In dynamic-backend mode the backend modules are discovered at runtime, + // so we must load them before asking for the CPU backend or its proc table. + // If the host preloaded only a subset of backends, allow one default-path + // scan so missing modules can still be discovered. + static std::once_flag once; + std::call_once(once, []() { + if (ggml_backend_dev_count() > 0 && ggml_backend_reg_by_name("CPU") != nullptr) { + return; + } + ggml_backend_load_all(); + }); +} + +// Do not gate this branch on GGML_CPU or GGML_CPU_ALL_VARIANTS: +// those are CMake options used to configure ggml itself, but they are not +// exported as PUBLIC compile definitions to stable-diffusion in backend-DL mode. +// In practice, this target can reliably see GGML_BACKEND_DL, but not whether +// the CPU backend was compiled as a loadable module. We therefore use runtime +// backend discovery instead of compile-time assumptions. + +__STATIC_INLINE__ ggml_backend_reg_t ggml_backend_cpu_reg() { + ggml_backend_reg_t reg = ggml_backend_reg_by_name("CPU"); + if (reg != nullptr) { + return reg; + } + + ggml_backend_load_all_once(); + return ggml_backend_reg_by_name("CPU"); +} + +__STATIC_INLINE__ ggml_backend_reg_t ggml_backend_reg_from_backend(ggml_backend_t backend) { + if (backend != nullptr) { + ggml_backend_dev_t device = ggml_backend_get_device(backend); + if (device != nullptr) { + return ggml_backend_dev_backend_reg(device); + } + } + + return ggml_backend_cpu_reg(); +} + +__STATIC_INLINE__ ggml_backend_t ggml_backend_cpu_init() { + ggml_backend_t backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); + if (backend != nullptr) { + return backend; + } + + ggml_backend_load_all_once(); + return ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); +} + +__STATIC_INLINE__ bool ggml_backend_is_cpu(ggml_backend_t backend) { + if (backend == nullptr) { + return false; + } + + ggml_backend_dev_t device = ggml_backend_get_device(backend); + if (device != nullptr) { + return ggml_backend_dev_type(device) == GGML_BACKEND_DEVICE_TYPE_CPU; + } + + const char* backend_name = ggml_backend_name(backend); + return backend_name != nullptr && std::strcmp(backend_name, "CPU") == 0; +} + +__STATIC_INLINE__ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { + ggml_backend_reg_t reg = ggml_backend_reg_from_backend(backend_cpu); + if (reg == nullptr) { + return; + } + + auto fn = reinterpret_cast(ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads")); + if (fn != nullptr) { + fn(backend_cpu, n_threads); + } +} + +using __ggml_backend_cpu_set_threadpool_t = void (*)(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool); + +__STATIC_INLINE__ void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) { + ggml_backend_reg_t reg = ggml_backend_reg_from_backend(backend_cpu); + if (reg == nullptr) { + return; + } + + auto fn = reinterpret_cast<__ggml_backend_cpu_set_threadpool_t>(ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool")); + if (fn != nullptr) { + fn(backend_cpu, threadpool); + } +} + +__STATIC_INLINE__ void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void* abort_callback_data) { + ggml_backend_reg_t reg = ggml_backend_reg_from_backend(backend_cpu); + if (reg == nullptr) { + return; + } + + auto fn = reinterpret_cast(ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback")); + if (fn != nullptr) { + fn(backend_cpu, abort_callback, abort_callback_data); + } +} + +__STATIC_INLINE__ ggml_backend_buffer_t ggml_backend_tensor_buffer(const struct ggml_tensor* tensor) { + if (tensor == nullptr) { + return nullptr; + } + + return tensor->view_src ? tensor->view_src->buffer : tensor->buffer; +} + +__STATIC_INLINE__ bool ggml_backend_tensor_is_host_accessible(const struct ggml_tensor* tensor) { + if (tensor == nullptr || tensor->data == nullptr) { + return false; + } + + ggml_backend_buffer_t buffer = ggml_backend_tensor_buffer(tensor); + return buffer == nullptr || ggml_backend_buffer_is_host(buffer); +} + +__STATIC_INLINE__ size_t ggml_backend_tensor_offset(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { + return (size_t)(i0 * tensor->nb[0] + i1 * tensor->nb[1] + i2 * tensor->nb[2] + i3 * tensor->nb[3]); +} + +template +__STATIC_INLINE__ void ggml_backend_tensor_write_scalar(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3, T value) { + const size_t offset = ggml_backend_tensor_offset(tensor, i0, i1, i2, i3); + + if (ggml_backend_tensor_is_host_accessible(tensor)) { + auto* dst = reinterpret_cast(reinterpret_cast(tensor->data) + offset); + *dst = value; + return; + } + + ggml_backend_tensor_set(const_cast(tensor), &value, offset, sizeof(T)); +} + +__STATIC_INLINE__ void ggml_set_f32_nd(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3, float value) { + switch (tensor->type) { + case GGML_TYPE_I8: + ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast(value)); + break; + case GGML_TYPE_I16: + ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast(value)); + break; + case GGML_TYPE_I32: + ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast(value)); + break; + case GGML_TYPE_F16: + ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, ggml_fp32_to_fp16(value)); + break; + case GGML_TYPE_BF16: + ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, ggml_fp32_to_bf16(value)); + break; + case GGML_TYPE_F32: + ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, value); + break; + default: + GGML_ABORT("fatal error"); + } +} + +__STATIC_INLINE__ void ggml_set_f32_1d(const struct ggml_tensor* tensor, int i, float value) { + if (!ggml_is_contiguous(tensor)) { + int64_t id[4] = {0, 0, 0, 0}; + ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); + ggml_set_f32_nd(tensor, id[0], id[1], id[2], id[3], value); + return; + } + + switch (tensor->type) { + case GGML_TYPE_I8: + ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, static_cast(value)); + break; + case GGML_TYPE_I16: + ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, static_cast(value)); + break; + case GGML_TYPE_I32: + ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, static_cast(value)); + break; + case GGML_TYPE_F16: + ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, ggml_fp32_to_fp16(value)); + break; + case GGML_TYPE_BF16: + ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, ggml_fp32_to_bf16(value)); + break; + case GGML_TYPE_F32: + ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, value); + break; + default: + GGML_ABORT("fatal error"); + } +} + +__STATIC_INLINE__ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context* ctx, struct ggml_cgraph* cgraph, int n_threads) { + (void)ctx; + + // The legacy ggml_graph_compute_with_ctx() symbol lives in ggml-cpu, but + // the backend proc table does not expose it in GGML_BACKEND_DL mode. + // Recreate the old behavior by initializing the CPU backend explicitly and + // executing the graph through the generic backend API. + ggml_backend_t backend = ggml_backend_cpu_init(); + if (backend == nullptr) { + return GGML_STATUS_ALLOC_FAILED; + } + + ggml_backend_cpu_set_n_threads(backend, n_threads); + + const enum ggml_status status = ggml_backend_graph_compute(backend, cgraph); + ggml_backend_free(backend); + + return status; +} + +__STATIC_INLINE__ ggml_tensor* ggml_set_f32(struct ggml_tensor* tensor, float value) { + GGML_ASSERT(tensor != nullptr); + + if (ggml_backend_tensor_is_host_accessible(tensor) && ggml_is_contiguous(tensor)) { + const int64_t nelements = ggml_nelements(tensor); + + switch (tensor->type) { + case GGML_TYPE_I8: { + auto* data = reinterpret_cast(tensor->data); + const int8_t v = static_cast(value); + for (int64_t i = 0; i < nelements; ++i) { + data[i] = v; + } + } break; + case GGML_TYPE_I16: { + auto* data = reinterpret_cast(tensor->data); + const int16_t v = static_cast(value); + for (int64_t i = 0; i < nelements; ++i) { + data[i] = v; + } + } break; + case GGML_TYPE_I32: { + auto* data = reinterpret_cast(tensor->data); + const int32_t v = static_cast(value); + for (int64_t i = 0; i < nelements; ++i) { + data[i] = v; + } + } break; + case GGML_TYPE_F16: { + auto* data = reinterpret_cast(tensor->data); + const ggml_fp16_t v = ggml_fp32_to_fp16(value); + for (int64_t i = 0; i < nelements; ++i) { + data[i] = v; + } + } break; + case GGML_TYPE_BF16: { + auto* data = reinterpret_cast(tensor->data); + const ggml_bf16_t v = ggml_fp32_to_bf16(value); + for (int64_t i = 0; i < nelements; ++i) { + data[i] = v; + } + } break; + case GGML_TYPE_F32: { + auto* data = reinterpret_cast(tensor->data); + for (int64_t i = 0; i < nelements; ++i) { + data[i] = value; + } + } break; + default: + GGML_ABORT("fatal error"); + } + + return tensor; + } + + const int64_t nelements = ggml_nelements(tensor); + for (int64_t i = 0; i < nelements; ++i) { + ggml_set_f32_1d(tensor, static_cast(i), value); + } + + return tensor; +} + +#endif diff --git a/otherarch/sdcpp/lora.hpp b/otherarch/sdcpp/lora.hpp index f8014809e1b..0cd627a3a1c 100644 --- a/otherarch/sdcpp/lora.hpp +++ b/otherarch/sdcpp/lora.hpp @@ -129,7 +129,7 @@ struct LoraModel : public GGMLRunner { } } - ggml_tensor* get_lora_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) { + ggml_tensor* get_lora_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_backend_t backend) { ggml_tensor* updown = nullptr; int index = 0; while (true) { @@ -152,17 +152,17 @@ struct LoraModel : public GGMLRunner { auto iter = lora_tensors.find(lora_up_name); if (iter != lora_tensors.end()) { - lora_up = ggml_ext_cast_f32(ctx, iter->second); + lora_up = ggml_ext_cast_f32(ctx, backend, iter->second); } iter = lora_tensors.find(lora_mid_name); if (iter != lora_tensors.end()) { - lora_mid = ggml_ext_cast_f32(ctx, iter->second); + lora_mid = ggml_ext_cast_f32(ctx, backend, iter->second); } iter = lora_tensors.find(lora_down_name); if (iter != lora_tensors.end()) { - lora_down = ggml_ext_cast_f32(ctx, iter->second); + lora_down = ggml_ext_cast_f32(ctx, backend, iter->second); } if (lora_up == nullptr || lora_down == nullptr) { @@ -208,7 +208,7 @@ struct LoraModel : public GGMLRunner { return updown; } - ggml_tensor* get_raw_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) { + ggml_tensor* get_raw_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_backend_t backend) { ggml_tensor* updown = nullptr; int index = 0; while (true) { @@ -225,7 +225,7 @@ struct LoraModel : public GGMLRunner { auto iter = lora_tensors.find(diff_name); if (iter != lora_tensors.end()) { - curr_updown = ggml_ext_cast_f32(ctx, iter->second); + curr_updown = ggml_ext_cast_f32(ctx, backend, iter->second); } else { break; } @@ -248,7 +248,7 @@ struct LoraModel : public GGMLRunner { return updown; } - ggml_tensor* get_loha_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) { + ggml_tensor* get_loha_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_backend_t backend) { ggml_tensor* updown = nullptr; int index = 0; while (true) { @@ -276,33 +276,33 @@ struct LoraModel : public GGMLRunner { auto iter = lora_tensors.find(hada_1_down_name); if (iter != lora_tensors.end()) { - hada_1_down = ggml_ext_cast_f32(ctx, iter->second); + hada_1_down = ggml_ext_cast_f32(ctx, backend, iter->second); } iter = lora_tensors.find(hada_1_up_name); if (iter != lora_tensors.end()) { - hada_1_up = ggml_ext_cast_f32(ctx, iter->second); + hada_1_up = ggml_ext_cast_f32(ctx, backend, iter->second); } iter = lora_tensors.find(hada_1_mid_name); if (iter != lora_tensors.end()) { - hada_1_mid = ggml_ext_cast_f32(ctx, iter->second); + hada_1_mid = ggml_ext_cast_f32(ctx, backend, iter->second); hada_1_up = ggml_cont(ctx, ggml_transpose(ctx, hada_1_up)); } iter = lora_tensors.find(hada_2_down_name); if (iter != lora_tensors.end()) { - hada_2_down = ggml_ext_cast_f32(ctx, iter->second); + hada_2_down = ggml_ext_cast_f32(ctx, backend, iter->second); } iter = lora_tensors.find(hada_2_up_name); if (iter != lora_tensors.end()) { - hada_2_up = ggml_ext_cast_f32(ctx, iter->second); + hada_2_up = ggml_ext_cast_f32(ctx, backend, iter->second); } iter = lora_tensors.find(hada_2_mid_name); if (iter != lora_tensors.end()) { - hada_2_mid = ggml_ext_cast_f32(ctx, iter->second); + hada_2_mid = ggml_ext_cast_f32(ctx, backend, iter->second); hada_2_up = ggml_cont(ctx, ggml_transpose(ctx, hada_2_up)); } @@ -351,7 +351,7 @@ struct LoraModel : public GGMLRunner { return updown; } - ggml_tensor* get_lokr_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) { + ggml_tensor* get_lokr_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_backend_t backend) { ggml_tensor* updown = nullptr; int index = 0; while (true) { @@ -378,24 +378,24 @@ struct LoraModel : public GGMLRunner { auto iter = lora_tensors.find(lokr_w1_name); if (iter != lora_tensors.end()) { - lokr_w1 = ggml_ext_cast_f32(ctx, iter->second); + lokr_w1 = ggml_ext_cast_f32(ctx, backend, iter->second); } iter = lora_tensors.find(lokr_w2_name); if (iter != lora_tensors.end()) { - lokr_w2 = ggml_ext_cast_f32(ctx, iter->second); + lokr_w2 = ggml_ext_cast_f32(ctx, backend, iter->second); } int64_t rank = 1; if (lokr_w1 == nullptr) { iter = lora_tensors.find(lokr_w1_a_name); if (iter != lora_tensors.end()) { - lokr_w1_a = ggml_ext_cast_f32(ctx, iter->second); + lokr_w1_a = ggml_ext_cast_f32(ctx, backend, iter->second); } iter = lora_tensors.find(lokr_w1_b_name); if (iter != lora_tensors.end()) { - lokr_w1_b = ggml_ext_cast_f32(ctx, iter->second); + lokr_w1_b = ggml_ext_cast_f32(ctx, backend, iter->second); } if (lokr_w1_a == nullptr || lokr_w1_b == nullptr) { @@ -410,12 +410,12 @@ struct LoraModel : public GGMLRunner { if (lokr_w2 == nullptr) { iter = lora_tensors.find(lokr_w2_a_name); if (iter != lora_tensors.end()) { - lokr_w2_a = ggml_ext_cast_f32(ctx, iter->second); + lokr_w2_a = ggml_ext_cast_f32(ctx, backend, iter->second); } iter = lora_tensors.find(lokr_w2_b_name); if (iter != lora_tensors.end()) { - lokr_w2_b = ggml_ext_cast_f32(ctx, iter->second); + lokr_w2_b = ggml_ext_cast_f32(ctx, backend, iter->second); } if (lokr_w2_a == nullptr || lokr_w2_b == nullptr) { @@ -468,23 +468,23 @@ struct LoraModel : public GGMLRunner { return updown; } - ggml_tensor* get_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_tensor* model_tensor, bool with_lora_and_lokr = true) { + ggml_tensor* get_weight_diff(const std::string& model_tensor_name, ggml_backend_t backend, ggml_context* ctx, ggml_tensor* model_tensor, bool with_lora_and_lokr = true) { // lora ggml_tensor* diff = nullptr; if (with_lora_and_lokr) { - diff = get_lora_weight_diff(model_tensor_name, ctx); + diff = get_lora_weight_diff(model_tensor_name, ctx, backend); } // diff if (diff == nullptr) { - diff = get_raw_weight_diff(model_tensor_name, ctx); + diff = get_raw_weight_diff(model_tensor_name, ctx, backend); } // loha if (diff == nullptr) { - diff = get_loha_weight_diff(model_tensor_name, ctx); + diff = get_loha_weight_diff(model_tensor_name, ctx, backend); } // lokr if (diff == nullptr && with_lora_and_lokr) { - diff = get_lokr_weight_diff(model_tensor_name, ctx); + diff = get_lokr_weight_diff(model_tensor_name, ctx, backend); } if (diff != nullptr) { if (ggml_nelements(diff) < ggml_nelements(model_tensor)) { @@ -502,6 +502,7 @@ struct LoraModel : public GGMLRunner { } ggml_tensor* get_out_diff(ggml_context* ctx, + ggml_backend_t backend, ggml_tensor* x, WeightAdapter::ForwardParams forward_params, const std::string& model_tensor_name) { @@ -590,7 +591,7 @@ struct LoraModel : public GGMLRunner { } scale_value *= multiplier; - auto curr_out_diff = ggml_ext_lokr_forward(ctx, x, lokr_w1, lokr_w1_a, lokr_w1_b, lokr_w2, lokr_w2_a, lokr_w2_b, is_conv2d, forward_params.conv2d, scale_value); + auto curr_out_diff = ggml_ext_lokr_forward(ctx, backend, x, lokr_w1, lokr_w1_a, lokr_w1_b, lokr_w2, lokr_w2_a, lokr_w2_b, is_conv2d, forward_params.conv2d, scale_value); if (out_diff == nullptr) { out_diff = curr_out_diff; } else { @@ -761,7 +762,7 @@ struct LoraModel : public GGMLRunner { ggml_tensor* model_tensor = it.second; // lora - ggml_tensor* diff = get_weight_diff(model_tensor_name, compute_ctx, model_tensor); + ggml_tensor* diff = get_weight_diff(model_tensor_name, runtime_backend, compute_ctx, model_tensor); if (diff == nullptr) { continue; } @@ -774,7 +775,7 @@ struct LoraModel : public GGMLRunner { ggml_tensor* final_tensor; if (model_tensor->type != GGML_TYPE_F32 && model_tensor->type != GGML_TYPE_F16) { - final_tensor = ggml_ext_cast_f32(compute_ctx, model_tensor); + final_tensor = ggml_ext_cast_f32(compute_ctx, runtime_backend, model_tensor); final_tensor = ggml_add_inplace(compute_ctx, final_tensor, diff); final_tensor = ggml_cpy(compute_ctx, final_tensor, model_tensor); } else { @@ -841,34 +842,35 @@ struct MultiLoraAdapter : public WeightAdapter { : lora_models(lora_models) { } - ggml_tensor* patch_weight(ggml_context* ctx, ggml_tensor* weight, const std::string& weight_name, bool with_lora_and_lokr) { + ggml_tensor* patch_weight(ggml_context* ctx, ggml_backend_t backend, ggml_tensor* weight, const std::string& weight_name, bool with_lora_and_lokr) { for (auto& lora_model : lora_models) { - ggml_tensor* diff = lora_model->get_weight_diff(weight_name, ctx, weight, with_lora_and_lokr); + ggml_tensor* diff = lora_model->get_weight_diff(weight_name, backend, ctx, weight, with_lora_and_lokr); if (diff == nullptr) { continue; } if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) { - weight = ggml_ext_cast_f32(ctx, weight); + weight = ggml_ext_cast_f32(ctx, backend, weight); } weight = ggml_add(ctx, weight, diff); } return weight; } - ggml_tensor* patch_weight(ggml_context* ctx, ggml_tensor* weight, const std::string& weight_name) override { - return patch_weight(ctx, weight, weight_name, true); + ggml_tensor* patch_weight(ggml_context* ctx, ggml_backend_t backend, ggml_tensor* weight, const std::string& weight_name) override { + return patch_weight(ctx, backend, weight, weight_name, true); } ggml_tensor* forward_with_lora(ggml_context* ctx, + ggml_backend_t backend, ggml_tensor* x, ggml_tensor* w, ggml_tensor* b, const std::string& prefix, WeightAdapter::ForwardParams forward_params) override { - w = patch_weight(ctx, w, prefix + "weight", false); + w = patch_weight(ctx, backend, w, prefix + "weight", false); if (b) { - b = patch_weight(ctx, b, prefix + "bias", false); + b = patch_weight(ctx, backend, b, prefix + "bias", false); } ggml_tensor* out; if (forward_params.op_type == ForwardParams::op_type_t::OP_LINEAR) { @@ -890,7 +892,7 @@ struct MultiLoraAdapter : public WeightAdapter { forward_params.conv2d.scale); } for (auto& lora_model : lora_models) { - ggml_tensor* out_diff = lora_model->get_out_diff(ctx, x, forward_params, prefix + "weight"); + ggml_tensor* out_diff = lora_model->get_out_diff(ctx, backend, x, forward_params, prefix + "weight"); if (out_diff == nullptr) { continue; } diff --git a/otherarch/sdcpp/model.cpp b/otherarch/sdcpp/model.cpp index d1bb0c90d08..4c05f756cf3 100644 --- a/otherarch/sdcpp/model.cpp +++ b/otherarch/sdcpp/model.cpp @@ -23,24 +23,11 @@ #include "ggml-alloc.h" #include "ggml-backend.h" -#include "ggml-cpu.h" #include "ggml.h" +#include "ggml_extend_backend.hpp" #include "zip.h" #include "name_conversion.h" -#include "stable-diffusion.h" - -#ifdef SD_USE_METAL -#include "ggml-metal.h" -#endif - -#ifdef SD_USE_VULKAN -#include "ggml-vulkan.h" -#endif - -#ifdef SD_USE_OPENCL -#include "ggml-opencl.h" -#endif /*================================================= Preprocess ==================================================*/ diff --git a/otherarch/sdcpp/preprocessing.hpp b/otherarch/sdcpp/preprocessing.hpp index 7c83a289df3..57ab0cec7fd 100644 --- a/otherarch/sdcpp/preprocessing.hpp +++ b/otherarch/sdcpp/preprocessing.hpp @@ -24,6 +24,75 @@ static inline void preprocessing_set_4d(sd::Tensor& tensor, float value, tensor.values()[static_cast(preprocessing_offset_4d(tensor, i0, i1, i2, i3))] = value; } +static inline uint8_t preprocessing_float_to_u8(float value) { + if (value <= 0.0f) { + return 0; + } + if (value >= 1.0f) { + return 255; + } + return static_cast(value * 255.0f + 0.5f); +} + +static inline void preprocessing_tensor_frame_to_sd_image(const sd::Tensor& tensor, int frame_index, uint8_t* image_data) { + const auto& shape = tensor.shape(); + GGML_ASSERT(shape.size() == 4 || shape.size() == 5); + GGML_ASSERT(image_data != nullptr); + + const int width = static_cast(shape[0]); + const int height = static_cast(shape[1]); + const int channel = static_cast(shape[shape.size() == 5 ? 3 : 2]); + const size_t pixels = static_cast(width) * static_cast(height); + const float* src = tensor.data(); + + if (shape.size() == 4) { + GGML_ASSERT(frame_index >= 0 && frame_index < shape[3]); + const size_t frame_stride = pixels * static_cast(channel); + const float* frame_ptr = src + static_cast(frame_index) * frame_stride; + if (channel == 3) { + const float* c0 = frame_ptr; + const float* c1 = frame_ptr + pixels; + const float* c2 = frame_ptr + pixels * 2; + for (size_t i = 0; i < pixels; ++i) { + image_data[i * 3 + 0] = preprocessing_float_to_u8(c0[i]); + image_data[i * 3 + 1] = preprocessing_float_to_u8(c1[i]); + image_data[i * 3 + 2] = preprocessing_float_to_u8(c2[i]); + } + return; + } + + for (size_t i = 0; i < pixels; ++i) { + for (int c = 0; c < channel; ++c) { + image_data[i * static_cast(channel) + static_cast(c)] = + preprocessing_float_to_u8(frame_ptr[i + pixels * static_cast(c)]); + } + } + return; + } + + GGML_ASSERT(frame_index >= 0 && frame_index < shape[2]); + const size_t channel_stride = pixels * static_cast(shape[2]); + const float* frame_ptr = src + static_cast(frame_index) * pixels; + if (channel == 3) { + const float* c0 = frame_ptr; + const float* c1 = frame_ptr + channel_stride; + const float* c2 = frame_ptr + channel_stride * 2; + for (size_t i = 0; i < pixels; ++i) { + image_data[i * 3 + 0] = preprocessing_float_to_u8(c0[i]); + image_data[i * 3 + 1] = preprocessing_float_to_u8(c1[i]); + image_data[i * 3 + 2] = preprocessing_float_to_u8(c2[i]); + } + return; + } + + for (size_t i = 0; i < pixels; ++i) { + for (int c = 0; c < channel; ++c) { + image_data[i * static_cast(channel) + static_cast(c)] = + preprocessing_float_to_u8(frame_ptr[i + channel_stride * static_cast(c)]); + } + } +} + static inline sd::Tensor sd_image_to_preprocessing_tensor(sd_image_t image) { sd::Tensor tensor({static_cast(image.width), static_cast(image.height), static_cast(image.channel), 1}); for (uint32_t y = 0; y < image.height; ++y) { @@ -39,20 +108,7 @@ static inline sd::Tensor sd_image_to_preprocessing_tensor(sd_image_t imag static inline void preprocessing_tensor_to_sd_image(const sd::Tensor& tensor, uint8_t* image_data) { GGML_ASSERT(tensor.dim() == 4); GGML_ASSERT(tensor.shape()[3] == 1); - GGML_ASSERT(image_data != nullptr); - - int width = static_cast(tensor.shape()[0]); - int height = static_cast(tensor.shape()[1]); - int channel = static_cast(tensor.shape()[2]); - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - for (int c = 0; c < channel; ++c) { - float value = preprocessing_get_4d(tensor, x, y, c, 0); - value = std::min(1.0f, std::max(0.0f, value)); - image_data[(y * width + x) * channel + c] = static_cast(std::round(value * 255.0f)); - } - } - } + preprocessing_tensor_frame_to_sd_image(tensor, 0, image_data); } static inline sd::Tensor gaussian_kernel_tensor(int kernel_size) { diff --git a/otherarch/sdcpp/qwen_image.hpp b/otherarch/sdcpp/qwen_image.hpp index 83c8cec666c..1cbeb71d538 100644 --- a/otherarch/sdcpp/qwen_image.hpp +++ b/otherarch/sdcpp/qwen_image.hpp @@ -95,9 +95,7 @@ namespace Qwen { float scale = 1.f / 32.f; bool force_prec_f32 = false; -#ifdef SD_USE_VULKAN - force_prec_f32 = true; -#endif + // The purpose of the scale here is to prevent NaN issues in certain situations. // For example when using CUDA but the weights are k-quants (not all prompts). blocks["to_out.0"] = std::shared_ptr(new Linear(inner_dim, out_dim, out_bias, false, force_prec_f32, scale)); @@ -124,6 +122,10 @@ namespace Qwen { auto to_v = std::dynamic_pointer_cast(blocks["to_v"]); auto to_out_0 = std::dynamic_pointer_cast(blocks["to_out.0"]); + if (sd_backend_is(ctx->backend, "Vulkan")) { + to_out_0->set_force_prec_f32(true); + } + auto norm_added_q = std::dynamic_pointer_cast(blocks["norm_added_q"]); auto norm_added_k = std::dynamic_pointer_cast(blocks["norm_added_k"]); diff --git a/otherarch/sdcpp/sdtype_adapter.cpp b/otherarch/sdcpp/sdtype_adapter.cpp index 41aaa82f29a..d7c66ab7dcb 100644 --- a/otherarch/sdcpp/sdtype_adapter.cpp +++ b/otherarch/sdcpp/sdtype_adapter.cpp @@ -54,7 +54,10 @@ using namespace torch_zip; #include "tokenizers/tokenizer.cpp" #include "tokenizers/tokenize_util.cpp" -#include "otherarch/utils.h" +// FIXME: llama.h errors out if included (through utils.h) +std::vector kcpp_base64_decode(const std::string & encoded_string); +std::string kcpp_base64_encode(const unsigned char* data, unsigned int data_length); +std::string get_timestamp_str(); // #include "preprocessing.hpp" #include "stable-diffusion.h" @@ -316,6 +319,8 @@ std::string load_umt5_tokenizer_json() return umt5str; } +void kcpp_sd_set_main_gpu(int value); + bool sdtype_load_model(const sd_load_model_inputs inputs) { sd_is_quiet = inputs.quiet; set_sd_quiet(sd_is_quiet); @@ -339,17 +344,8 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) { cfg_square_limit = inputs.img_soft_limit; printf("\nImageGen Init - Load Model: %s\n",inputs.model_filename); - { - //kcpp allow gpu id override - std::string sdmaingpu = std::to_string(inputs.kcpp_main_gpu); - const char* existingenv = getenv("SD_VK_DEVICE"); - int kcpp_parseinfo_maindevice = inputs.kcpp_main_gpu<=0?0:inputs.kcpp_main_gpu; - if(kcpp_parseinfo_maindevice>0 && !existingenv && sdmaingpu!="") - { - sdmaingpuenv = "SD_VK_DEVICE="+sdmaingpu; - putenv((char*)sdmaingpuenv.c_str()); - } - } + //kcpp allow gpu id override + kcpp_sd_set_main_gpu(inputs.kcpp_main_gpu); int lora_apply_mode = LORA_APPLY_AT_RUNTIME; bool lora_dynamic = false; @@ -1631,6 +1627,28 @@ sd_info_outputs sdtype_get_info() } j["available_samplers"] = available_samplers; + auto get_dev_type_name = [](auto dev_type) -> std::string { + if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) + return "CPU"; + else if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU) + return "GPU"; + else if (dev_type == GGML_BACKEND_DEVICE_TYPE_IGPU) + return "IGPU"; + return "TYPE_" + std::to_string(dev_type); + }; + + auto devices = json::array(); + size_t dev_count = ggml_backend_dev_count(); + for (size_t i = 0; i < dev_count; ++i) { + auto dev = ggml_backend_dev_get(i); + json jdev; + jdev["name"] = ggml_backend_dev_name(dev); + jdev["description"] = ggml_backend_dev_description(dev); + jdev["type"] = get_dev_type_name(ggml_backend_dev_type(dev)); + devices.push_back(jdev); + } + j["devices"] = devices; + static std::string recent_info = j.dump(); sd_info_outputs output; output.status = 0; diff --git a/otherarch/sdcpp/stable-diffusion.cpp b/otherarch/sdcpp/stable-diffusion.cpp index 4b619bc5d86..32a63971fbc 100644 --- a/otherarch/sdcpp/stable-diffusion.cpp +++ b/otherarch/sdcpp/stable-diffusion.cpp @@ -187,82 +187,7 @@ class StableDiffusionGGML { } void init_backend() { -#ifdef SD_USE_CUDA - LOG_DEBUG("Using CUDA backend"); - size_t device = 0; //kcpp: ported device selection from vulkan - const int device_count = ggml_backend_cuda_get_device_count(); - if (device_count) { - const char* SD_VK_DEVICE = getenv("SD_VK_DEVICE"); - if (SD_VK_DEVICE != nullptr) { - std::string sd_vk_device_str = SD_VK_DEVICE; - try { - device = std::stoull(sd_vk_device_str); - } catch (const std::invalid_argument&) { - LOG_WARN("SD_VK_DEVICE environment variable is not a valid integer (%s). Falling back to device 0.", SD_VK_DEVICE); - device = 0; - } catch (const std::out_of_range&) { - LOG_WARN("SD_VK_DEVICE environment variable value is out of range for `unsigned long long` type (%s). Falling back to device 0.", SD_VK_DEVICE); - device = 0; - } - if (device >= device_count) { - LOG_WARN("Cannot find targeted cuda device (%llu). Falling back to device 0.", device); - device = 0; - } - } - LOG_INFO("CUDA: Using device %llu", device); - } - backend = ggml_backend_cuda_init(device); -#endif -#ifdef SD_USE_METAL - LOG_DEBUG("Using Metal backend"); - backend = ggml_backend_metal_init(); -#endif -#ifdef SD_USE_VULKAN - LOG_DEBUG("Using Vulkan backend"); - size_t device = 0; - const int device_count = ggml_backend_vk_get_device_count(); - if (device_count) { - const char* SD_VK_DEVICE = getenv("SD_VK_DEVICE"); - if (SD_VK_DEVICE != nullptr) { - std::string sd_vk_device_str = SD_VK_DEVICE; - try { - device = std::stoull(sd_vk_device_str); - } catch (const std::invalid_argument&) { - LOG_WARN("SD_VK_DEVICE environment variable is not a valid integer (%s). Falling back to device 0.", SD_VK_DEVICE); - device = 0; - } catch (const std::out_of_range&) { - LOG_WARN("SD_VK_DEVICE environment variable value is out of range for `unsigned long long` type (%s). Falling back to device 0.", SD_VK_DEVICE); - device = 0; - } - if (device >= device_count) { - LOG_WARN("Cannot find targeted vulkan device (%zu). Falling back to device 0.", device); - device = 0; - } - } - LOG_INFO("Vulkan: Using device %zu", device); - backend = ggml_backend_vk_init(device); - } - if (!backend) { - LOG_WARN("Failed to initialize Vulkan backend"); - } -#endif -#ifdef SD_USE_OPENCL - LOG_DEBUG("Using OpenCL backend"); - // ggml_log_set(ggml_log_callback_default, nullptr); // Optional ggml logs - backend = ggml_backend_opencl_init(); - if (!backend) { - LOG_WARN("Failed to initialize OpenCL backend"); - } -#endif -#ifdef SD_USE_SYCL - LOG_DEBUG("Using SYCL backend"); - backend = ggml_backend_sycl_init(0); -#endif - - if (!backend) { - LOG_DEBUG("Using CPU backend"); - backend = ggml_backend_cpu_init(); - } + backend = sd_get_default_backend(); } std::shared_ptr get_rng(rng_type_t rng_type) { diff --git a/otherarch/sdcpp/upscaler.cpp b/otherarch/sdcpp/upscaler.cpp index ed7bb89a0cf..80e68c947b4 100644 --- a/otherarch/sdcpp/upscaler.cpp +++ b/otherarch/sdcpp/upscaler.cpp @@ -16,26 +16,9 @@ bool UpscalerGGML::load_from_file(const std::string& esrgan_path, bool offload_params_to_cpu, int n_threads) { ggml_log_set(ggml_log_callback_default, nullptr); -#ifdef SD_USE_CUDA - LOG_DEBUG("Using CUDA backend"); - backend = ggml_backend_cuda_init(0); -#endif -#ifdef SD_USE_METAL - LOG_DEBUG("Using Metal backend"); - backend = ggml_backend_metal_init(); -#endif -#ifdef SD_USE_VULKAN - LOG_DEBUG("Using Vulkan backend"); - backend = ggml_backend_vk_init(0); -#endif -#ifdef SD_USE_OPENCL - LOG_DEBUG("Using OpenCL backend"); - backend = ggml_backend_opencl_init(); -#endif -#ifdef SD_USE_SYCL - LOG_DEBUG("Using SYCL backend"); - backend = ggml_backend_sycl_init(0); -#endif + + backend = sd_get_default_backend(); + ModelLoader model_loader; if (!model_loader.init_from_file_and_convert_name(esrgan_path)) { LOG_ERROR("init model loader from file failed: '%s'", esrgan_path.c_str()); diff --git a/otherarch/sdcpp/util.cpp b/otherarch/sdcpp/util.cpp index 2daa01c093b..cb9931b74d3 100644 --- a/otherarch/sdcpp/util.cpp +++ b/otherarch/sdcpp/util.cpp @@ -27,8 +27,9 @@ #include #endif -#include "ggml-cpu.h" +#include "ggml-backend.h" #include "ggml.h" +#include "ggml_extend_backend.hpp" #include "stable-diffusion.h" bool ends_with(const std::string& str, const std::string& ending) { @@ -540,26 +541,6 @@ sd_progress_cb_t sd_get_progress_callback() { void* sd_get_progress_callback_data() { return sd_progress_cb_data; } -const char* sd_get_system_info() { - static char buffer[1024]; - std::stringstream ss; - ss << "System Info: \n"; - ss << " SSE3 = " << ggml_cpu_has_sse3() << " | "; - ss << " AVX = " << ggml_cpu_has_avx() << " | "; - ss << " AVX2 = " << ggml_cpu_has_avx2() << " | "; - ss << " AVX512 = " << ggml_cpu_has_avx512() << " | "; - ss << " AVX512_VBMI = " << ggml_cpu_has_avx512_vbmi() << " | "; - ss << " AVX512_VNNI = " << ggml_cpu_has_avx512_vnni() << " | "; - ss << " FMA = " << ggml_cpu_has_fma() << " | "; - ss << " NEON = " << ggml_cpu_has_neon() << " | "; - ss << " ARM_FMA = " << ggml_cpu_has_arm_fma() << " | "; - ss << " F16C = " << ggml_cpu_has_f16c() << " | "; - ss << " FP16_VA = " << ggml_cpu_has_fp16_va() << " | "; - ss << " WASM_SIMD = " << ggml_cpu_has_wasm_simd() << " | "; - ss << " VSX = " << ggml_cpu_has_vsx() << " | "; - snprintf(buffer, sizeof(buffer), "%s", ss.str().c_str()); - return buffer; -} sd_image_t tensor_to_sd_image(const sd::Tensor& tensor, int frame_index) { const auto& shape = tensor.shape(); @@ -569,17 +550,7 @@ sd_image_t tensor_to_sd_image(const sd::Tensor& tensor, int frame_index) int channel = static_cast(shape[shape.size() == 5 ? 3 : 2]); uint8_t* data = (uint8_t*)malloc(static_cast(width * height * channel)); GGML_ASSERT(data != nullptr); - - for (int iw = 0; iw < width; ++iw) { - for (int ih = 0; ih < height; ++ih) { - for (int ic = 0; ic < channel; ++ic) { - float value = shape.size() == 5 ? tensor.index(iw, ih, frame_index, ic, 0) - : tensor.index(iw, ih, ic, frame_index); - value = std::clamp(value, 0.0f, 1.0f); - data[(ih * width + iw) * channel + ic] = static_cast(std::round(value * 255.0f)); - } - } - } + preprocessing_tensor_frame_to_sd_image(tensor, frame_index, data); return { static_cast(width), static_cast(height), @@ -763,3 +734,139 @@ std::vector> parse_prompt_attention(const std::str return res; } + +// test if the backend is a specific one, e.g. "CUDA", "ROCm", "Vulkan" etc. +bool sd_backend_is(ggml_backend_t backend, const std::string& name) { + if (!backend) { + return false; + } + ggml_backend_dev_t dev = ggml_backend_get_device(backend); + if (!dev) + return false; + std::string dev_name = ggml_backend_dev_name(dev); + return dev_name.find(name) != std::string::npos; +} + +static int kcpp_main_gpu = -1; +void kcpp_sd_set_main_gpu(int value) { + ggml_backend_load_all_once(); + if (value >= 0) { + size_t dev_count = ggml_backend_dev_count(); + size_t dev_index = static_cast(value); + if (dev_index >= dev_count) { + LOG_WARN("device %d not found, falling back to default", value); + value = -1; + } + } else if (value <= -2) { + value = -2; + } + kcpp_main_gpu = value; +} +static ggml_backend_t kcpp_get_main_gpu() { + ggml_backend_t backend = nullptr; + if (kcpp_main_gpu != -1) { + std::string dev_name; + if (kcpp_main_gpu <= -2) { + dev_name = "CPU"; + } else { + auto dev = ggml_backend_dev_get(static_cast(kcpp_main_gpu)); + dev_name = ggml_backend_dev_name(dev); + } + backend = init_named_backend(dev_name); + if (backend) { + LOG_INFO("Setting %s as main device (#%d)", dev_name.c_str(), kcpp_main_gpu); + } else { + LOG_WARN("Couldn't initialize device #%d; falling back to the default device", kcpp_main_gpu); + } + } + return backend; +} + +ggml_backend_t sd_get_default_backend() { + ggml_backend_load_all_once(); + static std::once_flag once; + std::call_once(once, []() { + size_t dev_count = ggml_backend_dev_count(); + if (dev_count == 0) { + LOG_ERROR("No devices found!"); + } else { + LOG_DEBUG("Found %zu backend devices:", dev_count); + for (size_t i = 0; i < dev_count; ++i) { + auto dev = ggml_backend_dev_get(i); + LOG_DEBUG("#%zu: %s", i, ggml_backend_dev_name(dev)); + } + } + }); + ggml_backend_t backend = nullptr; + const char* SD_VK_DEVICE = getenv("SD_VK_DEVICE"); + if (SD_VK_DEVICE != nullptr) { + std::string sd_vk_device_str = SD_VK_DEVICE; + try { + unsigned long long device = std::stoull(sd_vk_device_str); + std::string vk_device_name = "Vulkan" + std::to_string(device); + if (backend_name_exists(vk_device_name)) { + LOG_INFO("Selecting %s as main device by env var SD_VK_DEVICE", vk_device_name.c_str()); + backend = init_named_backend(vk_device_name); + if (!backend) { + LOG_WARN("Device %s requested by SD_VK_DEVICE failed to init. Falling back to the default device.", vk_device_name.c_str()); + } + } else { + LOG_WARN("Device %s requested by SD_VK_DEVICE was not found. Falling back to the default device.", vk_device_name.c_str()); + } + } catch (const std::invalid_argument&) { + LOG_WARN("SD_VK_DEVICE environment variable is not a valid integer (%s). Falling back to the default device.", SD_VK_DEVICE); + } catch (const std::out_of_range&) { + LOG_WARN("SD_VK_DEVICE environment variable value is out of range for `unsigned long long` type (%s). Falling back to the default device.", SD_VK_DEVICE); + } + } + + if (backend == nullptr) { // kcpp + backend = kcpp_get_main_gpu(); + } // kcpp + + if (!backend) { + std::string dev_name = get_default_backend_name(); + backend = init_named_backend(dev_name); + if (!backend && !dev_name.empty()) { + LOG_WARN("device %s failed to init", dev_name.c_str()); + } + } + + if (!backend) { + LOG_WARN("loading CPU backend"); + backend = ggml_backend_cpu_init(); + } + + if (ggml_backend_is_cpu(backend)) { + LOG_DEBUG("Using CPU backend"); + } + + return backend; +} + +// namespace is needed to avoid conflicts with ggml_backend_extend.hpp +namespace ggml_cpu { +#include "ggml-cpu.h" +} + +const char* sd_get_system_info() { + using namespace ggml_cpu; + static char buffer[1024]; + std::stringstream ss; + ss << "System Info: \n"; + ss << " SSE3 = " << ggml_cpu_has_sse3() << " | "; + ss << " AVX = " << ggml_cpu_has_avx() << " | "; + ss << " AVX2 = " << ggml_cpu_has_avx2() << " | "; + ss << " AVX512 = " << ggml_cpu_has_avx512() << " | "; + ss << " AVX512_VBMI = " << ggml_cpu_has_avx512_vbmi() << " | "; + ss << " AVX512_VNNI = " << ggml_cpu_has_avx512_vnni() << " | "; + ss << " FMA = " << ggml_cpu_has_fma() << " | "; + ss << " NEON = " << ggml_cpu_has_neon() << " | "; + ss << " ARM_FMA = " << ggml_cpu_has_arm_fma() << " | "; + ss << " F16C = " << ggml_cpu_has_f16c() << " | "; + ss << " FP16_VA = " << ggml_cpu_has_fp16_va() << " | "; + ss << " WASM_SIMD = " << ggml_cpu_has_wasm_simd() << " | "; + ss << " VSX = " << ggml_cpu_has_vsx() << " | "; + snprintf(buffer, sizeof(buffer), "%s", ss.str().c_str()); + return buffer; +} diff --git a/otherarch/sdcpp/util.h b/otherarch/sdcpp/util.h index 112f70a3fb7..6f17b0719da 100644 --- a/otherarch/sdcpp/util.h +++ b/otherarch/sdcpp/util.h @@ -6,6 +6,7 @@ #include #include +#include "ggml-backend.h" #include "stable-diffusion.h" #include "tensor.hpp" @@ -85,6 +86,10 @@ int sd_get_preview_interval(); bool sd_should_preview_denoised(); bool sd_should_preview_noisy(); +// test if the backend is a specific one, e.g. "CUDA", "ROCm", "Vulkan" etc. +bool sd_backend_is(ggml_backend_t backend, const std::string& name); +ggml_backend_t sd_get_default_backend(); + void log_message(const char* format, ...); void set_sd_log_level(int log); bool get_sd_log_level(); diff --git a/otherarch/sdcpp/z_image.hpp b/otherarch/sdcpp/z_image.hpp index 363ce5f4f0d..6bb44b791d2 100644 --- a/otherarch/sdcpp/z_image.hpp +++ b/otherarch/sdcpp/z_image.hpp @@ -31,10 +31,6 @@ namespace ZImage { : head_dim(head_dim), num_heads(num_heads), num_kv_heads(num_kv_heads), qk_norm(qk_norm) { blocks["qkv"] = std::make_shared(hidden_size, (num_heads + num_kv_heads * 2) * head_dim, false); float scale = 1.f; -#if GGML_USE_HIP - // Prevent NaN issues with certain ROCm setups - scale = 1.f / 16.f; -#endif blocks["out"] = std::make_shared(num_heads * head_dim, hidden_size, false, false, false, scale); if (qk_norm) { blocks["q_norm"] = std::make_shared(head_dim); @@ -52,6 +48,10 @@ namespace ZImage { auto qkv_proj = std::dynamic_pointer_cast(blocks["qkv"]); auto out_proj = std::dynamic_pointer_cast(blocks["out"]); + if (sd_backend_is(ctx->backend, "ROCm")) { + out_proj->set_scale(1.f / 16.f); + } + auto qkv = qkv_proj->forward(ctx, x); // [N, n_token, (num_heads + num_kv_heads*2)*head_dim] qkv = ggml_reshape_4d(ctx->ggml_ctx, qkv, head_dim, num_heads + num_kv_heads * 2, qkv->ne[1], qkv->ne[2]); // [N, n_token, num_heads + num_kv_heads*2, head_dim] @@ -115,9 +115,7 @@ namespace ZImage { bool force_prec_f32 = false; float scale = 1.f / 128.f; -#ifdef SD_USE_VULKAN - force_prec_f32 = true; -#endif + // The purpose of the scale here is to prevent NaN issues in certain situations. // For example, when using CUDA but the weights are k-quants. blocks["w2"] = std::make_shared(hidden_dim, dim, false, false, force_prec_f32, scale); @@ -129,6 +127,10 @@ namespace ZImage { auto w2 = std::dynamic_pointer_cast(blocks["w2"]); auto w3 = std::dynamic_pointer_cast(blocks["w3"]); + if (sd_backend_is(ctx->backend, "Vulkan")) { + w2->set_force_prec_f32(true); + } + auto x1 = w1->forward(ctx, x); auto x3 = w3->forward(ctx, x); x = ggml_swiglu_split(ctx->ggml_ctx, x1, x3);