diff --git a/.gitignore b/.gitignore
index 8ef96d1e..56a8913e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,19 +1,5 @@
-*.jl.*.cov
-*.jl.cov
-*.jl.mem
-*.msh
-*.pos
-/Manifest.toml
+Manifest.toml
 /docs/Manifest.toml
-/docs/build/
 /test/Manifest.toml
-.vscode
-/docs/src/pluto-examples/*.md
-/sandbox/*
-/talks/*
+/docs/build/
 /test/vtk_output/*
-
-.markdownlint.json
-Inti.code-workspace
-docs/src/examples/heat.gif
-docs/src/.DS_Store
diff --git a/Project.toml b/Project.toml
index 576bd7a4..6daf9c3d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -10,6 +10,7 @@ ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 HAdaptiveIntegration = "eaa5ad34-b243-48e9-b09c-54bc0655cecf"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LinearMaps = "7a12625a-238d-50fd-b39a-03d52299707e"
+Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
 NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
 OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
@@ -46,7 +47,9 @@ Bessels = "0.2"
 DataStructures = "0.18 - 0.19"
 ElementaryPDESolutions = "0.2 - 0.3"
 FMM2D = "0.2"
-FMM3D = "1"
+# FMM3D <2.1.0 has a different scaling convention for the kernels, not
+# including the 1/(4pi) prefactor
+FMM3D = "2.1.0"
 FMMLIB2D = "0.3"
 ForwardDiff = "0.10, 1"
 Gmsh = "0.3"
@@ -54,8 +57,9 @@ HAdaptiveIntegration = "0.2"
 HMatrices = "0.2"
 LinearAlgebra = "1"
 LinearMaps = "3"
+Logging = "1"
 Makie = "0.21 - 0.24"
-Meshes = "0.42 - 0.55"
+Meshes = "0.42 - 0.56"
 NearestNeighbors = "0.4"
 OrderedCollections = "1"
 Pkg = "1"
diff --git a/ext/IntiFMM2DExt.jl b/ext/IntiFMM2DExt.jl
index 64055645..4b60358b 100644
--- a/ext/IntiFMM2DExt.jl
+++ b/ext/IntiFMM2DExt.jl
@@ -3,6 +3,7 @@ module IntiFMM2DExt
 import Inti
 import FMM2D
 import LinearMaps
+using StaticArrays
 
 function __init__()
     return @info "Loading Inti.jl FMM2D extension"
@@ -146,6 +147,148 @@ function Inti._assemble_fmm2d(iop::Inti.IntegralOperator; rtol = sqrt(eps()))
                 return copyto!(y, sum(xnormals .* out.gradtarg; dims = 1) |> vec)
             end
         end
+    elseif K isa Inti.GradientSingleLayerKernel{<:SVector{2}, <:Inti.Laplace{2}}
+        charges = Vector{Float64}(undef, n)
+        return LinearMaps.LinearMap{SVector{2, Float64}}(m, n) do y, x
+            # multiply by weights and constant
+            @. charges = -1 / (2 * π) * weights * x
+            if same_surface
+                out =
+                    FMM2D.rfmm2d(; sources = sources, charges = charges, eps = rtol, pg = 2)
+                return copyto!(y, reinterpret(SVector{2, Float64}, vec(out.grad)))
+            else
+                out = FMM2D.rfmm2d(;
+                    sources = sources,
+                    charges = charges,
+                    targets = targets,
+                    eps = rtol,
+                    pgt = 2,
+                )
+                return copyto!(y, reinterpret(SVector{2, Float64}, vec(out.gradtarg)))
+            end
+        end
+    elseif K isa Inti.GradientDoubleLayerKernel{<:SVector{2}, <:Inti.Laplace{2}}
+        normals = Matrix{Float64}(undef, 2, n)
+        for j in 1:n
+            normals[:, j] = Inti.normal(iop.source[j])
+        end
+        dipvecs = similar(normals)
+        return LinearMaps.LinearMap{SVector{2, Float64}}(m, n) do y, x
+            # multiply by weights and constant
+            for j in 1:n
+                dipvecs[:, j] = -1 / (2 * π) * view(normals, :, j) * weights[j]
+            end
+            dipstr = Vector{Float64}(undef, n)
+            for j in 1:n
+                dipstr[j] = x[j]
+            end
+            if same_surface
+                out = FMM2D.rfmm2d(;
+                    sources = sources,
+                    dipstr = dipstr,
+                    dipvecs = dipvecs,
+                    eps = rtol,
+                    pg = 2,
+                )
+                return copyto!(y, reinterpret(SVector{2, Float64}, vec(out.grad)))
+            else
+                out = FMM2D.rfmm2d(;
+                    sources = sources,
+                    targets = targets,
+                    dipstr = dipstr,
+                    dipvecs = dipvecs,
+                    eps = rtol,
+                    pgt = 2,
+                )
+                return copyto!(y, reinterpret(SVector{2, Float64}, vec(out.gradtarg)))
+            end
+        end
+    elseif K isa Inti.SourceGradientSingleLayerKernel{<:Any, <:Inti.Laplace{2}}
+        # ∇yG(x,y)⋅g : dipoles with vector strengths g (scalar output). The W operator
+        # W[g] = -∫∇yG⋅g applies the leading minus when this map is assembled.
+        dipvecs = Matrix{Float64}(undef, 2, n)
+        dipstr = ones(Float64, n)
+        return LinearMaps.LinearMap{Float64}(m, n) do y, x
+            for j in 1:n
+                dipvecs[:, j] = -(1 / (2 * π)) * x[j] * weights[j]
+            end
+            if same_surface
+                out = FMM2D.rfmm2d(;
+                    sources = sources,
+                    dipstr = dipstr,
+                    dipvecs = dipvecs,
+                    eps = rtol,
+                    pg = 1,
+                )
+                return copyto!(y, out.pot)
+            else
+                out = FMM2D.rfmm2d(;
+                    sources = sources,
+                    targets = targets,
+                    dipstr = dipstr,
+                    dipvecs = dipvecs,
+                    eps = rtol,
+                    pgt = 1,
+                )
+                return copyto!(y, out.pottarg)
+            end
+        end
+    elseif K isa Inti.HessianKernel{<:Any, <:Inti.Laplace{2}}
+        # X forward (PV part) = +∫∇ₓ∇ₓG⋅g = -∇ₓ(∫∇yG⋅g). The bracket is the
+        # `SourceGradientSingleLayer` dipole field above; its target-gradient (`gradtarg`,
+        # pgt = 2) is ∫∇ₓ∇yG⋅g = -X_forward. Negating the dipole strengths relative to the
+        # W branch folds in the leading minus, so `gradtarg` is the `SVector` output.
+        if K.charge_dipole == :charge
+            charges = Vector{Float64}(undef, n)
+            return LinearMaps.LinearMap{SMatrix{2, 2, Float64, 4}}(m, n) do y, x
+                @. charges = -1 / (2 * π) * weights * x
+                if same_surface
+                    out = FMM2D.rfmm2d(; sources = sources, charges = charges, eps = rtol, pg = 3)
+                    H = out.hess        # (3, n): ∂xx, ∂xy, ∂yy
+                else
+                    out = FMM2D.rfmm2d(;
+                        sources = sources,
+                        charges = charges,
+                        targets = targets,
+                        eps = rtol,
+                        pgt = 3,
+                    )
+                    H = out.hesstarg
+                end
+                @inbounds for i in 1:m
+                    y[i] = SMatrix{2, 2, Float64, 4}(H[1, i], H[2, i], H[2, i], H[3, i])
+                end
+                return y
+            end
+        else
+            dipvecs = Matrix{Float64}(undef, 2, n)
+            dipstr = ones(Float64, n)
+            return LinearMaps.LinearMap{SVector{2, Float64}}(m, n) do y, x
+                for j in 1:n
+                    dipvecs[:, j] = (1 / (2 * π)) * x[j] * weights[j]
+                end
+                if same_surface
+                    out = FMM2D.rfmm2d(;
+                        sources = sources,
+                        dipstr = dipstr,
+                        dipvecs = dipvecs,
+                        eps = rtol,
+                        pg = 2,
+                    )
+                    return copyto!(y, reinterpret(SVector{2, Float64}, vec(out.grad)))
+                else
+                    out = FMM2D.rfmm2d(;
+                        sources = sources,
+                        targets = targets,
+                        dipstr = dipstr,
+                        dipvecs = dipvecs,
+                        eps = rtol,
+                        pgt = 2,
+                    )
+                    return copyto!(y, reinterpret(SVector{2, Float64}, vec(out.gradtarg)))
+                end
+            end
+        end
         # Helmholtz
     elseif K isa Inti.SingleLayerKernel{ComplexF64, <:Inti.Helmholtz{2}}
         charges = Vector{ComplexF64}(undef, n)
@@ -287,6 +430,178 @@ function Inti._assemble_fmm2d(iop::Inti.IntegralOperator; rtol = sqrt(eps()))
                 return copyto!(y, sum(xnormals .* out.gradtarg; dims = 1) |> vec)
             end
         end
+    elseif K isa Inti.GradientSingleLayerKernel{<:SVector{2}, <:Inti.Helmholtz{2}}
+        charges = Vector{ComplexF64}(undef, n)
+        zk = ComplexF64(K.op.k)
+        return LinearMaps.LinearMap{SVector{2, ComplexF64}}(m, n) do y, x
+            # multiply by weights and constant
+            @. charges = weights * x
+            if same_surface
+                out = FMM2D.hfmm2d(;
+                    zk = zk,
+                    sources = sources,
+                    charges = charges,
+                    eps = rtol,
+                    pg = 2,
+                )
+                return copyto!(y, reinterpret(SVector{2, ComplexF64}, vec(out.grad)))
+            else
+                out = FMM2D.hfmm2d(;
+                    zk = zk,
+                    sources = sources,
+                    charges = charges,
+                    targets = targets,
+                    eps = rtol,
+                    pgt = 2,
+                )
+                return copyto!(y, reinterpret(SVector{2, ComplexF64}, vec(out.gradtarg)))
+            end
+        end
+    elseif K isa Inti.GradientDoubleLayerKernel{<:SVector{2}, <:Inti.Helmholtz{2}}
+        normals = Matrix{Float64}(undef, 2, n)
+        for j in 1:n
+            normals[:, j] = Inti.normal(iop.source[j])
+        end
+        dipvecs = similar(normals, Float64)
+        dipstrs = Vector{ComplexF64}(undef, n)
+        zk = ComplexF64(K.op.k)
+        return LinearMaps.LinearMap{SVector{2, ComplexF64}}(m, n) do y, x
+            # multiply by weights and constant
+            for j in 1:n
+                dipvecs[:, j] = view(normals, :, j) * weights[j]
+            end
+            for j in 1:n
+                dipstrs[j] = x[j]
+            end
+            if same_surface
+                out = FMM2D.hfmm2d(;
+                    zk = zk,
+                    sources = sources,
+                    dipstr = dipstrs,
+                    dipvecs = dipvecs,
+                    eps = rtol,
+                    pg = 2,
+                )
+                return copyto!(y, reinterpret(SVector{2, ComplexF64}, vec(out.grad)))
+            else
+                out = FMM2D.hfmm2d(;
+                    zk = zk,
+                    sources = sources,
+                    targets = targets,
+                    dipstr = dipstrs,
+                    dipvecs = dipvecs,
+                    eps = rtol,
+                    pgt = 2,
+                )
+                return copyto!(y, reinterpret(SVector{2, ComplexF64}, vec(out.gradtarg)))
+            end
+        end
+
+    elseif K isa Inti.SourceGradientSingleLayerKernel{<:Any, <:Inti.Helmholtz{2}}
+        # ∫∇yG⋅g = Σ wⱼ gⱼ⋅∇yG : dipoles with vector strengths g. The W operator
+        # W[g] = -∫∇yG⋅g applies the leading minus when this map is assembled.
+        # FMM2D's `hfmm2d` requires real dipole directions (with a complex
+        # strength), so the complex density is split into real/imaginary parts:
+        #   Re(g): dipvec=Re(gⱼ), dipstr=wⱼ ;  Im(g): dipvec=Im(gⱼ), dipstr=i wⱼ.
+        dipvecs = Matrix{Float64}(undef, 2, n)
+        dipstr = Vector{ComplexF64}(undef, n)
+        zk = ComplexF64(K.op.k)
+        return LinearMaps.LinearMap{ComplexF64}(m, n) do y, x
+            fill!(y, 0)
+            for (getpart, strconst) in ((real, one(ComplexF64)), (imag, 1im))
+                for j in 1:n
+                    dipvecs[:, j] = getpart.(x[j])
+                    dipstr[j] = strconst * weights[j]
+                end
+                if same_surface
+                    out = FMM2D.hfmm2d(;
+                        zk = zk,
+                        sources = sources,
+                        dipstr = dipstr,
+                        dipvecs = dipvecs,
+                        eps = rtol,
+                        pg = 1,
+                    )
+                    y .+= out.pot
+                else
+                    out = FMM2D.hfmm2d(;
+                        zk = zk,
+                        sources = sources,
+                        targets = targets,
+                        dipstr = dipstr,
+                        dipvecs = dipvecs,
+                        eps = rtol,
+                        pgt = 1,
+                    )
+                    y .+= out.pottarg
+                end
+            end
+            return y
+        end
+    elseif K isa Inti.HessianKernel{<:Any, <:Inti.Helmholtz{2}}
+        # Charge→Hessian realization of the Hessian single-layer *volume* operator used by the
+        # `X = ∇W` VDIM correction in 2D: a scalar density `ρ` maps to `∫∇ₓ∇ₓG(x,y)ρ(y)dy`
+        # (a 2×2 `SMatrix` per target), i.e. the Hessian of the single-layer potential of
+        # charges `ρ`. `rfmm2d` returns the 3 unique second derivatives per point as `(3,·)` in
+        # the order ∂xx, ∂xy, ∂yy;
+        if K.charge_dipole == :charge
+            #`hess`/`hesstarg` is (3,·) = ∂xx, ∂xy, ∂yy.
+            charges = Vector{ComplexF64}(undef, n)
+            zk = ComplexF64(K.op.k)
+            return LinearMaps.LinearMap{SMatrix{2, 2, ComplexF64, 4}}(m, n) do y, x
+                @. charges = weights * x
+                if same_surface
+                    out = FMM2D.hfmm2d(; zk = zk, sources = sources, charges = charges, eps = rtol, pg = 3)
+                    H = out.hess
+                else
+                    out = FMM2D.hfmm2d(;
+                        zk = zk,
+                        sources = sources,
+                        charges = charges,
+                        targets = targets,
+                        eps = rtol,
+                        pgt = 3,
+                    )
+                    H = out.hesstarg
+                end
+                @inbounds for i in 1:m
+                    y[i] = SMatrix{2, 2, ComplexF64, 4}(H[1, i], H[2, i], H[2, i], H[3, i])
+                end
+                return y
+            end
+        # X forward = +∫∇ₓ∇ₓG⋅g = -∇ₓ(∫∇yG⋅g). The bracket is the ∇yG-dipole field (W
+        # forward); its target-gradient is ∫∇ₓ∇yG⋅g = -X_forward, so the dipole strengths
+        # are negated and `grad`/`gradtarg` is the output. As for the W forward, hfmm2d
+        # needs real dipvecs with complex strengths, so the complex density is split into
+        # real/imag parts (dipstr = wⱼ resp. i·wⱼ).
+        else
+            dipvecs = Matrix{Float64}(undef, 2, n)
+            dipstr = Vector{ComplexF64}(undef, n)
+            zk = ComplexF64(K.op.k)
+            return LinearMaps.LinearMap{SVector{2, ComplexF64}}(m, n) do y, x
+                fill!(y, zero(SVector{2, ComplexF64}))
+                for (getpart, strconst) in ((real, one(ComplexF64)), (imag, 1im))
+                    for j in 1:n
+                        dipvecs[:, j] = -getpart.(x[j])
+                        dipstr[j] = strconst * weights[j]
+                    end
+                    if same_surface
+                        out = FMM2D.hfmm2d(;
+                            zk = zk, sources = sources, dipstr = dipstr, dipvecs = dipvecs,
+                            eps = rtol, pg = 2,
+                        )
+                        y .+= reinterpret(SVector{2, ComplexF64}, vec(out.grad))
+                    else
+                        out = FMM2D.hfmm2d(;
+                            zk = zk, sources = sources, targets = targets, dipstr = dipstr,
+                            dipvecs = dipvecs, eps = rtol, pgt = 2,
+                        )
+                        y .+= reinterpret(SVector{2, ComplexF64}, vec(out.gradtarg))
+                    end
+                end
+                return y
+            end
+        end
     else
         error("integral operator not supported by Inti's FMM2D wrapper")
     end
diff --git a/ext/IntiFMM3DExt.jl b/ext/IntiFMM3DExt.jl
index 1e92caea..2a0ac4da 100644
--- a/ext/IntiFMM3DExt.jl
+++ b/ext/IntiFMM3DExt.jl
@@ -9,7 +9,7 @@ function __init__()
     return @info "Loading Inti.jl FMM3D extension"
 end
 
-function Inti._assemble_fmm3d(iop::Inti.IntegralOperator; rtol = sqrt(eps()))
+function Inti._assemble_fmm3d(iop::Inti.IntegralOperator; rtol = sqrt(eps()), ndiv = nothing)
     # unpack the necessary fields in the appropriate format
     m, n = size(iop)
     targets = Matrix{Float64}(undef, 3, m)
@@ -27,8 +27,12 @@ function Inti._assemble_fmm3d(iop::Inti.IntegralOperator; rtol = sqrt(eps()))
         charges = Vector{Float64}(undef, n)
         return LinearMaps.LinearMap{Float64}(m, n) do y, x
             # multiply by weights and constant
-            @. charges = 1 / (4 * π) * weights * x
-            out = FMM3D.lfmm3d(rtol, sources; charges, targets, pgt = 1)
+            @. charges = weights * x
+            if !isnothing(ndiv)
+                out = FMM3D.lfmm3d_ndiv(rtol, sources, ndiv; charges, targets, pgt = 1)
+            else
+                out = FMM3D.lfmm3d(rtol, sources; charges, targets, pgt = 1)
+            end
             return copyto!(y, out.pottarg)
         end
     elseif K isa Inti.DoubleLayerKernel{Float64, <:Inti.Laplace{3}}
@@ -40,9 +44,13 @@ function Inti._assemble_fmm3d(iop::Inti.IntegralOperator; rtol = sqrt(eps()))
         return LinearMaps.LinearMap{Float64}(m, n) do y, x
             # multiply by weights and constant
             for j in 1:n
-                dipvecs[:, j] = 1 / (4 * π) * view(normals, :, j) * x[j] * weights[j]
+                dipvecs[:, j] = view(normals, :, j) * x[j] * weights[j]
+            end
+            if !isnothing(ndiv)
+                out = FMM3D.lfmm3d_ndiv(rtol, sources, ndiv; dipvecs, targets, pgt = 1)
+            else
+                out = FMM3D.lfmm3d(rtol, sources; dipvecs, targets, pgt = 1)
             end
-            out = FMM3D.lfmm3d(rtol, sources; dipvecs, targets, pgt = 1)
             return copyto!(y, out.pottarg)
         end
     elseif K isa Inti.AdjointDoubleLayerKernel{Float64, <:Inti.Laplace{3}}
@@ -53,8 +61,12 @@ function Inti._assemble_fmm3d(iop::Inti.IntegralOperator; rtol = sqrt(eps()))
         charges = Vector{Float64}(undef, n)
         return LinearMaps.LinearMap{Float64}(m, n) do y, x
             # multiply by weights and constant
-            @. charges = 1 / (4 * π) * weights * x
-            out = FMM3D.lfmm3d(rtol, sources; charges, targets, pgt = 2)
+            @. charges = weights * x
+            if !isnothing(ndiv)
+                out = FMM3D.lfmm3d_ndiv(rtol, sources, ndiv; charges, targets, pgt = 2)
+            else
+                out = FMM3D.lfmm3d(rtol, sources; charges, targets, pgt = 2)
+            end
             return copyto!(y, sum(xnormals .* out.gradtarg; dims = 1) |> vec)
         end
     elseif K isa Inti.HyperSingularKernel{Float64, <:Inti.Laplace{3}}
@@ -70,19 +82,109 @@ function Inti._assemble_fmm3d(iop::Inti.IntegralOperator; rtol = sqrt(eps()))
         return LinearMaps.LinearMap{Float64}(m, n) do y, x
             # multiply by weights and constant
             for j in 1:n
-                dipvecs[:, j] = 1 / (4 * π) * view(ynormals, :, j) * x[j] * weights[j]
+                dipvecs[:, j] = view(ynormals, :, j) * x[j] * weights[j]
+            end
+            if !isnothing(ndiv)
+                out = FMM3D.lfmm3d_ndiv(rtol, sources, ndiv; dipvecs, targets, pgt = 2)
+            else
+                out = FMM3D.lfmm3d(rtol, sources; dipvecs, targets, pgt = 2)
             end
-            out = FMM3D.lfmm3d(rtol, sources; dipvecs, targets, pgt = 2)
             return copyto!(y, sum(xnormals .* out.gradtarg; dims = 1) |> vec)
         end
+    elseif K isa Inti.GradientSingleLayerKernel{<:SVector{3}, <:Inti.Laplace{3}}
+        charges = Vector{Float64}(undef, n)
+        return LinearMaps.LinearMap{SVector{3, Float64}}(m, n) do y, x
+            # multiply by weights and constant
+            @. charges = weights * x
+            if !isnothing(ndiv)
+                out = FMM3D.lfmm3d_ndiv(rtol, sources, ndiv; charges, targets, pgt = 2)
+            else
+                out = FMM3D.lfmm3d(rtol, sources; charges, targets, pgt = 2)
+            end
+            return copyto!(y, reinterpret(SVector{3, Float64}, vec(out.gradtarg)))
+        end
+    elseif K isa Inti.GradientDoubleLayerKernel{<:SVector{3}, <:Inti.Laplace{3}}
+        normals = Matrix{Float64}(undef, 3, n)
+        for j in 1:n
+            normals[:, j] = Inti.normal(iop.source[j])
+        end
+        dipvecs = similar(normals)
+        return LinearMaps.LinearMap{SVector{3, Float64}}(m, n) do y, x
+            dipvecs .= normals .* transpose(x .* weights)
+            if !isnothing(ndiv)
+                out = FMM3D.lfmm3d_ndiv(rtol, sources, ndiv; dipvecs, targets, pgt = 2)
+            else
+                out = FMM3D.lfmm3d(rtol, sources; dipvecs, targets, pgt = 2)
+            end
+            return copyto!(y, reinterpret(SVector{3, Float64}, vec(out.gradtarg)))
+        end
+    elseif K isa Inti.SourceGradientSingleLayerKernel{<:Any, <:Inti.Laplace{3}}
+        # ∇yG(x,y)⋅g : dipoles with vector strengths g (scalar output). The W operator
+        # W[g] = -∫∇yG⋅g applies the leading minus when this map is assembled.
+        dipvecs = Matrix{Float64}(undef, 3, n)
+        return LinearMaps.LinearMap{Float64}(m, n) do y, x
+            for j in 1:n
+                dipvecs[:, j] = x[j] * weights[j]
+            end
+            if !isnothing(ndiv)
+                out = FMM3D.lfmm3d_ndiv(rtol, sources, ndiv; dipvecs, targets, pgt = 1)
+            else
+                out = FMM3D.lfmm3d(rtol, sources; dipvecs, targets, pgt = 1)
+            end
+            return copyto!(y, out.pottarg)
+        end
+    elseif K isa Inti.HessianKernel{<:Any, <:Inti.Laplace{3}}
+        # Charge→Hessian realization of the Hessian *volume* operator used by
+        # the `X = ∇W` VDIM correction: a scalar density `ρ` maps to
+        # `∫∇ₓ∇ₓG(x,y)ρ(y)dy` (an `SMatrix` per target), i.e. the Hessian of the
+        # volume potential of charges `ρ`.
+        if K.charge_dipole == :charges
+            charges = Vector{Float64}(undef, n)
+            return LinearMaps.LinearMap{SMatrix{3, 3, Float64, 9}}(m, n) do y, x
+                @. charges = weights * x
+                out = FMM3D.lfmm3d(rtol, sources; charges, targets, pgt = 3)
+                # `hesstarg` is (6, m): the unique second derivatives in the order
+                # ∂xx, ∂yy, ∂zz, ∂xy, ∂xz, ∂yz. Assemble the symmetric `SMatrix`.
+                H = out.hesstarg
+                @inbounds for i in 1:m
+                    y[i] = SMatrix{3, 3, Float64, 9}(
+                        H[1, i], H[4, i], H[5, i],
+                        H[4, i], H[2, i], H[6, i],
+                        H[5, i], H[6, i], H[3, i],
+                    )
+                end
+                return y
+            end
+        # X forward = +∫∇ₓ∇ₓG⋅g, realized as the target-gradient of the
+        # ∇yG-dipole field with strengths -g (gradtarg = ∫∇ₓ∇yG⋅g = -X_forward,
+        # so the strengths are negated). It performs a dipole→gradient
+        # contraction of the Hessian: `SVector→SVector`.
+        else
+            dipvecs = Matrix{Float64}(undef, 3, n)
+            return LinearMaps.LinearMap{SVector{3, Float64}}(m, n) do y, x
+                for j in 1:n
+                    dipvecs[:, j] = -1.0 * x[j] * weights[j]
+                end
+                if !isnothing(ndiv)
+                    out = FMM3D.lfmm3d_ndiv(rtol, sources, ndiv; dipvecs, targets, pgt = 2)
+                else
+                    out = FMM3D.lfmm3d(rtol, sources; dipvecs, targets, pgt = 2)
+                end
+                return copyto!(y, reinterpret(SVector{3, Float64}, vec(out.gradtarg)))
+            end
+        end
         # Helmholtz
     elseif K isa Inti.SingleLayerKernel{ComplexF64, <:Inti.Helmholtz{3}}
         charges = Vector{ComplexF64}(undef, n)
         zk = ComplexF64(K.op.k)
         return LinearMaps.LinearMap{ComplexF64}(m, n) do y, x
             # multiply by weights and constant
-            @. charges = 1 / (4 * π) * weights * x
-            out = FMM3D.hfmm3d(rtol, zk, sources; charges, targets, pgt = 1)
+            @. charges = weights * x
+            if !isnothing(ndiv)
+                out = FMM3D.hfmm3d_ndiv(rtol, zk, sources, ndiv; charges, targets, pgt = 1)
+            else
+                out = FMM3D.hfmm3d(rtol, zk, sources; charges, targets, pgt = 1)
+            end
             return copyto!(y, out.pottarg)
         end
     elseif K isa Inti.DoubleLayerKernel{ComplexF64, <:Inti.Helmholtz{3}}
@@ -95,9 +197,13 @@ function Inti._assemble_fmm3d(iop::Inti.IntegralOperator; rtol = sqrt(eps()))
         return LinearMaps.LinearMap{ComplexF64}(m, n) do y, x
             # multiply by weights and constant
             for j in 1:n
-                dipvecs[:, j] = 1 / (4 * π) * view(normals, :, j) * x[j] * weights[j]
+                dipvecs[:, j] = view(normals, :, j) * x[j] * weights[j]
+            end
+            if !isnothing(ndiv)
+                out = FMM3D.hfmm3d_ndiv(rtol, zk, sources, ndiv; dipvecs, targets, pgt = 1)
+            else
+                out = FMM3D.hfmm3d(rtol, zk, sources; dipvecs, targets, pgt = 1)
             end
-            out = FMM3D.hfmm3d(rtol, zk, sources; dipvecs, targets, pgt = 1)
             return copyto!(y, out.pottarg)
         end
     elseif K isa Inti.AdjointDoubleLayerKernel{ComplexF64, <:Inti.Helmholtz{3}}
@@ -109,8 +215,12 @@ function Inti._assemble_fmm3d(iop::Inti.IntegralOperator; rtol = sqrt(eps()))
         zk = ComplexF64(K.op.k)
         return LinearMaps.LinearMap{ComplexF64}(m, n) do y, x
             # multiply by weights and constant
-            @. charges = 1 / (4 * π) * weights * x
-            out = FMM3D.hfmm3d(rtol, zk, sources; charges, targets, pgt = 2)
+            @. charges = weights * x
+            if !isnothing(ndiv)
+                out = FMM3D.hfmm3d_ndiv(rtol, zk, sources; charges, targets, pgt = 2)
+            else
+                out = FMM3D.hfmm3d(rtol, zk, sources; charges, targets, pgt = 2)
+            end
             return copyto!(y, sum(xnormals .* out.gradtarg; dims = 1) |> vec)
         end
     elseif K isa Inti.HyperSingularKernel{ComplexF64, <:Inti.Helmholtz{3}}
@@ -127,18 +237,89 @@ function Inti._assemble_fmm3d(iop::Inti.IntegralOperator; rtol = sqrt(eps()))
         return LinearMaps.LinearMap{ComplexF64}(m, n) do y, x
             # multiply by weights and constant
             for j in 1:n
-                dipvecs[:, j] = 1 / (4 * π) * view(ynormals, :, j) * x[j] * weights[j]
+                dipvecs[:, j] = view(ynormals, :, j) * x[j] * weights[j]
+            end
+            if !isnothing(ndiv)
+                out = FMM3D.hfmm3d_ndiv(rtol, zk, sources, ndiv; dipvecs, targets, pgt = 2)
+            else
+                out = FMM3D.hfmm3d(rtol, zk, sources; dipvecs, targets, pgt = 2)
             end
-            out = FMM3D.hfmm3d(rtol, zk, sources; dipvecs, targets, pgt = 2)
             return copyto!(y, sum(xnormals .* out.gradtarg; dims = 1) |> vec)
         end
+    elseif K isa Inti.GradientSingleLayerKernel{<:SVector{3}, <:Inti.Helmholtz{3}}
+        charges = Vector{ComplexF64}(undef, n)
+        zk = ComplexF64(K.op.k)
+        return LinearMaps.LinearMap{SVector{3, ComplexF64}}(m, n) do y, x
+            # multiply by weights and constant
+            @. charges = weights * x
+            if !isnothing(ndiv)
+                out = FMM3D.hfmm3d_ndiv(rtol, zk, sources, ndiv; charges, targets, pgt = 2)
+            else
+                out = FMM3D.hfmm3d(rtol, zk, sources; charges, targets, pgt = 2)
+            end
+            return copyto!(y, reinterpret(SVector{3, ComplexF64}, vec(out.gradtarg)))
+        end
+    elseif K isa Inti.GradientDoubleLayerKernel{<:SVector{3}, <:Inti.Helmholtz{3}}
+        normals = Matrix{Float64}(undef, 3, n)
+        for j in 1:n
+            normals[:, j] = Inti.normal(iop.source[j])
+        end
+        dipvecs = similar(normals, ComplexF64)
+        zk = ComplexF64(K.op.k)
+        return LinearMaps.LinearMap{SVector{3, ComplexF64}}(m, n) do y, x
+            dipvecs .= normals .* transpose(x .* weights)
+            if !isnothing(ndiv)
+                out = FMM3D.hfmm3d_ndiv(rtol, zk, sources, ndiv; dipvecs, targets, pgt = 2)
+            else
+                out = FMM3D.hfmm3d(rtol, zk, sources; dipvecs, targets, pgt = 2)
+            end
+            return copyto!(y, reinterpret(SVector{3, ComplexF64}, vec(out.gradtarg)))
+        end
+    elseif K isa Inti.SourceGradientSingleLayerKernel{<:Any, <:Inti.Helmholtz{3}}
+        # ∇yG(x,y)⋅g : dipoles with vector strengths g (scalar output). The W operator
+        # W[g] = -∫∇yG⋅g applies the leading minus when this map is assembled.
+        dipvecs = Matrix{ComplexF64}(undef, 3, n)
+        zk = ComplexF64(K.op.k)
+        return LinearMaps.LinearMap{ComplexF64}(m, n) do y, x
+            for j in 1:n
+                dipvecs[:, j] = x[j] * weights[j]
+            end
+            if !isnothing(ndiv)
+                out = FMM3D.hfmm3d(rtol, zk, sources, ndiv; dipvecs, targets, pgt = 1)
+            else
+                out = FMM3D.hfmm3d(rtol, zk, sources; dipvecs, targets, pgt = 1)
+            end
+            return copyto!(y, out.pottarg)
+        end
+    elseif K isa Inti.HessianKernel{<:Any, <:Inti.Helmholtz{3}}
+        # X forward = +∫∇ₓ∇ₓG⋅g, realized as the target-gradient of the
+        # ∇yG-dipole field with strengths -g (gradtarg = ∫∇ₓ∇yG⋅g = -X_forward,
+        # so the strengths are negated). It performs a dipole→gradient
+        # contraction of the Hessian: `SVector→SVector`.
+        # hfmm3d has no Hessian for an optimized volume routine (see 'opt_vol'
+        # in vdim.jl), but the forward map only needs the gradient (pgt = 2).
+        msg = "FMM3D does not support charge Hessian computations for Helmholtz"
+        K.charge_dipole == :dipole || error(msg)
+        zk = ComplexF64(K.op.k)
+        dipvecs = Matrix{ComplexF64}(undef, 3, n)
+        return LinearMaps.LinearMap{SVector{3, ComplexF64}}(m, n) do y, x
+            for j in 1:n
+                dipvecs[:, j] = -1.0 * x[j] * weights[j]
+            end
+            if !isnothing(ndiv)
+                out = FMM3D.hfmm3d_ndiv(rtol, zk, sources, ndiv; dipvecs, targets, pgt = 2)
+            else
+                out = FMM3D.hfmm3d(rtol, zk, sources; dipvecs, targets, pgt = 2)
+            end
+            return copyto!(y, reinterpret(SVector{3, ComplexF64}, vec(out.gradtarg)))
+        end
         # Stokes
     elseif K isa Inti.SingleLayerKernel{SMatrix{3, 3, Float64, 9}, <:Inti.Stokes{3, Float64}}
         T = SVector{3, Float64}
         stoklet = Matrix{Float64}(undef, 3, n)
         return LinearMaps.LinearMap{SMatrix{3, 3, Float64, 9}}(m, n) do y, x
             # multiply by weights and constant
-            stoklet[:] = 1 / (4 * π * K.op.μ) .* reinterpret(Float64, weights .* x)
+            stoklet[:] = 1 / K.op.μ .* reinterpret(Float64, weights .* x)
             out = FMM3D.stfmm3d(rtol, sources; stoklet, targets, ppregt = 1)
             return copyto!(y, reinterpret(T, out.pottarg))
         end
@@ -152,7 +333,7 @@ function Inti._assemble_fmm3d(iop::Inti.IntegralOperator; rtol = sqrt(eps()))
         strslet = similar(normals, Float64)
         # multiply by weights and constant
         for j in 1:n
-            strsvec[:, j] = -1 / (4 * π) * view(normals, :, j) .* weights[j]
+            strsvec[:, j] = view(normals, :, j) .* weights[j]
         end
         return LinearMaps.LinearMap{SMatrix{3, 3, Float64, 9}}(m, n) do y, x
             strslet[:] = reinterpret(Float64, x)
diff --git a/ext/IntiFMMLIB2DExt.jl b/ext/IntiFMMLIB2DExt.jl
index 5dd95012..1ec613f0 100644
--- a/ext/IntiFMMLIB2DExt.jl
+++ b/ext/IntiFMMLIB2DExt.jl
@@ -3,6 +3,7 @@ module IntiFMMLIB2DExt
 import Inti
 import FMMLIB2D
 import LinearMaps
+using StaticArrays
 
 function __init__()
     return @info "Loading Inti.jl FMMLIB2D extension"
@@ -150,6 +151,130 @@ function Inti._assemble_fmm2d(iop::Inti.IntegralOperator; rtol = sqrt(eps()))
                 return copyto!(y, sum(xnormals .* out.gradtarg; dims = 1) |> vec)
             end
         end
+    elseif K isa Inti.GradientSingleLayerKernel{<:SVector{2}, <:Inti.Laplace{2}}
+        # ∇ₓG : charges with scalar strengths (SVector output = ∇ₓ of the single layer).
+        charges = Vector{Float64}(undef, n)
+        return LinearMaps.LinearMap{SVector{2, Float64}}(m, n) do y, x
+            @. charges = -1 / (2 * π) * weights * x
+            # FMMLIB2D does no checking for if targets are also sources
+            if same_surface
+                out = FMMLIB2D.rfmm2d(;
+                    source = sources,
+                    charge = charges,
+                    ifgrad = true,
+                    tol = rtol,
+                )
+                return copyto!(y, reinterpret(SVector{2, Float64}, vec(out.grad)))
+            else
+                out = FMMLIB2D.rfmm2d(;
+                    source = sources,
+                    charge = charges,
+                    target = targets,
+                    ifgradtarg = true,
+                    tol = rtol,
+                )
+                return copyto!(y, reinterpret(SVector{2, Float64}, vec(out.gradtarg)))
+            end
+        end
+    elseif K isa Inti.GradientDoubleLayerKernel{<:SVector{2}, <:Inti.Laplace{2}}
+        # ∇ₓ∂_{n_y}G : dipoles (vec = source normal, str = density), SVector output.
+        normals = Matrix{Float64}(undef, 2, n)
+        for j in 1:n
+            normals[:, j] = Inti.normal(iop.source[j])
+        end
+        dipvecs = similar(normals)
+        dipstr = Vector{Float64}(undef, n)
+        return LinearMaps.LinearMap{SVector{2, Float64}}(m, n) do y, x
+            for j in 1:n
+                dipvecs[:, j] = -1 / (2 * π) * view(normals, :, j) * weights[j]
+            end
+            for j in 1:n
+                dipstr[j] = x[j]
+            end
+            # FMMLIB2D does no checking for if targets are also sources
+            if same_surface
+                out = FMMLIB2D.rfmm2d(;
+                    source = sources,
+                    dipstr = dipstr,
+                    dipvec = dipvecs,
+                    ifgrad = true,
+                    tol = rtol,
+                )
+                return copyto!(y, reinterpret(SVector{2, Float64}, vec(out.grad)))
+            else
+                out = FMMLIB2D.rfmm2d(;
+                    source = sources,
+                    target = targets,
+                    dipstr = dipstr,
+                    dipvec = dipvecs,
+                    ifgradtarg = true,
+                    tol = rtol,
+                )
+                return copyto!(y, reinterpret(SVector{2, Float64}, vec(out.gradtarg)))
+            end
+        end
+    elseif K isa Inti.SourceGradientSingleLayerKernel{<:Any, <:Inti.Laplace{2}}
+        # ∇yG(x,y)⋅g : dipoles with vector strengths g (scalar output). The W operator
+        # W[g] = -∫∇yG⋅g applies the leading minus when this map is assembled.
+        dipvecs = Matrix{Float64}(undef, 2, n)
+        dipstr = ones(Float64, n)
+        return LinearMaps.LinearMap{Float64}(m, n) do y, x
+            for j in 1:n
+                dipvecs[:, j] = -(1 / (2 * π)) * x[j] * weights[j]
+            end
+            # FMMLIB2D does no checking for if targets are also sources
+            if same_surface
+                out = FMMLIB2D.rfmm2d(;
+                    source = sources,
+                    dipstr = dipstr,
+                    dipvec = dipvecs,
+                    tol = rtol,
+                )
+                return copyto!(y, out.pot)
+            else
+                out = FMMLIB2D.rfmm2d(;
+                    source = sources,
+                    target = targets,
+                    dipstr = dipstr,
+                    dipvec = dipvecs,
+                    tol = rtol,
+                )
+                return copyto!(y, out.pottarg)
+            end
+        end
+    elseif K isa Inti.HessianSingleLayerKernel{<:Any, <:Inti.Laplace{2}}
+        # X forward (PV part) = +∫∇ₓ∇ₓG⋅g = -∇ₓ(∫∇yG⋅g): dipoles with vector strengths g,
+        # whose target-gradient is ∫∇ₓ∇yG⋅g = -X_forward, so the strengths are negated
+        # (the +1/(2π) Laplace 2D prefactor folds in) and `grad`/`gradtarg` is the
+        # `SVector` output directly.
+        dipvecs = Matrix{Float64}(undef, 2, n)
+        dipstrs = ones(Float64, n)
+        return LinearMaps.LinearMap{SVector{2, Float64}}(m, n) do y, x
+            for j in 1:n
+                dipvecs[:, j] = (1 / (2 * π)) * x[j] * weights[j]
+            end
+            # FMMLIB2D does no checking for if targets are also sources
+            if same_surface
+                out = FMMLIB2D.rfmm2d(;
+                    source = sources,
+                    dipstr = dipstrs,
+                    dipvec = dipvecs,
+                    ifgrad = true,
+                    tol = rtol,
+                )
+                return copyto!(y, reinterpret(SVector{2, Float64}, vec(out.grad)))
+            else
+                out = FMMLIB2D.rfmm2d(;
+                    source = sources,
+                    target = targets,
+                    dipstr = dipstrs,
+                    dipvec = dipvecs,
+                    ifgradtarg = true,
+                    tol = rtol,
+                )
+                return copyto!(y, reinterpret(SVector{2, Float64}, vec(out.gradtarg)))
+            end
+        end
         # Helmholtz
     elseif K isa Inti.SingleLayerKernel{ComplexF64, <:Inti.Helmholtz{2}}
         charges = Vector{ComplexF64}(undef, n)
@@ -296,4 +421,52 @@ function Inti._assemble_fmm2d(iop::Inti.IntegralOperator; rtol = sqrt(eps()))
     end
 end
 
+# Charge→Hessian realization of the Hessian single-layer *volume* operator used by the
+# `X = ∇W` VDIM correction in 2D: a scalar density `ρ` maps to `∫∇ₓ∇ₓG(x,y)ρ(y)dy`
+# (a 2×2 `SMatrix` per target), i.e. the Hessian of the single-layer potential of
+# charges `ρ`. `rfmm2d` returns the 3 unique second derivatives per point as `(3,·)` in
+# the order ∂xx, ∂xy, ∂yy; Inti's Laplace 2D kernel carries the −1/(2π) prefactor, folded
+# into the charges exactly as for the single layer.
+function Inti._assemble_fmm2d_chargehessian(iop::Inti.IntegralOperator; rtol = sqrt(eps()))
+    m, n = size(iop)
+    sources = Matrix{Float64}(undef, 2, n)
+    for j in 1:n
+        sources[:, j] = Inti.coords(iop.source[j])
+    end
+    targets = Matrix{Float64}(undef, 2, m)
+    for i in 1:m
+        targets[:, i] = Inti.coords(iop.target[i])
+    end
+    weights = [q.weight for q in iop.source]
+    same_surface =
+        m == n ? isapprox(targets, sources; atol = Inti.SAME_POINT_TOLERANCE) : false
+    K = iop.kernel
+    if K isa Inti.HessianSingleLayerKernel{<:Any, <:Inti.Laplace{2}}
+        charges = Vector{Float64}(undef, n)
+        return LinearMaps.LinearMap{SMatrix{2, 2, Float64, 4}}(m, n) do y, x
+            @. charges = -1 / (2 * π) * weights * x
+            # FMMLIB2D does no checking for if targets are also sources
+            if same_surface
+                out = FMMLIB2D.rfmm2d(; source = sources, charge = charges, ifhess = true, tol = rtol)
+                H = out.hess        # (3, n): ∂xx, ∂xy, ∂yy
+            else
+                out = FMMLIB2D.rfmm2d(;
+                    source = sources,
+                    charge = charges,
+                    target = targets,
+                    ifhesstarg = true,
+                    tol = rtol,
+                )
+                H = out.hesstarg
+            end
+            @inbounds for i in 1:m
+                y[i] = SMatrix{2, 2, Float64, 4}(H[1, i], H[2, i], H[2, i], H[3, i])
+            end
+            return y
+        end
+    else
+        error("Inti's FMMLIB2D charge→Hessian wrapper only supports Laplace 2D")
+    end
+end
+
 end # module
diff --git a/src/Inti.jl b/src/Inti.jl
index 8179e911..e6f99312 100644
--- a/src/Inti.jl
+++ b/src/Inti.jl
@@ -7,19 +7,27 @@ module Inti
 
 const PROJECT_ROOT = pkgdir(Inti)
 
+import Bessels
+import ElementaryPDESolutions
+import HAdaptiveIntegration
+import SpecialFunctions
+
+import ElementaryPDESolutions: Polynomial
+
 using DataStructures
 using ForwardDiff
 using LinearAlgebra
 using LinearMaps
+using Logging
 using NearestNeighbors
 using Pkg
+using Printf
 using QuadGK
+using Richardson
 using Scratch
 using SparseArrays
 using StaticArrays
-using Printf
 using TOML
-using Richardson
 using OrderedCollections
 
 import ElementaryPDESolutions
@@ -27,12 +35,14 @@ import SpecialFunctions
 import Bessels # faster than SpecialFunctions for Bessel functions with real args
 import HAdaptiveIntegration
 
+
 # helper functions
 include("utils.jl")
 include("blockarray.jl")
 
 # basic interpolation and integration
 include("reference_shapes.jl")
+
 include("polynomials.jl")
 include("reference_interpolation.jl")
 include("quad_rules_tables.jl")
diff --git a/src/api.jl b/src/api.jl
index 26cbccf1..9eaa0dec 100644
--- a/src/api.jl
+++ b/src/api.jl
@@ -15,13 +15,14 @@ const CORRECTION_METHODS = [:none, :dim, :adaptive]
 
 """
     single_double_layer(; op, target, source::Quadrature, compression,
-    correction, derivative = false)
+    correction, kernel_variant = :default)
 
 Construct a discrete approximation to the single- and double-layer integral operators for
 `op`, mapping values defined on the quadrature nodes of `source` to values defined on the
-nodes of `target`. If `derivative = true`, return instead the adjoint double-layer and
-hypersingular operators (which are the generalized Neumann trace of the single- and
-double-layer, respectively).
+nodes of `target`. The `kernel_variant` keyword controls which pair of kernels is used:
+`:default` gives the standard single- and double-layer kernels, `:neumann` gives the
+adjoint double-layer and hypersingular kernels (i.e. the generalized Neumann trace of the
+single- and double-layer), and `:gradient` gives the gradient kernels.
 
 For finer control, you must choose a `compression` method and a `correction` method, as
 described below.
@@ -68,12 +69,28 @@ function single_double_layer(;
         source,
         compression = (method = :none,),
         correction = (method = :adaptive,),
-        derivative = false,
+        kernel_variant::Symbol = :default,
+        derivative = nothing,
     )
+    if !isnothing(derivative)
+        Base.depwarn(
+            "The `derivative` keyword is deprecated; use `kernel_variant = :neumann` instead.",
+            :single_double_layer,
+        )
+        kernel_variant = derivative ? :neumann : :default
+    end
     compression = _normalize_compression(compression, target, source)
     correction = _normalize_correction(correction, target, source)
-    G = derivative ? AdjointDoubleLayerKernel(op) : SingleLayerKernel(op)
-    dG = derivative ? HyperSingularKernel(op) : DoubleLayerKernel(op)
+    if kernel_variant === :gradient
+        G = GradientSingleLayerKernel(op)
+        dG = GradientDoubleLayerKernel(op)
+    elseif kernel_variant === :neumann
+        G = AdjointDoubleLayerKernel(op)
+        dG = HyperSingularKernel(op)
+    else
+        G = SingleLayerKernel(op)
+        dG = DoubleLayerKernel(op)
+    end
     Sop = IntegralOperator(G, target, source)
     Dop = IntegralOperator(dG, target, source)
     # handle compression
@@ -143,7 +160,7 @@ function single_double_layer(;
                 Dop_dim_mat;
                 green_multiplier = green_multiplier[glob_near_trgs],
                 correction.maxdist,
-                derivative,
+                kernel_variant,
                 filter_target_params,
             )
         else
@@ -155,7 +172,7 @@ function single_double_layer(;
                 Dmat;
                 green_multiplier,
                 correction.maxdist,
-                derivative,
+                kernel_variant,
             )
         end
     elseif correction.method == :adaptive
@@ -207,7 +224,7 @@ function adj_double_layer_hypersingular(;
         source,
         compression,
         correction,
-        derivative = true,
+        kernel_variant = :neumann,
     )
 end
 
@@ -279,10 +296,22 @@ the specified compression method. If no compression is specified, the operator
 is returned as is. If a correction method is specified, the correction is
 computed and added to the compressed operator.
 """
-function volume_potential(; op, target, source::Quadrature, compression, correction)
+function volume_potential(; op, target, source::Quadrature, compression, correction, kernel_variant::Symbol = :default)
     correction = _normalize_correction(correction, target, source)
     compression = _normalize_compression(compression, target, source)
-    G = SingleLayerKernel(op)
+    if kernel_variant === :gradient
+        G = GradientSingleLayerKernel(op)
+    elseif kernel_variant === :gradient_source
+        # naive forward map of W[g] = -∫∇yG⋅g (vector density -> scalar). The kernel is
+        # ∇yG; the leading minus of W is applied when the operator is assembled below.
+        G = SourceGradientSingleLayerKernel(op)
+    elseif kernel_variant === :hessian_source
+        # forward map of X[g] = ∇W[g] (vector density -> vector). The PV part is
+        # +∫∇ₓ∇ₓG⋅g, so the (target) Hessian single-layer kernel is assembled as-is.
+        G = HessianKernel(op, :dipole)
+    else
+        G = SingleLayerKernel(op)
+    end
     V = IntegralOperator(G, target, source)
     # compress V
     if compression.method == :none
@@ -290,7 +319,7 @@ function volume_potential(; op, target, source::Quadrature, compression, correct
     elseif compression.method == :hmatrix
         Vmat = assemble_hmatrix(V; rtol = compression.tol)
     elseif compression.method == :fmm
-        Vmat = assemble_fmm(V; rtol = compression.tol)
+        Vmat = assemble_fmm(V; rtol = compression.tol, ndiv = compression.ndiv)
     else
         error("Unknown compression method. Available options: $COMPRESSION_METHODS")
     end
@@ -322,6 +351,10 @@ function volume_potential(; op, target, source::Quadrature, compression, correct
         else
             error("Missing correction.boundary field for :dim method on a volume potential")
         end
+        # The W (3.25) and X (3.28) regularizations both use the standard scalar
+        # single-/double-layer potentials, so for those variants the boundary
+        # operators are built with the `:default` kernel.
+        boundary_variant = kernel_variant in (:gradient_source, :hessian_source) ? :default : kernel_variant
         # Advanced usage: Use previously constructed layer operators for VDIM
         if !haskey(correction, :S_b2d) || !haskey(correction, :D_b2d)
             if haskey(correction, :green_multiplier)
@@ -331,6 +364,7 @@ function volume_potential(; op, target, source::Quadrature, compression, correct
                     source = boundary,
                     compression,
                     correction,
+                    kernel_variant = boundary_variant,
                 )
             else
                 S, D = single_double_layer(;
@@ -339,12 +373,55 @@ function volume_potential(; op, target, source::Quadrature, compression, correct
                     source = boundary,
                     compression,
                     correction = (correction..., target_location = loc),
+                    kernel_variant = boundary_variant,
                 )
             end
         else
             S = correction.S_b2d
             D = correction.D_b2d
         end
+        # The X regularization (3.28) additionally needs the gradient single-layer
+        # ∇ₓS for the `-∇ₓS[gⱼν]` term; build it from the `:gradient` variant (its
+        # single-layer return, `GS`, is the gradient single-layer).
+        grad_single_layer = nothing
+        if kernel_variant === :hessian_source
+            if haskey(correction, :green_multiplier)
+                grad_single_layer, _ = single_double_layer(;
+                    op, target, source = boundary, compression, correction,
+                    kernel_variant = :gradient,
+                )
+            else
+                grad_single_layer, _ = single_double_layer(;
+                    op, target, source = boundary, compression,
+                    correction = (correction..., target_location = loc),
+                    kernel_variant = :gradient,
+                )
+            end
+        end
+        # Volume operator used to build the correction.
+        if kernel_variant === :gradient_source
+            Vg = IntegralOperator(GradientSingleLayerKernel(op), target, source)
+            if compression.method == :none
+                Vcorr = assemble_matrix(Vg)
+            elseif compression.method == :hmatrix
+                Vcorr = assemble_hmatrix(Vg; rtol = compression.tol)
+            else
+                Vcorr = assemble_fmm(Vg; rtol = compression.tol)
+            end
+        elseif kernel_variant === :hessian_source && compression.method == :fmm
+            # X under FMM: the forward `Vmat` is a dipole→gradient map (`SVector` output)
+            # and cannot produce the Hessian `SMatrix` from a scalar monomial density. Build
+            # a dedicated charge→Hessian volume operator (scalar→`SMatrix`) so the correction
+            # applies it once per monomial (see `_vdim_correction_X`).
+            if (ambient_dimension(op) == 3 && op isa Laplace) || (ambient_dimension(op) == 2 && (op isa Laplace || op isa Helmholtz))
+                Vh = IntegralOperator(HessianKernel(op, :charge), target, source)
+            else
+                Vh = IntegralOperator(HessianKernel(op, :dipole), target, source)
+            end
+            Vcorr = assemble_fmm(Vh; rtol = compression.tol)
+        else
+            Vcorr = Vmat
+        end
         interpolation_order = correction.interpolation_order
         δV = vdim_correction(
             op,
@@ -353,16 +430,33 @@ function volume_potential(; op, target, source::Quadrature, compression, correct
             boundary,
             S,
             D,
-            Vmat;
+            Vcorr;
             green_multiplier,
             correction.maxdist,
             interpolation_order,
+            kernel_variant,
+            grad_single_layer,
         )
     else
         error("Unknown correction method. Available options: $CORRECTION_METHODS")
     end
     # add correction
-    if compression.method ∈ (:hmatrix, :none)
+    if kernel_variant === :gradient_source
+        # W maps a vector (`SVector`) density to a scalar. `Vmat` assembles
+        # `∫∇yG⋅g`; the operator W[g] = -∫∇yG⋅g carries a leading minus, hence the
+        # `-Vmat` forward map. Wrap it together with the sparse correction in a
+        # `VectorDensityOperator` so that `W * g` allocates a clean scalar output
+        # (see the type's docstring).
+        V = VectorDensityOperator{default_kernel_eltype(op)}(-Vmat, δV, size(δV))
+    elseif kernel_variant === :hessian_source
+        # X maps a vector (`SVector`) density to a vector (`SVector`) output. The
+        # PV part of `X[g] = ∇W[g]` equals `+∫∇ₓ∇ₓG⋅g`, so the forward map is `Vmat`
+        # (no leading minus). Wrap it with the sparse correction in a
+        # `VectorDensityOperator` so that `X * g` allocates a clean `Vector{SVector}`
+        # output (a plain `LinearMap` would infer an abstract `SArray` eltype).
+        N = ambient_dimension(op)
+        V = VectorDensityOperator{SVector{N, default_kernel_eltype(op)}}(Vmat, δV, size(δV))
+    elseif compression.method ∈ (:hmatrix, :none)
         # TODO: in the hmatrix case, we may want to add the correction directly
         # to the HMatrix so that a direct solver can be later used
         V = LinearMap(Vmat) + LinearMap(δV)
diff --git a/src/bdim.jl b/src/bdim.jl
index 213315ba..551bfaaf 100644
--- a/src/bdim.jl
+++ b/src/bdim.jl
@@ -9,6 +9,78 @@ Parameters associated with the density interpolation method used in
     sources_radius_multiplier::Float64 = 1.5
 end
 
+# ---- Internal helpers for bdim_correction ----
+#
+# The per-target linear solve is Mdata' * Wdata ≈ Θi_flat, where Mdata is a
+# plain float matrix whose shape is determined by `Tbase` (the base kernel
+# eltype). The RHS column count and pack/unpack logic depend on `Tout`:
+#   Tout <: Number                     → 1 column
+#   Tout <: SVector{P,<:Number}        → P columns
+#   Tout <: SMatrix{N,N}               → N columns       (one block solve)
+#   Tout <: SVector{K,<:SMatrix{N,N}}  → K*N columns     (K block solves batched)
+#
+# The `transpose` calls in the SMatrix variants reflect BlockArray's column-major
+# block layout: writing an SMatrix through parent() exposes its transpose.
+
+_bdim_num_rhs(::Type{T}) where {T <: Number} = 1
+_bdim_num_rhs(::Type{SV}) where {P, SV <: SVector{P, <:Number}} = P
+_bdim_num_rhs(::Type{SM}) where {N, SM <: SMatrix{N, N}} = N
+_bdim_num_rhs(::Type{SV}) where {K, SM <: SMatrix, SV <: SVector{K, SM}} = K * size(SM, 1)
+
+function _bdim_fill_Θi!(Θi_flat, Θ, j, ::Type{T}) where {T <: Number}
+    @inbounds for m in axes(Θi_flat, 1)
+        Θi_flat[m, 1] = Θ[j, m]
+    end
+end
+function _bdim_fill_Θi!(Θi_flat, Θ, j, ::Type{SV}) where {P, SV <: SVector{P, <:Number}}
+    @inbounds for m in axes(Θi_flat, 1)
+        Θi_flat[m, :] .= Θ[j, m]
+    end
+end
+function _bdim_fill_Θi!(Θi_flat, Θ, j, ::Type{SM}) where {N, SM <: SMatrix{N, N}}
+    ns = size(Θi_flat, 1) ÷ N
+    @inbounds for m in 1:ns
+        Θi_flat[(m - 1) * N + 1:m * N, :] .= transpose(Θ[j, m])
+    end
+end
+function _bdim_fill_Θi!(Θi_flat, Θ, j, ::Type{SV}) where {K, SM <: SMatrix, SV <: SVector{K, SM}}
+    N = size(SM, 1)
+    ns = size(Θi_flat, 1) ÷ N
+    @inbounds for m in 1:ns, kk in 1:K
+        Θi_flat[(m - 1) * N + 1:m * N, (kk - 1) * N + 1:kk * N] .= transpose(Θ[j, m][kk])
+    end
+end
+
+function _bdim_push_weights!(Is, Js, Ss, Ds, Wdata, i, jglob, nq, ::Type{T}) where {T <: Number}
+    @inbounds for k in 1:nq
+        push!(Is, i); push!(Js, jglob[k])
+        push!(Ss, -Wdata[nq + k, 1])
+        push!(Ds,  Wdata[k, 1])
+    end
+end
+function _bdim_push_weights!(Is, Js, Ss, Ds, Wdata, i, jglob, nq, ::Type{SV}) where {P, SV <: SVector{P, <:Number}}
+    @inbounds for k in 1:nq
+        push!(Is, i); push!(Js, jglob[k])
+        push!(Ss, -SV(Wdata[nq + k, :]))
+        push!(Ds,  SV(Wdata[k, :]))
+    end
+end
+function _bdim_push_weights!(Is, Js, Ss, Ds, Wdata, i, jglob, nq, ::Type{SM}) where {N, SM <: SMatrix{N, N}}
+    @inbounds for k in 1:nq
+        push!(Is, i); push!(Js, jglob[k])
+        push!(Ss, -transpose(SM(view(Wdata, N * (nq + k - 1) + 1:N * (nq + k), :))))
+        push!(Ds,  transpose(SM(view(Wdata, N * (k - 1) + 1:k * N, :))))
+    end
+end
+function _bdim_push_weights!(Is, Js, Ss, Ds, Wdata, i, jglob, nq, ::Type{SV}) where {K, SM <: SMatrix, SV <: SVector{K, SM}}
+    N = size(SM, 1)
+    @inbounds for k in 1:nq
+        push!(Is, i); push!(Js, jglob[k])
+        push!(Ss, SV(ntuple(kk -> -transpose(SM(view(Wdata, N * (nq + k - 1) + 1:N * (nq + k), (kk - 1) * N + 1:kk * N))), Val(K))))
+        push!(Ds, SV(ntuple(kk ->  transpose(SM(view(Wdata, N * (k - 1) + 1:k * N,             (kk - 1) * N + 1:kk * N))), Val(K))))
+    end
+end
+
 """
     bdim_correction(op,X,Y,S,D; green_multiplier, kwargs...)
 
@@ -37,10 +109,9 @@ See [faria2021general](@cite) for more details on the method.
 
 - `parameters::DimParameters`: parameters associated with the density
   interpolation method
-- `derivative`: if true, compute the correction to the adjoint double-layer and
-  hypersingular operators instead. In this case, `S` and `D` should be replaced
-  by a (possibly innacurate) discretization of adjoint double-layer and
-  hypersingular operators, respectively.
+- `kernel_variant`: `:default` for the standard single/double-layer, `:neumann` for
+  the adjoint double-layer and hypersingular operators (Neumann trace), or `:gradient`
+  for the gradient kernels. `S` and `D` must be consistent with the chosen variant.
 - `maxdist`: distance beyond which interactions are considered sufficiently far
   so that no correction is needed. This is used to determine a threshold for
   nearly-singular corrections when `X` and `Y` are different surfaces. When `X
@@ -55,17 +126,20 @@ function bdim_correction(
         Dop;
         green_multiplier::Vector{<:Real},
         parameters = DimParameters(),
-        derivative::Bool = false,
+        kernel_variant::Symbol = :default,
         maxdist = Inf,
         filter_target_params = nothing,
     )
     imat_cond = imat_norm = res_norm = rhs_norm = theta_norm = -Inf
-    T = eltype(Sop)
+    Tout = eltype(Sop)
+    Tbase = default_kernel_eltype(op)
     # determine type for dense matrices
-    Dense = T <: SMatrix ? BlockArray : Array
+    DenseBase = Tbase <: SMatrix ? BlockArray : Array
     N = ambient_dimension(source)
-    @assert eltype(Dop) == T "eltype of S and D must match"
+    @assert eltype(Dop) == Tout "eltype of S and D must match"
     m, n = length(target), length(source)
+    # check if we are in debug mode to avoid expensive computations
+    do_debug = debug_mode()
     if isnothing(filter_target_params)
         dict_near = etype_to_nearest_points(target, source; maxdist)
         num_trgs = m
@@ -91,10 +165,22 @@ function bdim_correction(
         error("only 2D and 3D supported")
     end
     # compute traces of monopoles on the source mesh
-    G = SingleLayerKernel(op, T)
-    γ₁G = AdjointDoubleLayerKernel(op, T)
-    γ₀B = Dense{T}(undef, length(source), ns)
-    γ₁B = Dense{T}(undef, length(source), ns)
+    G = SingleLayerKernel(op, Tbase)
+    γ₁G = AdjointDoubleLayerKernel(op, Tbase)
+
+    if kernel_variant === :gradient
+        G_target = GradientSingleLayerKernel(op, Tout)
+        γ₁G_target = GradientDoubleLayerKernel(op, Tout)
+    elseif kernel_variant === :neumann
+        G_target = AdjointDoubleLayerKernel(op, Tout)
+        γ₁G_target = HyperSingularKernel(op, Tout)
+    else
+        G_target = SingleLayerKernel(op, Tout)
+        γ₁G_target = DoubleLayerKernel(op, Tout)
+    end
+
+    γ₀B = DenseBase{Tbase}(undef, length(source), ns)
+    γ₁B = DenseBase{Tbase}(undef, length(source), ns)
     for k in 1:ns
         for j in 1:length(source)
             γ₀B[j, k] = G(source[j], xs[k])
@@ -103,24 +189,25 @@ function bdim_correction(
     end
     # integrate the monopoles/dipoles over Y with target on X. This is the
     # slowest step, and passing a custom S,D can accelerate this computation.
-    Θ = Dense{T}(undef, m, ns)
-    fill!(Θ, zero(T))
-    # Compute Θ <-- S * γ₁B - D * γ₀B + μ * B(x) usig in-place matvec
+    DenseOut = Tout <: SMatrix ? BlockArray : Array
+    Θ = DenseOut{Tout}(undef, m, ns)
+    fill!(Θ, zero(Tout))
+    # Compute Θ <-- S * γ₁B - D * γ₀B + μ * B(x) using in-place matvec
     for k in 1:ns
         for i in 1:length(target)
             μ = green_multiplier[i]
-            v = derivative ? γ₁G(target[i], xs[k]) : G(target[i], xs[k])
+            v = G_target(target[i], xs[k])
             Θ[i, k] = μ * v
         end
     end
-    if Dense <: Array || (Sop isa BlockArray && Dop isa BlockArray)
+    if DenseOut <: Array || (Sop isa BlockArray && Dop isa BlockArray)
         mul!(Θ, Sop, γ₁B, 1, 1)
         mul!(Θ, Dop, γ₀B, -1, 1)
     else
         # for vector value problems, we only assume that Sop and Dop can be multiplied by
         # Vectors of SVectors, and so we need to perform multiplication column by column
-        P, Q = size(T)
-        S = eltype(T)
+        P, Q = size(Tbase)
+        S = eltype(Tbase)
         Θ_data = parent(Θ)
         γ₀B_data = parent(γ₀B)
         γ₁B_data = parent(γ₁B)
@@ -134,23 +221,22 @@ function bdim_correction(
     end
 
     # finally compute the corrected weights as sparse matrices
-    Is, Js, Ss, Ds = Int[], Int[], T[], T[]
+    Is, Js, Ss, Ds = Int[], Int[], Tout[], Tout[]
     for (E, qtags) in source.etype2qtags
         near_list = dict_near[E]
         nq, ne = size(qtags)
         @assert length(near_list) == ne
-        # preallocate a local matrix to store interpolant values resulting
-        # weights. To benefit from Lapack, we must convert everything to
-        # matrices of scalars, so when `T` is an `SMatrix` we are careful to
-        # convert between the `Matrix{<:SMatrix}` and `Matrix{<:Number}` formats
-        # by viewing the elements of type `T` as `σ × σ` matrices of
-        # `eltype(T)`.
-        M = Dense{T}(undef, 2 * nq, ns)
-        W = Dense{T}(undef, 2 * nq, 1)
-        Θi = Dense{T}(undef, 1, ns)
-        Mdata, Wdata, Θidata = parent(M)::Matrix, parent(W)::Matrix, parent(Θi)::Matrix
-        # for each element, we will solve Mᵀ W = Θiᵀ, where W is a vector of
-        # size 2nq, and Θi is a row vector of length(ns)
+        # M's block structure is fully determined by Tbase; its plain float
+        # parent Mdata is what LAPACK actually factors. The RHS column count
+        # `nc` is the only thing Tout contributes to the solve dimensions.
+        M = DenseBase{Tbase}(undef, 2 * nq, ns)
+        Mdata = parent(M)::Matrix
+        S_scalar = eltype(Mdata)
+        nc = _bdim_num_rhs(Tout)
+        Θi_flat = Matrix{S_scalar}(undef, size(Mdata, 2), nc)
+        Wdata   = Matrix{S_scalar}(undef, size(Mdata, 1), nc)
+        # for each element, we will solve Mᵀ W = Θiᵀ, where W is a matrix of
+        # size (flat_m × nc), and Θiᵀ has size (flat_ns × nc)
         for n in 1:ne
             # if there is nothing near, skip immediately to next element
             isempty(near_list[n]) && continue
@@ -158,28 +244,18 @@ function bdim_correction(
             jglob = @view qtags[:, n]
             M[1:nq, :] .= γ₀B[jglob, :]
             M[(nq + 1):2nq, :] .= γ₁B[jglob, :]
-            # TODO: get ride of all this transposing mumble jumble by assembling
+            # TODO: get rid of all this transposing mumble jumble by assembling
             # the matrix in the correct orientation in the first place
             F = qr!(transpose(Mdata))
-            @debug (imat_cond = max(cond(Mdata), imat_cond)) maxlog = 0
-            @debug (imat_norm = max(norm(Mdata), imat_norm)) maxlog = 0
+            if do_debug
+                imat_cond = max(cond(Mdata), imat_cond)
+                imat_norm = max(norm(Mdata), imat_norm)
+            end
             for i in near_list[n]
                 j = glob_loc_near_trgs[i]
-                Θi .= Θ[j:j, :]
-                @debug (rhs_norm = max(rhs_norm, norm(Θidata))) maxlog = 0
-                ldiv!(Wdata, F, transpose(Θidata))
-                @debug (
-                    res_norm = max(norm(Matrix(F) * Wdata - transpose(Θidata)), res_norm)
-                ) maxlog = 0
-                @debug (theta_norm = max(theta_norm, norm(Wdata))) maxlog = 0
-                for k in 1:nq
-                    push!(Is, i)
-                    push!(Js, jglob[k])
-                    # Since we actually computed the tranpose of the weights, we
-                    # need to transpose it again. This matters for e.g. elasticity
-                    push!(Ss, -transpose(W[nq + k])) # single layer corresponds to α=0,β=-1
-                    push!(Ds, transpose(W[k]))     # double layer corresponds to α=1,β=0
-                end
+                _bdim_fill_Θi!(Θi_flat, Θ, j, Tout)
+                ldiv!(Wdata, F, Θi_flat)
+                _bdim_push_weights!(Is, Js, Ss, Ds, Wdata, i, jglob, nq, Tout)
             end
         end
     end
diff --git a/src/entities.jl b/src/entities.jl
index d46ba7bc..a4001fca 100644
--- a/src/entities.jl
+++ b/src/entities.jl
@@ -20,16 +20,12 @@ Base.hash(ent::EntityKey, h::UInt) = hash((ent.dim, abs(ent.tag)), h)
 Base.:(==)(e1::EntityKey, e2::EntityKey) = e1.dim == e2.dim && abs(e1.tag) == abs(e2.tag)
 
 # defer some functions on EntityKey to the corresponding GeometricEntity
-for f in (
-        :labels,
-        :boundary,
-        :pushforward,
-        :ambient_dimension,
-        :hasparametrization,
-        :parametrization,
-    )
-    @eval $f(k::EntityKey) = $f(global_get_entity(k))
-end
+labels(k::EntityKey) = labels(global_get_entity(k))
+boundary(k::EntityKey) = boundary(global_get_entity(k))
+pushforward(k::EntityKey) = pushforward(global_get_entity(k))
+ambient_dimension(k::EntityKey) = ambient_dimension(global_get_entity(k))
+hasparametrization(k::EntityKey) = hasparametrization(global_get_entity(k))
+parametrization(k::EntityKey) = parametrization(global_get_entity(k))
 
 function (k::EntityKey)(x)
     hasparametrization(k) || error("$k has no parametrization")
diff --git a/src/kernels.jl b/src/kernels.jl
index c0796ac4..1e6f088a 100644
--- a/src/kernels.jl
+++ b/src/kernels.jl
@@ -33,6 +33,17 @@ abstract type AbstractDifferentialOperator{N} end
 
 ambient_dimension(::AbstractDifferentialOperator{N}) where {N} = N
 
+function range_dimension(op::AbstractDifferentialOperator)
+    T = default_density_eltype(op)
+    if T <: Number
+        return 1
+    elseif hasmethod(length, Tuple{Type{T}})
+        return length(T)
+    else
+        error("default_density_eltype($(typeof(op))) = $T does not define length(::Type{$T}); cannot determine range dimension")
+    end
+end
+
 # convenient constructor for e.g. SingleLayerKernel(op,Float64) or DoubleLayerKernel(op,ComplexF64)
 function (::Type{K})(
         op::Op,
@@ -115,6 +126,127 @@ end
 ################################# LAPLACE ######################################
 ################################################################################
 
+"""
+    struct GradientSingleLayerKernel{T,Op} <: AbstractKernel{T}
+
+Given an operator `Op`, construct its free-space gradient single-layer kernel.
+This evaluates the gradient of the fundamental solution with respect to the target variable.
+"""
+struct GradientSingleLayerKernel{T, Op} <: AbstractKernel{T}
+    op::Op
+end
+
+function GradientSingleLayerKernel(op::AbstractDifferentialOperator{N}, ::Type{T} = SVector{N, default_kernel_eltype(op)}) where {N, T}
+    return GradientSingleLayerKernel{T, typeof(op)}(op)
+end
+
+function singularity_order(K::GradientSingleLayerKernel)
+    N = ambient_dimension(K.op)
+    return 1 - N
+end
+
+"""
+    struct GradientDoubleLayerKernel{T,Op} <: AbstractKernel{T}
+
+Given an operator `Op`, construct its free-space gradient double-layer kernel.
+This evaluates the gradient of the double-layer kernel with respect to the target variable.
+"""
+struct GradientDoubleLayerKernel{T, Op} <: AbstractKernel{T}
+    op::Op
+end
+
+function GradientDoubleLayerKernel(op::AbstractDifferentialOperator{N}, ::Type{T} = SVector{N, default_kernel_eltype(op)}) where {N, T}
+    return GradientDoubleLayerKernel{T, typeof(op)}(op)
+end
+
+function singularity_order(K::GradientDoubleLayerKernel)
+    N = ambient_dimension(K.op)
+    return -N
+end
+
+"""
+    struct SourceGradientSingleLayerKernel{T,Op} <: AbstractKernel{T}
+
+Gradient of the single-layer kernel `G(x,y)` with respect to the source variable `y`,
+i.e. ``\\nabla_y G(x,y)``. The value is returned as the (row) covector
+`transpose(∇_yG)`, so that `K(x,y) * g(y)` contracts an `SVector` density to a
+scalar. It is the source-variable counterpart of [`GradientSingleLayerKernel`](@ref)
+(which is the target gradient ``\\nabla_x G``), and since ``\\nabla_y G = -\\nabla_x
+G`` it is built by negating the latter's evaluation.
+
+The associated volume integral operator is ``\\mathcal{W}[g](x) = -\\int_\\Omega
+\\nabla_y G(x,y) \\cdot g(y)\\,dy``; note the leading minus sign, so the
+discrete operator that evaluates ``\\mathcal{W}`` is the negative of
+`IntegralOperator(SourceGradientSingleLayerKernel(op), ...)`.
+"""
+struct SourceGradientSingleLayerKernel{T, Op} <: AbstractKernel{T}
+    op::Op
+end
+
+function SourceGradientSingleLayerKernel(
+        op::AbstractDifferentialOperator{N},
+        ::Type{S} = default_kernel_eltype(op),
+    ) where {N, S}
+    T = Transpose{S, SVector{N, S}}
+    return SourceGradientSingleLayerKernel{T, typeof(op)}(op)
+end
+
+function singularity_order(K::SourceGradientSingleLayerKernel)
+    N = ambient_dimension(K.op)
+    return 1 - N
+end
+
+function (K::SourceGradientSingleLayerKernel)(
+        target,
+        source,
+        r = coords(target) - coords(source),
+    )
+    # ∇_y G(x,y) = -∇_x G(x,y); returned as a row covector to contract a vector density
+    gx = GradientSingleLayerKernel(K.op)(target, source, r)
+    return transpose(-gx)
+end
+
+# `zero` for the covector element type, needed when assembling the sparse VDIM
+# correction whose entries map an `SVector` density to a scalar.
+Base.zero(::Type{Transpose{T, SVector{N, T}}}) where {N, T} = transpose(zero(SVector{N, T}))
+
+"""
+    struct HessianKernel{T,Op} <: AbstractKernel{T}
+
+The Hessian of the single-layer kernel `G(x,y)` with respect to the *target*
+variable `x`, i.e. ``\\nabla_x\\nabla_x G(x,y)``, returned as an `N×N`
+`SMatrix`.
+
+This is the kernel of the strongly-singular volume integral operator
+``\\mathcal{X}[g](x) = \\nabla\\mathcal{W}[g](x) = \\mathsf{S}\\,g(x) -
+\\mathrm{p.v.}\\!\\int_\\Omega \\nabla_x\\nabla_y G(x,y)\\cdot g(y)\\,dy`` (eq.
+(2.24) of [anderson2026global](@cite)) acting on a vector density `g`. Since
+``\\nabla_x\\nabla_y G = -\\nabla_x\\nabla_x G``, the principal-value integral
+equals ``+\\int_\\Omega \\nabla_x\\nabla_x G \\cdot g``, so this (target) Hessian
+is the kernel assembled for the forward map of ``\\mathcal{X}``; the free-term
+tensor ``\\mathsf{S}`` is handled by the density-interpolation regularization.
+"""
+struct HessianKernel{T, Op} <: AbstractKernel{T}
+    op::Op
+    charge_dipole::Symbol
+end
+
+function HessianKernel(
+        op::AbstractDifferentialOperator{N},
+        charge_dipole::Symbol = :charge,
+        ::Type{T} = SMatrix{N, N, default_kernel_eltype(op), N * N},
+    ) where {N, T}
+    if !(charge_dipole == :charge || charge_dipole == :dipole)
+        error("Invalid charge/dipole selection")
+    end
+    return HessianKernel{T, typeof(op)}(op, charge_dipole)
+end
+
+function singularity_order(K::HessianKernel)
+    N = ambient_dimension(K.op)
+    return -N
+end
+
 struct Laplace{N} <: AbstractDifferentialOperator{N} end
 
 """
@@ -138,7 +270,7 @@ function (SL::SingleLayerKernel{T, Laplace{N}})(
         target,
         source,
         r = coords(target) - coords(source),
-    )::T where {N, T}
+    ) where {N, T}
     d = norm(r)
     (d ≤ SAME_POINT_TOLERANCE) && return zero(T)
     if N == 2
@@ -154,7 +286,7 @@ function (DL::DoubleLayerKernel{T, Laplace{N}})(
         target,
         source,
         r = coords(target) - coords(source),
-    )::T where {N, T}
+    ) where {N, T}
     ny = normal(source)
     d = norm(r)
     d ≤ SAME_POINT_TOLERANCE && return zero(T)
@@ -171,7 +303,7 @@ function (ADL::AdjointDoubleLayerKernel{T, Laplace{N}})(
         target,
         source,
         r = coords(target) - coords(source),
-    )::T where {N, T}
+    ) where {N, T}
     nx = normal(target)
     d = norm(r)
     d ≤ SAME_POINT_TOLERANCE && return zero(T)
@@ -198,6 +330,53 @@ function (HS::HyperSingularKernel{T, Laplace{N}})(
     end
 end
 
+function (GSL::GradientSingleLayerKernel{T, Laplace{N}})(
+        target,
+        source,
+        r = coords(target) - coords(source),
+    ) where {N, T}
+    d = norm(r)
+    d ≤ SAME_POINT_TOLERANCE && return zero(T)
+    if N == 2
+        return -1 / (2π) / (d^2) * r
+    elseif N == 3
+        return -1 / (4π) / (d^3) * r
+    end
+end
+
+function (GDL::GradientDoubleLayerKernel{T, Laplace{N}})(
+        target,
+        source,
+        r = coords(target) - coords(source),
+    ) where {N, T}
+    ny = normal(source)
+    d = norm(r)
+    d ≤ SAME_POINT_TOLERANCE && return zero(T)
+    if N == 2
+        return 1 / (2π) / (d^2) * (ny - 2 * dot(r, ny) / d^2 * r)
+    elseif N == 3
+        return 1 / (4π) / (d^3) * (ny - 3 * dot(r, ny) / d^2 * r)
+    end
+end
+
+function (HSL::HessianKernel{T, Laplace{N}})(
+        target,
+        source,
+        r = coords(target) - coords(source),
+    ) where {N, T}
+    d = norm(r)
+    d ≤ SAME_POINT_TOLERANCE && return zero(T)
+    # ∇ₓ∇ₓG: for Laplace, ∂ᵢ∂ⱼG = c/dᴺ (k·r̂ᵢr̂ⱼ - δᵢⱼ) with (c,k) = (1/2π,2) in 2D
+    # and (1/4π,3) in 3D.
+    if N == 2
+        return 1 / (2π) / d^2 * (2 * r * transpose(r) / d^2 - I)
+    elseif N == 3
+        return 1 / (4π) / d^3 * (3 * r * transpose(r) / d^2 - I)
+    else
+        notimplemented()
+    end
+end
+
 ################################################################################
 ################################# Yukawa #######################################
 ################################################################################
@@ -415,6 +594,64 @@ function (HS::HyperSingularKernel{T, <:Helmholtz{N}})(target, source)::T where {
     end
 end
 
+function (GSL::GradientSingleLayerKernel{T, <:Helmholtz{N}})(
+        target,
+        source,
+        r = coords(target) - coords(source),
+    ) where {N, T}
+    k = GSL.op.k
+    d = norm(r)
+    d ≤ SAME_POINT_TOLERANCE && return zero(T)
+    if N == 2
+        return -im * k / 4 / d * hankelh1(1, k * d) * r
+    elseif N == 3
+        return 1 / (4π) / d^2 * exp(im * k * d) * (im * k - 1 / d) * r
+    end
+end
+
+function (GDL::GradientDoubleLayerKernel{T, <:Helmholtz{N}})(
+        target,
+        source,
+        r = coords(target) - coords(source),
+    ) where {N, T}
+    ny = normal(source)
+    k = GDL.op.k
+    d = norm(r)
+    d ≤ SAME_POINT_TOLERANCE && return zero(T)
+    rdotny = dot(r, ny)
+    if N == 2
+        return im * k / (4 * d) * hankelh1(1, k * d) * ny -
+            im * k^2 / (4 * d^2) * hankelh1(2, k * d) * r * rdotny
+    elseif N == 3
+        pref = 1 / (4π) / d^3 * exp(im * k * d)
+        return pref * ((1 - im * k * d) * ny + (k^2 * d^2 + 3 * im * k * d - 3) / d^2 * r * rdotny)
+    end
+end
+
+function (HSL::HessianKernel{T, <:Helmholtz{N}})(
+        target,
+        source,
+        r = coords(target) - coords(source),
+    ) where {N, T}
+    k = HSL.op.k
+    d = norm(r)
+    d ≤ SAME_POINT_TOLERANCE && return zero(T)
+    # ∇ₓ∇ₓG = (g″−g′/d) r̂r̂ᵀ + (g′/d) I for the isotropic G = g(d).
+    if N == 2
+        # 2D: G = (i/4)H₀⁽¹⁾(kd);  uses H₂ via the recurrence (matches HyperSingularKernel).
+        return im * k^2 / 4 / d^2 * hankelh1(2, k * d) * (r * transpose(r)) -
+            im * k / 4 / d * hankelh1(1, k * d) * I
+    elseif N == 3
+        # 3D: G = eⁱᵏᵈ/(4πd).
+        pref = exp(im * k * d) / (4π)
+        cI = pref * (im * k / d^2 - 1 / d^3)
+        cR = pref * (3 / d^5 - 3 * im * k / d^4 - k^2 / d^3)
+        return cR * (r * transpose(r)) + cI * I
+    else
+        notimplemented()
+    end
+end
+
 ############################ STOKES ############################3
 struct Stokes{N, T} <: AbstractDifferentialOperator{N}
     μ::T
@@ -620,6 +857,184 @@ function (HS::HyperSingularKernel{T, <:Elastostatic{N}})(target, source) where {
     end
 end
 
+################################################################################
+############################ ELASTOSTATIC GRADIENT ############################
+################################################################################
+
+# Algebra needed for SVector{N, <:SMatrix} output type.
+# These products arise when gradient kernels for vector-valued operators (Elastostatic, Stokes)
+# act on matrix-valued densities (DIM method) or vector-valued densities (final application).
+function Base.:*(t::SVector{N, M}, m::SMatrix{P, Q}) where {N, P, Q, M <: SMatrix{P, Q}}
+    return typeof(t)(ntuple(k -> t[k] * m, N))
+end
+function Base.:*(t::SVector{N, M}, v::SVector{P}) where {N, P, M <: SMatrix{P, P}}
+    return SMatrix{P, N}(hcat(ntuple(k -> t[k] * v, N)...))
+end
+
+function GradientSingleLayerKernel(op::Elastostatic{N}) where {N}
+    T = SVector{N, SMatrix{N, N, Float64, N * N}}
+    return GradientSingleLayerKernel{T, typeof(op)}(op)
+end
+
+function GradientDoubleLayerKernel(op::Elastostatic{N}) where {N}
+    T = SVector{N, SMatrix{N, N, Float64, N * N}}
+    return GradientDoubleLayerKernel{T, typeof(op)}(op)
+end
+
+function (K::GradientSingleLayerKernel{T, <:Elastostatic{N}})(
+        target,
+        source,
+        r = coords(target) - coords(source),
+    ) where {N, T}
+    μ, λ = K.op.μ, K.op.λ
+    ν = λ / (2 * (μ + λ))
+    d = norm(r)
+    d == 0 && return zero(T)
+    RRT = r * r'
+    SM = SMatrix{N, N, Float64, N * N}
+    if N == 2
+        C = 1 / (8π * μ * (1 - ν))
+        return SVector{N}(
+            ntuple(N) do k
+                ek = SVector{N}(ntuple(i -> i == k ? 1.0 : 0.0, N))
+                SM(C * (-(3 - 4ν) * r[k] / d^2 * I + (ek * r' + r * ek') / d^2 - 2 * r[k] * RRT / d^4))
+            end
+        )
+    elseif N == 3
+        C = 1 / (16π * μ * (1 - ν))
+        return SVector{N}(
+            ntuple(N) do k
+                ek = SVector{N}(ntuple(i -> i == k ? 1.0 : 0.0, N))
+                SM(C * (-(3 - 4ν) * r[k] / d^3 * I + (ek * r' + r * ek') / d^3 - 3 * r[k] * RRT / d^5))
+            end
+        )
+    end
+end
+
+function (K::GradientDoubleLayerKernel{T, <:Elastostatic{N}})(
+        target,
+        source,
+        r = coords(target) - coords(source),
+    ) where {N, T}
+    μ, λ = K.op.μ, K.op.λ
+    ν = λ / (2 * (μ + λ))
+    ny = normal(source)
+    d = norm(r)
+    d == 0 && return zero(T)
+    RRT = r * r'
+    qr = dot(ny, r)
+    B = r * ny' - ny * r'
+    SM = SMatrix{N, N, Float64, N * N}
+    if N == 2
+        C = 1 / (4π * (1 - ν))
+        A = (1 - 2ν) * I + 2 * RRT / d^2
+        return SVector{N}(
+            ntuple(N) do k
+                ek = SVector{N}(ntuple(i -> i == k ? 1.0 : 0.0, N))
+                SM(
+                    C * (
+                        (ny[k] / d^2 - 2 * qr * r[k] / d^4) * A
+                            + 2 * qr / d^4 * (ek * r' + r * ek')
+                            - 4 * qr * r[k] * RRT / d^6
+                            - (1 - 2ν) / d^2 * (ek * ny' - ny * ek')
+                            + 2 * r[k] * (1 - 2ν) / d^4 * B
+                    )
+                )
+            end
+        )
+    elseif N == 3
+        C = 1 / (8π * (1 - ν))
+        A = (1 - 2ν) * I + 3 * RRT / d^2
+        return SVector{N}(
+            ntuple(N) do k
+                ek = SVector{N}(ntuple(i -> i == k ? 1.0 : 0.0, N))
+                SM(
+                    C * (
+                        (ny[k] / d^3 - 3 * qr * r[k] / d^5) * A
+                            + 3 * qr / d^5 * (ek * r' + r * ek')
+                            - 6 * qr * r[k] * RRT / d^7
+                            - (1 - 2ν) / d^3 * (ek * ny' - ny * ek')
+                            + 3 * r[k] * (1 - 2ν) / d^5 * B
+                    )
+                )
+            end
+        )
+    end
+end
+
+################################################################################
+################################### STOKES GRADIENT ############################
+################################################################################
+
+function GradientSingleLayerKernel(op::Stokes{N}) where {N}
+    T = SVector{N, SMatrix{N, N, Float64, N * N}}
+    return GradientSingleLayerKernel{T, typeof(op)}(op)
+end
+
+function GradientDoubleLayerKernel(op::Stokes{N}) where {N}
+    T = SVector{N, SMatrix{N, N, Float64, N * N}}
+    return GradientDoubleLayerKernel{T, typeof(op)}(op)
+end
+
+function (K::GradientSingleLayerKernel{T, <:Stokes{N}})(
+        target,
+        source,
+        r = coords(target) - coords(source),
+    ) where {N, T}
+    μ = K.op.μ
+    d = norm(r)
+    d == 0 && return zero(T)
+    RRT = r * r'
+    SM = SMatrix{N, N, Float64, N * N}
+    if N == 2
+        C = 1 / (4π * μ)
+        return SVector{N}(
+            ntuple(N) do k
+                ek = SVector{N}(ntuple(i -> i == k ? 1.0 : 0.0, N))
+                SM(C * (-r[k] / d^2 * I + (ek * r' + r * ek') / d^2 - 2 * r[k] * RRT / d^4))
+            end
+        )
+    elseif N == 3
+        C = 1 / (8π * μ)
+        return SVector{N}(
+            ntuple(N) do k
+                ek = SVector{N}(ntuple(i -> i == k ? 1.0 : 0.0, N))
+                SM(C * (-r[k] / d^3 * I + (ek * r' + r * ek') / d^3 - 3 * r[k] * RRT / d^5))
+            end
+        )
+    end
+end
+
+function (K::GradientDoubleLayerKernel{T, <:Stokes{N}})(
+        target,
+        source,
+        r = coords(target) - coords(source),
+    ) where {N, T}
+    ny = normal(source)
+    d = norm(r)
+    d == 0 && return zero(T)
+    RRT = r * r'
+    qr = dot(ny, r)
+    SM = SMatrix{N, N, Float64, N * N}
+    if N == 2
+        C = 1 / π
+        return SVector{N}(
+            ntuple(N) do k
+                ek = SVector{N}(ntuple(i -> i == k ? 1.0 : 0.0, N))
+                SM(C * (ny[k] * RRT / d^4 + qr * (ek * r' + r * ek') / d^4 - 4 * qr * r[k] * RRT / d^6))
+            end
+        )
+    elseif N == 3
+        C = 3 / (4π)
+        return SVector{N}(
+            ntuple(N) do k
+                ek = SVector{N}(ntuple(i -> i == k ? 1.0 : 0.0, N))
+                SM(C * (ny[k] * RRT / d^5 + qr * (ek * r' + r * ek') / d^5 - 5 * qr * r[k] * RRT / d^7))
+            end
+        )
+    end
+end
+
 ################################################################################
 ################################# LAPLACE PERIODIC #############################
 ################################################################################
diff --git a/src/nystrom.jl b/src/nystrom.jl
index 86af35fc..4a5cd304 100644
--- a/src/nystrom.jl
+++ b/src/nystrom.jl
@@ -95,6 +95,53 @@ function Base.getindex(iop::IntegralOperator, i::Integer, j::Integer)
     return k(iop.target[i], iop.source[j]) * weight(iop.source[j])
 end
 
+"""
+    struct VectorDensityOperator{T,F,C}
+
+Operator returned by [`volume_potential`](@ref) for the vector-density variants
+`kernel_variant = :gradient_source` (`W`) and `:hessian_source` (`X`). It maps an
+`SVector` density `g` to a `Vector{T}` output through `forward * g + correction *
+g`, where `forward` is the (dense or FMM-accel) naive map and `correction` is the
+sparse VDIM correction.
+
+A bespoke wrapper is used instead of a `LinearMap` so that `W * g` / `X * g`
+allocates a clean output `Vector{T}` (`T = Float64`/`ComplexF64` for the
+scalar-output `W`, `T = SVector{N,…}` for the vector-output `X`). `LinearMap`'s
+`*` infers the output element type by promoting the operand eltypes, which for a
+vector-density→scalar contraction collapses to `Vector{Any}`, and for the
+`SMatrix`-entry→`SVector` contraction of `X` to an abstract `SArray` eltype.
+"""
+struct VectorDensityOperator{T, F, C}
+    forward::F
+    correction::C
+    sz::Tuple{Int, Int}
+end
+
+VectorDensityOperator{T}(forward, correction, sz) where {T} =
+    VectorDensityOperator{T, typeof(forward), typeof(correction)}(forward, correction, sz)
+
+Base.size(W::VectorDensityOperator) = W.sz
+Base.size(W::VectorDensityOperator, i::Integer) = W.sz[i]
+Base.eltype(::VectorDensityOperator{T}) where {T} = T
+
+function LinearAlgebra.mul!(y::AbstractVector, W::VectorDensityOperator, g::AbstractVector)
+    mul!(y, W.forward, g)
+    mul!(y, W.correction, g, true, true)
+    return y
+end
+
+function LinearAlgebra.mul!(
+        y::AbstractVector, W::VectorDensityOperator, g::AbstractVector, α::Number, β::Number,
+    )
+    mul!(y, W.forward, g, α, β)
+    mul!(y, W.correction, g, α, true)
+    return y
+end
+
+function Base.:*(W::VectorDensityOperator{T}, g::AbstractVector) where {T}
+    return mul!(Vector{T}(undef, size(W, 1)), W, g)
+end
+
 """
     assemble_matrix(iop::IntegralOperator; threads = true)
 
@@ -135,12 +182,12 @@ loaded) while in 3D `FMM3D` is used.
     will return `Inf` values if `iop.target !== iop.source`, but there is a
     point `x ∈ iop.target` such that `x ∈ iop.source`.
 """
-function assemble_fmm(iop::IntegralOperator; rtol)
+function assemble_fmm(iop::IntegralOperator; rtol, ndiv = nothing)
     N = ambient_dimension(iop.source)
     if N == 2
         return _assemble_fmm2d(iop; rtol)
     elseif N == 3
-        return _assemble_fmm3d(iop; rtol)
+        return _assemble_fmm3d(iop; rtol, ndiv)
     else
         return error("Only 2D and 3D FMMs are supported")
     end
@@ -154,6 +201,35 @@ function _assemble_fmm3d(args...; kwargs...)
     return error("_assemble_fmm3d not found. Did you forget to import FMM3D ?")
 end
 
+"""
+    assemble_fmm_chargehessian(iop; rtol)
+
+Charge→Hessian FMM realization of the Hessian single-layer *volume* operator: maps a
+*scalar* density `ρ` to the `SMatrix` output `∫∇ₓ∇ₓG(x,y)ρ(y)dy`. Used to build the
+volume operator for the `X = ∇W` VDIM correction under FMM compression, where the
+forward (dipole→gradient, `SVector` output) map cannot produce the Hessian `SMatrix`
+from a scalar monomial density. Supported for the Laplace kernel in 2D (`FMM2D` or
+`FMMLIB2D`, whichever was most recently loaded) and 3D (`FMM3D`).
+"""
+function assemble_fmm_chargehessian(iop::IntegralOperator; rtol)
+    N = ambient_dimension(iop.source)
+    if N == 2
+        return _assemble_fmm2d_chargehessian(iop; rtol)
+    elseif N == 3
+        return _assemble_fmm3d_chargehessian(iop; rtol)
+    else
+        return error("assemble_fmm_chargehessian is only supported in 2D and 3D")
+    end
+end
+
+function _assemble_fmm2d_chargehessian(args...; kwargs...)
+    return error("_assemble_fmm2d_chargehessian not found. Did you forget to import FMM2D or FMMLIB2D ?")
+end
+
+function _assemble_fmm3d_chargehessian(args...; kwargs...)
+    return error("_assemble_fmm3d_chargehessian not found. Did you forget to import FMM3D ?")
+end
+
 """
     assemble_hmatrix(iop[; atol, rank, rtol, eta])
 
diff --git a/src/utils.jl b/src/utils.jl
index 355f8d22..72cb35ca 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -193,7 +193,11 @@ function _normalize_compression(compression, target, source)
         "Unknown compression.method $(compression.method). Available options: $methods",
     )
     # set default tolerance if not provided
+    if haskey(compression, :fmmndiv)
+        compression.method == :fmm || error("FMM ndiv passed but compression method is not FMM!")
+    end
     compression = merge((tol = 1.0e-8,), compression)
+    compression = merge((ndiv = nothing,), compression)
     return compression
 end
 
@@ -243,6 +247,15 @@ function notimplemented()
     return error("not (yet) implemented")
 end
 
+"""
+    debug_mode()
+
+Return `true` if the current logger's minimum enabled level is `Debug` or lower.
+"""
+function debug_mode()
+    return Logging.min_enabled_level(Logging.current_logger()) <= Logging.Debug
+end
+
 """
     MultiIndex{N}
 
diff --git a/src/vdim.jl b/src/vdim.jl
index 88243f30..77f3ff86 100644
--- a/src/vdim.jl
+++ b/src/vdim.jl
@@ -22,97 +22,216 @@ See [anderson2024fast](@cite) for more details on the method.
 - `maxdist`: distance beyond which interactions are considered sufficiently far
   so that no correction is needed. This is used to determine a threshold for
   nearly-singular corrections.
-- `center`: the center of the basis functions. By default, the basis functions
-  are centered at the origin.
-- `shift`: a boolean indicating whether the basis functions should be shifted
-  and rescaled to each element.
+- `kernel_variant`: `:default` for the standard volume potential, `:gradient` for
+  the gradient of the volume potential, or `:gradient_source` for the operator
+  `W[g] = -∫∇yG⋅g` acting on a vector density `g` (regularized via eq. (3.25) of
+  [anderson2026global](@cite)). For `:default`/`:gradient`, `S`, `D`, and `V` must be
+  built with the same `kernel_variant`; for `:gradient_source`, `S`/`D` are the standard
+  single-/double-layer operators and `V` is the gradient single-layer volume operator.
 """
+# Helper: fill one row of bdata from Θ[i,m] — dispatches on element type
+_vdim_fill_bdata!(bdata, val, m, ::Type{<:Number}) = (bdata[m, 1] = val)
+_vdim_fill_bdata!(bdata, val, m, ::Type{<:SVector}) = (bdata[m, :] .= val)
+
+# Helper: push the weight at quadrature node k into Vs — dispatches on Tout
+_vdim_push_weight!(Vs, wdata, k, ::Type{T}) where {T<:Number} = push!(Vs, -wdata[k, 1])
+_vdim_push_weight!(Vs, wdata, k, ::Type{SV}) where {SV<:SVector} = push!(Vs, -SV(wdata[k, :]))
+
 function vdim_correction(
-        op,
+        op::AbstractDifferentialOperator{N},
         target,
-        source::Quadrature,
+        source::Quadrature{N},
         boundary::Quadrature,
         Sop,
         Dop,
         Vop;
         green_multiplier::Vector{<:Real},
         interpolation_order = nothing,
+        kernel_variant::Symbol = :default,
         maxdist = Inf,
-        center = nothing,
-        shift::Val{SHIFT} = Val(false),
-    ) where {SHIFT}
+        grad_single_layer = nothing,
+    ) where {N}
+    # The W operator (vector density -> scalar) is handled by a dedicated
+    # routine: its boundary potentials are single-/double-layers (so `Sop`/`Dop`
+    # here are scalar-valued), `Vop` is the gradient single-layer volume
+    # operator, and the resulting correction maps an `SVector` density to a
+    # scalar.
+    if kernel_variant === :gradient_source
+        return _vdim_correction_W(
+            op, target, source, boundary, Sop, Dop, Vop;
+            green_multiplier, interpolation_order, maxdist,
+        )
+    end
+    # The X = ∇W operator (vector density -> vector output) is handled by another
+    # dedicated routine: `Sop`/`Dop` are the scalar single-/double-layer,
+    # `grad_single_layer` is the gradient single-layer (∇ₓS), and `Vop` is the
+    # Hessian single-layer volume operator. The resulting correction has
+    # `SMatrix` entries mapping an `SVector` density to an `SVector` output.
+    if kernel_variant === :hessian_source
+        isnothing(grad_single_layer) &&
+            error("kernel_variant = :hessian_source requires the `grad_single_layer` operator")
+        return _vdim_correction_X(
+            op, target, source, boundary, Sop, Dop, grad_single_layer, Vop;
+            green_multiplier, interpolation_order, maxdist,
+        )
+    end
     # variables for debugging the condition properties of the method
     vander_cond = vander_norm = rhs_norm = res_norm = shift_norm = -Inf
-    T = eltype(Vop)
-    @assert eltype(Dop) == eltype(Sop) == T "eltype of Sop, Dop, and Vop must match"
+    Tout = eltype(Vop)
+    Tbase = default_kernel_eltype(op)
+    # determine type for dense matrices
+    DenseOut = Tout <: SMatrix ? BlockArray : Array
+    DenseBase = Tbase <: SMatrix ? BlockArray : Array
+    @assert eltype(Dop) == eltype(Sop) == Tout "eltype of Sop, Dop, and Vop must match"
     # figure out if we are dealing with a scalar or vector PDE
-    m, n = length(target), length(source)
-    N = ambient_dimension(op)
-    @assert ambient_dimension(source) == N "vdim only works for volume potentials"
-    m, n = length(target), length(source)
+    num_target, num_source = length(target), length(source)
     # a reasonable interpolation_order if not provided
     isnothing(interpolation_order) &&
         (interpolation_order = maximum(order, values(source.etype2qrule)))
+    # check if we are in debug mode to avoid expensive computations
+    do_debug = debug_mode()
     # by default basis centered at origin
-    center = isnothing(center) ? zero(SVector{N, Float64}) : center
-    p, P, γ₁P, multiindices = polynomial_solutions_vdim(op, interpolation_order, center)
+    basis = polynomial_solutions_vdim(op, interpolation_order, Tbase)
     dict_near = etype_to_nearest_points(target, source; maxdist)
-    R = _vdim_auxiliary_quantities(
-        p,
-        P,
-        γ₁P,
-        target,
-        source,
-        boundary,
-        green_multiplier,
-        Sop,
-        Dop,
-        Vop,
-    )
+    num_basis = length(basis)
+    b = DenseBase{Tbase}(undef, length(source), num_basis)
+    γ₀B = DenseBase{Tbase}(undef, length(boundary), num_basis)
+    γ₁B = DenseBase{Tbase}(undef, length(boundary), num_basis)
+    for k in 1:num_basis, j in 1:length(source)
+        b[j, k] = basis[k].source(source[j])
+    end
+    for k in 1:num_basis, j in 1:length(boundary)
+        γ₀B[j, k] = basis[k].solution(boundary[j])
+        γ₁B[j, k] = basis[k].neumann_trace(boundary[j])
+    end
+    Θ = DenseOut{Tout}(undef, num_target, num_basis)
+    fill!(Θ, zero(Tout))
+    # Compute Θ <-- S * γ₁B - D * γ₀B + V * b + σ * B(x) using in-place matvec
+    if DenseOut <: Array || (Sop isa BlockArray && Dop isa BlockArray && Vop isa BlockArray)
+        for n in 1:num_basis
+            @views mul!(Θ[:, n], Sop, γ₁B[:, n])
+            @views mul!(Θ[:, n], Dop, γ₀B[:, n], -1, 1)
+            @views mul!(Θ[:, n], Vop, b[:, n], 1, 1)
+        end
+    else
+        # For vector-valued problems with FMM (LinearMap operators), we need to
+        # perform multiplication column-by-column since FMM expects vector densities
+        # (SVector) not matrix densities (SMatrix). See bdim.jl for similar handling.
+        P, Q = size(Tout)
+        S = eltype(Tout)
+        Θ_data = parent(Θ)::Matrix
+        γ₀B_data = parent(γ₀B)::Matrix
+        γ₁B_data = parent(γ₁B)::Matrix
+        b_data = parent(b)::Matrix
+        for k in 1:size(Θ_data, 2)
+            y = reinterpret(SVector{P, S}, @view Θ_data[:, k])
+            x = reinterpret(SVector{Q, S}, @view γ₁B_data[:, k])
+            mul!(y, Sop, x)
+            x = reinterpret(SVector{Q, S}, @view γ₀B_data[:, k])
+            mul!(y, Dop, x, -1, 1)
+            x = reinterpret(SVector{Q, S}, @view b_data[:, k])
+            mul!(y, Vop, x, 1, 1)
+        end
+    end
+    # Add σ * B(x) term
+    for n in 1:num_basis
+        for i in 1:num_target
+            if kernel_variant === :gradient
+                Θ[i, n] += green_multiplier[i] * basis[n].gradient_solution(target[i])
+            else
+                Θ[i, n] += green_multiplier[i] * basis[n].solution(target[i])
+            end
+        end
+    end
     # compute sparse correction
     Is = Int[]
     Js = Int[]
     Vs = eltype(Vop)[]
     for (E, qtags) in source.etype2qtags
-        els = elements(source.mesh, E)
         near_list = dict_near[E]
         nq, ne = size(qtags)
         @assert length(near_list) == ne
+        L_arr = DenseBase{Tbase}(undef, num_basis, nq)
+        Ldata = parent(L_arr)::Matrix
+        # Preallocate solve buffers. Vector-valued PDEs (Tout <: SMatrix or
+        # Tout <: SVector{K,<:SMatrix}) need BlockArray so that LAPACK sees a
+        # plain float matrix while we can still index with SMatrix semantics.
+        # Scalar/SVector{P,<:Number} PDEs use a plain float matrix directly.
+        if Tout <: SMatrix
+            b_arr = BlockArray{Tout}(undef, num_basis, 1)
+            wei_arr = BlockArray{Tout}(undef, nq, 1)
+            bdata = parent(b_arr)::Matrix
+            weidata = parent(wei_arr)::Matrix
+        elseif Tout <: SVector && eltype(Tout) <: SMatrix
+            K = length(Tout)
+            SM = eltype(Tout)
+            b_kk = BlockArray{SM}(undef, num_basis, 1)
+            wei_kk = BlockArray{SM}(undef, nq, 1)
+            bdata_kk = parent(b_kk)::Matrix
+            weidata_kk = parent(wei_kk)::Matrix
+            Vs_mat = Matrix{SM}(undef, nq, K)
+        else
+            # Scalar (Tout <: Number) and gradient-scalar (Tout <: SVector{P,<:Number})
+            # cases share the same structure: P columns in the solve, where P=1 for scalar.
+            S = eltype(Tout)
+            P = Tout <: Number ? 1 : length(Tout)
+            bdata = Matrix{S}(undef, num_basis, P)
+            weidata = Matrix{S}(undef, nq, P)
+        end
         for n in 1:ne
-            # indices of nodes in element `n`
             isempty(near_list[n]) && continue
             jglob = @view qtags[:, n]
-            # compute translation and scaling
-            c, r = translation_and_scaling(els[n])
-            if SHIFT
-                iszero(center) || error("SHIFT is not implemented for non-zero center")
-                L̃ = [f((q.coords - c) / r) for f in p, q in view(source, jglob)]
-                S = change_of_basis(multiindices, p, c, r)
-                F = svd(L̃)
-                @debug (vander_cond = max(vander_cond, cond(L̃))) maxlog = 0
-                @debug (shift_norm = max(shift_norm, norm(S))) maxlog = 0
-                @debug (vander_norm = max(vander_norm, norm(L̃))) maxlog = 0
-            else
-                L = [f(q.coords) for f in p, q in view(source, jglob)]
-                F = svd(L)
-                @debug (vander_cond = max(vander_cond, cond(L))) maxlog = 0
-                @debug (shift_norm = max(shift_norm, 1)) maxlog = 0
-                @debug (vander_norm = max(vander_norm, norm(L))) maxlog = 0
+            for k in 1:nq, m in 1:num_basis
+                L_arr[m, k] = basis[m].source(view(source, jglob)[k])
+            end
+            F = svd(Ldata)
+            if do_debug
+                vander_cond = max(vander_cond, cond(Ldata))
+                shift_norm = max(shift_norm, 1)
+                vander_norm = max(vander_norm, norm(Ldata))
             end
-            # correct each target near the current element
             for i in near_list[n]
-                b = @views R[i, :]
-                wei = SHIFT ? F \ (S * b) : F \ b # weights for the current element and target i
-                rhs_norm = max(rhs_norm, norm(b))
-                res_norm = if SHIFT
-                    max(res_norm, norm(L̃ * wei - S * b))
+                if Tout <: SMatrix
+                    b_arr .= @views transpose(Θ[i:i, :])
+                    ldiv!(weidata, F, bdata)
+                    if do_debug
+                        rhs_norm = max(rhs_norm, norm(bdata))
+                        res_norm = max(res_norm, norm(Ldata * weidata - bdata))
+                    end
+                    for k in 1:nq
+                        push!(Is, i)
+                        push!(Js, jglob[k])
+                        push!(Vs, -transpose(wei_arr[k]))
+                    end
+                elseif Tout <: SVector && eltype(Tout) <: SMatrix
+                    for kk in 1:K
+                        for m in 1:num_basis
+                            b_kk[m, 1] = transpose(Θ[i, m][kk])
+                        end
+                        ldiv!(weidata_kk, F, bdata_kk)
+                        for k in 1:nq
+                            Vs_mat[k, kk] = -transpose(wei_kk[k])
+                        end
+                    end
+                    for k in 1:nq
+                        push!(Is, i)
+                        push!(Js, jglob[k])
+                        push!(Vs, Tout(ntuple(kk -> Vs_mat[k, kk], K)))
+                    end
                 else
-                    max(res_norm, norm(L * wei - b))
-                end
-                for k in 1:nq
-                    push!(Is, i)
-                    push!(Js, jglob[k])
-                    push!(Vs, wei[k])
+                    for m in 1:num_basis
+                        _vdim_fill_bdata!(bdata, Θ[i, m], m, Tout)
+                    end
+                    ldiv!(weidata, F, bdata)
+                    if do_debug
+                        rhs_norm = max(rhs_norm, norm(bdata))
+                        res_norm = max(res_norm, norm(Ldata * weidata - bdata))
+                    end
+                    for k in 1:nq
+                        push!(Is, i)
+                        push!(Js, jglob[k])
+                        _vdim_push_weight!(Vs, weidata, k, Tout)
+                    end
                 end
             end
         end
@@ -124,277 +243,611 @@ function vdim_correction(
     |-- max interp. matrix norm :     $vander_norm
     |-- max shift norm :              $shift_norm
     """
-    δV = sparse(Is, Js, Vs, m, n)
+    δV = sparse(Is, Js, Vs, num_target, num_source)
     return δV
 end
 
-function change_of_basis(multiindices, p, c, r)
-    nbasis = length(multiindices)
-    P = zeros(nbasis, nbasis)
-    for i in 1:nbasis
-        α = multiindices[i]
-        for j in 1:nbasis
-            β = multiindices[j]
-            β ≤ α || continue
-            # P[i, j] = prod((-c) .^ ((α - β).indices)) / r^abs(α) / factorial(α
-            # - β)
-            γ = α - β
-            p_γ = p[findfirst(x -> x == γ, multiindices)] # p_{\alpha - \beta}
-            P[i, j] = p_γ(-c) / r^abs(α)
-        end
+"""
+    _vdim_correction_W(op, target, source, boundary, Sop, Dop, Vop; kwargs...)
+
+VDIM correction for the operator `W[g] = -∫∇yG(x,y)⋅g(y)dy` (vector density `g`,
+scalar output), using the regularization (3.25) of [anderson2026global](@cite).
+
+Here `Sop`, `Dop` are the (scalar) single-/double-layer, and `Vop` is the
+*gradient* single-layer volume operator (scalar density -> `SVector` output);
+the per-monomial volume contribution `W[pₐeⱼ] = [∇ₓV[pₐ]]ⱼ` is exactly `Vop`
+applied to the scalar monomial.
+
+The assembly mirrors the scalar/gradient case: per basis monomial `α` and target `i`,
+`Θ[i,α] = Sop·γ₁B - Dop·γ₀B + Vop·b + σ·Ψ(x)` is an `SVector{N}` over the density
+directions; the local interpolation system is solved against the scalar Vandermonde,
+and the weights are stored as `transpose`d `SVector`s so the resulting sparse matrix
+maps an `SVector{N}` density to a scalar.
+"""
+function _vdim_correction_W(
+        op::AbstractDifferentialOperator{N},
+        target,
+        source::Quadrature{N},
+        boundary::Quadrature,
+        Sop,
+        Dop,
+        Vop;
+        green_multiplier::Vector{<:Real},
+        interpolation_order = nothing,
+        maxdist = Inf,
+    ) where {N}
+    T = default_kernel_eltype(op)          # scalar element type (Float64 / ComplexF64)
+    SV = SVector{N, T}                      # density / Θ element type
+    @assert eltype(Vop) == SV "Vop must be the gradient single-layer volume operator (SVector output)"
+    num_target, num_source = length(target), length(source)
+    isnothing(interpolation_order) &&
+        (interpolation_order = maximum(order, values(source.etype2qrule)))
+    basis = polynomial_solutions_vdim_W(op, interpolation_order, T)
+    num_basis = length(basis)
+    dict_near = etype_to_nearest_points(target, source; maxdist)
+
+    # source-node monomial values (scalar) for the volume term and the Vandermonde
+    b = Matrix{T}(undef, num_source, num_basis)
+    for k in 1:num_basis, j in 1:num_source
+        b[j, k] = basis[k].source(source[j])
+    end
+    # boundary traces: SVector{N} over the density directions j
+    nbnd = length(boundary)
+    γ₀B = Matrix{SV}(undef, nbnd, num_basis)
+    γ₁B = Matrix{SV}(undef, nbnd, num_basis)
+    for k in 1:num_basis, j in 1:nbnd
+        γ₀B[j, k] = basis[k].solution(boundary[j])
+        # actually `q -> SVector(∂νΨₐⱼ(q) + pₐ(q) νⱼ(q))ⱼ`; see polynomial_solutions_vdim_W
+        γ₁B[j, k] = basis[k].neumann_trace(boundary[j])
     end
-    return P
-end
 
-function translation_and_scaling(el::LagrangeTriangle)
-    vertices = el.vals[1:3]
-    l1 = norm(vertices[1] - vertices[2])
-    l2 = norm(vertices[2] - vertices[3])
-    l3 = norm(vertices[3] - vertices[1])
-    if ((l1^2 + l2^2 >= l3^2) && (l2^2 + l3^2 >= l1^2) && (l3^2 + l1^2 > l2^2))
-        acuteright = true
-    else
-        acuteright = false
+    # Assemble Θ[i,n] ∈ SVector{N} (column by column, FMM-friendly).
+    Θ = Matrix{SV}(undef, num_target, num_basis)
+    g0 = Vector{T}(undef, nbnd)
+    g1 = Vector{T}(undef, nbnd)
+    for n in 1:num_basis
+        vol = Vop * view(b, :, n)          # Vector{SVector{N}} of length num_target
+        # boundary contribution, one density-direction component at a time
+        bnd = [zero(SV) for _ in 1:num_target]
+        for c in 1:N
+            for j in 1:nbnd
+                g0[j] = γ₀B[j, n][c]
+                g1[j] = γ₁B[j, n][c]
+            end
+            sc = Sop * g1
+            dc = Dop * g0
+            for i in 1:num_target
+                bnd[i] += SV(ntuple(d -> d == c ? sc[i] - dc[i] : zero(T), N))
+            end
+        end
+        for i in 1:num_target
+            Θ[i, n] = vol[i] + bnd[i] + green_multiplier[i] * basis[n].solution(target[i])
+        end
     end
 
-    if acuteright
-        # Compute the circumcenter and circumradius
-        Bp = vertices[2] - vertices[1]
-        Cp = vertices[3] - vertices[1]
-        Dp = 2 * (Bp[1] * Cp[2] - Bp[2] * Cp[1])
-        Upx = 1 / Dp * (Cp[2] * (Bp[1]^2 + Bp[2]^2) - Bp[2] * (Cp[1]^2 + Cp[2]^2))
-        Upy = 1 / Dp * (Bp[1] * (Cp[1]^2 + Cp[2]^2) - Cp[2] * (Bp[1]^2 + Bp[2]^2))
-        Up = SVector{2}(Upx, Upy)
-        r = norm(Up)
-        c = Up + vertices[1]
-    else
-        if (l1 >= l2) && (l1 >= l3)
-            c = (vertices[1] + vertices[2]) / 2
-            r = l1 / 2
-        elseif (l2 >= l1) && (l2 >= l3)
-            c = (vertices[2] + vertices[3]) / 2
-            r = l2 / 2
-        else
-            c = (vertices[1] + vertices[3]) / 2
-            r = l3 / 2
+    # Local interpolation solve + adjoint-SVector storage.
+    Is = Int[]
+    Js = Int[]
+    Vs = Transpose{T, SV}[]
+    for (E, qtags) in source.etype2qtags
+        near_list = dict_near[E]
+        nq, ne = size(qtags)
+        @assert length(near_list) == ne
+        L = Matrix{T}(undef, num_basis, nq)
+        bdata = Matrix{T}(undef, num_basis, N)
+        weidata = Matrix{T}(undef, nq, N)
+        for n in 1:ne
+            isempty(near_list[n]) && continue
+            jglob = @view qtags[:, n]
+            for k in 1:nq, m in 1:num_basis
+                L[m, k] = basis[m].source(view(source, jglob)[k])
+            end
+            F = svd(L)
+            for i in near_list[n]
+                for m in 1:num_basis
+                    bdata[m, :] .= Θ[i, m]
+                end
+                ldiv!(weidata, F, bdata)
+                for k in 1:nq
+                    push!(Is, i)
+                    push!(Js, jglob[k])
+                    push!(Vs, -transpose(SV(ntuple(c -> weidata[k, c], N))))
+                end
+            end
         end
     end
-    return c, r
+    δV = sparse(Is, Js, Vs, num_target, num_source)
+    return δV
 end
 
-function translation_and_scaling(el::ParametricElement{ReferenceSimplex{3}})
-    straight_nodes =
-        [el([0.0, 0.0, 0.0]), el([1.0, 0.0, 0.0]), el([0.0, 1.0, 0.0]), el([0.0, 0.0, 1.0])]
-    return translation_and_scaling(
-        LagrangeElement{ReferenceSimplex{3}, 4, SVector{3, Float64}}(straight_nodes),
-    )
-end
+"""
+    _vdim_correction_X(op, target, source, boundary, Sop, Dop, GSop, Vop; kwargs...)
 
-function translation_and_scaling(el::ParametricElement{ReferenceSimplex{2}})
-    straight_nodes = [el([1.0e-18, 1.0e-18]), el([1.0, 0.0]), el([0.0, 1.0])]
-    return translation_and_scaling(
-        LagrangeElement{ReferenceSimplex{2}, 3, SVector{2, Float64}}(straight_nodes),
-    )
-end
+VDIM correction for the operator `X[g] = ∇W[g] = S·g(x) - PV∫∇ₓ∇_yG(x,y)·g(y)dy`
+(vector density `g`, vector output), using the regularization (3.28) of
+[anderson2026global](@cite).
 
-function translation_and_scaling(el::LagrangeTetrahedron)
-    vertices = el.vals[1:4]
-    # Compute the circumcenter in barycentric coordinates
-    # formulas here are due to: https://math.stackexchange.com/questions/2863613/tetrahedron-centers
-    a = norm(vertices[4] - vertices[1])
-    b = norm(vertices[2] - vertices[4])
-    c = norm(vertices[3] - vertices[4])
-    d = norm(vertices[3] - vertices[2])
-    e = norm(vertices[3] - vertices[1])
-    f = norm(vertices[2] - vertices[1])
-    f² = f^2
-    a² = a^2
-    b² = b^2
-    c² = c^2
-    d² = d^2
-    e² = e^2
-
-    ρ =
-        a² * d² * (-d² + e² + f²) + b² * e² * (d² - e² + f²) + c² * f² * (d² + e² - f²) -
-        2 * d² * e² * f²
-    α =
-        a² * d² * (b² + c² - d²) + e² * b² * (-b² + c² + d²) + f² * c² * (b² - c² + d²) -
-        2 * b² * c² * d²
-    β =
-        b² * e² * (a² + c² - e²) + d² * a² * (-a² + c² + e²) + f² * c² * (a² - c² + e²) -
-        2 * a² * c² * e²
-    γ =
-        c² * f² * (a² + b² - f²) + d² * a² * (-a² + b² + f²) + e² * b² * (a² - b² + f²) -
-        2 * a² * b² * f²
-    if (ρ >= 0 && α >= 0 && β >= 0 + γ >= 0)
-        # circumcenter lays inside `el`
-        center =
-            (α * vertices[1] + β * vertices[2] + γ * vertices[3] + ρ * vertices[4]) /
-            (ρ + α + β + γ)
-        # ref: https://math.stackexchange.com/questions/1087011/calculating-the-radius-of-the-circumscribed-sphere-of-an-arbitrary-tetrahedron
-        R = sqrt(1 / 2 * (β * f² + γ * e² + ρ * a²) / (ρ + α + β + γ))
-    else
-        if (a >= b && a >= c && a >= d && a >= e && a >= f)
-            center = (vertices[1] + vertices[4]) / 2
-            R = a / 2
-        elseif (b >= a && b >= c && b >= d && b >= e && b >= f)
-            center = (vertices[2] + vertices[4]) / 2
-            R = b / 2
-        elseif (c >= a && c >= b && c >= d && c >= e && c >= f)
-            center = (vertices[3] + vertices[4]) / 2
-            R = c / 2
-        elseif (d >= a && d >= b && d >= c && d >= e && d >= f)
-            center = (vertices[3] + vertices[2]) / 2
-            R = d / 2
-        elseif (e >= a && e >= b && e >= c && e >= d && e >= f)
-            center = (vertices[3] + vertices[1]) / 2
-            R = e / 2
-        else
-            center = (vertices[2] + vertices[1]) / 2
-            R = f / 2
-        end
-    end
-    return center, R
-end
+Here `Sop`, `Dop` are the (scalar) single-/double-layer, `GSop` is the gradient
+single-layer (`∇ₓS`, scalar density -> `SVector` output), and `Vop` is the
+*Hessian* single-layer volume operator (`SVector` density -> `SVector` output,
+`SMatrix` entries) whose action equals the principal-value part `+∫∇ₓ∇ₓG·g`.
+
+Per basis monomial `pₐ` and target `i`, `Θ[i,α] ∈ SMatrix{N,N}` collects, in its
+`j`-th column, the (3.28) representation of `X[pₐeⱼ](xᵢ)` together with the naive
+volume term; the local interpolation system is solved component-wise against the
+scalar Vandermonde and the weights are stored as `SMatrix`es so the resulting
+sparse correction maps an `SVector{N}` density to an `SVector{N}` output.
 
-function _vdim_auxiliary_quantities(
-        p,
-        P,
-        γ₁P,
-        X,
-        Y::Quadrature,
-        Γ::Quadrature,
-        μ,
+(Implementation notes: This routine makes no use of Green's function isotropy
+which would allow to exploit Φ'_{σ(β)} = σ(Φ'_β) for a coordinate permutation σ
+and polynomial solution Φ. The volume term is batched across the coordinate
+directions only for the FMM charge→Hessian operator — a single scalar→`SMatrix`
+pass per monomial yields the full Mₐ = ∫∇ₓ∇ₓG pₐ, avoiding N dipole passes; this
+only works if the fast algorithm for `Vop` supports a charge→Hessian operation.
+The dense (`BlockArray`) operator and the boundary `S`/`D`/`∇ₓS` terms are
+applied per coordinate direction.)
+"""
+function _vdim_correction_X(
+        op::AbstractDifferentialOperator{N},
+        target,
+        source::Quadrature{N},
+        boundary::Quadrature,
         Sop,
         Dop,
-        Vop,
-    )
-    num_basis = length(p)
-    num_targets = length(X)
-    b = [f(q) for q in Y, f in p]
-    γ₀B = [f(q) for q in Γ, f in P]
-    γ₁B = [f(q) for q in Γ, f in γ₁P]
-    Θ = zeros(eltype(Vop), num_targets, num_basis)
-    # Compute Θ <-- S * γ₁B - D * γ₀B - V * b + σ * B(x) using in-place matvec
+        GSop,
+        Vop;
+        green_multiplier::Vector{<:Real},
+        interpolation_order = nothing,
+        maxdist = Inf,
+    ) where {N}
+    T = default_kernel_eltype(op)          # scalar element type (Float64 / ComplexF64)
+    SV = SVector{N, T}                      # density / output element type
+    SM = SMatrix{N, N, T, N * N}            # Θ / correction entry type
+    # `Vop` is the Hessian single-layer volume operator; its dense form has
+    # `SMatrix` entries while the FMM form is a `LinearMap` with `SVector` output
+    # (both map an `SVector` density to an `SVector` output).
+    @assert eltype(Vop) in (SM, SV) "Vop must be the Hessian single-layer volume operator"
+    @assert eltype(GSop) == SV "GSop must be the gradient single-layer operator (SVector output)"
+    num_target, num_source = length(target), length(source)
+    isnothing(interpolation_order) &&
+        (interpolation_order = maximum(order, values(source.etype2qrule)))
+    basis = polynomial_solutions_vdim_X(op, interpolation_order, T)
+    num_basis = length(basis)
+    dict_near = etype_to_nearest_points(target, source; maxdist)
+
+    # source-node monomial values (scalar) for the volume term and the Vandermonde
+    b = Matrix{T}(undef, num_source, num_basis)
+    for k in 1:num_basis, j in 1:num_source
+        b[j, k] = basis[k].source(source[j])
+    end
+    # multi-index `I` of each basis monomial (same ordering as polynomial_solutions_vdim_X)
+    indices = [I for I in Iterators.product(ntuple(i -> 0:interpolation_order, N)...) if sum(I) <= interpolation_order]
+    @assert length(indices) == num_basis
+    # grad_single_trace (pₐν) on the boundary, for the ∇ₓS term
+    nbnd = length(boundary)
+    γs = Matrix{SV}(undef, nbnd, num_basis)
+    for k in 1:num_basis, q in 1:nbnd
+        γs[q, k] = basis[k].grad_single_trace(boundary[q])
+    end
+
+    # --- Optimization: dedup the S/D boundary applications over β = I − eⱼ ---
+    # `basis_from_monomial` is linear, so Ψₙⱼ = Iⱼ·Φ′_β with β = I − eⱼ and ℒΦ′_β = yᵝ;
+    # hence Υₙⱼ = Iⱼ·∇Φ′_β and the two (3.28) layer terms factor through β alone:
+    #   D[Υₙⱼ]_c             = Iⱼ · D[∂_cΦ′_β]                (γ₀ trace ∂_cΦ′_β)
+    #   S[(∂ⱼpₐ)ν + BνΥₙⱼ]_c = Iⱼ · S[τ_{β,c}],  τ_{β,c} = p_β ν_c + ∂ν(∂_cΦ′_β)
+    # Thus, the nominal per-(n,j,c) applications in the above description can be
+    # reduced to per-(β,c), β over |β| ≤ order−1, each scaled by Iⱼ in the
+    # assembly below.
+    βindices = [B for B in Iterators.product(ntuple(i -> 0:(interpolation_order - 1), N)...) if sum(B) <= interpolation_order - 1]
+    β2k = Dict(B => k for (k, B) in enumerate(βindices))
+    nβ = length(βindices)
+    g₀ = Matrix{T}(undef, nbnd, nβ * N)    # γ₀ traces ∂_cΦ′_β, column index (β,c) = (k-1)N+c
+    g₁ = Matrix{T}(undef, nbnd, nβ * N)    # γ₁ traces τ_{β,c}
+    for k in 1:nβ
+        pβ = Polynomial(βindices[k] => one(T))
+        Φβ, _ = basis_from_monomial(op, pβ)
+        ∇Φβ = ElementaryPDESolutions.gradient(Φβ)                       # ∂_cΦ′_β
+        HessΦβ = ntuple(c -> ElementaryPDESolutions.gradient(∇Φβ[c]), N) # ∂_d∂_cΦ′_β
+        for q in 1:nbnd
+            xq, nu = coords(boundary[q]), normal(boundary[q])
+            pv = pβ(xq)
+            for c in 1:N
+                g₀[q, (k - 1) * N + c] = ∇Φβ[c](xq)
+                g₁[q, (k - 1) * N + c] = pv * nu[c] + dot(nu, HessΦβ[c](xq))
+            end
+        end
+    end
+    # Apply D (resp. S) once over all (β,c) columns; reused (scaled by Iⱼ) for every n.
+    # `mul!` into a dense matrix forces eager evaluation: BLAS `gemm` when `Dop` is a
+    # dense matrix, LinearMaps' column-wise apply when it is an (FMM) `LinearMap`.
+    DG = Matrix{T}(undef, num_target, nβ * N)    # num_target × (nβ·N)
+    SG = Matrix{T}(undef, num_target, nβ * N)
+    nβ * N > 0 && (mul!(DG, Dop, g₀); mul!(SG, Sop, g₁))
+
+    # Assemble Θ[i,n] ∈ SMatrix{N,N}. Following the internal sign convention used for
+    # the scalar/W cases (Θ = naive_volume − analytic_representation, σ-term carried by
+    # `green_multiplier`), the (3.28) boundary signs are flipped: +∇ₓS, +S, −D.
+    Θ = Matrix{SM}(undef, num_target, num_basis)
+    volbuf = Vector{SV}(undef, num_target)
+    # Volume term: the j-th column of Mₐ(xᵢ) := ∫∇ₓ∇ₓG(xᵢ,y) pₐ(y) dy (an `SMatrix`)
+    # is exactly `Vop·(pₐeⱼ)`, so all N directions share the single `SMatrix` Mₐ. The FMM
+    # charge→Hessian operator (a `LinearMap{SMatrix}`, scalar density → `SMatrix`) forms
+    # every Mₐ with a single FMM pass per monomial, avoiding the N per-direction dipole
+    # passes (which re-traverse `Vop` N·num_basis times). All other operators — the dense
+    # `BlockArray{SMatrix}` and the FMM dipole→gradient map (`SVector` output) — use the
+    # per-direction application below.
+    opt_vol = eltype(Vop) == SM && !(Vop isa BlockArray)
+    MVol = Matrix{SM}(undef, opt_vol ? num_target : 0, opt_vol ? num_basis : 0)
+    if opt_vol
+        mbuf = Vector{SM}(undef, num_target)
+        for n in 1:num_basis
+            mul!(mbuf, Vop, view(b, :, n))
+            for i in 1:num_target
+                MVol[i, n] = mbuf[i]
+            end
+        end
+    end
     for n in 1:num_basis
-        @views mul!(Θ[:, n], Sop, γ₁B[:, n])
-        @views mul!(Θ[:, n], Dop, γ₀B[:, n], -1, 1)
-        @views mul!(Θ[:, n], Vop, b[:, n], -1, 1)
-        for i in 1:num_targets
-            Θ[i, n] += μ[i] * P[n](X[i])
+        I = indices[n]
+        Υtarg = [basis[n].solution(target[i]) for i in 1:num_target]   # SMatrix Υₙ(xᵢ)
+        colvecs = ntuple(N) do j
+            # σ-term: green_multiplier · Υₙⱼ(x)
+            col = [green_multiplier[i] * Υtarg[i][:, j] for i in 1:num_target]
+            # naive volume: +∫∇ₓ∇ₓG·(pₐeⱼ) = X PV part for direction j (column j of Mₐ)
+            if opt_vol
+                for i in 1:num_target
+                    col[i] += MVol[i, n][:, j]
+                end
+            else
+                ej = svector(d -> d == j ? one(T) : zero(T), N)
+                dj = [b[k, n] * ej for k in 1:num_source]
+                mul!(volbuf, Vop, dj)
+                col .+= volbuf
+            end
+            # +∇ₓS[pₐνⱼ]
+            φj = [γs[q, n][j] for q in 1:nbnd]
+            col .+= GSop * φj
+            # +S[(∂ⱼpₐ)ν + BνΥⱼ] − D[Υⱼ], via the β = I − eⱼ dedup (scaled by Iⱼ)
+            if I[j] ≥ 1
+                k = β2k[ntuple(d -> d == j ? I[d] - 1 : I[d], N)]
+                Iⱼ = T(I[j])
+                for c in 1:N
+                    kc = (k - 1) * N + c
+                    for i in 1:num_target
+                        col[i] += SV(ntuple(d -> d == c ? Iⱼ * (SG[i, kc] - DG[i, kc]) : zero(T), N))
+                    end
+                end
+            end
+            col
+        end
+        for i in 1:num_target
+            Θ[i, n] = reduce(hcat, ntuple(j -> colvecs[j][i], N))
         end
     end
-    return Θ
+
+    # Local interpolation solve + SMatrix storage (flatten column-major over the
+    # N×N entries, solve against the scalar Vandermonde, reconstruct the SMatrix).
+    Is = Int[]
+    Js = Int[]
+    Vs = SM[]
+    for (E, qtags) in source.etype2qtags
+        near_list = dict_near[E]
+        nq, ne = size(qtags)
+        @assert length(near_list) == ne
+        L = Matrix{T}(undef, num_basis, nq)
+        bdata = Matrix{T}(undef, num_basis, N * N)
+        weidata = Matrix{T}(undef, nq, N * N)
+        for n in 1:ne
+            isempty(near_list[n]) && continue
+            jglob = @view qtags[:, n]
+            for k in 1:nq, m in 1:num_basis
+                L[m, k] = basis[m].source(view(source, jglob)[k])
+            end
+            F = svd(L)
+            for i in near_list[n]
+                for m in 1:num_basis
+                    bdata[m, :] .= vec(Θ[i, m])
+                end
+                ldiv!(weidata, F, bdata)
+                for k in 1:nq
+                    push!(Is, i)
+                    push!(Js, jglob[k])
+                    push!(Vs, -SM(ntuple(idx -> weidata[k, idx], N * N)))
+                end
+            end
+        end
+    end
+    δV = sparse(Is, Js, Vs, num_target, num_source)
+    return δV
 end
 
 """
-    vdim_mesh_center(msh)
+    polynomial_solutions_vdim(op, order, [T])
 
-Point `x` which minimizes ∑ (x-xⱼ)²/r²ⱼ, where xⱼ and rⱼ are the circumcenter
-and circumradius of the elements of `msh`, respectively.
+Build a basis of polynomial solutions for the VDIM method.
+
+For every monomial `pₙ` of degree at most `order`, computes a polynomial solution `Pₙ`
+satisfying `ℒ[Pₙ] = pₙ`, where `ℒ` is the differential operator of `op`.
+
+Returns a vector of named tuples with fields `source = pₙ`, `solution = Pₙ`, `neumann_trace = γ₁Pₙ`, and `gradient_solution = ∇Pₙ`.
 """
-function vdim_mesh_center(msh::AbstractMesh)
-    N = ambient_dimension(msh)
-    M = 0.0
-    xc = zero(SVector{N, Float64})
-    for E in element_types(msh)
-        for el in elements(msh, E)
-            c, r = translation_and_scaling(el)
-            # w = 1/r^2
-            w = 1
-            M += w
-            xc += c * w
-        end
+function polynomial_solutions_vdim(
+        op::AbstractDifferentialOperator{N},
+        order::Integer,
+        ::Type{T} = default_kernel_eltype(op),
+    ) where {N, T}
+    indices = [I for I in Iterators.product(ntuple(i -> 0:order, N)...) if sum(I) <= order]
+    return map(indices) do I
+        monomial = Polynomial(I => one(T))
+        P, γ₁P = basis_from_monomial(op, monomial)
+        ∇P = ElementaryPDESolutions.gradient(P)
+        (source = monomial, solution = P, neumann_trace = γ₁P, gradient_solution = ∇P)
     end
-    return xc / M
 end
 
 """
-    polynomial_solutions_vdim(op, order[, center])
+    polynomial_solutions_vdim_W(op, order, [T])
 
-For every monomial term `pₙ` of degree `order`, compute a polynomial `Pₙ` such
-that `ℒ[Pₙ] = pₙ`, where `ℒ` is the differential operator associated with `op`.
-This function returns `{pₙ,Pₙ,γ₁Pₙ}`, where `γ₁Pₙ` is the generalized Neumann
-trace of `Pₙ`.
+Build a basis for the VDIM evaluation of the operator `W[g] = -∫∇yG(x,y)⋅g(y)dy`
+acting on a vector density `g`, using the regularization of eq. (3.25) in
+[anderson2026global](@cite).
 
-Passing a point `center` will shift the monomials and solutions accordingly.
+The density is interpolated component-wise, so the basis is indexed by the scalar
+monomials `pₐ = yᴵ` (`|I| ≤ order`). For each monomial, and each coordinate direction
+`j`, the associated vector monomial is `gₐⱼ = pₐ eⱼ`, whose divergence is `∂ⱼpₐ`; the
+polynomial PDE solution `Ψₐⱼ` then satisfies `ℒΨₐⱼ = ∂ⱼpₐ`. Per (3.25),
+
+    W[gₐⱼ] = μ(x)Ψₐⱼ(x) + D[Ψₐⱼ](x) - S[∂νΨₐⱼ + (gₐⱼ⋅ν)](x),
+
+Each returned named tuple bundles the `N` directions into `SVector`-valued
+traces:
+
+- `source`        : the scalar monomial `pₐ` (used for the volume term and the Vandermonde)
+- `solution`      : `q -> SVector(Ψₐⱼ(q))ⱼ`  (γ₀ trace for `D`, and the σ-term)
+- `neumann_trace` : `q -> SVector(∂νΨₐⱼ(q) + pₐ(q) νⱼ(q))ⱼ`  (γ₁ trace for `S`)
+                    (an abuse of notation for consistency with other vdim)
 """
-function polynomial_solutions_vdim(
-        op::AbstractDifferentialOperator,
+function polynomial_solutions_vdim_W(
+        op::AbstractDifferentialOperator{N},
         order::Integer,
-        center = nothing,
-    )
-    N = ambient_dimension(op)
-    center = isnothing(center) ? zero(SVector{N, Float64}) : center
-    # create empty arrays to store the monomials, solutions, and traces. For the
-    # neumann trace, we try to infer the concrete return type instead of simply
-    # having a vector of `Function`.
-    monomials = Vector{ElementaryPDESolutions.Polynomial{N, Float64}}()
-    dirchlet_traces = Vector{ElementaryPDESolutions.Polynomial{N, Float64}}()
-    T = return_type(neumann_trace, typeof(op), eltype(dirchlet_traces))
-    neumann_traces = Vector{T}()
-    multiindices = Vector{MultiIndex{N}}()
-    # iterate over N-tuples going from 0 to order
-    for I in Iterators.product(ntuple(i -> 0:order, N)...)
-        sum(I) > order && continue
-        # define the monomial basis functions, and the corresponding solutions.
-        # TODO: adapt this to vectorial case
-        p = ElementaryPDESolutions.Polynomial(I => 1 / factorial(MultiIndex(I)))
-        P = polynomial_solution(op, p)
-        γ₁P = neumann_trace(op, P)
-        push!(multiindices, MultiIndex(I))
-        push!(monomials, p)
-        push!(dirchlet_traces, P)
-        push!(neumann_traces, γ₁P)
-    end
-    monomial_shift = map(monomials) do f
-        return (q) -> f(coords(q) - center)
-    end
-    dirchlet_shift = map(dirchlet_traces) do f
-        return (q) -> f(coords(q) - center)
+        ::Type{T} = default_kernel_eltype(op),
+    ) where {N, T}
+    indices = [I for I in Iterators.product(ntuple(i -> 0:order, N)...) if sum(I) <= order]
+    return map(indices) do I
+        monomial = Polynomial(I => one(T))      # raw monomial yᴵ
+        ∂p = ElementaryPDESolutions.gradient(monomial)   # (∂₁,…,∂_N) yᴵ
+        sols = ntuple(N) do j
+            dj = ∂p[j]
+            if isempty(dj.order2coeff)
+                # ∂ⱼyᴵ = 0  ⇒  Ψⱼ = 0
+                (_ -> zero(T), _ -> zero(T))
+            else
+                Ψj, γ₁Ψj = basis_from_monomial(op, dj)   # ℒΨⱼ = ∂ⱼyᴵ
+                (x -> Ψj(x), γ₁Ψj)
+            end
+        end
+        Ψ = ntuple(j -> sols[j][1], N)
+        γ₁Ψ = ntuple(j -> sols[j][2], N)
+        solution = q -> svector(j -> Ψ[j](coords(q)), N)
+        neumann_trace = q -> begin
+            pv = monomial(coords(q))
+            nu = normal(q)
+            svector(j -> γ₁Ψ[j](q) + pv * nu[j], N)
+        end
+        (source = monomial, solution = solution, neumann_trace = neumann_trace)
     end
-    neumann_shift = map(neumann_traces) do f
-        return (q) -> f((coords = q.coords - center, normal = q.normal))
+end
+
+"""
+    polynomial_solutions_vdim_X(op, order, [T])
+
+Build a basis for the VDIM evaluation of the singular operator `X[g] = ∇W[g] =
+S·g(x) - PV∫∇ₓ∇_yG(x,y)·g(y)dy` acting on a vector density `g`, using the
+regularization of eq. (3.28) in [anderson2026global](@cite).
+
+As for `W` ([`polynomial_solutions_vdim_W`](@ref)), the density is interpolated
+component-wise, so the basis is indexed by the scalar monomials `pₐ = yᴵ`
+(`|I| ≤ order`). For each monomial, and each coordinate direction `j`, the vector
+monomial is `gₐⱼ = pₐeⱼ`, whose divergence is `∂ⱼpₐ`; the polynomial PDE solution
+`Ψₐⱼ` satisfies `ℒΨₐⱼ = ∂ⱼpₐ`, and the relevant solution for `X` is `Υₐⱼ = ∇Ψₐⱼ`
+(so that `ℒΥₐⱼ = ∇∂ⱼpₐ`). Per (3.28),
+
+    X[gₐⱼ] = μ(x)Υₐⱼ(x) - ∇ₓS[pₐνⱼ](x) - S[(∂ⱼpₐ)ν + BνΥₐⱼ](x) + D[Υₐⱼ](x),
+
+with `BνΥₐⱼ` the (Laplace) conormal derivative `∂ν∇Ψₐⱼ = (HessΨₐⱼ)·ν`. Since `X`
+maps a vector density to a vector output, the `N` directions are bundled into
+`SMatrix{N,N}`-valued traces (column `j` ↔ density direction `j`, row ↔ output
+component):
+
+- `source`           : the scalar monomial `pₐ` (volume term and Vandermonde)
+- `solution`         : `q -> SMatrix` with column `j` equal to `Υₐⱼ(q) = ∇Ψₐⱼ(q)`
+                       (γ₀ trace for `D`, and the σ-term `μΥ`)
+- `single_trace`     : `q -> SMatrix` with column `j` equal to
+                       `(∂ⱼpₐ)(q)ν(q) + BνΥₐⱼ(q)`  (γ₁-type trace for `S`)
+                       NOTE: This is not actually used in the associated
+                       correction routine, because there is de-duping performed
+                       there that is more efficient than use of `single-trace`
+                       would be. However this is useful in the testsuite
+                       (`test_X_volume_potential`) so is kept.
+- `grad_single_trace`: `q -> SVector` equal to `pₐ(q)ν(q)`, whose component `j`
+                       is the scalar trace `pₐνⱼ` for the `∇ₓS` term
+"""
+function polynomial_solutions_vdim_X(
+        op::AbstractDifferentialOperator{N},
+        order::Integer,
+        ::Type{T} = default_kernel_eltype(op),
+    ) where {N, T}
+    indices = [I for I in Iterators.product(ntuple(i -> 0:order, N)...) if sum(I) <= order]
+    return map(indices) do I
+        monomial = Polynomial(I => one(T))       # raw monomial yᴵ
+        ∂p = ElementaryPDESolutions.gradient(monomial)   # (∂₁,…,∂_N) yᴵ
+        # per direction j: Υⱼ = ∇Ψⱼ (with ℒΨⱼ = ∂ⱼpₐ) and the conormal BνΥⱼ = HessΨⱼ·ν.
+        dirs = ntuple(N) do j
+            dj = ∂p[j]
+            if isempty(dj.order2coeff)
+                # ∂ⱼyᴵ = 0  ⇒  Ψⱼ = 0, Υⱼ = 0, BνΥⱼ = 0
+                (
+                    Υ = (_ -> zero(SVector{N, T})),
+                    BνΥ = (_ -> zero(SVector{N, T})),
+                    ∂jp = (_ -> zero(T)),
+                )
+            else
+                Ψj, _ = basis_from_monomial(op, dj)        # ℒΨⱼ = ∂ⱼpₐ
+                Υj = ElementaryPDESolutions.gradient(Ψj)    # ∇Ψⱼ (NTuple{N,Polynomial})
+                Hessj = ntuple(c -> ElementaryPDESolutions.gradient(Υj[c]), N) # ∇(∂_cΨⱼ)
+                (
+                    Υ = (x -> Υj(x)),
+                    BνΥ = (q -> svector(c -> dot(normal(q), Hessj[c](coords(q))), N)),
+                    ∂jp = (x -> dj(x)),
+                )
+            end
+        end
+        solution = q -> reduce(hcat, ntuple(j -> dirs[j].Υ(coords(q)), N))
+        single_trace = q -> begin
+            nu = normal(q)
+            x = coords(q)
+            reduce(hcat, ntuple(j -> dirs[j].∂jp(x) * nu + dirs[j].BνΥ(q), N))
+        end
+        grad_single_trace = q -> monomial(coords(q)) * normal(q)
+        (
+            source = monomial,
+            solution = solution,
+            single_trace = single_trace,
+            grad_single_trace = grad_single_trace,
+        )
     end
-    return monomial_shift, dirchlet_shift, neumann_shift, multiindices
-    # return monomials, dirchlet_traces, neumann_traces, multiindices
 end
 
-# dispatch to the correct solver in ElementaryPDESolutions
-function polynomial_solution(::Laplace, p::ElementaryPDESolutions.Polynomial)
-    P = ElementaryPDESolutions.solve_laplace(p)
-    return ElementaryPDESolutions.convert_coefs(P, Float64)
+"""
+    basis_from_monomial(op, monomial) -> (solution, neumann_trace)
+
+Compute a polynomial solution `P` to `ℒ[P] = monomial` and its Neumann trace `γ₁P`.
+
+Each operator implements this to handle its specific PDE structure, including any
+auxiliary fields (e.g., pressure for Stokes) needed to compute the Neumann trace.
+"""
+function basis_from_monomial end
+
+
+# Laplace
+function basis_from_monomial(::Laplace{N}, monomial::Polynomial{N, T}) where {N, T}
+    P = ElementaryPDESolutions.solve_laplace(-monomial)
+    ∇P = ElementaryPDESolutions.gradient(P)
+    γ₁P = q -> dot(normal(q), ∇P(coords(q)))
+    return P, γ₁P
+end
+
+# Helmholtz
+
+function basis_from_monomial(op::Helmholtz{N}, monomial::Polynomial{N, T}) where {N, T}
+    P = ElementaryPDESolutions.solve_helmholtz(-monomial, op.k^2)
+    ∇P = ElementaryPDESolutions.gradient(P)
+    γ₁P = q -> dot(normal(q), ∇P(coords(q)))
+    return P, γ₁P
 end
 
-function polynomial_solution(op::Helmholtz, p::ElementaryPDESolutions.Polynomial)
-    k = op.k
-    P = ElementaryPDESolutions.solve_helmholtz(p; k)
-    return ElementaryPDESolutions.convert_coefs(P, Float64)
+# Elastostatic
+
+function basis_from_monomial(op::Elastostatic{N}, monomial::Polynomial{N, T}) where {N, T}
+    monomial = -monomial
+    @assert T <: StaticMatrix && size(T) == (N, N)
+    S = eltype(T)
+    ord2coef = monomial.order2coeff
+    @assert length(ord2coef) == 1 "Input must be a monomial"
+    coef = first(values(ord2coef))
+    idx = first(keys(ord2coef))
+    μ, λ = op.μ, op.λ
+    ν = λ / (2 * (λ + μ))
+    # Solve for each column of the tensor
+    sol_tuple = ntuple(N) do n
+        p = ntuple(d -> Polynomial(idx => coef[d, n]), N)
+        u = ElementaryPDESolutions.solve_elastostatic(p; μ, ν)
+        ntuple(d -> convert(Polynomial{N, S}, u[d]), N)
+    end
+    P = flatten_polynomial_ntuple(sol_tuple)
+    ∇P = ElementaryPDESolutions.gradient(P)
+    # Neumann trace: traction vector
+    γ₁P = q -> begin
+        n = normal(q)
+        x = coords(q)
+        M = ∇P(x)  # M[j] = ∂P/∂xⱼ
+        cols = svector(N) do m
+            gradu = hcat(ntuple(j -> M[j][:, m], N)...)
+            divu = tr(gradu)
+            λ * divu * n + μ * (gradu + gradu') * n
+        end
+        reduce(hcat, cols)
+    end
+    return P, γ₁P
 end
 
-function polynomial_solution(op::Yukawa, p::ElementaryPDESolutions.Polynomial)
-    k = im * op.λ
-    P = ElementaryPDESolutions.solve_helmholtz(p; k)
-    return ElementaryPDESolutions.convert_coefs(P, Float64)
+# Stokes
+
+function basis_from_monomial(op::Stokes{N}, monomial::Polynomial{N, T}) where {N, T}
+    monomial = -monomial
+    @assert T <: StaticMatrix && size(T) == (N, N)
+    S = eltype(T)
+    ord2coef = monomial.order2coeff
+    @assert length(ord2coef) == 1 "Input must be a monomial"
+    coef = first(values(ord2coef))
+    idx = first(keys(ord2coef))
+    μ = op.μ
+    # Solve for each column: velocity and pressure
+    solutions = ntuple(N) do n
+        f = ntuple(d -> Polynomial(idx => coef[d, n]), N)
+        u, p = ElementaryPDESolutions.solve_stokes(f; μ)
+        vel = ntuple(d -> convert(Polynomial{N, S}, u[d]), N)
+        pres = convert(Polynomial{N, S}, p)
+        (velocity = vel, pressure = pres)
+    end
+    velocities = ntuple(n -> solutions[n].velocity, N)
+    pressures = ntuple(n -> solutions[n].pressure, N)
+    U = flatten_polynomial_ntuple(velocities)
+    ∇U = ElementaryPDESolutions.gradient(U)
+    # Neumann trace: traction using velocity gradient and pressure
+    γ₁U = q -> begin
+        n = normal(q)
+        x = coords(q)
+        M = ∇U(x)  # M[j] = ∂U/∂xⱼ
+        cols = svector(N) do m
+            gradu = hcat(ntuple(j -> M[j][:, m], N)...)
+            p_val = pressures[m](x)
+            -p_val * n + μ * (gradu + gradu') * n
+        end
+        reduce(hcat, cols)
+    end
+    return U, γ₁U
 end
 
-function neumann_trace(
-        ::Union{Laplace, Helmholtz, Yukawa},
-        P::ElementaryPDESolutions.Polynomial{N, T},
-    ) where {N, T}
-    return _normal_derivative(P)
+function flatten_polynomial_ntuple(P::NTuple{N, NTuple{N, Polynomial{DIM, T}}}) where {N, DIM, T <: Number}
+    V = SMatrix{N, N, T, N * N}
+    # collect all multi-indices
+    idxs = Set{NTuple{DIM, Int}}()
+    foreach(p -> union!(idxs, keys(p.order2coeff)), Iterators.flatten(P))
+    # now loop over keys and build flattened coefficients
+    idx2coef = Dict{NTuple{DIM, Int}, V}()
+    for idx in idxs
+        coef_tuple = ntuple(N^2) do n
+            m = div(n - 1, N) + 1
+            l = mod(n - 1, N) + 1
+            p = P[m][l]
+            get(p.order2coeff, idx, zero(T))
+        end
+        idx2coef[idx] = V(coef_tuple)
+    end
+    return Polynomial{DIM, V}(idx2coef)
 end
 
-function _normal_derivative(P::ElementaryPDESolutions.Polynomial{N, T}) where {N, T}
-    ∇P = ElementaryPDESolutions.gradient(P)
-    return (q) -> dot(normal(q), ∇P(coords(q)))
+function (P::NTuple{N, <:Polynomial})(x) where {N}
+    return svector(n -> P[n](x), N)
 end
 
-function (∇P::NTuple{N, <:ElementaryPDESolutions.Polynomial})(x) where {N}
-    return ntuple(n -> ∇P[n](x), N)
+function (P::Polynomial)(q::QuadratureNode)
+    x = coords(q)
+    return P(x)
 end
 
-function (P::ElementaryPDESolutions.Polynomial)(q::QuadratureNode)
-    x = q.coords.data
+function (P::NTuple{N, <:Polynomial})(q::QuadratureNode) where {N}
+    x = coords(q)
     return P(x)
 end
diff --git a/test/Project.toml b/test/Project.toml
index 7559c3b7..bb364ce8 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -1,5 +1,7 @@
 [deps]
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
+DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
+ElementaryPDESolutions = "88a69b33-da0f-4502-8c8c-d680cf4d883b"
 FMM2D = "2d63477d-9690-4b75-bcc1-c3461d43fecc"
 FMM3D = "1e13804c-f9b7-11ea-0ef0-29f3b1745df8"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
@@ -7,8 +9,10 @@ GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
 Gmsh = "705231aa-382f-11e9-3f0c-b7cb4346fdeb"
 HMatrices = "8646bddf-ab1c-4fa7-9c51-ba187d647618"
 Inti = "fb74042b-437e-4c5b-88cf-d4e2beb394d5"
+IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Meshes = "eacbb407-ea5a-433e-ab97-5258b1ca43fa"
+OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
 QPGreen = "8ff76263-3d2e-4b9c-88ff-2ca8b003e2a7"
 QuadGK = "1fd47b50-473d-5c70-9696-f719f8f3bcdc"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
@@ -16,3 +20,6 @@ SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192"
+
+[sources]
+Inti = {path = ".."}
diff --git a/test/aqua_test.jl b/test/aqua_test.jl
index 650e6f22..fad7a67c 100644
--- a/test/aqua_test.jl
+++ b/test/aqua_test.jl
@@ -6,7 +6,7 @@ using Aqua
     Aqua.test_all(
         Inti;
         ambiguities = false, # test only `Inti` for ambiguities later
-        unbound_args = true,
+        unbound_args = (; broken = true), # broken due to use of NTuple in some signatures
         undefined_exports = true,
         project_extras = true,
         stale_deps = true,
diff --git a/test/convergence/poisson.jl b/test/convergence/poisson.jl
new file mode 100644
index 00000000..c5e7102b
--- /dev/null
+++ b/test/convergence/poisson.jl
@@ -0,0 +1,226 @@
+using Inti
+using StaticArrays
+using Gmsh
+using LinearAlgebra
+using HMatrices
+using FMM3D
+using CairoMakie
+using DataStructures
+
+tinit = time() # hide
+
+# ## Problem definition
+
+# In this example we will solve the Poisson equation in a domain $\Omega$ with
+# Dirichlet boundary conditions on $\Gamma := \partial \Omega$:
+# ```math
+#   \begin{align}
+#       \Delta u &= f  \quad \text{in } \Omega \\
+#       u &= g  \quad \text{on } \partial \Gamma
+#   \end{align}
+# ```
+# where $f : \Omega \to \mathbb{R}$ and $g : \Gamma \to \mathbb{R}$ are given
+# functions.
+#
+# Seeking for a solution $u$ of the form ...
+
+include("../test_utils.jl")
+compression = (method = :fmm, tol = 1.0e-13)
+
+meshsize = 0.24
+# `n` in the VDIM paper
+interpolation_order = 4
+qorder = Inti.Tetrahedron_VR_interpolation_order_to_quadrature_order(interpolation_order)
+nothing #hide
+
+r1 = 1.0
+r2 = 0.5
+# first build boundary meshes for use with BDIM - at higher orders we want to saturate the accuracy in VDIM, so we overresolve the boundaries. Unfortunately BDIM is not stable for large bdry_qorder
+bdry_qorder = min(2 * qorder, 7)
+meshsize_bdry = meshsize / 4
+tmsh = @elapsed begin
+    Ω, msh =
+        gmsh_torus(; center = [0.0, 0.0, 0.0], r1 = r1, r2 = r2, meshsize = meshsize_bdry)
+    #Ω, msh =
+    #    gmsh_ball(; center = [0.0, 0.0, 0.0], radius = 1.0, meshsize = meshsize)
+    Γ = Inti.external_boundary(Ω)
+
+    function face_element_on_torus(nodelist, R, r)
+        return all(
+            [
+                (sqrt(node[1]^2 + node[2]^2) - R^2)^2 + node[3]^2 ≈ r^2 for node in nodelist
+            ]
+        )
+    end
+    face_element_on_curved_surface =
+        (nodelist) -> face_element_on_torus(nodelist, r1, r2)
+
+    ψ =
+        (v) ->
+    [(r1 + r2 * sin(v[1])) * cos(v[2]), (r1 + r2 * sin(v[1])) * sin(v[2]), r2 * cos(v[1])]
+    θ = min(qorder+1,interpolation_order+3) - 1 # smoothness order of curved elements
+    crvmsh = Inti.curve_mesh(
+        msh,
+        ψ,
+        θ;
+        face_element_on_curved_surface = face_element_on_curved_surface,
+    )
+
+    Ωₕ = view(crvmsh, Ω)
+    Γₕ = view(crvmsh, Γ)
+end
+Qgauss = Inti.Gauss(; domain = :triangle, order = bdry_qorder);
+Γdict = OrderedDict(E => Qgauss for E in Inti.element_types(Γₕ))
+Γₕ_quad = Inti.Quadrature(Γₕ, Γdict)
+@info "Mesh generation time: $tmsh"
+
+#cleanup
+Inti.clear_entities!()
+
+# now build the volume mesh
+tmsh = @elapsed begin
+    Ω_coarse, msh_coarse =
+        gmsh_torus(; center = [0.0, 0.0, 0.0], r1 = r1, r2 = r2, meshsize = meshsize)
+    #Ω, msh =
+    #    gmsh_ball(; center = [0.0, 0.0, 0.0], radius = 1.0, meshsize = meshsize)
+    Γ_coarse = Inti.external_boundary(Ω_coarse)
+
+    function face_element_on_torus(nodelist, R, r)
+        return all(
+            [
+                (sqrt(node[1]^2 + node[2]^2) - R^2)^2 + node[3]^2 ≈ r^2 for node in nodelist
+            ]
+        )
+    end
+    face_element_on_curved_surface =
+        (nodelist) -> face_element_on_torus(nodelist, r1, r2)
+
+    ψ =
+        (v) ->
+    [(r1 + r2 * sin(v[1])) * cos(v[2]), (r1 + r2 * sin(v[1])) * sin(v[2]), r2 * cos(v[1])]
+    θ = min(qorder+1,interpolation_order+3) - 1 # smoothness order of curved elements
+    crvmsh_coarse = Inti.curve_mesh(
+        msh_coarse,
+        ψ,
+        θ;
+        face_element_on_curved_surface = face_element_on_curved_surface,
+    )
+
+    Ωₕ_coarse = view(crvmsh_coarse, Ω_coarse)
+    Γₕ_coarse = view(crvmsh_coarse, Γ_coarse)
+end
+@info "Mesh generation time: $tmsh"
+
+Ω = Inti.Domain(e -> Inti.geometric_dimension(e) == 3, Inti.entities(msh))
+Γ = Inti.boundary(Ω)
+
+# Use VDIM with the Vioreanu-Rokhlin quadrature rule
+Q = Inti.VioreanuRokhlin(; domain = :tetrahedron, order = qorder);
+dict = OrderedDict(E => Q for E in Inti.element_types(Ωₕ_coarse))
+Ωₕ_quad = Inti.Quadrature(Ωₕ_coarse, dict)
+#Qgauss = Inti.Gauss(; domain = :triangle, order = bdry_qorder);
+#Γdict = OrderedDict(E => Qgauss for E in Inti.element_types(Γₕ))
+#Γₕ_quad = Inti.Quadrature(Γₕ, Γdict)
+vol_err = Inti.integrate(x -> 1, Ωₕ_quad) - 2 * π^2 * r2^2 * r1
+@info "Volume error: $vol_err"
+
+
+# ## Manufactured solution
+
+# For the purpose of comparing our numerical results to an exact solution, we
+# will use the method of manufactured solutions. For simplicity, we will take as
+# an exact solution
+
+uₑ = (x) -> cos(10 * x[1]) * sin(10 * x[2])
+
+# which yields
+
+fₑ = (x) -> -200 * uₑ(x)
+
+# Here is what the solution looks like:
+# qvals = map(Ωₕ_quad) do q
+#     return uₑ(q.coords)
+# end
+
+# ivals = Inti.quadrature_to_node_vals(Ωₕ_quad, qvals)
+
+# er = ivals - uₑ.(Ωₕ_quad.mesh.nodes)
+# norm(er,Inf)
+# Inti.write_gmsh_view(Ωₕ, uₑ.(Ωₕ.nodes))
+
+# ## Boundary and integral operators
+op = Inti.Laplace(; dim = 3)
+
+## Boundary operators
+S_b2b, D_b2b = Inti.single_double_layer(;
+    op,
+    target = Γₕ_quad,
+    source = Γₕ_quad,
+    compression,
+    correction = (method = :dim,),
+)
+S_b2d, D_b2d = Inti.single_double_layer(;
+    op,
+    target = Ωₕ_quad,
+    source = Γₕ_quad,
+    compression,
+    correction = (method = :dim, maxdist = 5 * meshsize, target_location = :inside),
+)
+
+## Volume potentials
+V_d2d = Inti.volume_potential(;
+    op,
+    target = Ωₕ_quad,
+    source = Ωₕ_quad,
+    compression,
+    correction = (method = :dim, interpolation_order, S_b2d = S_b2d, D_b2d = D_b2d, boundary = Γₕ_quad),
+)
+V_d2b = Inti.volume_potential(;
+    op,
+    target = Γₕ_quad,
+    source = Ωₕ_quad,
+    compression,
+    correction = (
+        method = :dim,
+        maxdist = 5 * meshsize,
+        interpolation_order,
+        target_location = :on,
+        S_b2d = S_b2b,
+        D_b2d = D_b2b,
+        boundary = Γₕ_quad,
+    ),
+)
+
+# We can now solve a BIE for the unknown density $\sigma$:
+f = map(Ωₕ_quad) do q
+    return fₑ(q.coords)
+end
+g = map(Γₕ_quad) do q
+    return uₑ(q.coords)
+end
+rhs = V_d2b * f + g
+
+using LinearAlgebra
+L = -I / 2 + D_b2b
+
+# If `compression=none` or `compresion=hmatrix` is used above for constructing `D_b2b`, we could alternately use dense linear algebra:
+#F = lu(L)
+#σ = F \ rhs
+
+using IterativeSolvers
+σ, hist =
+    gmres(L, rhs; log = true, abstol = 1.0e-14, verbose = false, restart = 100, maxiter = 100)
+@show hist
+
+# To check the solution, lets evaluate it at the nodes $\Omega$
+uₕ_quad = -(V_d2d * f) + D_b2d * σ
+uₑ_quad = map(q -> uₑ(q.coords), Ωₕ_quad)
+er = abs.(uₕ_quad - uₑ_quad)
+@show norm(er, Inf)
+
+# ## Visualize the solution error using Gmsh
+
+tend = time() # hide
+@info "Example completed in $(tend - tinit) seconds" # hide
+ndofs = length(Ωₕ_quad)
+@info "ndofs: $ndofs"
diff --git a/test/convergence/vdim_potential_script_3d.jl b/test/convergence/vdim_potential_script_3d.jl
index c964793a..ac25bb99 100644
--- a/test/convergence/vdim_potential_script_3d.jl
+++ b/test/convergence/vdim_potential_script_3d.jl
@@ -7,110 +7,203 @@ using LinearAlgebra
 using HMatrices
 using FMM3D
 using CairoMakie
+using DataStructures
 
 include("../test_utils.jl")
+compression = (method = :fmm, tol = 1.0e-11)
 
-meshsize = 0.1
-r1 = 1.0
+meshsize = 0.05
+meshsize_bdry = meshsize
+Inti.clear_entities!()
 tmsh = @elapsed begin
-    r2 = 0.5
     Ω, msh =
-        gmsh_torus(; center = [0.0, 0.0, 0.0], r1 = r1, r2 = r2, meshsize = meshsize)
+        gmsh_ball(; center = [0.0, 0.0, 0.0], radius = 1.0, meshsize = meshsize_bdry)
     Γ = Inti.external_boundary(Ω)
 
-    function face_element_on_torus(nodelist, R, r)
-        return all(
-            [
-                (sqrt(node[1]^2 + node[2]^2) - R^2)^2 + node[3]^2 ≈ r^2 for node in nodelist
-            ]
-        )
-    end
-    face_element_on_curved_surface =
-        (nodelist) -> face_element_on_torus(nodelist, r1, r2)
-
-    ψ =
-        (v) ->
-    [(r1 + r2 * sin(v[1])) * cos(v[2]), (r1 + r2 * sin(v[1])) * sin(v[2]), r2 * cos(v[1])]
-    θ = 5 # smoothness order of curved elements
-    crvmsh = Inti.curve_mesh(
-        msh,
-        ψ,
-        θ;
-        face_element_on_curved_surface = face_element_on_curved_surface,
-    )
-
-    Ωₕ = view(crvmsh, Ω)
-    Γₕ = view(crvmsh, Γ)
+    Ωₕ = view(msh, Ω)
+    Γₕ = view(msh, Γ)
 end
-@info "Mesh generation time: $tmsh"
+meshsize_bdry = meshsize
+Ωₕ_coarse = Ωₕ
+
+#Inti.clear_entities!()
+#tmsh = @elapsed begin
+#    Ω_coarse, msh_coarse =
+#        gmsh_ball(; center = [0.0, 0.0, 0.0], radius = 1.0, meshsize = meshsize)
+#    Γ_coarse = Inti.external_boundary(Ω_coarse)
+#
+#    Ωₕ_coarse = view(msh_coarse, Ω_coarse)
+#    Γₕ_coarse = view(msh_coarse, Γ_coarse)
+#end
+#@info "Mesh generation time: $tmsh"
 
 interpolation_order = 2
 VR_qorder = Inti.Tetrahedron_VR_interpolation_order_to_quadrature_order(interpolation_order)
-bdry_qorder = 2 * VR_qorder
+bdry_qorder = 7 #min(2 * VR_qorder, 7)
 
 tquad = @elapsed begin
     # Use VDIM with the Vioreanu-Rokhlin quadrature rule for Ωₕ
     Q = Inti.VioreanuRokhlin(; domain = :tetrahedron, order = VR_qorder)
-    dict = Dict(E => Q for E in Inti.element_types(Ωₕ))
-    Ωₕ_quad = Inti.Quadrature(Ωₕ, dict)
+    dict = OrderedDict(E => Q for E in Inti.element_types(Ωₕ_coarse))
+    Ωₕ_quad = Inti.Quadrature(Ωₕ_coarse, dict)
     # Ωₕ_quad = Inti.Quadrature(Ωₕ; qorder = qorders[1])
-    Qbdry = Inti.Gauss(; domain = :triangle, order = bdry_qorder)
-    dictbdry = Dict(E => Qbdry for E in Inti.element_types(Γₕ))
+    #Qbdry = Inti.Gauss(; domain = :triangle, order = bdry_qorder)
+    Qbdry = Inti.VioreanuRokhlin(; domain = :triangle, order = bdry_qorder)
+    dictbdry = OrderedDict(E => Qbdry for E in Inti.element_types(Γₕ))
     Γₕ_quad = Inti.Quadrature(Γₕ, dictbdry)
 end
 @info "Quadrature generation time: $tquad"
 
 k0 = π
-k = 0
-θ = (sin(π / 3) * cos(π / 3), sin(π / 3) * sin(π / 3), cos(π / 3))
+#k0 = 3π
+#k0 = 1
+#k = 2π # helmholtz
+k = 0 # laplace
+θ = SVector(sin(π / 3) * cos(π / 3), sin(π / 3) * sin(π / 3), cos(π / 3))
 #u  = (x) -> exp(im * k0 * dot(x, θ))
 #du = (x,n) -> im * k0 * dot(θ, n) * exp(im * k0 * dot(x, θ))
-u = (x) -> cos(k0 * dot(x, θ))
-du = (x, n) -> -k0 * dot(θ, n) * sin(k0 * dot(x, θ))
-f = (x) -> (k^2 - k0^2) * u(x)
+u_exact = x -> cos(k0 * dot(x, θ))
+grad_u_exact = x -> -k0 * θ * sin(k0 * dot(x, θ))
+normal_deriv = (x, n) -> dot(grad_u_exact(x), n)
+#du = (x, n) -> -k0 * dot(θ, n) * sin(k0 * dot(x, θ))
+f_val = (x) -> (k0^2 - k^2) * u_exact(x)
 
-u_d = map(q -> u(q.coords), Ωₕ_quad)
-u_b = map(q -> u(q.coords), Γₕ_quad)
-du_b = map(q -> du(q.coords, q.normal), Γₕ_quad)
-f_d = map(q -> f(q.coords), Ωₕ_quad)
+f_d_nonpoly = [f_val(q.coords) for q in Ωₕ_quad]
+u_b_nonpoly = [u_exact(q.coords) for q in Γₕ_quad]
+du_b_nonpoly = [normal_deriv(q.coords, q.normal) for q in Γₕ_quad]
 
 op = k == 0 ? Inti.Laplace(; dim = 3) : Inti.Helmholtz(; dim = 3, k)
 
 ## Boundary operators
 tbnd = @elapsed begin
-    S_b2d, D_b2d = Inti.single_double_layer(;
+    S_b2d_std, D_b2d_std = Inti.single_double_layer(;
         op,
         target = Ωₕ_quad,
         source = Γₕ_quad,
-        compression = (method = :fmm, tol = 1.0e-8),
-        correction = (method = :dim, maxdist = 5 * meshsize, target_location = :inside),
+        compression,
+        correction = (method = :dim, maxdist = 5 * meshsize_bdry, target_location = :inside),
+        kernel_variant = :default,
     )
 end
-@info "Boundary operators time: $tbnd"
+@info "(Standard) Boundary operators time: $tbnd"
+#tbnd = @elapsed begin
+#    S_b2d_grad, D_b2d_grad = Inti.single_double_layer(;
+#        op,
+#        target = Ωₕ_quad,
+#        source = Γₕ_quad,
+#        compression,
+#        correction = (method = :dim, maxdist = 5 * meshsize_bdry, target_location = :inside),
+#        kernel_variant = :gradient,
+#    )
+#end
+#@info "(Gradient) Boundary operators time: $tbnd"
 
 ## Volume potentials
+#tvol = @elapsed begin
+#    V_d2d_std = Inti.volume_potential(;
+#        op,
+#        target = Ωₕ_quad,
+#        source = Ωₕ_quad,
+#        compression,
+#        correction = (
+#            method = :dim,
+#            interpolation_order,
+#            maxdist = 5 * meshsize_bdry,
+#            boundary = Γₕ_quad,
+#            S_b2d = S_b2d_std,
+#            D_b2d = D_b2d_std,
+#        ),
+#        kernel_variant = :default,
+#    )
+#end
+#@info "(Standard) Volume potential time: $tvol"
+#tvol = @elapsed begin
+#    V_d2d_grad = Inti.volume_potential(;
+#        op,
+#        target = Ωₕ_quad,
+#        source = Ωₕ_quad,
+#        compression,
+#        correction = (
+#            method = :dim,
+#            interpolation_order,
+#            maxdist = 5 * meshsize_bdry,
+#            boundary = Γₕ_quad,
+#            S_b2d = S_b2d_grad,
+#            D_b2d = D_b2d_grad,
+#        ),
+#        kernel_variant = :gradient,
+#    )
+#end
+#@info "(Gradient) Volume potential time: $tvol"
 tvol = @elapsed begin
-    V_d2d = Inti.volume_potential(;
+    W_d2d = Inti.volume_potential(;
         op,
         target = Ωₕ_quad,
         source = Ωₕ_quad,
-        compression = (method = :fmm, tol = 1.0e-8),
+        compression,
         correction = (
             method = :dim,
             interpolation_order,
-            maxdist = 5 * meshsize,
+            maxdist = 5 * meshsize_bdry,
             boundary = Γₕ_quad,
-            S_b2d = S_b2d,
-            D_b2d = D_b2d,
+            S_b2d = S_b2d_std,
+            D_b2d = D_b2d_std,
         ),
+        kernel_variant = :gradient_source,
     )
 end
-@info "Volume potential time: $tvol"
-
-vref = -u_d - D_b2d * u_b + S_b2d * du_b
-vapprox = V_d2d * f_d
-er = vref - vapprox
-
-ndofs = length(er)
-
-@show ndofs, meshsize, norm(er, Inf)
+@info "(W) Volume potential time: $tvol"
+
+# Standard plane wave test
+#u_d_nonpoly_std = [u_exact(q.coords) for q in Ωₕ_quad]
+#vref_nonpoly_std = u_d_nonpoly_std + D_b2d_std * u_b_nonpoly - S_b2d_std * du_b_nonpoly
+#vapprox_nonpoly_std = V_d2d_std * f_d_nonpoly
+#err_nonpoly_std = norm(vref_nonpoly_std - vapprox_nonpoly_std, Inf)
+#println("  Standard plane wave test: Max Error = ", err_nonpoly_std)
+#
+## Gradient plane wave test
+#u_d_nonpoly_grad = [grad_u_exact(q.coords) for q in Ωₕ_quad]
+#vref_nonpoly_grad = u_d_nonpoly_grad + D_b2d_grad * u_b_nonpoly - S_b2d_grad * du_b_nonpoly
+#vapprox_nonpoly_grad = V_d2d_grad * f_d_nonpoly
+#err_nonpoly_grad = norm(vref_nonpoly_grad - vapprox_nonpoly_grad, Inf)
+#println("  Gradient plane wave test: Max Error = ", err_nonpoly_grad)
+#println("  NDOFS: ", length(Ωₕ_quad))
+
+# ------------------------------------------------------------------------------
+# Non-polynomial convergence test:  g = ∇u with u harmonic  ⇒  div g = 0,
+# so  W[g] = -S[∂_ν u]  (closed form, exercising the volume residual).
+# ------------------------------------------------------------------------------
+#uₑ(x)  = exp(sqrt(2) * x[1]) * cos(x[2]) * cos(x[3])                 # harmonic: Δu = 0
+#gradu(x)    = SVector(sqrt(2) * exp(sqrt(2) * x[1]) * cos(x[2]) * cos(x[3]),
+#                    -exp(sqrt(2) * x[1]) * sin(x[2]) * cos(x[3]),
+#                    -exp(sqrt(2) * x[1]) * cos(x[2]) * sin(x[3]))
+#g_d   = [gradu(q.coords) for q in Ωₕ_quad]
+#du_b  = [dot(gradu(q.coords), q.normal) for q in Γₕ_quad]   # ∂_ν u = g⋅ν
+#w_ref = -(S_b2d_std * du_b)                                     # W[∇u] = -S[∂_ν u]
+#w_app = W_d2d * g_d
+
+#Ψ(x) = exp(x[1] + x[2]) * cos(x[3])
+#gradΨ(x) = SVector(exp(x[1] + x[2]) * cos(x[3]),
+#                   exp(x[1] + x[2]) * cos(x[3]),
+#                   -exp(x[1] + x[2]) * sin(x[3]))
+#g(x) = -1*SVector(1/3 * exp(x[1] + x[2]) * cos(x[3]),
+#               1/3 * exp(x[1] + x[2]) * cos(x[3]),
+#               1/3 * exp(x[1] + x[2]) * sin(x[3]) )
+
+α = 2π; β = α; γ = α;
+Ψ(x) = cos(α * x[1]) * sin(β * x[2]) * cos(γ * x[3])
+gradΨ(x) = SVector(-α*sin(α * x[1]) * sin(β * x[2]) * cos(γ * x[3]),
+                   β*cos(α * x[1]) * cos(β * x[2]) * cos(γ * x[3]),
+                   -γ*cos(α * x[1]) * sin(β * x[2]) * sin(γ * x[3]))
+g(x) = SVector(α * sin(α * x[1]) * sin(β * x[2]) * cos(γ * x[3]), 
+               -β * cos(α * x[1]) * cos(β * x[2]) * cos(γ * x[3]),
+               γ * cos(α * x[1]) * sin(β * x[2]) * sin(γ * x[3]))
+g_d = [g(q.coords) for q in Ωₕ_quad]
+Ψ_b = [Ψ(q.coords) for q in Γₕ_quad]
+Ψ_d = [Ψ(q.coords) for q in Ωₕ_quad]
+BvΨ_plus_gnu = [dot(gradΨ(q.coords), q.normal) for q in Γₕ_quad] + [dot(g(q.coords), q.normal) for q in Γₕ_quad]
+w_ref = Ψ_d + D_b2d_std * Ψ_b - S_b2d_std * BvΨ_plus_gnu
+w_app = W_d2d * g_d
+err = norm(w_app - w_ref, Inf)
+ndofs = length(w_app)
+@show ndofs, meshsize, err
diff --git a/test/convergence/vdim_potential_script_3d_X.jl b/test/convergence/vdim_potential_script_3d_X.jl
new file mode 100644
index 00000000..49127425
--- /dev/null
+++ b/test/convergence/vdim_potential_script_3d_X.jl
@@ -0,0 +1,273 @@
+# # High-order convergence of vdim
+
+using Inti
+using StaticArrays
+using Gmsh
+using LinearAlgebra
+using HMatrices
+using FMM3D
+using DataStructures
+using Dates
+
+tinit = time() # hide
+
+include(joinpath(@__DIR__, "../test_utils.jl"))
+outfile = joinpath(@__DIR__, "Xop_iunius5_results.tsv")
+logfile = joinpath(@__DIR__, "Xop_iunius5_run.log")
+function logmsg(msg)
+    line = "[$(Dates.now())] $msg"
+    println(line)
+    open(logfile, "a") do io
+        println(io, line)
+        flush(io)
+    end
+end
+
+open(outfile, "w") do io
+    println(
+        io,
+        "timestamp\tmeshsize\tinterpolation_order\tqorder\tbdry_qorder\tndofs\tvol_err\terr_inf\ttmsh\tttotal",
+    )
+end
+open(logfile, "w") do io end
+
+compression = (method = :fmm, tol = 1.0e-13, ndiv = 700)
+#compression = (method = :none,)
+
+meshsize = 0.4
+meshsize_bdry = meshsize
+Inti.clear_entities!()
+tmsh = @elapsed begin
+    Ω, msh =
+        gmsh_ball(; center = [0.0, 0.0, 0.0], radius = 1.0, meshsize = meshsize_bdry)
+    Γ = Inti.external_boundary(Ω)
+
+    Ωₕ = view(msh, Ω)
+    Γₕ = view(msh, Γ)
+end
+meshsize_bdry = meshsize
+Ωₕ_coarse = Ωₕ
+
+#Inti.clear_entities!()
+#tmsh = @elapsed begin
+#    Ω_coarse, msh_coarse =
+#        gmsh_ball(; center = [0.0, 0.0, 0.0], radius = 1.0, meshsize = meshsize)
+#    Γ_coarse = Inti.external_boundary(Ω_coarse)
+#
+#    Ωₕ_coarse = view(msh_coarse, Ω_coarse)
+#    Γₕ_coarse = view(msh_coarse, Γ_coarse)
+#end
+#@info "Mesh generation time: $tmsh"
+logmsg("Mesh generation time: $tmsh")
+
+interpolation_order = 1
+VR_qorder = Inti.Tetrahedron_VR_interpolation_order_to_quadrature_order(interpolation_order)
+bdry_qorder = 10 #min(2 * VR_qorder, 7)
+
+tquad = @elapsed begin
+    # Use VDIM with the Vioreanu-Rokhlin quadrature rule for Ωₕ
+    Q = Inti.VioreanuRokhlin(; domain = :tetrahedron, order = VR_qorder)
+    dict = OrderedDict(E => Q for E in Inti.element_types(Ωₕ_coarse))
+    Ωₕ_quad = Inti.Quadrature(Ωₕ_coarse, dict)
+    # Ωₕ_quad = Inti.Quadrature(Ωₕ; qorder = qorders[1])
+    #Qbdry = Inti.Gauss(; domain = :triangle, order = bdry_qorder)
+    Qbdry = Inti.VioreanuRokhlin(; domain = :triangle, order = bdry_qorder)
+    dictbdry = OrderedDict(E => Qbdry for E in Inti.element_types(Γₕ))
+    Γₕ_quad = Inti.Quadrature(Γₕ, dictbdry)
+end
+@info "Quadrature generation time: $tquad"
+
+k0 = π
+#k0 = 3π
+#k0 = 1
+#k = 2π # helmholtz
+k = 0 # laplace
+θ = SVector(sin(π / 3) * cos(π / 3), sin(π / 3) * sin(π / 3), cos(π / 3))
+#u  = (x) -> exp(im * k0 * dot(x, θ))
+#du = (x,n) -> im * k0 * dot(θ, n) * exp(im * k0 * dot(x, θ))
+u_exact = x -> cos(k0 * dot(x, θ))
+grad_u_exact = x -> -k0 * θ * sin(k0 * dot(x, θ))
+normal_deriv = (x, n) -> dot(grad_u_exact(x), n)
+#du = (x, n) -> -k0 * dot(θ, n) * sin(k0 * dot(x, θ))
+f_val = (x) -> (k0^2 - k^2) * u_exact(x)
+
+f_d_nonpoly = [f_val(q.coords) for q in Ωₕ_quad]
+u_b_nonpoly = [u_exact(q.coords) for q in Γₕ_quad]
+du_b_nonpoly = [normal_deriv(q.coords, q.normal) for q in Γₕ_quad]
+
+op = k == 0 ? Inti.Laplace(; dim = 3) : Inti.Helmholtz(; dim = 3, k)
+
+## Boundary operators
+tbnd = @elapsed begin
+    S_b2d_std, D_b2d_std = Inti.single_double_layer(;
+        op,
+        target = Ωₕ_quad,
+        source = Γₕ_quad,
+        compression,
+        correction = (method = :dim, maxdist = 5 * meshsize_bdry, target_location = :inside),
+        kernel_variant = :default,
+    )
+end
+@info "(Standard) Boundary operators time: $tbnd"
+tbnd = @elapsed begin
+    S_b2d_grad, D_b2d_grad = Inti.single_double_layer(;
+        op,
+        target = Ωₕ_quad,
+        source = Γₕ_quad,
+        compression,
+        correction = (method = :dim, maxdist = 5 * meshsize_bdry, target_location = :inside),
+        kernel_variant = :gradient,
+    )
+end
+@info "(Gradient) Boundary operators time: $tbnd"
+
+## Volume potentials
+#tvol = @elapsed begin
+#    V_d2d_std = Inti.volume_potential(;
+#        op,
+#        target = Ωₕ_quad,
+#        source = Ωₕ_quad,
+#        compression,
+#        correction = (
+#            method = :dim,
+#            interpolation_order,
+#            maxdist = 5 * meshsize_bdry,
+#            boundary = Γₕ_quad,
+#            S_b2d = S_b2d_std,
+#            D_b2d = D_b2d_std,
+#        ),
+#        kernel_variant = :default,
+#    )
+#end
+#@info "(Standard) Volume potential time: $tvol"
+#tvol = @elapsed begin
+#    V_d2d_grad = Inti.volume_potential(;
+#        op,
+#        target = Ωₕ_quad,
+#        source = Ωₕ_quad,
+#        compression,
+#        correction = (
+#            method = :dim,
+#            interpolation_order,
+#            maxdist = 5 * meshsize_bdry,
+#            boundary = Γₕ_quad,
+#            S_b2d = S_b2d_grad,
+#            D_b2d = D_b2d_grad,
+#        ),
+#        kernel_variant = :gradient,
+#    )
+#end
+#@info "(Gradient) Volume potential time: $tvol"
+tvol = @elapsed begin
+    X_d2d = Inti.volume_potential(;
+        op,
+        target = Ωₕ_quad,
+        source = Ωₕ_quad,
+        compression,
+        correction = (
+            method = :dim,
+            interpolation_order,
+            maxdist = 5 * meshsize_bdry,
+            boundary = Γₕ_quad,
+            S_b2d = S_b2d_std,
+            D_b2d = D_b2d_std,
+        ),
+        kernel_variant = :hessian_source,
+        #kernel_variant = :gradient_source,
+    )
+end
+@info "(X) Volume potential time: $tvol"
+
+# Standard plane wave test
+#u_d_nonpoly_std = [u_exact(q.coords) for q in Ωₕ_quad]
+#vref_nonpoly_std = u_d_nonpoly_std + D_b2d_std * u_b_nonpoly - S_b2d_std * du_b_nonpoly
+#vapprox_nonpoly_std = V_d2d_std * f_d_nonpoly
+#err_nonpoly_std = norm(vref_nonpoly_std - vapprox_nonpoly_std, Inf)
+#println("  Standard plane wave test: Max Error = ", err_nonpoly_std)
+#
+## Gradient plane wave test
+#u_d_nonpoly_grad = [grad_u_exact(q.coords) for q in Ωₕ_quad]
+#vref_nonpoly_grad = u_d_nonpoly_grad + D_b2d_grad * u_b_nonpoly - S_b2d_grad * du_b_nonpoly
+#vapprox_nonpoly_grad = V_d2d_grad * f_d_nonpoly
+#err_nonpoly_grad = norm(vref_nonpoly_grad - vapprox_nonpoly_grad, Inf)
+#println("  Gradient plane wave test: Max Error = ", err_nonpoly_grad)
+#println("  NDOFS: ", length(Ωₕ_quad))
+
+# ------------------------------------------------------------------------------
+# Non-polynomial convergence test:  g = ∇u with u harmonic  ⇒  div g = 0,
+# so  W[g] = -S[∂_ν u]  (closed form, exercising the volume residual).
+# ------------------------------------------------------------------------------
+#uₑ(x)  = exp(sqrt(2) * x[1]) * cos(x[2]) * cos(x[3])                 # harmonic: Δu = 0
+#gradu(x)    = SVector(sqrt(2) * exp(sqrt(2) * x[1]) * cos(x[2]) * cos(x[3]),
+#                    -exp(sqrt(2) * x[1]) * sin(x[2]) * cos(x[3]),
+#                    -exp(sqrt(2) * x[1]) * cos(x[2]) * sin(x[3]))
+#g_d   = [gradu(q.coords) for q in Ωₕ_quad]
+#du_b  = [dot(gradu(q.coords), q.normal) for q in Γₕ_quad]   # ∂_ν u = g⋅ν
+#w_ref = -(S_b2d_std * du_b)                                     # W[∇u] = -S[∂_ν u]
+#w_app = W_d2d * g_d
+
+#Ψ(x) = exp(x[1] + x[2]) * cos(x[3])
+#gradΨ(x) = SVector(exp(x[1] + x[2]) * cos(x[3]),
+#                   exp(x[1] + x[2]) * cos(x[3]),
+#                   -exp(x[1] + x[2]) * sin(x[3]))
+#g(x) = -1*SVector(1/3 * exp(x[1] + x[2]) * cos(x[3]),
+#               1/3 * exp(x[1] + x[2]) * cos(x[3]),
+#               1/3 * exp(x[1] + x[2]) * sin(x[3]) )
+
+α = π/4; β = α; γ = α;
+Ψ(x) = cos(α * x[1]) * sin(β * x[2]) * cos(γ * x[3])
+gradΨ(x) = SVector(-α*sin(α * x[1]) * sin(β * x[2]) * cos(γ * x[3]),
+                   β*cos(α * x[1]) * cos(β * x[2]) * cos(γ * x[3]),
+                   -γ*cos(α * x[1]) * sin(β * x[2]) * sin(γ * x[3]))
+g(x) = SVector(α * sin(α * x[1]) * sin(β * x[2]) * cos(γ * x[3]), 
+               -β * cos(α * x[1]) * cos(β * x[2]) * cos(γ * x[3]),
+               γ * cos(α * x[1]) * sin(β * x[2]) * sin(γ * x[3]))
+g_d = [g(q.coords) for q in Ωₕ_quad]
+Ψ_b = [Ψ(q.coords) for q in Γₕ_quad]
+Ψ_d = [Ψ(q.coords) for q in Ωₕ_quad]
+gradΨ_d = [gradΨ(q.coords) for q in Ωₕ_quad]
+BvΨ_plus_gnu = [dot(gradΨ(q.coords), q.normal) for q in Γₕ_quad] + [dot(g(q.coords), q.normal) for q in Γₕ_quad]
+w_ref = gradΨ_d + D_b2d_grad * Ψ_b - S_b2d_grad * BvΨ_plus_gnu
+#Id = [1 0 0; 0 1 0; 0 0 1]
+#Sdotg = similar(w_ref)
+#for i in 1:length(w_ref)
+#    Sdotg[i] = -1/3 * Id * g_d[i]
+#end
+#w_ref = Ψ_d + D_b2d_std * Ψ_b - S_b2d_std * BvΨ_plus_gnu
+w_app = X_d2d * g_d
+err = norm(w_app - w_ref, Inf)
+ndofs = length(w_app)
+@show ndofs, meshsize, err
+
+tend = time() # hide
+ttotal = tend - tinit
+logmsg("Example completed in $ttotal seconds") # hide
+ndofs = length(Ωₕ_quad)
+logmsg("ndofs: $ndofs")
+logmsg("Result meshsize=$(meshsize) ndofs=$(ndofs) err_inf=$(err)")
+
+open(outfile, "a") do io
+    println(
+        io,
+        string(
+            Dates.now(),
+            "\t",
+            meshsize,
+            "\t",
+            interpolation_order,
+            "\t",
+            VR_qorder,
+            "\t",
+            bdry_qorder,
+            "\t",
+            ndofs,
+            "\t",
+            err,
+            "\t",
+            tmsh,
+            "\t",
+            ttotal,
+        ),
+    )
+    flush(io)
+end
diff --git a/test/convergence/vdim_potential_script_3dvec.jl b/test/convergence/vdim_potential_script_3dvec.jl
new file mode 100644
index 00000000..7e805fce
--- /dev/null
+++ b/test/convergence/vdim_potential_script_3dvec.jl
@@ -0,0 +1,116 @@
+# # High-order convergence of vdim
+
+using Inti
+using StaticArrays
+using Gmsh
+using LinearAlgebra
+using HMatrices
+using FMM3D
+using CairoMakie
+
+include(joinpath(@__DIR__, "../test_utils.jl"))
+#compression = (method = :hmatrix, tol = 1.0e-8)
+compression = (method = :fmm, tol = 1.0e-13)
+
+meshsize = 0.05
+r1 = 1.0
+tmsh = @elapsed begin
+    r2 = 0.5
+    Ω, msh =
+        gmsh_ball(; center = [0.0, 0.0, 0.0], radius = r1, meshsize = meshsize)
+    Γ = Inti.external_boundary(Ω)
+    Ωₕ = view(msh, Ω)
+    Γₕ = view(msh, Γ)
+end
+@info "Mesh generation time: $tmsh"
+
+interpolation_order = 3
+VR_qorder = Inti.Tetrahedron_VR_interpolation_order_to_quadrature_order(interpolation_order)
+bdry_qorder = 2 * VR_qorder
+
+tquad = @elapsed begin
+    # Use VDIM with the Vioreanu-Rokhlin quadrature rule for Ωₕ
+    Q = Inti.VioreanuRokhlin(; domain = :tetrahedron, order = VR_qorder)
+    Ωₕ_quad = Inti.Quadrature(Ωₕ, Q)
+    # Ωₕ_quad = Inti.Quadrature(Ωₕ; qorder = qorders[1])
+    Qbdry = Inti.Gauss(; domain = :triangle, order = bdry_qorder)
+    Γₕ_quad = Inti.Quadrature(Γₕ, Qbdry)
+end
+@info "Quadrature generation time: $tquad"
+
+μ = 1.0
+op = Inti.Stokes(; dim = 3, μ)
+
+## Boundary operators
+tbnd = @elapsed begin
+    S_b2d, D_b2d = Inti.single_double_layer(;
+        op,
+        target = Ωₕ_quad,
+        source = Γₕ_quad,
+        compression,
+        correction = (method = :dim, maxdist = 5 * meshsize, target_location = :inside),
+    )
+end
+@info "Boundary operators time: $tbnd"
+
+## Volume potentials
+tvol = @elapsed begin
+    V_d2d = Inti.volume_potential(;
+        op,
+        target = Ωₕ_quad,
+        source = Ωₕ_quad,
+        compression,
+        correction = (
+            method = :dim,
+            interpolation_order,
+            maxdist = 5 * meshsize,
+            boundary = Γₕ_quad,
+            S_b2d = S_b2d,
+            D_b2d = D_b2d,
+        ),
+    )
+end
+@info "Volume potential time: $tvol"
+
+freq = 5.0
+u = (x) -> SVector{3, Float64}(cos(freq * x[2]), sin(freq * x[3]), cos(freq * x[1]))
+p = (x) -> sin(x[3]) * cos(x[2]) * cos(x[1])
+gradu = (x) -> SMatrix{3, 3, Float64}([0 0 -freq * sin(freq * x[1]); -freq * sin(freq * x[2]) 0 0; 0 freq * cos(freq * x[3]) 0])
+gradu_plustranspose = (x) -> gradu(x) + gradu(x)'
+du = (x, n) -> -p(x) * n + gradu_plustranspose(x) * n
+f = (x) -> μ * freq^2 * u(x) + SVector{3, Float64}(-sin(x[3]) * cos(x[2]) * sin(x[1]), -sin(x[3]) * sin(x[2]) * cos(x[1]), cos(x[3]) * cos(x[2]) * cos(x[1]))
+
+u_d = map(q -> u(q.coords), Ωₕ_quad)
+u_b = map(q -> u(q.coords), Γₕ_quad)
+du_b = map(q -> du(q.coords, q.normal), Γₕ_quad)
+f_d = map(q -> f(q.coords), Ωₕ_quad)
+
+vref = u_d + D_b2d * u_b - S_b2d * du_b
+vapprox = V_d2d * f_d
+er = vref - vapprox
+
+ndofs = length(er)
+
+@show ndofs, meshsize, norm(er, Inf)
+
+# verifies VDIM on polynomials -- Working
+#basis = Inti.polynomial_solutions_vdim(op, interpolation_order)
+#idx = 3
+#N = Inti.ambient_dimension(Ωₕ)
+#c = SVector(ntuple(i -> rand(), N)...)
+#f = (q) -> basis[idx].source(q) * c
+#u = (q) -> basis[idx].solution(q) * c
+#t = (q) -> basis[idx].neumann_trace(q) * c
+#u_d = map(q -> u(q), Ωₕ_quad)
+#u_b = map(q -> u(q), Γₕ_quad)
+#du_b = map(q -> t(q), Γₕ_quad)
+#f_d = map(q -> f(q), Ωₕ_quad)
+#
+## Compute reference solution: -u - D*u_b + S*du_b
+## This comes from Green's representation: u = S*t - D*u + V*f
+## So V*f = u - S*t + D*u, and we test -V*f = S*t - D*u - u
+#vref = -u_d - D_b2d * u_b + S_b2d * du_b
+#
+## Compute uncorrected approximation
+#vapprox = V_d2d * f_d
+#er = vref - vapprox
diff --git a/test/convergence/vdim_potential_script_3dvec_elasto.jl b/test/convergence/vdim_potential_script_3dvec_elasto.jl
new file mode 100644
index 00000000..90b14bd8
--- /dev/null
+++ b/test/convergence/vdim_potential_script_3dvec_elasto.jl
@@ -0,0 +1,106 @@
+# # High-order convergence of vdim
+
+using Inti
+using StaticArrays
+using Gmsh
+using LinearAlgebra
+using HMatrices
+using FMM3D
+using CairoMakie
+
+include(joinpath(@__DIR__, "../test_utils.jl"))
+#compression = (method = :fmm, tol = 1.0e-12)
+compression = (method = :hmatrix, tol = 1.0e-8)
+#compression = (method = :none,)
+
+meshsize = 0.2
+r1 = 1.0
+tmsh = @elapsed begin
+    r2 = 0.5
+    Ω, msh =
+        gmsh_ball(; center = [0.0, 0.0, 0.0], radius = r1, meshsize = meshsize)
+    Γ = Inti.external_boundary(Ω)
+    Ωₕ = view(msh, Ω)
+    Γₕ = view(msh, Γ)
+end
+@info "Mesh generation time: $tmsh"
+
+interpolation_order = 2
+VR_qorder = Inti.Tetrahedron_VR_interpolation_order_to_quadrature_order(interpolation_order)
+bdry_qorder = 2 * VR_qorder
+
+tquad = @elapsed begin
+    # Use VDIM with the Vioreanu-Rokhlin quadrature rule for Ωₕ
+    Q = Inti.VioreanuRokhlin(; domain = :tetrahedron, order = VR_qorder)
+    Ωₕ_quad = Inti.Quadrature(Ωₕ, Q)
+    # Ωₕ_quad = Inti.Quadrature(Ωₕ; qorder = qorders[1])
+    Qbdry = Inti.Gauss(; domain = :triangle, order = bdry_qorder)
+    Γₕ_quad = Inti.Quadrature(Γₕ, Qbdry)
+end
+@info "Quadrature generation time: $tquad"
+
+μ = 1.0
+λ = 2.0
+#op = Inti.Stokes(; dim = 3, μ)
+op = Inti.Elastostatic(; dim = 3, μ, λ)
+
+## Boundary operators
+tbnd = @elapsed begin
+    S_b2d, D_b2d = Inti.single_double_layer(;
+        op,
+        target = Ωₕ_quad,
+        source = Γₕ_quad,
+        compression,
+        correction = (method = :dim, maxdist = 5 * meshsize, target_location = :inside),
+    )
+end
+@info "Boundary operators time: $tbnd"
+
+## Volume potentials
+tvol = @elapsed begin
+    V_d2d = Inti.volume_potential(;
+        op,
+        target = Ωₕ_quad,
+        source = Ωₕ_quad,
+        compression,
+        correction = (
+            method = :dim,
+            interpolation_order,
+            maxdist = 5 * meshsize,
+            boundary = Γₕ_quad,
+            S_b2d = S_b2d,
+            D_b2d = D_b2d,
+        ),
+    )
+end
+@info "Volume potential time: $tvol"
+
+# Stokes
+#u = (x) -> SVector{3, Float64}(cos(x[2]), sin(x[3]), cos(x[1]))
+#p = (x) -> sin(x[3]) * cos(x[2]) * cos(x[1])
+#gradu = (x) -> SMatrix{3, 3, Float64}([0 0 -sin(x[1]); -sin(x[2]) 0 0; 0 cos(x[3]) 0])
+#gradu_plustranspose = (x) -> gradu(x) + gradu(x)'
+#du = (x, n) -> -p(x) * n + gradu_plustranspose(x) * n
+#f = (x) -> μ * u(x) + SVector{3, Float64}(-sin(x[3]) * cos(x[2]) * sin(x[1]), -sin(x[3]) * sin(x[2]) * cos(x[1]), cos(x[3]) * cos(x[2]) * cos(x[1]))
+
+# Elastostatics
+u = (x) -> SVector{3, Float64}(cos(x[1]) * sin(x[2]), sin(x[3]) * cos(x[2]), cos(x[3]) * sin(x[1]))
+graddivu = (x) -> -SVector{3, Float64}(cos(x[1]) * sin(x[2]) + sin(x[3]) * cos(x[1]), sin(x[1]) * cos(x[2]) + sin(x[3]) * cos(x[2]), cos(x[3]) * sin(x[2]) + cos(x[3]) * sin(x[1]))
+gradu = (x) -> SMatrix{3, 3, Float64}(-sin(x[1]) * sin(x[2]), 0, cos(x[3]) * cos(x[1]), cos(x[1]) * cos(x[2]), -sin(x[3]) * sin(x[2]), 0, 0, cos(x[3]) * cos(x[2]), -sin(x[3]) * sin(x[1]))
+f = (x) -> 2 * μ * u(x) - (μ + λ) * graddivu(x)
+divu = (x) -> -sin(x[1]) * sin(x[2]) - sin(x[3]) * sin(x[2]) - sin(x[3]) * sin(x[1])
+curlv = (x) -> SVector{3, Float64}(-cos(x[3]) * cos(x[2]), -cos(x[3]) * cos(x[1]), -cos(x[1]) * cos(x[2]))
+du = (x, n) -> λ * divu(x) * n + 2 * μ * gradu(x) * n + μ * cross(n, curlv(x))
+
+u_d = map(q -> u(q.coords), Ωₕ_quad)
+u_b = map(q -> u(q.coords), Γₕ_quad)
+du_b = map(q -> du(q.coords, q.normal), Γₕ_quad)
+f_d = map(q -> f(q.coords), Ωₕ_quad)
+
+vref = u_d + D_b2d * u_b - S_b2d * du_b
+vapprox = V_d2d * f_d
+er = vref - vapprox
+
+ndofs = length(er)
+
+@show ndofs, meshsize, norm(er, Inf)
diff --git a/test/fmm2d_test.jl b/test/fmm2d_test.jl
index bbc4fdd3..54e91c07 100644
--- a/test/fmm2d_test.jl
+++ b/test/fmm2d_test.jl
@@ -3,7 +3,9 @@ using Inti
 using FMM2D
 using Gmsh
 using LinearAlgebra
+using OrderedCollections
 using Random
+using StaticArrays
 
 include("test_utils.jl")
 
@@ -19,16 +21,20 @@ include("test_utils.jl")
 
 for op in (Inti.Laplace(; dim = 2), Inti.Helmholtz(; dim = 2, k = 1.2))
     @testset "PDE: $op" begin
+    op = Inti.Laplace(; dim = 2)
         for K in (
                 Inti.DoubleLayerKernel(op),
                 Inti.SingleLayerKernel(op),
                 Inti.AdjointDoubleLayerKernel(op),
                 Inti.HyperSingularKernel(op),
+                Inti.GradientSingleLayerKernel(op),
+                Inti.GradientDoubleLayerKernel(op),
             )
+            K = Inti.GradientDoubleLayerKernel(op)
             for Γ_quad in (Γ₁_quad, Γ₂_quad)
                 iop = Inti.IntegralOperator(K, Γ₁_quad, Γ_quad)
                 iop_fmm = Inti.assemble_fmm(iop; rtol = 1.0e-8)
-                x = rand(eltype(iop), size(iop, 2))
+                x = rand(Inti.default_density_eltype(op), size(iop, 2))
                 yapprox = iop_fmm * x
                 # test on a given index set
                 idx_test = rand(1:size(iop, 1), 10)
@@ -40,3 +46,125 @@ for op in (Inti.Laplace(; dim = 2), Inti.Helmholtz(; dim = 2, k = 1.2))
         end
     end
 end
+
+@testset "VDIM grad V Operator for Laplace 2D" begin
+    op = Inti.Laplace(; dim = 2)
+
+    # Create a coarse mesh for this test
+    Ω_coarse, msh_coarse = gmsh_disk(; center = [0.0, 0.0], rx = 1.0, ry = 1.0, meshsize = 0.4)
+    Γ_coarse = Inti.external_boundary(Ω_coarse)
+    Γₕ_quad = Inti.Quadrature(view(msh_coarse, Γ_coarse); qorder = 4)
+
+    # Create volume quadrature
+    Ωₕ = view(msh_coarse, Ω_coarse)
+    VR_qorder = Inti.Triangle_VR_interpolation_order_to_quadrature_order(2)
+    Q = Inti.VioreanuRokhlin(; domain = :triangle, order = VR_qorder)
+    Ωₕ_quad = Inti.Quadrature(Ωₕ, OrderedDict(E => Q for E in Inti.element_types(Ωₕ)))
+
+    # Build gradient operators with FMM
+    S, D = Inti.single_double_layer(;
+        op, target = Ωₕ_quad, source = Γₕ_quad,
+        compression = (method = :fmm, tol = 1.0e-10),
+        correction = (method = :dim, maxdist = 0.5, target_location = :inside),
+        kernel_variant = :gradient
+    )
+    V = Inti.volume_potential(;
+        op, target = Ωₕ_quad, source = Ωₕ_quad,
+        compression = (method = :fmm, tol = 1.0e-10), correction = (method = :none,),
+        kernel_variant = :gradient
+    )
+    δV = Inti.vdim_correction(
+        op, Ωₕ_quad, Ωₕ_quad, Γₕ_quad, S, D, V;
+        green_multiplier = -ones(length(Ωₕ_quad)), interpolation_order = 2,
+        kernel_variant = :gradient
+    )
+
+    # Test Green's identity with polynomial solution
+    basis = Inti.polynomial_solutions_vdim(op, 2)
+
+    # Use polynomial 2: x
+    u_d = [basis[2].gradient_solution(q) for q in Ωₕ_quad]
+    u_b = [basis[2].solution(q) for q in Γₕ_quad]
+    du_b = [basis[2].neumann_trace(q) for q in Γₕ_quad]
+    f_d = [basis[2].source(q) for q in Ωₕ_quad]
+
+    vref = u_d + D * u_b - S * du_b
+    vapprox = V * f_d + δV * f_d
+    @test norm(vref - vapprox, Inf) < 1.0e-10
+end
+
+@testset "W operator (FMM vs dense) 2D" begin
+    Ω_c, msh_c = gmsh_disk(; center = [0.0, 0.0], rx = 1.0, ry = 1.0, meshsize = 0.2)
+    Γₕ_quad = Inti.Quadrature(view(msh_c, Inti.external_boundary(Ω_c)); qorder = 4)
+    Ωₕ = view(msh_c, Ω_c)
+    VR_qorder = Inti.Triangle_VR_interpolation_order_to_quadrature_order(2)
+    Q = Inti.VioreanuRokhlin(; domain = :triangle, order = VR_qorder)
+    Ωₕ_quad = Inti.Quadrature(Ωₕ, OrderedDict(E => Q for E in Inti.element_types(Ωₕ)))
+    for (op, Tout) in (
+            (Inti.Laplace(; dim = 2), Float64),
+            (Inti.Helmholtz(; dim = 2, k = 1.2), ComplexF64),
+        )
+        @testset "PDE: $op" begin
+            cor = (method = :dim, maxdist = 0.5, boundary = Γₕ_quad, interpolation_order = 2)
+            Wd = Inti.volume_potential(; op, target = Ωₕ_quad, source = Ωₕ_quad,
+                compression = (method = :none,), correction = cor, kernel_variant = :gradient_source)
+            Wf = Inti.volume_potential(; op, target = Ωₕ_quad, source = Ωₕ_quad,
+                compression = (method = :fmm, tol = 1.0e-12), correction = cor, kernel_variant = :gradient_source)
+            g = [rand(SVector{2, Tout}) for _ in 1:length(Ωₕ_quad)]
+            yd = Wd * g
+            yf = Wf * g
+            @test eltype(yd) == Tout
+            @test eltype(yf) == Tout
+            @test norm(yf - yd, Inf) / norm(yd, Inf) < 1.0e-6
+        end
+    end
+end
+
+@testset "X charge→Hessian volume op (FMM vs dense) Laplace 2D" begin
+    # Validates `assemble_fmm_chargehessian` (the scalar-density → `SMatrix` Hessian
+    # single-layer volume operator used by the X = ∇W VDIM correction) against the dense
+    # Hessian operator.
+    op = Inti.Laplace(; dim = 2)
+    Ω_c, msh_c = gmsh_disk(; center = [0.0, 0.0], rx = 1.0, ry = 1.0, meshsize = 0.2)
+    Ωₕ = view(msh_c, Ω_c)
+    VR_qorder = Inti.Triangle_VR_interpolation_order_to_quadrature_order(2)
+    Q = Inti.VioreanuRokhlin(; domain = :triangle, order = VR_qorder)
+    Ωₕ_quad = Inti.Quadrature(Ωₕ, OrderedDict(E => Q for E in Inti.element_types(Ωₕ)))
+    Vh = Inti.IntegralOperator(Inti.HessianSingleLayerKernel(op), Ωₕ_quad, Ωₕ_quad)
+    m, n = size(Vh)
+    ρ = rand(n)
+    # dense reference: Mᵢ = Σₖ Vh[i,k] ρₖ  (an SMatrix per target)
+    Mref = [sum(Vh[i, k] * ρ[k] for k in 1:n) for i in 1:m]
+    Vfmm = Inti.assemble_fmm_chargehessian(Vh; rtol = 1.0e-12)
+    @test eltype(Vfmm) == SMatrix{2, 2, Float64, 4}
+    Mfmm = Vector{SMatrix{2, 2, Float64, 4}}(undef, m)
+    mul!(Mfmm, Vfmm, ρ)
+    @test norm(Mref - Mfmm, Inf) / norm(Mref, Inf) < 1.0e-8
+end
+
+@testset "X (∇W) operator (FMM vs dense) 2D" begin
+    Ω_c, msh_c = gmsh_disk(; center = [0.0, 0.0], rx = 1.0, ry = 1.0, meshsize = 0.2)
+    Γₕ_quad = Inti.Quadrature(view(msh_c, Inti.external_boundary(Ω_c)); qorder = 4)
+    Ωₕ = view(msh_c, Ω_c)
+    VR_qorder = Inti.Triangle_VR_interpolation_order_to_quadrature_order(2)
+    Q = Inti.VioreanuRokhlin(; domain = :triangle, order = VR_qorder)
+    Ωₕ_quad = Inti.Quadrature(Ωₕ, OrderedDict(E => Q for E in Inti.element_types(Ωₕ)))
+    cor = (method = :dim, maxdist = 0.5, boundary = Γₕ_quad, interpolation_order = 2)
+    for (op, Tout) in (
+            (Inti.Laplace(; dim = 2), Float64),
+            (Inti.Helmholtz(; dim = 2, k = 1.2), ComplexF64),
+        )
+        @testset "PDE: $op" begin
+            Xd = Inti.volume_potential(; op, target = Ωₕ_quad, source = Ωₕ_quad,
+                compression = (method = :none,), correction = cor, kernel_variant = :hessian_source)
+            Xf = Inti.volume_potential(; op, target = Ωₕ_quad, source = Ωₕ_quad,
+                compression = (method = :fmm, tol = 1.0e-12), correction = cor, kernel_variant = :hessian_source)
+            g = [rand(SVector{2, Tout}) for _ in 1:length(Ωₕ_quad)]
+            yd = Xd * g
+            yf = Xf * g
+            @test eltype(yd) == SVector{2, Tout}
+            @test eltype(yf) == SVector{2, Tout}
+            @test norm(yf - yd, Inf) / norm(yd, Inf) < 1.0e-6
+        end
+    end
+end
\ No newline at end of file
diff --git a/test/fmm3d_test.jl b/test/fmm3d_test.jl
index fc805d87..9d4b18cd 100644
--- a/test/fmm3d_test.jl
+++ b/test/fmm3d_test.jl
@@ -4,6 +4,8 @@ using FMM3D
 using Gmsh
 using LinearAlgebra
 using Random
+using OrderedCollections
+using StaticArrays
 
 include("test_utils.jl")
 
@@ -27,10 +29,14 @@ for op in (
                 Inti.SingleLayerKernel(op),
                 Inti.AdjointDoubleLayerKernel(op),
                 Inti.HyperSingularKernel(op),
+                Inti.GradientSingleLayerKernel(op),
+                Inti.GradientDoubleLayerKernel(op),
             )
             # TODO Stokes has only single and double layer implemented for now
             (K isa Inti.AdjointDoubleLayerKernel && op isa Inti.Stokes) && continue
             (K isa Inti.HyperSingularKernel && op isa Inti.Stokes) && continue
+            (K isa Inti.GradientSingleLayerKernel && op isa Inti.Stokes) && continue
+            (K isa Inti.GradientDoubleLayerKernel && op isa Inti.Stokes) && continue
             for Γ_quad in (Γ₁_quad, Γ₂_quad)
                 iop = Inti.IntegralOperator(K, Γ₁_quad, Γ_quad)
                 iop_fmm = Inti.assemble_fmm(iop; rtol = 1.0e-8)
@@ -44,3 +50,147 @@ for op in (
         end
     end
 end
+
+# Test VDIM correction with FMM for Stokes (vector-valued PDE)
+@testset "VDIM + FMM for Stokes 3D" begin
+    op = Inti.Stokes(; μ = 1.0, dim = 3)
+
+    # Create a coarse mesh for this test
+    Ω_coarse, msh_coarse = gmsh_ball(; center = [0.0, 0.0, 0.0], radius = 1.0, meshsize = 0.4)
+    Γ_coarse = Inti.external_boundary(Ω_coarse)
+    Γₕ_quad = Inti.Quadrature(view(msh_coarse, Γ_coarse); qorder = 4)
+
+    # Create volume quadrature
+    Ωₕ = view(msh_coarse, Ω_coarse)
+    VR_qorder = Inti.Tetrahedron_VR_interpolation_order_to_quadrature_order(2)
+    Q = Inti.VioreanuRokhlin(; domain = :tetrahedron, order = VR_qorder)
+    Ωₕ_quad = Inti.Quadrature(Ωₕ, OrderedDict(E => Q for E in Inti.element_types(Ωₕ)))
+
+    # Build operators with FMM
+    S, D = Inti.single_double_layer(;
+        op, target = Ωₕ_quad, source = Γₕ_quad,
+        compression = (method = :fmm, tol = 1.0e-10),
+        correction = (method = :dim, maxdist = 0.5, target_location = :inside)
+    )
+    V = Inti.volume_potential(;
+        op, target = Ωₕ_quad, source = Ωₕ_quad,
+        compression = (method = :fmm, tol = 1.0e-10), correction = (method = :none,)
+    )
+    δV = Inti.vdim_correction(
+        op, Ωₕ_quad, Ωₕ_quad, Γₕ_quad, S, D, V;
+        green_multiplier = -ones(length(Ωₕ_quad)), interpolation_order = 2
+    )
+
+    # Test Green's identity with polynomial solution
+    basis = Inti.polynomial_solutions_vdim(op, 2)
+    c = SVector(1.0, 2.0, 3.0)
+    u_d = [basis[1].solution(q) * c for q in Ωₕ_quad]
+    u_b = [basis[1].solution(q) * c for q in Γₕ_quad]
+    du_b = [basis[1].neumann_trace(q) * c for q in Γₕ_quad]
+    f_d = [basis[1].source(q) * c for q in Ωₕ_quad]
+
+    vref = u_d + D * u_b - S * du_b
+    vapprox = V * f_d + δV * f_d
+    @test norm(vref - vapprox, Inf) < 1.0e-10
+end
+
+
+@testset "VDIM grad V Operator for Laplace 3D" begin
+    op = Inti.Laplace(; dim = 3)
+
+    # Create a coarse mesh for this test
+    Ω_coarse, msh_coarse = gmsh_ball(; center = [0.0, 0.0, 0.0], radius = 1.0, meshsize = 0.4)
+    Γ_coarse = Inti.external_boundary(Ω_coarse)
+    Γₕ_quad = Inti.Quadrature(view(msh_coarse, Γ_coarse); qorder = 4)
+
+    # Create volume quadrature
+    Ωₕ = view(msh_coarse, Ω_coarse)
+    VR_qorder = Inti.Tetrahedron_VR_interpolation_order_to_quadrature_order(2)
+    Q = Inti.VioreanuRokhlin(; domain = :tetrahedron, order = VR_qorder)
+    Ωₕ_quad = Inti.Quadrature(Ωₕ, OrderedDict(E => Q for E in Inti.element_types(Ωₕ)))
+
+    # Build gradient operators with FMM
+    S, D = Inti.single_double_layer(;
+        op, target = Ωₕ_quad, source = Γₕ_quad,
+        compression = (method = :fmm, tol = 1.0e-10),
+        correction = (method = :dim, maxdist = 0.5, target_location = :inside),
+        kernel_variant = :gradient
+    )
+    V = Inti.volume_potential(;
+        op, target = Ωₕ_quad, source = Ωₕ_quad,
+        compression = (method = :fmm, tol = 1.0e-10), correction = (method = :none,),
+        kernel_variant = :gradient
+    )
+    δV = Inti.vdim_correction(
+        op, Ωₕ_quad, Ωₕ_quad, Γₕ_quad, S, D, V;
+        green_multiplier = -ones(length(Ωₕ_quad)), interpolation_order = 2,
+        kernel_variant = :gradient
+    )
+
+    # Test Green's identity with polynomial solution
+    basis = Inti.polynomial_solutions_vdim(op, 2)
+    
+    # Use polynomial 2: x
+    u_d = [basis[2].gradient_solution(q) for q in Ωₕ_quad]
+    u_b = [basis[2].solution(q) for q in Γₕ_quad]
+    du_b = [basis[2].neumann_trace(q) for q in Γₕ_quad]
+    f_d = [basis[2].source(q) for q in Ωₕ_quad]
+
+    vref = u_d + D * u_b - S * du_b
+    vapprox = V * f_d + δV * f_d
+    @test norm(vref - vapprox, Inf) < 1.0e-10
+end
+
+@testset "W operator (FMM vs dense) 3D" begin
+    Ω_c, msh_c = gmsh_ball(; center = [0.0, 0.0, 0.0], radius = 1.0, meshsize = 0.4)
+    Γₕ_quad = Inti.Quadrature(view(msh_c, Inti.external_boundary(Ω_c)); qorder = 4)
+    Ωₕ = view(msh_c, Ω_c)
+    VR_qorder = Inti.Tetrahedron_VR_interpolation_order_to_quadrature_order(1)
+    Q = Inti.VioreanuRokhlin(; domain = :tetrahedron, order = VR_qorder)
+    Ωₕ_quad = Inti.Quadrature(Ωₕ, OrderedDict(E => Q for E in Inti.element_types(Ωₕ)))
+    for (op, Tout) in (
+            (Inti.Laplace(; dim = 3), Float64),
+            (Inti.Helmholtz(; dim = 3, k = 1.2), ComplexF64),
+        )
+        @testset "PDE: $op" begin
+            cor = (method = :dim, maxdist = 0.5, boundary = Γₕ_quad, interpolation_order = 1)
+            Wd = Inti.volume_potential(; op, target = Ωₕ_quad, source = Ωₕ_quad,
+                compression = (method = :none,), correction = cor, kernel_variant = :gradient_source)
+            Wf = Inti.volume_potential(; op, target = Ωₕ_quad, source = Ωₕ_quad,
+                compression = (method = :fmm, tol = 1.0e-12), correction = cor, kernel_variant = :gradient_source)
+            g = [rand(SVector{3, Tout}) for _ in 1:length(Ωₕ_quad)]
+            yd = Wd * g
+            yf = Wf * g
+            @test eltype(yd) == Tout
+            @test eltype(yf) == Tout
+            @test norm(yf - yd, Inf) / norm(yd, Inf) < 1.0e-6
+        end
+    end
+end
+
+@testset "X (∇W) operator (FMM vs dense) 3D" begin
+    Ω_c, msh_c = gmsh_ball(; center = [0.0, 0.0, 0.0], radius = 1.0, meshsize = 0.4)
+    Γₕ_quad = Inti.Quadrature(view(msh_c, Inti.external_boundary(Ω_c)); qorder = 4)
+    Ωₕ = view(msh_c, Ω_c)
+    VR_qorder = Inti.Tetrahedron_VR_interpolation_order_to_quadrature_order(1)
+    Q = Inti.VioreanuRokhlin(; domain = :tetrahedron, order = VR_qorder)
+    Ωₕ_quad = Inti.Quadrature(Ωₕ, OrderedDict(E => Q for E in Inti.element_types(Ωₕ)))
+    cor = (method = :dim, maxdist = 0.5, boundary = Γₕ_quad, interpolation_order = 1)
+    for (op, Tout) in (
+            (Inti.Laplace(; dim = 3), Float64),
+            (Inti.Helmholtz(; dim = 3, k = 1.2), ComplexF64),
+        )
+        @testset "PDE: $op" begin
+            Xd = Inti.volume_potential(; op, target = Ωₕ_quad, source = Ωₕ_quad,
+                compression = (method = :none,), correction = cor, kernel_variant = :hessian_source)
+            Xf = Inti.volume_potential(; op, target = Ωₕ_quad, source = Ωₕ_quad,
+                compression = (method = :fmm, tol = 1.0e-12), correction = cor, kernel_variant = :hessian_source)
+            g = [rand(SVector{3, Tout}) for _ in 1:length(Ωₕ_quad)]
+            yd = Xd * g
+            yf = Xf * g
+            @test eltype(yd) == SVector{3, Tout}
+            @test eltype(yf) == SVector{3, Tout}
+            @test norm(yf - yd, Inf) / norm(yd, Inf) < 1.0e-6
+        end
+    end
+end
diff --git a/test/green_identities_test.jl b/test/green_identities_test.jl
index b72611e0..341499b1 100644
--- a/test/green_identities_test.jl
+++ b/test/green_identities_test.jl
@@ -11,16 +11,6 @@ using StaticArrays
 using QPGreen
 using ForwardDiff
 
-## Extend QuadGK to support ForwardDiff.Dual types (see https://github.com/JuliaMath/QuadGK.jl/issues/122)
-## (only needed if computing derivatives of QPGreen using ForwardDiff)
-# using QuadGK
-# function QuadGK.cachedrule(
-#     ::Type{<:ForwardDiff.Dual{<:Any,T}},
-#     n::Integer,
-# ) where {T<:Number}
-#     return QuadGK._cachedrule(typeof(float(real(one(T)))), Int(n))
-# end
-
 include("test_utils.jl")
 
 Random.seed!(1)
@@ -29,99 +19,120 @@ Random.seed!(1)
 rtol1 = 1.0e-2 # single and double layer
 rtol2 = 5.0e-2 # hypersingular (higher tolerance to avoid use of fine mesh + long unit tests)
 dims = (2, 3)
-meshsize = 0.5
+meshsize    = 0.2  # 2D mesh
+meshsize_3d = 0.2  # same as 2D; kept separate so it can be tuned independently
 types = (:interior, :exterior)
 
-corrections = [(method = :dim,), (method = :adaptive, maxdist = 2 * meshsize, rtol = 1.0e-2)]
+# corrections defined inside the loop to support dimension-dependent tolerances
 
-for correction in corrections
-    @testset "Method = $(correction.method)" begin
-        for N in dims
-            # create geometry
-            Inti.clear_entities!()
+# Build mesh once per dimension and loop corrections inside, so the geometry is not
+# recreated for every correction method.
+for N in dims
+    Inti.clear_entities!()
+    tol_adaptive = N == 2 ? 1.0e-2 : 1.0e-3
+    corrections = [
+        (method = :dim,),
+        (method = :adaptive, maxdist = 2 * (N == 2 ? meshsize : meshsize_3d), rtol = tol_adaptive, atol = tol_adaptive)
+    ]
+    msize = N == 2 ? meshsize : meshsize_3d
+    local Γ
+    if N == 2
+        Γ = Inti.parametric_curve(x -> SVector(cos(x), sin(x)), 0.0, 2π) |> Inti.Domain
+        quad = Inti.Quadrature(Γ; meshsize = msize, qorder = 3)
+    else
+        Ω = Inti.GeometricEntity("ellipsoid") |> Inti.Domain
+        Γ = Inti.external_boundary(Ω)
+        quad = Inti.Quadrature(Γ; meshsize = msize, qorder = 3)
+    end
+
+    for correction in corrections
+        @testset "Method = $(correction.method), $(N)d" begin
+            ops = (
+                Inti.Laplace(; dim = N),
+                Inti.Helmholtz(; k = 1.2, dim = N),
+                Inti.Stokes(; μ = 1.2, dim = N),
+                Inti.Elastostatic(; λ = 1, μ = 1, dim = N),
+            )
+            # periodic operators only defined in 2D
             if N == 2
-                Γ =
-                    Inti.parametric_curve(x -> SVector(cos(x), sin(x)), 0.0, 2π) |>
-                    Inti.Domain
-                quad = Inti.Quadrature(Γ; meshsize, qorder = 5)
-            else
-                # Ω, msh = gmsh_ball(; center = [0.0, 0.0, 0.0], radius = 1.0, meshsize = 0.2)
-                Ω = Inti.GeometricEntity("ellipsoid") |> Inti.Domain
-                Γ = Inti.external_boundary(Ω)
-                quad = Inti.Quadrature(Γ; meshsize, qorder = 5)
-            end
-            for t in types
-                σ = t == :interior ? 1 / 2 : -1 / 2
                 ops = (
-                    Inti.Laplace(; dim = N),
-                    Inti.Helmholtz(; k = 1.2, dim = N),
-                    Inti.Stokes(; μ = 1.2, dim = N),
-                    Inti.Elastostatic(; λ = 1, μ = 1, dim = N),
+                    Inti.LaplacePeriodic1D(; dim = N, period = 2π),
+                    Inti.HelmholtzPeriodic1D(; alpha = 0.3, k = 1.2, dim = N),
+                    ops...,
                 )
-                # periodic Laplace only defined for 2d, so we add it conditionally
-                if N == 2
-                    ops = (
-                        Inti.LaplacePeriodic1D(; dim = N, period = 2π),
-                        Inti.HelmholtzPeriodic1D(; alpha = 0.3, k = 1.2, dim = N),
-                        ops...,
+            end
+
+            for op in ops
+                @testset "Greens identity $(N)d $op" begin
+                    # HelmholtzPeriodic1D with adaptive correction needs higher quadrature order.
+                    quad_op = if op isa
+                                 Base.get_extension(Inti, :IntiQPGreenExt).HelmholtzPeriodic1D &&
+                                 correction.method == :adaptive
+                        Inti.Quadrature(Γ; meshsize = msize, qorder = 5)
+                    else
+                        quad
+                    end
+
+                    # ── Single / double layer ─────────────────────────────────────────────
+                    # Uncorrected and corrected operators are independent of the test point
+                    # (interior vs exterior), so build them once and reuse across both types.
+                    Smat = Inti.assemble_matrix(Inti.IntegralOperator(Inti.SingleLayerKernel(op), quad_op))
+                    Dmat = Inti.assemble_matrix(Inti.IntegralOperator(Inti.DoubleLayerKernel(op), quad_op))
+                    S, D = Inti.single_double_layer(;
+                        op,
+                        target = quad_op,
+                        source = quad_op,
+                        compression = (method = :none,),
+                        correction,
                     )
-                end
-                for op in ops
-                    @testset "Greens identity ($t) $(N)d $op" begin
-                        if op isa
-                                Base.get_extension(Inti, :IntiQPGreenExt).HelmholtzPeriodic1D &&
-                                correction ==
-                                (method = :adaptive, maxdist = 2 * meshsize, rtol = 1.0e-2)
-                            quad = Inti.Quadrature(Γ; meshsize = 0.2, qorder = 5)
-                        end
-                        xs = t == :interior ? ntuple(i -> 3, N) : ntuple(i -> 0.1, N)
-                        T = Inti.default_density_eltype(op)
-                        c = rand(T)
-                        u = (qnode) -> Inti.SingleLayerKernel(op)(qnode, xs) * c
-                        dudn = (qnode) -> Inti.AdjointDoubleLayerKernel(op)(qnode, xs) * c
-                        γ₀u = map(u, quad)
-                        γ₁u = map(dudn, quad)
-                        γ₀u_norm = norm(norm.(γ₀u, Inf), Inf)
-                        γ₁u_norm = norm(norm.(γ₁u, Inf), Inf)
-                        # single and double layer
-                        G = Inti.SingleLayerKernel(op)
-                        Sop = Inti.IntegralOperator(G, quad)
-                        Smat = Inti.assemble_matrix(Sop)
-                        dG = Inti.DoubleLayerKernel(op)
-                        Dop = Inti.IntegralOperator(dG, quad)
-                        Dmat = Inti.assemble_matrix(Dop)
-                        e0 = norm(Smat * γ₁u - Dmat * γ₀u - σ * γ₀u, Inf) / γ₀u_norm
-                        S, D = Inti.single_double_layer(;
-                            op,
-                            target = quad,
-                            source = quad,
-                            compression = (method = :none,),
-                            correction,
-                        )
-                        e1 = norm(S * γ₁u - D * γ₀u - σ * γ₀u, Inf) / γ₀u_norm
-                        @testset "Single/double layer $(string(op))" begin
+
+                    @testset "Single/double layer $(string(op))" begin
+                        for t in types
+                            σ  = t == :interior ? 1 / 2 : -1 / 2
+                            xs = t == :interior ? ntuple(i -> 3, N) : ntuple(i -> 0.1, N)
+                            T  = Inti.default_density_eltype(op)
+                            c  = rand(T)
+                            u    = (qnode) -> Inti.SingleLayerKernel(op)(qnode, xs) * c
+                            dudn = (qnode) -> Inti.AdjointDoubleLayerKernel(op)(qnode, xs) * c
+                            γ₀u = map(u, quad_op)
+                            γ₁u = map(dudn, quad_op)
+                            γ₀u_norm = norm(norm.(γ₀u, Inf), Inf)
+                            e0 = norm(Smat * γ₁u - Dmat * γ₀u - σ * γ₀u, Inf) / γ₀u_norm
+                            e1 = norm(S * γ₁u - D * γ₀u - σ * γ₀u, Inf) / γ₀u_norm
                             @test norm(e0, Inf) > norm(e1, Inf)
                             @test norm(e1, Inf) < rtol1
                         end
-                        # adjoint double-layer and hypersingular.
-                        if op isa Inti.Stokes
-                            # skip cases where hypersingular has not been implemented
-                            continue
-                        end
-                        Kop = Inti.IntegralOperator(Inti.AdjointDoubleLayerKernel(op), quad)
-                        Kmat = Inti.assemble_matrix(Kop)
-                        Hop = Inti.IntegralOperator(Inti.HyperSingularKernel(op), quad)
-                        Hmat = Inti.assemble_matrix(Hop)
-                        e0 = norm(Kmat * γ₁u - Hmat * γ₀u - σ * γ₁u, Inf) / γ₁u_norm
-                        K, H = Inti.adj_double_layer_hypersingular(;
-                            op = op,
-                            target = quad,
-                            source = quad,
-                            compression = (method = :none,),
-                            correction,
-                        )
-                        e1 = norm(K * γ₁u - H * γ₀u - σ * γ₁u, Inf) / γ₁u_norm
-                        @testset "Adjoint double-layer/hypersingular $(string(op))" begin
+                    end
+
+                    # ── Adjoint double-layer / hypersingular ──────────────────────────────
+                    # Stokes hypersingular is not implemented; skip.
+                    if op isa Inti.Stokes
+                        continue
+                    end
+
+                    Kmat = Inti.assemble_matrix(Inti.IntegralOperator(Inti.AdjointDoubleLayerKernel(op), quad_op))
+                    Hmat = Inti.assemble_matrix(Inti.IntegralOperator(Inti.HyperSingularKernel(op), quad_op))
+                    K, H = Inti.adj_double_layer_hypersingular(;
+                        op,
+                        target = quad_op,
+                        source = quad_op,
+                        compression = (method = :none,),
+                        correction,
+                    )
+
+                    @testset "Adjoint double-layer/hypersingular $(string(op))" begin
+                        for t in types
+                            σ  = t == :interior ? 1 / 2 : -1 / 2
+                            xs = t == :interior ? ntuple(i -> 3, N) : ntuple(i -> 0.1, N)
+                            T  = Inti.default_density_eltype(op)
+                            c  = rand(T)
+                            u    = (qnode) -> Inti.SingleLayerKernel(op)(qnode, xs) * c
+                            dudn = (qnode) -> Inti.AdjointDoubleLayerKernel(op)(qnode, xs) * c
+                            γ₀u = map(u, quad_op)
+                            γ₁u = map(dudn, quad_op)
+                            γ₁u_norm = norm(norm.(γ₁u, Inf), Inf)
+                            e0 = norm(Kmat * γ₁u - Hmat * γ₀u - σ * γ₁u, Inf) / γ₁u_norm
+                            e1 = norm(K * γ₁u - H * γ₀u - σ * γ₁u, Inf) / γ₁u_norm
                             @test norm(e0, Inf) > norm(e1, Inf)
                             @test norm(e1, Inf) < rtol2
                         end
@@ -131,3 +142,55 @@ for correction in corrections
         end
     end
 end
+
+## Gradient Green's identity: W*γ₁u - GDL*γ₀u = ∇u (interior representation formula)
+@testset "Gradient Green's identity (kernel_variant = :gradient)" begin
+    for N in (2, 3)
+        for op in (Inti.Laplace(; dim = N), Inti.Helmholtz(; k = 1.2, dim = N),
+                   Inti.Elastostatic(; μ = 0.8, λ = 1.3, dim = N),
+                   Inti.Stokes(; μ = 1.2, dim = N))
+            Inti.clear_entities!()
+            msize = N == 2 ? meshsize : meshsize_3d
+            if N == 2
+                Γ = Inti.parametric_curve(x -> SVector(cos(x), sin(x)), 0.0, 2π) |> Inti.Domain
+                quad = Inti.Quadrature(Γ; meshsize = msize, qorder = 5)
+                target = vec([SVector(x, y) for x in -0.9:0.2:0.9, y in -0.9:0.2:0.9
+                              if x^2 + y^2 < 0.85])
+            else
+                Ω = Inti.GeometricEntity("ellipsoid") |> Inti.Domain
+                Γ = Inti.external_boundary(Ω)
+                quad = Inti.Quadrature(Γ; meshsize = msize, qorder = 3)
+                target = vec([SVector(x, y, z) for x in -0.7:0.3:0.7, y in -0.7:0.3:0.7,
+                              z in -0.7:0.3:0.7 if x^2 + y^2 + z^2 < 0.5])
+            end
+            xs = ntuple(i -> 3, N)
+            T = Inti.default_density_eltype(op)
+            c = rand(T)
+            u      = qnode -> Inti.SingleLayerKernel(op)(qnode, xs) * c
+            dudn   = qnode -> Inti.AdjointDoubleLayerKernel(op)(qnode, xs) * c
+            ∇u_ref = x     -> Inti.GradientSingleLayerKernel(op)(x, xs) * c
+            γ₀u  = map(u, quad)
+            γ₁u  = map(dudn, quad)
+            ∇u   = map(∇u_ref, target)
+            ∇u_norm = norm(norm.(∇u), Inf)
+            # uncorrected
+            Wmat   = Inti.assemble_matrix(Inti.IntegralOperator(Inti.GradientSingleLayerKernel(op), target, quad))
+            GDLmat = Inti.assemble_matrix(Inti.IntegralOperator(Inti.GradientDoubleLayerKernel(op), target, quad))
+            e0 = norm(Wmat * γ₁u - GDLmat * γ₀u - ∇u, Inf) / ∇u_norm
+            # corrected
+            W, GDL = Inti.single_double_layer(;
+                op,
+                target,
+                source = quad,
+                compression = (method = :none,),
+                correction = (method = :dim, maxdist = Inf, target_location = :inside),
+                kernel_variant = :gradient,
+            )
+            e1 = norm(W * γ₁u - GDL * γ₀u - ∇u, Inf) / ∇u_norm
+            @testset "Gradient identity $(N)d $(typeof(op).name.name)" begin
+                @test e0 > e1
+                @test e1 < rtol1
+            end
+        end
+    end
+end
diff --git a/test/kernels_test.jl b/test/kernels_test.jl
index 80ddecb1..51123549 100644
--- a/test/kernels_test.jl
+++ b/test/kernels_test.jl
@@ -85,19 +85,19 @@ end
     @test isapprox(
         ForwardDiff.derivative(t -> G((coords = x + t * nx,), y), 0),
         dGdnx((coords = x, normal = nx), y);
-        atol = 1.0e-6,
+        atol = 1.0e-4,
     )
     @test isapprox(
         ForwardDiff.derivative(t -> G(x, (coords = y + t * ny,)), 0),
         dGdny(x, (coords = y, normal = ny));
-        atol = 1.0e-6,
+        atol = 1.0e-4,
     )
     @test isapprox(
         ForwardDiff.derivative(
             t -> dGdny((coords = x + t * nx,), (coords = y, normal = ny)),
             0,
         ), d2Gdnxy((coords = x, normal = nx), (coords = y, normal = ny));
-        atol = 1.0e-6,
+        atol = 1.0e-4,
     )
     # test quasi-periodicity
     period = 2π
diff --git a/test/runtests.jl b/test/runtests.jl
index 61b6439e..84a705d6 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -31,6 +31,7 @@ using Aqua
 
 @testset verbose = true "Corrections (Green identities)" begin
     include("green_identities_test.jl")
+    include("volume_potential_test.jl")
 end
 
 @testset "Accelerated density interpolation" include("dim_test.jl")
diff --git a/test/vdim_elastostatic_test.jl b/test/vdim_elastostatic_test.jl
deleted file mode 100644
index 983ae082..00000000
--- a/test/vdim_elastostatic_test.jl
+++ /dev/null
@@ -1,87 +0,0 @@
-# # High-order convergence of vdim
-
-using Inti
-using StaticArrays
-using Gmsh
-using LinearAlgebra
-using HMatrices
-using CairoMakie
-
-meshsize = 0.2
-meshorder = 1
-bdry_qorder = 8
-interpolation_order = 2
-
-Inti.clear_entities!()
-gmsh.initialize()
-gmsh.option.setNumber("Mesh.MeshSizeMax", meshsize)
-gmsh.option.setNumber("Mesh.MeshSizeMin", meshsize)
-gmsh.model.occ.addDisk(0, 0, 0, 1, 1)
-gmsh.model.occ.synchronize()
-gmsh.model.mesh.generate(2)
-gmsh.model.mesh.setOrder(meshorder)
-msh = Inti.import_mesh(; dim = 2)
-Ω = Inti.Domain(e -> Inti.geometric_dimension(e) == 2, Inti.entities(msh))
-gmsh.finalize()
-
-Γ = Inti.external_boundary(Ω)
-Ωₕ = view(msh, Ω)
-Γₕ = view(msh, Γ)
-##
-
-VR_qorder = Inti.Triangle_VR_interpolation_order_to_quadrature_order(interpolation_order)
-
-# Use VDIM with the Vioreanu-Rokhlin quadrature rule for Ωₕ
-Q = Inti.VioreanuRokhlin(; domain = :triangle, order = VR_qorder)
-dict = Dict(E => Q for E in Inti.element_types(Ωₕ))
-Ωₕ_quad = Inti.Quadrature(Ωₕ, dict)
-# Ωₕ_quad = Inti.Quadrature(Ωₕ; qorder = qorders[1])
-Γₕ_quad = Inti.Quadrature(Γₕ; qorder = bdry_qorder)
-
-##
-
-# build exact solution
-μ = 1.0
-λ = 1.0
-
-u = (x) -> SVector(1.0, 1.0)
-du = (x, n) -> SVector(0.0, 0.0)
-f = (x) -> SVector(0.0, 0.0)
-
-u_d = map(q -> u(q.coords), Ωₕ_quad)
-u_b = map(q -> u(q.coords), Γₕ_quad)
-du_b = map(q -> du(q.coords, q.normal), Γₕ_quad)
-f_d = map(q -> f(q.coords), Ωₕ_quad)
-
-op = Inti.Elastostatic(; λ = λ, μ = μ, dim = 2)
-
-# m, d, n = Inti._polynomial_solutions_vec(op, 1)
-# x = @SVector rand(2)
-# m[1](x)
-# d[1](x)
-# n[1]((; coords = x, normal = x))
-
-## Boundary operators
-S_b2d, D_b2d = Inti.single_double_layer(;
-    op,
-    target = Ωₕ_quad,
-    source = Γₕ_quad,
-    compression = (method = :none, tol = 1.0e-14),
-    correction = (method = :dim, maxdist = 5 * meshsize, target_location = :inside),
-)
-
-## Volume potentials
-V_d2d = Inti.volume_potential(;
-    op,
-    target = Ωₕ_quad,
-    source = Ωₕ_quad,
-    compression = (method = :none, tol = 1.0e-14),
-    correction = (method = :dim, interpolation_order),
-)
-
-vref = -u_d - D_b2d * u_b + S_b2d * du_b
-vapprox = V_d2d * f_d
-er = vref - vapprox
-
-ndofs = length(er)
-@show ndofs, meshsize, norm(er, Inf), t
diff --git a/test/volume_potential_test.jl b/test/volume_potential_test.jl
new file mode 100644
index 00000000..ce83ea2a
--- /dev/null
+++ b/test/volume_potential_test.jl
@@ -0,0 +1,413 @@
+#=
+
+Basic tests for the volume potentials based on
+
+S*γ₁u - D*γ₀u + V*f - σ*γ₀u ≈ 0     when    Lu = f
+
+where S, D are the single and double layer boundary integral operators, V is the volume
+potential operator, γ₀u and γ₁u are the Dirichlet and Neumann traces of u on the boundary, σ
+is 1/2 or -1/2 depending on whether the target point is inside or outside the domain, and L
+is the differential operator (Laplace, Helmholtz, Stokes, Elastostatic).
+
+For testing, we use known polynomial solutions of the PDEs to generate u and f = Lu.
+
+=#
+
+using Test
+using LinearAlgebra
+using Inti
+using Random
+using StaticArrays
+using ForwardDiff
+using Gmsh
+
+Random.seed!(1)
+
+## Test parameters
+rtol = 1.0e-10  # relative tolerance for volume potential tests
+meshsize    = 0.4  # 2D mesh size
+meshsize_3d = 0.8  # 3D mesh size — tests check polynomial exactness (not convergence),
+                   # so a coarser mesh is valid and keeps matrices small
+meshorder = 1
+bdry_qorder = 5
+interpolation_order = 2
+
+"""
+    test_volume_potential(op, Ωₕ_quad, Γₕ_quad, meshsize)
+
+Test the volume potential operator for a given PDE operator `op`.
+Verifies the identity: S*γ₁u - D*γ₀u + V*f - σ*γ₀u ≈ 0 for polynomial solutions.
+Accepts pre-built quadratures so they can be shared across operators.
+"""
+function test_volume_potential(op, Ωₕ_quad, Γₕ_quad, meshsize; interpolation_order = 2)
+    # Build boundary operators
+    S_b2d, D_b2d = Inti.single_double_layer(;
+        op,
+        target = Ωₕ_quad,
+        source = Γₕ_quad,
+        compression = (method = :none,),
+        correction = (method = :dim, maxdist = 5 * meshsize, target_location = :inside),
+    )
+
+    # Build volume potential
+    V_d2d = Inti.volume_potential(;
+        op,
+        target = Ωₕ_quad,
+        source = Ωₕ_quad,
+        compression = (method = :none,),
+        correction = (method = :none,),
+    )
+
+    # Build VDIM correction
+    δV_d2d = Inti.vdim_correction(
+        op, Ωₕ_quad, Ωₕ_quad, Γₕ_quad, S_b2d, D_b2d, V_d2d;
+        green_multiplier = -ones(length(Ωₕ_quad)),
+        interpolation_order,
+        maxdist = Inf
+    )
+
+    basis = Inti.polynomial_solutions_vdim(op, interpolation_order)
+
+    errors_uncorrected = Float64[]
+    errors_corrected = Float64[]
+
+    for idx in 1:length(basis)
+        if Inti.default_density_eltype(op) <: SVector
+            N = Inti.ambient_dimension(Ωₕ_quad)
+            c = SVector(ntuple(i -> rand(), N)...)
+            f = (q) -> basis[idx].source(q) * c
+            u = (q) -> basis[idx].solution(q) * c
+            t = (q) -> basis[idx].neumann_trace(q) * c
+        else
+            f = (q) -> basis[idx].source(q)
+            u = (q) -> basis[idx].solution(q)
+            t = (q) -> basis[idx].neumann_trace(q)
+        end
+
+        u_d  = map(q -> u(q), Ωₕ_quad)
+        u_b  = map(q -> u(q), Γₕ_quad)
+        du_b = map(q -> t(q), Γₕ_quad)
+        f_d  = map(q -> f(q), Ωₕ_quad)
+
+        vref        = u_d + D_b2d * u_b - S_b2d * du_b
+        vapprox     = V_d2d * f_d
+        vapprox_corr = vapprox + δV_d2d * f_d
+
+        push!(errors_uncorrected, norm(vref - vapprox, Inf))
+        push!(errors_corrected, norm(vref - vapprox_corr, Inf))
+    end
+
+    return errors_uncorrected, errors_corrected
+end
+
+"""
+    test_gradient_volume_potential(op, Ωₕ_quad, Γₕ_quad, meshsize)
+
+Test the gradient volume potential identity.
+Accepts pre-built quadratures so they can be shared across operators.
+"""
+function test_gradient_volume_potential(op, Ωₕ_quad, Γₕ_quad, meshsize; interpolation_order = 2)
+    W, GDL = Inti.single_double_layer(;
+        op,
+        target = Ωₕ_quad,
+        source = Γₕ_quad,
+        compression = (method = :none,),
+        correction = (method = :dim, maxdist = 5 * meshsize, target_location = :inside),
+        kernel_variant = :gradient,
+    )
+    V_grad = Inti.volume_potential(;
+        op,
+        target = Ωₕ_quad,
+        source = Ωₕ_quad,
+        compression = (method = :none,),
+        correction = (method = :none,),
+        kernel_variant = :gradient,
+    )
+    δV_grad = Inti.vdim_correction(
+        op, Ωₕ_quad, Ωₕ_quad, Γₕ_quad, W, GDL, V_grad;
+        green_multiplier = -ones(length(Ωₕ_quad)),
+        interpolation_order,
+        kernel_variant = :gradient,
+    )
+
+    basis = Inti.polynomial_solutions_vdim(op, interpolation_order)
+    errors_corrected = Float64[]
+
+    for idx in 1:length(basis)
+        ∇u_d = [basis[idx].gradient_solution(q) for q in Ωₕ_quad]
+        u_b  = [basis[idx].solution(q) for q in Γₕ_quad]
+        du_b = [basis[idx].neumann_trace(q) for q in Γₕ_quad]
+        f_d  = [basis[idx].source(q) for q in Ωₕ_quad]
+        vref    = ∇u_d + GDL * u_b - W * du_b
+        vapprox = V_grad * f_d + δV_grad * f_d
+        push!(errors_corrected, norm(vref - vapprox, Inf))
+    end
+    return errors_corrected
+end
+
+"""
+    test_W_volume_potential(op, Ωₕ_quad, Γₕ_quad, meshsize)
+
+Test the `W[g] = -∫∇yG⋅g` volume integral operator (vector density `g`, scalar output)
+regularized via eq. (3.25) of the 3D VDIM paper. For each scalar monomial `pₐ` and
+direction `j`, the density `g = pₐeⱼ` is itself a polynomial, so the method must reproduce
+the (3.25) boundary representation `μΨⱼ + D[Ψⱼ] - S[∂νΨⱼ + pₐνⱼ]` to machine precision.
+
+Builds the operator through the high-level `volume_potential(...; kernel_variant = :gradient_source)` so
+the `VectorDensityOperator` return path (and bare `W*g`) is exercised. Returns the list of
+relative errors and the output element type of `W*g`.
+"""
+function test_W_volume_potential(op, Ωₕ_quad, Γₕ_quad, meshsize; interpolation_order = 2)
+    N = Inti.ambient_dimension(Ωₕ_quad)
+    S, D = Inti.single_double_layer(;
+        op, target = Ωₕ_quad, source = Γₕ_quad,
+        compression = (method = :none,),
+        correction = (method = :dim, maxdist = 5 * meshsize, target_location = :inside),
+    )
+    W = Inti.volume_potential(;
+        op, target = Ωₕ_quad, source = Ωₕ_quad,
+        compression = (method = :none,),
+        correction = (method = :dim, maxdist = 5 * meshsize, boundary = Γₕ_quad, interpolation_order),
+        kernel_variant = :gradient_source,
+    )
+    basis = Inti.polynomial_solutions_vdim_W(op, interpolation_order)
+    errors = Float64[]
+    out_eltype = eltype(W * [zero(SVector{N, Float64}) for _ in Ωₕ_quad])
+    for b in basis, j in 1:N
+        ej = SVector(ntuple(d -> d == j ? 1.0 : 0.0, N))
+        g_d = [b.source(q) * ej for q in Ωₕ_quad]
+        w_app = W * g_d
+        sol_vol = [b.solution(q)[j] for q in Ωₕ_quad]
+        sol_bnd = [b.solution(q)[j] for q in Γₕ_quad]
+        neu_bnd = [b.neumann_trace(q)[j] for q in Γₕ_quad]
+        w_ref = sol_vol + D * sol_bnd - S * neu_bnd
+        push!(errors, norm(w_app - w_ref, Inf) / max(norm(w_ref, Inf), 1))
+    end
+    return errors, out_eltype
+end
+
+# Apply a scalar boundary→target layer operator `Op` component-wise to a
+# vector-valued boundary trace `trace_bnd::Vector{SVector{N}}`, returning a
+# `Vector{SVector{N}}` over the targets.
+function _apply_componentwise(Op, trace_bnd, N, ntarget)
+    out = [zero(SVector{N, ComplexF64}) for _ in 1:ntarget]
+    for c in 1:N
+        oc = Op * [t[c] for t in trace_bnd]
+        for i in 1:ntarget
+            out[i] += SVector(ntuple(d -> d == c ? oc[i] : zero(eltype(oc)), N))
+        end
+    end
+    return out
+end
+
+"""
+    test_X_volume_potential(op, Ωₕ_quad, Γₕ_quad, meshsize)
+
+Test the `X[g] = ∇W[g] = S·g - PV∫∇ₓ∇_yG⋅g` volume integral operator (vector
+density `g`, vector output) regularized via eq. (3.28) of the 3D VDIM paper. For
+each scalar monomial `pₐ` and direction `j`, `g = pₐeⱼ` is a polynomial, so the
+method must reproduce the (3.28) boundary representation
+`μΥⱼ - ∇ₓS[pₐνⱼ] - S[(∂ⱼpₐ)ν + BνΥⱼ] + D[Υⱼ]` to machine precision (the free-term
+tensor `S` is implicitly contained in this representation).
+
+Builds the operator through `volume_potential(...; kernel_variant = :hessian_source)`.
+Returns the list of relative errors and the output element type of `X*g`.
+"""
+function test_X_volume_potential(op, Ωₕ_quad, Γₕ_quad, meshsize; interpolation_order = 2)
+    N = Inti.ambient_dimension(Ωₕ_quad)
+    corr = (method = :dim, maxdist = 5 * meshsize, target_location = :inside)
+    S, D = Inti.single_double_layer(;
+        op, target = Ωₕ_quad, source = Γₕ_quad,
+        compression = (method = :none,), correction = corr,
+    )
+    GS, _ = Inti.single_double_layer(;
+        op, target = Ωₕ_quad, source = Γₕ_quad,
+        compression = (method = :none,), correction = corr,
+        kernel_variant = :gradient,
+    )
+    X = Inti.volume_potential(;
+        op, target = Ωₕ_quad, source = Ωₕ_quad,
+        compression = (method = :none,),
+        correction = (method = :dim, maxdist = 5 * meshsize, boundary = Γₕ_quad, interpolation_order),
+        kernel_variant = :hessian_source,
+    )
+    basis = Inti.polynomial_solutions_vdim_X(op, interpolation_order)
+    ntarget = length(Ωₕ_quad)
+    errors = Float64[]
+    out_eltype = eltype(X * [zero(SVector{N, Float64}) for _ in Ωₕ_quad])
+    for b in basis, j in 1:N
+        ej = SVector(ntuple(d -> d == j ? 1.0 : 0.0, N))
+        g_d = [b.source(q) * ej for q in Ωₕ_quad]
+        x_app = X * g_d
+        # (3.28) reference (interior μ = 1):  Υⱼ - ∇ₓS[pₐνⱼ] - S[(∂ⱼpₐ)ν+BνΥⱼ] + D[Υⱼ]
+        Υj_vol = [b.solution(q)[:, j] for q in Ωₕ_quad]
+        gsterm = GS * [b.grad_single_trace(q)[j] for q in Γₕ_quad]
+        Sterm = _apply_componentwise(S, [b.single_trace(q)[:, j] for q in Γₕ_quad], N, ntarget)
+        Dterm = _apply_componentwise(D, [b.solution(q)[:, j] for q in Γₕ_quad], N, ntarget)
+        x_ref = Υj_vol .- gsterm .- Sterm .+ Dterm
+        push!(errors, norm(x_app - x_ref, Inf) / max(norm(x_ref, Inf), 1))
+    end
+    return errors, out_eltype
+end
+
+## Helper to build the shared volume + boundary quadratures for a given mesh/domain.
+function build_quadratures(Ωₕ, Γₕ, dim; interpolation_order, bdry_qorder)
+    if dim == 2
+        VR_qorder = Inti.Triangle_VR_interpolation_order_to_quadrature_order(interpolation_order)
+        Q = Inti.VioreanuRokhlin(; domain = :triangle, order = VR_qorder)
+    else
+        VR_qorder = Inti.Tetrahedron_VR_interpolation_order_to_quadrature_order(interpolation_order)
+        Q = Inti.VioreanuRokhlin(; domain = :tetrahedron, order = VR_qorder)
+    end
+    Ωₕ_quad = Inti.Quadrature(Ωₕ, Q)
+    Γₕ_quad = Inti.Quadrature(Γₕ; qorder = bdry_qorder)
+    return Ωₕ_quad, Γₕ_quad
+end
+
+# ── 2D tests ─────────────────────────────────────────────────────────────────────────────────
+# Build the 2D disk mesh ONCE and reuse it across all 2D volume and gradient tests.
+Inti.clear_entities!()
+gmsh.initialize()
+gmsh.option.setNumber("General.Verbosity", 0)
+gmsh.option.setNumber("Mesh.MeshSizeMax", meshsize)
+gmsh.option.setNumber("Mesh.MeshSizeMin", meshsize)
+gmsh.model.occ.addDisk(0, 0, 0, 1, 1)
+gmsh.model.occ.synchronize()
+gmsh.model.mesh.generate(2)
+gmsh.model.mesh.setOrder(meshorder)
+msh_2d = Inti.import_mesh(; dim = 2)
+Ω_2d = Inti.Domain(e -> Inti.geometric_dimension(e) == 2, Inti.entities(msh_2d))
+gmsh.finalize()
+Γ_2d = Inti.external_boundary(Ω_2d)
+Ωₕ_2d = view(msh_2d, Ω_2d)
+Γₕ_2d = view(msh_2d, Γ_2d)
+Ωₕ_quad_2d, Γₕ_quad_2d = build_quadratures(Ωₕ_2d, Γₕ_2d, 2; interpolation_order, bdry_qorder)
+
+@testset "Volume potential operators 2D" begin
+    for (name, op) in [
+        ("2D Laplace",      Inti.Laplace(; dim = 2)),
+        ("2D Helmholtz",    Inti.Helmholtz(; k = 0.7, dim = 2)),
+        ("2D Elastostatic", Inti.Elastostatic(; μ = 0.8, λ = 1.3, dim = 2)),
+        ("2D Stokes",       Inti.Stokes(; μ = 1.0, dim = 2)),
+    ]
+        @testset "$name" begin
+            err_uncorr, err_corr = test_volume_potential(op, Ωₕ_quad_2d, Γₕ_quad_2d, meshsize; interpolation_order)
+            @test maximum(err_corr) < rtol
+            @test maximum(err_corr) < maximum(err_uncorr)
+        end
+    end
+end
+
+@testset "Gradient volume potential 2D" begin
+    for (name, op) in [
+        ("Laplace",      Inti.Laplace(; dim = 2)),
+        ("Helmholtz",    Inti.Helmholtz(; k = 0.7, dim = 2)),
+        ("Elastostatic", Inti.Elastostatic(; μ = 0.8, λ = 1.3, dim = 2)),
+        ("Stokes",       Inti.Stokes(; μ = 1.2, dim = 2)),
+    ]
+        @testset "Gradient volume potential 2D $name" begin
+            err_corr = test_gradient_volume_potential(op, Ωₕ_quad_2d, Γₕ_quad_2d, meshsize; interpolation_order)
+            @test maximum(err_corr) < rtol
+        end
+    end
+end
+
+@testset "W volume potential 2D" begin
+    for (name, op, Tout) in [
+        ("Laplace",   Inti.Laplace(; dim = 2),               Float64),
+        ("Helmholtz", Inti.Helmholtz(; k = 0.7, dim = 2),    ComplexF64),
+    ]
+        @testset "W volume potential 2D $name" begin
+            err, out_eltype = test_W_volume_potential(op, Ωₕ_quad_2d, Γₕ_quad_2d, meshsize; interpolation_order)
+            @test maximum(err) < rtol
+            @test out_eltype == Tout   # bare `W*g` yields a clean scalar vector
+        end
+    end
+end
+
+@testset "X (∇W) volume potential 2D" begin
+    for (name, op, Tout) in [
+        ("Laplace", Inti.Laplace(; dim = 2), Float64),
+        ("Helmholtz", Inti.Helmholtz(; dim = 2, k = 1.2), ComplexF64),
+    ]
+        @testset "X volume potential 2D $name" begin
+            err, out_eltype = test_X_volume_potential(op, Ωₕ_quad_2d, Γₕ_quad_2d, meshsize; interpolation_order)
+            @test maximum(err) < rtol
+            @test out_eltype == SVector{2, Tout}   # X*g yields a clean vector
+        end
+    end
+end
+
+# ── 3D tests ─────────────────────────────────────────────────────────────────────────────────
+# Build the 3D sphere mesh ONCE and reuse it across all 3D volume and gradient tests.
+Inti.clear_entities!()
+gmsh.initialize()
+gmsh.option.setNumber("General.Verbosity", 0)
+gmsh.option.setNumber("Mesh.MeshSizeMax", meshsize_3d)
+gmsh.option.setNumber("Mesh.MeshSizeMin", meshsize_3d)
+gmsh.model.occ.addSphere(0, 0, 0, 1)
+gmsh.model.occ.synchronize()
+gmsh.model.mesh.generate(3)
+gmsh.model.mesh.setOrder(meshorder)
+msh_3d = Inti.import_mesh(; dim = 3)
+Ω_3d = Inti.Domain(e -> Inti.geometric_dimension(e) == 3, Inti.entities(msh_3d))
+gmsh.finalize()
+Γ_3d = Inti.external_boundary(Ω_3d)
+Ωₕ_3d = view(msh_3d, Ω_3d)
+Γₕ_3d = view(msh_3d, Γ_3d)
+Ωₕ_quad_3d, Γₕ_quad_3d = build_quadratures(Ωₕ_3d, Γₕ_3d, 3; interpolation_order, bdry_qorder)
+
+@testset "Volume potential operators 3D" begin
+    for (name, op) in [
+        ("3D Laplace",      Inti.Laplace(; dim = 3)),
+        ("3D Helmholtz",    Inti.Helmholtz(; k = 1.2, dim = 3)),
+        ("3D Elastostatic", Inti.Elastostatic(; μ = 1.1, λ = 0.9, dim = 3)),
+        ("3D Stokes",       Inti.Stokes(; μ = 1.0, dim = 3)),
+    ]
+        @testset "$name" begin
+            err_uncorr, err_corr = test_volume_potential(op, Ωₕ_quad_3d, Γₕ_quad_3d, meshsize_3d; interpolation_order)
+            @test maximum(err_corr) < rtol
+            @test maximum(err_corr) < maximum(err_uncorr)
+        end
+    end
+end
+
+@testset "Gradient volume potential 3D" begin
+    for (name, op) in [
+        ("Laplace",      Inti.Laplace(; dim = 3)),
+        ("Helmholtz",    Inti.Helmholtz(; k = 1.2, dim = 3)),
+        ("Elastostatic", Inti.Elastostatic(; μ = 1.1, λ = 0.9, dim = 3)),
+        ("Stokes",       Inti.Stokes(; μ = 1.2, dim = 3)),
+    ]
+        @testset "Gradient volume potential 3D $name" begin
+            err_corr = test_gradient_volume_potential(op, Ωₕ_quad_3d, Γₕ_quad_3d, meshsize_3d; interpolation_order)
+            @test maximum(err_corr) < rtol
+        end
+    end
+end
+
+@testset "W volume potential 3D" begin
+    for (name, op, Tout) in [
+        ("Laplace",   Inti.Laplace(; dim = 3),               Float64),
+        ("Helmholtz", Inti.Helmholtz(; k = 1.2, dim = 3),    ComplexF64),
+    ]
+        @testset "W volume potential 3D $name" begin
+            err, out_eltype = test_W_volume_potential(op, Ωₕ_quad_3d, Γₕ_quad_3d, meshsize_3d; interpolation_order)
+            @test maximum(err) < rtol
+            @test out_eltype == Tout
+        end
+    end
+end
+
+@testset "X (∇W) volume potential 3D" begin
+    for (name, op, Tout) in [
+        ("Laplace", Inti.Laplace(; dim = 3), Float64),
+        ("Helmholtz", Inti.Helmholtz(; dim = 3, k = 1.2), ComplexF64),
+    ]
+        @testset "X volume potential 3D $name" begin
+            err, out_eltype = test_X_volume_potential(op, Ωₕ_quad_3d, Γₕ_quad_3d, meshsize_3d; interpolation_order)
+            @test maximum(err) < rtol
+            @test out_eltype == SVector{3, Tout}
+        end
+    end
+end