fastmachinelearning · laurilaatu · Jan 26, 2026 · Jan 27, 2026 · Jan 27, 2026 · Feb 9, 2026
diff --git a/hls4ml/backends/oneapi/oneapi_backend.py b/hls4ml/backends/oneapi/oneapi_backend.py
@@ -19,7 +19,6 @@
     Embedding,
     Layer,
     SimpleRNN,
-    Softmax,
 )
 from hls4ml.model.optimizer import get_backend_passes, layer_optimizer
 from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType
@@ -257,13 +256,6 @@ def init_activation(self, layer):
         if layer.get_attr('recurrent_activation') == 'tanh':
             layer.set_attr('recurrent_activation', 'dense_tanh')
 
-    @layer_optimizer(Softmax)
-    def init_softmax(self, layer):
-        if layer.model.config.get_config_value('IOType') == 'io_parallel':
-            assert len(layer.get_input_variable().shape) == 1, (
-                'Softmax with io_parallel strategy cannot be used on multidimensional tensors.'
-            )
-
     @layer_optimizer(Embedding)
     def init_embed(self, layer):
         if layer.attributes['n_in'] is None:

diff --git a/hls4ml/backends/oneapi/passes/core_templates.py b/hls4ml/backends/oneapi/passes/core_templates.py
@@ -1,7 +1,10 @@
 from hls4ml.backends.backend import get_backend
 from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate
 from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate
+from hls4ml.model.types import FixedPrecisionType, RoundingMode, SaturationMode
 from hls4ml.model.layers import Activation, BatchNormalization, Dense, HardActivation, ParametrizedActivation, PReLU, Softmax
+from hls4ml.utils.fixed_point_utils import FixedPointEmulator, ceil_log2, uint_to_binary
+import numpy as np
 
 # Dense templates
 
@@ -194,12 +197,26 @@ def format(self, node):
 
 softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{
     static constexpr unsigned n_in = {n_in};
-    static constexpr unsigned table_size = {table_size};
+    static constexpr unsigned exp_table_size = {exp_table_size};
+    static constexpr unsigned inv_table_size = {inv_table_size};
     static constexpr unsigned io_type = nnet::{iotype};
     static constexpr unsigned reuse_factor = {reuse};
     static constexpr nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation};
     typedef {exp_table_t.name} exp_table_t;
-    typedef {inv_table_t.name} inv_table_t;
+    typedef {inv_table_t.name} inv_table_t;"""
+
+softmax_config_table_template = """
+
+    static constexpr const exp_table_t *exp_table = &{exp_table_name}[0];
+    static constexpr const inv_table_t *invert_table = &{inv_table_name}[0];
+}};\n"""
+
+softmax_config_table_template_stable = """  
+    typedef {inv_inp_t.name} inv_inp_t;
+    typedef {inp_norm_t.name} inp_norm_t;
+
+    static constexpr const exp_table_t *exp_table = &{exp_table_name}[0];
+    static constexpr const inv_table_t *invert_table = &{inv_table_name}[0];
 }};\n"""
 
 activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});'
@@ -220,7 +237,58 @@ def __init__(self):
     def format(self, node):
         params = self._default_config_params(node)
         params['type'] = node.get_attr('activation')
-
+
+        if params['type'] == 'softmax':
+            # A table size already present in params is halved; otherwise default to 1024 entries
+            if 'exp_table_size' in params:
+                params['exp_table_size'] //= 2
+            else:
+                params['exp_table_size'] = 1024
+                params['exp_table_t'].precision.width = ceil_log2(params['exp_table_size'])
+                params['exp_table_t'].precision.integer = 3
+                params['exp_table_t'].precision.signed = False
+
+            if 'inp_norm_t' not in params:
+                import copy
+
+                input_t = node.get_input_variable().type.precision
+                signed = input_t.signed
+                width, iwidth = input_t.width - signed, input_t.integer - signed
+                # Start from a copy of exp_table_t; its fields are overridden below
+                params['inp_norm_t'] = copy.deepcopy(params['exp_table_t'])
+
+                # With a default inverse table, derive the precision from the exp table type
+                if 'inv_table_size' not in params:
+                    params['inp_norm_t'].precision.width = params['exp_table_t'].precision.width + 1
+                    params['inp_norm_t'].precision.integer = params['exp_table_t'].precision.integer + 1
+                    params['inp_norm_t'].precision.signed = True
+                    params['inp_norm_t'].name = f'{node.name}_inp_norm_t'
+                else:
+                    # Do not reuse the outer quote inside {}: that is a SyntaxError before Python 3.12
+                    sign_str = 'true' if signed else 'false'
+                    params['inp_norm_t'].name = f'ac_fixed<{width},{iwidth},{sign_str},AC_RND,AC_SAT_SYM>'
+
+                node.set_attr('inp_norm_t', params['inp_norm_t'])
+
+            if 'inv_table_size' in params:
+                params['inv_table_size'] //= 2
+            else:
+                params['inv_table_size'] = 1024
+                params['inv_table_t'].precision.width = ceil_log2(params['inv_table_size'])
+                params['inv_table_t'].precision.integer = 3
+                params['inv_table_t'].precision.signed = False
+                params['inv_inp_t'].precision.width = params['inv_table_t'].precision.width + 1
+                params['inv_inp_t'].precision.integer = params['inv_table_t'].precision.integer + 1
+                params['inv_inp_t'].precision.signed = True
+
+            params['exp_table_name'] = node.name + '_exp_table'
+            params['inv_table_name'] = node.name + '_inv_table'
+
+            # Compose locally: appending to self.template would add another table section on every call
+            stable = params['implementation'] == 'stable'
+            suffix = softmax_config_table_template_stable if stable else softmax_config_table_template
+            return (self.template + suffix).format(**params)
+
         return self.template.format(**params)
 
 

diff --git a/hls4ml/converters/keras_v3/hgq2/multi_head_attention.py b/hls4ml/converters/keras_v3/hgq2/multi_head_attention.py
@@ -14,7 +14,7 @@
 
 
 class QMultiHeadAttentionHandler(QLayerHandler):
-    handles = ('hgq.layers.multi_head_attention.QMultiHeadAttention',)
+    handles = ('hgq.layers.attn.mha.QMultiHeadAttention',)
 
     def handle(
         self,
@@ -127,7 +127,7 @@ def _handle(self, layer, tensor_q, tensor_O, node_index, tensor_k, tensor_v):
 
 
 class QLinformerAttentionHandler(QMultiHeadAttentionHandler):
-    handles = ('hgq.layers.linformer_attention.QLinformerAttention',)
+    handles = ('hgq.layers.attn.linformer.QLinformerAttention',)
 
     def handle(
         self,

diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h
@@ -99,18 +99,21 @@ template <class data_T, class res_T, typename CONFIG_T> void sigmoid(const data_
 
 enum class softmax_implementation { latency = 0, legacy = 1, stable = 2, argmax = 3 };
 
-template <class data_T, typename CONFIG_T> inline unsigned softmax_stable_idx_from_real_val(const data_T x) {
+
+template <class data_T, unsigned table_size> inline unsigned softmax_stable_idx_from_real_val(const data_T x) {
     // Number of address bits for table
-    static constexpr int N = ceillog2<CONFIG_T::table_size>::val;
+    static constexpr int N = ceillog2<table_size>::val;
 
     // Slice the top N bits of the input
     [[intel::fpga_register]] ac_int<N, false> y = x.template slc<N>(x.width - N - 1);
+
     // If x is the most negative value, the slice will be 0, so we need to set the 0-th bit to ensure correctness
     if (x != 0 && y == 0)
         y[0] = 1;
     return y.to_uint();
 }
 
+
 template <class data_T, typename CONFIG_T> inline unsigned softmax_latency_idx_from_real_val(const data_T x) {
     // Number of address bits for table
     static constexpr int N = ceillog2<CONFIG_T::table_size>::val;
@@ -120,19 +123,17 @@ template <class data_T, typename CONFIG_T> inline unsigned softmax_latency_idx_f
     return y.to_uint();
 }
 
+
 template <class data_T, class res_T, typename CONFIG_T> void softmax_stable(const data_T &data, res_T &res) {
-// Look-up tables
-#include "activation_tables/exp_table.tb"
-#include "activation_tables/invert_table.tb"
 
     // Find maximum
     Op_max<typename data_T::value_type> op_max;
     [[intel::fpga_register]] auto x_max =
         reduce<typename data_T::value_type, CONFIG_T::n_in, Op_max<typename data_T::value_type>>(data.data(), op_max);
 
     // For the diffs, use the same type as the input but force rounding and saturation
-    [[intel::fpga_register]] ac_fixed<data_T::value_type::width, data_T::value_type::i_width, true, AC_RND, AC_SAT>
-        d_xi_xmax[CONFIG_T::n_in];
+    [[intel::fpga_register]]
+        typename CONFIG_T::inp_norm_t d_xi_xmax[CONFIG_T::n_in];
     #pragma unroll
     for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
         d_xi_xmax[i] = data[i] - x_max;
@@ -142,23 +143,25 @@ template <class data_T, class res_T, typename CONFIG_T> void softmax_stable(cons
     [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in];
     #pragma unroll
     for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
-        exp_res[i] = exp_table[softmax_stable_idx_from_real_val<typename data_T::value_type, CONFIG_T>(d_xi_xmax[i])];
+        exp_res[i] = CONFIG_T::exp_table[softmax_stable_idx_from_real_val<typename CONFIG_T::inp_norm_t, CONFIG_T::exp_table_size>(d_xi_xmax[i])]; //input_t, CONFIG_T
     }
 
     // Explicitly sum previously calculated exponentials with an adder tree
     Op_add<typename CONFIG_T::exp_table_t> op_add;
-    [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum =
+    [[intel::fpga_register]] typename CONFIG_T::inv_inp_t exp_sum =
         reduce<typename CONFIG_T::exp_table_t, CONFIG_T::n_in, Op_add<typename CONFIG_T::exp_table_t>>(exp_res, op_add);
 
     // Multiply previously calculated exponetials with the reciprocal of the sum
     [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum =
-        invert_table[softmax_stable_idx_from_real_val<typename CONFIG_T::exp_table_t, CONFIG_T>(exp_sum)];
+        CONFIG_T::invert_table[softmax_stable_idx_from_real_val<typename CONFIG_T::inv_inp_t, CONFIG_T::inv_table_size>(exp_sum)];
+
     #pragma unroll
     for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
         res[i] = exp_res[i] * inv_exp_sum;
     }
 }
 
+
 // TODO - Improve accuracy
 template <class data_T, class res_T, typename CONFIG_T> void softmax_latency(const data_T &data, res_T &res) {
 #include "activation_tables/exp_table_latency.tb"
@@ -265,6 +268,45 @@ template <class data_T, class res_T, typename CONFIG_T> inline void softmax(cons
     }
 }
 
+// *************************************************
+//       Multidimensional Softmax
+// *************************************************
+
+// Config adapter: exposes CONFIG_T::n_slice as the n_in seen by the core softmax function
+template <class CONFIG_T> struct softmax_multidim_slice_config : CONFIG_T {
+    static constexpr unsigned n_in = CONFIG_T::n_slice;
+};
+
+// Applies nnet::softmax independently to each n_slice-element slice, once per (outer, inner) index pair
+template <class data_T, class res_T, typename CONFIG_T> inline void softmax_multidim(const data_T &data, res_T &res) {
+    using slice_in_t = std::array<typename data_T::value_type, CONFIG_T::n_slice>;
+    using slice_out_t = std::array<typename res_T::value_type, CONFIG_T::n_slice>;
+    using slice_config = softmax_multidim_slice_config<CONFIG_T>;
+
+    #pragma unroll
+    for (unsigned outer = 0; outer < CONFIG_T::n_outer; outer++) {
+        #pragma unroll
+        for (unsigned inner = 0; inner < CONFIG_T::n_inner; inner++) {
+            const unsigned base = (outer * CONFIG_T::n_slice * CONFIG_T::n_inner) + inner;
+            [[intel::fpga_register]] slice_in_t slice_in;
+            [[intel::fpga_register]] slice_out_t slice_out;
+
+            // Gather: slice elements sit n_inner apart in the flat input, starting at base
+            #pragma unroll
+            for (unsigned s = 0; s < CONFIG_T::n_slice; s++) {
+                slice_in[s] = data[base + s * CONFIG_T::n_inner];
+            }
+
+            nnet::softmax<slice_in_t, slice_out_t, slice_config>(slice_in, slice_out);
+
+            // Scatter the results back to the same strided positions
+            #pragma unroll
+            for (unsigned s = 0; s < CONFIG_T::n_slice; s++) {
+                res[base + s * CONFIG_T::n_inner] = slice_out[s];
+            }
+        }
+    }
+}
 // *************************************************
 //       TanH Activation
 // *************************************************

diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h
@@ -271,64 +271,63 @@ template <class data_pipe, class res_pipe, typename CONFIG_T> void softsign_stre
 // *************************************************
 
 template <class data_pipe, class res_pipe, typename CONFIG_T> void softmax_stable_stream() {
-#include "activation_tables/exp_table.tb"
-#include "activation_tables/invert_table.tb"
+
+    using input_arr_t = typename ExtractPipeType<data_pipe>::value_type;
+    using input_t = typename ExtractPipeType<data_pipe>::value_type::value_type;
+    constexpr unsigned input_arr_size = std::tuple_size<input_arr_t>{};
+
 
     constexpr unsigned multiplier_limit =
-        DIV_ROUNDUP(std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}, CONFIG_T::reuse_factor);
-    constexpr unsigned pipeline = std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{} / multiplier_limit;
+        DIV_ROUNDUP(input_arr_size, CONFIG_T::reuse_factor);
+    constexpr unsigned pipeline = input_arr_size / multiplier_limit;
 
-    [[intel::fpga_register]] typename ExtractPipeType<data_pipe>::value_type::value_type
-        data_array[std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}];
+
+    [[intel::fpga_register]] input_t data_array[input_arr_size];
 
 SoftmaxArrayLoop:
-    [[intel::initiation_interval(pipeline)]] for (unsigned i = 0;
-                                                  i < CONFIG_T::n_in /
-                                                          std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{};
-                                                  i++) {
+    [[intel::initiation_interval(pipeline)]] 
+    for (unsigned i = 0; i < CONFIG_T::n_in / input_arr_size; i++) {
         auto in_pack = data_pipe::read();
 
     SoftmaxArrayPackLoop:
         #pragma unroll
-        for (unsigned j = 0; j < std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}; j++) {
+        for (unsigned j = 0; j < input_arr_size; j++) {
             data_array[j] = in_pack[j];
         }
 
         // Find the max and compute all delta(x_i, x_max)
-        Op_max<typename ExtractPipeType<data_pipe>::value_type::value_type> op_max;
-        [[intel::fpga_register]] typename ExtractPipeType<data_pipe>::value_type::value_type x_max =
-            reduce<typename ExtractPipeType<data_pipe>::value_type::value_type,
-                   std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{},
-                   Op_max<typename ExtractPipeType<data_pipe>::value_type::value_type>>(data_array, op_max);
-
-        // For the diffs, use the same type as the input but force rounding and saturation
-        [[intel::fpga_register]] ac_fixed<ExtractPipeType<data_pipe>::value_type::value_type::width,
-                                          ExtractPipeType<data_pipe>::value_type::value_type::i_width, true, AC_RND, AC_SAT>
-            d_xi_xmax[std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}];
+        Op_max<input_t> op_max;
+        [[intel::fpga_register]] 
+        input_t x_max = reduce<input_t, input_arr_size, Op_max<input_t>>(data_array, op_max);
+
+        [[intel::fpga_register]]
+        typename CONFIG_T::inp_norm_t d_xi_xmax[input_arr_size];
+
         #pragma unroll
-        for (unsigned j = 0; j < std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}; j++) {
+        for (unsigned j = 0; j < input_arr_size; j++) {
             d_xi_xmax[j] = data_array[j] - x_max;
         }
 
         // Calculate all the e^x's
         [[intel::fpga_register]]
-        typename CONFIG_T::exp_table_t exp_res[std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}];
+        typename CONFIG_T::exp_table_t exp_res[input_arr_size];
+
         #pragma unroll
-        for (unsigned j = 0; j < std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}; j++) {
+        for (unsigned j = 0; j < input_arr_size; j++) {
             exp_res[j] =
-                exp_table[softmax_stable_idx_from_real_val<typename ExtractPipeType<data_pipe>::value_type::value_type,
-                                                           CONFIG_T>(d_xi_xmax[j])];
+                CONFIG_T::exp_table[softmax_stable_idx_from_real_val<typename CONFIG_T::inp_norm_t,  CONFIG_T::exp_table_size>(d_xi_xmax[j])];
         }
 
         // Explicitly sum the results with an adder tree.
         // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing
         Op_add<typename CONFIG_T::exp_table_t> op_add;
-        [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum =
-            reduce<typename CONFIG_T::exp_table_t, std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{},
+        [[intel::fpga_register]] typename CONFIG_T::inv_inp_t exp_sum =
+            reduce<typename CONFIG_T::exp_table_t, input_arr_size,
                    Op_add<typename CONFIG_T::exp_table_t>>(exp_res, op_add);
 
         [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum =
-            invert_table[softmax_stable_idx_from_real_val<typename CONFIG_T::exp_table_t, CONFIG_T>(exp_sum)];
+            CONFIG_T::invert_table[softmax_stable_idx_from_real_val<typename CONFIG_T::inv_inp_t,  CONFIG_T::inv_table_size>(exp_sum)];
+
         typename ExtractPipeType<res_pipe>::value_type out_pack;
 
     SoftmaxInvPackLoop:

diff --git a/hls4ml/templates/oneapi/firmware/parameters.h b/hls4ml/templates/oneapi/firmware/parameters.h
@@ -6,6 +6,8 @@
 #include "nnet_utils/nnet_code_gen.h"
 #include "nnet_utils/nnet_helpers.h"
 
+// hls-fpga-machine-learning insert softmax tables
+
 // hls-fpga-machine-learning insert includes
 
 // hls-fpga-machine-learning insert layer-config