From 3d463b3d7948520cca82afe67df7d67c4163b897 Mon Sep 17 00:00:00 2001
From: laurilaatu <l.laatu@imperial.ac.uk>
Date: Mon, 26 Jan 2026 20:37:28 +0000
Subject: [PATCH 01/12] weights for dense

---
 hls4ml/backends/oneapi/passes/core_templates.py   | 15 +++++++++++----
 hls4ml/templates/oneapi/firmware/myproject.cpp    |  5 ++++-
 hls4ml/templates/oneapi/firmware/myproject.h      |  3 +++
 .../oneapi/firmware/nnet_utils/nnet_dense.h       |  7 +++----
 hls4ml/writer/oneapi_writer.py                    |  8 ++++++++
 5 files changed, 29 insertions(+), 9 deletions(-)
diff --git a/hls4ml/backends/oneapi/passes/core_templates.py b/hls4ml/backends/oneapi/passes/core_templates.py
index 9602b2d0fc..64a4c7097a 100644
--- a/hls4ml/backends/oneapi/passes/core_templates.py
+++ b/hls4ml/backends/oneapi/passes/core_templates.py
@@ -6,6 +6,7 @@
 # Dense templates
 
 dense_config_template = """struct config{index} : nnet::dense_config {{
+
     static constexpr unsigned n_in = {n_in};
     static constexpr unsigned n_out = {n_out};
     static constexpr unsigned io_type = nnet::{iotype};
@@ -30,13 +31,16 @@
     typedef {weight_t.name} weight_t;
     typedef {index_t.name} index_t;
 
+    static constexpr weight_t weights = {weights};
+    static constexpr bias_t biases = {biases};
+
     template<class x_T, class y_T>
     using product = nnet::product::{product_type}<x_T, y_T>;
 }};\n"""
 
-dense_function_template = 'nnet::dense_{strategy}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'
+dense_function_template = 'nnet::dense_{strategy}<{input_t}, {output_t}, {config}>({input}, {output});'
 dense_task_sequence_template = 'task_sequence<nnet::dense_{strategy}_stream<{input_pipe}, {output_pipe}, {config}>> {name};'
-dense_stream_function_template = '{name}.async({w}, {b});'
+dense_stream_function_template = '{name}.async();'
 dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_stream.h']
 
 
@@ -53,6 +57,9 @@ def format(self, node):
             node.get_input_variable().type.precision, node.get_weights('weight').type.precision
         )
 
+        params['weights'] = node.get_weights('weight').name
+        params['biases'] = node.get_weights('bias').name
+
         return self.template.format(**params)
 
 
@@ -63,8 +70,8 @@ def __init__(self):
 
     def format(self, node):
         params = self._default_function_params(node)
-        params['w'] = node.get_weights('weight').name
-        params['b'] = node.get_weights('bias').name
+        #params['w'] = node.get_weights('weight').name
+        #params['b'] = node.get_weights('bias').name
 
         return self.template.format(**params)
 
diff --git a/hls4ml/templates/oneapi/firmware/myproject.cpp b/hls4ml/templates/oneapi/firmware/myproject.cpp
index 06e7d3fe37..da9439f74a 100644
--- a/hls4ml/templates/oneapi/firmware/myproject.cpp
+++ b/hls4ml/templates/oneapi/firmware/myproject.cpp
@@ -1,9 +1,12 @@
 #include "myproject.h"
-#include "parameters.h"
 #include <sycl/ext/intel/experimental/task_sequence.hpp>
 
 // hls-fpga-machine-learning insert weights
 
+
+#include "parameters.h"
+
+
 // The inter-task pipes need to be declared in the global scope
 // hls-fpga-machine-learning insert inter-task pipes
 
diff --git a/hls4ml/templates/oneapi/firmware/myproject.h b/hls4ml/templates/oneapi/firmware/myproject.h
index 082ae5dc8c..8f313ea30f 100644
--- a/hls4ml/templates/oneapi/firmware/myproject.h
+++ b/hls4ml/templates/oneapi/firmware/myproject.h
@@ -3,6 +3,9 @@
 
 #include "defines.h"
 
+// hls-fpga-machine-learning insert weights
+
+
 // This file defines the interface to the kernel
 
 // currently this is fixed
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h
index dc76189083..2b65eef42b 100644
--- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h
@@ -152,12 +152,11 @@ void dense_rf_lt(const data_T &data, res_T &res, const typename CONFIG_T::weight
     }
 }
 template <class data_T, class res_T, typename CONFIG_T>
-void dense_resource(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights,
-                    const typename CONFIG_T::bias_t &biases) {
+void dense_resource(const data_T &data, res_T &res) {
     if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) {
-        dense_rf_lt<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+        dense_rf_lt<data_T, res_T, CONFIG_T>(data, res, CONFIG_T::weights, CONFIG_T::biases);
     } else {
-        dense_rf_gt<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+        dense_rf_gt<data_T, res_T, CONFIG_T>(data, res, CONFIG_T::weights, CONFIG_T::biases);
     }
 }
 } // namespace nnet
diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py
index 3c0a778c50..b42ff2990f 100644
--- a/hls4ml/writer/oneapi_writer.py
+++ b/hls4ml/writer/oneapi_writer.py
@@ -242,6 +242,14 @@ def write_project_header(self, model):
                     for out in model_outputs:
                         newline += out.declare_cpp()
 
+               # Insert weights
+                elif '// hls-fpga-machine-learning insert weights' in line:
+                    newline = line
+                    for layer in model.get_layers():
+                        for w in layer.get_weights():
+                            #if w not in model_brams:
+                            newline += f'#include "weights/{w.name}.h"\n'                        
+
                 # Simply copy line, if no inserts are required
                 else:
                     newline = line

From d67857369385d066b7cdaad49077069b3bf9473c Mon Sep 17 00:00:00 2001
From: Chang Sun <chsun@cern.ch>
Date: Tue, 27 Jan 2026 18:58:42 +0000
Subject: [PATCH 02/12] hgq2 homogeneous quant fix

---
 hls4ml/converters/keras_v3/hgq2/_base.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/hls4ml/converters/keras_v3/hgq2/_base.py b/hls4ml/converters/keras_v3/hgq2/_base.py
index 4a6d0a22c2..f7b4c9ddd3 100644
--- a/hls4ml/converters/keras_v3/hgq2/_base.py
+++ b/hls4ml/converters/keras_v3/hgq2/_base.py
@@ -30,15 +30,19 @@ def extract_fixed_quantizer_config(q, tensor: 'KerasTensor', is_input: bool) ->
     k, B, I = ops.convert_to_numpy(k), ops.convert_to_numpy(B), ops.convert_to_numpy(I)  # noqa: E741
     I = np.where(B > 0, I, 0)  # noqa: E741 # type: ignore
 
-    k = np.broadcast_to(k.astype(np.int16), (1,) + shape)  # type: ignore
-    B = np.broadcast_to(B.astype(np.int16), (1,) + shape)  # type: ignore
-    I = np.broadcast_to(I.astype(np.int16), (1,) + shape)  # noqa: E741
+    if np.size(k) != 1:
+        k = np.broadcast_to(k.astype(np.int16), (1,) + shape)  # type: ignore
+        B = np.broadcast_to(B.astype(np.int16), (1,) + shape)  # type: ignore
+        I = np.broadcast_to(I.astype(np.int16), (1,) + shape)  # noqa: E741
+    else:
+        k = np.ravel(k).astype(np.int16)
+        B = np.ravel(B).astype(np.int16)
+        I = np.ravel(I).astype(np.int16)  # noqa: E741
 
     overflow_mode: str = internal_q.overflow_mode
     round_mode: str = internal_q.round_mode
     if round_mode.startswith('S_'):
         round_mode = round_mode[2:]
-    fusible = np.unique(k).size == 1 and np.unique(B).size == 1 and np.unique(I).size == 1
 
     input_keras_tensor_names = tensor.name if is_input else f'{tensor.name}_q'
     output_keras_tensor_names = f'{tensor.name}_q' if is_input else tensor.name
@@ -48,7 +52,7 @@ def extract_fixed_quantizer_config(q, tensor: 'KerasTensor', is_input: bool) ->
         'mask_kbi': (k, B, I),
         'SAT': overflow_mode,
         'RND': round_mode,
-        'fusible': fusible,
+        'fusible': None,
         'input_keras_tensor_names': [input_keras_tensor_names],
         'output_keras_tensor_names': [output_keras_tensor_names],
         'overrides': {},

From 59bd96f0c5e9c8e95538a9e96e0233c2d70695ba Mon Sep 17 00:00:00 2001
From: laurilaatu <l.laatu@imperial.ac.uk>
Date: Mon, 9 Feb 2026 16:31:00 +0000
Subject: [PATCH 03/12] Changes required for oneAPI MHA

---
 hls4ml/backends/oneapi/oneapi_backend.py      |   8 -
 .../backends/oneapi/passes/core_templates.py  |  88 ++++++++++-
 .../keras_v3/hgq2/multi_head_attention.py     |   4 +-
 .../firmware/nnet_utils/nnet_activation.h     |  82 +++++++---
 .../oneapi/firmware/nnet_utils/nnet_dense.h   |   7 +-
 hls4ml/writer/oneapi_writer.py                | 149 ++++++++++--------
 6 files changed, 233 insertions(+), 105 deletions(-)

diff --git a/hls4ml/backends/oneapi/oneapi_backend.py b/hls4ml/backends/oneapi/oneapi_backend.py
index 0c11c16d09..94f26c9f1c 100644
--- a/hls4ml/backends/oneapi/oneapi_backend.py
+++ b/hls4ml/backends/oneapi/oneapi_backend.py
@@ -19,7 +19,6 @@
     Embedding,
     Layer,
     SimpleRNN,
-    Softmax,
 )
 from hls4ml.model.optimizer import get_backend_passes, layer_optimizer
 from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType
@@ -257,13 +256,6 @@ def init_activation(self, layer):
         if layer.get_attr('recurrent_activation') == 'tanh':
             layer.set_attr('recurrent_activation', 'dense_tanh')
 
-    @layer_optimizer(Softmax)
-    def init_softmax(self, layer):
-        if layer.model.config.get_config_value('IOType') == 'io_parallel':
-            assert len(layer.get_input_variable().shape) == 1, (
-                'Softmax with io_parallel strategy cannot be used on multidimensional tensors.'
-            )
-
     @layer_optimizer(Embedding)
     def init_embed(self, layer):
         if layer.attributes['n_in'] is None:
diff --git a/hls4ml/backends/oneapi/passes/core_templates.py b/hls4ml/backends/oneapi/passes/core_templates.py
index 64a4c7097a..5a2d765e8f 100644
--- a/hls4ml/backends/oneapi/passes/core_templates.py
+++ b/hls4ml/backends/oneapi/passes/core_templates.py
@@ -38,7 +38,7 @@
     using product = nnet::product::{product_type}<x_T, y_T>;
 }};\n"""
 
-dense_function_template = 'nnet::dense_{strategy}<{input_t}, {output_t}, {config}>({input}, {output});'
+dense_function_template = 'nnet::dense_{strategy}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'
 dense_task_sequence_template = 'task_sequence<nnet::dense_{strategy}_stream<{input_pipe}, {output_pipe}, {config}>> {name};'
 dense_stream_function_template = '{name}.async();'
 dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_stream.h']
@@ -70,8 +70,8 @@ def __init__(self):
 
     def format(self, node):
         params = self._default_function_params(node)
-        #params['w'] = node.get_weights('weight').name
-        #params['b'] = node.get_weights('bias').name
+        params['w'] = node.get_weights('weight').name
+        params['b'] = node.get_weights('bias').name
 
         return self.template.format(**params)
 
@@ -199,7 +199,7 @@ def format(self, node):
     static constexpr unsigned reuse_factor = {reuse};
 }};\n"""
 
-softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{
+softmax_config_template_qkeras = """struct {type}_config{index} : nnet::activ_config {{
     static constexpr unsigned n_in = {n_in};
     static constexpr unsigned table_size = {table_size};
     static constexpr unsigned io_type = nnet::{iotype};
@@ -209,6 +209,26 @@ def format(self, node):
     typedef {inv_table_t.name} inv_table_t;
 }};\n"""
 
+softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{
+    static const unsigned n_in = {n_in};
+    static const unsigned n_slice = {n_slice};
+    static const unsigned n_outer = {n_outer};
+    static const unsigned n_inner = {n_inner};
+    static const unsigned parallelization_factor = {parallelization_factor};
+    static const unsigned exp_table_size = {exp_table_size};
+    static const unsigned inv_table_size = {inv_table_size};
+    static const unsigned io_type = nnet::{iotype};
+    static const unsigned reuse_factor = {reuse};
+    static const unsigned axis = {axis};
+    static const nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation};
+    static constexpr float exp_scale = {exp_scale};
+    typedef {exp_table_t.name} exp_table_t;
+    typedef {inv_table_t.name} inv_table_t;
+    typedef {accum_t.name} accum_t;
+    typedef {inv_inp_t.name} inv_inp_t;
+    typedef {inp_norm_t_str} inp_norm_t;
+}};\n"""
+
 activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});'
 param_activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {param}, {output});'
 
@@ -260,10 +280,68 @@ def __init__(self):
         super(ActivationConfigTemplate, self).__init__(Softmax)  # Skip ActivationConfigTemplate's __init__
         self.template = softmax_config_template
 
+    def format(self, node):
+        from math import ceil, log2
+
+        params = self._default_config_params(node)
+        params['type'] = node.get_attr('activation')
+        params.setdefault('exp_table_size', params['table_size'])
+        params.setdefault('inv_table_size', params['table_size'])
+        params.setdefault('n_inner', 1)
+        params.setdefault('n_outer', 1)
+        params.setdefault('exp_scale', 1.0)
+        params.setdefault('parallelization_factor', -1)
+        # n_slice: n_in split across n_outer * n_inner; softmax_multidim runs one softmax per n_slice elements
+        n_slice = params['n_in'] // params['n_inner'] // params['n_outer']  # type: ignore
+        params['n_slice'] = n_slice
+        # NOTE(review): accum_t_str is set below, but softmax_config_template uses {accum_t.name} - confirm intent
+        if params['accum_t'].name == 'model_default_t':  # type: ignore
+            scale = ceil(log2(n_slice))  # extra bits added to both width and integer part below
+            exp_table_t = node.attributes['exp_table_t'].precision
+            signed, width, integers = exp_table_t.signed, exp_table_t.width, exp_table_t.integer
+            params['accum_t_str'] = f'ac_{"" if signed else "u"}fixed<{width + scale}, {integers + scale}>'
+        else:
+            params['accum_t_str'] = params['accum_t'].name  # type: ignore
+        if params['inv_inp_t'].name == 'model_default_t':  # type: ignore
+            params['inv_inp_t'] = params['exp_table_t']
+
+        if params['implementation'] == 'stable':
+            if 'inp_norm_t' not in params:
+                # Only used in stable (max-normalized) implementation
+                input_t = node.get_input_variable().type.precision
+                width, iwidth, signed = input_t.width, input_t.integer, input_t.signed  # noqa: F841
+                width, iwidth = width - signed, iwidth - signed
+                if signed:
+                    # Fix table size if too large. NOTE(review): reads inv_table_size, not exp_table_size - confirm
+                    exp_table_size = params['inv_table_size']
+                    params['exp_table_size'] = str(min(int(exp_table_size), 2**width))
+                params['inp_norm_t_str'] = f'ac_ufixed<{width}, {iwidth}>'
+            else:
+                params['inp_norm_t_str'] = params['inp_norm_t'].name  # type: ignore
+        else:
+            params['inp_norm_t_str'] = 'ac_fixed<1,0>'
+
+        return self.template.format(**params)
+
+
+class SoftmaxFunctionTemplate(FunctionCallTemplate):
+    def __init__(self):
+        super().__init__(Softmax, include_header=activ_include_list)
+        self.template = activ_function_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        use_multidim = node.get_attr('n_inner', 1) > 1 or node.get_attr('n_outer', 1) > 1
+        use_multidim = use_multidim and node.model.config.get_config_value('IOType') == 'io_parallel'
+        params['activation'] = 'softmax' if not use_multidim else 'softmax_multidim'
+        params['config'] = f'softmax_config{node.index}'  # assumes config struct {type} is 'softmax' - TODO confirm
+
+        return self.template.format(**params)
+
 
 class ActivationFunctionTemplate(FunctionCallTemplate):
     def __init__(self):
-        super().__init__((Activation, HardActivation, Softmax), include_header=activ_include_list)
+        super().__init__((Activation, HardActivation), include_header=activ_include_list)
         self.template = activ_function_template
 
     def format(self, node):
diff --git a/hls4ml/converters/keras_v3/hgq2/multi_head_attention.py b/hls4ml/converters/keras_v3/hgq2/multi_head_attention.py
index 24bd87d3e9..d5c1eda7b9 100644
--- a/hls4ml/converters/keras_v3/hgq2/multi_head_attention.py
+++ b/hls4ml/converters/keras_v3/hgq2/multi_head_attention.py
@@ -15,7 +15,7 @@
 
 @register
 class QMultiHeadAttentionHandler(QLayerHandler):
-    handles = ('hgq.layers.multi_head_attention.QMultiHeadAttention',)
+    handles = ('hgq.layers.attn.mha.QMultiHeadAttention',)
 
     def handle(
         self,
@@ -129,7 +129,7 @@ def _handle(self, layer, tensor_q, tensor_O, node_index, tensor_k, tensor_v):
 
 @register
 class QLinformerAttentionHandler(QMultiHeadAttentionHandler):
-    handles = ('hgq.layers.linformer_attention.QLinformerAttention',)
+    handles = ('hgq.layers.attn.linformer.QLinformerAttention',)
 
     def handle(
         self,
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h
index f118ecb05c..c2353c34a8 100644
--- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h
@@ -100,15 +100,8 @@ template <class data_T, class res_T, typename CONFIG_T> void sigmoid(const data_
 enum class softmax_implementation { latency = 0, legacy = 1, stable = 2, argmax = 3 };
 
 template <class data_T, typename CONFIG_T> inline unsigned softmax_stable_idx_from_real_val(const data_T x) {
-    // Number of address bits for table
-    static constexpr int N = ceillog2<CONFIG_T::table_size>::val;
-
-    // Slice the top N bits of the input
-    [[intel::fpga_register]] ac_int<N, false> y = x.template slc<N>(x.width - N - 1);
-    // If x is the most negative value, the slice will be 0, so we need to set the 0-th bit to ensure correctness
-    if (x != 0 && y == 0)
-        y[0] = 1;
-    return y.to_uint();
+    // Extract the lower 'width' bits of x
+    return x.template slc<data_T::width>(0).to_uint();
 }
 
 template <class data_T, typename CONFIG_T> inline unsigned softmax_latency_idx_from_real_val(const data_T x) {
@@ -121,7 +114,6 @@ template <class data_T, typename CONFIG_T> inline unsigned softmax_latency_idx_f
 }
 
 template <class data_T, class res_T, typename CONFIG_T> void softmax_stable(const data_T &data, res_T &res) {
-// Look-up tables
 #include "activation_tables/exp_table.tb"
 #include "activation_tables/invert_table.tb"
 
@@ -130,29 +122,34 @@ template <class data_T, class res_T, typename CONFIG_T> void softmax_stable(cons
     [[intel::fpga_register]] auto x_max =
         reduce<typename data_T::value_type, CONFIG_T::n_in, Op_max<typename data_T::value_type>>(data.data(), op_max);
 
-    // For the diffs, use the same type as the input but force rounding and saturation
-    [[intel::fpga_register]] ac_fixed<data_T::value_type::width, data_T::value_type::i_width, true, AC_RND, AC_SAT>
-        d_xi_xmax[CONFIG_T::n_in];
+    // Normalize inputs: d = x_max - x
+    [[intel::fpga_register]] typename CONFIG_T::inp_norm_t d_xi_xmax[CONFIG_T::n_in];
     #pragma unroll
     for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
-        d_xi_xmax[i] = data[i] - x_max;
+        // HGQ stable: d = x_max - data
+        d_xi_xmax[i] = x_max - data[i];
     }
 
-    // Calculate all the e^x's
+    // Exponentials
     [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in];
     #pragma unroll
     for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
-        exp_res[i] = exp_table[softmax_stable_idx_from_real_val<typename data_T::value_type, CONFIG_T>(d_xi_xmax[i])];
+        unsigned idx = softmax_stable_idx_from_real_val<typename CONFIG_T::inp_norm_t, CONFIG_T>(d_xi_xmax[i]);
+        exp_res[i] = exp_table[idx];
     }
 
-    // Explicitly sum previously calculated exponentials with an adder tree
-    Op_add<typename CONFIG_T::exp_table_t> op_add;
-    [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum =
-        reduce<typename CONFIG_T::exp_table_t, CONFIG_T::n_in, Op_add<typename CONFIG_T::exp_table_t>>(exp_res, op_add);
+    // Sum of Exponentials
+    Op_add<typename CONFIG_T::accum_t> op_add;
+    [[intel::fpga_register]] typename CONFIG_T::accum_t exp_sum =
+        reduce<typename CONFIG_T::exp_table_t, CONFIG_T::n_in, Op_add<typename CONFIG_T::accum_t>>(exp_res, op_add);
 
-    // Multiply previously calculated exponetials with the reciprocal of the sum
-    [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum =
-        invert_table[softmax_stable_idx_from_real_val<typename CONFIG_T::exp_table_t, CONFIG_T>(exp_sum)];
+    // Reciprocal of Sum
+    typename CONFIG_T::inv_inp_t exp_sum_cast = exp_sum;
+    unsigned inv_idx = softmax_stable_idx_from_real_val<typename CONFIG_T::inv_inp_t, CONFIG_T>(exp_sum_cast);
+
+    [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[inv_idx];
+
+    // Final Multiplication
     #pragma unroll
     for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
         res[i] = exp_res[i] * inv_exp_sum;
@@ -265,6 +262,45 @@ template <class data_T, class res_T, typename CONFIG_T> inline void softmax(cons
     }
 }
 
+// *************************************************
+//       Multidimensional Softmax
+// *************************************************
+
+// Helper to remap the config for the core softmax function
+template <class CONFIG_T> struct softmax_multidim_slice_config : CONFIG_T {
+    static constexpr unsigned n_in = CONFIG_T::n_slice;
+};
+
+template <class data_T, class res_T, typename CONFIG_T> inline void softmax_multidim(const data_T &data, res_T &res) {
+    using buffer_data_t = std::array<typename data_T::value_type, CONFIG_T::n_slice>;
+    using buffer_res_t = std::array<typename res_T::value_type, CONFIG_T::n_slice>;
+    using slice_config = softmax_multidim_slice_config<CONFIG_T>;
+    // data/res are indexed as a flattened [n_outer][n_slice][n_inner] array; softmax runs along n_slice
+    #pragma unroll
+    for (unsigned i = 0; i < CONFIG_T::n_outer; i++) {
+        #pragma unroll
+        for (unsigned k = 0; k < CONFIG_T::n_inner; k++) {
+            // Scratch buffers holding one slice for the current (i, k) pair
+            [[intel::fpga_register]] buffer_data_t buffer_in;
+            [[intel::fpga_register]] buffer_res_t buffer_out;
+
+            // Gather Phase
+            #pragma unroll
+            for (unsigned j = 0; j < CONFIG_T::n_slice; j++) {
+                unsigned idx = (i * CONFIG_T::n_slice * CONFIG_T::n_inner) + (j * CONFIG_T::n_inner) + k;
+                buffer_in[j] = data[idx];
+            }
+
+            nnet::softmax<buffer_data_t, buffer_res_t, slice_config>(buffer_in, buffer_out);
+            // Scatter Phase: write the slice back to the same positions it was gathered from
+            #pragma unroll
+            for (unsigned j = 0; j < CONFIG_T::n_slice; j++) {
+                unsigned idx = (i * CONFIG_T::n_slice * CONFIG_T::n_inner) + (j * CONFIG_T::n_inner) + k;
+                res[idx] = buffer_out[j];
+            }
+        }
+    }
+}
 // *************************************************
 //       TanH Activation
 // *************************************************
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h
index 2b65eef42b..dc76189083 100644
--- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h
@@ -152,11 +152,12 @@ void dense_rf_lt(const data_T &data, res_T &res, const typename CONFIG_T::weight
     }
 }
 template <class data_T, class res_T, typename CONFIG_T>
-void dense_resource(const data_T &data, res_T &res) {
+void dense_resource(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights,
+                    const typename CONFIG_T::bias_t &biases) {
     if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) {
-        dense_rf_lt<data_T, res_T, CONFIG_T>(data, res, CONFIG_T::weights, CONFIG_T::biases);
+        dense_rf_lt<data_T, res_T, CONFIG_T>(data, res, weights, biases);
     } else {
-        dense_rf_gt<data_T, res_T, CONFIG_T>(data, res, CONFIG_T::weights, CONFIG_T::biases);
+        dense_rf_gt<data_T, res_T, CONFIG_T>(data, res, weights, biases);
     }
 }
 } // namespace nnet
diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py
index b42ff2990f..007b645cb0 100644
--- a/hls4ml/writer/oneapi_writer.py
+++ b/hls4ml/writer/oneapi_writer.py
@@ -242,13 +242,13 @@ def write_project_header(self, model):
                     for out in model_outputs:
                         newline += out.declare_cpp()
 
-               # Insert weights
+                # Insert weights
                 elif '// hls-fpga-machine-learning insert weights' in line:
                     newline = line
                     for layer in model.get_layers():
                         for w in layer.get_weights():
-                            #if w not in model_brams:
-                            newline += f'#include "weights/{w.name}.h"\n'                        
+                            # if w not in model_brams:
+                            newline += f'#include "weights/{w.name}.h"\n'
 
                 # Simply copy line, if no inserts are required
                 else:
@@ -557,16 +557,16 @@ def write_nnet_utils(self, model):
             dstpath = f'{model.config.get_output_dir()}/src/firmware/{dst}'
             copyfile(srcpath, dstpath)
 
-    def __get_table_size(self, model, activation):
+    def __get_table_size(self, model, activation, table_name='table_size'):
         for layer in model.get_layers():
             if (
                 layer.get_attr('activation') == activation or layer.get_attr('recurrent_activation') == activation
-            ) and layer.get_attr('table_size') is not None:
-                return int(layer.get_attr('table_size'))
+            ) and layer.get_attr(table_name) is not None:
+                return int(layer.get_attr(table_name))
         return 1024
 
-    def __get_table_header(self, table_name, table_size):
-        table_header = f'static const typename CONFIG_T::table_t {table_name}[{table_size}] = {{'
+    def __get_table_header(self, table_name, table_size, table_type='table_t'):
+        table_header = f'static const typename CONFIG_T::{table_type} {table_name}[{table_size}] = {{'
         return table_header
 
     def __write_elu_table(self, model, path):
@@ -695,46 +695,58 @@ def __write_selu_table(self, model, path):
         h_file.write('};\n')
         h_file.close()
 
+    def __get_table_precision(self, model, activation, table_name='table_precision'):
+        for layer in model.get_layers():
+            if layer.get_attr('activation') == activation and layer.get_attr(table_name) is not None:
+                precision = layer.get_attr(table_name)
+                return precision.precision
+        # No matching layer: callers fall back to their hard-coded fp_bits / fp_integer / fp_signed defaults
+        return None
+
     def __write_exp_table(self, model, path):
         table_name = 'exp_table'
-        table_size = self.__get_table_size(model, 'softmax')
+        table_size = self.__get_table_size(model, 'softmax', table_name='exp_table_size')
 
         h_file = open(f'{path}/{table_name}.tb', 'w')
-        h_file.write(self.__get_table_header(table_name, table_size))
+        h_file.write(self.__get_table_header(table_name, table_size, table_type='exp_table_t'))
 
         # Default fixed point precision
         # 6 bits for integer part, 10 bits for decimal - total, 16
-        fp_bits = 16
-        fp_integer = 6
-        fp_signed = True
+        precision = self.__get_table_precision(model, 'softmax', table_name='inp_norm_t')
+
+        if precision is None:
+            fp_bits = 16
+            fp_integer = 6
+            fp_signed = True
+
+            for layer in model.get_layers():
+                if layer.name == 'softmax':
+                    ac_type = layer.get_input_variable().type
+                    if ac_type is not None:
+                        try:
+                            fp_bits = ac_type.precision.integer + ac_type.precision.fractional
+                            fp_integer = ac_type.precision.integer
+                            fp_signed = ac_type.precision.signed
+                        except Exception:
+                            # FixedPrecisionType wasn't correctly stored in layer attributes, use default values
+                            pass
+                        if fp_signed is False:
+                            raise Exception('Softmax types need to be signed')
 
-        # Exp table should use the same precision as exp_table, as seen in Vivado code
-        # init_exp_table<data_T, CONFIG_T>(exp_table);
-        for layer in model.get_layers():
-            if layer.name == 'softmax':
-                ac_type = layer.get_input_variable().type
-                if ac_type is not None:
-                    try:
-                        fp_bits = ac_type.precision.integer + ac_type.precision.fractional
-                        fp_integer = ac_type.precision.integer
-                        fp_signed = ac_type.precision.signed
-                    except Exception:
-                        # FixedPrecisionType wasn't correctly stored in layer attributes, use default values
-                        pass
-                    if fp_signed is False:
-                        raise Exception('Softmax types need to be signed')
+        else:
+            fp_bits = precision.width
+            fp_integer = precision.integer
+            fp_signed = precision.signed
 
+        f_bits = fp_bits - fp_integer
         sep = ''
-        N = ceil_log2(table_size)
         for i in range(table_size):
-            f = FixedPointEmulator(fp_bits, fp_integer, signed=fp_signed)
-            b = uint_to_binary(i, N)
-            if i == 0:
-                b.insert(0, 0)
-            else:
-                b.insert(0, 1)
-            f.set_msb_bits(b)
-            real_val = f.exp_float()
+            # Index represents the raw bit pattern of the input
+            real_val_in = i * (2.0 ** (-f_bits))
+
+            # Calculate exp(-x) for the stable implementation
+            real_val = np.exp(-real_val_in)
+
             h_file.write(sep + str(real_val))
             sep = ', '
 
@@ -743,41 +755,50 @@ def __write_exp_table(self, model, path):
 
     def __write_invert_table(self, model, path):
         table_name = 'invert_table'
-        table_size = self.__get_table_size(model, 'softmax')
+        table_size = self.__get_table_size(model, 'softmax', table_name='inv_table_size')
 
         h_file = open(f'{path}/{table_name}.tb', 'w')
-        h_file.write(self.__get_table_header(table_name, table_size))
-
+        h_file.write(self.__get_table_header(table_name, table_size, table_type='inv_table_t'))
         # Default fixed point precision, in case values from layer attributes cannot be extracted
         # 8 bits for integer part, 10 bits for decimal - total, 18
-        fp_bits = 18
-        fp_integer = 8
-        fp_signed = True
 
-        # Invert table should use the same precision as exp_table, as seen in Vivado code
-        # init_invert_table<typename CONFIG_T::exp_table_t, CONFIG_T>(invert_table);
-        for layer in model.get_layers():
-            if layer.name == 'softmax':
-                ac_type = layer.get_attr('exp_table_t')
-                if ac_type is not None:
-                    try:
-                        fp_bits = ac_type.precision.integer + ac_type.precision.fractional
-                        fp_integer = ac_type.precision.integer
-                        fp_signed = ac_type.precision.signed
-                    except Exception:
-                        # FixedPrecisionType wasn't correctly stored in layer attributes, use default values
-                        pass
-                    if fp_signed is False:
-                        raise Exception('Softmax types need to be signed')
+        precision = self.__get_table_precision(model, 'softmax', table_name='inv_inp_t')
+
+        if precision is None:
+            fp_bits = 18
+            fp_integer = 8
+            fp_signed = True
+
+            for layer in model.get_layers():
+                if layer.name == 'softmax':
+                    ac_type = layer.get_attr('exp_table_t')
+                    if ac_type is not None:
+                        try:
+                            fp_bits = ac_type.precision.integer + ac_type.precision.fractional
+                            fp_integer = ac_type.precision.integer
+                            fp_signed = ac_type.precision.signed
+                        except Exception:
+                            # FixedPrecisionType wasn't correctly stored in layer attributes, use default values
+                            pass
+                        if fp_signed is False:
+                            raise Exception('Softmax types need to be signed')
+
+        else:
+            fp_bits = precision.width
+            fp_integer = precision.integer
+            fp_signed = precision.signed
 
+        f_bits = fp_bits - fp_integer
         sep = ''
-        N = ceil_log2(table_size)
         for i in range(table_size):
-            f = FixedPointEmulator(fp_bits, fp_integer, signed=fp_signed)
-            b = uint_to_binary(i, N)
-            b.insert(0, 0)
-            f.set_msb_bits(b)
-            real_val = f.inv_float()
+            # Index represents the raw bit pattern of the input
+            real_val_in = i * (2.0 ** (-f_bits))
+
+            if real_val_in == 0:
+                real_val = 999.0
+            else:
+                real_val = 1.0 / real_val_in
+
             h_file.write(sep + str(real_val))
             sep = ', '
 

From dbb207b7a5c1f343d8100bba9645340a2098730c Mon Sep 17 00:00:00 2001
From: laurilaatu <l.laatu@imperial.ac.uk>
Date: Mon, 9 Feb 2026 16:33:38 +0000
Subject: [PATCH 04/12] Original weight implementation

---
 .../backends/oneapi/passes/core_templates.py  | 91 +------------------
 1 file changed, 3 insertions(+), 88 deletions(-)

diff --git a/hls4ml/backends/oneapi/passes/core_templates.py b/hls4ml/backends/oneapi/passes/core_templates.py
index 5a2d765e8f..9602b2d0fc 100644
--- a/hls4ml/backends/oneapi/passes/core_templates.py
+++ b/hls4ml/backends/oneapi/passes/core_templates.py
@@ -6,7 +6,6 @@
 # Dense templates
 
 dense_config_template = """struct config{index} : nnet::dense_config {{
-
     static constexpr unsigned n_in = {n_in};
     static constexpr unsigned n_out = {n_out};
     static constexpr unsigned io_type = nnet::{iotype};
@@ -31,16 +30,13 @@
     typedef {weight_t.name} weight_t;
     typedef {index_t.name} index_t;
 
-    static constexpr weight_t weights = {weights};
-    static constexpr bias_t biases = {biases};
-
     template<class x_T, class y_T>
     using product = nnet::product::{product_type}<x_T, y_T>;
 }};\n"""
 
 dense_function_template = 'nnet::dense_{strategy}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'
 dense_task_sequence_template = 'task_sequence<nnet::dense_{strategy}_stream<{input_pipe}, {output_pipe}, {config}>> {name};'
-dense_stream_function_template = '{name}.async();'
+dense_stream_function_template = '{name}.async({w}, {b});'
 dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_stream.h']
 
 
@@ -57,9 +53,6 @@ def format(self, node):
             node.get_input_variable().type.precision, node.get_weights('weight').type.precision
         )
 
-        params['weights'] = node.get_weights('weight').name
-        params['biases'] = node.get_weights('bias').name
-
         return self.template.format(**params)
 
 
@@ -199,7 +192,7 @@ def format(self, node):
     static constexpr unsigned reuse_factor = {reuse};
 }};\n"""
 
-softmax_config_template_qkeras = """struct {type}_config{index} : nnet::activ_config {{
+softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{
     static constexpr unsigned n_in = {n_in};
     static constexpr unsigned table_size = {table_size};
     static constexpr unsigned io_type = nnet::{iotype};
@@ -209,26 +202,6 @@ def format(self, node):
     typedef {inv_table_t.name} inv_table_t;
 }};\n"""
 
-softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{
-    static const unsigned n_in = {n_in};
-    static const unsigned n_slice = {n_slice};
-    static const unsigned n_outer = {n_outer};
-    static const unsigned n_inner = {n_inner};
-    static const unsigned parallelization_factor = {parallelization_factor};
-    static const unsigned exp_table_size = {exp_table_size};
-    static const unsigned inv_table_size = {inv_table_size};
-    static const unsigned io_type = nnet::{iotype};
-    static const unsigned reuse_factor = {reuse};
-    static const unsigned axis = {axis};
-    static const nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation};
-    static constexpr float exp_scale = {exp_scale};
-    typedef {exp_table_t.name} exp_table_t;
-    typedef {inv_table_t.name} inv_table_t;
-    typedef {accum_t.name} accum_t;
-    typedef {inv_inp_t.name} inv_inp_t;
-    typedef {inp_norm_t_str} inp_norm_t;
-}};\n"""
-
 activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});'
 param_activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {param}, {output});'
 
@@ -280,68 +253,10 @@ def __init__(self):
         super(ActivationConfigTemplate, self).__init__(Softmax)  # Skip ActivationConfigTemplate's __init__
         self.template = softmax_config_template
 
-    def format(self, node):
-        from math import ceil, log2
-
-        params = self._default_config_params(node)
-        params['type'] = node.get_attr('activation')
-        params.setdefault('exp_table_size', params['table_size'])
-        params.setdefault('inv_table_size', params['table_size'])
-        params.setdefault('n_inner', 1)
-        params.setdefault('n_outer', 1)
-        params.setdefault('exp_scale', 1.0)
-        params.setdefault('parallelization_factor', -1)
-
-        n_slice = params['n_in'] // params['n_inner'] // params['n_outer']  # type: ignore
-        params['n_slice'] = n_slice
-
-        if params['accum_t'].name == 'model_default_t':  # type: ignore
-            scale = ceil(log2(n_slice))
-            exp_table_t = node.attributes['exp_table_t'].precision
-            signed, width, integers = exp_table_t.signed, exp_table_t.width, exp_table_t.integer
-            params['accum_t_str'] = f'ac_{"" if signed else "u"}fixed<{width + scale}, {integers + scale}>'
-        else:
-            params['accum_t_str'] = params['accum_t'].name  # type: ignore
-        if params['inv_inp_t'].name == 'model_default_t':  # type: ignore
-            params['inv_inp_t'] = params['exp_table_t']
-
-        if params['implementation'] == 'stable':
-            if 'inp_norm_t' not in params:
-                # Only used in stable (max-normalized) implementation
-                input_t = node.get_input_variable().type.precision
-                width, iwidth, signed = input_t.width, input_t.integer, input_t.signed  # noqa: F841
-                width, iwidth = width - signed, iwidth - signed
-                if signed:
-                    # Fix table size if too large
-                    exp_table_size = params['inv_table_size']
-                    params['exp_table_size'] = str(min(int(exp_table_size), 2**width))
-                params['inp_norm_t_str'] = f'ac_ufixed<{width}, {iwidth}>'
-            else:
-                params['inp_norm_t_str'] = params['inp_norm_t'].name  # type: ignore
-        else:
-            params['inp_norm_t_str'] = 'ac_fixed<1,0>'
-
-        return self.template.format(**params)
-
-
-class SoftmaxFunctionTemplate(FunctionCallTemplate):
-    def __init__(self):
-        super().__init__(Softmax, include_header=activ_include_list)
-        self.template = activ_function_template
-
-    def format(self, node):
-        params = self._default_function_params(node)
-        use_multidim = node.get_attr('n_inner', 1) > 1 or node.get_attr('n_outer', 1) > 1
-        use_multidim = use_multidim and node.model.config.get_config_value('IOType') == 'io_parallel'
-        params['activation'] = 'softmax' if not use_multidim else 'softmax_multidim'
-        params['config'] = f'softmax_config{node.index}'
-
-        return self.template.format(**params)
-
 
 class ActivationFunctionTemplate(FunctionCallTemplate):
     def __init__(self):
-        super().__init__((Activation, HardActivation), include_header=activ_include_list)
+        super().__init__((Activation, HardActivation, Softmax), include_header=activ_include_list)
         self.template = activ_function_template
 
     def format(self, node):

From 51efff0c34744ab2fa70d7e3a52fdbf196ffcf0a Mon Sep 17 00:00:00 2001
From: laurilaatu <l.laatu@imperial.ac.uk>
Date: Mon, 9 Feb 2026 16:51:19 +0000
Subject: [PATCH 05/12] Restore oneAPI weight placement

---
 hls4ml/templates/oneapi/firmware/myproject.cpp | 5 +----
 hls4ml/templates/oneapi/firmware/myproject.h   | 3 ---
 hls4ml/writer/oneapi_writer.py                 | 7 -------
 3 files changed, 1 insertion(+), 14 deletions(-)

diff --git a/hls4ml/templates/oneapi/firmware/myproject.cpp b/hls4ml/templates/oneapi/firmware/myproject.cpp
index da9439f74a..06e7d3fe37 100644
--- a/hls4ml/templates/oneapi/firmware/myproject.cpp
+++ b/hls4ml/templates/oneapi/firmware/myproject.cpp
@@ -1,12 +1,9 @@
 #include "myproject.h"
+#include "parameters.h"
 #include <sycl/ext/intel/experimental/task_sequence.hpp>
 
 // hls-fpga-machine-learning insert weights
 
-
-#include "parameters.h"
-
-
 // The inter-task pipes need to be declared in the global scope
 // hls-fpga-machine-learning insert inter-task pipes
 
diff --git a/hls4ml/templates/oneapi/firmware/myproject.h b/hls4ml/templates/oneapi/firmware/myproject.h
index 8f313ea30f..082ae5dc8c 100644
--- a/hls4ml/templates/oneapi/firmware/myproject.h
+++ b/hls4ml/templates/oneapi/firmware/myproject.h
@@ -3,9 +3,6 @@
 
 #include "defines.h"
 
-// hls-fpga-machine-learning insert weights
-
-
 // This file defines the interface to the kernel
 
 // currently this is fixed
diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py
index 007b645cb0..8ef2b0b0a1 100644
--- a/hls4ml/writer/oneapi_writer.py
+++ b/hls4ml/writer/oneapi_writer.py
@@ -242,13 +242,6 @@ def write_project_header(self, model):
                     for out in model_outputs:
                         newline += out.declare_cpp()
 
-                # Insert weights
-                elif '// hls-fpga-machine-learning insert weights' in line:
-                    newline = line
-                    for layer in model.get_layers():
-                        for w in layer.get_weights():
-                            # if w not in model_brams:
-                            newline += f'#include "weights/{w.name}.h"\n'
 
                 # Simply copy line, if no inserts are required
                 else:

From 6067bea99e35fd0bb3b2d89323e721e3916b0960 Mon Sep 17 00:00:00 2001
From: laurilaatu <l.laatu@imperial.ac.uk>
Date: Mon, 9 Feb 2026 16:52:42 +0000
Subject: [PATCH 06/12] pre-commit

---
 hls4ml/writer/oneapi_writer.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py
index 8ef2b0b0a1..b945f3faf9 100644
--- a/hls4ml/writer/oneapi_writer.py
+++ b/hls4ml/writer/oneapi_writer.py
@@ -242,7 +242,6 @@ def write_project_header(self, model):
                     for out in model_outputs:
                         newline += out.declare_cpp()
 
-
                 # Simply copy line, if no inserts are required
                 else:
                     newline = line

From 16ca197d57e0d72485265cf25e170ee7fc576280 Mon Sep 17 00:00:00 2001
From: laurilaatu <l.laatu@imperial.ac.uk>
Date: Tue, 24 Feb 2026 16:40:32 +0000
Subject: [PATCH 07/12] softmax multidim templates

---
 .../backends/oneapi/passes/core_templates.py  | 74 ++++++++++++++++---
 1 file changed, 64 insertions(+), 10 deletions(-)

diff --git a/hls4ml/backends/oneapi/passes/core_templates.py b/hls4ml/backends/oneapi/passes/core_templates.py
index 9602b2d0fc..8205fb1e17 100644
--- a/hls4ml/backends/oneapi/passes/core_templates.py
+++ b/hls4ml/backends/oneapi/passes/core_templates.py
@@ -1,3 +1,5 @@
+from math import ceil, log2
+
 from hls4ml.backends.backend import get_backend
 from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate
 from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate
@@ -192,14 +194,24 @@ def format(self, node):
     static constexpr unsigned reuse_factor = {reuse};
 }};\n"""
 
+
 softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{
-    static constexpr unsigned n_in = {n_in};
-    static constexpr unsigned table_size = {table_size};
-    static constexpr unsigned io_type = nnet::{iotype};
-    static constexpr unsigned reuse_factor = {reuse};
-    static constexpr nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation};
+    static const unsigned n_in = {n_in};
+    static const unsigned n_slice = {n_slice};
+    static const unsigned n_outer = {n_outer};
+    static const unsigned n_inner = {n_inner};
+    static const unsigned parallelization_factor = {parallelization_factor};
+    static const unsigned exp_table_size = {exp_table_size};
+    static const unsigned inv_table_size = {inv_table_size};
+    static const unsigned io_type = nnet::{iotype};
+    static const unsigned reuse_factor = {reuse};
+    static const unsigned axis = {axis};
+    static const nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation};
+    static constexpr float exp_scale = {exp_scale};
     typedef {exp_table_t.name} exp_table_t;
     typedef {inv_table_t.name} inv_table_t;
+    //typedef {accum_t.name} accum_t;
+    //typedef {inp_norm_t_str} inp_norm_t;
 }};\n"""
 
 activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});'
@@ -253,6 +265,48 @@ def __init__(self):
         super(ActivationConfigTemplate, self).__init__(Softmax)  # Skip ActivationConfigTemplate's __init__
         self.template = softmax_config_template
 
+    def format(self, node):
+        params = self._default_config_params(node)
+        params['type'] = node.get_attr('activation')
+        params.setdefault('exp_table_size', params['table_size'])
+        params.setdefault('inv_table_size', params['table_size'])
+        params.setdefault('n_inner', 1)
+        params.setdefault('n_outer', 1)
+        params.setdefault('exp_scale', 1.0)
+        params.setdefault('parallelization_factor', -1)
+
+        n_slice = params['n_in'] // params['n_inner'] // params['n_outer']  # type: ignore
+        params['n_slice'] = n_slice
+
+        if params['accum_t'].name == 'model_default_t':  # type: ignore
+            scale = ceil(log2(n_slice))
+            exp_table_t = node.attributes['exp_table_t'].precision
+            signed, width, integers = exp_table_t.signed, exp_table_t.width, exp_table_t.integer
+            params['accum_t_str'] = f'ac_fixed<{width + scale}, {integers + scale}, {"true" if signed else "false"}>'
+        else:
+            params['accum_t_str'] = params['accum_t'].name  # type: ignore
+        if params['inv_inp_t'].name == 'model_default_t':  # type: ignore
+            params['inv_inp_t'] = params['exp_table_t']
+
+        if params['implementation'] == 'stable':
+            if 'inp_norm_t' not in params:
+                # Only used in stable (max-normalized) implementation
+                input_t = node.get_input_variable().type.precision
+                width, iwidth, signed = input_t.width, input_t.integer, input_t.signed  # noqa: F841
+                width, iwidth = width - signed, iwidth - signed
+                if signed:
+                    # Fix table size if too large
+                    exp_table_size = params['inv_table_size']
+                    params['exp_table_size'] = str(min(int(exp_table_size), 2**width))
+                params['inp_norm_t_str'] = f'ac_fixed<{width}, {iwidth}, false>'
+            else:
+                params['inp_norm_t_str'] = params['inp_norm_t'].name  # type: ignore
+        else:
+            params['inp_norm_t_str'] = 'ac_fixed<2,0>'
+
+        return self.template.format(**params)
+
+
 
 class ActivationFunctionTemplate(FunctionCallTemplate):
     def __init__(self):
@@ -262,7 +316,7 @@ def __init__(self):
     def format(self, node):
         params = self._default_function_params(node)
         params['activation'] = node.get_attr('activation').lower()
-        params['config'] = f'{node.get_attr("activation")}_config{node.index}'
+        params['config'] = f"{node.get_attr('activation')}_config{node.index}"
 
         return self.template.format(**params)
 
@@ -276,7 +330,7 @@ def format(self, node):
         params = self._default_function_params(node)
         params['activation'] = node._get_act_function_name()
         params['param'] = node.get_attr('activ_param', 1.0)
-        params['config'] = f'{node.get_attr("activation")}_config{node.index}'
+        params['config'] = f"{node.get_attr('activation')}_config{node.index}"
 
         return self.template.format(**params)
 
@@ -290,7 +344,7 @@ def format(self, node):
         params = self._default_function_params(node)
         params['activation'] = node.get_attr('activation').lower()
         params['param'] = node.get_weights('param').name
-        params['config'] = f'{node.get_attr("activation")}_config{node.index}'
+        params['config'] = f"{node.get_attr('activation')}_config{node.index}"
 
         return self.template.format(**params)
 
@@ -303,7 +357,7 @@ def __init__(self):
     def format(self, node):
         params = self._default_function_params(node)
         params['activation'] = node.get_attr('activation').lower()
-        params['config'] = f'{node.get_attr("activation")}_config{node.index}'
+        params['config'] = f"{node.get_attr('activation')}_config{node.index}"
         return self.template.format(**params)
 
 
@@ -315,7 +369,7 @@ def __init__(self):
     def format(self, node):
         params = self._default_function_params(node)
         params['activation'] = node._get_act_function_name()
-        params['config'] = f'{node.get_attr("activation")}_config{node.index}'
+        params['config'] = f"{node.get_attr('activation')}_config{node.index}"
         return self.template.format(**params)
 
 

From 974e75a3962de9afa9b832d9c6d97edf85659a4f Mon Sep 17 00:00:00 2001
From: laurilaatu <l.laatu@imperial.ac.uk>
Date: Tue, 24 Feb 2026 16:43:13 +0000
Subject: [PATCH 08/12] pre-commit

---
 hls4ml/backends/oneapi/passes/core_templates.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/hls4ml/backends/oneapi/passes/core_templates.py b/hls4ml/backends/oneapi/passes/core_templates.py
index 8205fb1e17..4fae515efc 100644
--- a/hls4ml/backends/oneapi/passes/core_templates.py
+++ b/hls4ml/backends/oneapi/passes/core_templates.py
@@ -307,7 +307,6 @@ def format(self, node):
         return self.template.format(**params)
 
 
-
 class ActivationFunctionTemplate(FunctionCallTemplate):
     def __init__(self):
         super().__init__((Activation, HardActivation, Softmax), include_header=activ_include_list)
@@ -316,7 +315,7 @@ def __init__(self):
     def format(self, node):
         params = self._default_function_params(node)
         params['activation'] = node.get_attr('activation').lower()
-        params['config'] = f"{node.get_attr('activation')}_config{node.index}"
+        params['config'] = f'{node.get_attr("activation")}_config{node.index}'
 
         return self.template.format(**params)
 
@@ -330,7 +329,7 @@ def format(self, node):
         params = self._default_function_params(node)
         params['activation'] = node._get_act_function_name()
         params['param'] = node.get_attr('activ_param', 1.0)
-        params['config'] = f"{node.get_attr('activation')}_config{node.index}"
+        params['config'] = f'{node.get_attr("activation")}_config{node.index}'
 
         return self.template.format(**params)
 
@@ -344,7 +343,7 @@ def format(self, node):
         params = self._default_function_params(node)
         params['activation'] = node.get_attr('activation').lower()
         params['param'] = node.get_weights('param').name
-        params['config'] = f"{node.get_attr('activation')}_config{node.index}"
+        params['config'] = f'{node.get_attr("activation")}_config{node.index}'
 
         return self.template.format(**params)
 
@@ -357,7 +356,7 @@ def __init__(self):
     def format(self, node):
         params = self._default_function_params(node)
         params['activation'] = node.get_attr('activation').lower()
-        params['config'] = f"{node.get_attr('activation')}_config{node.index}"
+        params['config'] = f'{node.get_attr("activation")}_config{node.index}'
         return self.template.format(**params)
 
 
@@ -369,7 +368,7 @@ def __init__(self):
     def format(self, node):
         params = self._default_function_params(node)
         params['activation'] = node._get_act_function_name()
-        params['config'] = f"{node.get_attr('activation')}_config{node.index}"
+        params['config'] = f'{node.get_attr("activation")}_config{node.index}'
         return self.template.format(**params)
 
 

From 060c398933705518e10007b80a1d5dd4bc1a5b10 Mon Sep 17 00:00:00 2001
From: laurilaatu <l.laatu@imperial.ac.uk>
Date: Tue, 24 Feb 2026 17:07:48 +0000
Subject: [PATCH 09/12] uncomment

---
 hls4ml/backends/oneapi/passes/core_templates.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hls4ml/backends/oneapi/passes/core_templates.py b/hls4ml/backends/oneapi/passes/core_templates.py
index 4fae515efc..21dcc69490 100644
--- a/hls4ml/backends/oneapi/passes/core_templates.py
+++ b/hls4ml/backends/oneapi/passes/core_templates.py
@@ -210,8 +210,8 @@ def format(self, node):
     static constexpr float exp_scale = {exp_scale};
     typedef {exp_table_t.name} exp_table_t;
     typedef {inv_table_t.name} inv_table_t;
-    //typedef {accum_t.name} accum_t;
-    //typedef {inp_norm_t_str} inp_norm_t;
+    typedef {accum_t.name} accum_t;
+    typedef {inp_norm_t_str} inp_norm_t;
 }};\n"""
 
 activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});'

From 772b93ad31d680e2df79ee756dbef06a5e783284 Mon Sep 17 00:00:00 2001
From: laurilaatu <l.laatu@imperial.ac.uk>
Date: Wed, 25 Feb 2026 17:16:51 +0000
Subject: [PATCH 10/12] inv_inp_t to config

---
 hls4ml/backends/oneapi/passes/core_templates.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hls4ml/backends/oneapi/passes/core_templates.py b/hls4ml/backends/oneapi/passes/core_templates.py
index 21dcc69490..f8d1b573d5 100644
--- a/hls4ml/backends/oneapi/passes/core_templates.py
+++ b/hls4ml/backends/oneapi/passes/core_templates.py
@@ -211,6 +211,7 @@ def format(self, node):
     typedef {exp_table_t.name} exp_table_t;
     typedef {inv_table_t.name} inv_table_t;
     typedef {accum_t.name} accum_t;
+    typedef {inv_inp_t.name} inv_inp_t;
     typedef {inp_norm_t_str} inp_norm_t;
 }};\n"""
 

From c3a45848a04dd98c0124b7ec8ae33545ab28e680 Mon Sep 17 00:00:00 2001
From: bugracyln <bugraceylan2002@hotmail.com>
Date: Mon, 13 Apr 2026 02:26:17 +0100
Subject: [PATCH 11/12] softmax fixed

---
 .../backends/oneapi/passes/core_templates.py  | 132 ++++++++-------
 .../firmware/nnet_utils/nnet_activation.h     |  50 +++---
 .../nnet_utils/nnet_activation_stream.h       |  57 ++++---
 hls4ml/templates/oneapi/firmware/parameters.h |   2 +
 hls4ml/writer/oneapi_writer.py                | 156 ++++++++++--------
 5 files changed, 220 insertions(+), 177 deletions(-)

diff --git a/hls4ml/backends/oneapi/passes/core_templates.py b/hls4ml/backends/oneapi/passes/core_templates.py
index f8d1b573d5..c6050dfb57 100644
--- a/hls4ml/backends/oneapi/passes/core_templates.py
+++ b/hls4ml/backends/oneapi/passes/core_templates.py
@@ -1,9 +1,10 @@
-from math import ceil, log2
-
 from hls4ml.backends.backend import get_backend
 from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate
 from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate
+from hls4ml.model.types import FixedPrecisionType, RoundingMode, SaturationMode
 from hls4ml.model.layers import Activation, BatchNormalization, Dense, HardActivation, ParametrizedActivation, PReLU, Softmax
+from hls4ml.utils.fixed_point_utils import FixedPointEmulator, ceil_log2, uint_to_binary
+import numpy as np
 
 # Dense templates
 
@@ -194,25 +195,28 @@ def format(self, node):
     static constexpr unsigned reuse_factor = {reuse};
 }};\n"""
 
-
 softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{
-    static const unsigned n_in = {n_in};
-    static const unsigned n_slice = {n_slice};
-    static const unsigned n_outer = {n_outer};
-    static const unsigned n_inner = {n_inner};
-    static const unsigned parallelization_factor = {parallelization_factor};
-    static const unsigned exp_table_size = {exp_table_size};
-    static const unsigned inv_table_size = {inv_table_size};
-    static const unsigned io_type = nnet::{iotype};
-    static const unsigned reuse_factor = {reuse};
-    static const unsigned axis = {axis};
-    static const nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation};
-    static constexpr float exp_scale = {exp_scale};
+    static constexpr unsigned n_in = {n_in};
+    static constexpr unsigned exp_table_size = {exp_table_size};
+    static constexpr unsigned inv_table_size = {inv_table_size};
+    static constexpr unsigned io_type = nnet::{iotype};
+    static constexpr unsigned reuse_factor = {reuse};
+    static constexpr nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation};
     typedef {exp_table_t.name} exp_table_t;
-    typedef {inv_table_t.name} inv_table_t;
-    typedef {accum_t.name} accum_t;
+    typedef {inv_table_t.name} inv_table_t;"""
+
+softmax_config_table_template = """
+
+    static constexpr const exp_table_t *exp_table = &{exp_table_name}[0];
+    static constexpr const inv_table_t *invert_table = &{inv_table_name}[0];
+}};\n"""
+
+softmax_config_table_template_stable = """  
     typedef {inv_inp_t.name} inv_inp_t;
-    typedef {inp_norm_t_str} inp_norm_t;
+    typedef {inp_norm_t.name} inp_norm_t;
+
+    static constexpr const exp_table_t *exp_table = &{exp_table_name}[0];
+    static constexpr const inv_table_t *invert_table = &{inv_table_name}[0];
 }};\n"""
 
 activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});'
@@ -233,7 +237,58 @@ def __init__(self):
     def format(self, node):
+        """Format the config struct for ``node``; softmax also gets its table sizes, types and pointers.
+
+        ``self.template`` is not modified, so the struct tail is not appended again on later calls.
+        """
         params = self._default_config_params(node)
         params['type'] = node.get_attr('activation')
+
+        if params['type'] == 'softmax':
+            if 'exp_table_size' in params:
+                params['exp_table_size'] //= 2
+            else:
+                params['exp_table_size'] = 1024
+                params['exp_table_t'].precision.width = ceil_log2(params['exp_table_size'])
+                params['exp_table_t'].precision.integer = 3
+                params['exp_table_t'].precision.signed = False
+
+            if 'inp_norm_t' not in params:
+                input_t = node.get_input_variable().type.precision
+                width, iwidth, signed = input_t.width, input_t.integer, input_t.signed  # noqa: F841
+                width, iwidth = width - signed, iwidth - signed
+                import copy
+
+                params['inp_norm_t'] = copy.deepcopy(params['exp_table_t'])  # start from a copy; overridden below
+                # Without an explicit inv_table_size, derive the precision from exp_table_t (one more bit, signed).
+                if 'inv_table_size' not in params:
+                    params['inp_norm_t'].precision.width = params['exp_table_t'].precision.width + 1
+                    params['inp_norm_t'].precision.integer = params['exp_table_t'].precision.integer + 1
+                    params['inp_norm_t'].precision.signed = True
+                    params['inp_norm_t'].name = f'{node.name}_inp_norm_t'
+                else:
+                    # Double quotes inside: reusing the outer quote in an f-string needs Python >= 3.12.
+                    params['inp_norm_t'].name = f'ac_fixed<{width},{iwidth},{"true" if signed else "false"},AC_RND,AC_SAT_SYM>'
+                node.set_attr('inp_norm_t', params['inp_norm_t'])
+
+            if 'inv_table_size' in params:
+                params['inv_table_size'] //= 2
+            else:
+                params['inv_table_size'] = 1024
+                params['inv_table_t'].precision.width = ceil_log2(params['inv_table_size'])
+                params['inv_table_t'].precision.integer = 3
+                params['inv_table_t'].precision.signed = False
+                params['inv_inp_t'].precision.width = params['inv_table_t'].precision.width + 1
+                params['inv_inp_t'].precision.integer = params['inv_table_t'].precision.integer + 1
+                params['inv_inp_t'].precision.signed = True
+
+            # Pick the struct tail per call instead of appending to the shared self.template.
+            if params['implementation'] == 'stable':
+                template = self.template + softmax_config_table_template_stable
+            else:
+                template = self.template + softmax_config_table_template
+            params['exp_table_name'] = node.name + '_exp_table'
+            params['inv_table_name'] = node.name + '_inv_table'
+            return template.format(**params)
 
         return self.template.format(**params)
 
 
@@ -266,47 +321,6 @@ def __init__(self):
         super(ActivationConfigTemplate, self).__init__(Softmax)  # Skip ActivationConfigTemplate's __init__
         self.template = softmax_config_template
 
-    def format(self, node):
-        params = self._default_config_params(node)
-        params['type'] = node.get_attr('activation')
-        params.setdefault('exp_table_size', params['table_size'])
-        params.setdefault('inv_table_size', params['table_size'])
-        params.setdefault('n_inner', 1)
-        params.setdefault('n_outer', 1)
-        params.setdefault('exp_scale', 1.0)
-        params.setdefault('parallelization_factor', -1)
-
-        n_slice = params['n_in'] // params['n_inner'] // params['n_outer']  # type: ignore
-        params['n_slice'] = n_slice
-
-        if params['accum_t'].name == 'model_default_t':  # type: ignore
-            scale = ceil(log2(n_slice))
-            exp_table_t = node.attributes['exp_table_t'].precision
-            signed, width, integers = exp_table_t.signed, exp_table_t.width, exp_table_t.integer
-            params['accum_t_str'] = f'ac_fixed<{width + scale}, {integers + scale}, {"true" if signed else "false"}>'
-        else:
-            params['accum_t_str'] = params['accum_t'].name  # type: ignore
-        if params['inv_inp_t'].name == 'model_default_t':  # type: ignore
-            params['inv_inp_t'] = params['exp_table_t']
-
-        if params['implementation'] == 'stable':
-            if 'inp_norm_t' not in params:
-                # Only used in stable (max-normalized) implementation
-                input_t = node.get_input_variable().type.precision
-                width, iwidth, signed = input_t.width, input_t.integer, input_t.signed  # noqa: F841
-                width, iwidth = width - signed, iwidth - signed
-                if signed:
-                    # Fix table size if too large
-                    exp_table_size = params['inv_table_size']
-                    params['exp_table_size'] = str(min(int(exp_table_size), 2**width))
-                params['inp_norm_t_str'] = f'ac_fixed<{width}, {iwidth}, false>'
-            else:
-                params['inp_norm_t_str'] = params['inp_norm_t'].name  # type: ignore
-        else:
-            params['inp_norm_t_str'] = 'ac_fixed<2,0>'
-
-        return self.template.format(**params)
-
 
 class ActivationFunctionTemplate(FunctionCallTemplate):
     def __init__(self):
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h
index c2353c34a8..385457204d 100644
--- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h
@@ -99,11 +99,21 @@ template <class data_T, class res_T, typename CONFIG_T> void sigmoid(const data_
 
 enum class softmax_implementation { latency = 0, legacy = 1, stable = 2, argmax = 3 };
 
-template <class data_T, typename CONFIG_T> inline unsigned softmax_stable_idx_from_real_val(const data_T x) {
-    // Extract the lower 'width' bits of x
-    return x.template slc<data_T::width>(0).to_uint();
+
+template <class data_T, unsigned table_size> inline unsigned softmax_stable_idx_from_real_val(const data_T x) {
+    // Number of address bits for table
+    static constexpr int N = ceillog2<table_size>::val;
+
+    // Slice the N bits directly below the MSB (the sign bit for signed types), i.e. bits [width-2 : width-N-1]
+    [[intel::fpga_register]] ac_int<N, false> y = x.template slc<N>(x.width - N - 1);
+    
+    // If x is the most negative value, the slice will be 0, so we need to set the 0-th bit to ensure correctness
+    if (x != 0 && y == 0)
+        y[0] = 1;
+    return y.to_uint();
 }
 
+
 template <class data_T, typename CONFIG_T> inline unsigned softmax_latency_idx_from_real_val(const data_T x) {
     // Number of address bits for table
     static constexpr int N = ceillog2<CONFIG_T::table_size>::val;
@@ -113,49 +123,45 @@ template <class data_T, typename CONFIG_T> inline unsigned softmax_latency_idx_f
     return y.to_uint();
 }
 
+
 template <class data_T, class res_T, typename CONFIG_T> void softmax_stable(const data_T &data, res_T &res) {
-#include "activation_tables/exp_table.tb"
-#include "activation_tables/invert_table.tb"
 
     // Find maximum
     Op_max<typename data_T::value_type> op_max;
     [[intel::fpga_register]] auto x_max =
         reduce<typename data_T::value_type, CONFIG_T::n_in, Op_max<typename data_T::value_type>>(data.data(), op_max);
 
-    // Normalize inputs: d = x_max - x
-    [[intel::fpga_register]] typename CONFIG_T::inp_norm_t d_xi_xmax[CONFIG_T::n_in];
+    // For the diffs, use the same type as the input but force rounding and saturation
+    [[intel::fpga_register]]
+        typename CONFIG_T::inp_norm_t d_xi_xmax[CONFIG_T::n_in];
     #pragma unroll
     for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
-        // HGQ stable: d = x_max - data
-        d_xi_xmax[i] = x_max - data[i];
+        d_xi_xmax[i] = data[i] - x_max;
     }
 
-    // Exponentials
+    // Calculate all the e^x's
     [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in];
     #pragma unroll
     for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
-        unsigned idx = softmax_stable_idx_from_real_val<typename CONFIG_T::inp_norm_t, CONFIG_T>(d_xi_xmax[i]);
-        exp_res[i] = exp_table[idx];
+        exp_res[i] = CONFIG_T::exp_table[softmax_stable_idx_from_real_val<typename CONFIG_T::inp_norm_t, CONFIG_T::exp_table_size>(d_xi_xmax[i])]; //input_t, CONFIG_T
     }
 
-    // Sum of Exponentials
-    Op_add<typename CONFIG_T::accum_t> op_add;
-    [[intel::fpga_register]] typename CONFIG_T::accum_t exp_sum =
-        reduce<typename CONFIG_T::exp_table_t, CONFIG_T::n_in, Op_add<typename CONFIG_T::accum_t>>(exp_res, op_add);
-
-    // Reciprocal of Sum
-    typename CONFIG_T::inv_inp_t exp_sum_cast = exp_sum;
-    unsigned inv_idx = softmax_stable_idx_from_real_val<typename CONFIG_T::inv_inp_t, CONFIG_T>(exp_sum_cast);
+    // Explicitly sum previously calculated exponentials with an adder tree
+    Op_add<typename CONFIG_T::exp_table_t> op_add;
+    [[intel::fpga_register]] typename CONFIG_T::inv_inp_t exp_sum =
+        reduce<typename CONFIG_T::exp_table_t, CONFIG_T::n_in, Op_add<typename CONFIG_T::exp_table_t>>(exp_res, op_add);
 
-    [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[inv_idx];
+    // Multiply previously calculated exponentials with the reciprocal of the sum
+    [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum =
+        CONFIG_T::invert_table[softmax_stable_idx_from_real_val<typename CONFIG_T::inv_inp_t, CONFIG_T::inv_table_size>(exp_sum)];
 
-    // Final Multiplication
     #pragma unroll
     for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
         res[i] = exp_res[i] * inv_exp_sum;
     }
 }
 
+
 // TODO - Improve accuracy
 template <class data_T, class res_T, typename CONFIG_T> void softmax_latency(const data_T &data, res_T &res) {
 #include "activation_tables/exp_table_latency.tb"
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h
index e860c38988..d640f89f7e 100644
--- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation_stream.h
@@ -271,64 +271,63 @@ template <class data_pipe, class res_pipe, typename CONFIG_T> void softsign_stre
 // *************************************************
 
 template <class data_pipe, class res_pipe, typename CONFIG_T> void softmax_stable_stream() {
-#include "activation_tables/exp_table.tb"
-#include "activation_tables/invert_table.tb"
+
+    using input_arr_t = typename ExtractPipeType<data_pipe>::value_type;
+    using input_t = typename ExtractPipeType<data_pipe>::value_type::value_type;
+    constexpr unsigned input_arr_size = std::tuple_size<input_arr_t>{};
+
 
     constexpr unsigned multiplier_limit =
-        DIV_ROUNDUP(std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}, CONFIG_T::reuse_factor);
-    constexpr unsigned pipeline = std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{} / multiplier_limit;
+        DIV_ROUNDUP(input_arr_size, CONFIG_T::reuse_factor);
+    constexpr unsigned pipeline = input_arr_size / multiplier_limit;
 
-    [[intel::fpga_register]] typename ExtractPipeType<data_pipe>::value_type::value_type
-        data_array[std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}];
+
+    [[intel::fpga_register]] input_t data_array[input_arr_size];
 
 SoftmaxArrayLoop:
-    [[intel::initiation_interval(pipeline)]] for (unsigned i = 0;
-                                                  i < CONFIG_T::n_in /
-                                                          std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{};
-                                                  i++) {
+    [[intel::initiation_interval(pipeline)]] 
+    for (unsigned i = 0; i < CONFIG_T::n_in / input_arr_size; i++) {
         auto in_pack = data_pipe::read();
 
     SoftmaxArrayPackLoop:
         #pragma unroll
-        for (unsigned j = 0; j < std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}; j++) {
+        for (unsigned j = 0; j < input_arr_size; j++) {
             data_array[j] = in_pack[j];
         }
 
         // Find the max and compute all delta(x_i, x_max)
-        Op_max<typename ExtractPipeType<data_pipe>::value_type::value_type> op_max;
-        [[intel::fpga_register]] typename ExtractPipeType<data_pipe>::value_type::value_type x_max =
-            reduce<typename ExtractPipeType<data_pipe>::value_type::value_type,
-                   std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{},
-                   Op_max<typename ExtractPipeType<data_pipe>::value_type::value_type>>(data_array, op_max);
-
-        // For the diffs, use the same type as the input but force rounding and saturation
-        [[intel::fpga_register]] ac_fixed<ExtractPipeType<data_pipe>::value_type::value_type::width,
-                                          ExtractPipeType<data_pipe>::value_type::value_type::i_width, true, AC_RND, AC_SAT>
-            d_xi_xmax[std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}];
+        Op_max<input_t> op_max;
+        [[intel::fpga_register]] 
+        input_t x_max = reduce<input_t, input_arr_size, Op_max<input_t>>(data_array, op_max);
+
+        [[intel::fpga_register]]
+        typename CONFIG_T::inp_norm_t d_xi_xmax[input_arr_size];
+
         #pragma unroll
-        for (unsigned j = 0; j < std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}; j++) {
+        for (unsigned j = 0; j < input_arr_size; j++) {
             d_xi_xmax[j] = data_array[j] - x_max;
         }
 
         // Calculate all the e^x's
         [[intel::fpga_register]]
-        typename CONFIG_T::exp_table_t exp_res[std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}];
+        typename CONFIG_T::exp_table_t exp_res[input_arr_size];
+
         #pragma unroll
-        for (unsigned j = 0; j < std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}; j++) {
+        for (unsigned j = 0; j < input_arr_size; j++) {
             exp_res[j] =
-                exp_table[softmax_stable_idx_from_real_val<typename ExtractPipeType<data_pipe>::value_type::value_type,
-                                                           CONFIG_T>(d_xi_xmax[j])];
+                CONFIG_T::exp_table[softmax_stable_idx_from_real_val<typename CONFIG_T::inp_norm_t,  CONFIG_T::exp_table_size>(d_xi_xmax[j])];
         }
 
         // Explicitly sum the results with an adder tree.
         // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing
         Op_add<typename CONFIG_T::exp_table_t> op_add;
-        [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum =
-            reduce<typename CONFIG_T::exp_table_t, std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{},
+        [[intel::fpga_register]] typename CONFIG_T::inv_inp_t exp_sum =
+            reduce<typename CONFIG_T::exp_table_t, input_arr_size,
                    Op_add<typename CONFIG_T::exp_table_t>>(exp_res, op_add);
 
         [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum =
-            invert_table[softmax_stable_idx_from_real_val<typename CONFIG_T::exp_table_t, CONFIG_T>(exp_sum)];
+            CONFIG_T::invert_table[softmax_stable_idx_from_real_val<typename CONFIG_T::inv_inp_t,  CONFIG_T::inv_table_size>(exp_sum)];
+
         typename ExtractPipeType<res_pipe>::value_type out_pack;
 
     SoftmaxInvPackLoop:
diff --git a/hls4ml/templates/oneapi/firmware/parameters.h b/hls4ml/templates/oneapi/firmware/parameters.h
index 717059f1e8..ef4e5d26b9 100644
--- a/hls4ml/templates/oneapi/firmware/parameters.h
+++ b/hls4ml/templates/oneapi/firmware/parameters.h
@@ -6,6 +6,8 @@
 #include "nnet_utils/nnet_code_gen.h"
 #include "nnet_utils/nnet_helpers.h"
 
+// hls-fpga-machine-learning insert softmax tables
+
 // hls-fpga-machine-learning insert includes
 
 // hls-fpga-machine-learning insert layer-config
diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py
index b945f3faf9..7f95830c21 100644
--- a/hls4ml/writer/oneapi_writer.py
+++ b/hls4ml/writer/oneapi_writer.py
@@ -302,6 +302,14 @@ def write_parameters(self, model):
                         config = layer.get_attr('config_cpp', None)
                         if config:
                             newline += config + '\n'
+
+                elif '// hls-fpga-machine-learning insert softmax tables' in line:
+                    newline = line
+                    for layer in model.get_layers():
+                        if 'softmax' in layer.name:
+                            newline += f'#include "nnet_utils/activation_tables/{layer.name}_exp_table.h"\n'
+                            newline += f'#include "nnet_utils/activation_tables/{layer.name}_inv_table.h"\n'
+
                 else:
                     newline = line
                 fout.write(newline)
@@ -695,25 +703,29 @@ def __get_table_precision(self, model, activation, table_name='table_precision')
 
         return None  # fp_bits, fp_integer, fp_signed
 
+
     def __write_exp_table(self, model, path):
-        table_name = 'exp_table'
-        table_size = self.__get_table_size(model, 'softmax', table_name='exp_table_size')
 
-        h_file = open(f'{path}/{table_name}.tb', 'w')
-        h_file.write(self.__get_table_header(table_name, table_size, table_type='exp_table_t'))
+        for layer in model.get_layers():
+            
+            if 'softmax' in layer.name:
+                
+                table_name = layer.name + '_exp_table'
+                table_size = int(layer.get_attr('exp_table_size'))//2 if (
+                    layer.get_attr('activation') == 'softmax' or layer.get_attr('recurrent_activation') == 'softmax'
+                    ) and layer.get_attr('exp_table_size') is not None else 1024
 
-        # Default fixed point precision
-        # 6 bits for integer part, 10 bits for decimal - total, 16
-        precision = self.__get_table_precision(model, 'softmax', table_name='inp_norm_t')
+                with open(f'{path}/{table_name}.h', 'w') as h_file:
+
+                    header_name = table_name
+                    h_file.write(f'#ifndef {header_name.upper()}_H_\n')
+                    h_file.write(f'#define {header_name.upper()}_H_\n\n')
 
-        if precision is None:
-            fp_bits = 16
-            fp_integer = 6
-            fp_signed = True
+                    h_file.write(f'static constexpr {table_name}_t {table_name}[{table_size}] = {{')
 
-            for layer in model.get_layers():
-                if layer.name == 'softmax':
-                    ac_type = layer.get_input_variable().type
+                    #ac_type = layer.get_input_variable().type
+                    ac_type = layer.get_attr('inp_norm_t')
+                    
                     if ac_type is not None:
                         try:
                             fp_bits = ac_type.precision.integer + ac_type.precision.fractional
@@ -721,49 +733,55 @@ def __write_exp_table(self, model, path):
                             fp_signed = ac_type.precision.signed
                         except Exception:
                             # FixedPrecisionType wasn't correctly stored in layer attributes, use default values
-                            pass
+                            fp_bits = 16
+                            fp_integer = 6
+                            fp_signed = True
+
                         if fp_signed is False:
                             raise Exception('Softmax types need to be signed')
+                    
+                    else:
+                        fp_bits = 16
+                        fp_integer = 6
+                        fp_signed = True
+
+                    sep = ''
+                    N = ceil_log2(table_size)
+                    for i in range(table_size):
+                        f = FixedPointEmulator(fp_bits, fp_integer, signed=fp_signed)
+                        b = uint_to_binary(i, N)
+                        if i == 0:
+                            b.insert(0, 0)
+                        else:
+                            b.insert(0, 1)
+                        f.set_msb_bits(b)
+                        real_val = f.exp_float()
+                        h_file.write(sep + str(real_val))
+                        sep = ', '
+                    
+                    h_file.write('};\n\n')
+                    h_file.write('#endif')
 
-        else:
-            fp_bits = precision.width
-            fp_integer = precision.integer
-            fp_signed = precision.signed
-
-        f_bits = fp_bits - fp_integer
-        sep = ''
-        for i in range(table_size):
-            # Index represents the raw bit pattern of the input
-            real_val_in = i * (2.0 ** (-f_bits))
-
-            # Calculate exp(-x) for the stable implementation
-            real_val = np.exp(-real_val_in)
-
-            h_file.write(sep + str(real_val))
-            sep = ', '
-
-        h_file.write('};\n')
-        h_file.close()
 
     def __write_invert_table(self, model, path):
-        table_name = 'invert_table'
-        table_size = self.__get_table_size(model, 'softmax', table_name='inv_table_size')
+        for layer in model.get_layers():
+            if 'softmax' in layer.name:
 
-        h_file = open(f'{path}/{table_name}.tb', 'w')
-        h_file.write(self.__get_table_header(table_name, table_size, table_type='inv_table_t'))
-        # Default fixed point precision, in case values from layer attributes cannot be extracted
-        # 8 bits for integer part, 10 bits for decimal - total, 18
+                table_name = layer.name + '_inv_table'
+                table_size = int(layer.get_attr('inv_table_size')) //2 if (
+                    layer.get_attr('activation') == 'softmax' or layer.get_attr('recurrent_activation') == 'softmax'
+                    ) and layer.get_attr('inv_table_size') is not None else 1024
+                
+                with open(f'{path}/{table_name}.h', 'w') as h_file:
+
+                    header_name = table_name
+                    h_file.write(f'#ifndef {header_name.upper()}_H_\n')
+                    h_file.write(f'#define {header_name.upper()}_H_\n\n')
 
-        precision = self.__get_table_precision(model, 'softmax', table_name='inv_inp_t')
+                    h_file.write(f'static constexpr {table_name}_t {table_name}[{table_size}] = {{')
 
-        if precision is None:
-            fp_bits = 18
-            fp_integer = 8
-            fp_signed = True
+                    ac_type = layer.get_attr('inv_inp_t')
 
-            for layer in model.get_layers():
-                if layer.name == 'softmax':
-                    ac_type = layer.get_attr('exp_table_t')
                     if ac_type is not None:
                         try:
                             fp_bits = ac_type.precision.integer + ac_type.precision.fractional
@@ -771,31 +789,32 @@ def __write_invert_table(self, model, path):
                             fp_signed = ac_type.precision.signed
                         except Exception:
                             # FixedPrecisionType wasn't correctly stored in layer attributes, use default values
-                            pass
+                            fp_bits = 18
+                            fp_integer = 8
+                            fp_signed = True
+
                         if fp_signed is False:
                             raise Exception('Softmax types need to be signed')
 
-        else:
-            fp_bits = precision.width
-            fp_integer = precision.integer
-            fp_signed = precision.signed
-
-        f_bits = fp_bits - fp_integer
-        sep = ''
-        for i in range(table_size):
-            # Index represents the raw bit pattern of the input
-            real_val_in = i * (2.0 ** (-f_bits))
+                    else:
+                        fp_bits = 18
+                        fp_integer = 8
+                        fp_signed = True
 
-            if real_val_in == 0:
-                real_val = 999.0
-            else:
-                real_val = 1.0 / real_val_in
+                    sep = ''
+                    N = ceil_log2(table_size)
+                    for i in range(table_size):
+                        f = FixedPointEmulator(fp_bits, fp_integer, signed=fp_signed)
+                        b = uint_to_binary(i, N)
+                        b.insert(0, 0)
+                        f.set_msb_bits(b)
+                        real_val = f.inv_float()
+                        h_file.write(sep + str(real_val))
+                        sep = ', '
 
-            h_file.write(sep + str(real_val))
-            sep = ', '
+                    h_file.write('};\n\n')
+                    h_file.write('#endif')
 
-        h_file.write('};\n')
-        h_file.close()
 
     def __write_exp_table_latency(self, model, path):
         table_name = 'exp_table_latency'
@@ -1015,3 +1034,6 @@ def write_hls(self, model):
         self.write_generated_code(model)
         self.write_yml(model)
         self.write_tar(model)
+
+
+

From 31b7ad65eca392429b2da852097261acebf883be Mon Sep 17 00:00:00 2001
From: bugracyln <bugraceylan2002@hotmail.com>
Date: Tue, 14 Apr 2026 21:39:58 +0100
Subject: [PATCH 12/12] table generation cleanup

---
 hls4ml/writer/oneapi_writer.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py
index 7f95830c21..320afa74db 100644
--- a/hls4ml/writer/oneapi_writer.py
+++ b/hls4ml/writer/oneapi_writer.py
@@ -722,8 +722,7 @@ def __write_exp_table(self, model, path):
                     h_file.write(f'#define {header_name.upper()}_H_\n\n')
 
                     h_file.write(f'static constexpr {table_name}_t {table_name}[{table_size}] = {{')
-
-                    #ac_type = layer.get_input_variable().type
+                    
                     ac_type = layer.get_attr('inp_norm_t')
                     
                     if ac_type is not None: