Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
3d463b3
weights for dense
laurilaatu Jan 26, 2026
d678573
hgq2 homogeneous quant fix
calad0i Jan 27, 2026
77258bc
Merge branch 'hgq2_homo_quant' of github.com:calad0i/hls4ml into onea…
laurilaatu Jan 27, 2026
59bd96f
Changes required for oneAPI MHA
laurilaatu Feb 9, 2026
dbb207b
Original weight implementation
laurilaatu Feb 9, 2026
0c59255
Merge branch 'main' of github.com:fastmachinelearning/hls4ml into one…
laurilaatu Feb 9, 2026
51efff0
Restore oneAPI weight placement
laurilaatu Feb 9, 2026
6067bea
pre-commit
laurilaatu Feb 9, 2026
06fda4e
Merge branch 'main' into oneapi_qmha
laurilaatu Feb 10, 2026
bf38a6b
Merge branch 'main' into oneapi_qmha
laurilaatu Feb 13, 2026
e27fd11
Merge branch 'main' into oneapi_qmha
laurilaatu Feb 16, 2026
9f4a448
Merge branch 'main' into oneapi_qmha
laurilaatu Feb 20, 2026
16ca197
softmax multidim templates
laurilaatu Feb 24, 2026
564b692
Merge branch 'oneapi_qmha' of github.com:laurilaatu/hls4ml into oneap…
laurilaatu Feb 24, 2026
974e75a
pre-commit
laurilaatu Feb 24, 2026
060c398
uncomment
laurilaatu Feb 24, 2026
f78558c
Merge branch 'main' into oneapi_qmha
laurilaatu Feb 25, 2026
772b93a
int_inp_t to config
laurilaatu Feb 25, 2026
d2b8921
Merge branch 'oneapi_qmha' of github.com:laurilaatu/hls4ml into oneap…
laurilaatu Feb 25, 2026
a1ad891
Merge branch 'main' into oneapi_qmha
laurilaatu Feb 26, 2026
d65544d
Merge branch 'main' into oneapi_qmha
laurilaatu Mar 16, 2026
2d6a5cc
Merge branch 'main' into oneapi_qmha
laurilaatu Mar 30, 2026
c3a4584
softmax fixed
bugracyln Apr 13, 2026
9b1cf17
Merge branch 'main' into oneapi_qmha
laurilaatu Apr 13, 2026
31b7ad6
table generation cleanup
bugracyln Apr 14, 2026
70b19d1
Merge pull request #4 from bugracyln/smax_fix
laurilaatu Apr 15, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 0 additions & 8 deletions hls4ml/backends/oneapi/oneapi_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
Embedding,
Layer,
SimpleRNN,
Softmax,
)
from hls4ml.model.optimizer import get_backend_passes, layer_optimizer
from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType
Expand Down Expand Up @@ -257,13 +256,6 @@ def init_activation(self, layer):
if layer.get_attr('recurrent_activation') == 'tanh':
layer.set_attr('recurrent_activation', 'dense_tanh')

@layer_optimizer(Softmax)
def init_softmax(self, layer):
if layer.model.config.get_config_value('IOType') == 'io_parallel':
assert len(layer.get_input_variable().shape) == 1, (
'Softmax with io_parallel strategy cannot be used on multidimensional tensors.'
)

@layer_optimizer(Embedding)
def init_embed(self, layer):
if layer.attributes['n_in'] is None:
Expand Down
74 changes: 71 additions & 3 deletions hls4ml/backends/oneapi/passes/core_templates.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
from hls4ml.backends.backend import get_backend
from hls4ml.backends.oneapi.oneapi_template import StreamFunctionCallTemplate, TaskSequenceTemplate
from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate
from hls4ml.model.types import FixedPrecisionType, RoundingMode, SaturationMode
from hls4ml.model.layers import Activation, BatchNormalization, Dense, HardActivation, ParametrizedActivation, PReLU, Softmax
from hls4ml.utils.fixed_point_utils import FixedPointEmulator, ceil_log2, uint_to_binary
import numpy as np

# Dense templates

Expand Down Expand Up @@ -194,12 +197,26 @@ def format(self, node):

# Opening part of the softmax config struct. It is intentionally left unterminated: one of the
# table sections (softmax_config_table_template / softmax_config_table_template_stable) is
# appended to it before formatting, and that section supplies the closing brace.
# Each member typedef must appear exactly once; a repeated typedef inside a struct does not compile.
softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{
static constexpr unsigned n_in = {n_in};
static constexpr unsigned table_size = {table_size};
static constexpr unsigned exp_table_size = {exp_table_size};
static constexpr unsigned inv_table_size = {inv_table_size};
static constexpr unsigned io_type = nnet::{iotype};
static constexpr unsigned reuse_factor = {reuse};
static constexpr nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation};
typedef {exp_table_t.name} exp_table_t;
typedef {inv_table_t.name} inv_table_t;"""

# Closing section of the softmax config struct used for every implementation except 'stable':
# exposes pointers to the first element of the exp and invert tables and closes the struct.
softmax_config_table_template = """

static constexpr const exp_table_t *exp_table = &{exp_table_name}[0];
static constexpr const inv_table_t *invert_table = &{inv_table_name}[0];
}};\n"""

# Closing section used for the 'stable' implementation: same table pointers as above, plus the
# inv_inp_t and inp_norm_t typedefs that the stable softmax kernels read from the config.
softmax_config_table_template_stable = """
typedef {inv_inp_t.name} inv_inp_t;
typedef {inp_norm_t.name} inp_norm_t;

static constexpr const exp_table_t *exp_table = &{exp_table_name}[0];
static constexpr const inv_table_t *invert_table = &{inv_table_name}[0];
}};\n"""

activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});'
Expand All @@ -220,7 +237,58 @@ def __init__(self):
def format(self, node):
    """Render the activation config struct for ``node``.

    For softmax nodes this additionally derives the exp/invert table sizes and the
    precisions of the table and intermediate types, then completes the config struct
    with the table section matching the selected implementation.

    Args:
        node: Layer node being formatted. For softmax the ``exp_table_t``, ``inv_table_t``
            and ``inv_inp_t`` type attributes and the ``implementation`` attribute must be
            present. Their ``precision`` objects are updated in place, and ``inp_norm_t``
            is stored on the node when it was missing.

    Returns:
        str: The formatted config struct.
    """
    import copy

    params = self._default_config_params(node)
    params['type'] = node.get_attr('activation')

    # Extend a local copy of the template. Appending to ``self.template`` would grow the
    # shared template on every call, so any later softmax node would receive additional
    # table sections after the struct had already been closed.
    template = self.template

    if params['type'] == 'softmax':
        # NOTE(review): a configured table size is halved before use; the reason is not
        # visible in this block -- confirm against the table generation code.
        if 'exp_table_size' in params:
            params['exp_table_size'] //= 2
        else:
            params['exp_table_size'] = 1024

        params['exp_table_t'].precision.width = ceil_log2(params['exp_table_size'])
        params['exp_table_t'].precision.integer = 3
        params['exp_table_t'].precision.signed = False

        if 'inp_norm_t' not in params:
            input_t = node.get_input_variable().type.precision
            signed = input_t.signed
            # Input widths without the sign bit.
            width, iwidth = input_t.width - signed, input_t.integer - signed
            # Start from a copy of the exp table type; its fields are overridden below.
            params['inp_norm_t'] = copy.deepcopy(params['exp_table_t'])

            if 'inv_table_size' not in params:
                # Default table sizes: derive the precision from the exp table type.
                params['inp_norm_t'].precision.width = params['exp_table_t'].precision.width + 1
                params['inp_norm_t'].precision.integer = params['exp_table_t'].precision.integer + 1
                params['inp_norm_t'].precision.signed = True
                params['inp_norm_t'].name = f'{node.name}_inp_norm_t'
            else:
                # Computed outside the f-string: reusing the enclosing quote character inside
                # a replacement field is a SyntaxError on Python versions before 3.12.
                signed_str = 'true' if signed else 'false'
                params['inp_norm_t'].name = f'ac_fixed<{width},{iwidth},{signed_str},AC_RND,AC_SAT_SYM>'

            node.set_attr('inp_norm_t', params['inp_norm_t'])

        if 'inv_table_size' in params:
            params['inv_table_size'] //= 2
        else:
            params['inv_table_size'] = 1024

        params['inv_table_t'].precision.width = ceil_log2(params['inv_table_size'])
        params['inv_table_t'].precision.integer = 3
        params['inv_table_t'].precision.signed = False

        params['inv_inp_t'].precision.width = params['inv_table_t'].precision.width + 1
        params['inv_inp_t'].precision.integer = params['inv_table_t'].precision.integer + 1
        params['inv_inp_t'].precision.signed = True

        # Close the struct with the section matching the implementation.
        if params['implementation'] == 'stable':
            template += softmax_config_table_template_stable
        else:
            template += softmax_config_table_template

        params['exp_table_name'] = node.name + '_exp_table'
        params['inv_table_name'] = node.name + '_inv_table'

    return template.format(**params)


Expand Down
4 changes: 2 additions & 2 deletions hls4ml/converters/keras_v3/hgq2/multi_head_attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@


class QMultiHeadAttentionHandler(QLayerHandler):
handles = ('hgq.layers.multi_head_attention.QMultiHeadAttention',)
handles = ('hgq.layers.attn.mha.QMultiHeadAttention',)
Comment thread
jmitrevs marked this conversation as resolved.

def handle(
self,
Expand Down Expand Up @@ -127,7 +127,7 @@ def _handle(self, layer, tensor_q, tensor_O, node_index, tensor_k, tensor_v):


class QLinformerAttentionHandler(QMultiHeadAttentionHandler):
handles = ('hgq.layers.linformer_attention.QLinformerAttention',)
handles = ('hgq.layers.attn.linformer.QLinformerAttention',)

def handle(
self,
Expand Down
62 changes: 52 additions & 10 deletions hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,18 +99,21 @@ template <class data_T, class res_T, typename CONFIG_T> void sigmoid(const data_

enum class softmax_implementation { latency = 0, legacy = 1, stable = 2, argmax = 3 };

template <class data_T, typename CONFIG_T> inline unsigned softmax_stable_idx_from_real_val(const data_T x) {

template <class data_T, unsigned table_size> inline unsigned softmax_stable_idx_from_real_val(const data_T x) {
// Number of address bits for table
static constexpr int N = ceillog2<CONFIG_T::table_size>::val;
static constexpr int N = ceillog2<table_size>::val;

// Slice the top N bits of the input
[[intel::fpga_register]] ac_int<N, false> y = x.template slc<N>(x.width - N - 1);

// If x is the most negative value, the slice will be 0, so we need to set the 0-th bit to ensure correctness
if (x != 0 && y == 0)
y[0] = 1;
return y.to_uint();
}


template <class data_T, typename CONFIG_T> inline unsigned softmax_latency_idx_from_real_val(const data_T x) {
// Number of address bits for table
static constexpr int N = ceillog2<CONFIG_T::table_size>::val;
Expand All @@ -120,19 +123,17 @@ template <class data_T, typename CONFIG_T> inline unsigned softmax_latency_idx_f
return y.to_uint();
}


template <class data_T, class res_T, typename CONFIG_T> void softmax_stable(const data_T &data, res_T &res) {
// Look-up tables
#include "activation_tables/exp_table.tb"
#include "activation_tables/invert_table.tb"

// Find maximum
Op_max<typename data_T::value_type> op_max;
[[intel::fpga_register]] auto x_max =
reduce<typename data_T::value_type, CONFIG_T::n_in, Op_max<typename data_T::value_type>>(data.data(), op_max);

// For the diffs, use the same type as the input but force rounding and saturation
[[intel::fpga_register]] ac_fixed<data_T::value_type::width, data_T::value_type::i_width, true, AC_RND, AC_SAT>
d_xi_xmax[CONFIG_T::n_in];
[[intel::fpga_register]]
typename CONFIG_T::inp_norm_t d_xi_xmax[CONFIG_T::n_in];
#pragma unroll
for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
d_xi_xmax[i] = data[i] - x_max;
Expand All @@ -142,23 +143,25 @@ template <class data_T, class res_T, typename CONFIG_T> void softmax_stable(cons
[[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in];
#pragma unroll
for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
exp_res[i] = exp_table[softmax_stable_idx_from_real_val<typename data_T::value_type, CONFIG_T>(d_xi_xmax[i])];
exp_res[i] = CONFIG_T::exp_table[softmax_stable_idx_from_real_val<typename CONFIG_T::inp_norm_t, CONFIG_T::exp_table_size>(d_xi_xmax[i])]; //input_t, CONFIG_T
}

// Explicitly sum previously calculated exponentials with an adder tree
Op_add<typename CONFIG_T::exp_table_t> op_add;
[[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum =
[[intel::fpga_register]] typename CONFIG_T::inv_inp_t exp_sum =
reduce<typename CONFIG_T::exp_table_t, CONFIG_T::n_in, Op_add<typename CONFIG_T::exp_table_t>>(exp_res, op_add);

// Multiply previously calculated exponentials with the reciprocal of the sum
[[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum =
invert_table[softmax_stable_idx_from_real_val<typename CONFIG_T::exp_table_t, CONFIG_T>(exp_sum)];
CONFIG_T::invert_table[softmax_stable_idx_from_real_val<typename CONFIG_T::inv_inp_t, CONFIG_T::inv_table_size>(exp_sum)];

#pragma unroll
for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
res[i] = exp_res[i] * inv_exp_sum;
}
}


// TODO - Improve accuracy
template <class data_T, class res_T, typename CONFIG_T> void softmax_latency(const data_T &data, res_T &res) {
#include "activation_tables/exp_table_latency.tb"
Expand Down Expand Up @@ -265,6 +268,45 @@ template <class data_T, class res_T, typename CONFIG_T> inline void softmax(cons
}
}

// *************************************************
// Multidimensional Softmax
// *************************************************

// Config adapter for running the core softmax on a single slice: inherits every member of
// CONFIG_T and shadows n_in with CONFIG_T::n_slice.
template <typename CONFIG_T> struct softmax_multidim_slice_config : public CONFIG_T {
    static constexpr unsigned n_in = CONFIG_T::n_slice;
};

template <class data_T, class res_T, typename CONFIG_T> inline void softmax_multidim(const data_T &data, res_T &res) {
using buffer_data_t = std::array<typename data_T::value_type, CONFIG_T::n_slice>;
using buffer_res_t = std::array<typename res_T::value_type, CONFIG_T::n_slice>;
using slice_config = softmax_multidim_slice_config<CONFIG_T>;

#pragma unroll
for (unsigned i = 0; i < CONFIG_T::n_outer; i++) {
#pragma unroll
for (unsigned k = 0; k < CONFIG_T::n_inner; k++) {

[[intel::fpga_register]] buffer_data_t buffer_in;
[[intel::fpga_register]] buffer_res_t buffer_out;

// Gather Phase
#pragma unroll
for (unsigned j = 0; j < CONFIG_T::n_slice; j++) {
unsigned idx = (i * CONFIG_T::n_slice * CONFIG_T::n_inner) + (j * CONFIG_T::n_inner) + k;
buffer_in[j] = data[idx];
}

nnet::softmax<buffer_data_t, buffer_res_t, slice_config>(buffer_in, buffer_out);

#pragma unroll
for (unsigned j = 0; j < CONFIG_T::n_slice; j++) {
unsigned idx = (i * CONFIG_T::n_slice * CONFIG_T::n_inner) + (j * CONFIG_T::n_inner) + k;
res[idx] = buffer_out[j];
}
}
}
}
// *************************************************
// TanH Activation
// *************************************************
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -271,64 +271,63 @@ template <class data_pipe, class res_pipe, typename CONFIG_T> void softsign_stre
// *************************************************

template <class data_pipe, class res_pipe, typename CONFIG_T> void softmax_stable_stream() {
#include "activation_tables/exp_table.tb"
#include "activation_tables/invert_table.tb"

using input_arr_t = typename ExtractPipeType<data_pipe>::value_type;
using input_t = typename ExtractPipeType<data_pipe>::value_type::value_type;
constexpr unsigned input_arr_size = std::tuple_size<input_arr_t>{};


constexpr unsigned multiplier_limit =
DIV_ROUNDUP(std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}, CONFIG_T::reuse_factor);
constexpr unsigned pipeline = std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{} / multiplier_limit;
DIV_ROUNDUP(input_arr_size, CONFIG_T::reuse_factor);
constexpr unsigned pipeline = input_arr_size / multiplier_limit;

[[intel::fpga_register]] typename ExtractPipeType<data_pipe>::value_type::value_type
data_array[std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}];

[[intel::fpga_register]] input_t data_array[input_arr_size];

SoftmaxArrayLoop:
[[intel::initiation_interval(pipeline)]] for (unsigned i = 0;
i < CONFIG_T::n_in /
std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{};
i++) {
[[intel::initiation_interval(pipeline)]]
for (unsigned i = 0; i < CONFIG_T::n_in / input_arr_size; i++) {
auto in_pack = data_pipe::read();

SoftmaxArrayPackLoop:
#pragma unroll
for (unsigned j = 0; j < std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}; j++) {
for (unsigned j = 0; j < input_arr_size; j++) {
data_array[j] = in_pack[j];
}

// Find the max and compute all delta(x_i, x_max)
Op_max<typename ExtractPipeType<data_pipe>::value_type::value_type> op_max;
[[intel::fpga_register]] typename ExtractPipeType<data_pipe>::value_type::value_type x_max =
reduce<typename ExtractPipeType<data_pipe>::value_type::value_type,
std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{},
Op_max<typename ExtractPipeType<data_pipe>::value_type::value_type>>(data_array, op_max);

// For the diffs, use the same type as the input but force rounding and saturation
[[intel::fpga_register]] ac_fixed<ExtractPipeType<data_pipe>::value_type::value_type::width,
ExtractPipeType<data_pipe>::value_type::value_type::i_width, true, AC_RND, AC_SAT>
d_xi_xmax[std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}];
Op_max<input_t> op_max;
[[intel::fpga_register]]
input_t x_max = reduce<input_t, input_arr_size, Op_max<input_t>>(data_array, op_max);

[[intel::fpga_register]]
typename CONFIG_T::inp_norm_t d_xi_xmax[input_arr_size];

#pragma unroll
for (unsigned j = 0; j < std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}; j++) {
for (unsigned j = 0; j < input_arr_size; j++) {
d_xi_xmax[j] = data_array[j] - x_max;
}

// Calculate all the e^x's
[[intel::fpga_register]]
typename CONFIG_T::exp_table_t exp_res[std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}];
typename CONFIG_T::exp_table_t exp_res[input_arr_size];

#pragma unroll
for (unsigned j = 0; j < std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{}; j++) {
for (unsigned j = 0; j < input_arr_size; j++) {
exp_res[j] =
exp_table[softmax_stable_idx_from_real_val<typename ExtractPipeType<data_pipe>::value_type::value_type,
CONFIG_T>(d_xi_xmax[j])];
CONFIG_T::exp_table[softmax_stable_idx_from_real_val<typename CONFIG_T::inp_norm_t, CONFIG_T::exp_table_size>(d_xi_xmax[j])];
}

// Explicitly sum the results with an adder tree.
// Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing
Op_add<typename CONFIG_T::exp_table_t> op_add;
[[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum =
reduce<typename CONFIG_T::exp_table_t, std::tuple_size<typename ExtractPipeType<data_pipe>::value_type>{},
[[intel::fpga_register]] typename CONFIG_T::inv_inp_t exp_sum =
reduce<typename CONFIG_T::exp_table_t, input_arr_size,
Op_add<typename CONFIG_T::exp_table_t>>(exp_res, op_add);

[[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum =
invert_table[softmax_stable_idx_from_real_val<typename CONFIG_T::exp_table_t, CONFIG_T>(exp_sum)];
CONFIG_T::invert_table[softmax_stable_idx_from_real_val<typename CONFIG_T::inv_inp_t, CONFIG_T::inv_table_size>(exp_sum)];

typename ExtractPipeType<res_pipe>::value_type out_pack;

SoftmaxInvPackLoop:
Expand Down
2 changes: 2 additions & 0 deletions hls4ml/templates/oneapi/firmware/parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
#include "nnet_utils/nnet_code_gen.h"
#include "nnet_utils/nnet_helpers.h"

// hls-fpga-machine-learning insert softmax tables

// hls-fpga-machine-learning insert includes

// hls-fpga-machine-learning insert layer-config
Expand Down
Loading
Loading