From 94d6331e4736f78615db1e9e77ea8b9db7595d83 Mon Sep 17 00:00:00 2001 From: Ho Fung Tsoi Date: Sat, 18 Apr 2026 17:31:08 -0400 Subject: [PATCH 1/3] sparsepixels patch --- hls4ml/backends/vivado/passes/sparsepixels.py | 400 ++++++++++++++++++ hls4ml/backends/vivado/vivado_backend.py | 2 + hls4ml/converters/keras_v3/__init__.py | 1 + hls4ml/converters/keras_v3/sparsepixels.py | 250 +++++++++++ hls4ml/converters/keras_v3_to_hls.py | 8 + hls4ml/model/layers.py | 92 ++++ hls4ml/model/optimizer/passes/bit_exact.py | 107 +++++ .../vivado/nnet_utils/nnet_sparsepixels.h | 254 +++++++++++ pyproject.toml | 1 + test/pytest/test_sparsepixels.py | 79 ++++ 10 files changed, 1194 insertions(+) create mode 100644 hls4ml/backends/vivado/passes/sparsepixels.py create mode 100644 hls4ml/converters/keras_v3/sparsepixels.py create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_sparsepixels.h create mode 100644 test/pytest/test_sparsepixels.py diff --git a/hls4ml/backends/vivado/passes/sparsepixels.py b/hls4ml/backends/vivado/passes/sparsepixels.py new file mode 100644 index 0000000000..0dd3329c10 --- /dev/null +++ b/hls4ml/backends/vivado/passes/sparsepixels.py @@ -0,0 +1,400 @@ +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import ( + Input, + Reshape, + SparseActivation, + SparseConv2D, + SparseFlatten, + SparseInputReduce, + SparsePooling2D, +) +from hls4ml.model.optimizer import OptimizerPass +from hls4ml.model.optimizer.passes.hgq_proxy_model import FixedPointQuantizer + +sparsepixels_include = ['nnet_utils/nnet_sparsepixels.h'] + +# Optimizer pass: trace hash vars & Flatten->SparseFlatten + + +class SparseGraphOptimizer(OptimizerPass): + """Triggered by SparseInputReduce. Walks the full graph to wire hash variable names, + track spatial dims, and replace Flatten->SparseFlatten.""" + + def match(self, node): + return isinstance(node, SparseInputReduce) and node.get_attr('hash_out_name', None) is None + + def transform(self, model, node): + hash_map = {} + spatial = {} + changed = False + + for name, n in list(model.graph.items()): + if isinstance(n, SparseInputReduce): + h_var = f'sparse_hash_{name}' + n.set_attr('hash_out_name', h_var) + hash_map[name] = h_var + spatial[name] = (n.get_attr('in_height'), n.get_attr('in_width')) + + elif isinstance(n, SparseConv2D): + src = n.inputs[1] if len(n.inputs) > 1 else n.inputs[0] + h_var = hash_map.get(src, hash_map.get(n.inputs[0])) + n.set_attr('hash_in_name', h_var) + n.set_attr('hash_out_name', h_var) + hash_map[name] = h_var + spatial[name] = spatial.get(src, spatial.get(n.inputs[0])) + + elif isinstance(n, FixedPointQuantizer): + src = n.inputs[0] + if src in hash_map: + hash_map[name] = hash_map[src] + spatial[name] = spatial.get(src) + + elif isinstance(n, SparseActivation): + src = n.inputs[0] + h_var = hash_map.get(src) + hash_map[name] = h_var + spatial[name] = spatial.get(src) + + elif isinstance(n, SparsePooling2D): + src = n.inputs[1] if len(n.inputs) > 1 else n.inputs[0] + h_in = hash_map.get(src, hash_map.get(n.inputs[0])) + h_out = f'sparse_hash_{name}' + n.set_attr('hash_in_name', h_in) + n.set_attr('hash_out_name', h_out) + hash_map[name] = h_out + ps = n.get_attr('pool_size') + prev_h, prev_w = spatial.get(src, spatial.get(n.inputs[0], (0, 0))) + spatial[name] = (prev_h // ps, prev_w // ps) + + elif isinstance(n, SparseFlatten): + src = n.inputs[0] + h_var = hash_map.get(src) + if h_var is not None: + n.set_attr('hash_in_name', h_var) + hash_map[name] = h_var + spatial[name] = spatial.get(src, (1, 1)) + + elif isinstance(n, Reshape): + src = n.inputs[0] + if src in hash_map: + src_node = model.graph[src] + n_sparse = src_node.get_attr('n_sparse', None) + if n_sparse is None: + continue + n_chan = src_node.get_attr('n_chan', None) or src_node.get_attr('n_filt', None) + h_var = hash_map[src] + sp = spatial.get(src, (1, 1)) + + attrs = { + 'n_sparse': n_sparse, + 'n_chan': n_chan, + 'out_height': sp[0], + 'out_width': sp[1], + 'hash_in_name': h_var, + } + new_node = model.make_node('SparseFlatten', name, attrs, n.inputs.copy(), outputs=n.outputs.copy()) + model.replace_node(n, new_node) + changed = True + + return changed + + +# Config templates (struct definitions) + +sparse_input_reduce_config = """struct config{index} {{ + static const unsigned in_height = {in_height}; + static const unsigned in_width = {in_width}; + static const unsigned n_chan = {n_chan}; + static const unsigned n_sparse = {n_sparse}; + static const unsigned hash_bits = {hash_bits}; +}};\n""" + +sparse_conv2d_config = """struct config{index} {{ + static const unsigned n_sparse = {n_sparse}; + static const unsigned n_chan = {n_chan}; + static const unsigned n_filt = {n_filt}; + static const unsigned kernel_size = {kernel_size}; + typedef {accum_t.name} accum_t; +}};\n""" + +sparse_activation_config = """struct config{index} {{ + static const unsigned n_sparse = {n_sparse}; + static const unsigned n_chan = {n_chan}; +}};\n""" + +sparse_pooling2d_config = """struct config{index} {{ + static const unsigned n_sparse = {n_sparse}; + static const unsigned n_chan = {n_chan}; + static const unsigned pool_size = {pool_size}; + typedef {accum_t.name} accum_t; +}};\n""" + +sparse_flatten_config = """struct config{index} {{ + static const unsigned n_sparse = {n_sparse}; + static const unsigned n_chan = {n_chan}; + static const unsigned out_height = {out_height}; + static const unsigned out_width = {out_width}; +}};\n""" + + +class SparseInputReduceConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(SparseInputReduce) + self.template = sparse_input_reduce_config + + def format(self, node): + return self.template.format(**self._default_config_params(node)) + + +class SparseConv2DConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(SparseConv2D) + self.template = sparse_conv2d_config + + def format(self, node): + return self.template.format(**self._default_config_params(node)) + + +class SparseActivationConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(SparseActivation) + self.template = sparse_activation_config + + def format(self, node): + return self.template.format(**self._default_config_params(node)) + + +class SparsePooling2DConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(SparsePooling2D) + self.template = sparse_pooling2d_config + + def format(self, node): + return self.template.format(**self._default_config_params(node)) + + +class SparseFlattenConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(SparseFlatten) + self.template = sparse_flatten_config + + def format(self, node): + return self.template.format(**self._default_config_params(node)) + + +# Function-call templates + +sparse_input_reduce_function = ( + '{input_t} threshold_{index} = {threshold};\n' + 'ap_uint<{hash_bits}> {hash_out}[{n_sparse} * 2];\n' + '#pragma HLS ARRAY_PARTITION variable={hash_out} complete dim=0\n' + 'sparse_input_reduce<{input_t}, {output_t}, ap_uint<{hash_bits}>, {in_height}, {in_width}, {n_chan}, {n_sparse}>' + '({input}, threshold_{index}, {output}, {hash_out});' +) + +sparse_conv2d_function = ( + 'sparse_conv<{input_t}, {output_t}, ap_uint<{hash_bits}>, {weight_t}, {bias_t}, {accum_t_name}, ' + '{n_sparse}, {n_chan}, {n_filt}, {kernel_size}>' + '({input}, {output}, {hash_in}, {w}, {b});' +) + +sparse_activation_function = 'sparse_relu<{input_t}, {output_t}, {n_sparse}, {n_chan}>({input}, {output});' + +sparse_pooling2d_function = ( + 'ap_uint<{hash_bits}> {hash_out}[{n_sparse} * 2];\n' + '#pragma HLS ARRAY_PARTITION variable={hash_out} complete dim=0\n' + 'sparse_pooling_avg<{input_t}, {output_t}, ap_uint<{hash_bits}>, {accum_t_name}, {n_sparse}, {n_chan}, {pool_size}>' + '({input}, {output}, {hash_in}, {hash_out});' +) + +sparse_flatten_function = ( + 'sparse_flatten<{input_t}, {output_t}, ap_uint<{hash_bits}>, {out_height}, {out_width}, {n_chan}, {n_sparse}>' + '({input}, {hash_in}, {output});' +) + + +def _get_hash_bits(node): + inp = node + while inp is not None: + hb = inp.get_attr('hash_bits', None) + if hb is not None: + return hb + if len(inp.inputs) > 0: + inp = inp.model.graph.get(inp.inputs[0]) + else: + break + return 10 + + +class SparseInputReduceFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(SparseInputReduce, include_header=sparsepixels_include) + self.template = sparse_input_reduce_function + + def format(self, node): + params = self._default_function_params(node) + params['in_height'] = node.get_attr('in_height') + params['in_width'] = node.get_attr('in_width') + params['n_chan'] = node.get_attr('n_chan') + params['n_sparse'] = node.get_attr('n_sparse') + params['hash_bits'] = node.get_attr('hash_bits') + params['threshold'] = node.get_attr('threshold') + params['hash_out'] = node.get_attr('hash_out_name') + return self.template.format(**params) + + +class SparseConv2DFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(SparseConv2D, include_header=sparsepixels_include) + self.template = sparse_conv2d_function + + def format(self, node): + params = self._default_function_params(node) + params['n_sparse'] = node.get_attr('n_sparse') + params['n_chan'] = node.get_attr('n_chan') + params['n_filt'] = node.get_attr('n_filt') + params['kernel_size'] = node.get_attr('kernel_size') + params['hash_bits'] = _get_hash_bits(node) + params['hash_in'] = node.get_attr('hash_in_name') + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + params['weight_t'] = node.get_weights('weight').type.name + params['bias_t'] = node.get_weights('bias').type.name + params['accum_t_name'] = node.get_attr('accum_t').name + return self.template.format(**params) + + +class SparseActivationFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(SparseActivation, include_header=sparsepixels_include) + self.template = sparse_activation_function + + def format(self, node): + params = self._default_function_params(node) + params['n_sparse'] = node.get_attr('n_sparse') + params['n_chan'] = node.get_attr('n_chan') + return self.template.format(**params) + + +class SparsePooling2DFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(SparsePooling2D, include_header=sparsepixels_include) + self.template = sparse_pooling2d_function + + def format(self, node): + params = self._default_function_params(node) + params['n_sparse'] = node.get_attr('n_sparse') + params['n_chan'] = node.get_attr('n_chan') + params['pool_size'] = node.get_attr('pool_size') + params['hash_bits'] = _get_hash_bits(node) + params['hash_in'] = node.get_attr('hash_in_name') + params['hash_out'] = node.get_attr('hash_out_name') + params['accum_t_name'] = node.get_attr('accum_t').name + return self.template.format(**params) + + +class SparseFlattenFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(SparseFlatten, include_header=sparsepixels_include) + self.template = sparse_flatten_function + + def format(self, node): + params = self._default_function_params(node) + params['n_sparse'] = node.get_attr('n_sparse') + params['n_chan'] = node.get_attr('n_chan') + params['out_height'] = node.get_attr('out_height') + params['out_width'] = node.get_attr('out_width') + params['hash_bits'] = _get_hash_bits(node) + params['hash_in'] = node.get_attr('hash_in_name') + return self.template.format(**params) + + +# Optimizer pass: fix Input precision for sparse models + + +class SparseFixInputPrecision(OptimizerPass): + """Fix Input precision for sparse models. + + The standard FixInputPrecision cannot find FixedPointQuantizer nodes through + sparse layers (Input -> SparseInputReduce -> FPQ), so it falls back to a + minimal type. This pass corrects the Input precision using the downstream + FPQ's mask, then re-registers SparseInputReduce with the corrected type. + """ + + def match(self, node): + if not isinstance(node, Input): + return False + model = node.model + for layer in model.graph.values(): + if isinstance(layer, SparseInputReduce) and node.name in layer.inputs: + return True + return False + + def transform(self, model, node): + from hls4ml.model.optimizer.passes.bit_exact import ( + produce_kif, + register_precision, + to_hls4ml_fixed, + ) + + sparse_reduce = None + for layer in model.graph.values(): + if isinstance(layer, SparseInputReduce) and node.name in layer.inputs: + sparse_reduce = layer + break + if sparse_reduce is None: + return False + + fpq = None + for layer in model.graph.values(): + if isinstance(layer, FixedPointQuantizer) and sparse_reduce.name in layer.inputs: + fpq = layer + break + if fpq is None: + return False + + # Read FPQ's output type, which was correctly set by BitExact's + # register_precision using per-element max(k), max(i), max(f). + # We do NOT call _produce_kif(fpq) here because that would re-clip + # against the currently-wrong Input precision (set to ap_ufixed<1,0> + # by the standard FixInputPrecision which can't recurse through sparse layers). + fpq_prec = fpq.get_output_variable().type.precision + k = 1 if fpq_prec.signed else 0 + i = fpq_prec.integer - k + f = fpq_prec.width - fpq_prec.integer + + new_type = to_hls4ml_fixed(k, i, f + 1, f'{node.name}_t') + if hasattr(fpq, 'SAT') and fpq.SAT in ('SAT', 'SAT_SYM'): + new_type.precision.saturation_mode = 'SAT' + else: + new_type.precision.saturation_mode = 'WRAP' + node.get_output_variable().type = new_type + node.model.config.layer_name_precision[node.name] = str(new_type) + node.attributes['trusted'] = True + + produce_kif(sparse_reduce, force_reset=True) + register_precision(sparse_reduce) + for attr in ('_produce_kif', '_request_kif'): + if attr in sparse_reduce.attributes: + del sparse_reduce.attributes[attr] + + return False + + +# Backend registration hook + + +def register_sparsepixels(backend): + backend.register_pass('sparse_graph_optimizer', SparseGraphOptimizer) + backend.register_pass('sparse_fix_input_precision', SparseFixInputPrecision) + + backend.register_pass('sparseinputreduce_config_template', SparseInputReduceConfigTemplate) + backend.register_pass('sparseinputreduce_function_template', SparseInputReduceFunctionTemplate) + backend.register_pass('sparseconv2d_config_template', SparseConv2DConfigTemplate) + backend.register_pass('sparseconv2d_function_template', SparseConv2DFunctionTemplate) + backend.register_pass('sparseactivation_config_template', SparseActivationConfigTemplate) + backend.register_pass('sparseactivation_function_template', SparseActivationFunctionTemplate) + backend.register_pass('sparsepooling2d_config_template', SparsePooling2DConfigTemplate) + backend.register_pass('sparsepooling2d_function_template', SparsePooling2DFunctionTemplate) + backend.register_pass('sparseflatten_config_template', SparseFlattenConfigTemplate) + backend.register_pass('sparseflatten_function_template', SparseFlattenFunctionTemplate) diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 879784465a..5014f6836f 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -163,6 +163,8 @@ def _register_flows(self): quantization_flow = register_flow('quantization', quantization_passes, requires=[init_flow], backend=self.name) optimization_passes = [ + 'vivado:sparse_graph_optimizer', + 'vivado:sparse_fix_input_precision', 'vivado:remove_final_reshape', 'vivado:optimize_pointwise_conv', 'vivado:inplace_parallel_reshape', diff --git a/hls4ml/converters/keras_v3/__init__.py b/hls4ml/converters/keras_v3/__init__.py index 21950aea6c..7208d06efd 100644 --- a/hls4ml/converters/keras_v3/__init__.py +++ b/hls4ml/converters/keras_v3/__init__.py @@ -6,6 +6,7 @@ merge, # noqa: F401 pooling, # noqa: F401 recurrent, # noqa: F401 + sparsepixels, # noqa: F401 ) from ._base import registry as layer_handlers diff --git a/hls4ml/converters/keras_v3/sparsepixels.py b/hls4ml/converters/keras_v3/sparsepixels.py new file mode 100644 index 0000000000..066e6a070e --- /dev/null +++ b/hls4ml/converters/keras_v3/sparsepixels.py @@ -0,0 +1,250 @@ +import math +import typing +from collections.abc import Sequence +from typing import Any + +import numpy as np + +from ._base import KerasV3LayerHandler + +if typing.TYPE_CHECKING: + import keras + from keras import KerasTensor + +_sparse_context: dict[str, Any] = {} + + +def _mark_sparse_output(tensor_name: str, n_sparse: int, n_chan: int, height: int, width: int): + """Record a tensor as coming from a sparse layer so Flatten can be converted.""" + sparse_outputs = _sparse_context.setdefault('sparse_output_tensors', {}) + sparse_outputs[tensor_name] = { + 'n_sparse': n_sparse, + 'n_chan': n_chan, + 'out_height': height, + 'out_width': width, + } + + +def _extract_sparse_iq_config(conv_layer, in_tensor_name: str, n_sparse: int, n_chan: int) -> dict[str, Any]: + """Extract input quantizer config from QConv2D, adapted for sparse tensor shape.""" + from keras import ops + + internal_q = conv_layer._iq.quantizer + kif_k, kif_i, kif_f = internal_q.kif + kif_k = np.ravel(ops.convert_to_numpy(kif_k)).astype(np.int16) + kif_i = np.ravel(ops.convert_to_numpy(kif_i)).astype(np.int16) + kif_f = np.ravel(ops.convert_to_numpy(kif_f)).astype(np.int16) + + # HGQ quantizers may be per-element (H*W*C); reduce to per-channel + # Take max of each component independently to get the envelope type + if kif_k.size > n_chan: + kif_k = np.max(kif_k.reshape(-1, n_chan), axis=0) + kif_i = np.max(kif_i.reshape(-1, n_chan), axis=0) + kif_f = np.max(kif_f.reshape(-1, n_chan), axis=0) + + # Reconstruct KBI from KIF: B = k + i + f, I_bits = k + i + k = kif_k + B = kif_k + kif_i + kif_f + I_bits = kif_k + kif_i + + if k.size > 1: + k = np.tile(k, n_sparse).reshape(1, -1) + B = np.tile(B, n_sparse).reshape(1, -1) + I_bits = np.tile(I_bits, n_sparse).reshape(1, -1) + + overflow_mode: str = internal_q.overflow_mode + round_mode: str = internal_q.round_mode + if round_mode.startswith('S_'): + round_mode = round_mode[2:] + + return { + 'name': conv_layer._iq.name, + 'class_name': 'FixedPointQuantizer', + 'mask_kbi': (k, B, I_bits), + 'SAT': overflow_mode, + 'RND': round_mode, + 'fusible': None, + 'input_keras_tensor_names': [in_tensor_name], + 'output_keras_tensor_names': [f'{in_tensor_name}_q'], + 'overrides': {}, + } + + +def post_process_sparse_layer_list(layer_list: list[dict[str, Any]]) -> None: + """Convert Reshape (from Flatten) nodes that follow sparse layers into SparseFlatten. + Called from keras_v3_to_hls after parsing.""" + sparse_outputs = _sparse_context.get('sparse_output_tensors', {}) + if not sparse_outputs: + return + + for conf in layer_list: + if conf.get('class_name') != 'Reshape': + continue + in_tensors = conf.get('input_keras_tensor_names', []) + if not in_tensors: + continue + src_tensor = in_tensors[0] + if src_tensor not in sparse_outputs: + continue + info = sparse_outputs[src_tensor] + conf['class_name'] = 'SparseFlatten' + conf['n_sparse'] = info['n_sparse'] + conf['n_chan'] = info['n_chan'] + conf['out_height'] = info['out_height'] + conf['out_width'] = info['out_width'] + conf.pop('target_shape', None) + + +class InputReduceHandler(KerasV3LayerHandler): + handles = ('sparsepixels.layers.InputReduce',) + + def handle( + self, + layer: 'keras.Layer', + in_tensors: Sequence['KerasTensor'], + out_tensors: Sequence['KerasTensor'], + ): + in_shape: tuple[int, ...] = in_tensors[0].shape[1:] # type: ignore + in_height, in_width, n_chan = in_shape + + n_sparse = layer.n_max_pixels + threshold = float(layer.threshold) if layer.threshold is not None else 0.0 + + # Clear any stale state from a previous conversion in the same Python process + _sparse_context.clear() + _sparse_context['n_sparse'] = n_sparse + _sparse_context['spatial'] = (int(in_height), int(in_width)) + + for t in out_tensors: + _mark_sparse_output(t.name, n_sparse, int(n_chan), int(in_height), int(in_width)) + + # Hash stores 1-based H and W coordinates separately (see nnet_sparsepixels.h::sparse_input_reduce). + # Spatial dims only shrink through the network (pooling), so input H/W bound the required bits. + max_dim = max(int(in_height), int(in_width)) + hash_bits = max(1, math.ceil(math.log2(max_dim + 1))) + + return { + 'class_name': 'SparseInputReduce', + 'in_height': int(in_height), + 'in_width': int(in_width), + 'n_chan': int(n_chan), + 'n_sparse': n_sparse, + 'threshold': threshold, + 'hash_bits': hash_bits, + } + + +class QConv2DSparseHandler(KerasV3LayerHandler): + handles = ('sparsepixels.layers.QConv2DSparse',) + + def handle( + self, + layer: 'keras.Layer', + in_tensors: Sequence['KerasTensor'], + out_tensors: Sequence['KerasTensor'], + ): + import keras + from keras import ops + + conv = layer.conv + n_chan = int(conv.kernel.shape[2]) + n_filt = int(conv.filters) + kernel_size = int(conv.kernel_size[0]) + n_sparse = _sparse_context.get('n_sparse', 0) + + if hasattr(conv, 'qkernel'): + weight_data = ops.convert_to_numpy(conv.qkernel) + else: + weight_data = ops.convert_to_numpy(conv.kernel) + + bias_data = None + if layer._use_bias and hasattr(layer, 'sparse_bias'): + if hasattr(layer, '_bq'): + bias_data = ops.convert_to_numpy(layer._bq(layer.sparse_bias)) + else: + bias_data = ops.convert_to_numpy(layer.sparse_bias) + + name = layer.name + in_tensor_names = [t.name for t in in_tensors] + out_tensor_names = [t.name for t in out_tensors] + + iq_conf = None + has_iq = hasattr(conv, '_iq') and hasattr(conv, '_enable_iq') and conv._enable_iq + if has_iq: + iq_conf = _extract_sparse_iq_config(conv, in_tensors[0].name, n_sparse, n_chan) + in_tensor_names = [f'{in_tensors[0].name}_q'] + + config: dict[str, Any] = { + 'class_name': 'SparseConv2D', + 'name': name, + 'n_sparse': n_sparse, + 'n_chan': n_chan, + 'n_filt': n_filt, + 'kernel_size': kernel_size, + 'weight_data': weight_data, + 'bias_data': bias_data, + 'input_keras_tensor_names': in_tensor_names, + 'output_keras_tensor_names': out_tensor_names, + } + + activation = layer._activation + spatial = _sparse_context.get('spatial', (1, 1)) + results: list[dict[str, Any]] = [] + if iq_conf is not None: + results.append(iq_conf) + + if activation not in (None, keras.activations.linear): + act_name = activation.__name__ + intermediate = f'{out_tensors[0].name}_sparse_act' + + config['output_keras_tensor_names'] = [intermediate] + + act_config: dict[str, Any] = { + 'class_name': 'SparseActivation', + 'name': f'{name}_{act_name}', + 'activation': act_name, + 'n_sparse': n_sparse, + 'n_chan': n_filt, + 'input_keras_tensor_names': [intermediate], + 'output_keras_tensor_names': out_tensor_names, + } + for t_name in out_tensor_names: + _mark_sparse_output(t_name, n_sparse, n_filt, spatial[0], spatial[1]) + results.extend([config, act_config]) + return tuple(results) + + for t_name in out_tensor_names: + _mark_sparse_output(t_name, n_sparse, n_filt, spatial[0], spatial[1]) + results.append(config) + return tuple(results) + + +class AveragePooling2DSparseHandler(KerasV3LayerHandler): + handles = ('sparsepixels.layers.AveragePooling2DSparse',) + + def handle( + self, + layer: 'keras.Layer', + in_tensors: Sequence['KerasTensor'], + out_tensors: Sequence['KerasTensor'], + ): + pool_size = int(layer.avg_pool.pool_size[0]) + + feat_shape: tuple[int, ...] = in_tensors[0].shape[1:] # type: ignore + n_chan = int(feat_shape[-1]) + n_sparse = _sparse_context.get('n_sparse', 0) + + prev_h, prev_w = _sparse_context.get('spatial', (1, 1)) + new_h, new_w = prev_h // pool_size, prev_w // pool_size + _sparse_context['spatial'] = (new_h, new_w) + + out_tensor_names = [t.name for t in out_tensors] + for t_name in out_tensor_names: + _mark_sparse_output(t_name, n_sparse, n_chan, new_h, new_w) + + return { + 'class_name': 'SparsePooling2D', + 'n_sparse': n_sparse, + 'n_chan': n_chan, + 'pool_size': pool_size, + } diff --git a/hls4ml/converters/keras_v3_to_hls.py b/hls4ml/converters/keras_v3_to_hls.py index 359bc391d6..697a6dfdce 100644 --- a/hls4ml/converters/keras_v3_to_hls.py +++ b/hls4ml/converters/keras_v3_to_hls.py @@ -352,6 +352,14 @@ def parse_keras_v3_model(model: 'keras.Model', allow_da_fallback=True, allow_v2_ # If no layer was added in the loop, then there is a circular dependency raise ValueError('Circular dependency detected') + # Post-process: convert Flatten following sparse layers to SparseFlatten + try: + from hls4ml.converters.keras_v3.sparsepixels import post_process_sparse_layer_list + + post_process_sparse_layer_list(layer_list) + except ImportError: + pass + # Mark inputs[inp layer name] for ModelGraph to parse from i/o keras tensor names provides: dict[str, str] = {} # tensor_name -> src_layer_name for conf in layer_list: diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index 8bd8cd8a11..23b58beae8 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -1782,6 +1782,92 @@ def initialize(self): self.add_output_variable(shape) +class SparseInputReduce(Layer): + _expected_attributes = [ + Attribute('in_height'), + Attribute('in_width'), + Attribute('n_chan'), + Attribute('n_sparse'), + Attribute('threshold', value_type=float), + Attribute('hash_bits', value_type=int, default=10), + ] + + def initialize(self): + shape = [self.attributes['n_sparse'] * self.attributes['n_chan']] + self.add_output_variable(shape) + + +class SparseConv2D(Layer): + _expected_attributes = [ + Attribute('n_sparse'), + Attribute('n_chan'), + Attribute('n_filt'), + Attribute('kernel_size'), + WeightAttribute('weight'), + WeightAttribute('bias'), + TypeAttribute('weight'), + TypeAttribute('bias'), + TypeAttribute('accum'), + ] + + def initialize(self): + shape = [self.attributes['n_sparse'] * self.attributes['n_filt']] + self.add_output_variable(shape) + self.add_weights(quantizer=self.get_attr('weight_quantizer')) + self.add_bias(quantizer=self.get_attr('bias_quantizer')) + + def add_bias(self, quantizer=None): + data = self.get_attr('bias_data', None) + precision = None + type_name = None + if data is None: + data = np.zeros(self.attributes['n_filt']) + precision = IntegerPrecisionType(width=1, signed=False) + type_name = 'bias{index}_t' + quantizer = None + self.add_weights_variable( + name='bias', var_name='b{index}', type_name=type_name, precision=precision, data=data, quantizer=quantizer + ) + + +class SparseActivation(Layer): + _expected_attributes = [ + Attribute('n_sparse'), + Attribute('n_chan'), + Attribute('activation', value_type=str), + ] + + def initialize(self): + shape = [self.attributes['n_sparse'] * self.attributes['n_chan']] + self.add_output_variable(shape) + + +class SparsePooling2D(Layer): + _expected_attributes = [ + Attribute('n_sparse'), + Attribute('n_chan'), + Attribute('pool_size'), + TypeAttribute('accum'), + ] + + def initialize(self): + shape = [self.attributes['n_sparse'] * self.attributes['n_chan']] + self.add_output_variable(shape) + + +class SparseFlatten(Layer): + _expected_attributes = [ + Attribute('n_sparse'), + Attribute('n_chan'), + Attribute('out_height'), + Attribute('out_width'), + ] + + def initialize(self): + shape = [self.attributes['out_height'] * self.attributes['out_width'] * self.attributes['n_chan']] + self.add_output_variable(shape) + + layer_map = { 'Input': Input, 'InputLayer': Input, @@ -1860,6 +1946,12 @@ def initialize(self): # TensorFlow-specific layers: 'BiasAdd': BiasAdd, 'DACombinational': DACombinational, + # Sparsepixels layers: + 'SparseInputReduce': SparseInputReduce, + 'SparseConv2D': SparseConv2D, + 'SparseActivation': SparseActivation, + 'SparsePooling2D': SparsePooling2D, + 'SparseFlatten': SparseFlatten, } diff --git a/hls4ml/model/optimizer/passes/bit_exact.py b/hls4ml/model/optimizer/passes/bit_exact.py index 88dc65c806..014d23e78e 100644 --- a/hls4ml/model/optimizer/passes/bit_exact.py +++ b/hls4ml/model/optimizer/passes/bit_exact.py @@ -34,6 +34,11 @@ Pooling2D, Reshape, Softmax, + SparseActivation, + SparseConv2D, + SparseFlatten, + SparseInputReduce, + SparsePooling2D, Transpose, ) from hls4ml.model.optimizer import ModelOptimizerPass, OptimizerPass @@ -197,6 +202,24 @@ def _(layer: Transpose): return ((k, i, f),) +@_request_kif.register +def _(layer: SparsePooling2D): + """SparsePooling2D has two inputs: features (idx=0) and hash (idx=1). + The hash input is an integer side-channel and must not widen the upstream's precision. + Return minimum values for the hash input so np.maximum in requested_kif does not + override the narrow request from the hash-producer's other downstream consumers (e.g. a FPQ).""" + # Default: max precision for the feature input (same as no dispatch) + feat_shape = get_input_shapes(layer)[0] + feat_kif = _maximum_kif_at_shape(feat_shape) + if len(get_input_shapes(layer)) > 1: + hash_shape = get_input_shapes(layer)[1] + k2 = np.zeros(hash_shape, dtype=np.int16) + i2 = np.full(hash_shape, -127, dtype=np.int16) + f2 = np.full(hash_shape, -127, dtype=np.int16) + return (feat_kif, (k2, i2, f2)) + return (feat_kif,) + + @_request_kif.register def _(layer: DACombinational): comb = layer.attributes['da_comb_trace'] @@ -677,6 +700,88 @@ def _(layer: Embedding): return k, i, f +@_produce_kif.register +def _(layer: SparseInputReduce): + k_in, i_in, f_in = get_input_kifs(layer)[0] + n_chan = layer.attributes['n_chan'] + n_sparse = layer.attributes['n_sparse'] + k_ch = np.max(k_in.reshape(-1, n_chan), axis=0) + i_ch = np.max(i_in.reshape(-1, n_chan), axis=0) + f_ch = np.max(f_in.reshape(-1, n_chan), axis=0) + return np.tile(k_ch, n_sparse), np.tile(i_ch, n_sparse), np.tile(f_ch, n_sparse) + + +@_produce_kif.register +def _(layer: SparseConv2D): + kernel = layer.attributes['weight'].data + _bias = layer.attributes['bias'] + bias = _bias.data if _bias is not None else 0 + k_in, i_in, f_in = get_input_kifs(layer)[0] + + n_sparse = layer.attributes['n_sparse'] + n_chan = layer.attributes['n_chan'] + n_filt = layer.attributes['n_filt'] + ks = layer.attributes['kernel_size'] + + # Match standard Conv2D precision: each output pixel accumulates ks*ks*n_chan + # MAC terms (the kernel window), same as dense conv. The sparse loop iterates + # n_sparse input pixels, but only those within the kernel radius contribute; + # the rest add 0. So the worst-case accumulation depth is ks*ks*n_chan, not n_sparse. + k_ch = np.tile(k_in[:n_chan], ks * ks) + i_ch = np.tile(i_in[:n_chan], ks * ks) + f_ch = np.tile(f_in[:n_chan], ks * ks) + qint_in = QIntervalArray.from_kif(k_ch, i_ch, f_ch) + + kernel_flat = kernel.reshape(-1, n_filt) # (ks*ks*n_chan, n_filt) + qint_out = qint_in @ kernel_flat + qint_out = qint_out + bias + k, i, f = qint_out.to_kif() + return ( + np.tile(k, n_sparse).astype(np.int16), + np.tile(i, n_sparse).astype(np.int16), + np.tile(f, n_sparse).astype(np.int16), + ) + + +@_produce_kif.register +def _(layer: SparseActivation): + k_in, i_in, f_in = get_input_kifs(layer)[0] + act = layer.attributes.get('activation', 'relu').lower() + if act == 'relu': + return np.zeros_like(k_in), i_in, f_in + return k_in, i_in, f_in + + +@_produce_kif.register +def _(layer: SparsePooling2D): + k_in, i_in, f_in = get_input_kifs(layer)[0] + # Average pooling divides by pool_size^2, adding fractional bits. + # Match standard Pooling2D: add ceil(log2(pool_size^2)) fractional bits. + pool_size = layer.attributes['pool_size'] + n_chan = layer.attributes['n_chan'] + extra_f = int(np.ceil(np.log2(pool_size * pool_size))) + k_ch = k_in[:n_chan] + i_ch = i_in[:n_chan] + f_ch = f_in[:n_chan] + extra_f + n_sparse = layer.attributes['n_sparse'] + return ( + np.tile(k_ch, n_sparse).astype(np.int16), + np.tile(i_ch, n_sparse).astype(np.int16), + np.tile(f_ch, n_sparse).astype(np.int16), + ) + + +@_produce_kif.register +def _(layer: SparseFlatten): + k_in, i_in, f_in = get_input_kifs(layer)[0] + n_chan = layer.attributes['n_chan'] + out_h = layer.attributes['out_height'] + out_w = layer.attributes['out_width'] + k_ch, i_ch, f_ch = k_in[:n_chan], i_in[:n_chan], f_in[:n_chan] + n_out = out_h * out_w + return np.tile(k_ch, n_out), np.tile(i_ch, n_out), np.tile(f_ch, n_out) + + def kif_arrs_to_ints(arr: tuple[np.ndarray, np.ndarray, np.ndarray]): return tuple(int(np.max(a)) for a in arr) @@ -966,6 +1071,8 @@ def get_output_layers_and_quantizers( elif isinstance(_node, (Reshape, Transpose, Concatenate)): layers.append(_node) get_output_layers_and_quantizers(_node, layers, quantizers) + elif isinstance(_node, (SparseInputReduce, SparseConv2D, SparseActivation, SparsePooling2D, SparseFlatten)): + layers.append(_node) else: raise ValueError(f'Layer {node.name} ({node.class_name}) unexpected input layer chain.') return layers, quantizers diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sparsepixels.h b/hls4ml/templates/vivado/nnet_utils/nnet_sparsepixels.h new file mode 100644 index 0000000000..41e5953f75 --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sparsepixels.h @@ -0,0 +1,254 @@ +#ifndef NNET_SPARSEPIXELS_H_ +#define NNET_SPARSEPIXELS_H_ + +#include "ap_fixed.h" +#include "ap_int.h" + +constexpr int _sp_floorlog2(int x) { return (x < 2) ? 0 : 1 + _sp_floorlog2(x / 2); } +constexpr int _sp_pow2(int x) { return x == 0 ? 1 : 2 * _sp_pow2(x - 1); } +// ceil(log2(x)): bits needed to encode values 0..x-1 +constexpr int _sp_ceillog2(int x) { return (x <= 1) ? 1 : _sp_floorlog2(x - 1) + 1; } + +template struct value_idx_pair { + T value; + ap_uint index; +}; + +template class Op_active { + public: + T operator()(T a, T b, t threshold) { + if (a.value > threshold) + return a; + else if (b.value > threshold) + return b; + else { + T none; + none.value = 0; + none.index = 0; + return none; + } + } +}; + +template T find_active(T *x, Op op, t threshold) { + #pragma HLS INLINE + static constexpr int leftN = _sp_pow2(_sp_floorlog2(N - 1)) > 0 ? _sp_pow2(_sp_floorlog2(N - 1)) : 0; + static constexpr int rightN = N - leftN > 0 ? N - leftN : 0; + + if (N == 1) { + return x[0]; + } + if (N == 2) { + return op(x[0], x[1], threshold); + } + return op(find_active(x, op, threshold), find_active(x + leftN, op, threshold), + threshold); +} + +template +void sparse_input_reduce(data_T input_arr[N_h * N_w * N_c], data_T threshold, res_T sparse_arr_feat[N_sparse * N_c], + hash_T sparse_arr_hash[N_sparse * 2]) { + + // Flat pixel index ranges over 0..N_h*N_w-1 -> auto-sized to minimum bits + static constexpr int IDX_BITS = _sp_ceillog2(N_h * N_w); + typedef value_idx_pair pair_t; + + pair_t pair_arr[N_h * N_w]; + int j_h_arr[N_h * N_w]; + int j_w_arr[N_h * N_w]; + #pragma HLS ARRAY_PARTITION variable = j_h_arr type = complete dim = 0 + #pragma HLS ARRAY_PARTITION variable = j_w_arr type = complete dim = 0 + #pragma HLS ARRAY_PARTITION variable = pair_arr type = complete dim = 0 + +DataPrepareLoop: + for (int j = 0; j < N_h * N_w; j++) { + #pragma HLS UNROLL + pair_arr[j].value = input_arr[N_c * j]; + pair_arr[j].index = j; + + int remainder = j % (N_h * N_w); + int j_h = remainder / N_w + 1; + int j_w = remainder % N_w + 1; + + j_h_arr[j] = j_h; + j_w_arr[j] = j_w; + } + + Op_active op_active; +MaxPixelsLoop: + for (int i = 0; i < N_sparse; i++) { + #pragma HLS PIPELINE + pair_t pair = find_active, data_T>(pair_arr, op_active, threshold); + sparse_arr_feat[N_c * i] = (res_T)pair.value; + for (int j = 1; j < N_c; j++) { + #pragma HLS UNROLL + sparse_arr_feat[N_c * i + j] = (res_T)input_arr[N_c * pair.index + j]; + } + + sparse_arr_hash[2 * i] = j_h_arr[pair.index]; + sparse_arr_hash[2 * i + 1] = j_w_arr[pair.index]; + + pair_arr[pair.index].value = 0; + } +} + +template +accum_T mult_for_sparse_conv_kernel(int offset_h, int offset_w, data_T sparse_arr_feat_in[n_chan * N_sparse], + w_T filt_w[ker_size * ker_size * n_chan * n_filt], int i_filt, int i_pixel_in) { + #pragma HLS INLINE + constexpr int R = (ker_size - 1) / 2; + if ((unsigned)(offset_h + R) >= ker_size || (unsigned)(offset_w + R) >= ker_size) { + return (accum_T)0; + } + ap_uint<4> row = R - offset_h; + ap_uint<4> col = R - offset_w; + ap_uint<7> pos = row * ker_size + col; + + accum_T acc = 0; +MultLoopPerFilter: + for (int i_chan = 0; i_chan < n_chan; i_chan++) { + #pragma HLS UNROLL + int w_idx = n_filt * n_chan * pos + n_filt * i_chan + i_filt; + acc += filt_w[w_idx] * sparse_arr_feat_in[n_chan * i_pixel_in + i_chan]; + } + return acc; +} + +template +void sparse_conv(data_T sparse_arr_feat_in[N_sparse * n_chan], res_T sparse_arr_feat_out[N_sparse * n_filt], + hash_T sparse_arr_hash[N_sparse * 2], w_T w[ker_size * ker_size * n_chan * n_filt], b_T b[n_filt]) { + +OutputPixelLoop: + for (int i_pixel_out = 0; i_pixel_out < N_sparse; i_pixel_out++) { + #pragma HLS UNROLL + + bool nonzero = false; + for (int i_chan = 0; i_chan < n_chan; i_chan++) { + #pragma HLS UNROLL + nonzero |= (sparse_arr_feat_in[i_pixel_out * n_chan + i_chan] != (data_T)0); + } + + OutputFilterLoop: + for (int i_filt = 0; i_filt < n_filt; i_filt++) { + #pragma HLS UNROLL + accum_T acc = 0; + + InputPixelLoop: + for (int i_pixel_in = 0; i_pixel_in < N_sparse; i_pixel_in++) { + #pragma HLS UNROLL + int offset_h = sparse_arr_hash[2 * i_pixel_out] - sparse_arr_hash[2 * i_pixel_in]; + int offset_w = sparse_arr_hash[2 * i_pixel_out + 1] - sparse_arr_hash[2 * i_pixel_in + 1]; + + acc += mult_for_sparse_conv_kernel( + offset_h, offset_w, sparse_arr_feat_in, w, i_filt, i_pixel_in); + } + + if (acc != 0) { + acc += b[i_filt]; + } + if (nonzero == false) { + acc = 0; + } + sparse_arr_feat_out[n_filt * i_pixel_out + i_filt] = (res_T)acc; + } + } +} + +template +void sparse_relu(data_T sparse_arr_feat_in[N_sparse * n_chan], res_T sparse_arr_feat_out[N_sparse * n_chan]) { + #pragma HLS PIPELINE + data_T data; + for (int i = 0; i < N_sparse * n_chan; i++) { + data = sparse_arr_feat_in[i]; + if (data > 0) { + sparse_arr_feat_out[i] = data; + } else { + sparse_arr_feat_out[i] = 0; + } + } +} + +template +void sparse_pooling_avg(data_T sparse_arr_feat_in[N_sparse * n_chan], res_T sparse_arr_feat_out[N_sparse * n_chan], + hash_T sparse_arr_hash_in[N_sparse * 2], hash_T sparse_arr_hash_out[N_sparse * 2]) { + + constexpr double _pool_size_recip_d = 1.0 / double(pool_size); + const ap_fixed<10, 0> pool_size_recip = _pool_size_recip_d; + + int hash_tmp[N_sparse * 2]; +#pragma HLS ARRAY_PARTITION variable = hash_tmp type = complete dim = 0 +ComputePooledLoc: + for (int i = 0; i < N_sparse; i++) { + #pragma HLS UNROLL + hash_tmp[2 * i] = (sparse_arr_hash_in[2 * i] - 1) / pool_size + 1; + hash_tmp[2 * i + 1] = (sparse_arr_hash_in[2 * i + 1] - 1) / pool_size + 1; + } + + data_T sparse_arr_feat_in_copy[N_sparse * n_chan]; + #pragma HLS ARRAY_PARTITION variable = sparse_arr_feat_in_copy type = complete dim = 0 + for (int i = 0; i < N_sparse * n_chan; i++) { + #pragma HLS UNROLL + sparse_arr_feat_in_copy[i] = sparse_arr_feat_in[i]; + } + +HashOutLoop: + for (int i_pixel = 0; i_pixel < N_sparse; i_pixel++) { + #pragma HLS UNROLL + int h_out = hash_tmp[2 * i_pixel]; + int w_out = hash_tmp[2 * i_pixel + 1]; + + ChannelLoop: + for (int i_chan = 0; i_chan < n_chan; i_chan++) { + #pragma HLS UNROLL + accum_T acc = 0; + + HashInLoop: + for (int j_pixel = 0; j_pixel < N_sparse; j_pixel++) { + #pragma HLS UNROLL + int h_in = hash_tmp[2 * j_pixel]; + int w_in = hash_tmp[2 * j_pixel + 1]; + + data_T data = sparse_arr_feat_in_copy[n_chan * j_pixel + i_chan]; + if ((h_out == h_in) && (w_out == w_in)) { + acc += data; + sparse_arr_feat_in_copy[n_chan * j_pixel + i_chan] = 0; + } + } + sparse_arr_feat_out[n_chan * i_pixel + i_chan] = (res_T)(acc * pool_size_recip * pool_size_recip); + } + sparse_arr_hash_out[2 * i_pixel] = h_out; + sparse_arr_hash_out[2 * i_pixel + 1] = w_out; + } +} + +template +void sparse_flatten(data_T sparse_arr_feat[N_sparse * n_chan], hash_T sparse_arr_hash[N_sparse * 2], + res_T flat_arr[n_height * n_width * n_chan]) { + +InitFlatArr: + for (int i = 0; i < n_height * n_width * n_chan; i++) { + #pragma HLS UNROLL + flat_arr[i] = 0; + } + +FillFlatArr: + for (int i = 0; i < N_sparse; i++) { + #pragma HLS UNROLL factor = 4 + int i_h = sparse_arr_hash[2 * i]; + int i_w = sparse_arr_hash[2 * i + 1]; + int pixel_idx = (i_h - 1) * n_width + (i_w - 1); + + ChannelLoop: + for (int i_chan = 0; i_chan < n_chan; i_chan++) { + #pragma HLS UNROLL + data_T data = sparse_arr_feat[n_chan * i + i_chan]; + + if (data != 0) { + flat_arr[n_chan * pixel_idx + i_chan] = (res_T)data; + } + } + } +} + +#endif // NNET_SPARSEPIXELS_H_ diff --git a/pyproject.toml b/pyproject.toml index a3ccc2e529..ac7d53276f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,6 +50,7 @@ optional-dependencies.qkeras = [ "tensorflow-model-optimization<=0.7.5", ] optional-dependencies.quartus-report = [ "calmjs-parse", "tabulate" ] +optional-dependencies.sparsepixels = [ "sparsepixels>=0.2.2" ] optional-dependencies.sr = [ "sympy>=1.13.1" ] optional-dependencies.testing = [ "calmjs-parse", diff --git a/test/pytest/test_sparsepixels.py b/test/pytest/test_sparsepixels.py new file mode 100644 index 0000000000..aaf92c4b85 --- /dev/null +++ b/test/pytest/test_sparsepixels.py @@ -0,0 +1,79 @@ +from pathlib import Path + +import keras +import numpy as np +import pytest + +sparsepixels = pytest.importorskip('sparsepixels') + +from hgq.config import LayerConfigScope, QuantizerConfigScope # noqa: E402 +from hgq.layers import QDense # noqa: E402 +from hgq.quantizer.config import QuantizerConfig # noqa: E402 +from keras.layers import Flatten # noqa: E402 +from sparsepixels.layers import AveragePooling2DSparse, InputReduce, QConv2DSparse # noqa: E402 + +import hls4ml # noqa: E402 + +test_root_path = Path(__file__).parent + + +def _build_sparse_cnn(input_shape=(8, 8, 1), n_max_pixels=4, threshold=0.4): + iq_conf = QuantizerConfig(place='datalane', q_type='kif', i0=4, f0=8, overflow_mode='WRAP') + with ( + QuantizerConfigScope(place='all', default_q_type='kbi', overflow_mode='SAT_SYM'), + QuantizerConfigScope(place='datalane', default_q_type='kif', overflow_mode='WRAP'), + LayerConfigScope(enable_ebops=True, enable_iq=True, beta0=1e-5), + ): + x_in = keras.Input(shape=input_shape, name='x_in') + x, keep_mask = InputReduce(n_max_pixels=n_max_pixels, threshold=threshold, name='input_reduce')(x_in) + x = QConv2DSparse( + filters=2, + kernel_size=3, + name='conv', + padding='same', + strides=1, + activation='relu', + iq_conf=iq_conf, + )([x, keep_mask]) + x, keep_mask = AveragePooling2DSparse(2, name='pool')([x, keep_mask]) + x = Flatten(name='flatten')(x) + x = QDense(1, name='dense', iq_conf=iq_conf)(x) + return keras.Model(x_in, x, name='cnn_sparse_test') + + +def _make_sparse_inputs(n_samples, h=8, w=8, n_active_per_sample=4, threshold=0.4): + x = np.zeros((n_samples, h, w, 1), dtype=np.float32) + for i in range(n_samples): + active_idx = np.random.choice(h * w, size=n_active_per_sample, replace=False) + for idx in active_idx: + x[i, idx // w, idx % w, 0] = threshold + 0.1 + np.random.rand() * 0.5 + return x + + +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis']) +def test_sparse_cnn(test_case_id, backend): + np.random.seed(42) + keras.utils.set_random_seed(42) + + model = _build_sparse_cnn() + x = _make_sparse_inputs(n_samples=1000) + + y_keras = model.predict(x, verbose=0) + + output_dir = test_root_path / test_case_id + hls_config = hls4ml.utils.config_from_keras_model(model, granularity='name', backend=backend) + hls_model = hls4ml.converters.convert_from_keras_model( + model, + hls_config=hls_config, + output_dir=str(output_dir), + backend=backend, + io_type='io_parallel', + ) + hls_model.compile() + + y_hls = hls_model.predict(x).reshape(y_keras.shape) + + mean_abs_diff = float(np.mean(np.abs(y_keras - y_hls))) + print(f'sparse-pixels {backend}: mean|diff|={mean_abs_diff:.4f}') + + assert mean_abs_diff < 0.5 From fe1ca2b33d83f42298fec3916e71791c30bd0757 Mon Sep 17 00:00:00 2001 From: Ho Fung Tsoi Date: Thu, 2 Jul 2026 22:55:47 -0400 Subject: [PATCH 2/3] upgrade: parallelization knobs for sparse layers; streaming option for input reduction; add maxpooling --- hls4ml/backends/vivado/passes/sparsepixels.py | 51 ++++- hls4ml/converters/keras_v3/sparsepixels.py | 51 +++-- hls4ml/model/layers.py | 1 + hls4ml/model/optimizer/passes/bit_exact.py | 48 ++++- .../vivado/nnet_utils/nnet_sparsepixels.h | 190 ++++++++++++++---- test/pytest/test_sparsepixels.py | 65 ++++-- 6 files changed, 326 insertions(+), 80 deletions(-) diff --git a/hls4ml/backends/vivado/passes/sparsepixels.py b/hls4ml/backends/vivado/passes/sparsepixels.py index 0dd3329c10..6b8a065e22 100644 --- a/hls4ml/backends/vivado/passes/sparsepixels.py +++ b/hls4ml/backends/vivado/passes/sparsepixels.py @@ -96,6 +96,15 @@ def transform(self, model, node): model.replace_node(n, new_node) changed = True + else: + # Passthrough nodes inside the sparse region (e.g. an input quantizer that + # bit_exact has rendered as a linear activation) preserve the sparse pixel + # layout, so carry the hash variable and spatial dims to the next sparse layer. + if n.inputs and n.inputs[0] in hash_map: + src = n.inputs[0] + hash_map[name] = hash_map[src] + spatial[name] = spatial.get(src) + return changed @@ -184,31 +193,42 @@ def format(self, node): # Function-call templates +# Input reduce: {fn} selects the tree (default) or the streaming implementation. sparse_input_reduce_function = ( '{input_t} threshold_{index} = {threshold};\n' 'ap_uint<{hash_bits}> {hash_out}[{n_sparse} * 2];\n' '#pragma HLS ARRAY_PARTITION variable={hash_out} complete dim=0\n' - 'sparse_input_reduce<{input_t}, {output_t}, ap_uint<{hash_bits}>, {in_height}, {in_width}, {n_chan}, {n_sparse}>' + '{fn}<{input_t}, {output_t}, ap_uint<{hash_bits}>, {in_height}, {in_width}, {n_chan}, {n_sparse}>' '({input}, threshold_{index}, {output}, {hash_out});' ) +# The last two template args are the parallelization factors (default to full parallelism). sparse_conv2d_function = ( 'sparse_conv<{input_t}, {output_t}, ap_uint<{hash_bits}>, {weight_t}, {bias_t}, {accum_t_name}, ' - '{n_sparse}, {n_chan}, {n_filt}, {kernel_size}>' + '{n_sparse}, {n_chan}, {n_filt}, {kernel_size}, {pixel_parallel_factor}, {filt_parallel_factor}>' '({input}, {output}, {hash_in}, {w}, {b});' ) sparse_activation_function = 'sparse_relu<{input_t}, {output_t}, {n_sparse}, {n_chan}>({input}, {output});' -sparse_pooling2d_function = ( - 'ap_uint<{hash_bits}> {hash_out}[{n_sparse} * 2];\n' - '#pragma HLS ARRAY_PARTITION variable={hash_out} complete dim=0\n' - 'sparse_pooling_avg<{input_t}, {output_t}, ap_uint<{hash_bits}>, {accum_t_name}, {n_sparse}, {n_chan}, {pool_size}>' +sparse_pooling2d_prefix = ( + 'ap_uint<{hash_bits}> {hash_out}[{n_sparse} * 2];\n#pragma HLS ARRAY_PARTITION variable={hash_out} complete dim=0\n' +) +# Average pooling takes an accum_t; max pooling does not. Both take the two parallelization factors. +sparse_pooling2d_avg_call = ( + 'sparse_pooling_avg<{input_t}, {output_t}, ap_uint<{hash_bits}>, {accum_t_name}, ' + '{n_sparse}, {n_chan}, {pool_size}, {pixel_parallel_factor}, {chan_parallel_factor}>' + '({input}, {output}, {hash_in}, {hash_out});' +) +sparse_pooling2d_max_call = ( + 'sparse_pooling_max<{input_t}, {output_t}, ap_uint<{hash_bits}>, ' + '{n_sparse}, {n_chan}, {pool_size}, {pixel_parallel_factor}, {chan_parallel_factor}>' '({input}, {output}, {hash_in}, {hash_out});' ) sparse_flatten_function = ( - 'sparse_flatten<{input_t}, {output_t}, ap_uint<{hash_bits}>, {out_height}, {out_width}, {n_chan}, {n_sparse}>' + 'sparse_flatten<{input_t}, {output_t}, ap_uint<{hash_bits}>, {out_height}, {out_width}, {n_chan}, ' + '{n_sparse}, {parallel_factor}>' '({input}, {hash_in}, {output});' ) @@ -240,6 +260,8 @@ def format(self, node): params['hash_bits'] = node.get_attr('hash_bits') params['threshold'] = node.get_attr('threshold') params['hash_out'] = node.get_attr('hash_out_name') + variant = node.get_attr('variant', 'tree') + params['fn'] = 'sparse_input_reduce_stream' if variant == 'stream' else 'sparse_input_reduce' return self.template.format(**params) @@ -261,6 +283,8 @@ def format(self, node): params['weight_t'] = node.get_weights('weight').type.name params['bias_t'] = node.get_weights('bias').type.name params['accum_t_name'] = node.get_attr('accum_t').name + params['pixel_parallel_factor'] = node.get_attr('pixel_parallel_factor') or params['n_sparse'] + params['filt_parallel_factor'] = node.get_attr('filt_parallel_factor') or params['n_filt'] return self.template.format(**params) @@ -279,7 +303,7 @@ def format(self, node): class SparsePooling2DFunctionTemplate(FunctionCallTemplate): def __init__(self): super().__init__(SparsePooling2D, include_header=sparsepixels_include) - self.template = sparse_pooling2d_function + self.template = sparse_pooling2d_prefix + sparse_pooling2d_avg_call def format(self, node): params = self._default_function_params(node) @@ -289,8 +313,14 @@ def format(self, node): params['hash_bits'] = _get_hash_bits(node) params['hash_in'] = node.get_attr('hash_in_name') params['hash_out'] = node.get_attr('hash_out_name') - params['accum_t_name'] = node.get_attr('accum_t').name - return self.template.format(**params) + params['pixel_parallel_factor'] = node.get_attr('pixel_parallel_factor') or params['n_sparse'] + params['chan_parallel_factor'] = node.get_attr('chan_parallel_factor') or params['n_chan'] + if node.get_attr('pool_op', 'avg') == 'max': + template = sparse_pooling2d_prefix + sparse_pooling2d_max_call + else: + template = sparse_pooling2d_prefix + sparse_pooling2d_avg_call + params['accum_t_name'] = node.get_attr('accum_t').name + return template.format(**params) class SparseFlattenFunctionTemplate(FunctionCallTemplate): @@ -306,6 +336,7 @@ def format(self, node): params['out_width'] = node.get_attr('out_width') params['hash_bits'] = _get_hash_bits(node) params['hash_in'] = node.get_attr('hash_in_name') + params['parallel_factor'] = node.get_attr('parallel_factor') or (params['out_height'] * params['out_width']) return self.template.format(**params) diff --git a/hls4ml/converters/keras_v3/sparsepixels.py b/hls4ml/converters/keras_v3/sparsepixels.py index 066e6a070e..f1eab5f31c 100644 --- a/hls4ml/converters/keras_v3/sparsepixels.py +++ b/hls4ml/converters/keras_v3/sparsepixels.py @@ -219,6 +219,30 @@ def handle( return tuple(results) +def _sparse_pooling_config( + in_tensors: Sequence['KerasTensor'], out_tensors: Sequence['KerasTensor'], pool_size: int, pool_op: str +) -> dict[str, Any]: + """Shared config for the average/max sparse pooling handlers (differ only in pool_op).""" + feat_shape: tuple[int, ...] = in_tensors[0].shape[1:] # type: ignore + n_chan = int(feat_shape[-1]) + n_sparse = _sparse_context.get('n_sparse', 0) + + prev_h, prev_w = _sparse_context.get('spatial', (1, 1)) + new_h, new_w = prev_h // pool_size, prev_w // pool_size + _sparse_context['spatial'] = (new_h, new_w) + + for t in out_tensors: + _mark_sparse_output(t.name, n_sparse, n_chan, new_h, new_w) + + return { + 'class_name': 'SparsePooling2D', + 'n_sparse': n_sparse, + 'n_chan': n_chan, + 'pool_size': pool_size, + 'pool_op': pool_op, + } + + class AveragePooling2DSparseHandler(KerasV3LayerHandler): handles = ('sparsepixels.layers.AveragePooling2DSparse',) @@ -228,23 +252,16 @@ def handle( in_tensors: Sequence['KerasTensor'], out_tensors: Sequence['KerasTensor'], ): - pool_size = int(layer.avg_pool.pool_size[0]) + return _sparse_pooling_config(in_tensors, out_tensors, int(layer.avg_pool.pool_size[0]), 'avg') - feat_shape: tuple[int, ...] = in_tensors[0].shape[1:] # type: ignore - n_chan = int(feat_shape[-1]) - n_sparse = _sparse_context.get('n_sparse', 0) - prev_h, prev_w = _sparse_context.get('spatial', (1, 1)) - new_h, new_w = prev_h // pool_size, prev_w // pool_size - _sparse_context['spatial'] = (new_h, new_w) +class MaxPooling2DSparseHandler(KerasV3LayerHandler): + handles = ('sparsepixels.layers.MaxPooling2DSparse',) - out_tensor_names = [t.name for t in out_tensors] - for t_name in out_tensor_names: - _mark_sparse_output(t_name, n_sparse, n_chan, new_h, new_w) - - return { - 'class_name': 'SparsePooling2D', - 'n_sparse': n_sparse, - 'n_chan': n_chan, - 'pool_size': pool_size, - } + def handle( + self, + layer: 'keras.Layer', + in_tensors: Sequence['KerasTensor'], + out_tensors: Sequence['KerasTensor'], + ): + return _sparse_pooling_config(in_tensors, out_tensors, int(layer.max_pool.pool_size[0]), 'max') diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index 9f5a89dd8c..08945876f6 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -1956,6 +1956,7 @@ class SparsePooling2D(Layer): Attribute('n_sparse'), Attribute('n_chan'), Attribute('pool_size'), + Attribute('pool_op', value_type=str, default='avg'), # 'avg' or 'max' TypeAttribute('accum'), ] diff --git a/hls4ml/model/optimizer/passes/bit_exact.py b/hls4ml/model/optimizer/passes/bit_exact.py index 5deca39286..5e2d1d9854 100644 --- a/hls4ml/model/optimizer/passes/bit_exact.py +++ b/hls4ml/model/optimizer/passes/bit_exact.py @@ -222,6 +222,45 @@ def _(layer: SparsePooling2D): return (feat_kif,) +@_request_kif.register +def _(layer: SparseInputReduce): + """Propagate the downstream precision request back to the dense model input. + + The output is packed per (sparse pixel, channel); any input pixel may be selected into any + slot, so each input position must satisfy the max request over slots (per channel), broadcast + across the H*W input grid. Without this the untrusted model input keeps its maximal placeholder + precision, which downstream passes then collapse to a degenerate type (e.g. ap_ufixed<1,0>), + clamping the real inputs. See also the SparseFlatten dispatch below.""" + n_chan = layer.attributes['n_chan'] + n_sparse = layer.attributes['n_sparse'] + in_shape = get_input_shapes(layer)[0] # dense model input grid, channel-last + k, i, f = requested_kif(layer) + + def to_in(a): + per_chan = a.reshape(n_sparse, n_chan).max(axis=0) + return np.broadcast_to(per_chan, in_shape).astype(np.int16) + + return ((to_in(k), to_in(i), to_in(f)),) + + +@_request_kif.register +def _(layer: SparseFlatten): + """Map the flattened (dense) request back to the packed sparse input. Each sparse slot can + scatter to any spatial position, so its request is the max over positions (per channel). This + lets the request reach the SparseInputReduce (and hence the model input) when no quantizer sits + between them.""" + n_chan = layer.attributes['n_chan'] + n_sparse = layer.attributes['n_sparse'] + n_pos = layer.attributes['out_height'] * layer.attributes['out_width'] + k, i, f = requested_kif(layer) + + def to_in(a): + per_chan = a.reshape(n_pos, n_chan).max(axis=0) + return np.tile(per_chan, n_sparse).astype(np.int16) + + return ((to_in(k), to_in(i), to_in(f)),) + + @_request_kif.register def _(layer: DACombinational): comb = layer.attributes['da_comb_trace'] @@ -757,11 +796,14 @@ def _(layer: SparseActivation): @_produce_kif.register def _(layer: SparsePooling2D): k_in, i_in, f_in = get_input_kifs(layer)[0] - # Average pooling divides by pool_size^2, adding fractional bits. - # Match standard Pooling2D: add ceil(log2(pool_size^2)) fractional bits. + # Average pooling divides by pool_size^2, which adds ceil(log2(pool_size^2)) fractional bits + # (matching standard Pooling2D). Max pooling just selects an input, so the precision is unchanged. pool_size = layer.attributes['pool_size'] n_chan = layer.attributes['n_chan'] - extra_f = int(np.ceil(np.log2(pool_size * pool_size))) + if layer.attributes.get('pool_op', 'avg') == 'max': + extra_f = 0 + else: + extra_f = int(np.ceil(np.log2(pool_size * pool_size))) k_ch = k_in[:n_chan] i_ch = i_in[:n_chan] f_ch = f_in[:n_chan] + extra_f diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sparsepixels.h b/hls4ml/templates/vivado/nnet_utils/nnet_sparsepixels.h index 41e5953f75..c31a516d52 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_sparsepixels.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_sparsepixels.h @@ -45,11 +45,14 @@ template T find_active(T *x, Op op, t thresh threshold); } +// Input-reduce (find-max tree): selects the first N_sparse active pixels (first input channel +// > threshold) in raster order and emits their features (all channels) and 1-based (h, w) hashes. +// A combinational find-active reduction is reused across N_sparse pipelined extractions -- low +// latency, high LUT. template void sparse_input_reduce(data_T input_arr[N_h * N_w * N_c], data_T threshold, res_T sparse_arr_feat[N_sparse * N_c], hash_T sparse_arr_hash[N_sparse * 2]) { - // Flat pixel index ranges over 0..N_h*N_w-1 -> auto-sized to minimum bits static constexpr int IDX_BITS = _sp_ceillog2(N_h * N_w); typedef value_idx_pair pair_t; @@ -92,6 +95,41 @@ void sparse_input_reduce(data_T input_arr[N_h * N_w * N_c], data_T threshold, re } } +// Input-reduce (streaming): same selection as the tree, via a one-pixel-per-cycle raster scan -- +// minimal LUT, latency ~N_h*N_w. Unused output slots (fewer than N_sparse active pixels) are zeroed. +template +void sparse_input_reduce_stream(data_T input_arr[N_h * N_w * N_c], data_T threshold, res_T sparse_arr_feat[N_sparse * N_c], + hash_T sparse_arr_hash[N_sparse * 2]) { + constexpr int NP = N_h * N_w; + +InitOut: + for (int s = 0; s < N_sparse; s++) { + #pragma HLS UNROLL + for (int c = 0; c < N_c; c++) { + #pragma HLS UNROLL + sparse_arr_feat[N_c * s + c] = 0; + } + sparse_arr_hash[2 * s] = 0; + sparse_arr_hash[2 * s + 1] = 0; + } + + int cnt = 0; +ScanLoop: + for (int j = 0; j < NP; j++) { + #pragma HLS PIPELINE + if (cnt < N_sparse && input_arr[N_c * j] > threshold) { + sparse_arr_feat[N_c * cnt] = (res_T)input_arr[N_c * j]; + for (int c = 1; c < N_c; c++) { + #pragma HLS UNROLL + sparse_arr_feat[N_c * cnt + c] = (res_T)input_arr[N_c * j + c]; + } + sparse_arr_hash[2 * cnt] = j / N_w + 1; + sparse_arr_hash[2 * cnt + 1] = j % N_w + 1; + cnt++; + } + } +} + template accum_T mult_for_sparse_conv_kernel(int offset_h, int offset_w, data_T sparse_arr_feat_in[n_chan * N_sparse], w_T filt_w[ker_size * ker_size * n_chan * n_filt], int i_filt, int i_pixel_in) { @@ -100,9 +138,14 @@ accum_T mult_for_sparse_conv_kernel(int offset_h, int offset_w, data_T sparse_ar if ((unsigned)(offset_h + R) >= ker_size || (unsigned)(offset_w + R) >= ker_size) { return (accum_T)0; } - ap_uint<4> row = R - offset_h; - ap_uint<4> col = R - offset_w; - ap_uint<7> pos = row * ker_size + col; + // Smallest functional widths for the given ker_size (compile-time): + // row, col in [0, ker_size-1] -> ceil(log2(ker_size)) bits + // pos in [0, ker_size*ker_size-1] -> ceil(log2(ker_size*ker_size)) bits + static constexpr int ROW_BITS = _sp_ceillog2(ker_size); + static constexpr int POS_BITS = _sp_ceillog2(ker_size * ker_size); + ap_uint row = R - offset_h; + ap_uint col = R - offset_w; + ap_uint pos = row * ker_size + col; accum_T acc = 0; MultLoopPerFilter: @@ -114,14 +157,21 @@ accum_T mult_for_sparse_conv_kernel(int offset_h, int offset_w, data_T sparse_ar return acc; } +// Sparse convolution on the active pixels. Two independent parallelization knobs trade LUT for +// latency without changing the output: +// pixel_parallel_factor : output pixels (N_sparse axis) computed per cycle. Default = N_sparse. +// filt_parallel_factor : output filters (n_filt axis) computed per cycle. Default = n_filt. +// Both loops use UNROLL factor (no PIPELINE: pipelining the outer loop would force-unroll the filter +// loop and ignore filt_parallel_factor); inter-layer throughput comes from the top-level DATAFLOW. +// accum_T accumulates the MACs; a single cast to res_T is applied at the store. template + int ker_size, int pixel_parallel_factor = N_sparse, int filt_parallel_factor = n_filt> void sparse_conv(data_T sparse_arr_feat_in[N_sparse * n_chan], res_T sparse_arr_feat_out[N_sparse * n_filt], hash_T sparse_arr_hash[N_sparse * 2], w_T w[ker_size * ker_size * n_chan * n_filt], b_T b[n_filt]) { OutputPixelLoop: for (int i_pixel_out = 0; i_pixel_out < N_sparse; i_pixel_out++) { - #pragma HLS UNROLL + #pragma HLS UNROLL factor = pixel_parallel_factor bool nonzero = false; for (int i_chan = 0; i_chan < n_chan; i_chan++) { @@ -131,7 +181,7 @@ void sparse_conv(data_T sparse_arr_feat_in[N_sparse * n_chan], res_T sparse_arr_ OutputFilterLoop: for (int i_filt = 0; i_filt < n_filt; i_filt++) { - #pragma HLS UNROLL + #pragma HLS UNROLL factor = filt_parallel_factor accum_T acc = 0; InputPixelLoop: @@ -169,7 +219,12 @@ void sparse_relu(data_T sparse_arr_feat_in[N_sparse * n_chan], res_T sparse_arr_ } } -template +// Sparse average pooling. Each pooled cell is emitted once -- by the lowest-indexed output pixel +// mapping to it (the is_first test); duplicate pixels of the same cell emit 0. The averaging reads +// only the input array (no scratch mutation), so it is safe to partially unroll. Two independent +// knobs: pixel_parallel_factor (N_sparse axis) and chan_parallel_factor (n_chan axis). +template void sparse_pooling_avg(data_T sparse_arr_feat_in[N_sparse * n_chan], res_T sparse_arr_feat_out[N_sparse * n_chan], hash_T sparse_arr_hash_in[N_sparse * 2], hash_T sparse_arr_hash_out[N_sparse * 2]) { @@ -177,7 +232,7 @@ void sparse_pooling_avg(data_T sparse_arr_feat_in[N_sparse * n_chan], res_T spar const ap_fixed<10, 0> pool_size_recip = _pool_size_recip_d; int hash_tmp[N_sparse * 2]; -#pragma HLS ARRAY_PARTITION variable = hash_tmp type = complete dim = 0 + #pragma HLS ARRAY_PARTITION variable = hash_tmp type = complete dim = 0 ComputePooledLoc: for (int i = 0; i < N_sparse; i++) { #pragma HLS UNROLL @@ -185,23 +240,81 @@ void sparse_pooling_avg(data_T sparse_arr_feat_in[N_sparse * n_chan], res_T spar hash_tmp[2 * i + 1] = (sparse_arr_hash_in[2 * i + 1] - 1) / pool_size + 1; } - data_T sparse_arr_feat_in_copy[N_sparse * n_chan]; - #pragma HLS ARRAY_PARTITION variable = sparse_arr_feat_in_copy type = complete dim = 0 - for (int i = 0; i < N_sparse * n_chan; i++) { +HashOutLoop: + for (int i_pixel = 0; i_pixel < N_sparse; i_pixel++) { + #pragma HLS UNROLL factor = pixel_parallel_factor + int h_out = hash_tmp[2 * i_pixel]; + int w_out = hash_tmp[2 * i_pixel + 1]; + + bool is_first = true; + FirstCheck: + for (int k = 0; k < N_sparse; k++) { + #pragma HLS UNROLL + if (k < i_pixel && hash_tmp[2 * k] == h_out && hash_tmp[2 * k + 1] == w_out) { + is_first = false; + } + } + + ChannelLoop: + for (int i_chan = 0; i_chan < n_chan; i_chan++) { + #pragma HLS UNROLL factor = chan_parallel_factor + accum_T acc = 0; + + HashInLoop: + for (int j_pixel = 0; j_pixel < N_sparse; j_pixel++) { + #pragma HLS UNROLL + int h_in = hash_tmp[2 * j_pixel]; + int w_in = hash_tmp[2 * j_pixel + 1]; + + if ((h_out == h_in) && (w_out == w_in)) { + acc += sparse_arr_feat_in[n_chan * j_pixel + i_chan]; + } + } + sparse_arr_feat_out[n_chan * i_pixel + i_chan] = + is_first ? (res_T)(acc * pool_size_recip * pool_size_recip) : (res_T)0; + } + sparse_arr_hash_out[2 * i_pixel] = h_out; + sparse_arr_hash_out[2 * i_pixel + 1] = w_out; + } +} + +// Sparse max pooling. Same structure as the average version (one emission per pooled cell via the +// is_first test), but takes the per-channel maximum of the active pixels in the cell, floored at 0 +// to match dense max pooling over the zero-masked window. Two independent knobs: +// pixel_parallel_factor (N_sparse axis) and chan_parallel_factor (n_chan axis). +template +void sparse_pooling_max(data_T sparse_arr_feat_in[N_sparse * n_chan], res_T sparse_arr_feat_out[N_sparse * n_chan], + hash_T sparse_arr_hash_in[N_sparse * 2], hash_T sparse_arr_hash_out[N_sparse * 2]) { + + int hash_tmp[N_sparse * 2]; + #pragma HLS ARRAY_PARTITION variable = hash_tmp type = complete dim = 0 +ComputePooledLoc: + for (int i = 0; i < N_sparse; i++) { #pragma HLS UNROLL - sparse_arr_feat_in_copy[i] = sparse_arr_feat_in[i]; + hash_tmp[2 * i] = (sparse_arr_hash_in[2 * i] - 1) / pool_size + 1; + hash_tmp[2 * i + 1] = (sparse_arr_hash_in[2 * i + 1] - 1) / pool_size + 1; } HashOutLoop: for (int i_pixel = 0; i_pixel < N_sparse; i_pixel++) { - #pragma HLS UNROLL + #pragma HLS UNROLL factor = pixel_parallel_factor int h_out = hash_tmp[2 * i_pixel]; int w_out = hash_tmp[2 * i_pixel + 1]; + bool is_first = true; + FirstCheck: + for (int k = 0; k < N_sparse; k++) { + #pragma HLS UNROLL + if (k < i_pixel && hash_tmp[2 * k] == h_out && hash_tmp[2 * k + 1] == w_out) { + is_first = false; + } + } + ChannelLoop: for (int i_chan = 0; i_chan < n_chan; i_chan++) { - #pragma HLS UNROLL - accum_T acc = 0; + #pragma HLS UNROLL factor = chan_parallel_factor + data_T vmax = 0; HashInLoop: for (int j_pixel = 0; j_pixel < N_sparse; j_pixel++) { @@ -209,44 +322,53 @@ void sparse_pooling_avg(data_T sparse_arr_feat_in[N_sparse * n_chan], res_T spar int h_in = hash_tmp[2 * j_pixel]; int w_in = hash_tmp[2 * j_pixel + 1]; - data_T data = sparse_arr_feat_in_copy[n_chan * j_pixel + i_chan]; - if ((h_out == h_in) && (w_out == w_in)) { - acc += data; - sparse_arr_feat_in_copy[n_chan * j_pixel + i_chan] = 0; + data_T v = sparse_arr_feat_in[n_chan * j_pixel + i_chan]; + if ((h_out == h_in) && (w_out == w_in) && (v > vmax)) { + vmax = v; } } - sparse_arr_feat_out[n_chan * i_pixel + i_chan] = (res_T)(acc * pool_size_recip * pool_size_recip); + sparse_arr_feat_out[n_chan * i_pixel + i_chan] = is_first ? (res_T)vmax : (res_T)0; } sparse_arr_hash_out[2 * i_pixel] = h_out; sparse_arr_hash_out[2 * i_pixel + 1] = w_out; } } -template +// Scatters the sparse pixels back to a dense n_height * n_width * n_chan grid (the sparse->dense +// transition before Dense layers). Implemented as a gather: each dense location is written exactly +// once by scanning the sparse pixels for the one mapping to it (no data-dependent writes), so it is +// safe to fully or partially unroll. parallel_factor = dense locations produced per cycle. +template void sparse_flatten(data_T sparse_arr_feat[N_sparse * n_chan], hash_T sparse_arr_hash[N_sparse * 2], res_T flat_arr[n_height * n_width * n_chan]) { -InitFlatArr: - for (int i = 0; i < n_height * n_width * n_chan; i++) { + int pix_idx[N_sparse]; + #pragma HLS ARRAY_PARTITION variable = pix_idx type = complete dim = 0 +PixIdxLoop: + for (int i = 0; i < N_sparse; i++) { #pragma HLS UNROLL - flat_arr[i] = 0; + pix_idx[i] = (sparse_arr_hash[2 * i] - 1) * n_width + (sparse_arr_hash[2 * i + 1] - 1); } -FillFlatArr: - for (int i = 0; i < N_sparse; i++) { - #pragma HLS UNROLL factor = 4 - int i_h = sparse_arr_hash[2 * i]; - int i_w = sparse_arr_hash[2 * i + 1]; - int pixel_idx = (i_h - 1) * n_width + (i_w - 1); +GatherLoop: + for (int p = 0; p < n_height * n_width; p++) { + #pragma HLS UNROLL factor = parallel_factor ChannelLoop: for (int i_chan = 0; i_chan < n_chan; i_chan++) { #pragma HLS UNROLL - data_T data = sparse_arr_feat[n_chan * i + i_chan]; + res_T val = 0; - if (data != 0) { - flat_arr[n_chan * pixel_idx + i_chan] = (res_T)data; + ScanLoop: + for (int i = 0; i < N_sparse; i++) { + #pragma HLS UNROLL + data_T data = sparse_arr_feat[n_chan * i + i_chan]; + if (pix_idx[i] == p && data != 0) { + val = (res_T)data; + } } + flat_arr[n_chan * p + i_chan] = val; } } } diff --git a/test/pytest/test_sparsepixels.py b/test/pytest/test_sparsepixels.py index aaf92c4b85..53f30c4e5c 100644 --- a/test/pytest/test_sparsepixels.py +++ b/test/pytest/test_sparsepixels.py @@ -10,14 +10,19 @@ from hgq.layers import QDense # noqa: E402 from hgq.quantizer.config import QuantizerConfig # noqa: E402 from keras.layers import Flatten # noqa: E402 -from sparsepixels.layers import AveragePooling2DSparse, InputReduce, QConv2DSparse # noqa: E402 +from sparsepixels.layers import ( # noqa: E402 + AveragePooling2DSparse, + InputReduce, + MaxPooling2DSparse, + QConv2DSparse, +) import hls4ml # noqa: E402 test_root_path = Path(__file__).parent -def _build_sparse_cnn(input_shape=(8, 8, 1), n_max_pixels=4, threshold=0.4): +def _build_sparse_cnn(input_shape=(8, 8, 1), n=4, threshold=0.4, pool='avg'): iq_conf = QuantizerConfig(place='datalane', q_type='kif', i0=4, f0=8, overflow_mode='WRAP') with ( QuantizerConfigScope(place='all', default_q_type='kbi', overflow_mode='SAT_SYM'), @@ -25,7 +30,7 @@ def _build_sparse_cnn(input_shape=(8, 8, 1), n_max_pixels=4, threshold=0.4): LayerConfigScope(enable_ebops=True, enable_iq=True, beta0=1e-5), ): x_in = keras.Input(shape=input_shape, name='x_in') - x, keep_mask = InputReduce(n_max_pixels=n_max_pixels, threshold=threshold, name='input_reduce')(x_in) + x, keep_mask = InputReduce(n=n, threshold=threshold, name='input_reduce')(x_in) x = QConv2DSparse( filters=2, kernel_size=3, @@ -35,7 +40,8 @@ def _build_sparse_cnn(input_shape=(8, 8, 1), n_max_pixels=4, threshold=0.4): activation='relu', iq_conf=iq_conf, )([x, keep_mask]) - x, keep_mask = AveragePooling2DSparse(2, name='pool')([x, keep_mask]) + pool_layer = MaxPooling2DSparse(2, name='pool') if pool == 'max' else AveragePooling2DSparse(2, name='pool') + x, keep_mask = pool_layer([x, keep_mask]) x = Flatten(name='flatten')(x) x = QDense(1, name='dense', iq_conf=iq_conf)(x) return keras.Model(x_in, x, name='cnn_sparse_test') @@ -50,18 +56,13 @@ def _make_sparse_inputs(n_samples, h=8, w=8, n_active_per_sample=4, threshold=0. return x -@pytest.mark.parametrize('backend', ['Vivado', 'Vitis']) -def test_sparse_cnn(test_case_id, backend): - np.random.seed(42) - keras.utils.set_random_seed(42) - - model = _build_sparse_cnn() - x = _make_sparse_inputs(n_samples=1000) - +def _convert_and_check(model, x, output_dir, backend, layer_overrides=None): y_keras = model.predict(x, verbose=0) - output_dir = test_root_path / test_case_id hls_config = hls4ml.utils.config_from_keras_model(model, granularity='name', backend=backend) + for name, overrides in (layer_overrides or {}).items(): + hls_config['LayerName'].setdefault(name, {}).update(overrides) + hls_model = hls4ml.converters.convert_from_keras_model( model, hls_config=hls_config, @@ -71,9 +72,41 @@ def test_sparse_cnn(test_case_id, backend): ) hls_model.compile() - y_hls = hls_model.predict(x).reshape(y_keras.shape) + # Guard the input-precision regression: bit_exact must propagate the downstream precision + # request back through the sparse layers to the model input. Otherwise x_in collapses to a + # degenerate type (e.g. ap_ufixed<1,0>) that clamps the real inputs to {0, 0.5}. + in_prec = hls_model.graph['x_in'].get_output_variable().type.precision + assert in_prec.width > 2, f'input precision collapsed to {in_prec}' + y_hls = hls_model.predict(x).reshape(y_keras.shape) mean_abs_diff = float(np.mean(np.abs(y_keras - y_hls))) - print(f'sparse-pixels {backend}: mean|diff|={mean_abs_diff:.4f}') + print(f'{output_dir.name}: mean|diff|={mean_abs_diff:.4f}') + assert mean_abs_diff < 0.05 - assert mean_abs_diff < 0.5 + +@pytest.mark.parametrize('backend', ['Vivado', 'Vitis']) +@pytest.mark.parametrize('pool', ['avg', 'max']) +def test_sparse_cnn(test_case_id, backend, pool): + np.random.seed(42) + keras.utils.set_random_seed(42) + + model = _build_sparse_cnn(pool=pool) + x = _make_sparse_inputs(n_samples=1000) + _convert_and_check(model, x, test_root_path / test_case_id, backend) + + +@pytest.mark.parametrize('backend', ['Vitis']) +def test_sparse_cnn_parallelization(test_case_id, backend): + # Partial parallelization and the streaming input reduce only change the unroll/implementation, + # so the numerical output must still match the fully-parallel/tree default. + np.random.seed(43) + keras.utils.set_random_seed(43) + + model = _build_sparse_cnn() + x = _make_sparse_inputs(n_samples=500) + overrides = { + 'input_reduce': {'Variant': 'stream'}, + 'conv': {'PixelParallelFactor': 2, 'FiltParallelFactor': 1}, + 'pool': {'PixelParallelFactor': 2, 'ChanParallelFactor': 1}, + } + _convert_and_check(model, x, test_root_path / test_case_id, backend, layer_overrides=overrides) From 31ce35996ec027f86374e85c67d2b027d4f21d3f Mon Sep 17 00:00:00 2001 From: Ho Fung Tsoi Date: Thu, 2 Jul 2026 23:13:22 -0400 Subject: [PATCH 3/3] fix version for pyproject-fmt --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e5bbf83ba5..b515e79465 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,7 +53,7 @@ optional-dependencies.qkeras = [ optional-dependencies.qkeras-v3 = [ "qkeras-v3" ] optional-dependencies.quartus-report = [ "calmjs-parse", "tabulate" ] optional-dependencies.snn = [ "snntorch", "torch" ] -optional-dependencies.sparsepixels = [ "sparsepixels>=0.3.0" ] +optional-dependencies.sparsepixels = [ "sparsepixels>=0.3" ] optional-dependencies.sr = [ "sympy>=1.13.1" ] optional-dependencies.testing = [ "calmjs-parse",