From 94d6331e4736f78615db1e9e77ea8b9db7595d83 Mon Sep 17 00:00:00 2001
From: Ho Fung Tsoi <hftsoi0411@gmail.com>
Date: Sat, 18 Apr 2026 17:31:08 -0400
Subject: [PATCH 1/3] sparsepixels patch

---
 hls4ml/backends/vivado/passes/sparsepixels.py | 400 ++++++++++++++++++
 hls4ml/backends/vivado/vivado_backend.py      |   2 +
 hls4ml/converters/keras_v3/__init__.py        |   1 +
 hls4ml/converters/keras_v3/sparsepixels.py    | 250 +++++++++++
 hls4ml/converters/keras_v3_to_hls.py          |   8 +
 hls4ml/model/layers.py                        |  92 ++++
 hls4ml/model/optimizer/passes/bit_exact.py    | 107 +++++
 .../vivado/nnet_utils/nnet_sparsepixels.h     | 254 +++++++++++
 pyproject.toml                                |   1 +
 test/pytest/test_sparsepixels.py              |  79 ++++
 10 files changed, 1194 insertions(+)
 create mode 100644 hls4ml/backends/vivado/passes/sparsepixels.py
 create mode 100644 hls4ml/converters/keras_v3/sparsepixels.py
 create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_sparsepixels.h
 create mode 100644 test/pytest/test_sparsepixels.py

diff --git a/hls4ml/backends/vivado/passes/sparsepixels.py b/hls4ml/backends/vivado/passes/sparsepixels.py
new file mode 100644
index 0000000000..0dd3329c10
--- /dev/null
+++ b/hls4ml/backends/vivado/passes/sparsepixels.py
@@ -0,0 +1,400 @@
+from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate
+from hls4ml.model.layers import (
+    Input,
+    Reshape,
+    SparseActivation,
+    SparseConv2D,
+    SparseFlatten,
+    SparseInputReduce,
+    SparsePooling2D,
+)
+from hls4ml.model.optimizer import OptimizerPass
+from hls4ml.model.optimizer.passes.hgq_proxy_model import FixedPointQuantizer
+
+sparsepixels_include = ['nnet_utils/nnet_sparsepixels.h']
+
+# Optimizer pass: trace hash vars & Flatten->SparseFlatten
+
+
+class SparseGraphOptimizer(OptimizerPass):
+    """Triggered by a SparseInputReduce that has no 'hash_out_name' yet. Walks the full graph once to wire hash
+    variable names into the sparse layers, track spatial dims, and replace sparse-fed Reshape by SparseFlatten."""
+
+    def match(self, node):
+        return isinstance(node, SparseInputReduce) and node.get_attr('hash_out_name', None) is None
+
+    def transform(self, model, node):
+        hash_map = {}  # layer name -> name of the hash variable that goes with that layer's output
+        spatial = {}  # layer name -> (height, width) tracked for that layer's output
+        changed = False  # set once a Reshape has been replaced; this is the value returned
+
+        for name, n in list(model.graph.items()):  # iterate over a copy: nodes may be replaced below
+            if isinstance(n, SparseInputReduce):
+                h_var = f'sparse_hash_{name}'
+                n.set_attr('hash_out_name', h_var)
+                hash_map[name] = h_var
+                spatial[name] = (n.get_attr('in_height'), n.get_attr('in_width'))
+
+            elif isinstance(n, SparseConv2D):
+                src = n.inputs[1] if len(n.inputs) > 1 else n.inputs[0]  # second input when there is one
+                h_var = hash_map.get(src, hash_map.get(n.inputs[0]))
+                n.set_attr('hash_in_name', h_var)
+                n.set_attr('hash_out_name', h_var)  # same hash variable in and out
+                hash_map[name] = h_var
+                spatial[name] = spatial.get(src, spatial.get(n.inputs[0]))
+
+            elif isinstance(n, FixedPointQuantizer):
+                src = n.inputs[0]
+                if src in hash_map:  # only quantizers fed by an already-tracked layer are recorded
+                    hash_map[name] = hash_map[src]
+                    spatial[name] = spatial.get(src)
+
+            elif isinstance(n, SparseActivation):
+                src = n.inputs[0]
+                h_var = hash_map.get(src)
+                hash_map[name] = h_var
+                spatial[name] = spatial.get(src)
+
+            elif isinstance(n, SparsePooling2D):
+                src = n.inputs[1] if len(n.inputs) > 1 else n.inputs[0]  # second input when there is one
+                h_in = hash_map.get(src, hash_map.get(n.inputs[0]))
+                h_out = f'sparse_hash_{name}'  # pooling gets a hash variable of its own
+                n.set_attr('hash_in_name', h_in)
+                n.set_attr('hash_out_name', h_out)
+                hash_map[name] = h_out
+                ps = n.get_attr('pool_size')
+                prev_h, prev_w = spatial.get(src, spatial.get(n.inputs[0], (0, 0)))
+                spatial[name] = (prev_h // ps, prev_w // ps)
+
+            elif isinstance(n, SparseFlatten):
+                src = n.inputs[0]
+                h_var = hash_map.get(src)
+                if h_var is not None:
+                    n.set_attr('hash_in_name', h_var)
+                    hash_map[name] = h_var
+                    spatial[name] = spatial.get(src, (1, 1))
+
+            elif isinstance(n, Reshape):
+                src = n.inputs[0]
+                if src in hash_map:
+                    src_node = model.graph[src]
+                    n_sparse = src_node.get_attr('n_sparse', None)
+                    if n_sparse is None:  # source has no 'n_sparse' attribute: leave this Reshape untouched
+                        continue
+                    # Channels of the source's *output*: 'n_filt' for a conv, whose 'n_chan' counts input channels.
+                    n_chan = src_node.get_attr('n_filt', None) or src_node.get_attr('n_chan', None)
+                    h_var = hash_map[src]
+                    sp = spatial.get(src, (1, 1))
+                    attrs = {
+                        'n_sparse': n_sparse,
+                        'n_chan': n_chan,
+                        'out_height': sp[0],
+                        'out_width': sp[1],
+                        'hash_in_name': h_var,
+                    }
+                    new_node = model.make_node('SparseFlatten', name, attrs, n.inputs.copy(), outputs=n.outputs.copy())
+                    model.replace_node(n, new_node)
+                    changed = True
+
+        return changed
+
+
+# Config templates: one C++ struct per layer; {index} and the other placeholders are filled with str.format().
+
+sparse_input_reduce_config = """struct config{index} {{
+    static const unsigned in_height = {in_height};
+    static const unsigned in_width = {in_width};
+    static const unsigned n_chan = {n_chan};
+    static const unsigned n_sparse = {n_sparse};
+    static const unsigned hash_bits = {hash_bits};
+}};\n"""
+
+sparse_conv2d_config = """struct config{index} {{
+    static const unsigned n_sparse = {n_sparse};
+    static const unsigned n_chan = {n_chan};
+    static const unsigned n_filt = {n_filt};
+    static const unsigned kernel_size = {kernel_size};
+    typedef {accum_t.name} accum_t;
+}};\n"""
+
+sparse_activation_config = """struct config{index} {{
+    static const unsigned n_sparse = {n_sparse};
+    static const unsigned n_chan = {n_chan};
+}};\n"""
+
+sparse_pooling2d_config = """struct config{index} {{
+    static const unsigned n_sparse = {n_sparse};
+    static const unsigned n_chan = {n_chan};
+    static const unsigned pool_size = {pool_size};
+    typedef {accum_t.name} accum_t;
+}};\n"""
+
+sparse_flatten_config = """struct config{index} {{
+    static const unsigned n_sparse = {n_sparse};
+    static const unsigned n_chan = {n_chan};
+    static const unsigned out_height = {out_height};
+    static const unsigned out_width = {out_width};
+}};\n"""
+
+
+class SparseInputReduceConfigTemplate(LayerConfigTemplate):  # config struct writer for SparseInputReduce
+    def __init__(self):
+        super().__init__(SparseInputReduce)  # layer class handed to LayerConfigTemplate
+        self.template = sparse_input_reduce_config
+
+    def format(self, node):  # returns the struct text filled from the node's default config params
+        return self.template.format(**self._default_config_params(node))
+
+
+class SparseConv2DConfigTemplate(LayerConfigTemplate):  # config struct writer for SparseConv2D
+    def __init__(self):
+        super().__init__(SparseConv2D)  # layer class handed to LayerConfigTemplate
+        self.template = sparse_conv2d_config
+
+    def format(self, node):  # returns the struct text filled from the node's default config params
+        return self.template.format(**self._default_config_params(node))
+
+
+class SparseActivationConfigTemplate(LayerConfigTemplate):  # config struct writer for SparseActivation
+    def __init__(self):
+        super().__init__(SparseActivation)  # layer class handed to LayerConfigTemplate
+        self.template = sparse_activation_config
+
+    def format(self, node):  # returns the struct text filled from the node's default config params
+        return self.template.format(**self._default_config_params(node))
+
+
+class SparsePooling2DConfigTemplate(LayerConfigTemplate):  # config struct writer for SparsePooling2D
+    def __init__(self):
+        super().__init__(SparsePooling2D)  # layer class handed to LayerConfigTemplate
+        self.template = sparse_pooling2d_config
+
+    def format(self, node):  # returns the struct text filled from the node's default config params
+        return self.template.format(**self._default_config_params(node))
+
+
+class SparseFlattenConfigTemplate(LayerConfigTemplate):  # config struct writer for SparseFlatten
+    def __init__(self):
+        super().__init__(SparseFlatten)  # layer class handed to LayerConfigTemplate
+        self.template = sparse_flatten_config
+
+    def format(self, node):  # returns the struct text filled from the node's default config params
+        return self.template.format(**self._default_config_params(node))
+
+
+# Function-call templates: C++ emitted per layer call; placeholders are filled with str.format().
+
+sparse_input_reduce_function = (
+    '{input_t} threshold_{index} = {threshold};\n'
+    'ap_uint<{hash_bits}> {hash_out}[{n_sparse} * 2];\n'
+    '#pragma HLS ARRAY_PARTITION variable={hash_out} complete dim=0\n'
+    'sparse_input_reduce<{input_t}, {output_t}, ap_uint<{hash_bits}>, {in_height}, {in_width}, {n_chan}, {n_sparse}>'
+    '({input}, threshold_{index}, {output}, {hash_out});'
+)
+
+sparse_conv2d_function = (
+    'sparse_conv<{input_t}, {output_t}, ap_uint<{hash_bits}>, {weight_t}, {bias_t}, {accum_t_name}, '
+    '{n_sparse}, {n_chan}, {n_filt}, {kernel_size}>'
+    '({input}, {output}, {hash_in}, {w}, {b});'
+)
+
+sparse_activation_function = 'sparse_relu<{input_t}, {output_t}, {n_sparse}, {n_chan}>({input}, {output});'
+
+sparse_pooling2d_function = (
+    'ap_uint<{hash_bits}> {hash_out}[{n_sparse} * 2];\n'
+    '#pragma HLS ARRAY_PARTITION variable={hash_out} complete dim=0\n'
+    'sparse_pooling_avg<{input_t}, {output_t}, ap_uint<{hash_bits}>, {accum_t_name}, {n_sparse}, {n_chan}, {pool_size}>'
+    '({input}, {output}, {hash_in}, {hash_out});'
+)
+
+sparse_flatten_function = (
+    'sparse_flatten<{input_t}, {output_t}, ap_uint<{hash_bits}>, {out_height}, {out_width}, {n_chan}, {n_sparse}>'
+    '({input}, {hash_in}, {output});'
+)
+
+
+def _get_hash_bits(node):
+    """Return 'hash_bits' from the nearest layer up the first-input chain, or 10 if none declares it."""
+    current = node
+    while current is not None:
+        bits = current.get_attr('hash_bits', None)
+        if bits is not None:
+            return bits
+        if not current.inputs:
+            break
+        current = current.model.graph.get(current.inputs[0])
+    return 10
+
+
+class SparseInputReduceFunctionTemplate(FunctionCallTemplate):
+    """Emit the sparse_input_reduce call together with its threshold constant and hash array."""
+
+    def __init__(self):
+        super().__init__(SparseInputReduce, include_header=sparsepixels_include)
+        self.template = sparse_input_reduce_function
+
+    def format(self, node):
+        """Fill the call template from the node's default function params and attributes."""
+        params = self._default_function_params(node)
+        # These template fields carry the same name as the layer attribute they come from.
+        for key in ('in_height', 'in_width', 'n_chan', 'n_sparse', 'hash_bits', 'threshold'):
+            params[key] = node.get_attr(key)
+        params['hash_out'] = node.get_attr('hash_out_name')
+        return self.template.format(**params)
+
+
+class SparseConv2DFunctionTemplate(FunctionCallTemplate):
+    """Emit the sparse_conv call for a SparseConv2D layer."""
+
+    def __init__(self):
+        super().__init__(SparseConv2D, include_header=sparsepixels_include)
+        self.template = sparse_conv2d_function
+
+    def format(self, node):
+        """Fill the call template from the node's attributes, weights and accumulator type."""
+        params = self._default_function_params(node)
+        for key in ('n_sparse', 'n_chan', 'n_filt', 'kernel_size'):
+            params[key] = node.get_attr(key)
+        params['hash_bits'] = _get_hash_bits(node)
+        params['hash_in'] = node.get_attr('hash_in_name')
+        # Variable names and type names of the weight and bias tensors.
+        weight, bias = node.get_weights('weight'), node.get_weights('bias')
+        params.update(w=weight.name, b=bias.name, weight_t=weight.type.name, bias_t=bias.type.name)
+        params['accum_t_name'] = node.get_attr('accum_t').name
+        return self.template.format(**params)
+
+
+class SparseActivationFunctionTemplate(FunctionCallTemplate):
+    def __init__(self):
+        super().__init__(SparseActivation, include_header=sparsepixels_include)
+        self.template = sparse_activation_function
+
+    def format(self, node):
+        """Fill the sparse_relu call template with the node's n_sparse and n_chan."""
+        params = self._default_function_params(node)
+        params.update(n_sparse=node.get_attr('n_sparse'), n_chan=node.get_attr('n_chan'))
+        return self.template.format(**params)
+
+
+class SparsePooling2DFunctionTemplate(FunctionCallTemplate):
+    def __init__(self):
+        super().__init__(SparsePooling2D, include_header=sparsepixels_include)
+        self.template = sparse_pooling2d_function
+
+    def format(self, node):
+        """Fill the sparse_pooling_avg template, including the declaration of the output hash array."""
+        params = self._default_function_params(node)
+        for key in ('n_sparse', 'n_chan', 'pool_size'):
+            params[key] = node.get_attr(key)
+        params['hash_bits'] = _get_hash_bits(node)
+        # Pooling reads one hash variable and writes another.
+        params.update(hash_in=node.get_attr('hash_in_name'), hash_out=node.get_attr('hash_out_name'))
+        params['accum_t_name'] = node.get_attr('accum_t').name
+        return self.template.format(**params)
+
+
+class SparseFlattenFunctionTemplate(FunctionCallTemplate):
+    def __init__(self):
+        super().__init__(SparseFlatten, include_header=sparsepixels_include)
+        self.template = sparse_flatten_function
+
+    def format(self, node):
+        """Fill the sparse_flatten call template from the node's attributes."""
+        params = self._default_function_params(node)
+        for key in ('n_sparse', 'n_chan', 'out_height', 'out_width'):
+            params[key] = node.get_attr(key)
+        params['hash_bits'] = _get_hash_bits(node)
+        # Name of the hash variable that accompanies the flattened input.
+        params['hash_in'] = node.get_attr('hash_in_name')
+        return self.template.format(**params)
+
+
+#  Optimizer pass: fix Input precision for sparse models
+
+
+class SparseFixInputPrecision(OptimizerPass):
+    """Re-derive the Input precision for models whose Input feeds a SparseInputReduce.
+
+    The standard FixInputPrecision cannot find FixedPointQuantizer nodes through
+    sparse layers (Input -> SparseInputReduce -> FPQ), so it falls back to a
+    minimal type. This pass rebuilds the Input type from the output type of the
+    FPQ that consumes SparseInputReduce, then re-registers SparseInputReduce.
+    """
+
+    def match(self, node):
+        """Match Input nodes that are listed as an input of at least one SparseInputReduce."""
+        if not isinstance(node, Input):
+            return False
+        graph = node.model.graph
+        return any(isinstance(layer, SparseInputReduce) and node.name in layer.inputs for layer in graph.values())
+
+    def transform(self, model, node):
+        """Overwrite the Input's output type in place; every path returns False."""
+        from hls4ml.model.optimizer.passes.bit_exact import (
+            produce_kif,
+            register_precision,
+            to_hls4ml_fixed,
+        )
+
+        layers = model.graph.values()
+        # First SparseInputReduce fed by this Input, then the first FPQ fed by that layer.
+        sparse_reduce = next(
+            (x for x in layers if isinstance(x, SparseInputReduce) and node.name in x.inputs),
+            None,
+        )
+        if sparse_reduce is None:
+            return False
+
+        fpq = next(
+            (x for x in layers if isinstance(x, FixedPointQuantizer) and sparse_reduce.name in x.inputs),
+            None,
+        )
+        if fpq is None:
+            return False
+
+        # Use the FPQ's output type, which was correctly set by BitExact's
+        # register_precision using per-element max(k), max(i), max(f).
+        # _produce_kif(fpq) is deliberately NOT called here: it would re-clip
+        # against the currently-wrong Input precision (set to ap_ufixed<1,0>
+        # by the standard FixInputPrecision which can't recurse through sparse layers).
+        fpq_prec = fpq.get_output_variable().type.precision
+        signed_bit = 1 if fpq_prec.signed else 0
+        int_bits = fpq_prec.integer - signed_bit
+        frac_bits = fpq_prec.width - fpq_prec.integer
+
+        new_type = to_hls4ml_fixed(signed_bit, int_bits, frac_bits + 1, f'{node.name}_t')
+        saturating = hasattr(fpq, 'SAT') and fpq.SAT in ('SAT', 'SAT_SYM')
+        new_type.precision.saturation_mode = 'SAT' if saturating else 'WRAP'
+        out_var = node.get_output_variable()
+        out_var.type = new_type
+        node.model.config.layer_name_precision[node.name] = str(new_type)
+        node.attributes['trusted'] = True
+
+        # Re-run kif production and precision registration for SparseInputReduce, then
+        # remove its '_produce_kif' / '_request_kif' attributes if they are present.
+        produce_kif(sparse_reduce, force_reset=True)
+        register_precision(sparse_reduce)
+        for cached in ('_produce_kif', '_request_kif'):
+            if cached in sparse_reduce.attributes:
+                del sparse_reduce.attributes[cached]
+
+        return False
+
+
+#  Backend registration hook
+
+
+def register_sparsepixels(backend):
+    """Register the sparsepixels optimizer passes and the per-layer templates on ``backend``."""
+    backend.register_pass('sparse_graph_optimizer', SparseGraphOptimizer)
+    backend.register_pass('sparse_fix_input_precision', SparseFixInputPrecision)
+    templates = (
+        ('sparseinputreduce', SparseInputReduceConfigTemplate, SparseInputReduceFunctionTemplate),
+        ('sparseconv2d', SparseConv2DConfigTemplate, SparseConv2DFunctionTemplate),
+        ('sparseactivation', SparseActivationConfigTemplate, SparseActivationFunctionTemplate),
+        ('sparsepooling2d', SparsePooling2DConfigTemplate, SparsePooling2DFunctionTemplate),
+        ('sparseflatten', SparseFlattenConfigTemplate, SparseFlattenFunctionTemplate),
+    )
+    for prefix, config_cls, function_cls in templates:
+        backend.register_pass(f'{prefix}_config_template', config_cls)
+        backend.register_pass(f'{prefix}_function_template', function_cls)
diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py
index 879784465a..5014f6836f 100644
--- a/hls4ml/backends/vivado/vivado_backend.py
+++ b/hls4ml/backends/vivado/vivado_backend.py
@@ -163,6 +163,8 @@ def _register_flows(self):
         quantization_flow = register_flow('quantization', quantization_passes, requires=[init_flow], backend=self.name)
 
         optimization_passes = [
+            'vivado:sparse_graph_optimizer',
+            'vivado:sparse_fix_input_precision',
             'vivado:remove_final_reshape',
             'vivado:optimize_pointwise_conv',
             'vivado:inplace_parallel_reshape',
diff --git a/hls4ml/converters/keras_v3/__init__.py b/hls4ml/converters/keras_v3/__init__.py
index 21950aea6c..7208d06efd 100644
--- a/hls4ml/converters/keras_v3/__init__.py
+++ b/hls4ml/converters/keras_v3/__init__.py
@@ -6,6 +6,7 @@
     merge,  # noqa: F401
     pooling,  # noqa: F401
     recurrent,  # noqa: F401
+    sparsepixels,  # noqa: F401
 )
 from ._base import registry as layer_handlers
 
diff --git a/hls4ml/converters/keras_v3/sparsepixels.py b/hls4ml/converters/keras_v3/sparsepixels.py
new file mode 100644
index 0000000000..066e6a070e
--- /dev/null
+++ b/hls4ml/converters/keras_v3/sparsepixels.py
@@ -0,0 +1,250 @@
+import math
+import typing
+from collections.abc import Sequence
+from typing import Any
+
+import numpy as np
+
+from ._base import KerasV3LayerHandler
+
+if typing.TYPE_CHECKING:
+    import keras
+    from keras import KerasTensor
+
+_sparse_context: dict[str, Any] = {}
+
+
+def _mark_sparse_output(tensor_name: str, n_sparse: int, n_chan: int, height: int, width: int):
+    """Remember the sparse geometry of ``tensor_name`` so a later Flatten can be converted."""
+    info = {
+        'n_sparse': n_sparse,
+        'n_chan': n_chan,
+        'out_height': height,
+        'out_width': width,
+    }
+    _sparse_context.setdefault('sparse_output_tensors', {})[tensor_name] = info
+
+
+def _extract_sparse_iq_config(conv_layer, in_tensor_name: str, n_sparse: int, n_chan: int) -> dict[str, Any]:
+    """Build the FixedPointQuantizer layer config for a QConv2D input quantizer on a sparse tensor.
+
+    The quantizer's k/i/f arrays are reduced to per-channel values when they hold more than
+    ``n_chan`` entries, converted to (k, B, I) and, when per-channel, tiled ``n_sparse`` times.
+
+    Returns a config dict of class 'FixedPointQuantizer' that consumes ``in_tensor_name`` and
+    produces ``<in_tensor_name>_q``.
+    """
+    from keras import ops
+
+    quantizer = conv_layer._iq.quantizer
+    raw_k, raw_i, raw_f = quantizer.kif
+    kif = [np.ravel(ops.convert_to_numpy(part)).astype(np.int16) for part in (raw_k, raw_i, raw_f)]
+
+    # HGQ quantizers may be per-element (H*W*C); take the per-channel maximum of each
+    # component independently to get the envelope type.
+    if kif[0].size > n_chan:
+        kif = [np.max(part.reshape(-1, n_chan), axis=0) for part in kif]
+    k, i, f = kif
+
+    # Reconstruct KBI from KIF: B = k + i + f, I_bits = k + i
+    mask_kbi = [k, k + i + f, k + i]
+    if k.size > 1:
+        # Repeat the per-channel masks n_sparse times and give them a leading axis of length 1.
+        mask_kbi = [np.tile(part, n_sparse).reshape(1, -1) for part in mask_kbi]
+
+    overflow_mode: str = quantizer.overflow_mode
+    round_mode: str = quantizer.round_mode
+    # Drop a leading 'S_' from the rounding mode, if there is one.
+    round_mode = round_mode[2:] if round_mode.startswith('S_') else round_mode
+
+    return {
+        'name': conv_layer._iq.name,
+        'class_name': 'FixedPointQuantizer',
+        'mask_kbi': tuple(mask_kbi),
+        'SAT': overflow_mode,
+        'RND': round_mode,
+        'fusible': None,
+        'input_keras_tensor_names': [in_tensor_name],
+        'output_keras_tensor_names': [f'{in_tensor_name}_q'],
+        'overrides': {},
+    }
+
+
+def post_process_sparse_layer_list(layer_list: list[dict[str, Any]]) -> None:
+    """Rewrite, in place, every Reshape config whose first input is a recorded sparse tensor as SparseFlatten.
+
+    Called from keras_v3_to_hls after parsing. The sparse geometry recorded for the source
+    tensor is copied into the config and 'target_shape' is dropped.
+    """
+    sparse_outputs = _sparse_context.get('sparse_output_tensors', {})
+    if not sparse_outputs:
+        return
+
+    for conf in layer_list:
+        if conf.get('class_name') != 'Reshape':
+            continue
+        # Only the first input tensor decides whether this Reshape follows a sparse layer.
+        in_names = conf.get('input_keras_tensor_names', [])
+        info = sparse_outputs.get(in_names[0]) if in_names else None
+        if info is None:
+            continue
+        conf['class_name'] = 'SparseFlatten'
+        # Copy the geometry that was recorded for the source tensor.
+        for key in ('n_sparse', 'n_chan', 'out_height', 'out_width'):
+            conf[key] = info[key]
+        conf.pop('target_shape', None)
+
+
+class InputReduceHandler(KerasV3LayerHandler):
+    handles = ('sparsepixels.layers.InputReduce',)
+
+    def handle(
+        self,
+        layer: 'keras.Layer',
+        in_tensors: Sequence['KerasTensor'],
+        out_tensors: Sequence['KerasTensor'],
+    ):
+        """Translate an InputReduce layer into a SparseInputReduce config and reset the sparse context."""
+        raw_h, raw_w, raw_c = in_tensors[0].shape[1:]  # type: ignore
+        n_sparse = layer.n_max_pixels
+        threshold = 0.0 if layer.threshold is None else float(layer.threshold)
+
+        # Clear any stale state from a previous conversion in the same Python process
+        _sparse_context.clear()
+        _sparse_context['n_sparse'] = n_sparse
+        height, width = int(raw_h), int(raw_w)
+        _sparse_context['spatial'] = (height, width)
+
+        channels = int(raw_c)
+        for tensor in out_tensors:
+            _mark_sparse_output(tensor.name, n_sparse, channels, height, width)
+
+        # Hash stores 1-based H and W coordinates separately (see nnet_sparsepixels.h::sparse_input_reduce).
+        # Spatial dims only shrink through the network (pooling), so input H/W bound the required bits.
+        hash_bits = max(1, math.ceil(math.log2(max(height, width) + 1)))
+
+        return {
+            'class_name': 'SparseInputReduce',
+            'in_height': height,
+            'in_width': width,
+            'n_chan': channels,
+            'n_sparse': n_sparse,
+            'threshold': threshold,
+            'hash_bits': hash_bits,
+        }
+
+
+class QConv2DSparseHandler(KerasV3LayerHandler):
+    handles = ('sparsepixels.layers.QConv2DSparse',)
+
+    def handle(
+        self,
+        layer: 'keras.Layer',
+        in_tensors: Sequence['KerasTensor'],
+        out_tensors: Sequence['KerasTensor'],
+    ):
+        """Translate a QConv2DSparse layer into hls4ml layer configs.
+
+        Returns a tuple holding, in order: an optional FixedPointQuantizer config for the
+        conv's input quantizer, the SparseConv2D config, and an optional SparseActivation
+        config when the layer has a non-linear activation.
+        """
+        import keras
+        from keras import ops
+
+        conv = layer.conv
+        n_chan = int(conv.kernel.shape[2])
+        n_filt = int(conv.filters)
+        kernel_size = int(conv.kernel_size[0])
+        # Set by InputReduceHandler earlier in the same conversion; 0 when nothing was recorded.
+        n_sparse = _sparse_context.get('n_sparse', 0)
+
+        # Prefer the 'qkernel' attribute when the wrapped conv exposes one.
+        kernel = conv.qkernel if hasattr(conv, 'qkernel') else conv.kernel
+        weight_data = ops.convert_to_numpy(kernel)
+
+        # Bias is optional; it is passed through layer._bq when the layer has that attribute.
+        bias_data = None
+        if layer._use_bias and hasattr(layer, 'sparse_bias'):
+            bias = layer._bq(layer.sparse_bias) if hasattr(layer, '_bq') else layer.sparse_bias
+            bias_data = ops.convert_to_numpy(bias)
+
+        name = layer.name
+        in_tensor_names = [t.name for t in in_tensors]
+        out_tensor_names = [t.name for t in out_tensors]
+
+        results: list[dict[str, Any]] = []
+        if hasattr(conv, '_iq') and hasattr(conv, '_enable_iq') and conv._enable_iq:
+            # The input quantizer becomes its own layer writing '<input>_q', which the conv then reads.
+            results.append(_extract_sparse_iq_config(conv, in_tensors[0].name, n_sparse, n_chan))
+            in_tensor_names = [f'{in_tensors[0].name}_q']
+
+        config: dict[str, Any] = {
+            'class_name': 'SparseConv2D',
+            'name': name,
+            'n_sparse': n_sparse,
+            'n_chan': n_chan,
+            'n_filt': n_filt,
+            'kernel_size': kernel_size,
+            'weight_data': weight_data,
+            'bias_data': bias_data,
+            'input_keras_tensor_names': in_tensor_names,
+            'output_keras_tensor_names': out_tensor_names,
+        }
+        emitted = [config]
+
+        activation = layer._activation
+        spatial = _sparse_context.get('spatial', (1, 1))
+        if activation not in (None, keras.activations.linear):
+            act_name = activation.__name__
+            intermediate = f'{out_tensors[0].name}_sparse_act'
+            # Route the conv through an intermediate tensor; the activation produces the real outputs.
+            config['output_keras_tensor_names'] = [intermediate]
+            emitted.append(
+                {
+                    'class_name': 'SparseActivation',
+                    'name': f'{name}_{act_name}',
+                    'activation': act_name,
+                    'n_sparse': n_sparse,
+                    'n_chan': n_filt,
+                    'input_keras_tensor_names': [intermediate],
+                    'output_keras_tensor_names': out_tensor_names,
+                }
+            )
+
+        # With or without an activation, the layer's outputs are recorded with n_filt channels.
+        for t_name in out_tensor_names:
+            _mark_sparse_output(t_name, n_sparse, n_filt, spatial[0], spatial[1])
+        results.extend(emitted)
+        return tuple(results)
+
+
+class AveragePooling2DSparseHandler(KerasV3LayerHandler):
+    handles = ('sparsepixels.layers.AveragePooling2DSparse',)
+
+    def handle(
+        self,
+        layer: 'keras.Layer',
+        in_tensors: Sequence['KerasTensor'],
+        out_tensors: Sequence['KerasTensor'],
+    ):
+        """Translate AveragePooling2DSparse into a SparsePooling2D config and shrink the tracked spatial size."""
+        pool_size = int(layer.avg_pool.pool_size[0])
+        # Channel count is the last entry of the first input's shape once the leading dim is dropped.
+        n_chan = int(in_tensors[0].shape[1:][-1])  # type: ignore
+        n_sparse = _sparse_context.get('n_sparse', 0)
+
+        # Integer-divide the tracked (height, width) by the pool size and store it for later layers.
+        new_h, new_w = (dim // pool_size for dim in _sparse_context.get('spatial', (1, 1)))
+        _sparse_context['spatial'] = (new_h, new_w)
+
+        # Record the pooled geometry for every output tensor so a following Flatten can be converted.
+        for tensor in out_tensors:
+            _mark_sparse_output(tensor.name, n_sparse, n_chan, new_h, new_w)
+
+        return {
+            'class_name': 'SparsePooling2D',
+            'n_sparse': n_sparse,
+            'n_chan': n_chan,
+            'pool_size': pool_size,
+        }
diff --git a/hls4ml/converters/keras_v3_to_hls.py b/hls4ml/converters/keras_v3_to_hls.py
index 359bc391d6..697a6dfdce 100644
--- a/hls4ml/converters/keras_v3_to_hls.py
+++ b/hls4ml/converters/keras_v3_to_hls.py
@@ -352,6 +352,14 @@ def parse_keras_v3_model(model: 'keras.Model', allow_da_fallback=True, allow_v2_
             # If no layer was added in the loop, then there is a circular dependency
             raise ValueError('Circular dependency detected')
 
+    # Post-process: convert Flatten following sparse layers to SparseFlatten
+    try:
+        from hls4ml.converters.keras_v3.sparsepixels import post_process_sparse_layer_list
+
+        post_process_sparse_layer_list(layer_list)
+    except ImportError:
+        pass
+
     # Mark inputs[inp layer name] for ModelGraph to parse from i/o keras tensor names
     provides: dict[str, str] = {}  # tensor_name -> src_layer_name
     for conf in layer_list:
diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py
index 8bd8cd8a11..23b58beae8 100644
--- a/hls4ml/model/layers.py
+++ b/hls4ml/model/layers.py
@@ -1782,6 +1782,92 @@ def initialize(self):
         self.add_output_variable(shape)
 
 
+class SparseInputReduce(Layer):
+    _expected_attributes = [
+        Attribute('in_height'),
+        Attribute('in_width'),
+        Attribute('n_chan'),
+        Attribute('n_sparse'),
+        Attribute('threshold', value_type=float),
+        Attribute('hash_bits', value_type=int, default=10),
+    ]
+
+    def initialize(self):
+        """Declare a flat output of n_sparse * n_chan elements."""
+        self.add_output_variable([self.attributes['n_sparse'] * self.attributes['n_chan']])
+
+
+class SparseConv2D(Layer):
+    _expected_attributes = [
+        Attribute('n_sparse'),
+        Attribute('n_chan'),
+        Attribute('n_filt'),
+        Attribute('kernel_size'),
+        WeightAttribute('weight'),
+        WeightAttribute('bias'),
+        TypeAttribute('weight'),
+        TypeAttribute('bias'),
+        TypeAttribute('accum'),
+    ]
+
+    def initialize(self):
+        """Declare a flat output of n_sparse * n_filt elements, then the weight and bias variables."""
+        self.add_output_variable([self.attributes['n_sparse'] * self.attributes['n_filt']])
+        self.add_weights(quantizer=self.get_attr('weight_quantizer'))
+        self.add_bias(quantizer=self.get_attr('bias_quantizer'))
+
+    def add_bias(self, quantizer=None):
+        """Add the bias variable; without 'bias_data' use zeros typed as a 1-bit unsigned integer."""
+        data = self.get_attr('bias_data', None)
+        precision = type_name = None
+        if data is None:
+            data = np.zeros(self.attributes['n_filt'])
+            precision = IntegerPrecisionType(width=1, signed=False)
+            type_name = 'bias{index}_t'
+            quantizer = None
+        self.add_weights_variable(
+            name='bias', var_name='b{index}', type_name=type_name, precision=precision, data=data, quantizer=quantizer
+        )
+
+
+class SparseActivation(Layer):
+    _expected_attributes = [
+        Attribute('n_sparse'),
+        Attribute('n_chan'),
+        Attribute('activation', value_type=str),
+    ]
+
+    def initialize(self):
+        """Declare a flat output of n_sparse * n_chan elements."""
+        self.add_output_variable([self.attributes['n_sparse'] * self.attributes['n_chan']])
+
+
+class SparsePooling2D(Layer):
+    _expected_attributes = [
+        Attribute('n_sparse'),
+        Attribute('n_chan'),
+        Attribute('pool_size'),
+        TypeAttribute('accum'),
+    ]
+
+    def initialize(self):
+        """Declare a flat output of n_sparse * n_chan elements."""
+        self.add_output_variable([self.attributes['n_sparse'] * self.attributes['n_chan']])
+
+
+class SparseFlatten(Layer):
+    _expected_attributes = [
+        Attribute('n_sparse'),
+        Attribute('n_chan'),
+        Attribute('out_height'),
+        Attribute('out_width'),
+    ]
+
+    def initialize(self):
+        height, width, n_chan = (self.attributes[key] for key in ('out_height', 'out_width', 'n_chan'))
+        self.add_output_variable([height * width * n_chan])  # flat output of out_height * out_width * n_chan
+
+
 layer_map = {
     'Input': Input,
     'InputLayer': Input,
@@ -1860,6 +1946,12 @@ def initialize(self):
     # TensorFlow-specific layers:
     'BiasAdd': BiasAdd,
     'DACombinational': DACombinational,
+    # Sparsepixels layers:
+    'SparseInputReduce': SparseInputReduce,
+    'SparseConv2D': SparseConv2D,
+    'SparseActivation': SparseActivation,
+    'SparsePooling2D': SparsePooling2D,
+    'SparseFlatten': SparseFlatten,
 }
 
 
diff --git a/hls4ml/model/optimizer/passes/bit_exact.py b/hls4ml/model/optimizer/passes/bit_exact.py
index 88dc65c806..014d23e78e 100644
--- a/hls4ml/model/optimizer/passes/bit_exact.py
+++ b/hls4ml/model/optimizer/passes/bit_exact.py
@@ -34,6 +34,11 @@
     Pooling2D,
     Reshape,
     Softmax,
+    SparseActivation,
+    SparseConv2D,
+    SparseFlatten,
+    SparseInputReduce,
+    SparsePooling2D,
     Transpose,
 )
 from hls4ml.model.optimizer import ModelOptimizerPass, OptimizerPass
@@ -197,6 +202,24 @@ def _(layer: Transpose):
     return ((k, i, f),)
 
 
+@_request_kif.register
+def _(layer: SparsePooling2D):
+    """SparsePooling2D has two inputs: features (idx=0) and hash (idx=1).
+    The hash input is an integer side-channel and must not widen the upstream's precision.
+    Return minimum values for the hash input so np.maximum in requested_kif does not
+    override the narrow request from the hash-producer's other downstream consumers (e.g. a FPQ)."""
+    # Default: max precision for the feature input (same as no dispatch)
+    feat_shape = get_input_shapes(layer)[0]
+    feat_kif = _maximum_kif_at_shape(feat_shape)
+    if len(get_input_shapes(layer)) > 1:
+        hash_shape = get_input_shapes(layer)[1]
+        k2 = np.zeros(hash_shape, dtype=np.int16)
+        i2 = np.full(hash_shape, -127, dtype=np.int16)
+        f2 = np.full(hash_shape, -127, dtype=np.int16)
+        return (feat_kif, (k2, i2, f2))
+    return (feat_kif,)
+
+
 @_request_kif.register
 def _(layer: DACombinational):
     comb = layer.attributes['da_comb_trace']
@@ -677,6 +700,88 @@ def _(layer: Embedding):
     return k, i, f
 
 
+@_produce_kif.register
+def _(layer: SparseInputReduce):
+    k_in, i_in, f_in = get_input_kifs(layer)[0]
+    n_chan = layer.attributes['n_chan']
+    n_sparse = layer.attributes['n_sparse']
+    k_ch = np.max(k_in.reshape(-1, n_chan), axis=0)
+    i_ch = np.max(i_in.reshape(-1, n_chan), axis=0)
+    f_ch = np.max(f_in.reshape(-1, n_chan), axis=0)
+    return np.tile(k_ch, n_sparse), np.tile(i_ch, n_sparse), np.tile(f_ch, n_sparse)
+
+
+@_produce_kif.register
+def _(layer: SparseConv2D):
+    kernel = layer.attributes['weight'].data
+    _bias = layer.attributes['bias']
+    bias = _bias.data if _bias is not None else 0
+    k_in, i_in, f_in = get_input_kifs(layer)[0]
+
+    n_sparse = layer.attributes['n_sparse']
+    n_chan = layer.attributes['n_chan']
+    n_filt = layer.attributes['n_filt']
+    ks = layer.attributes['kernel_size']
+
+    # Match standard Conv2D precision: each output pixel accumulates ks*ks*n_chan
+    # MAC terms (the kernel window), same as dense conv. The sparse loop iterates
+    # n_sparse input pixels, but only those within the kernel radius contribute;
+    # the rest add 0. So the worst-case accumulation depth is ks*ks*n_chan, not n_sparse.
+    k_ch = np.tile(k_in[:n_chan], ks * ks)
+    i_ch = np.tile(i_in[:n_chan], ks * ks)
+    f_ch = np.tile(f_in[:n_chan], ks * ks)
+    qint_in = QIntervalArray.from_kif(k_ch, i_ch, f_ch)
+
+    kernel_flat = kernel.reshape(-1, n_filt)  # (ks*ks*n_chan, n_filt)
+    qint_out = qint_in @ kernel_flat
+    qint_out = qint_out + bias
+    k, i, f = qint_out.to_kif()
+    return (
+        np.tile(k, n_sparse).astype(np.int16),
+        np.tile(i, n_sparse).astype(np.int16),
+        np.tile(f, n_sparse).astype(np.int16),
+    )
+
+
+@_produce_kif.register
+def _(layer: SparseActivation):
+    k_in, i_in, f_in = get_input_kifs(layer)[0]
+    act = layer.attributes.get('activation', 'relu').lower()
+    if act == 'relu':
+        return np.zeros_like(k_in), i_in, f_in
+    return k_in, i_in, f_in
+
+
+@_produce_kif.register
+def _(layer: SparsePooling2D):
+    k_in, i_in, f_in = get_input_kifs(layer)[0]
+    # Average pooling divides by pool_size^2, adding fractional bits.
+    # Match standard Pooling2D: add ceil(log2(pool_size^2)) fractional bits.
+    pool_size = layer.attributes['pool_size']
+    n_chan = layer.attributes['n_chan']
+    extra_f = int(np.ceil(np.log2(pool_size * pool_size)))
+    k_ch = k_in[:n_chan]
+    i_ch = i_in[:n_chan]
+    f_ch = f_in[:n_chan] + extra_f
+    n_sparse = layer.attributes['n_sparse']
+    return (
+        np.tile(k_ch, n_sparse).astype(np.int16),
+        np.tile(i_ch, n_sparse).astype(np.int16),
+        np.tile(f_ch, n_sparse).astype(np.int16),
+    )
+
+
+@_produce_kif.register
+def _(layer: SparseFlatten):
+    k_in, i_in, f_in = get_input_kifs(layer)[0]
+    n_chan = layer.attributes['n_chan']
+    out_h = layer.attributes['out_height']
+    out_w = layer.attributes['out_width']
+    k_ch, i_ch, f_ch = k_in[:n_chan], i_in[:n_chan], f_in[:n_chan]
+    n_out = out_h * out_w
+    return np.tile(k_ch, n_out), np.tile(i_ch, n_out), np.tile(f_ch, n_out)
+
+
 def kif_arrs_to_ints(arr: tuple[np.ndarray, np.ndarray, np.ndarray]):
     return tuple(int(np.max(a)) for a in arr)
 
@@ -966,6 +1071,8 @@ def get_output_layers_and_quantizers(
         elif isinstance(_node, (Reshape, Transpose, Concatenate)):
             layers.append(_node)
             get_output_layers_and_quantizers(_node, layers, quantizers)
+        elif isinstance(_node, (SparseInputReduce, SparseConv2D, SparseActivation, SparsePooling2D, SparseFlatten)):
+            layers.append(_node)
         else:
             raise ValueError(f'Layer {node.name} ({node.class_name}) unexpected input layer chain.')
     return layers, quantizers
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sparsepixels.h b/hls4ml/templates/vivado/nnet_utils/nnet_sparsepixels.h
new file mode 100644
index 0000000000..41e5953f75
--- /dev/null
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_sparsepixels.h
@@ -0,0 +1,254 @@
+#ifndef NNET_SPARSEPIXELS_H_
+#define NNET_SPARSEPIXELS_H_
+
+#include "ap_fixed.h"
+#include "ap_int.h"
+
+constexpr int _sp_floorlog2(int x) { return (x < 2) ? 0 : 1 + _sp_floorlog2(x / 2); } // floor(log2(x)); 0 if x < 2
+constexpr int _sp_pow2(int x) { return x == 0 ? 1 : 2 * _sp_pow2(x - 1); }            // 2^x, for x >= 0
+// Bits needed to encode values 0..x-1, i.e. ceil(log2(x)) for x >= 2; never fewer than 1 bit
+constexpr int _sp_ceillog2(int x) { return (x <= 1) ? 1 : _sp_floorlog2(x - 1) + 1; }
+
+template <typename T, int INDEX_BITS> struct value_idx_pair {
+    T value;                   // value that Op_active compares against the threshold
+    ap_uint<INDEX_BITS> index; // unsigned index of that value, INDEX_BITS wide
+};
+
+template <class T, class t> class Op_active { // pairwise selection operator passed to find_active
+  public:
+    T operator()(T a, T b, t threshold) {
+        if (a.value > threshold) // a takes priority whenever it is above threshold
+            return a;
+        else if (b.value > threshold)
+            return b;
+        else { // neither candidate is above threshold: hand back a zeroed placeholder
+            T none;
+            none.value = 0;
+            none.index = 0;
+            return none;
+        }
+    }
+};
+
+template <class T, int N, class Op, class t> T find_active(T *x, Op op, t threshold) {
+    #pragma HLS INLINE
+    static constexpr int leftN = _sp_pow2(_sp_floorlog2(N - 1)) > 0 ? _sp_pow2(_sp_floorlog2(N - 1)) : 0;
+    static constexpr int rightN = N - leftN > 0 ? N - leftN : 0;
+
+    if (N == 1) {
+        return x[0];
+    }
+    if (N == 2) {
+        return op(x[0], x[1], threshold);
+    }
+    return op(find_active<T, leftN, Op, t>(x, op, threshold), find_active<T, rightN, Op, t>(x + leftN, op, threshold),
+              threshold);
+}
+
+template <class data_T, class res_T, class hash_T, int N_h, int N_w, int N_c, int N_sparse>
+void sparse_input_reduce(data_T input_arr[N_h * N_w * N_c], data_T threshold, res_T sparse_arr_feat[N_sparse * N_c],
+                         hash_T sparse_arr_hash[N_sparse * 2]) {
+
+    // Flat pixel index ranges over 0..N_h*N_w-1 -> auto-sized to minimum bits
+    static constexpr int IDX_BITS = _sp_ceillog2(N_h * N_w);
+    typedef value_idx_pair<data_T, IDX_BITS> pair_t;
+
+    pair_t pair_arr[N_h * N_w];
+    int j_h_arr[N_h * N_w];
+    int j_w_arr[N_h * N_w];
+    #pragma HLS ARRAY_PARTITION variable = j_h_arr type = complete dim = 0
+    #pragma HLS ARRAY_PARTITION variable = j_w_arr type = complete dim = 0
+    #pragma HLS ARRAY_PARTITION variable = pair_arr type = complete dim = 0
+
+DataPrepareLoop:
+    for (int j = 0; j < N_h * N_w; j++) {
+        #pragma HLS UNROLL
+        pair_arr[j].value = input_arr[N_c * j];
+        pair_arr[j].index = j;
+
+        int remainder = j % (N_h * N_w);
+        int j_h = remainder / N_w + 1;
+        int j_w = remainder % N_w + 1;
+
+        j_h_arr[j] = j_h;
+        j_w_arr[j] = j_w;
+    }
+
+    Op_active<pair_t, data_T> op_active;
+MaxPixelsLoop:
+    for (int i = 0; i < N_sparse; i++) {
+        #pragma HLS PIPELINE
+        pair_t pair = find_active<pair_t, N_h * N_w, Op_active<pair_t, data_T>, data_T>(pair_arr, op_active, threshold);
+        sparse_arr_feat[N_c * i] = (res_T)pair.value;
+        for (int j = 1; j < N_c; j++) {
+            #pragma HLS UNROLL
+            sparse_arr_feat[N_c * i + j] = (res_T)input_arr[N_c * pair.index + j];
+        }
+
+        sparse_arr_hash[2 * i] = j_h_arr[pair.index];
+        sparse_arr_hash[2 * i + 1] = j_w_arr[pair.index];
+
+        pair_arr[pair.index].value = 0;
+    }
+}
+
+template <class data_T, class accum_T, class w_T, int n_chan, int n_filt, int N_sparse, int ker_size>
+accum_T mult_for_sparse_conv_kernel(int offset_h, int offset_w, data_T sparse_arr_feat_in[n_chan * N_sparse],
+                                    w_T filt_w[ker_size * ker_size * n_chan * n_filt], int i_filt, int i_pixel_in) {
+    #pragma HLS INLINE
+    constexpr int R = (ker_size - 1) / 2;
+    if ((unsigned)(offset_h + R) >= ker_size || (unsigned)(offset_w + R) >= ker_size) {
+        return (accum_T)0;
+    }
+    ap_uint<4> row = R - offset_h;
+    ap_uint<4> col = R - offset_w;
+    ap_uint<7> pos = row * ker_size + col;
+
+    accum_T acc = 0;
+MultLoopPerFilter:
+    for (int i_chan = 0; i_chan < n_chan; i_chan++) {
+        #pragma HLS UNROLL
+        int w_idx = n_filt * n_chan * pos + n_filt * i_chan + i_filt;
+        acc += filt_w[w_idx] * sparse_arr_feat_in[n_chan * i_pixel_in + i_chan];
+    }
+    return acc;
+}
+
+template <class data_T, class res_T, class hash_T, class w_T, class b_T, class accum_T, int N_sparse, int n_chan, int n_filt,
+          int ker_size>
+void sparse_conv(data_T sparse_arr_feat_in[N_sparse * n_chan], res_T sparse_arr_feat_out[N_sparse * n_filt],
+                 hash_T sparse_arr_hash[N_sparse * 2], w_T w[ker_size * ker_size * n_chan * n_filt], b_T b[n_filt]) {
+
+OutputPixelLoop:
+    for (int i_pixel_out = 0; i_pixel_out < N_sparse; i_pixel_out++) {
+        #pragma HLS UNROLL
+
+        bool nonzero = false;
+        for (int i_chan = 0; i_chan < n_chan; i_chan++) {
+            #pragma HLS UNROLL
+            nonzero |= (sparse_arr_feat_in[i_pixel_out * n_chan + i_chan] != (data_T)0);
+        }
+
+    OutputFilterLoop:
+        for (int i_filt = 0; i_filt < n_filt; i_filt++) {
+            #pragma HLS UNROLL
+            accum_T acc = 0;
+
+        InputPixelLoop:
+            for (int i_pixel_in = 0; i_pixel_in < N_sparse; i_pixel_in++) {
+                #pragma HLS UNROLL
+                int offset_h = sparse_arr_hash[2 * i_pixel_out] - sparse_arr_hash[2 * i_pixel_in];
+                int offset_w = sparse_arr_hash[2 * i_pixel_out + 1] - sparse_arr_hash[2 * i_pixel_in + 1];
+
+                acc += mult_for_sparse_conv_kernel<data_T, accum_T, w_T, n_chan, n_filt, N_sparse, ker_size>(
+                    offset_h, offset_w, sparse_arr_feat_in, w, i_filt, i_pixel_in);
+            }
+
+            if (acc != 0) {
+                acc += b[i_filt];
+            }
+            if (nonzero == false) {
+                acc = 0;
+            }
+            sparse_arr_feat_out[n_filt * i_pixel_out + i_filt] = (res_T)acc;
+        }
+    }
+}
+
+template <class data_T, class res_T, int N_sparse, int n_chan>
+void sparse_relu(data_T sparse_arr_feat_in[N_sparse * n_chan], res_T sparse_arr_feat_out[N_sparse * n_chan]) {
+    #pragma HLS PIPELINE
+    data_T data;
+    for (int i = 0; i < N_sparse * n_chan; i++) {
+        data = sparse_arr_feat_in[i];
+        if (data > 0) {
+            sparse_arr_feat_out[i] = data;
+        } else {
+            sparse_arr_feat_out[i] = 0;
+        }
+    }
+}
+
+template <class data_T, class res_T, class hash_T, class accum_T, int N_sparse, int n_chan, int pool_size>
+void sparse_pooling_avg(data_T sparse_arr_feat_in[N_sparse * n_chan], res_T sparse_arr_feat_out[N_sparse * n_chan],
+                        hash_T sparse_arr_hash_in[N_sparse * 2], hash_T sparse_arr_hash_out[N_sparse * 2]) {
+
+    constexpr double _pool_size_recip_d = 1.0 / double(pool_size);
+    const ap_fixed<10, 0> pool_size_recip = _pool_size_recip_d;
+
+    int hash_tmp[N_sparse * 2];
+#pragma HLS ARRAY_PARTITION variable = hash_tmp type = complete dim = 0
+ComputePooledLoc:
+    for (int i = 0; i < N_sparse; i++) {
+        #pragma HLS UNROLL
+        hash_tmp[2 * i] = (sparse_arr_hash_in[2 * i] - 1) / pool_size + 1;
+        hash_tmp[2 * i + 1] = (sparse_arr_hash_in[2 * i + 1] - 1) / pool_size + 1;
+    }
+
+    data_T sparse_arr_feat_in_copy[N_sparse * n_chan];
+    #pragma HLS ARRAY_PARTITION variable = sparse_arr_feat_in_copy type = complete dim = 0
+    for (int i = 0; i < N_sparse * n_chan; i++) {
+        #pragma HLS UNROLL
+        sparse_arr_feat_in_copy[i] = sparse_arr_feat_in[i];
+    }
+
+HashOutLoop:
+    for (int i_pixel = 0; i_pixel < N_sparse; i_pixel++) {
+        #pragma HLS UNROLL
+        int h_out = hash_tmp[2 * i_pixel];
+        int w_out = hash_tmp[2 * i_pixel + 1];
+
+    ChannelLoop:
+        for (int i_chan = 0; i_chan < n_chan; i_chan++) {
+            #pragma HLS UNROLL
+            accum_T acc = 0;
+
+        HashInLoop:
+            for (int j_pixel = 0; j_pixel < N_sparse; j_pixel++) {
+                #pragma HLS UNROLL
+                int h_in = hash_tmp[2 * j_pixel];
+                int w_in = hash_tmp[2 * j_pixel + 1];
+
+                data_T data = sparse_arr_feat_in_copy[n_chan * j_pixel + i_chan];
+                if ((h_out == h_in) && (w_out == w_in)) {
+                    acc += data;
+                    sparse_arr_feat_in_copy[n_chan * j_pixel + i_chan] = 0;
+                }
+            }
+            sparse_arr_feat_out[n_chan * i_pixel + i_chan] = (res_T)(acc * pool_size_recip * pool_size_recip);
+        }
+        sparse_arr_hash_out[2 * i_pixel] = h_out;
+        sparse_arr_hash_out[2 * i_pixel + 1] = w_out;
+    }
+}
+
+template <class data_T, class res_T, class hash_T, int n_height, int n_width, int n_chan, int N_sparse>
+void sparse_flatten(data_T sparse_arr_feat[N_sparse * n_chan], hash_T sparse_arr_hash[N_sparse * 2],
+                    res_T flat_arr[n_height * n_width * n_chan]) {
+
+InitFlatArr:
+    for (int i = 0; i < n_height * n_width * n_chan; i++) {
+        #pragma HLS UNROLL
+        flat_arr[i] = 0;
+    }
+
+FillFlatArr:
+    for (int i = 0; i < N_sparse; i++) {
+        #pragma HLS UNROLL factor = 4
+        int i_h = sparse_arr_hash[2 * i];
+        int i_w = sparse_arr_hash[2 * i + 1];
+        int pixel_idx = (i_h - 1) * n_width + (i_w - 1);
+
+    ChannelLoop:
+        for (int i_chan = 0; i_chan < n_chan; i_chan++) {
+            #pragma HLS UNROLL
+            data_T data = sparse_arr_feat[n_chan * i + i_chan];
+
+            if (data != 0) {
+                flat_arr[n_chan * pixel_idx + i_chan] = (res_T)data;
+            }
+        }
+    }
+}
+
+#endif // NNET_SPARSEPIXELS_H_
diff --git a/pyproject.toml b/pyproject.toml
index a3ccc2e529..ac7d53276f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,6 +50,7 @@ optional-dependencies.qkeras = [
   "tensorflow-model-optimization<=0.7.5",
 ]
 optional-dependencies.quartus-report = [ "calmjs-parse", "tabulate" ]
+optional-dependencies.sparsepixels = [ "sparsepixels>=0.2.2" ]
 optional-dependencies.sr = [ "sympy>=1.13.1" ]
 optional-dependencies.testing = [
   "calmjs-parse",
diff --git a/test/pytest/test_sparsepixels.py b/test/pytest/test_sparsepixels.py
new file mode 100644
index 0000000000..aaf92c4b85
--- /dev/null
+++ b/test/pytest/test_sparsepixels.py
@@ -0,0 +1,79 @@
+from pathlib import Path
+
+import keras
+import numpy as np
+import pytest
+
+sparsepixels = pytest.importorskip('sparsepixels')
+
+from hgq.config import LayerConfigScope, QuantizerConfigScope  # noqa: E402
+from hgq.layers import QDense  # noqa: E402
+from hgq.quantizer.config import QuantizerConfig  # noqa: E402
+from keras.layers import Flatten  # noqa: E402
+from sparsepixels.layers import AveragePooling2DSparse, InputReduce, QConv2DSparse  # noqa: E402
+
+import hls4ml  # noqa: E402
+
+test_root_path = Path(__file__).parent
+
+
+def _build_sparse_cnn(input_shape=(8, 8, 1), n_max_pixels=4, threshold=0.4):
+    iq_conf = QuantizerConfig(place='datalane', q_type='kif', i0=4, f0=8, overflow_mode='WRAP')
+    with (
+        QuantizerConfigScope(place='all', default_q_type='kbi', overflow_mode='SAT_SYM'),
+        QuantizerConfigScope(place='datalane', default_q_type='kif', overflow_mode='WRAP'),
+        LayerConfigScope(enable_ebops=True, enable_iq=True, beta0=1e-5),
+    ):
+        x_in = keras.Input(shape=input_shape, name='x_in')
+        x, keep_mask = InputReduce(n_max_pixels=n_max_pixels, threshold=threshold, name='input_reduce')(x_in)
+        x = QConv2DSparse(
+            filters=2,
+            kernel_size=3,
+            name='conv',
+            padding='same',
+            strides=1,
+            activation='relu',
+            iq_conf=iq_conf,
+        )([x, keep_mask])
+        x, keep_mask = AveragePooling2DSparse(2, name='pool')([x, keep_mask])
+        x = Flatten(name='flatten')(x)
+        x = QDense(1, name='dense', iq_conf=iq_conf)(x)
+    return keras.Model(x_in, x, name='cnn_sparse_test')
+
+
+def _make_sparse_inputs(n_samples, h=8, w=8, n_active_per_sample=4, threshold=0.4):
+    x = np.zeros((n_samples, h, w, 1), dtype=np.float32)
+    for i in range(n_samples):
+        active_idx = np.random.choice(h * w, size=n_active_per_sample, replace=False)
+        for idx in active_idx:
+            x[i, idx // w, idx % w, 0] = threshold + 0.1 + np.random.rand() * 0.5
+    return x
+
+
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis'])
+def test_sparse_cnn(test_case_id, backend):
+    np.random.seed(42)
+    keras.utils.set_random_seed(42)
+
+    model = _build_sparse_cnn()
+    x = _make_sparse_inputs(n_samples=1000)
+
+    y_keras = model.predict(x, verbose=0)
+
+    output_dir = test_root_path / test_case_id
+    hls_config = hls4ml.utils.config_from_keras_model(model, granularity='name', backend=backend)
+    hls_model = hls4ml.converters.convert_from_keras_model(
+        model,
+        hls_config=hls_config,
+        output_dir=str(output_dir),
+        backend=backend,
+        io_type='io_parallel',
+    )
+    hls_model.compile()
+
+    y_hls = hls_model.predict(x).reshape(y_keras.shape)
+
+    mean_abs_diff = float(np.mean(np.abs(y_keras - y_hls)))
+    print(f'sparse-pixels {backend}: mean|diff|={mean_abs_diff:.4f}')
+
+    assert mean_abs_diff < 0.5

From fe1ca2b33d83f42298fec3916e71791c30bd0757 Mon Sep 17 00:00:00 2001
From: Ho Fung Tsoi <hftsoi0411@gmail.com>
Date: Thu, 2 Jul 2026 22:55:47 -0400
Subject: [PATCH 2/3] upgrade: parallelization knobs for sparse layers;
 streaming option for input reduction; add maxpooling

---
 hls4ml/backends/vivado/passes/sparsepixels.py |  51 ++++-
 hls4ml/converters/keras_v3/sparsepixels.py    |  51 +++--
 hls4ml/model/layers.py                        |   1 +
 hls4ml/model/optimizer/passes/bit_exact.py    |  48 ++++-
 .../vivado/nnet_utils/nnet_sparsepixels.h     | 190 ++++++++++++++----
 test/pytest/test_sparsepixels.py              |  65 ++++--
 6 files changed, 326 insertions(+), 80 deletions(-)

diff --git a/hls4ml/backends/vivado/passes/sparsepixels.py b/hls4ml/backends/vivado/passes/sparsepixels.py
index 0dd3329c10..6b8a065e22 100644
--- a/hls4ml/backends/vivado/passes/sparsepixels.py
+++ b/hls4ml/backends/vivado/passes/sparsepixels.py
@@ -96,6 +96,15 @@ def transform(self, model, node):
                     model.replace_node(n, new_node)
                     changed = True
 
+            else:
+                # Passthrough nodes inside the sparse region (e.g. an input quantizer that
+                # bit_exact has rendered as a linear activation) preserve the sparse pixel
+                # layout, so carry the hash variable and spatial dims to the next sparse layer.
+                if n.inputs and n.inputs[0] in hash_map:
+                    src = n.inputs[0]
+                    hash_map[name] = hash_map[src]
+                    spatial[name] = spatial.get(src)
+
         return changed
 
 
@@ -184,31 +193,42 @@ def format(self, node):
 
 #  Function-call templates
 
+# Input reduce: {fn} selects the tree (default) or the streaming implementation.
 sparse_input_reduce_function = (
     '{input_t} threshold_{index} = {threshold};\n'
     'ap_uint<{hash_bits}> {hash_out}[{n_sparse} * 2];\n'
     '#pragma HLS ARRAY_PARTITION variable={hash_out} complete dim=0\n'
-    'sparse_input_reduce<{input_t}, {output_t}, ap_uint<{hash_bits}>, {in_height}, {in_width}, {n_chan}, {n_sparse}>'
+    '{fn}<{input_t}, {output_t}, ap_uint<{hash_bits}>, {in_height}, {in_width}, {n_chan}, {n_sparse}>'
     '({input}, threshold_{index}, {output}, {hash_out});'
 )
 
+# The last two template args are the parallelization factors (default to full parallelism).
 sparse_conv2d_function = (
     'sparse_conv<{input_t}, {output_t}, ap_uint<{hash_bits}>, {weight_t}, {bias_t}, {accum_t_name}, '
-    '{n_sparse}, {n_chan}, {n_filt}, {kernel_size}>'
+    '{n_sparse}, {n_chan}, {n_filt}, {kernel_size}, {pixel_parallel_factor}, {filt_parallel_factor}>'
     '({input}, {output}, {hash_in}, {w}, {b});'
 )
 
 sparse_activation_function = 'sparse_relu<{input_t}, {output_t}, {n_sparse}, {n_chan}>({input}, {output});'
 
-sparse_pooling2d_function = (
-    'ap_uint<{hash_bits}> {hash_out}[{n_sparse} * 2];\n'
-    '#pragma HLS ARRAY_PARTITION variable={hash_out} complete dim=0\n'
-    'sparse_pooling_avg<{input_t}, {output_t}, ap_uint<{hash_bits}>, {accum_t_name}, {n_sparse}, {n_chan}, {pool_size}>'
+sparse_pooling2d_prefix = (
+    'ap_uint<{hash_bits}> {hash_out}[{n_sparse} * 2];\n#pragma HLS ARRAY_PARTITION variable={hash_out} complete dim=0\n'
+)
+# Average pooling takes an accum_t; max pooling does not. Both take the two parallelization factors.
+sparse_pooling2d_avg_call = (
+    'sparse_pooling_avg<{input_t}, {output_t}, ap_uint<{hash_bits}>, {accum_t_name}, '
+    '{n_sparse}, {n_chan}, {pool_size}, {pixel_parallel_factor}, {chan_parallel_factor}>'
+    '({input}, {output}, {hash_in}, {hash_out});'
+)
+sparse_pooling2d_max_call = (
+    'sparse_pooling_max<{input_t}, {output_t}, ap_uint<{hash_bits}>, '
+    '{n_sparse}, {n_chan}, {pool_size}, {pixel_parallel_factor}, {chan_parallel_factor}>'
     '({input}, {output}, {hash_in}, {hash_out});'
 )
 
 sparse_flatten_function = (
-    'sparse_flatten<{input_t}, {output_t}, ap_uint<{hash_bits}>, {out_height}, {out_width}, {n_chan}, {n_sparse}>'
+    'sparse_flatten<{input_t}, {output_t}, ap_uint<{hash_bits}>, {out_height}, {out_width}, {n_chan}, '
+    '{n_sparse}, {parallel_factor}>'
     '({input}, {hash_in}, {output});'
 )
 
@@ -240,6 +260,8 @@ def format(self, node):
         params['hash_bits'] = node.get_attr('hash_bits')
         params['threshold'] = node.get_attr('threshold')
         params['hash_out'] = node.get_attr('hash_out_name')
+        variant = node.get_attr('variant', 'tree')
+        params['fn'] = 'sparse_input_reduce_stream' if variant == 'stream' else 'sparse_input_reduce'
         return self.template.format(**params)
 
 
@@ -261,6 +283,8 @@ def format(self, node):
         params['weight_t'] = node.get_weights('weight').type.name
         params['bias_t'] = node.get_weights('bias').type.name
         params['accum_t_name'] = node.get_attr('accum_t').name
+        params['pixel_parallel_factor'] = node.get_attr('pixel_parallel_factor') or params['n_sparse']
+        params['filt_parallel_factor'] = node.get_attr('filt_parallel_factor') or params['n_filt']
         return self.template.format(**params)
 
 
@@ -279,7 +303,7 @@ def format(self, node):
 class SparsePooling2DFunctionTemplate(FunctionCallTemplate):
     def __init__(self):
         super().__init__(SparsePooling2D, include_header=sparsepixels_include)
-        self.template = sparse_pooling2d_function
+        self.template = sparse_pooling2d_prefix + sparse_pooling2d_avg_call
 
     def format(self, node):
         params = self._default_function_params(node)
@@ -289,8 +313,14 @@ def format(self, node):
         params['hash_bits'] = _get_hash_bits(node)
         params['hash_in'] = node.get_attr('hash_in_name')
         params['hash_out'] = node.get_attr('hash_out_name')
-        params['accum_t_name'] = node.get_attr('accum_t').name
-        return self.template.format(**params)
+        params['pixel_parallel_factor'] = node.get_attr('pixel_parallel_factor') or params['n_sparse']
+        params['chan_parallel_factor'] = node.get_attr('chan_parallel_factor') or params['n_chan']
+        if node.get_attr('pool_op', 'avg') == 'max':
+            template = sparse_pooling2d_prefix + sparse_pooling2d_max_call
+        else:
+            template = sparse_pooling2d_prefix + sparse_pooling2d_avg_call
+            params['accum_t_name'] = node.get_attr('accum_t').name
+        return template.format(**params)
 
 
 class SparseFlattenFunctionTemplate(FunctionCallTemplate):
@@ -306,6 +336,7 @@ def format(self, node):
         params['out_width'] = node.get_attr('out_width')
         params['hash_bits'] = _get_hash_bits(node)
         params['hash_in'] = node.get_attr('hash_in_name')
+        params['parallel_factor'] = node.get_attr('parallel_factor') or (params['out_height'] * params['out_width'])
         return self.template.format(**params)
 
 
diff --git a/hls4ml/converters/keras_v3/sparsepixels.py b/hls4ml/converters/keras_v3/sparsepixels.py
index 066e6a070e..f1eab5f31c 100644
--- a/hls4ml/converters/keras_v3/sparsepixels.py
+++ b/hls4ml/converters/keras_v3/sparsepixels.py
@@ -219,6 +219,30 @@ def handle(
         return tuple(results)
 
 
+def _sparse_pooling_config(
+    in_tensors: Sequence['KerasTensor'], out_tensors: Sequence['KerasTensor'], pool_size: int, pool_op: str
+) -> dict[str, Any]:
+    """Shared config for the average/max sparse pooling handlers (differ only in pool_op)."""
+    feat_shape: tuple[int, ...] = in_tensors[0].shape[1:]  # type: ignore
+    n_chan = int(feat_shape[-1])
+    n_sparse = _sparse_context.get('n_sparse', 0)
+
+    prev_h, prev_w = _sparse_context.get('spatial', (1, 1))
+    new_h, new_w = prev_h // pool_size, prev_w // pool_size
+    _sparse_context['spatial'] = (new_h, new_w)
+
+    for t in out_tensors:
+        _mark_sparse_output(t.name, n_sparse, n_chan, new_h, new_w)
+
+    return {
+        'class_name': 'SparsePooling2D',
+        'n_sparse': n_sparse,
+        'n_chan': n_chan,
+        'pool_size': pool_size,
+        'pool_op': pool_op,
+    }
+
+
 class AveragePooling2DSparseHandler(KerasV3LayerHandler):
     handles = ('sparsepixels.layers.AveragePooling2DSparse',)
 
@@ -228,23 +252,16 @@ def handle(
         in_tensors: Sequence['KerasTensor'],
         out_tensors: Sequence['KerasTensor'],
     ):
-        pool_size = int(layer.avg_pool.pool_size[0])
+        return _sparse_pooling_config(in_tensors, out_tensors, int(layer.avg_pool.pool_size[0]), 'avg')
 
-        feat_shape: tuple[int, ...] = in_tensors[0].shape[1:]  # type: ignore
-        n_chan = int(feat_shape[-1])
-        n_sparse = _sparse_context.get('n_sparse', 0)
 
-        prev_h, prev_w = _sparse_context.get('spatial', (1, 1))
-        new_h, new_w = prev_h // pool_size, prev_w // pool_size
-        _sparse_context['spatial'] = (new_h, new_w)
+class MaxPooling2DSparseHandler(KerasV3LayerHandler):
+    handles = ('sparsepixels.layers.MaxPooling2DSparse',)
 
-        out_tensor_names = [t.name for t in out_tensors]
-        for t_name in out_tensor_names:
-            _mark_sparse_output(t_name, n_sparse, n_chan, new_h, new_w)
-
-        return {
-            'class_name': 'SparsePooling2D',
-            'n_sparse': n_sparse,
-            'n_chan': n_chan,
-            'pool_size': pool_size,
-        }
+    def handle(
+        self,
+        layer: 'keras.Layer',
+        in_tensors: Sequence['KerasTensor'],
+        out_tensors: Sequence['KerasTensor'],
+    ):
+        return _sparse_pooling_config(in_tensors, out_tensors, int(layer.max_pool.pool_size[0]), 'max')
diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py
index 9f5a89dd8c..08945876f6 100644
--- a/hls4ml/model/layers.py
+++ b/hls4ml/model/layers.py
@@ -1956,6 +1956,7 @@ class SparsePooling2D(Layer):
         Attribute('n_sparse'),
         Attribute('n_chan'),
         Attribute('pool_size'),
+        Attribute('pool_op', value_type=str, default='avg'),  # 'avg' or 'max'
         TypeAttribute('accum'),
     ]
 
diff --git a/hls4ml/model/optimizer/passes/bit_exact.py b/hls4ml/model/optimizer/passes/bit_exact.py
index 5deca39286..5e2d1d9854 100644
--- a/hls4ml/model/optimizer/passes/bit_exact.py
+++ b/hls4ml/model/optimizer/passes/bit_exact.py
@@ -222,6 +222,45 @@ def _(layer: SparsePooling2D):
     return (feat_kif,)
 
 
+@_request_kif.register
+def _(layer: SparseInputReduce):
+    """Propagate the downstream precision request back to the dense model input.
+
+    The output is packed per (sparse pixel, channel); any input pixel may be selected into any
+    slot, so each input position must satisfy the max request over slots (per channel), broadcast
+    across the H*W input grid. Without this the untrusted model input keeps its maximal placeholder
+    precision, which downstream passes then collapse to a degenerate type (e.g. ap_ufixed<1,0>),
+    clamping the real inputs. See also the SparseFlatten dispatch below."""
+    n_chan = layer.attributes['n_chan']
+    n_sparse = layer.attributes['n_sparse']
+    in_shape = get_input_shapes(layer)[0]  # dense model input grid, channel-last
+    k, i, f = requested_kif(layer)
+
+    def to_in(a):
+        per_chan = a.reshape(n_sparse, n_chan).max(axis=0)
+        return np.broadcast_to(per_chan, in_shape).astype(np.int16)
+
+    return ((to_in(k), to_in(i), to_in(f)),)
+
+
+@_request_kif.register
+def _(layer: SparseFlatten):
+    """Map the flattened (dense) request back to the packed sparse input. Each sparse slot can
+    scatter to any spatial position, so its request is the max over positions (per channel). This
+    lets the request reach the SparseInputReduce (and hence the model input) when no quantizer sits
+    between them."""
+    n_chan = layer.attributes['n_chan']
+    n_sparse = layer.attributes['n_sparse']
+    n_pos = layer.attributes['out_height'] * layer.attributes['out_width']
+    k, i, f = requested_kif(layer)
+
+    def to_in(a):
+        per_chan = a.reshape(n_pos, n_chan).max(axis=0)
+        return np.tile(per_chan, n_sparse).astype(np.int16)
+
+    return ((to_in(k), to_in(i), to_in(f)),)
+
+
 @_request_kif.register
 def _(layer: DACombinational):
     comb = layer.attributes['da_comb_trace']
@@ -757,11 +796,14 @@ def _(layer: SparseActivation):
 @_produce_kif.register
 def _(layer: SparsePooling2D):
     k_in, i_in, f_in = get_input_kifs(layer)[0]
-    # Average pooling divides by pool_size^2, adding fractional bits.
-    # Match standard Pooling2D: add ceil(log2(pool_size^2)) fractional bits.
+    # Average pooling divides by pool_size^2, which adds ceil(log2(pool_size^2)) fractional bits
+    # (matching standard Pooling2D). Max pooling just selects an input, so the precision is unchanged.
     pool_size = layer.attributes['pool_size']
     n_chan = layer.attributes['n_chan']
-    extra_f = int(np.ceil(np.log2(pool_size * pool_size)))
+    if layer.attributes.get('pool_op', 'avg') == 'max':
+        extra_f = 0
+    else:
+        extra_f = int(np.ceil(np.log2(pool_size * pool_size)))
     k_ch = k_in[:n_chan]
     i_ch = i_in[:n_chan]
     f_ch = f_in[:n_chan] + extra_f
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sparsepixels.h b/hls4ml/templates/vivado/nnet_utils/nnet_sparsepixels.h
index 41e5953f75..c31a516d52 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_sparsepixels.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_sparsepixels.h
@@ -45,11 +45,14 @@ template <class T, int N, class Op, class t> T find_active(T *x, Op op, t thresh
               threshold);
 }
 
+// Input-reduce (find-max tree): selects the first N_sparse active pixels (first input channel
+// > threshold) in raster order and emits their features (all channels) and 1-based (h, w) hashes.
+// A combinational find-active reduction is reused across N_sparse pipelined extractions -- low
+// latency, high LUT.
 template <class data_T, class res_T, class hash_T, int N_h, int N_w, int N_c, int N_sparse>
 void sparse_input_reduce(data_T input_arr[N_h * N_w * N_c], data_T threshold, res_T sparse_arr_feat[N_sparse * N_c],
                          hash_T sparse_arr_hash[N_sparse * 2]) {
 
-    // Flat pixel index ranges over 0..N_h*N_w-1 -> auto-sized to minimum bits
     static constexpr int IDX_BITS = _sp_ceillog2(N_h * N_w);
     typedef value_idx_pair<data_T, IDX_BITS> pair_t;
 
@@ -92,6 +95,41 @@ void sparse_input_reduce(data_T input_arr[N_h * N_w * N_c], data_T threshold, re
     }
 }
 
+// Input-reduce (streaming): same selection as the tree, via a one-pixel-per-cycle raster scan --
+// minimal LUT, latency ~N_h*N_w. Unused output slots (fewer than N_sparse active pixels) are zeroed.
+template <class data_T, class res_T, class hash_T, int N_h, int N_w, int N_c, int N_sparse>
+void sparse_input_reduce_stream(data_T input_arr[N_h * N_w * N_c], data_T threshold, res_T sparse_arr_feat[N_sparse * N_c],
+                                hash_T sparse_arr_hash[N_sparse * 2]) {
+    constexpr int NP = N_h * N_w; // number of pixels in the dense grid
+
+InitOut:
+    for (int s = 0; s < N_sparse; s++) {
+        #pragma HLS UNROLL
+        for (int c = 0; c < N_c; c++) {
+            #pragma HLS UNROLL
+            sparse_arr_feat[N_c * s + c] = 0;
+        }
+        sparse_arr_hash[2 * s] = 0; // slots never filled below keep hash (0, 0); filled slots get hashes >= 1
+        sparse_arr_hash[2 * s + 1] = 0;
+    }
+
+    int cnt = 0; // output slots filled so far
+ScanLoop:
+    for (int j = 0; j < NP; j++) {
+        #pragma HLS PIPELINE
+        if (cnt < N_sparse && input_arr[N_c * j] > threshold) { // channel 0 decides activity; stop once full
+            sparse_arr_feat[N_c * cnt] = (res_T)input_arr[N_c * j];
+            for (int c = 1; c < N_c; c++) { // remaining channels of the selected pixel
+                #pragma HLS UNROLL
+                sparse_arr_feat[N_c * cnt + c] = (res_T)input_arr[N_c * j + c];
+            }
+            sparse_arr_hash[2 * cnt] = j / N_w + 1;     // 1-based row
+            sparse_arr_hash[2 * cnt + 1] = j % N_w + 1; // 1-based column
+            cnt++;
+        }
+    }
+}
+
 template <class data_T, class accum_T, class w_T, int n_chan, int n_filt, int N_sparse, int ker_size>
 accum_T mult_for_sparse_conv_kernel(int offset_h, int offset_w, data_T sparse_arr_feat_in[n_chan * N_sparse],
                                     w_T filt_w[ker_size * ker_size * n_chan * n_filt], int i_filt, int i_pixel_in) {
@@ -100,9 +138,14 @@ accum_T mult_for_sparse_conv_kernel(int offset_h, int offset_w, data_T sparse_ar
     if ((unsigned)(offset_h + R) >= ker_size || (unsigned)(offset_w + R) >= ker_size) {
         return (accum_T)0;
     }
-    ap_uint<4> row = R - offset_h;
-    ap_uint<4> col = R - offset_w;
-    ap_uint<7> pos = row * ker_size + col;
+    // Smallest functional widths for the given ker_size (compile-time):
+    //   row, col in [0, ker_size-1]          -> ceil(log2(ker_size)) bits
+    //   pos     in [0, ker_size*ker_size-1]  -> ceil(log2(ker_size*ker_size)) bits
+    static constexpr int ROW_BITS = _sp_ceillog2(ker_size);
+    static constexpr int POS_BITS = _sp_ceillog2(ker_size * ker_size);
+    ap_uint<ROW_BITS> row = R - offset_h;
+    ap_uint<ROW_BITS> col = R - offset_w;
+    ap_uint<POS_BITS> pos = row * ker_size + col;
 
     accum_T acc = 0;
 MultLoopPerFilter:
@@ -114,14 +157,21 @@ accum_T mult_for_sparse_conv_kernel(int offset_h, int offset_w, data_T sparse_ar
     return acc;
 }
 
+// Sparse convolution on the active pixels. Two independent parallelization knobs trade LUT for
+// latency without changing the output:
+//   pixel_parallel_factor : output pixels (N_sparse axis) computed per cycle. Default = N_sparse.
+//   filt_parallel_factor  : output filters (n_filt axis) computed per cycle. Default = n_filt.
+// Both loops use UNROLL factor (no PIPELINE: pipelining the outer loop would force-unroll the filter
+// loop and ignore filt_parallel_factor); inter-layer throughput comes from the top-level DATAFLOW.
+// accum_T accumulates the MACs; a single cast to res_T is applied at the store.
 template <class data_T, class res_T, class hash_T, class w_T, class b_T, class accum_T, int N_sparse, int n_chan, int n_filt,
-          int ker_size>
+          int ker_size, int pixel_parallel_factor = N_sparse, int filt_parallel_factor = n_filt>
 void sparse_conv(data_T sparse_arr_feat_in[N_sparse * n_chan], res_T sparse_arr_feat_out[N_sparse * n_filt],
                  hash_T sparse_arr_hash[N_sparse * 2], w_T w[ker_size * ker_size * n_chan * n_filt], b_T b[n_filt]) {
 
 OutputPixelLoop:
     for (int i_pixel_out = 0; i_pixel_out < N_sparse; i_pixel_out++) {
-        #pragma HLS UNROLL
+        #pragma HLS UNROLL factor = pixel_parallel_factor
 
         bool nonzero = false;
         for (int i_chan = 0; i_chan < n_chan; i_chan++) {
@@ -131,7 +181,7 @@ void sparse_conv(data_T sparse_arr_feat_in[N_sparse * n_chan], res_T sparse_arr_
 
     OutputFilterLoop:
         for (int i_filt = 0; i_filt < n_filt; i_filt++) {
-            #pragma HLS UNROLL
+            #pragma HLS UNROLL factor = filt_parallel_factor
             accum_T acc = 0;
 
         InputPixelLoop:
@@ -169,7 +219,12 @@ void sparse_relu(data_T sparse_arr_feat_in[N_sparse * n_chan], res_T sparse_arr_
     }
 }
 
-template <class data_T, class res_T, class hash_T, class accum_T, int N_sparse, int n_chan, int pool_size>
+// Sparse average pooling. Each pooled cell is emitted once -- by the lowest-indexed output pixel
+// mapping to it (the is_first test); duplicate pixels of the same cell emit 0. The averaging reads
+// only the input array (no scratch mutation), so it is safe to partially unroll. Two independent
+// knobs: pixel_parallel_factor (N_sparse axis) and chan_parallel_factor (n_chan axis).
+template <class data_T, class res_T, class hash_T, class accum_T, int N_sparse, int n_chan, int pool_size,
+          int pixel_parallel_factor = N_sparse, int chan_parallel_factor = n_chan>
 void sparse_pooling_avg(data_T sparse_arr_feat_in[N_sparse * n_chan], res_T sparse_arr_feat_out[N_sparse * n_chan],
                         hash_T sparse_arr_hash_in[N_sparse * 2], hash_T sparse_arr_hash_out[N_sparse * 2]) {
 
@@ -177,7 +232,7 @@ void sparse_pooling_avg(data_T sparse_arr_feat_in[N_sparse * n_chan], res_T spar
     const ap_fixed<10, 0> pool_size_recip = _pool_size_recip_d;
 
     int hash_tmp[N_sparse * 2];
-#pragma HLS ARRAY_PARTITION variable = hash_tmp type = complete dim = 0
+    #pragma HLS ARRAY_PARTITION variable = hash_tmp type = complete dim = 0
 ComputePooledLoc:
     for (int i = 0; i < N_sparse; i++) {
         #pragma HLS UNROLL
@@ -185,23 +240,81 @@ void sparse_pooling_avg(data_T sparse_arr_feat_in[N_sparse * n_chan], res_T spar
         hash_tmp[2 * i + 1] = (sparse_arr_hash_in[2 * i + 1] - 1) / pool_size + 1;
     }
 
-    data_T sparse_arr_feat_in_copy[N_sparse * n_chan];
-    #pragma HLS ARRAY_PARTITION variable = sparse_arr_feat_in_copy type = complete dim = 0
-    for (int i = 0; i < N_sparse * n_chan; i++) {
+HashOutLoop:
+    for (int i_pixel = 0; i_pixel < N_sparse; i_pixel++) {
+        #pragma HLS UNROLL factor = pixel_parallel_factor
+        int h_out = hash_tmp[2 * i_pixel];
+        int w_out = hash_tmp[2 * i_pixel + 1];
+
+        bool is_first = true;
+    FirstCheck:
+        for (int k = 0; k < N_sparse; k++) {
+            #pragma HLS UNROLL
+            if (k < i_pixel && hash_tmp[2 * k] == h_out && hash_tmp[2 * k + 1] == w_out) {
+                is_first = false;
+            }
+        }
+
+    ChannelLoop:
+        for (int i_chan = 0; i_chan < n_chan; i_chan++) {
+            #pragma HLS UNROLL factor = chan_parallel_factor
+            accum_T acc = 0;
+
+        HashInLoop:
+            for (int j_pixel = 0; j_pixel < N_sparse; j_pixel++) {
+                #pragma HLS UNROLL
+                int h_in = hash_tmp[2 * j_pixel];
+                int w_in = hash_tmp[2 * j_pixel + 1];
+
+                if ((h_out == h_in) && (w_out == w_in)) {
+                    acc += sparse_arr_feat_in[n_chan * j_pixel + i_chan];
+                }
+            }
+            sparse_arr_feat_out[n_chan * i_pixel + i_chan] =
+                is_first ? (res_T)(acc * pool_size_recip * pool_size_recip) : (res_T)0;
+        }
+        sparse_arr_hash_out[2 * i_pixel] = h_out;
+        sparse_arr_hash_out[2 * i_pixel + 1] = w_out;
+    }
+}
+
+// Sparse max pooling. Same structure as the average version (one emission per pooled cell via the
+// is_first test), but takes the per-channel maximum of the active pixels in the cell, floored at 0
+// to match dense max pooling over the zero-masked window. Two independent knobs:
+// pixel_parallel_factor (N_sparse axis) and chan_parallel_factor (n_chan axis).
+template <class data_T, class res_T, class hash_T, int N_sparse, int n_chan, int pool_size,
+          int pixel_parallel_factor = N_sparse, int chan_parallel_factor = n_chan>
+void sparse_pooling_max(data_T sparse_arr_feat_in[N_sparse * n_chan], res_T sparse_arr_feat_out[N_sparse * n_chan],
+                        hash_T sparse_arr_hash_in[N_sparse * 2], hash_T sparse_arr_hash_out[N_sparse * 2]) {
+
+    int hash_tmp[N_sparse * 2];
+    #pragma HLS ARRAY_PARTITION variable = hash_tmp type = complete dim = 0
+ComputePooledLoc:
+    for (int i = 0; i < N_sparse; i++) {
         #pragma HLS UNROLL
-        sparse_arr_feat_in_copy[i] = sparse_arr_feat_in[i];
+        hash_tmp[2 * i] = (sparse_arr_hash_in[2 * i] - 1) / pool_size + 1;
+        hash_tmp[2 * i + 1] = (sparse_arr_hash_in[2 * i + 1] - 1) / pool_size + 1;
     }
 
 HashOutLoop:
     for (int i_pixel = 0; i_pixel < N_sparse; i_pixel++) {
-        #pragma HLS UNROLL
+        #pragma HLS UNROLL factor = pixel_parallel_factor
         int h_out = hash_tmp[2 * i_pixel];
         int w_out = hash_tmp[2 * i_pixel + 1];
 
+        bool is_first = true;
+    FirstCheck:
+        for (int k = 0; k < N_sparse; k++) {
+            #pragma HLS UNROLL
+            if (k < i_pixel && hash_tmp[2 * k] == h_out && hash_tmp[2 * k + 1] == w_out) {
+                is_first = false;
+            }
+        }
+
     ChannelLoop:
         for (int i_chan = 0; i_chan < n_chan; i_chan++) {
-            #pragma HLS UNROLL
-            accum_T acc = 0;
+            #pragma HLS UNROLL factor = chan_parallel_factor
+            data_T vmax = 0;
 
         HashInLoop:
             for (int j_pixel = 0; j_pixel < N_sparse; j_pixel++) {
@@ -209,44 +322,53 @@ void sparse_pooling_avg(data_T sparse_arr_feat_in[N_sparse * n_chan], res_T spar
                 int h_in = hash_tmp[2 * j_pixel];
                 int w_in = hash_tmp[2 * j_pixel + 1];
 
-                data_T data = sparse_arr_feat_in_copy[n_chan * j_pixel + i_chan];
-                if ((h_out == h_in) && (w_out == w_in)) {
-                    acc += data;
-                    sparse_arr_feat_in_copy[n_chan * j_pixel + i_chan] = 0;
+                data_T v = sparse_arr_feat_in[n_chan * j_pixel + i_chan];
+                if ((h_out == h_in) && (w_out == w_in) && (v > vmax)) {
+                    vmax = v;
                 }
             }
-            sparse_arr_feat_out[n_chan * i_pixel + i_chan] = (res_T)(acc * pool_size_recip * pool_size_recip);
+            sparse_arr_feat_out[n_chan * i_pixel + i_chan] = is_first ? (res_T)vmax : (res_T)0;
         }
         sparse_arr_hash_out[2 * i_pixel] = h_out;
         sparse_arr_hash_out[2 * i_pixel + 1] = w_out;
     }
 }
 
-template <class data_T, class res_T, class hash_T, int n_height, int n_width, int n_chan, int N_sparse>
+// Scatters the sparse pixels back to a dense n_height * n_width * n_chan grid (the sparse->dense
+// transition before Dense layers). Implemented as a gather: each dense location is written exactly
+// once, taking the last non-zero sparse feature whose hash maps to it (else 0); write addresses are
+// static, so it is safe to fully or partially unroll. parallel_factor = dense locations produced per cycle.
+template <class data_T, class res_T, class hash_T, int n_height, int n_width, int n_chan, int N_sparse,
+          int parallel_factor = n_height *n_width>
 void sparse_flatten(data_T sparse_arr_feat[N_sparse * n_chan], hash_T sparse_arr_hash[N_sparse * 2],
                     res_T flat_arr[n_height * n_width * n_chan]) {
 
-InitFlatArr:
-    for (int i = 0; i < n_height * n_width * n_chan; i++) {
+    int pix_idx[N_sparse]; // raster index of each sparse slot, computed once and reused for every p
+    #pragma HLS ARRAY_PARTITION variable = pix_idx type = complete dim = 0
+PixIdxLoop:
+    for (int i = 0; i < N_sparse; i++) {
         #pragma HLS UNROLL
-        flat_arr[i] = 0;
+        pix_idx[i] = (sparse_arr_hash[2 * i] - 1) * n_width + (sparse_arr_hash[2 * i + 1] - 1); // 1-based (h, w) -> 0-based
     }
 
-FillFlatArr:
-    for (int i = 0; i < N_sparse; i++) {
-        #pragma HLS UNROLL factor = 4
-        int i_h = sparse_arr_hash[2 * i];
-        int i_w = sparse_arr_hash[2 * i + 1];
-        int pixel_idx = (i_h - 1) * n_width + (i_w - 1);
+GatherLoop:
+    for (int p = 0; p < n_height * n_width; p++) { // p = dense raster position
+        #pragma HLS UNROLL factor = parallel_factor
 
     ChannelLoop:
         for (int i_chan = 0; i_chan < n_chan; i_chan++) {
             #pragma HLS UNROLL
-            data_T data = sparse_arr_feat[n_chan * i + i_chan];
+            res_T val = 0; // default when no sparse pixel lands on (p, i_chan)
 
-            if (data != 0) {
-                flat_arr[n_chan * pixel_idx + i_chan] = (res_T)data;
+        ScanLoop:
+            for (int i = 0; i < N_sparse; i++) {
+                #pragma HLS UNROLL
+                data_T data = sparse_arr_feat[n_chan * i + i_chan];
+                if (pix_idx[i] == p && data != 0) { // zero features are ignored; a later slot overrides an earlier one
+                    val = (res_T)data;
+                }
             }
+            flat_arr[n_chan * p + i_chan] = val;
         }
     }
 }
diff --git a/test/pytest/test_sparsepixels.py b/test/pytest/test_sparsepixels.py
index aaf92c4b85..53f30c4e5c 100644
--- a/test/pytest/test_sparsepixels.py
+++ b/test/pytest/test_sparsepixels.py
@@ -10,14 +10,19 @@
 from hgq.layers import QDense  # noqa: E402
 from hgq.quantizer.config import QuantizerConfig  # noqa: E402
 from keras.layers import Flatten  # noqa: E402
-from sparsepixels.layers import AveragePooling2DSparse, InputReduce, QConv2DSparse  # noqa: E402
+from sparsepixels.layers import (  # noqa: E402
+    AveragePooling2DSparse,
+    InputReduce,
+    MaxPooling2DSparse,
+    QConv2DSparse,
+)
 
 import hls4ml  # noqa: E402
 
 test_root_path = Path(__file__).parent
 
 
-def _build_sparse_cnn(input_shape=(8, 8, 1), n_max_pixels=4, threshold=0.4):
+def _build_sparse_cnn(input_shape=(8, 8, 1), n=4, threshold=0.4, pool='avg'):
     iq_conf = QuantizerConfig(place='datalane', q_type='kif', i0=4, f0=8, overflow_mode='WRAP')
     with (
         QuantizerConfigScope(place='all', default_q_type='kbi', overflow_mode='SAT_SYM'),
@@ -25,7 +30,7 @@ def _build_sparse_cnn(input_shape=(8, 8, 1), n_max_pixels=4, threshold=0.4):
         LayerConfigScope(enable_ebops=True, enable_iq=True, beta0=1e-5),
     ):
         x_in = keras.Input(shape=input_shape, name='x_in')
-        x, keep_mask = InputReduce(n_max_pixels=n_max_pixels, threshold=threshold, name='input_reduce')(x_in)
+        x, keep_mask = InputReduce(n=n, threshold=threshold, name='input_reduce')(x_in)
         x = QConv2DSparse(
             filters=2,
             kernel_size=3,
@@ -35,7 +40,8 @@ def _build_sparse_cnn(input_shape=(8, 8, 1), n_max_pixels=4, threshold=0.4):
             activation='relu',
             iq_conf=iq_conf,
         )([x, keep_mask])
-        x, keep_mask = AveragePooling2DSparse(2, name='pool')([x, keep_mask])
+        pool_layer = MaxPooling2DSparse(2, name='pool') if pool == 'max' else AveragePooling2DSparse(2, name='pool')
+        x, keep_mask = pool_layer([x, keep_mask])
         x = Flatten(name='flatten')(x)
         x = QDense(1, name='dense', iq_conf=iq_conf)(x)
     return keras.Model(x_in, x, name='cnn_sparse_test')
@@ -50,18 +56,13 @@ def _make_sparse_inputs(n_samples, h=8, w=8, n_active_per_sample=4, threshold=0.
     return x
 
 
-@pytest.mark.parametrize('backend', ['Vivado', 'Vitis'])
-def test_sparse_cnn(test_case_id, backend):
-    np.random.seed(42)
-    keras.utils.set_random_seed(42)
-
-    model = _build_sparse_cnn()
-    x = _make_sparse_inputs(n_samples=1000)
-
+def _convert_and_check(model, x, output_dir, backend, layer_overrides=None):
     y_keras = model.predict(x, verbose=0)
 
-    output_dir = test_root_path / test_case_id
     hls_config = hls4ml.utils.config_from_keras_model(model, granularity='name', backend=backend)
+    for name, overrides in (layer_overrides or {}).items():
+        hls_config['LayerName'].setdefault(name, {}).update(overrides)
+
     hls_model = hls4ml.converters.convert_from_keras_model(
         model,
         hls_config=hls_config,
@@ -71,9 +72,41 @@ def test_sparse_cnn(test_case_id, backend):
     )
     hls_model.compile()
 
-    y_hls = hls_model.predict(x).reshape(y_keras.shape)
+    # Guard the input-precision regression: bit_exact must propagate the downstream precision
+    # request back through the sparse layers to the model input. Otherwise x_in collapses to a
+    # degenerate type (e.g. ap_ufixed<1,0>) that clamps the real inputs to {0, 0.5}.
+    in_prec = hls_model.graph['x_in'].get_output_variable().type.precision
+    assert in_prec.width > 2, f'input precision collapsed to {in_prec}'
 
+    y_hls = hls_model.predict(x).reshape(y_keras.shape)
     mean_abs_diff = float(np.mean(np.abs(y_keras - y_hls)))
-    print(f'sparse-pixels {backend}: mean|diff|={mean_abs_diff:.4f}')
+    print(f'{output_dir.name}: mean|diff|={mean_abs_diff:.4f}')
+    assert mean_abs_diff < 0.05
 
-    assert mean_abs_diff < 0.5
+
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis'])
+@pytest.mark.parametrize('pool', ['avg', 'max'])  # average- and max-pooling variants of the model
+def test_sparse_cnn(test_case_id, backend, pool):
+    np.random.seed(42)  # seed NumPy ...
+    keras.utils.set_random_seed(42)  # ... and Keras
+
+    model = _build_sparse_cnn(pool=pool)
+    x = _make_sparse_inputs(n_samples=1000)
+    _convert_and_check(model, x, test_root_path / test_case_id, backend)  # asserts input precision and Keras/HLS match
+
+
+@pytest.mark.parametrize('backend', ['Vitis'])
+def test_sparse_cnn_parallelization(test_case_id, backend):
+    # Partial parallelization and the streaming input reduce only change the unroll/implementation,
+    # so the numerical output must still match the fully-parallel/tree default.
+    np.random.seed(43)  # seed differs from test_sparse_cnn (42)
+    keras.utils.set_random_seed(43)
+
+    model = _build_sparse_cnn()  # default pool='avg'
+    x = _make_sparse_inputs(n_samples=500)
+    overrides = {  # per-layer config entries merged into hls_config['LayerName'] by _convert_and_check
+        'input_reduce': {'Variant': 'stream'},  # streaming input reduce instead of the tree default
+        'conv': {'PixelParallelFactor': 2, 'FiltParallelFactor': 1},  # partial factors on both conv axes
+        'pool': {'PixelParallelFactor': 2, 'ChanParallelFactor': 1},  # partial factors on both pooling axes
+    }
+    _convert_and_check(model, x, test_root_path / test_case_id, backend, layer_overrides=overrides)

From 31ce35996ec027f86374e85c67d2b027d4f21d3f Mon Sep 17 00:00:00 2001
From: Ho Fung Tsoi <hftsoi0411@gmail.com>
Date: Thu, 2 Jul 2026 23:13:22 -0400
Subject: [PATCH 3/3] fix version for pyproject-fmt

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index e5bbf83ba5..b515e79465 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -53,7 +53,7 @@ optional-dependencies.qkeras = [
 optional-dependencies.qkeras-v3 = [ "qkeras-v3" ]
 optional-dependencies.quartus-report = [ "calmjs-parse", "tabulate" ]
 optional-dependencies.snn = [ "snntorch", "torch" ]
-optional-dependencies.sparsepixels = [ "sparsepixels>=0.3.0" ]
+optional-dependencies.sparsepixels = [ "sparsepixels>=0.3" ]
 optional-dependencies.sr = [ "sympy>=1.13.1" ]
 optional-dependencies.testing = [
   "calmjs-parse",