diff --git a/src/v/kafka/protocol/schemata/BUILD b/src/v/kafka/protocol/schemata/BUILD index 2ecaf0eaa31d2..cd3d7531ef023 100644 --- a/src/v/kafka/protocol/schemata/BUILD +++ b/src/v/kafka/protocol/schemata/BUILD @@ -1,4 +1,4 @@ -load("@rules_python//python:defs.bzl", "py_binary") +load("@rules_python//python:defs.bzl", "py_binary", "py_test") load("//src/v/kafka/protocol/schemata:generator.bzl", "generate_kafka_messages") py_binary( @@ -12,4 +12,23 @@ py_binary( ], ) +py_test( + name = "generator_reproducibility_test", + size = "small", + srcs = ["generator_reproducibility_test.py"], + data = [ + "create_topics_request.json", + "create_topics_response.json", + "describe_configs_response.json", + "fetch_request.json", + "fetch_response.json", + "generator.py", + "metadata_response.json", + ], + deps = [ + "@python_deps//jinja2", + "@python_deps//jsonschema", + ], +) + generate_kafka_messages() diff --git a/src/v/kafka/protocol/schemata/generator.py b/src/v/kafka/protocol/schemata/generator.py index c880e4752fff0..36338b82529c8 100755 --- a/src/v/kafka/protocol/schemata/generator.py +++ b/src/v/kafka/protocol/schemata/generator.py @@ -995,7 +995,7 @@ def type_headers(t): h = h.get(which, ()) yield from maybe_strings(h) - return set(h for t in types for h in type_headers(t)) + return sorted({h for t in types for h in type_headers(t)}) @property def is_default_comparable(self): @@ -2171,7 +2171,7 @@ def fail(msg): src = jinja2.Template(COMBINED_SOURCE_TEMPLATE).render( schema_headers=map(lambda p: p.name, headers), - extra_headers=extra_schema_headers, + extra_headers=sorted(extra_schema_headers), sources=sources, ) diff --git a/src/v/kafka/protocol/schemata/generator_reproducibility_test.py b/src/v/kafka/protocol/schemata/generator_reproducibility_test.py new file mode 100644 index 0000000000000..c3c5c42a6d745 --- /dev/null +++ b/src/v/kafka/protocol/schemata/generator_reproducibility_test.py @@ -0,0 +1,79 @@ +# Copyright 2026 Redpanda Data, Inc. 
#
# Use of this software is governed by the Business Source License
# included in the file licenses/BSL.md
#
# As of the Change Date specified in that file, in accordance with
# the Business Source License, use of this software will be governed
# by the Apache License, Version 2.0
"""Regression test: kafka schemata codegen must be byte-for-byte reproducible.

The generator turns JSON schemas into C++ source. Older versions walked
Python sets while writing #include lines, which made the exact bytes depend
on PYTHONHASHSEED (CPython chooses a new one for every interpreter). Because
Bazel keys its action cache on the content hash of inputs, unstable codegen
invalidates the cache of every downstream compile, even when the outputs are
equivalent after preprocessing.

The test invokes the generator under different PYTHONHASHSEED values and
fails as soon as a single output byte differs between runs.
"""

import os
import subprocess
import sys
import tempfile
import unittest
from pathlib import Path

# Directory that holds this test, the generator script and the schema files.
_HERE = Path(__file__).resolve().parents[0]
_GENERATOR = _HERE.joinpath("generator.py")

# PYTHONHASHSEED values each schema is generated under.
_SEEDS = ("1", "4294967295")
# Extensions of the files the generator is asked to write for every schema.
_EXTS = ("h", "cc")

# Schemata chosen to exercise the codegen paths most likely to expose set-order
# bugs: ones whose fields pull in multiple `extra_headers` entries.
_SCHEMATA = (
    "create_topics_request",
    "create_topics_response",
    "fetch_request",
    "fetch_response",
    "metadata_response",
    "describe_configs_response",
)


def _run_generator(schema: str, seed: str, outdir: Path) -> dict[str, bytes]:
    """Run the generator once and return {ext: bytes} for the produced files.

    Args:
        schema: Base name of the schema; ``<schema>.json`` next to this file
            is passed to the generator as its input.
        seed: Value exported as ``PYTHONHASHSEED`` to the child interpreter.
        outdir: Directory that is created here and receives the generated
            files. It must not exist yet.

    Returns:
        A mapping from each extension in ``_EXTS`` to the raw bytes of the
        corresponding generated file.

    Raises:
        FileExistsError: If ``outdir`` already exists.
        subprocess.CalledProcessError: If the generator exits non-zero.
    """
    outdir.mkdir()
    targets = {ext: outdir / f"{schema}.{ext}" for ext in _EXTS}
    subprocess.run(
        [
            sys.executable,
            str(_GENERATOR),
            str(_HERE / f"{schema}.json"),
            *(str(path) for path in targets.values()),
        ],
        check=True,
        # Keep the inherited environment and override only the hash seed.
        env={**os.environ, "PYTHONHASHSEED": seed},
    )
    return {ext: path.read_bytes() for ext, path in targets.items()}


class GeneratorReproducibilityTest(unittest.TestCase):
    """Checks that generator output does not depend on PYTHONHASHSEED."""

    def test_codegen_is_hash_seed_independent(self) -> None:
        """Generate every schema under each seed and require identical bytes."""
        reference_seed = _SEEDS[0]
        for schema in _SCHEMATA:
            with self.subTest(schema=schema), tempfile.TemporaryDirectory() as tmp:
                outputs = {
                    seed: _run_generator(schema, seed, Path(tmp) / f"seed{seed}")
                    for seed in _SEEDS
                }
                for ext in _EXTS:
                    reference = outputs[reference_seed][ext]
                    # Report the offending seeds instead of comparing sets of
                    # bytes: a set mismatch would dump the repr of whole
                    # generated files without saying which run diverged.
                    mismatched = [
                        seed for seed in _SEEDS if outputs[seed][ext] != reference
                    ]
                    self.assertEqual(
                        mismatched,
                        [],
                        f"{schema}.{ext} varies across PYTHONHASHSEED values: "
                        f"seeds {mismatched} differ from seed {reference_seed}",
                    )


if __name__ == "__main__":
    unittest.main()