fix: reduce pytest warnings (logging, HDF5, tests)

benjello · benjello · commit fc62a0ed40c6 · 2026-03-03T09:41:58.000+01:00
- policy/simulations: log.warn -> log.warning (3), groupby(..., observed=False)
- io/hdf: hdf5_safe_key() for PyTables NaturalNameWarning; to_hdf key= keyword
- core/survey: use hdf5_safe_key when reading HDF5, backward compat fallback
- tests: fix PytestReturnNotNoneWarning (assert instead of return)

Made-with: Cursor
diff --git a/openfisca_survey_manager/core/survey.py b/openfisca_survey_manager/core/survey.py
@@ -15,6 +15,7 @@
 
 from openfisca_survey_manager.core.table import Table
 from openfisca_survey_manager.exceptions import SurveyIOError, SurveyManagerError
+from openfisca_survey_manager.io.hdf import hdf5_safe_key
 from openfisca_survey_manager.processing.harmonization import harmonize_data_frame_columns
 
 if TYPE_CHECKING:
@@ -196,19 +197,23 @@ def _get_values_from_hdf5(self, table: str, ignorecase: bool = False) -> tuple[p
         )
         store = pandas.HDFStore(self.hdf5_file_path, "r")
         try:
+            # Use same key normalization as at write time (PyTables NaturalNameWarning)
+            hdf5_key = hdf5_safe_key(table)
             if ignorecase:
                 keys = store.keys()
-                eligible_tables = [
-                    match[0] for string in keys for match in [re.findall(table, string, re.IGNORECASE)] if match
-                ]
+                eligible_tables = [k for k in keys if hdf5_safe_key(k.lstrip("/")).lower() == hdf5_key.lower()]
                 if len(eligible_tables) > 1:
                     raise SurveyManagerError(
                         f"{table} is ambiguous since the following tables are available: {eligible_tables}"
                     )
                 if len(eligible_tables) == 0:
                     raise SurveyIOError(f"No eligible available table in {keys}")
-                table = eligible_tables[0]
-            df = store.select(table)
+                hdf5_key = eligible_tables[0].lstrip("/")
+            try:
+                df = store.select(hdf5_key)
+            except KeyError:
+                # Backward compat: try raw table name (old files may have keys with hyphens)
+                df = store.select(table)
             return df, table
         except KeyError:
             log.error("No table %s in the file %s", table, self.hdf5_file_path)
diff --git a/openfisca_survey_manager/io/hdf.py b/openfisca_survey_manager/io/hdf.py
@@ -3,12 +3,22 @@
 from __future__ import annotations
 
 import logging
+import re
 from typing import Any
 
 import pandas as pd
 
 log = logging.getLogger(__name__)
 
+# PyTables / pandas-HDF5 emit NaturalNameWarning unless node names match ^[a-zA-Z_][a-zA-Z0-9_]*$.
+# We replace each disallowed character with "_" (person_2017-01 -> person_2017_01); a leading digit is kept.
+_HDF5_SAFE_PATTERN = re.compile(r"[^a-zA-Z0-9_]")
+
+
+def hdf5_safe_key(name: str) -> str:
+    """Return an HDF5 node name safe for PyTables (valid Python identifier)."""
+    return _HDF5_SAFE_PATTERN.sub("_", name)
+
 
 def write_table_to_hdf5(
     data_frame: pd.DataFrame,
@@ -22,8 +32,9 @@ def write_table_to_hdf5(
     Mirrors historical behavior from `tables.Table.save_data_frame_to_hdf5`.
     May mutate `data_frame` (type conversions) to ensure it can be written.
     """
+    key = hdf5_safe_key(store_path)
     try:
-        data_frame.to_hdf(hdf5_file_path, store_path, append=False, **kwargs)
+        data_frame.to_hdf(hdf5_file_path, key=key, append=False, **kwargs)
     except (TypeError, NotImplementedError):
         log.info("Type problem(s) when creating %s in %s", store_path, hdf5_file_path)
         dtypes = data_frame.dtypes
@@ -42,4 +53,4 @@ def write_table_to_hdf5(
                 "The following types are added as category using the table format %s",
                 dtypes[converted_dtypes],
             )
-            data_frame.to_hdf(hdf5_file_path, store_path, append=False, format="table", **kwargs)
+            data_frame.to_hdf(hdf5_file_path, key=key, append=False, format="table", **kwargs)
diff --git a/openfisca_survey_manager/policy/simulations.py b/openfisca_survey_manager/policy/simulations.py
@@ -405,7 +405,7 @@ def compute_pivot_table(
                 variables.add(weight_variable)
 
             else:
-                log.warn(
+                log.warning(
                     f"There is no weight variable for entity {entity_key} nor alternative weights. "
                     "Switch to unweighted"
                 )
@@ -750,7 +750,7 @@ def compute_winners_losers(
             weight_variable = weight_variable_by_entity[entity_key]
             weight = baseline_simulation.calculate(weight_variable, period=period)
         else:
-            log.warn(
+            log.warning(
                 f"There is no weight variable for entity {entity_key} nor alternative weights. Switch to unweighted"
             )
 
@@ -1249,7 +1249,7 @@ def init_variable_in_entity(
     if variable.definition_period == YEAR and period.unit == MONTH:
         # Some variables defined for a year are present in month/quarter dataframes
         # Cleaning the dataframe would probably be better in the long run
-        log.warn(
+        log.warning(
             f"Trying to set a monthly value for variable {variable_name}, which is defined on a year. "
             "The  montly values you provided will be summed."
         )
@@ -1434,7 +1434,7 @@ def summarize_variable(
                     )
                     df = pd.DataFrame({variable: array}).replace(categories_by_index).astype(categories_type)
                     df["weights"] = weights if weighted else 1
-                    groupby = df.groupby(variable)["weights"].sum()
+                    groupby = df.groupby(variable, observed=False)["weights"].sum()
                     total = groupby.sum()
                     expr = [f" {index} = {row:.2e} ({row / total:.1%})" for index, row in groupby.items()]
                     log.info("%s: %s.", period, ",".join(expr))
diff --git a/openfisca_survey_manager/policy/tests/test_compute_pivot_table.py b/openfisca_survey_manager/policy/tests/test_compute_pivot_table.py
@@ -7,7 +7,7 @@ def test_compute_pivot_table():
     survey_scenario = create_randomly_initialized_survey_scenario(reform=modify_social_security_taxation)
     period = "2017-01"
 
-    return survey_scenario.compute_pivot_table(
+    pivot_table = survey_scenario.compute_pivot_table(
         aggfunc="mean",
         columns=["age"],
         difference=False,
@@ -22,3 +22,4 @@ def test_compute_pivot_table():
         weighted=True,
         alternative_weights=None,
     )
+    assert pivot_table is not None
diff --git a/openfisca_survey_manager/tests/test_calibration.py b/openfisca_survey_manager/tests/test_calibration.py
@@ -164,4 +164,4 @@ def test_simulation_calibration_input_from_data(tmp_path):
             f"{simulation_name} weight_variable_by_entity does not match {weight_variable_by_entity}"
         )
         assert (survey_scenario.calculate_series("household_weight", period, simulation=simulation_name) != 0).all()
-    return survey_scenario
+    assert survey_scenario is not None

Original file line number	Diff line number	Diff line change
`@@ -164,4 +164,4 @@ def test_simulation_calibration_input_from_data(tmp_path):`
`164`	`164`	`f"{simulation_name} weight_variable_by_entity does not match {weight_variable_by_entity}"`
`165`	`165`	`)`
`166`	`166`	`assert (survey_scenario.calculate_series("household_weight", period, simulation=simulation_name) != 0).all()`
`167`		`- return survey_scenario`
	`167`	`+ assert survey_scenario is not None`