Skip to content

Commit 885e8d1

Browse files
authored
Switch to using blockcache with size 4MB by default for h5netcdf (#11216)
Changes the default cache type when passing the path as a string to `open_dataset` and `open_datatree` with the `h5netcdf` engine. This represents a significant improvement when opening optimized hdf5 files. This idea is based on the work that @betolink has been doing in earthaccess (earthaccess-dev/earthaccess#1061)
1 parent e7b1e5b commit 885e8d1

4 files changed

Lines changed: 89 additions & 4 deletions

File tree

doc/whats-new.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,10 @@ Breaking Changes
2727
:py:meth:`open_dataset`. This finalizes the deprecation cycle initiated in
2828
xarray version 2025.01.2 (:pull:`11173`). By `Spencer Clark
2929
<https://github.com/spencerkclark>`_.
30+
- When using ``h5netcdf`` engine and passing the path as a string to
31+
``open_dataset`` and ``open_datatree`` the default behavior of fsspec is now to
32+
use block caching with a 4MB block size (:pull:`11216`). By `Julia Signell
33+
<https://github.com/jsignell>`_.
3034

3135
Deprecations
3236
~~~~~~~~~~~~

xarray/backends/common.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -214,13 +214,16 @@ def getbuffer(self) -> memoryview:
214214
return self.getvalue()
215215

216216

217-
def _open_remote_file(file, mode, storage_options=None):
217+
def _open_remote_file(file, mode, storage_options=None, open_kwargs=None):
218218
import fsspec
219219

220220
fs, _, paths = fsspec.get_fs_token_paths(
221221
file, mode=mode, storage_options=storage_options
222222
)
223-
return fs.open(paths[0], mode=mode)
223+
224+
open_kwargs = open_kwargs or {}
225+
226+
return fs.open(paths[0], mode=mode, **open_kwargs)
224227

225228

226229
def _encode_variable_name(name):

xarray/backends/h5netcdf_.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,13 +183,29 @@ def open(
183183
driver=None,
184184
driver_kwds=None,
185185
storage_options: dict[str, Any] | None = None,
186+
open_kwargs: dict[str, Any] | None = None,
186187
):
187188
import h5netcdf
188189

189190
if isinstance(filename, str) and is_remote_uri(filename) and driver is None:
190191
mode_ = "rb" if mode == "r" else mode
192+
193+
open_kwargs = open_kwargs or {}
194+
195+
# Use blockcache with size 4MB by default
196+
if "cache_type" not in open_kwargs:
197+
open_kwargs["cache_type"] = "blockcache"
198+
if (
199+
open_kwargs["cache_type"] == "blockcache"
200+
and "block_size" not in open_kwargs
201+
):
202+
open_kwargs["block_size"] = 4 * 1024 * 1024
203+
191204
filename = _open_remote_file(
192-
filename, mode=mode_, storage_options=storage_options
205+
filename,
206+
mode=mode_,
207+
storage_options=storage_options,
208+
open_kwargs=open_kwargs,
193209
)
194210

195211
if isinstance(filename, BytesIOProxy):
@@ -531,6 +547,7 @@ def open_dataset(
531547
driver=None,
532548
driver_kwds=None,
533549
storage_options: dict[str, Any] | None = None,
550+
open_kwargs: dict[str, Any] | None = None,
534551
) -> Dataset:
535552
# Keep this message for some versions
536553
# remove and set phony_dims="access" above
@@ -548,6 +565,7 @@ def open_dataset(
548565
driver=driver,
549566
driver_kwds=driver_kwds,
550567
storage_options=storage_options,
568+
open_kwargs=open_kwargs,
551569
)
552570

553571
store_entrypoint = StoreBackendEntrypoint()
@@ -633,6 +651,8 @@ def open_groups_as_dict(
633651
decode_vlen_strings=True,
634652
driver=None,
635653
driver_kwds=None,
654+
storage_options: dict[str, Any] | None = None,
655+
open_kwargs: dict[str, Any] | None = None,
636656
**kwargs,
637657
) -> dict[str, Dataset]:
638658
from xarray.backends.common import _iter_nc_groups
@@ -654,6 +674,8 @@ def open_groups_as_dict(
654674
decode_vlen_strings=decode_vlen_strings,
655675
driver=driver,
656676
driver_kwds=driver_kwds,
677+
storage_options=storage_options,
678+
open_kwargs=open_kwargs,
657679
)
658680

659681
# Check for a group and make it a parent if it exists

xarray/tests/test_backends.py

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@
4444
open_mfdataset,
4545
save_mfdataset,
4646
)
47-
from xarray.backends.common import robust_getitem
47+
from xarray.backends.common import _open_remote_file, robust_getitem
4848
from xarray.backends.h5netcdf_ import H5netcdfBackendEntrypoint
4949
from xarray.backends.netcdf3 import _nc3_dtype_coercions
5050
from xarray.backends.netCDF4_ import (
@@ -5298,6 +5298,62 @@ def test_write_inconsistent_chunks(self) -> None:
52985298
assert actual["y"].encoding["chunksizes"] == (100, 50)
52995299

53005300

5301+
@requires_h5netcdf
5302+
@requires_fsspec
5303+
@pytest.mark.parametrize(
5304+
"open_kwargs, expected_cache_type, expected_block_size",
5305+
[
5306+
# Default: blockcache with 4MB block size
5307+
(None, "blockcache", 4 * 1024 * 1024),
5308+
({}, "blockcache", 4 * 1024 * 1024),
5309+
# Custom block_size still uses blockcache
5310+
({"block_size": 8 * 1024 * 1024}, "blockcache", 8 * 1024 * 1024),
5311+
# Explicit blockcache with default block_size
5312+
({"cache_type": "blockcache"}, "blockcache", 4 * 1024 * 1024),
5313+
# Custom cache_type: no block_size default injected
5314+
({"cache_type": "readahead"}, "readahead", None),
5315+
],
5316+
ids=["default", "empty-dict", "8mb", "blockcache", "readahead"],
5317+
)
5318+
def test_h5netcdf_open_kwargs(
5319+
open_kwargs, expected_cache_type, expected_block_size
5320+
) -> None:
5321+
"""Test that open_kwargs are forwarded to the remote file opener."""
5322+
expected = create_test_data()
5323+
with create_tmp_file() as tmp_file:
5324+
expected.to_netcdf(tmp_file, engine="h5netcdf")
5325+
5326+
captured = {}
5327+
5328+
def capturing_open_remote_file(
5329+
file, mode, storage_options=None, open_kwargs=None
5330+
):
5331+
captured["open_kwargs"] = open_kwargs
5332+
return _open_remote_file(
5333+
file,
5334+
mode=mode,
5335+
storage_options=storage_options,
5336+
open_kwargs=open_kwargs,
5337+
)
5338+
5339+
with patch(
5340+
"xarray.backends.h5netcdf_._open_remote_file",
5341+
side_effect=capturing_open_remote_file,
5342+
):
5343+
# Use a file:// URI so is_remote_uri returns True and _open_remote_file is called
5344+
file_uri = f"file://{tmp_file}"
5345+
with open_dataset(
5346+
file_uri, engine="h5netcdf", open_kwargs=open_kwargs
5347+
) as actual:
5348+
assert_identical(actual, expected)
5349+
5350+
assert captured["open_kwargs"]["cache_type"] == expected_cache_type
5351+
if expected_block_size is None:
5352+
assert "block_size" not in captured["open_kwargs"]
5353+
else:
5354+
assert captured["open_kwargs"]["block_size"] == expected_block_size
5355+
5356+
53015357
@requires_netCDF4
53025358
@requires_h5netcdf
53035359
def test_memoryview_write_h5netcdf_read_netcdf4() -> None:

0 commit comments

Comments (0)