diff --git a/doc/whats-new.rst b/doc/whats-new.rst index effb199f18e..fcf35d8894d 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -71,6 +71,10 @@ New Features or a fixed ``(width, height)`` tuple instead of computing figure size from ``size`` and ``aspect`` (:issue:`11103`). By `Kristian Kollsga <https://github.com/kkollsga>`_. +- Added glob pattern support to the ``group`` parameter of :py:func:`open_datatree` + and :py:func:`open_groups`, allowing patterns like ``"*/sweep_0"`` to selectively + open matching groups (:issue:`11196`). + By `Alfonso Ladino <https://github.com/aladinor>`_. Breaking Changes ~~~~~~~~~~~~~~~~ diff --git a/xarray/backends/api.py b/xarray/backends/api.py index fd992f3e5d8..ad3d9953755 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -1021,8 +1021,18 @@ def open_datatree( Additional keyword arguments passed on to the engine open function. For example: - - 'group': path to the group in the given file to open as the root group as - a str. + - 'group': path to the group in the given file to open as the root + group as a str. If the string contains glob metacharacters + (``*``, ``?``, ``[``), it is interpreted as a pattern and only + groups whose paths match are loaded (along with their ancestors). + For example, ``group="*/sweep_0"`` loads every ``sweep_0`` nested + under any group (at any depth) while skipping its siblings. Matching follows + ``fnmatch`` / :py:meth:`pathlib.PurePath.match` semantics, so + group names that contain literal glob metacharacters can be + targeted with character-class escapes: ``[*]`` matches a + literal ``*``, ``[?]`` a literal ``?``, and ``[[]`` a literal + ``[``. For example, ``group="group_[*]_01"`` matches a group + literally named ``group_*_01``. - 'lock': resource lock to use when reading data from disk. Only relevant when using dask or another form of parallelism. By default, appropriate locks are chosen to safely read and write files with the @@ -1265,8 +1275,18 @@ def open_groups( Additional keyword arguments passed on to the engine open function. 
For example: - - 'group': path to the group in the given file to open as the root group as - a str. + - 'group': path to the group in the given file to open as the root + group as a str. If the string contains glob metacharacters + (``*``, ``?``, ``[``), it is interpreted as a pattern and only + groups whose paths match are loaded (along with their ancestors). + For example, ``group="*/sweep_0"`` loads every ``sweep_0`` nested + under any group (at any depth) while skipping its siblings. Matching follows + ``fnmatch`` / :py:meth:`pathlib.PurePath.match` semantics, so + group names that contain literal glob metacharacters can be + targeted with character-class escapes: ``[*]`` matches a + literal ``*``, ``[?]`` a literal ``?``, and ``[[]`` a literal + ``[``. For example, ``group="group_[*]_01"`` matches a group + literally named ``group_*_01``. - 'lock': resource lock to use when reading data from disk. Only relevant when using dask or another form of parallelism. By default, appropriate locks are chosen to safely read and write files with the diff --git a/xarray/backends/common.py b/xarray/backends/common.py index f2580ea2a43..f246f2f25ad 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -249,6 +249,34 @@ def _iter_nc_groups(root, parent="/"): yield from _iter_nc_groups(group, parent=gpath) +def _is_glob_pattern(pattern: str) -> bool: + return any(c in pattern for c in "*?[") + + +def _filter_group_paths(group_paths: Iterable[str], pattern: str) -> list[str]: + from xarray.core.treenode import NodePath + + matched: set[str] = {"/"} + for path in group_paths: + np_ = NodePath(path) + if np_.match(pattern): + matched.add(path) + matched.update(str(p) for p in np_.parents if str(p)) + + return [p for p in group_paths if p in matched] + + +def _resolve_group_and_filter( + group: str | None, + all_group_paths: list[str], +) -> tuple[str | None, list[str]]: + if group is None: + return None, all_group_paths + if _is_glob_pattern(group): + return None, 
_filter_group_paths(all_group_paths, group) + return group, all_group_paths + + def find_root_and_group(ds): """Find the root and group name of a netCDF4/h5netcdf dataset.""" hierarchy = () diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 9b828c8e236..006f41abeae 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -655,7 +655,11 @@ def open_groups_as_dict( open_kwargs: dict[str, Any] | None = None, **kwargs, ) -> dict[str, Dataset]: - from xarray.backends.common import _iter_nc_groups + from xarray.backends.common import ( + _is_glob_pattern, + _iter_nc_groups, + _resolve_group_and_filter, + ) from xarray.core.treenode import NodePath from xarray.core.utils import close_on_error @@ -664,10 +668,12 @@ def open_groups_as_dict( emit_phony_dims_warning, phony_dims = _check_phony_dims(phony_dims) filename_or_obj = _normalize_filename_or_obj(filename_or_obj) + + effective_group = None if (group and _is_glob_pattern(group)) else group store = H5NetCDFStore.open( filename_or_obj, format=format, - group=group, + group=effective_group, lock=lock, invalid_netcdf=invalid_netcdf, phony_dims=phony_dims, @@ -678,15 +684,17 @@ def open_groups_as_dict( open_kwargs=open_kwargs, ) - # Check for a group and make it a parent if it exists - if group: - parent = NodePath("/") / NodePath(group) + if effective_group: + parent = NodePath("/") / NodePath(effective_group) else: parent = NodePath("/") manager = store._manager + all_group_paths = list(_iter_nc_groups(store.ds, parent=parent)) + _, filtered_paths = _resolve_group_and_filter(group, all_group_paths) + groups_dict = {} - for path_group in _iter_nc_groups(store.ds, parent=parent): + for path_group in filtered_paths: group_store = H5NetCDFStore(manager, group=path_group, **kwargs) store_entrypoint = StoreBackendEntrypoint() with close_on_error(group_store): @@ -701,7 +709,7 @@ def open_groups_as_dict( decode_timedelta=decode_timedelta, ) - if group: + if effective_group: 
group_name = str(NodePath(path_group).relative_to(parent)) else: group_name = str(NodePath(path_group)) diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index 39dedd139c0..277f8c91a92 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -859,13 +859,19 @@ def open_groups_as_dict( autoclose=False, **kwargs, ) -> dict[str, Dataset]: - from xarray.backends.common import _iter_nc_groups + from xarray.backends.common import ( + _is_glob_pattern, + _iter_nc_groups, + _resolve_group_and_filter, + ) from xarray.core.treenode import NodePath filename_or_obj = _normalize_path(filename_or_obj) + + effective_group = None if (group and _is_glob_pattern(group)) else group store = NetCDF4DataStore.open( filename_or_obj, - group=group, + group=effective_group, format=format, clobber=clobber, diskless=diskless, @@ -875,15 +881,17 @@ def open_groups_as_dict( autoclose=autoclose, ) - # Check for a group and make it a parent if it exists - if group: - parent = NodePath("/") / NodePath(group) + if effective_group: + parent = NodePath("/") / NodePath(effective_group) else: parent = NodePath("/") manager = store._manager + all_group_paths = list(_iter_nc_groups(store.ds, parent=parent)) + _, filtered_paths = _resolve_group_and_filter(group, all_group_paths) + groups_dict = {} - for path_group in _iter_nc_groups(store.ds, parent=parent): + for path_group in filtered_paths: group_store = NetCDF4DataStore(manager, group=path_group, **kwargs) store_entrypoint = StoreBackendEntrypoint() with close_on_error(group_store): @@ -897,7 +905,7 @@ def open_groups_as_dict( use_cftime=use_cftime, decode_timedelta=decode_timedelta, ) - if group: + if effective_group: group_name = str(NodePath(path_group).relative_to(parent)) else: group_name = str(NodePath(path_group)) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index d9279dc2de9..bfbcd227f7e 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1820,11 +1820,13 @@ def 
open_groups_as_dict( zarr_version=None, zarr_format=None, ) -> dict[str, Dataset]: + from xarray.backends.common import _is_glob_pattern, _resolve_group_and_filter + filename_or_obj = _normalize_path(filename_or_obj) - # Check for a group and make it a parent if it exists - if group: - parent = str(NodePath("/") / NodePath(group)) + effective_group = None if (group and _is_glob_pattern(group)) else group + if effective_group: + parent = str(NodePath("/") / NodePath(effective_group)) else: parent = str(NodePath("/")) @@ -1841,8 +1843,11 @@ def open_groups_as_dict( zarr_format=zarr_format, ) + _, filtered_paths = _resolve_group_and_filter(group, list(stores.keys())) + groups_dict = {} - for path_group, store in stores.items(): + for path_group in filtered_paths: + store = stores[path_group] store_entrypoint = StoreBackendEntrypoint() with close_on_error(store): @@ -1856,7 +1861,7 @@ def open_groups_as_dict( use_cftime=use_cftime, decode_timedelta=decode_timedelta, ) - if group: + if effective_group: group_name = str(NodePath(path_group).relative_to(parent)) else: group_name = str(NodePath(path_group)) diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py index 32f224e89a6..eee09c6c919 100644 --- a/xarray/tests/test_backends_datatree.py +++ b/xarray/tests/test_backends_datatree.py @@ -377,6 +377,118 @@ def test_open_datatree_specific_group(self, tmpdir, simple_datatree) -> None: assert subgroup_tree.root.parent is None assert_equal(subgroup_tree, expected_subtree) + def test_open_datatree_group_glob(self, tmpdir) -> None: + original_dt = DataTree.from_dict( + { + "/": xr.Dataset({"root_var": 1}), + "/A": xr.Dataset({"a_var": 2}), + "/A/sweep_0": xr.Dataset({"data": ("x", [1, 2])}), + "/A/sweep_1": xr.Dataset({"data": ("x", [3, 4])}), + "/B": xr.Dataset({"b_var": 3}), + "/B/sweep_0": xr.Dataset({"data": ("x", [5, 6])}), + } + ) + filepath = tmpdir / "glob_test.nc" + original_dt.to_netcdf(filepath, engine=self.engine) + + with 
open_datatree(filepath, group="*/sweep_0", engine=self.engine) as tree: + paths = {node.path for node in tree.subtree} + assert "/A/sweep_0" in paths + assert "/B/sweep_0" in paths + assert "/A/sweep_1" not in paths + + def test_open_datatree_group_glob_no_match(self, tmpdir) -> None: + original_dt = DataTree.from_dict( + { + "/": xr.Dataset({"root_var": 1}), + "/A": xr.Dataset({"a_var": 2}), + } + ) + filepath = tmpdir / "glob_nomatch.nc" + original_dt.to_netcdf(filepath, engine=self.engine) + + with open_datatree(filepath, group="*/nonexistent", engine=self.engine) as tree: + paths = {node.path for node in tree.subtree} + assert paths == {"/"} + + def test_open_datatree_group_glob_preserves_data(self, tmpdir) -> None: + original_dt = DataTree.from_dict( + { + "/": xr.Dataset({"root_var": 1}), + "/A": xr.Dataset({"a_var": 2}), + "/A/sweep_0": xr.Dataset({"data": ("x", [1, 2])}), + } + ) + filepath = tmpdir / "glob_data.nc" + original_dt.to_netcdf(filepath, engine=self.engine) + + with open_datatree(filepath, group="*/sweep_0", engine=self.engine) as tree: + assert tree["/A"].dataset["a_var"].item() == 2 + np.testing.assert_array_equal( + tree["/A/sweep_0"].dataset["data"].values, [1, 2] + ) + + def test_open_groups_group_glob(self, tmpdir) -> None: + original_dt = DataTree.from_dict( + { + "/": xr.Dataset({"root_var": 1}), + "/A": xr.Dataset({"a_var": 2}), + "/A/sweep_0": xr.Dataset({"data": ("x", [1, 2])}), + "/A/sweep_1": xr.Dataset({"data": ("x", [3, 4])}), + } + ) + filepath = tmpdir / "glob_groups.nc" + original_dt.to_netcdf(filepath, engine=self.engine) + + groups = open_groups(filepath, group="*/sweep_0", engine=self.engine) + try: + assert "/" in groups + assert "/A" in groups + assert "/A/sweep_0" in groups + assert "/A/sweep_1" not in groups + finally: + for ds in groups.values(): + ds.close() + + def test_open_datatree_glob_char_class_escape_literal_metachar( + self, tmpdir + ) -> None: + # Groups whose names contain glob metacharacters (*, ?, [) are + 
# reachable by character-class escaping (e.g. "[*]" matches a + # literal "*"), mirroring fnmatch / PurePath.match semantics. + original_dt = DataTree.from_dict( + { + "/": xr.Dataset({"root_var": 1}), + "/group_*_01": xr.Dataset({"data": ("x", [1, 2])}), + "/group_*_02": xr.Dataset({"data": ("x", [3, 4])}), + "/group_?_01": xr.Dataset({"data": ("x", [5, 6])}), + "/plain_01": xr.Dataset({"data": ("x", [7, 8])}), + } + ) + filepath = tmpdir / "glob_escape.nc" + original_dt.to_netcdf(filepath, engine=self.engine) + + # Escape `*` as `[*]` — match only the literal-star group ending in _01. + with open_datatree(filepath, group="group_[*]_01", engine=self.engine) as tree: + paths = {node.path for node in tree.subtree} + assert "/group_*_01" in paths + assert "/group_*_02" not in paths + assert "/group_?_01" not in paths + + # Escape `*` as `[*]` + `*` — match both literal-star groups. + with open_datatree(filepath, group="group_[*]_*", engine=self.engine) as tree: + paths = {node.path for node in tree.subtree} + assert "/group_*_01" in paths + assert "/group_*_02" in paths + assert "/group_?_01" not in paths + assert "/plain_01" not in paths + + # Escape `?` as `[?]` — match only the literal-? group. 
+ with open_datatree(filepath, group="group_[?]_01", engine=self.engine) as tree: + paths = {node.path for node in tree.subtree} + assert "/group_?_01" in paths + assert "/group_*_01" not in paths + @requires_h5netcdf_or_netCDF4 class TestGenericNetCDFIO(NetCDFIOBase): @@ -1025,6 +1137,113 @@ def test_open_datatree_specific_group( assert subgroup_tree.root.parent is None assert_equal(subgroup_tree, expected_subtree) + def test_open_datatree_group_glob(self, tmpdir, zarr_format) -> None: + original_dt = DataTree.from_dict( + { + "/": xr.Dataset({"root_var": 1}), + "/A": xr.Dataset({"a_var": 2}), + "/A/sweep_0": xr.Dataset({"data": ("x", [1, 2])}), + "/A/sweep_1": xr.Dataset({"data": ("x", [3, 4])}), + "/B": xr.Dataset({"b_var": 3}), + "/B/sweep_0": xr.Dataset({"data": ("x", [5, 6])}), + } + ) + filepath = str(tmpdir / "glob_test.zarr") + original_dt.to_zarr(filepath, zarr_format=zarr_format) + + with open_datatree(filepath, group="*/sweep_0", engine=self.engine) as tree: + paths = {node.path for node in tree.subtree} + assert "/A/sweep_0" in paths + assert "/B/sweep_0" in paths + assert "/A/sweep_1" not in paths + + def test_open_datatree_group_glob_no_match(self, tmpdir, zarr_format) -> None: + original_dt = DataTree.from_dict( + { + "/": xr.Dataset({"root_var": 1}), + "/A": xr.Dataset({"a_var": 2}), + } + ) + filepath = str(tmpdir / "glob_nomatch.zarr") + original_dt.to_zarr(filepath, zarr_format=zarr_format) + + with open_datatree(filepath, group="*/nonexistent", engine=self.engine) as tree: + paths = {node.path for node in tree.subtree} + assert paths == {"/"} + + def test_open_groups_group_glob(self, tmpdir, zarr_format) -> None: + original_dt = DataTree.from_dict( + { + "/": xr.Dataset({"root_var": 1}), + "/A": xr.Dataset({"a_var": 2}), + "/A/sweep_0": xr.Dataset({"data": ("x", [1, 2])}), + "/A/sweep_1": xr.Dataset({"data": ("x", [3, 4])}), + } + ) + filepath = str(tmpdir / "glob_groups.zarr") + original_dt.to_zarr(filepath, zarr_format=zarr_format) + + groups 
= open_groups(filepath, group="*/sweep_0", engine=self.engine) + try: + assert "/" in groups + assert "/A" in groups + assert "/A/sweep_0" in groups + assert "/A/sweep_1" not in groups + finally: + for ds in groups.values(): + ds.close() + + def test_open_datatree_glob_char_class_escape_literal_metachar( + self, zarr_format + ) -> None: + # In-memory store: Windows disallows "*" and "?" in directory names. + from zarr.storage import MemoryStore + + original_dt = DataTree.from_dict( + { + "/": xr.Dataset({"root_var": 1}), + "/group_*_01": xr.Dataset({"data": ("x", [1, 2])}), + "/group_*_02": xr.Dataset({"data": ("x", [3, 4])}), + "/group_?_01": xr.Dataset({"data": ("x", [5, 6])}), + "/plain_01": xr.Dataset({"data": ("x", [7, 8])}), + } + ) + store = MemoryStore() + original_dt.to_zarr(store, zarr_format=zarr_format) + + with open_datatree( + store, # type: ignore[arg-type] + group="group_[*]_01", + engine=self.engine, + zarr_format=zarr_format, + ) as tree: + paths = {node.path for node in tree.subtree} + assert "/group_*_01" in paths + assert "/group_*_02" not in paths + assert "/group_?_01" not in paths + + with open_datatree( + store, # type: ignore[arg-type] + group="group_[*]_*", + engine=self.engine, + zarr_format=zarr_format, + ) as tree: + paths = {node.path for node in tree.subtree} + assert "/group_*_01" in paths + assert "/group_*_02" in paths + assert "/group_?_01" not in paths + assert "/plain_01" not in paths + + with open_datatree( + store, # type: ignore[arg-type] + group="group_[?]_01", + engine=self.engine, + zarr_format=zarr_format, + ) as tree: + paths = {node.path for node in tree.subtree} + assert "/group_?_01" in paths + assert "/group_*_01" not in paths + @requires_dask def test_open_groups_chunks(self, tmpdir, zarr_format) -> None: """Test `open_groups` with chunks on a zarr store.""" @@ -1142,3 +1361,90 @@ def test_zarr_engine_recognised(self, tmpdir, zarr_format) -> None: with open_datatree(filepath) as roundtrip_dt: 
assert_identical(original_dt, roundtrip_dt) + + +class TestGlobPatternUtilities: + def test_is_glob_pattern(self) -> None: + from xarray.backends.common import _is_glob_pattern + + assert _is_glob_pattern("*/sweep_0") + assert _is_glob_pattern("VCP-34/sweep_[01]") + assert _is_glob_pattern("sweep_?") + assert not _is_glob_pattern("VCP-34") + assert not _is_glob_pattern("/group/subgroup") + + def test_filter_group_paths(self) -> None: + from xarray.backends.common import _filter_group_paths + + paths = ["/", "/A", "/A/sweep_0", "/A/sweep_1", "/B", "/B/sweep_0"] + result = _filter_group_paths(paths, "*/sweep_0") + assert result == ["/", "/A", "/A/sweep_0", "/B", "/B/sweep_0"] + + def test_filter_group_paths_no_match(self) -> None: + from xarray.backends.common import _filter_group_paths + + paths = ["/", "/A", "/B"] + result = _filter_group_paths(paths, "*/nonexistent") + assert result == ["/"] + + def test_filter_group_paths_question_mark(self) -> None: + from xarray.backends.common import _filter_group_paths + + paths = ["/", "/A", "/B", "/AB"] + result = _filter_group_paths(paths, "?") + assert result == ["/", "/A", "/B"] + + def test_filter_group_paths_bracket(self) -> None: + from xarray.backends.common import _filter_group_paths + + paths = ["/", "/A", "/A/sweep_0", "/A/sweep_1", "/A/sweep_2"] + result = _filter_group_paths(paths, "*/sweep_[01]") + assert result == ["/", "/A", "/A/sweep_0", "/A/sweep_1"] + + def test_filter_group_paths_literal_metachar_via_char_class(self) -> None: + from xarray.backends.common import _filter_group_paths + + # Groups whose names literally contain glob metacharacters are + # reachable via character-class escaping (inherited from + # fnmatch / PurePath.match semantics). 
+ paths = ["/", "/group_*_01", "/group_*_02", "/group_?_01", "/plain_01"] + + # "[*]" matches a literal "*" + assert _filter_group_paths(paths, "group_[*]_01") == [ + "/", + "/group_*_01", + ] + assert _filter_group_paths(paths, "group_[*]_*") == [ + "/", + "/group_*_01", + "/group_*_02", + ] + # "[?]" matches a literal "?" + assert _filter_group_paths(paths, "group_[?]_01") == [ + "/", + "/group_?_01", + ] + + def test_resolve_group_and_filter_none(self) -> None: + from xarray.backends.common import _resolve_group_and_filter + + paths = ["/", "/A"] + effective, filtered = _resolve_group_and_filter(None, paths) + assert effective is None + assert filtered == paths + + def test_resolve_group_and_filter_literal(self) -> None: + from xarray.backends.common import _resolve_group_and_filter + + paths = ["/", "/A"] + effective, filtered = _resolve_group_and_filter("A", paths) + assert effective == "A" + assert filtered == paths + + def test_resolve_group_and_filter_glob(self) -> None: + from xarray.backends.common import _resolve_group_and_filter + + paths = ["/", "/A", "/A/sweep_0", "/A/sweep_1", "/B", "/B/sweep_0"] + effective, filtered = _resolve_group_and_filter("*/sweep_0", paths) + assert effective is None + assert filtered == ["/", "/A", "/A/sweep_0", "/B", "/B/sweep_0"]