diff --git a/.conda-recipe/meta.yaml b/.conda-recipe/meta.yaml index 83678be..4a75a5c 100644 --- a/.conda-recipe/meta.yaml +++ b/.conda-recipe/meta.yaml @@ -23,6 +23,8 @@ requirements: - numpy - scipy - ruamel_yaml + - pandas + - pyarrow - pyyaml - pathlib # [py2k] - enum34 # [py2k] @@ -34,6 +36,8 @@ test: - pytest - pytest-benchmark - h5py + - pyarrow + - pandas - six - coverage - codecov diff --git a/exdir/__init__.py b/exdir/__init__.py index 2092ccb..16e7125 100644 --- a/exdir/__init__.py +++ b/exdir/__init__.py @@ -1,7 +1,10 @@ from . import core from . import plugin_interface from . import plugins -from .core import File, validation, Attribute, Dataset, Group, Raw, Object +from .core import ( + File, validation, Attribute, Dataset, Group, Raw, Object, SoftLink, + ExternalLink +) # TODO remove versioneer from ._version import get_versions diff --git a/exdir/core/__init__.py b/exdir/core/__init__.py index 474ff48..11caae6 100644 --- a/exdir/core/__init__.py +++ b/exdir/core/__init__.py @@ -11,3 +11,4 @@ from .dataset import Dataset from .group import Group from .raw import Raw +from .links import SoftLink, ExternalLink diff --git a/exdir/core/constants.py b/exdir/core/constants.py index eb833aa..8edda4f 100644 --- a/exdir/core/constants.py +++ b/exdir/core/constants.py @@ -2,6 +2,15 @@ EXDIR_METANAME = "exdir" TYPE_METANAME = "type" VERSION_METANAME = "version" +LINK_METANAME = "link" +TARGET_METANAME = "target" + +#links +LINK_TYPENAME = "link" +LINK_TARGETNAME = "target" +LINK_EXTERNALNAME = "external" +LINK_SOFTNAME = "soft" +LINK_FILENAME = "file" # filenames META_FILENAME = "exdir.yaml" diff --git a/exdir/core/dataset.py b/exdir/core/dataset.py index 9fe36fd..78dbe18 100644 --- a/exdir/core/dataset.py +++ b/exdir/core/dataset.py @@ -1,10 +1,16 @@ import numbers import numpy as np +import pyarrow.feather as feather +import pandas as pd import exdir from . 
import exdir_object as exob from .mode import assert_file_open, OpenMode, assert_file_writable +NUMPY_SUFFIX = '.npy' +FEATHER_SUFFIX = '.feather' + + def _prepare_write(data, plugins, attrs, meta): for plugin in plugins: dataset_data = exdir.plugin_interface.DatasetData( @@ -25,7 +31,14 @@ def _prepare_write(data, plugins, attrs, meta): def _dataset_filename(dataset_directory): - return dataset_directory / "data.npy" + base = dataset_directory / "data" + if base.with_suffix(FEATHER_SUFFIX).exists(): + filename = base.with_suffix(FEATHER_SUFFIX) + is_numpy = False + else: + filename = base.with_suffix(NUMPY_SUFFIX) + is_numpy = True + return filename, is_numpy class Dataset(exob.Object): @@ -44,9 +57,8 @@ def __init__(self, root_directory, parent_path, object_name, file): object_name=object_name, file=file ) - self._data_memmap = None + self._data_loaded = None self.plugin_manager = file.plugin_manager - self.data_filename = str(_dataset_filename(self.directory)) def __getitem__(self, args): assert_file_open(self.file) @@ -75,9 +87,8 @@ def __getitem__(self, args): meta = self.meta.to_dict() atts = self.attrs.to_dict() - dataset_data = exdir.plugin_interface.DatasetData(data=values, - attrs=self.attrs.to_dict(), - meta=meta) + dataset_data = exdir.plugin_interface.DatasetData( + data=values, attrs=self.attrs.to_dict(), meta=meta) for plugin in plugins: dataset_data = plugin.prepare_read(dataset_data) @@ -87,6 +98,7 @@ def __getitem__(self, args): def __setitem__(self, args, value): assert_file_writable(self.file) + data_filename, is_numpy = _dataset_filename(self.directory) value, attrs, meta = _prepare_write( data=value, @@ -94,14 +106,23 @@ def __setitem__(self, args, value): attrs=self.attrs.to_dict(), meta=self.meta.to_dict() ) - self._data[args] = value + if is_numpy: + self._data[args] = value + else: + self._data[args] = value + self.flush() self.attrs = attrs self.meta._set_data(meta) + def flush(self): + self.data = self._data + def _reload_data(self): 
assert_file_open(self.file) + data_filename, is_numpy = _dataset_filename(self.directory) for plugin in self.plugin_manager.dataset_plugins.write_order: - plugin.before_load(self.data_filename) + plugin.before_load(str(data_filename)) + if self.file.io_mode == OpenMode.READ_ONLY: mmap_mode = "r" @@ -109,35 +130,54 @@ def _reload_data(self): mmap_mode = "r+" try: - self._data_memmap = np.load(self.data_filename, mmap_mode=mmap_mode, allow_pickle=False) + if is_numpy: + self._data_loaded = np.load( + str(data_filename), + mmap_mode=mmap_mode, allow_pickle=False) + else: + self._data_loaded = feather.read_feather(str(data_filename)) self.file._open_datasets[self.name] = self except ValueError as e: - # Could be that it is a Git LFS file. Let's see if that is the case and warn if so. - with open(self.data_filename, "r") as f: + # np.load raises ValueError when pickling would be required; + # in that case a DataFrame may be the better fit + + # Could be that it is a Git LFS file. + # Let's see if that is the case and warn if so. + with open(str(data_filename), "r") as f: test_string = "version https://git-lfs.github.com/spec/v1" contents = f.read(len(test_string)) if contents == test_string: raise IOError("The file '{}' is a Git LFS placeholder. " - "Open the the Exdir File with the Git LFS plugin or run " - "`git lfs fetch` first. ".format(self.data_filename)) + "Open the Exdir File with the Git LFS plugin or run" + " `git lfs fetch` first. 
".format(str(data_filename))) else: raise e def _reset_data(self, value, attrs, meta): assert_file_open(self.file) - self._data_memmap = np.lib.format.open_memmap( - self.data_filename, - mode="w+", - dtype=value.dtype, - shape=value.shape - ) - - if len(value.shape) == 0: - # scalars need to be set with itemset - self._data_memmap.itemset(value) + data_filename, _ = _dataset_filename(self.directory) + if isinstance(value, pd.DataFrame): + feather.write_feather( + value, str(data_filename.with_suffix(FEATHER_SUFFIX))) + if data_filename.with_suffix(NUMPY_SUFFIX).exists(): + data_filename.with_suffix(NUMPY_SUFFIX).unlink() else: - # replace the contents with the value - self._data_memmap[:] = value + self._data_loaded = np.lib.format.open_memmap( + str(data_filename.with_suffix(NUMPY_SUFFIX)), + mode="w+", + dtype=value.dtype, + shape=value.shape + ) + + if len(value.shape) == 0: + # scalars need to be set with itemset + self._data_loaded.itemset(value) + else: + # replace the contents with the value + self._data_loaded[:] = value + + if data_filename.with_suffix(FEATHER_SUFFIX).exists(): + data_filename.with_suffix(FEATHER_SUFFIX).unlink() # update attributes and plugin metadata if attrs: @@ -177,7 +217,7 @@ def data(self): @data.setter def data(self, value): assert_file_open(self.file) - if self._data.shape != value.shape or self._data.dtype != value.dtype: + if isinstance(value, pd.DataFrame): value, attrs, meta = _prepare_write( data=value, plugins=self.plugin_manager.dataset_plugins.write_order, @@ -185,9 +225,22 @@ def data(self, value): meta=self.meta.to_dict() ) self._reset_data(value, attrs, meta) - return - - self[:] = value + else: + if hasattr(self._data, 'dtype'): + new_dtype = self._data.dtype != value.dtype + else: + new_dtype = True # changing from feather to numpy + if self._data.shape != value.shape or new_dtype: + value, attrs, meta = _prepare_write( + data=value, + plugins=self.plugin_manager.dataset_plugins.write_order, + 
attrs=self.attrs.to_dict(), + meta=self.meta.to_dict() + ) + self._reset_data(value, attrs, meta) + return + + self[:] = value @property def shape(self): @@ -261,8 +314,8 @@ def __iter__(self): if len(self.shape) == 0: raise TypeError("Can't iterate over a scalar dataset") - for i in range(self.shape[0]): - yield self[i] + for val in self.data: + yield val def __str__(self): return self.data.__str__() @@ -270,12 +323,12 @@ def __str__(self): def __repr__(self): if self.file.io_mode == OpenMode.FILE_CLOSED: return "" - return "".format( - self.name, self.shape, self.dtype) + return "".format( + self.name, self.shape) @property def _data(self): assert_file_open(self.file) - if self._data_memmap is None: + if self._data_loaded is None: self._reload_data() - return self._data_memmap + return self._data_loaded diff --git a/exdir/core/exdir_file.py b/exdir/core/exdir_file.py index 82a07d9..cec16e3 100644 --- a/exdir/core/exdir_file.py +++ b/exdir/core/exdir_file.py @@ -71,7 +71,7 @@ class File(Group): def __init__(self, directory, mode=None, allow_remove=False, name_validation=None, plugins=None): self._open_datasets = weakref.WeakValueDictionary({}) - directory = pathlib.Path(directory) #.resolve() + directory = pathlib.Path(directory).absolute() #.resolve() if directory.suffix != ".exdir": directory = directory.with_suffix(directory.suffix + ".exdir") self.user_mode = mode = mode or 'a' @@ -170,8 +170,8 @@ def close(self): # there are no way to close the memmap other than deleting all # references to it, thus try: - data_set._data_memmap.flush() - data_set._data_memmap.setflags(write=False) # TODO does not work + data_set._data_loaded.flush() + data_set._data_loaded.setflags(write=False) # TODO does not work except AttributeError: pass # force garbage collection to clean weakrefs @@ -220,6 +220,12 @@ def __getitem__(self, name): return self return super(File, self).__getitem__(path) + def __setitem__(self, name, value): + path = utils.path.remove_root(name) + if 
len(path.parts) < 1: + return self + return super(File, self).__setitem__(path, value) + def __contains__(self, name): path = utils.path.remove_root(name) return super(File, self).__contains__(path) diff --git a/exdir/core/exdir_object.py b/exdir/core/exdir_object.py index a4b4bcf..e6994a0 100644 --- a/exdir/core/exdir_object.py +++ b/exdir/core/exdir_object.py @@ -112,7 +112,8 @@ def is_nonraw_object_directory(directory): return False if TYPE_METANAME not in meta_data[EXDIR_METANAME]: return False - valid_types = [DATASET_TYPENAME, FILE_TYPENAME, GROUP_TYPENAME] + valid_types = [ + DATASET_TYPENAME, FILE_TYPENAME, GROUP_TYPENAME, LINK_METANAME] if meta_data[EXDIR_METANAME][TYPE_METANAME] not in valid_types: return False return True diff --git a/exdir/core/group.py b/exdir/core/group.py index 53be5e6..f5ecc08 100644 --- a/exdir/core/group.py +++ b/exdir/core/group.py @@ -9,6 +9,7 @@ raise e import numpy as np import exdir +import pandas as pd try: import ruamel_yaml as yaml except ImportError: @@ -19,12 +20,15 @@ import collections as abc from .exdir_object import Object +from .links import Link, SoftLink, ExternalLink from .mode import assert_file_open, OpenMode, assert_file_writable from . import exdir_object as exob +from . import exdir_file as exfile from . import dataset as ds from . import raw from .. import utils + def _data_to_shape_and_dtype(data, shape, dtype): if data is not None: if shape is None: @@ -36,6 +40,7 @@ def _data_to_shape_and_dtype(data, shape, dtype): dtype = np.float32 return shape, dtype + def _assert_data_shape_dtype_match(data, shape, dtype): if data is not None: if shape is not None and np.product(shape) != np.product(data.shape): @@ -53,6 +58,7 @@ def _assert_data_shape_dtype_match(data, shape, dtype): ) return + class Group(Object): """ Container of other groups and datasets. 
@@ -124,22 +130,33 @@ def create_dataset(self, name, shape=None, dtype=None, meta=exob._default_metadata(exob.DATASET_TYPENAME) ) - _assert_data_shape_dtype_match(prepared_data, shape, dtype) + if not isinstance(data, pd.DataFrame): - shape, dtype = _data_to_shape_and_dtype(prepared_data, shape, dtype) + _assert_data_shape_dtype_match(prepared_data, shape, dtype) - if prepared_data is not None: - if shape is not None and prepared_data.shape != shape: - prepared_data = np.reshape(prepared_data, shape) - else: - if shape is None: - prepared_data = None - else: - fillvalue = fillvalue or 0.0 - prepared_data = np.full(shape, fillvalue, dtype=dtype) + shape, dtype = _data_to_shape_and_dtype(prepared_data, shape, dtype) - if prepared_data is None: - raise TypeError("Could not create a meaningful dataset.") + if prepared_data is not None: + if shape is not None and prepared_data.shape != shape: + prepared_data = np.reshape(prepared_data, shape) + else: + if shape is None: + prepared_data = None + else: + fillvalue = fillvalue or 0.0 + prepared_data = np.full(shape, fillvalue, dtype=dtype) + + if prepared_data is None: + raise TypeError("Could not create a meaningful dataset.") + else: + if dtype is not None: + raise NotImplementedError( + 'We currently do not support forcing dtype on creating with' + ' DataFrames.') + if shape is not None: + raise NotImplementedError( + 'We currently do not support reshape on creating with ' + 'DataFrames.') dataset_directory = self.directory / name exob._create_object_directory(dataset_directory, meta) @@ -306,30 +323,36 @@ def require_dataset(self, name, shape=None, dtype=None, exact=False, # TODO verify proper attributes - - _assert_data_shape_dtype_match(data, shape, dtype) - shape, dtype = _data_to_shape_and_dtype(data, shape, dtype) - - if not np.array_equal(shape, current_object.shape): - raise TypeError( - "Shapes do not match (existing {} vs " - "new {})".format(current_object.shape, shape) - ) - - if dtype != 
current_object.dtype: - if exact: + if any(isinstance(a, pd.DataFrame) for a in [data, current_object.data]): + if not isinstance(data, type(current_object.data)): + raise IOError( + 'Not allowed to require different data instance with' + ' (existing {} vs new {}), set data if a change is desired.' + ''.format(type(current_object.data), type(data))) + if not isinstance(data, pd.DataFrame): + _assert_data_shape_dtype_match(data, shape, dtype) + shape, dtype = _data_to_shape_and_dtype(data, shape, dtype) + + if not np.array_equal(shape, current_object.shape): raise TypeError( - "Datatypes do not exactly match " - "existing {} vs new {})".format(current_object.dtype, dtype) + "Shapes do not match (existing {} vs " + "new {})".format(current_object.shape, shape) ) - if not np.can_cast(dtype, current_object.dtype): - raise TypeError( - "Cannot safely cast from {} to {}".format( - dtype, - current_object.dtype + if dtype != current_object.dtype: + if exact: + raise TypeError( + "Datatypes do not exactly match " + "existing {} vs new {})".format(current_object.dtype, dtype) + ) + + if not np.can_cast(dtype, current_object.dtype): + raise TypeError( + "Cannot safely cast from {} to {}".format( + dtype, + current_object.dtype + ) ) - ) return current_object @@ -403,6 +426,8 @@ def __getitem__(self, name): return self._dataset(name) elif meta_data[exob.EXDIR_METANAME][exob.TYPE_METANAME] == exob.GROUP_TYPENAME: return self._group(name) + elif meta_data[exob.EXDIR_METANAME][exob.TYPE_METANAME] == exob.LINK_TYPENAME: + return self._link(name) else: error_string = ( "Object {name} has data type {type}.\n" @@ -413,6 +438,25 @@ ) raise NotImplementedError(error_string) + def _link(self, name, get_link=False): + link_meta = self._group(name).meta[exob.EXDIR_METANAME][exob.LINK_METANAME] + # link type in the metadata decides soft vs external resolution + if link_meta[exob.TYPE_METANAME] == exob.LINK_SOFTNAME: + if get_link: + result = SoftLink(link_meta[exob.LINK_TARGETNAME]) + else: + result = 
self[link_meta[exob.LINK_TARGETNAME]] + elif link_meta[exob.TYPE_METANAME] == exob.LINK_EXTERNALNAME: + if get_link: + result = ExternalLink( + link_meta[exob.LINK_FILENAME], + link_meta[exob.LINK_TARGETNAME]) + else: + external_file = exfile.File( + link_meta[exob.LINK_FILENAME], 'r') + result = external_file[link_meta[exob.LINK_TARGETNAME]] + return result + def _dataset(self, name): return ds.Dataset( root_directory=self.root_directory, @@ -439,6 +483,13 @@ def __setitem__(self, name, value): self[path.parent][path.name] = value return + if isinstance(value, Link): + link_group = self.create_group(name) + # if value.path not in self.file: + # return # TODO works when merging with lepmik/close + link_group.meta[exob.EXDIR_METANAME].update(value._link) + return + if name not in self: self.create_dataset(name, data=value) return @@ -447,8 +498,7 @@ def __setitem__(self, name, value): raise RuntimeError( "Unable to assign value, {} already exists".format(name) ) - - self[name].value = value + self[name].data = value def __delitem__(self, name): """ @@ -509,20 +559,22 @@ def __len__(self): assert_file_open(self.file) return len([a for a in self]) - def get(self, key): + def get(self, name, get_link=False): """ Get an object in the group. Parameters ---------- - key : str - The key of the desired object + name : str + The name of the desired object Returns ------- Value or None if object does not exist. """ assert_file_open(self.file) - if key in self: - return self[key] + if name in self: + if get_link: + return self._link(name, get_link) + return self[name] else: return None diff --git a/exdir/core/links.py b/exdir/core/links.py new file mode 100644 index 0000000..dbe0312 --- /dev/null +++ b/exdir/core/links.py @@ -0,0 +1,69 @@ +try: + import pathlib +except ImportError as e: + try: + import pathlib2 as pathlib + except ImportError: + raise e +from . 
import exdir_file +from .exdir_object import Object, is_nonraw_object_directory +from .constants import * + + +class Link(Object): + """ + Super class for link objects + """ + def __init__(self, path): + self.path = path + + @property + def _link(self): + return {TYPE_METANAME: LINK_TYPENAME} + + def __eq__(self, other): + return self._link.get(LINK_METANAME) == other._link.get(LINK_METANAME) + + +class SoftLink(Link): + def __init__(self, path): + super(SoftLink, self).__init__( + path=path + ) + + @property + def _link(self): + result = { + TYPE_METANAME: LINK_TYPENAME, + LINK_METANAME: { + TYPE_METANAME: LINK_SOFTNAME, + LINK_TARGETNAME: self.path + } + } + return result + + def __repr__(self): + return "Exdir SoftLink '{}' at {}".format(self.path, id(self)) + + +class ExternalLink(Link): + def __init__(self, filename, path): + super(ExternalLink, self).__init__( + path=path + ) + self.filename = filename + + @property + def _link(self): + result = { + TYPE_METANAME: LINK_TYPENAME, + LINK_METANAME: { + TYPE_METANAME: LINK_EXTERNALNAME, + LINK_TARGETNAME: self.path, + LINK_FILENAME: str(self.filename) + } + } + return result + + def __repr__(self): + return "Exdir ExternalLink '{}' at {}".format(self.path, id(self)) diff --git a/tests/test_dataframe.py b/tests/test_dataframe.py new file mode 100644 index 0000000..d312da7 --- /dev/null +++ b/tests/test_dataframe.py @@ -0,0 +1,322 @@ +# -*- coding: utf-8 -*- + +# This file is part of Exdir, the Experimental Directory Structure. +# +# Copyright 2017 Simen Tennøe +# +# License: MIT, see "LICENSE" file for the full license terms. +# +# This file contains code from h5py, a Python interface to the HDF5 library, +# licensed under a standard 3-clause BSD license +# with copyright Andrew Collette and contributors. +# See http://www.h5py.org and the "3rdparty/h5py-LICENSE" file for details. 
+ + +import pytest +import numpy as np +import pandas as pd +import os + +from exdir.core import Attribute, File, Dataset + +# TODO add the code below for testing true equality when parallelizing +# def __eq__(self, other): +# self[:] +# if isinstance(other, self.__class__): +# other[:] +# if self.__dict__.keys() != other.__dict__.keys(): +# return False +# +# for key in self.__dict__: +# if key == "_data": +# if not np.array_equal(self.__dict__["_data"], other.__dict__["_data"]): +# return False +# else: +# if self.__dict__[key] != other.__dict__[key]: +# return False +# return True +# else: +# return False + + +# NOTE feather converts integer column names to str +def dataframe_equal(orig_df, new_df): + columns = [] + for col1, col2 in zip(orig_df.columns, new_df.columns): + try: + columns.append(int(col2)==int(col1)) + except: + columns.append(col2==col1) + index = [] + for row1, row2 in zip(orig_df.index, new_df.index): + try: + index.append(int(row2)==int(row1)) + except: + index.append(row2==row1) + result = ( + np.array_equal(orig_df.values, new_df.values) and + all(columns) and + all(index) + ) + return result + + +def test_create_empty(setup_teardown_file): + """Create a scalar dataset.""" + f = setup_teardown_file[3] + grp = f.create_group("test") + data = pd.DataFrame([]) + dset = grp.create_dataset('foo', data=data) + assert dset.shape == (0,0) + assert dset.data.equals(data) + + +def test_create_scalar(setup_teardown_file): + """Create a size-1 dataset.""" + f = setup_teardown_file[3] + grp = f.create_group("test") + data = pd.DataFrame([1]) + dset = grp.create_dataset('foo', data=data) + assert dset.shape == (1,1) + assert dset.shape == dset.data.shape + assert dataframe_equal(data, dset.data) + + +def test_create_extended(setup_teardown_file): + """Create an extended dataset.""" + f = setup_teardown_file[3] + grp = f.create_group("test") + data = pd.DataFrame(np.arange(63)) + dset = grp.create_dataset('foo', data=data) + assert dset.shape == (63,1) + 
assert dset.size == 63 + + data = pd.DataFrame(np.zeros((6,10))) + dset = f.create_dataset('bar', data=data) + assert dset.shape == (6, 10) + assert dset.size == (60) + + +def test_no_dtype(setup_teardown_file): + """Confirm that datafram has no dtype.""" + f = setup_teardown_file[3] + grp = f.create_group("test") + + data = pd.DataFrame(np.zeros((6,10))) + dset = f.create_dataset('bar', data=data) + with pytest.raises(AttributeError): + dset.dtype + + +def test_no_dtype_create(setup_teardown_file): + """Confirm that one can force dtype """ + f = setup_teardown_file[3] + data = pd.DataFrame(np.zeros((6,10))) + with pytest.raises(NotImplementedError): + f.create_dataset('bar', data=data, dtype=np.int16) + + +def test_numpy_then_dataframe(setup_teardown_file): + """Confirm that datafram has no dtype.""" + f = setup_teardown_file[3] + grp = f.create_group("test") + + data = pd.DataFrame(np.zeros((6,10))) + dset = f.create_dataset('bar', (6,10)) + assert isinstance(f['bar'].data, np.ndarray) + f['bar'] = data + assert np.array_equal(f['bar'].data.values, data.values) + assert isinstance(f['bar'].data, pd.DataFrame) + assert not (dset.directory / 'data.npy').exists() + + +def test_datafram_then_numpy(setup_teardown_file): + """Confirm that datafram has no dtype.""" + f = setup_teardown_file[3] + grp = f.create_group("test") + + data = pd.DataFrame(np.zeros((6,10))) + dset = f.create_dataset('bar', data=data) + assert isinstance(f['bar'].data, pd.DataFrame) + f['bar'] = np.zeros((6,10)) + assert np.array_equal(f['bar'].data, np.zeros((6,10))) + assert isinstance(f['bar'].data, np.ndarray) + assert not (dset.directory / 'data.feather').exists() + + +def test_reshape(setup_teardown_file): + """Create from existing data, and make it fit a new shape.""" + f = setup_teardown_file[3] + grp = f.create_group("test") + + data = pd.DataFrame(np.zeros((6,10))) + with pytest.raises(NotImplementedError): + dset = grp.create_dataset('foo', shape=(10, 3), data=data) + + +# # Feature: 
Datasets can be created only if they don't exist in the file +def test_create(setup_teardown_file): + """Create new dataset with no conflicts.""" + f = setup_teardown_file[3] + grp = f.create_group("test") + + data = pd.DataFrame(np.zeros((10, 3))) + dset = grp.require_dataset('foo', (10, 3)) + assert isinstance(dset, Dataset) + assert dset.shape == (10, 3) + + with pytest.raises(RuntimeError): + grp.create_dataset('foo', (10, 3)) + + +def test_create_existing_shape_mismatch(setup_teardown_file): + """require_dataset yields existing dataset.""" + f = setup_teardown_file[3] + grp = f.create_group("test") + + data2 = pd.DataFrame(np.zeros((3, 10))) + data3 = pd.DataFrame(np.zeros((4, 11))) + dset2 = grp.require_dataset('bar', data=data2) + dset3 = grp.require_dataset('bar', data=data3) + assert isinstance(dset2, Dataset) + assert dataframe_equal(dset2.data, data2) + assert dataframe_equal(dset3.data, data2) + assert dset2 == dset3 + + +def test_create_existing_same_shape(setup_teardown_file): + """require_dataset yields existing dataset.""" + f = setup_teardown_file[3] + grp = f.create_group("test") + + data2 = pd.DataFrame((3, 10)) + data3 = pd.DataFrame((4, 11)) + dset2 = grp.require_dataset('bar', data=data2) + dset3 = grp.require_dataset('bar', data=data3) + assert isinstance(dset2, Dataset) + assert dataframe_equal(dset2.data, data2) + assert dataframe_equal(dset3.data, data2) + assert dset2 == dset3 + + +def test_create_existing_df_to_npy(setup_teardown_file): + """require_dataset yields existing dataset.""" + f = setup_teardown_file[3] + grp = f.create_group("test") + + data2 = pd.DataFrame((3, 10)) + data3 = np.zeros((1, 2)) + dset2 = grp.require_dataset('bar', data=data2) + with pytest.raises(IOError): + grp.require_dataset('bar', data=data3) + + +def test_create_existing_npy_to_df(setup_teardown_file): + """require_dataset yields existing dataset.""" + f = setup_teardown_file[3] + grp = f.create_group("test") + + data2 = np.zeros((1, 2)) + data3 = 
pd.DataFrame((3, 10)) + dset2 = grp.require_dataset('bar', data=data2) + with pytest.raises(IOError): + grp.require_dataset('bar', data=data3) + + +def test_compound(setup_teardown_file): + """Fill value works with compound types.""" + f = setup_teardown_file[3] + grp = f.create_group("test") + + dt = np.dtype([('a', 'f4'), ('b', 'i8')]) + v = np.ones((1,), dtype=dt) + data = pd.DataFrame(v) + dset = grp.create_dataset('foo', data=data) + assert dataframe_equal(dset.data, data) + + +def test_variable_length_string_numpy(setup_teardown_file): + """Assignment of variable-length byte string produces a fixed-length + ascii dataset """ + f = setup_teardown_file[3] + grp = f.create_group("test") + # unable to change length of string with setitem + data = np.array(['a', 'aa']) + dset = grp.create_dataset('foo', data=data) + dset[1] = 'aaaaaa' + assert np.array_equal(dset.data, data) + +# check that the setters actually change the file on disk +def test_variable_length_string_df(setup_teardown_file): + """Assignment of variable-length byte string produces a fixed-length + ascii dataset """ + f = setup_teardown_file[3] + grp = f.create_group("test") + # one needs object dtype in order to be able to change length of string with setitem + values = ['a', 'aa'] + data = pd.DataFrame(values).T + dset = grp.create_dataset('foo', data=data) + + dset['1'] = 'aaaaaa' + values[1] = 'aaaaaa' + assert np.array_equal(dset.data.values[0], values) + dset._reload_data() + assert np.array_equal(dset.data.values[0], values) + + +def test_flush(setup_teardown_file): + """flush() persists in-memory DataFrame edits to the + underlying file """ + f = setup_teardown_file[3] + grp = f.create_group("test") + # one needs object dtype in order to be able to change length of string with setitem + values = ['a', 'aa'] + data = pd.DataFrame(values).T + dset = grp.create_dataset('foo', data=data) + # using iloc changes data only in memory + values[1] = 'aaaaaa' + dset.data.iloc[0,1] = 'aaaaaa' + assert 
np.array_equal(dset.data.values[0], values) + dset._reload_data() + assert dataframe_equal(dset.data, data) + # flush saves data to file + dset.data.iloc[0,1] = 'aaaaaa' + dset.flush() + dset._reload_data() + assert np.array_equal(dset.data.values[0], values) + + +# Feature: Size of first axis is available via Python's len +def test_len(setup_teardown_file): + """len().""" + f = setup_teardown_file[3] + grp = f.create_group("test") + data = pd.DataFrame(np.zeros((3, 10))) + dset = grp.require_dataset('bar', data=data) + assert len(dset) == 3 + +# Feature: Iterating over a dataset yields index keys + +def test_iter(setup_teardown_file): + """Iterating over a dataset yields rows.""" + f = setup_teardown_file[3] + grp = f.create_group("test") + + values = np.arange(30, dtype='f').reshape((10, 3)) + data = pd.DataFrame(values) + dset = grp.create_dataset('foo', data=data) + for x, y in zip(dset, data): + assert x == str(y) # NOTE feather converts int names to str + + +def test_set_data(setup_teardown_file): + """Set data works correctly.""" + f = setup_teardown_file[3] + grp = f.create_group("test") + + testdata = pd.DataFrame(np.ones((10, 2))) + grp['testdata'] = testdata + outdata = grp['testdata'].data + assert dataframe_equal(testdata, outdata) + + grp['testdata'] = testdata diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 599d2ac..b16d3b8 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -171,6 +171,17 @@ def test_shape_conflict(setup_teardown_file): grp.require_dataset('foo', (10, 4), 'f') +def test_create_dtype_object(setup_teardown_file): + """Assignement of variable-length byte string produces a fixed-length + ascii dataset """ + f = setup_teardown_file[3] + grp = f.create_group("test") + # one needs object dtype in order to be able to change length of string with setitem + data = np.array(['aaaa', 'aaaaa'], dtype=object) + with pytest.raises(ValueError): + grp.create_dataset('foo', data=data) + + def 
test_type_confict(setup_teardown_file): """require_dataset with object type conflict yields TypeError.""" f = setup_teardown_file[3] diff --git a/tests/test_links.py b/tests/test_links.py new file mode 100644 index 0000000..c0b9386 --- /dev/null +++ b/tests/test_links.py @@ -0,0 +1,190 @@ +# -*- coding: utf-8 -*- + +# This file is part of Exdir, the Experimental Directory Structure. +# +# Copyright 2019 Mikkel Lepperød +# +# License: MIT, see "LICENSE" file for the full license terms. +# +# This file contains code from h5py, a Python interface to the HDF5 library, +# licensed under a standard 3-clause BSD license +# with copyright Andrew Collette and contributors. +# See http://www.h5py.org and the "3rdparty/h5py-LICENSE" file for details. + +from exdir import SoftLink, ExternalLink, File +import pytest +import numpy as np +try: + import ruamel_yaml as yaml +except ImportError: + import ruamel.yaml as yaml + + +def test_soft_links(setup_teardown_file): + """ Broken softlinks are contained, but their members are not """ + f = setup_teardown_file[3] + f.create_group('mongoose') + f.create_group('grp') + f['/grp/soft'] = SoftLink('/mongoose') + assert '/grp/soft' in f + assert '/grp/soft/something' not in f + + +def test_external_links(setup_teardown_file): + """ Broken softlinks are contained, but their members are not """ + f = setup_teardown_file[3] + g = File(setup_teardown_file[0] / 'mongoose.exdir', 'w') + g.create_group('mongoose') + f.create_group('grp') + f['/grp/external'] = ExternalLink('mongoose.exdir', '/mongoose') + assert '/grp/external' in f + assert '/grp/external/something' not in f + + +def test_get_link(setup_teardown_file): + """ Get link values """ + f = setup_teardown_file[3] + g = File(setup_teardown_file[0] / 'somewhere.exdir') + f.create_group('mongoose') + g.create_group('mongoose') + sl = SoftLink('/mongoose') + el = ExternalLink('somewhere.exdir', 'mongoose') + + f['soft'] = sl + f['external'] = el + + out_sl = f.get('soft', get_link=True) 
+ out_el = f.get('external', get_link=True) + + assert isinstance(out_sl, SoftLink) + assert out_sl == sl + assert isinstance(out_el, ExternalLink) + assert out_el == el + + +# Feature: Create and manage soft links with the high-level interface +def test_soft_path(setup_teardown_file): + """ SoftLink directory attribute """ + sl = SoftLink('/foo') + assert sl.path == '/foo' + + +def test_soft_repr(setup_teardown_file): + """ SoftLink path repr """ + sl = SoftLink('/foo') + assert isinstance(repr(sl), str) + + +def test_linked_group_equal(setup_teardown_file): + """ Create new soft link by assignment """ + f = setup_teardown_file[3] + g = f.create_group('new') + sl = SoftLink('/new') + f['alias'] = sl + g2 = f['alias'] + assert g == g2 + + +def test_exc(setup_teardown_file): + """ Opening dangling soft link results in KeyError """ + f = setup_teardown_file[3] + f['alias'] = SoftLink('new') + with pytest.raises(KeyError): + f['alias'] + + +# Feature: Create and manage external links +def test_external_path(setup_teardown_file): + """ External link paths attributes """ + external_path = setup_teardown_file[0] / 'foo.exdir' + g = File(external_path, 'w') + egrp = g.create_group('foo') + el = ExternalLink(external_path, '/foo') + assert el.filename == external_path + assert el.path == '/foo' + + +def test_external_repr(setup_teardown_file): + """ External link repr """ + external_path = setup_teardown_file[0] / 'foo.exdir' + g = File(external_path, 'w') + el = ExternalLink(external_path, '/foo') + assert isinstance(repr(el), str) + + +def test_create(setup_teardown_file): + """ Creating external links """ + external_path = setup_teardown_file[0] / 'foo.exdir' + f = setup_teardown_file[3] + g = File(external_path, 'w') + egrp = g.require_group('external') + f['ext'] = ExternalLink(external_path, '/external') + grp = f['ext'] + ef = grp.file + assert ef != f + assert grp.name == '/external' + + +def test_broken_external_link(setup_teardown_file): + """ KeyError raised 
when attempting to open broken link """ + external_path = setup_teardown_file[0] / 'foo.exdir' + f = setup_teardown_file[3] + g = File(external_path, 'w') + f['ext'] = ExternalLink(external_path, '/missing') + with pytest.raises(KeyError): + f['ext'] + + +def test_exc_missingfile(setup_teardown_file): + """ KeyError raised when attempting to open missing file """ + f = setup_teardown_file[3] + f['ext'] = ExternalLink('mongoose.exdir','/foo') + with pytest.raises(RuntimeError): + f['ext'] + + +def test_close_file(setup_teardown_file): + """ Files opened by accessing external links can be closed + """ + external_path = setup_teardown_file[0] / 'foo.exdir' + f = setup_teardown_file[3] + g = File(external_path, 'w') + f['ext'] = ExternalLink(external_path, '/') + grp = f['ext'] + f2 = grp.file + f2.close() + assert not f2 + +# TODO uncomment if we start accepting unicode names +# def test_unicode_encode(setup_teardown_file): +# """ +# Check that external links encode unicode filenames properly +# """ +# external_path = setup_teardown_file[0] / u"α.exdir" +# with File(external_path, "w") as ext_file: +# ext_file.create_group('external') +# f['ext'] = ExternalLink(external_path, '/external') +# +# +# def test_unicode_decode(setup_teardown_file): +# """ +# Check that external links decode unicode filenames properly +# """ +# external_path = setup_teardown_file[0] / u"α.exdir" +# with File(external_path, "w") as ext_file: +# ext_file.create_group('external') +# ext_file["external"].attrs["ext_attr"] = "test" +# f['ext'] = ExternalLink(external_path, '/external') +# assert f["ext"].attrs["ext_attr"] == "test" +# +# +# def test_unicode_exdir_path(setup_teardown_file): +# """ +# Check that external links handle unicode exdir paths properly +# """ +# external_path = setup_teardown_file[0] / u"external.exdir" +# with File(external_path, "w") as ext_file: +# ext_file.create_group(u'α') +# ext_file[u"α"].attrs["ext_attr"] = "test" +# f['ext'] = ExternalLink(external_path, u'/α') 
+# assertEqual(f["ext"].attrs["ext_attr"], "test")