Skip to content

Commit 3bc275b

Browse files
TheoGuyard, mathurinm, Badr-MOUFAD
authored
API abstraction layer to fetch datasets (#37)
Co-authored-by: mathurinm <[email protected]> Co-authored-by: Badr-MOUFAD <[email protected]>
1 parent db34488 commit 3bc275b

10 files changed

Lines changed: 563 additions & 431 deletions

File tree

README.rst

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,39 @@
11
|image0| |image1|
22

3-
A python util to fetch datasets from the LIBSVM website.
3+
A Python utility to fetch datasets from different databases.
44

5+
Currently supported databases are:
6+
7+
- LIBSVM (libsvm_)
58

69
Getting design matrix and target variable is as easy as:
710

811
::
912

10-
from libsvmdata import fetch_libsvm
11-
X, y = fetch_libsvm("news20.binary")
13+
from libsvmdata import fetch_dataset
14+
X, y = fetch_dataset("news20.binary")
15+
16+
Currently supported datasets are in ``libsvmdata.supported`` and can be displayed as:
1217

18+
::
1319

14-
Currently supported datasets are in ``libsvmdata.supported``.
20+
from libsvmdata import print_supported_datasets
21+
print_supported_datasets()
1522

23+
There is no need to specify the database name.
1624

17-
The datasets are saved in a subfolder ``libsvm`` inside ``libsvmdata.datasets.DATA_HOME``, whose value is:
25+
Files are saved under ``DATA_HOME/<database_name>``, where the value of ``DATA_HOME`` is:
1826

19-
- the environment variable LIBSVMDATA_HOME if it exists,
27+
- the environment variable ``LIBSVMDATA_HOME`` if it exists,
2028

21-
- else, the environment variable XDG_DATA_HOME if it exists,
29+
- else, the environment variable ``XDG_DATA_HOME`` if it exists,
2230

23-
- else, $HOME/data.
31+
- else, ``$HOME/data``.
2432

2533

2634

2735
.. |image0| image:: https://github.com/mathurinm/libsvmdata/actions/workflows/build.yml/badge.svg?branch=main
2836
:target: https://github.com/mathurinm/libsvmdata/actions/workflows/build.yml
2937
.. |image1| image:: https://codecov.io/gh/mathurinm/libsvmdata/branch/main/graphs/badge.svg?branch=main
3038
:target: https://codecov.io/gh/mathurinm/libsvmdata
39+
.. _libsvm: https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/

libsvmdata/__init__.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1-
from .datasets import fetch_libsvm, download_libsvm, NAMES # noqa
1+
from libsvmdata.datasets import fetch_libsvm, download_libsvm
2+
from libsvmdata.core import fetch_dataset, print_supported_datasets, ALL_DATASETS
23

3-
supported = list(NAMES.keys()) # noqa
4+
supported = list(ALL_DATASETS.keys())
45

56
__version__ = '0.5dev0'

libsvmdata/abstraction.py

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
import os
2+
import re
3+
import numpy as np
4+
from abc import ABC, abstractmethod
5+
from download import download
6+
from pathlib import Path
7+
from scipy import sparse
8+
9+
10+
def _get_data_home(subdir_name=""):
11+
"""
12+
Defines the data home folder. The top priority is the environment
13+
variable $LIBSVMDATA_HOME which is specific to this package. Otherwise, we
14+
seek for the variable $XDG_DATA_HOME. Finally, the fallback is $HOME/data.
15+
"""
16+
data_home = os.environ.get("LIBSVMDATA_HOME", None)
17+
if data_home is None:
18+
data_home = os.environ.get("XDG_DATA_HOME", None)
19+
if data_home is None:
20+
data_home = Path.home() / "data"
21+
return data_home / subdir_name
22+
23+
24+
class AbstractDataset(ABC):
    """Base class defining a dataset along with its fetching methods."""

    # In the derived class, __init__() must set the following attributes:
    dataset_name = None  # dataset name
    dataset_file = None  # dataset file (with potential extensions)
    dataset_dir = None  # subdirectory name (see _get_data_home())
    dataset_url = None  # dataset download url

    @abstractmethod
    def __init__(self):
        """
        In the derived class, this function must define the class attributes.
        It can also be used to pass additional information required in the
        function _load_file_and_save_data() of the derived class.
        """
        pass

    @abstractmethod
    def _load_file_and_save_data(self, raw_dataset_path, ext_dataset_path):
        """
        In the derived class, this function is responsible for the
        transformation of the raw dataset file into two .npy/.npz files
        containing the feature matrix X and the response vector/matrix y.
        These files must be named <self.dataset_name>_X.<npz/npy> and
        <self.dataset_name>_y.<npz/npy>. This function is also responsible
        for removing the raw dataset file when needed.
        """
        pass

    def _load_data(self, ext_dataset_path):
        """Load data from the extracted .npz/.npy files."""
        # X and y may each be stored either sparse (.npz) or dense (.npy):
        # try the sparse container first and fall back to the dense one.
        try:
            X = sparse.load_npz(str(ext_dataset_path) + "_X.npz")
        except FileNotFoundError:
            X = np.load(str(ext_dataset_path) + "_X.npy")

        try:
            y = sparse.load_npz(str(ext_dataset_path) + "_y.npz")
        except FileNotFoundError:
            y = np.load(str(ext_dataset_path) + "_y.npy")

        return X, y

    def get_X_y(self, replace=False, verbose=False):
        """
        Load a dataset as matrix X and vector y. If X and y already exist as
        .npz and/or .npy files, they are not redownloaded, unless
        replace=True.
        """
        raw_dataset_path = self.dataset_dir / self.dataset_file
        ext_dataset_path = self.dataset_dir / self.dataset_name

        # Check if the dataset already exists. The dataset name is escaped
        # because it may contain regex metacharacters (e.g. "news20.binary"),
        # and fullmatch anchors the pattern so unrelated files are never
        # counted here nor deleted below.
        if self.dataset_dir.exists():
            regex = re.compile(
                rf"{re.escape(self.dataset_name)}_(X|y)\.(npz|npy)"
            )
            files = os.listdir(self.dataset_dir)
            found = [f for f in files if regex.fullmatch(f)]
            exists = len(found) == 2
        else:
            found = []
            exists = False

        if replace or not exists:
            # Remove existing dataset files if there are any
            if raw_dataset_path.exists():
                raw_dataset_path.unlink()
            for file in found:
                Path(self.dataset_dir / file).unlink()

            # Download the raw dataset file
            if verbose:
                print("Downloading...")
            download(
                self.dataset_url,
                raw_dataset_path,
                progressbar=verbose,
                replace=replace,
                verbose=verbose,
            )

            if verbose:
                print("Loading file and saving data...")
            X, y = self._load_file_and_save_data(
                raw_dataset_path,
                ext_dataset_path
            )

        else:
            if verbose:
                print("Loading data...")
            X, y = self._load_data(ext_dataset_path)

        return X, y

libsvmdata/core.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
from libsvmdata.libsvm import DATASETS as libsvm_datasets
2+
3+
# Registry of every supported database, keyed by its display name.
ALL_DATABASES = {"LIBSVM": libsvm_datasets}

# Flat mapping from dataset name to its dataset object, across all databases.
ALL_DATASETS = {
    ds.dataset_name: ds
    for db in ALL_DATABASES.values()
    for ds in db
}
10+
11+
12+
def fetch_dataset(dataset_name, replace=False, verbose=False):
    """
    Load a dataset. It is downloaded only if not present or when replace=True.

    Parameters
    ----------
    dataset_name : string
        Dataset name.

    replace : bool, default=False
        Whether to re-download the dataset if it is already downloaded.

    verbose : bool, default=False
        Whether or not to print information about dataset loading.

    Returns
    -------
    X : np.ndarray or scipy.sparse.csc_matrix
        Design matrix, as 2D array or column sparse format depending on the
        dataset.

    y : 1D or 2D np.ndarray
        Design vector (or matrix in multiclass setting).

    Raises
    ------
    ValueError
        If `dataset_name` is not among the supported datasets.
    """
    # Membership is tested on the dict directly; `.keys()` is redundant.
    if dataset_name not in ALL_DATASETS:
        raise ValueError(
            f"Unsupported dataset `{dataset_name}`. Supported datasets can be "
            "displayed using the `libsvmdata.print_supported_datasets` "
            "function."
        )

    dataset = ALL_DATASETS[dataset_name]

    X, y = dataset.get_X_y(replace=replace, verbose=verbose)

    return X, y
50+
51+
52+
def print_supported_datasets():
    """Print, for each database, the names of its supported datasets."""
    print("Supported datasets")
    for db_name, db_datasets in ALL_DATABASES.items():
        print(f"- {db_name}: ")
        names = [ds.dataset_name for ds in db_datasets]
        print(", ".join(names))

0 commit comments

Comments
 (0)