110 changes: 104 additions & 6 deletions .github/workflows/build.yml
@@ -2,6 +2,8 @@
name: Tests

on: [push, pull_request, workflow_dispatch]
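# Dataset downloads during the tests land in this workspace-local directory so that
# actions/cache can save and restore them between jobs (presumably consumed by
# lale.datasets.util.download_data_cache_dir, which is not part of this diff).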
env:
LALE_DOWNLOAD_CACHE_DIR: ${{ github.workspace }}/.cache/data

jobs:
static:
@@ -40,6 +42,7 @@ jobs:
pipdeptree -fl
- name: pre-commit checks
run: pre-commit run -a

docs:
name: Documentation build
runs-on: ubuntu-latest
@@ -76,14 +79,61 @@ jobs:
run: sphinx-build "." "_build" -W --keep-going
working-directory: ./docs

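# Downloads every dataset the tests need and saves it to the actions cache, so the
# test jobs below can restore the data instead of re-downloading it on every run.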
download_data:
name: Download data used for tests
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v4
- name: Set up Python 3.11
uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Cache pip
uses: actions/cache@v4
with:
# This path is specific to Ubuntu
path: ${{ env.pythonLocation }}
# Look to see if there is a cache hit for the setup file
key: ${{ runner.os }}-pip-new3-${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}
restore-keys: |
${{ runner.os }}-pip-new3
${{ runner.os }}-new3
- name: Install numpy
run: pip install -U numpy
- name: Install dependencies
run: pip install --upgrade --upgrade-strategy eager .[full,test,dev]
- name: pip list packages
run: pip list
- name: show pip dependencies
run: |
pip install pipdeptree
pipdeptree -fl
- name: Cache test data restore
uses: actions/cache/restore@v4
with:
path: ${{ env.LALE_DOWNLOAD_CACHE_DIR }}
key: ${{ runner.os }}-dcache-new3-${{ env.LALE_DOWNLOAD_CACHE_DIR }}-${{ hashFiles('lale/datasets/prefetch.py') }}
restore-keys: |
${{ runner.os }}-dcache-new3-${{ env.LALE_DOWNLOAD_CACHE_DIR }}-
${{ runner.os }}-dcache-new3
- name: Download the test data
run: python lale/datasets/prefetch.py
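# Restoring and saving are split into explicit actions/cache/restore and
# actions/cache/save steps (rather than the combined actions/cache action) so the
# data fetched by prefetch.py above is written back under the current key right here.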
- name: Cache test data save
uses: actions/cache/save@v4
with:
path: ${{ env.LALE_DOWNLOAD_CACHE_DIR }}
key: ${{ runner.os }}-dcache-new3-${{ env.LALE_DOWNLOAD_CACHE_DIR }}-${{ hashFiles('lale/datasets/prefetch.py') }}


# test_matrix_pr and test_matrix_master have a lot of redundancy
# If GitHub actions adds support for conditional matrix expressions
# these should be combined, but current workarounds seem overly complex
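# One known workaround, sketched here only for reference (the test-case lists are
# placeholders, not the real matrices from this file): a small preliminary job emits
# the matrix as JSON and the test job consumes it with fromJSON.
#
#   gen_matrix:
#     runs-on: ubuntu-latest
#     outputs:
#       matrix: ${{ steps.set.outputs.matrix }}
#     steps:
#       - id: set
#         run: |
#           if [ "${{ github.event_name }}" = "pull_request" ]; then
#             echo 'matrix={"test-case": ["test/test_core_misc.py"]}' >> "$GITHUB_OUTPUT"
#           else
#             echo 'matrix={"test-case": ["test/test_core_misc.py", "test/test_optimizers.py"]}' >> "$GITHUB_OUTPUT"
#           fi
#
#   test_matrix:
#     needs: [static, download_data, gen_matrix]
#     strategy:
#       matrix: ${{ fromJSON(needs.gen_matrix.outputs.matrix) }}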
test_matrix_pr:
name: Test (PR)
runs-on: ubuntu-latest
if: ${{github.event_name == 'pull_request' }}
needs: [static]
needs: [static, download_data]
strategy:
fail-fast: false
matrix:
@@ -149,6 +199,14 @@ jobs:
restore-keys: |
${{ runner.os }}-pip-new3
${{ runner.os }}-new3
- name: Cache test data restore
uses: actions/cache/restore@v4
with:
path: ${{ env.LALE_DOWNLOAD_CACHE_DIR }}
key: ${{ runner.os }}-dcache-new3-${{ env.LALE_DOWNLOAD_CACHE_DIR }}-${{ hashFiles('lale/datasets/prefetch.py') }}
restore-keys: |
${{ runner.os }}-dcache-new3-${{ env.LALE_DOWNLOAD_CACHE_DIR }}-
${{ runner.os }}-dcache-new3
- name: Install numpy
run: pip install -U numpy
- name: Install dependencies
@@ -172,7 +230,7 @@ jobs:
name: Test (master)
runs-on: ubuntu-latest
if: ${{ github.event_name == 'push' }}
needs: [static]
needs: [static, download_data]
strategy:
fail-fast: false
matrix:
@@ -239,6 +297,14 @@ jobs:
restore-keys: |
${{ runner.os }}-pip-new3
${{ runner.os }}-new3
- name: Cache test data restore
uses: actions/cache/restore@v4
with:
path: ${{ env.LALE_DOWNLOAD_CACHE_DIR }}
key: ${{ runner.os }}-dcache-new3-${{ env.LALE_DOWNLOAD_CACHE_DIR }}-${{ hashFiles('lale/datasets/prefetch.py') }}
restore-keys: |
${{ runner.os }}-dcache-new3-${{ env.LALE_DOWNLOAD_CACHE_DIR }}-
${{ runner.os }}-dcache-new3
- name: Install numpy
run: pip install -U numpy
- name: Install dependencies
@@ -261,7 +327,7 @@ jobs:
test_newer:
name: Test with newest sklearn
runs-on: ubuntu-latest
needs: [static]
needs: [static, download_data]
strategy:
fail-fast: false
matrix:
@@ -289,6 +355,14 @@ jobs:
restore-keys: |
${{ runner.os }}-pip-new3
${{ runner.os }}-new3
- name: Cache test data restore
uses: actions/cache/restore@v4
with:
path: ${{ env.LALE_DOWNLOAD_CACHE_DIR }}
key: ${{ runner.os }}-dcache-new3-${{ env.LALE_DOWNLOAD_CACHE_DIR }}-${{ hashFiles('lale/datasets/prefetch.py') }}
restore-keys: |
${{ runner.os }}-dcache-new3-${{ env.LALE_DOWNLOAD_CACHE_DIR }}-
${{ runner.os }}-dcache-new3
- name: Install numpy
run: pip install -U numpy
- name: Install dependencies
@@ -309,7 +383,7 @@ jobs:

test_notebooks_pr:
name: Test Notebooks (PR)
needs: [static]
needs: [static, download_data]
runs-on: ubuntu-latest
if: ${{ github.event_name == 'pull_request' }}
env:
@@ -349,6 +423,14 @@ jobs:
restore-keys: |
${{ runner.os }}-pip-new3
${{ runner.os }}-new3
- name: Cache test data restore
uses: actions/cache/restore@v4
with:
path: ${{ env.LALE_DOWNLOAD_CACHE_DIR }}
key: ${{ runner.os }}-dcache-new3-${{ env.LALE_DOWNLOAD_CACHE_DIR }}-${{ hashFiles('lale/datasets/prefetch.py') }}
restore-keys: |
${{ runner.os }}-dcache-new3-${{ env.LALE_DOWNLOAD_CACHE_DIR }}-
${{ runner.os }}-dcache-new3
- name: Install packages
run: sudo apt-get install graphviz swig
- name: Install numpy
@@ -374,7 +456,7 @@ jobs:

test_notebooks_master:
name: Test Notebooks (master)
needs: [static]
needs: [static, download_data]
runs-on: ubuntu-latest
if: ${{ github.event_name == 'push' }}
env:
@@ -434,6 +516,14 @@ jobs:
restore-keys: |
${{ runner.os }}-pip-new3
${{ runner.os }}-new3
- name: Cache test data restore
uses: actions/cache/restore@v4
with:
path: ${{ env.LALE_DOWNLOAD_CACHE_DIR }}
key: ${{ runner.os }}-dcache-new3-${{ env.LALE_DOWNLOAD_CACHE_DIR }}-${{ hashFiles('lale/datasets/prefetch.py') }}
restore-keys: |
${{ runner.os }}-dcache-new3-${{ env.LALE_DOWNLOAD_CACHE_DIR }}-
${{ runner.os }}-dcache-new3
- name: Install packages
run: sudo apt-get install graphviz swig
- name: Install numpy
@@ -460,7 +550,7 @@ jobs:
test_fairness:
name: Test fairness install
runs-on: ubuntu-latest
needs: [static]
needs: [static, download_data]
strategy:
fail-fast: false
matrix:
@@ -487,6 +577,14 @@ jobs:
run: |
pip install pipdeptree
pipdeptree -fl
- name: Cache test data restore
uses: actions/cache/restore@v4
with:
path: ${{ env.LALE_DOWNLOAD_CACHE_DIR }}
key: ${{ runner.os }}-dcache-new3-${{ env.LALE_DOWNLOAD_CACHE_DIR }}-${{ hashFiles('lale/datasets/prefetch.py') }}
restore-keys: |
${{ runner.os }}-dcache-new3-${{ env.LALE_DOWNLOAD_CACHE_DIR }}-
${{ runner.os }}-dcache-new3
- name: Run test
run: py.test -v --capture=tee-sys ${{matrix.test-case}}

7 changes: 5 additions & 2 deletions lale/datasets/movie_review.py
@@ -18,15 +18,18 @@

import numpy as np

from lale.datasets.util import download_data_cache_dir


def load_movie_review():
"""Loads the sentiment classification from a movie reviews dataset.
Read the readme from data/movie_review for more details.
"""
download_base_url = "https://www.cs.cornell.edu/people/pabo/movie%2Dreview%2Ddata/rt-polaritydata.tar.gz"
download_data_dir = os.path.join(
os.path.dirname(__file__), "data", "movie_review", "download_data"
download_data_dir = (
download_data_cache_dir / "data" / "movie_review" / "download_data"
)

data_file_path = os.path.join(download_data_dir, "rt-polaritydata.tar.gz")
if not os.path.exists(download_data_dir):
os.makedirs(download_data_dir)
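The helper that these dataset modules now import, lale.datasets.util.download_data_cache_dir, is not part of the diffs shown here. The following is a minimal sketch of what it presumably provides, assuming it honors the LALE_DOWNLOAD_CACHE_DIR variable set by the workflow and falls back to a package-local directory when that variable is unset; the fallback path and exact implementation are assumptions, not the real code.

# Hypothetical sketch of lale/datasets/util.py (not shown in this pull request).
import os
from pathlib import Path

# Use the CI-provided cache directory when set; otherwise keep downloads next to
# the datasets package (this default location is an assumption).
_default_cache = Path(__file__).parent / "download_data_cache"
download_data_cache_dir = Path(os.environ.get("LALE_DOWNLOAD_CACHE_DIR", str(_default_cache)))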
7 changes: 5 additions & 2 deletions lale/datasets/multitable/fetch_datasets.py
@@ -21,12 +21,15 @@

import lale.datasets.openml
from lale.datasets.data_schemas import add_table_name
from lale.datasets.util import download_data_cache_dir
from lale.helpers import datatype_param_type

logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

download_multitable_data_cache_dir = download_data_cache_dir / "multitable"

try:
from pyspark.sql import SparkSession

@@ -93,7 +96,7 @@ def fetch_go_sales_dataset(datatype: datatype_param_type = "pandas"):
go_sales_list : list of singleton dictionary of pandas / spark dataframes
"""

download_data_dir = os.path.join(os.path.dirname(__file__), "go_sales_data")
download_data_dir = download_multitable_data_cache_dir / "go_sales_data"
base_url = "https://github.com/IBM/watson-machine-learning-samples/raw/master/cloud/data/go_sales/"
filenames = [
"go_1k.csv",
@@ -158,7 +161,7 @@ def fetch_imdb_dataset(datatype: datatype_param_type = "pandas"):
dataset not found
"""

download_data_dir = os.path.join(os.path.dirname(__file__), "imdb_data")
download_data_dir = download_multitable_data_cache_dir / "imdb_data"
imdb_list = []
if not os.path.exists(download_data_dir):
raise ValueError(
5 changes: 4 additions & 1 deletion lale/datasets/openml/openml_datasets.py
@@ -25,6 +25,8 @@
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from lale.datasets.util import download_data_cache_dir

sklearn_version = version.parse(getattr(sklearn, "__version__"))

try:
@@ -37,7 +39,8 @@
pip install 'lale[full]'"""
) from import_exc

download_data_dir = os.path.join(os.path.dirname(__file__), "download_data")
download_data_dir = download_data_cache_dir / "openml" / "download_data"

experiments_dict: Dict[str, Dict[str, Union[str, int]]] = {}

# 1.25
102 changes: 102 additions & 0 deletions lale/datasets/prefetch.py
@@ -0,0 +1,102 @@
# Copyright 2025 IBM Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

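"""Prefetch every dataset that the test suite downloads at run time.

The download_data CI job runs this module (python lale/datasets/prefetch.py) and
caches the resulting download directory so that later test jobs can restore it.
"""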
import lale.lib.aif360.datasets
from lale.datasets.movie_review import load_movie_review
from lale.datasets.multitable.fetch_datasets import fetch_go_sales_dataset
from lale.datasets.openml.openml_datasets import download_if_missing
from lale.datasets.uci.uci_datasets import (
fetch_drugslib,
fetch_household_power_consumption,
)

openml_experiments = [
"credit-g",
"breast-cancer",
"adult",
"bank-marketing",
"Default-of-Credit-Card-Clients-Dataset",
"heart-disease",
"law-school-admission-bianry",
"national-longitudinal-survey-binary",
"UCI-student-performance-mat",
"UCI-student-performance-por",
"tae",
"us_crime",
"ricci",
"SpeedDating",
"nursery",
"titanic",
"cloud",
]


def fetch_fairness_dbs():
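"""Download the fairness datasets used by the lale.lib.aif360 tests.

Datasets whose fetchers exit because their source data cannot be downloaded
automatically are skipped with a message.
"""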
dataset_names = {
"adult": "adult",
"bank": "bank",
"compas": "compas",
"compas_violent": "compas_violent",
"creditg": "creditg",
"default_credit": "default_credit",
"heart_disease": "heart_disease",
"law_school": "law_school",
# "meps19": "meps_panel19_fy2015",
# "meps20": "meps_panel20_fy2015",
# "meps21": "meps_panel21_fy2016",
"nlsy": "nlsy",
"nursery": "nursery",
"ricci": "ricci",
"speeddating": "speeddating",
"student_math": "student_math",
"student_por": "student_por",
"tae": "tae",
"titanic": "titanic",
"us_crime": "us_crime",
}

def try_fetch(dataset_name):
long_name = dataset_names[dataset_name]
fetcher_function = getattr(lale.lib.aif360.datasets, f"fetch_{long_name}_df")
try:
X, y, fairness_info = fetcher_function()
except SystemExit:
print(f"skipping {dataset_name} because it is not downloaded")
return None
return X, y, fairness_info

for name in dataset_names:
try_fetch(name)


def prefetch_data():
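"""Fetch all datasets the tests rely on: movie reviews, go_sales, the UCI datasets,
the OpenML experiments listed above, and the fairness datasets."""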
load_movie_review()

fetch_go_sales_dataset()

fetch_drugslib()
fetch_household_power_consumption()

for name in openml_experiments:
download_if_missing(name, True)

fetch_fairness_dbs()


def main():
prefetch_data()


if __name__ == "__main__":
main()