110 changes: 104 additions & 6 deletions .github/workflows/build.yml
@@ -2,6 +2,8 @@
name: Tests

on: [push, pull_request, workflow_dispatch]
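# Dataset downloads during the tests land in this workspace-local directory so that
# actions/cache can save and restore them between jobs (presumably consumed by
# lale.datasets.util.download_data_cache_dir, which is not part of this diff).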
env:
LALE_DOWNLOAD_CACHE_DIR: ${{ github.workspace }}/.cache/data

jobs:
static:
@@ -40,6 +42,7 @@ jobs:
pipdeptree -fl
- name: pre-commit checks
run: pre-commit run -a

docs:
name: Documentation build
runs-on: ubuntu-latest
@@ -76,14 +79,61 @@ jobs:
run: sphinx-build "." "_build" -W --keep-going
working-directory: ./docs

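# Downloads every dataset the tests need and saves it to the actions cache, so the
# test jobs below can restore the data instead of re-downloading it on every run.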
download_data:
name: Download data used for tests
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v4
- name: Set up Python 3.11
uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Cache pip
uses: actions/cache@v4
with:
# This path is specific to Ubuntu
path: ${{ env.pythonLocation }}
# Look to see if there is a cache hit for the setup file
key: ${{ runner.os }}-pip-new3-${{ env.pythonLocation }}-${{ hashFiles('setup.py') }}
restore-keys: |
${{ runner.os }}-pip-new3
${{ runner.os }}-new3
- name: Install numpy
run: pip install -U numpy
- name: Install dependencies
run: pip install --upgrade --upgrade-strategy eager .[full,test,dev]
- name: pip list packages
run: pip list
- name: show pip dependencies
run: |
pip install pipdeptree
pipdeptree -fl
- name: Cache test data restore
uses: actions/cache/restore@v4
with:
path: ${{ env.LALE_DOWNLOAD_CACHE_DIR }}
key: ${{ runner.os }}-dcache-new3-${{ env.LALE_DOWNLOAD_CACHE_DIR }}-${{ hashFiles('lale/datasets/prefetch.py') }}
restore-keys: |
${{ runner.os }}-dcache-new3-${{ env.LALE_DOWNLOAD_CACHE_DIR }}-
${{ runner.os }}-dcache-new3
- name: Download the test data
run: python lale/datasets/prefetch.py
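# Restoring and saving are split into explicit actions/cache/restore and
# actions/cache/save steps (rather than the combined actions/cache action) so the
# data fetched by prefetch.py above is written back under the current key right here.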
- name: Cache test data save
uses: actions/cache/save@v4
with:
path: ${{ env.LALE_DOWNLOAD_CACHE_DIR }}
key: ${{ runner.os }}-dcache-new3-${{ env.LALE_DOWNLOAD_CACHE_DIR }}-${{ hashFiles('lale/datasets/prefetch.py') }}


# test_matrix_pr and test_matrix_master have a lot of redundancy
# If GitHub actions adds support for conditional matrix expressions
# these should be combined, but current workarounds seem overly complex
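# One known workaround, sketched here only for reference (the test-case lists are
# placeholders, not the real matrices from this file): a small preliminary job emits
# the matrix as JSON and the test job consumes it with fromJSON.
#
#   gen_matrix:
#     runs-on: ubuntu-latest
#     outputs:
#       matrix: ${{ steps.set.outputs.matrix }}
#     steps:
#       - id: set
#         run: |
#           if [ "${{ github.event_name }}" = "pull_request" ]; then
#             echo 'matrix={"test-case": ["test/test_core_misc.py"]}' >> "$GITHUB_OUTPUT"
#           else
#             echo 'matrix={"test-case": ["test/test_core_misc.py", "test/test_optimizers.py"]}' >> "$GITHUB_OUTPUT"
#           fi
#
#   test_matrix:
#     needs: [static, download_data, gen_matrix]
#     strategy:
#       matrix: ${{ fromJSON(needs.gen_matrix.outputs.matrix) }}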
test_matrix_pr:
name: Test (PR)
runs-on: ubuntu-latest
if: ${{github.event_name == 'pull_request' }}
needs: [static]
needs: [static, download_data]
strategy:
fail-fast: false
matrix:
@@ -149,6 +199,14 @@ jobs:
restore-keys: |
${{ runner.os }}-pip-new3
${{ runner.os }}-new3
- name: Cache test data restore
uses: actions/cache/restore@v4
with:
path: ${{ env.LALE_DOWNLOAD_CACHE_DIR }}
key: ${{ runner.os }}-dcache-new3-${{ env.LALE_DOWNLOAD_CACHE_DIR }}-${{ hashFiles('lale/datasets/prefetch.py') }}
restore-keys: |
${{ runner.os }}-dcache-new3-${{ env.LALE_DOWNLOAD_CACHE_DIR }}-
${{ runner.os }}-dcache-new3
- name: Install numpy
run: pip install -U numpy
- name: Install dependencies
@@ -172,7 +230,7 @@ jobs:
name: Test (master)
runs-on: ubuntu-latest
if: ${{ github.event_name == 'push' }}
needs: [static]
needs: [static, download_data]
strategy:
fail-fast: false
matrix:
@@ -239,6 +297,14 @@ jobs:
restore-keys: |
${{ runner.os }}-pip-new3
${{ runner.os }}-new3
- name: Cache test data restore
uses: actions/cache/restore@v4
with:
path: ${{ env.LALE_DOWNLOAD_CACHE_DIR }}
key: ${{ runner.os }}-dcache-new3-${{ env.LALE_DOWNLOAD_CACHE_DIR }}-${{ hashFiles('lale/datasets/prefetch.py') }}
restore-keys: |
${{ runner.os }}-dcache-new3-${{ env.LALE_DOWNLOAD_CACHE_DIR }}-
${{ runner.os }}-dcache-new3
- name: Install numpy
run: pip install -U numpy
- name: Install dependencies
@@ -261,7 +327,7 @@ jobs:
test_newer:
name: Test with newest sklearn
runs-on: ubuntu-latest
needs: [static]
needs: [static, download_data]
strategy:
fail-fast: false
matrix:
@@ -289,6 +355,14 @@ jobs:
restore-keys: |
${{ runner.os }}-pip-new3
${{ runner.os }}-new3
- name: Cache test data restore
uses: actions/cache/restore@v4
with:
path: ${{ env.LALE_DOWNLOAD_CACHE_DIR }}
key: ${{ runner.os }}-dcache-new3-${{ env.LALE_DOWNLOAD_CACHE_DIR }}-${{ hashFiles('lale/datasets/prefetch.py') }}
restore-keys: |
${{ runner.os }}-dcache-new3-${{ env.LALE_DOWNLOAD_CACHE_DIR }}-
${{ runner.os }}-dcache-new3
- name: Install numpy
run: pip install -U numpy
- name: Install dependencies
@@ -309,7 +383,7 @@ jobs:

test_notebooks_pr:
name: Test Notebooks (PR)
needs: [static]
needs: [static, download_data]
runs-on: ubuntu-latest
if: ${{ github.event_name == 'pull_request' }}
env:
@@ -349,6 +423,14 @@ jobs:
restore-keys: |
${{ runner.os }}-pip-new3
${{ runner.os }}-new3
- name: Cache test data restore
uses: actions/cache/restore@v4
with:
path: ${{ env.LALE_DOWNLOAD_CACHE_DIR }}
key: ${{ runner.os }}-dcache-new3-${{ env.LALE_DOWNLOAD_CACHE_DIR }}-${{ hashFiles('lale/datasets/prefetch.py') }}
restore-keys: |
${{ runner.os }}-dcache-new3-${{ env.LALE_DOWNLOAD_CACHE_DIR }}-
${{ runner.os }}-dcache-new3
- name: Install packages
run: sudo apt-get install graphviz swig
- name: Install numpy
@@ -374,7 +456,7 @@ jobs:

test_notebooks_master:
name: Test Notebooks (master)
needs: [static]
needs: [static, download_data]
runs-on: ubuntu-latest
if: ${{ github.event_name == 'push' }}
env:
@@ -434,6 +516,14 @@ jobs:
restore-keys: |
${{ runner.os }}-pip-new3
${{ runner.os }}-new3
- name: Cache test data restore
uses: actions/cache/restore@v4
with:
path: ${{ env.LALE_DOWNLOAD_CACHE_DIR }}
key: ${{ runner.os }}-dcache-new3-${{ env.LALE_DOWNLOAD_CACHE_DIR }}-${{ hashFiles('lale/datasets/prefetch.py') }}
restore-keys: |
${{ runner.os }}-dcache-new3-${{ env.LALE_DOWNLOAD_CACHE_DIR }}-
${{ runner.os }}-dcache-new3
- name: Install packages
run: sudo apt-get install graphviz swig
- name: Install numpy
@@ -460,7 +550,7 @@ jobs:
test_fairness:
name: Test fairness install
runs-on: ubuntu-latest
needs: [static]
needs: [static, download_data]
strategy:
fail-fast: false
matrix:
@@ -487,6 +577,14 @@ jobs:
run: |
pip install pipdeptree
pipdeptree -fl
- name: Cache test data restore
uses: actions/cache/restore@v4
with:
path: ${{ env.LALE_DOWNLOAD_CACHE_DIR }}
key: ${{ runner.os }}-dcache-new3-${{ env.LALE_DOWNLOAD_CACHE_DIR }}-${{ hashFiles('lale/datasets/prefetch.py') }}
restore-keys: |
${{ runner.os }}-dcache-new3-${{ env.LALE_DOWNLOAD_CACHE_DIR }}-
${{ runner.os }}-dcache-new3
- name: Run test
run: py.test -v --capture=tee-sys ${{matrix.test-case}}

7 changes: 5 additions & 2 deletions lale/datasets/movie_review.py
@@ -18,15 +18,18 @@

import numpy as np

from lale.datasets.util import download_data_cache_dir


def load_movie_review():
"""Loads the sentiment classification from a movie reviews dataset.
Read the readme from data/movie_review for more details.
"""
download_base_url = "https://www.cs.cornell.edu/people/pabo/movie%2Dreview%2Ddata/rt-polaritydata.tar.gz"
download_data_dir = os.path.join(
os.path.dirname(__file__), "data", "movie_review", "download_data"
download_data_dir = (
download_data_cache_dir / "data" / "movie_review" / "download_data"
)

data_file_path = os.path.join(download_data_dir, "rt-polaritydata.tar.gz")
if not os.path.exists(download_data_dir):
os.makedirs(download_data_dir)
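The helper that these dataset modules now import, lale.datasets.util.download_data_cache_dir, is not part of the diffs shown here. The following is a minimal sketch of what it presumably provides, assuming it honors the LALE_DOWNLOAD_CACHE_DIR variable set by the workflow and falls back to a package-local directory when that variable is unset; the fallback path and exact implementation are assumptions, not the real code.

# Hypothetical sketch of lale/datasets/util.py (not shown in this pull request).
import os
from pathlib import Path

# Use the CI-provided cache directory when set; otherwise keep downloads next to
# the datasets package (this default location is an assumption).
_default_cache = Path(__file__).parent / "download_data_cache"
download_data_cache_dir = Path(os.environ.get("LALE_DOWNLOAD_CACHE_DIR", str(_default_cache)))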
7 changes: 5 additions & 2 deletions lale/datasets/multitable/fetch_datasets.py
@@ -21,12 +21,15 @@

import lale.datasets.openml
from lale.datasets.data_schemas import add_table_name
from lale.datasets.util import download_data_cache_dir
from lale.helpers import datatype_param_type

logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

download_multitable_data_cache_dir = download_data_cache_dir / "multitable"

try:
from pyspark.sql import SparkSession

@@ -93,7 +96,7 @@ def fetch_go_sales_dataset(datatype: datatype_param_type = "pandas"):
go_sales_list : list of singleton dictionary of pandas / spark dataframes
"""

download_data_dir = os.path.join(os.path.dirname(__file__), "go_sales_data")
download_data_dir = download_multitable_data_cache_dir / "go_sales_data"
base_url = "https://github.com/IBM/watson-machine-learning-samples/raw/master/cloud/data/go_sales/"
filenames = [
"go_1k.csv",
@@ -158,7 +161,7 @@ def fetch_imdb_dataset(datatype: datatype_param_type = "pandas"):
dataset not found
"""

download_data_dir = os.path.join(os.path.dirname(__file__), "imdb_data")
download_data_dir = download_multitable_data_cache_dir / "imdb_data"
imdb_list = []
if not os.path.exists(download_data_dir):
raise ValueError(
5 changes: 4 additions & 1 deletion lale/datasets/openml/openml_datasets.py
@@ -25,6 +25,8 @@
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from lale.datasets.util import download_data_cache_dir

sklearn_version = version.parse(getattr(sklearn, "__version__"))

try:
@@ -37,7 +39,8 @@
pip install 'lale[full]'"""
) from import_exc

download_data_dir = os.path.join(os.path.dirname(__file__), "download_data")
download_data_dir = download_data_cache_dir / "openml" / "download_data"

experiments_dict: Dict[str, Dict[str, Union[str, int]]] = {}

# 1.25
102 changes: 102 additions & 0 deletions lale/datasets/prefetch.py
@@ -0,0 +1,102 @@
# Copyright 2025 IBM Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

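"""Prefetch every dataset that the test suite downloads at run time.

The download_data CI job runs this module (python lale/datasets/prefetch.py) and
caches the resulting download directory so that later test jobs can restore it.
"""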
import lale.lib.aif360.datasets
from lale.datasets.movie_review import load_movie_review
from lale.datasets.multitable.fetch_datasets import fetch_go_sales_dataset
from lale.datasets.openml.openml_datasets import download_if_missing
from lale.datasets.uci.uci_datasets import (
fetch_drugslib,
fetch_household_power_consumption,
)

openml_experiments = [
"credit-g",
"breast-cancer",
"adult",
"bank-marketing",
"Default-of-Credit-Card-Clients-Dataset",
"heart-disease",
"law-school-admission-bianry",
"national-longitudinal-survey-binary",
"UCI-student-performance-mat",
"UCI-student-performance-por",
"tae",
"us_crime",
"ricci",
"SpeedDating",
"nursery",
"titanic",
"cloud",
]


def fetch_fairness_dbs():
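"""Download the fairness datasets used by the lale.lib.aif360 tests.

Datasets whose fetchers exit because their source data cannot be downloaded
automatically are skipped with a message.
"""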
dataset_names = {
"adult": "adult",
"bank": "bank",
"compas": "compas",
"compas_violent": "compas_violent",
"creditg": "creditg",
"default_credit": "default_credit",
"heart_disease": "heart_disease",
"law_school": "law_school",
# "meps19": "meps_panel19_fy2015",
# "meps20": "meps_panel20_fy2015",
# "meps21": "meps_panel21_fy2016",
"nlsy": "nlsy",
"nursery": "nursery",
"ricci": "ricci",
"speeddating": "speeddating",
"student_math": "student_math",
"student_por": "student_por",
"tae": "tae",
"titanic": "titanic",
"us_crime": "us_crime",
}

def try_fetch(dataset_name):
long_name = dataset_names[dataset_name]
fetcher_function = getattr(lale.lib.aif360.datasets, f"fetch_{long_name}_df")
try:
X, y, fairness_info = fetcher_function()
except SystemExit:
print(f"skipping {dataset_name} because it is not downloaded")
return None
return X, y, fairness_info

for name in dataset_names:
try_fetch(name)


def prefetch_data():
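"""Fetch all datasets the tests rely on: movie reviews, go_sales, the UCI datasets,
the OpenML experiments listed above, and the fairness datasets."""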
load_movie_review()

fetch_go_sales_dataset()

fetch_drugslib()
fetch_household_power_consumption()

for name in openml_experiments:
download_if_missing(name, True)

fetch_fairness_dbs()


def main():
prefetch_data()


if __name__ == "__main__":
main()