Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 100 additions & 0 deletions bofire/data_models/domain/domain.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,31 @@ def validate_constraints(self):
c.validate_inputs(self.inputs)
return self

@model_validator(mode="after")
def validate_timeseries_config(self):
    """Ensure at most one input feature is flagged as a timeseries axis.

    Raises:
        ValueError: If more than one numerical input has ``is_timeseries``
            set to True.

    Returns:
        self: The validated domain instance.
    """
    # Imported locally to avoid a circular import at module load time.
    from bofire.data_models.features.numerical import NumericalInput

    # Collect the keys of every numerical input flagged as a timeseries.
    flagged_keys = []
    for feature in self.inputs:
        is_numeric = isinstance(feature, NumericalInput)
        if is_numeric and getattr(feature, "is_timeseries", False):
            flagged_keys.append(feature.key)

    # Zero or one flagged feature is fine; more than one is ambiguous.
    if len(flagged_keys) <= 1:
        return self
    raise ValueError(
        f"Multiple features ({', '.join(flagged_keys)}) are marked as timeseries. Only one is allowed."
    )

# TODO: tidy this up
def get_nchoosek_combinations(self, exhaustive: bool = False):
"""Get all possible NChooseK combinations
Expand Down Expand Up @@ -379,6 +404,81 @@ def validate_experiments(
strict=strict,
)
experiments = self.outputs.validate_experiments(experiments=experiments)

# Check for _trajectory_id if timeseries features are present
from bofire.data_models.features.numerical import NumericalInput

timeseries_features = [
f.key
for f in self.inputs
if isinstance(f, NumericalInput) and getattr(f, "is_timeseries", False)
]

if len(timeseries_features) > 0:
trajectory_col = "_trajectory_id"
if trajectory_col not in experiments.columns:
raise ValueError(
f"Timeseries feature '{timeseries_features[0]}' detected, but required column '{trajectory_col}' "
f"is not present in the experiments. When using timeseries features, you must include a "
f"'{trajectory_col}' column that identifies which trajectory/experiment each row belongs to."
)

return experiments

def add_trajectory_id(
    self,
    experiments: pd.DataFrame,
    eps: float = 1e-6,
) -> pd.DataFrame:
    """Return a copy of ``experiments`` with an inferred ``_trajectory_id`` column.

    Rows are grouped into trajectories based on their non-timeseries input
    feature values: rows whose non-timeseries features agree (continuous
    values within ``eps`` of each other) are assigned the same trajectory ID.
    This is useful when data from several experimental runs lacks explicit
    trajectory labels.

    Args:
        experiments (pd.DataFrame): Dataframe with experimental data. Must
            contain columns for all input features defined in the domain.
            An existing ``_trajectory_id`` column is overwritten.
        eps (float, optional): Tolerance when comparing continuous values;
            two values are treated as equal if their absolute difference is
            below ``eps``. Does not apply to discrete, categorical, or
            molecular features. Defaults to 1e-6.

    Returns:
        pd.DataFrame: Copy of ``experiments`` with ``_trajectory_id`` added
            or updated.

    Raises:
        ValueError: If no timeseries feature is found in the domain.
        ValueError: If required input feature columns are missing from
            ``experiments``.

    Example:
        >>> domain = Domain(
        ...     inputs=Inputs(features=[
        ...         ContinuousInput(key="time", bounds=(0, 100), is_timeseries=True),
        ...         ContinuousInput(key="temperature", bounds=(20, 80)),
        ...     ]),
        ...     outputs=Outputs(features=[ContinuousOutput(key="yield")]),
        ... )
        >>> experiments = pd.DataFrame({
        ...     'time': [0, 10, 20, 0, 10, 20],
        ...     'temperature': [25, 25, 25, 30, 30, 30],
        ...     'yield': [0.1, 0.2, 0.3, 0.2, 0.3, 0.4],
        ...     'valid_yield': [1] * 6,
        ... })
        >>> experiments = domain.add_trajectory_id(experiments)
        >>> # Rows with temperature=25 get one ID, temperature=30 get another
    """
    # Imported locally to avoid a circular import at module load time.
    from bofire.utils.timeseries import infer_trajectory_id

    # Work on a copy so the caller's dataframe is never mutated.
    result = experiments.copy()
    inferred_ids = infer_trajectory_id(result, self.inputs, eps=eps)
    result["_trajectory_id"] = inferred_ids
    return result

def describe_experiments(self, experiments: pd.DataFrame) -> pd.DataFrame:
Expand Down
5 changes: 5 additions & 0 deletions bofire/data_models/features/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import numpy as np
import pandas as pd
from pydantic import Field

from bofire.data_models.features.feature import Input, TTransform

Expand All @@ -11,6 +12,10 @@ class NumericalInput(Input):
"""Abstract base class for all numerical (ordinal) input features."""

unit: Optional[str] = None
is_timeseries: bool = Field(
default=False,
description="Field to mark if this feature represents time in a timeseries",
)

@staticmethod
def valid_transform_types() -> List:
Expand Down
52 changes: 49 additions & 3 deletions bofire/surrogates/trainable.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,12 @@

import numpy as np
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit, KFold, StratifiedKFold
from sklearn.model_selection import (
GroupKFold,
GroupShuffleSplit,
KFold,
StratifiedKFold,
)

from bofire.data_models.enum import OutputFilteringEnum
from bofire.data_models.features.api import (
Expand Down Expand Up @@ -85,6 +90,7 @@ def cross_validate(
random_state: Optional[int] = None,
stratified_feature: Optional[str] = None,
group_split_column: Optional[str] = None,
use_shuffle_split: bool = False,
hooks: Optional[
Dict[
str,
Expand Down Expand Up @@ -118,6 +124,9 @@ def cross_validate(
training and testing sets. This is useful in scenarios where data points are related or dependent on each
other, and splitting them into different sets would violate the assumption of independence. The number of
unique groups must be greater than or equal to the number of folds. Defaults to None.
use_shuffle_split (bool, optional): When group_split_column is provided, use GroupShuffleSplit
instead of GroupKFold. GroupKFold (default) ensures each group is tested exactly once,
while GroupShuffleSplit allows flexible test_size but may not test all groups. Defaults to False.
hooks (Dict[str, Callable[[Model, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame], Any]], optional):
Dictionary of callable hooks that are called within the CV loop. The callable retrieves the current trained
model and the current CV folds in the following order: X_train, y_train, X_test, y_test. Defaults to {}.
Expand Down Expand Up @@ -156,6 +165,36 @@ def cross_validate(
"The feature to be stratified needs to be a DiscreteInput, CategoricalInput, CategoricalOutput, or ContinuousOutput",
)

# Auto-detect group split column
if group_split_column is None:
trajectory_col = "_trajectory_id"
# If _trajectory_id column is present, use it for group split
if trajectory_col in experiments.columns:
group_split_column = trajectory_col
else:
# Check if any input feature is marked as timeseries
timeseries_features = [
feat
for feat in self.inputs.get() # type: ignore
if hasattr(feat, "is_timeseries") and feat.is_timeseries
]
if len(timeseries_features) > 0:
# Auto-infer trajectory IDs from non-timeseries input features
from bofire.utils.timeseries import infer_trajectory_id

warnings.warn(
f"Timeseries feature '{timeseries_features[0].key}' detected but '{trajectory_col}' column "
f"is missing. Automatically inferring trajectory IDs from non-timeseries input features.",
UserWarning,
)

trajectory_ids = infer_trajectory_id(experiments, self.inputs) # type: ignore

# Add trajectory IDs to experiments
experiments = experiments.copy()
experiments[trajectory_col] = trajectory_ids
group_split_column = trajectory_col

if group_split_column is not None:
# check if the group split column is present in the experiments
if group_split_column not in experiments.columns:
Expand Down Expand Up @@ -187,6 +226,7 @@ def cross_validate(
stratified_feature=stratified_feature,
group_split_column=group_split_column,
random_state=random_state,
use_shuffle_split=use_shuffle_split,
)

key = self.outputs.get_keys()[0] # type: ignore
Expand Down Expand Up @@ -301,8 +341,9 @@ def _make_cv_split(
stratified_feature: Optional[str] = None,
group_split_column: Optional[str] = None,
random_state: Optional[int] = None,
use_shuffle_split: bool = False,
) -> Tuple[
Union[KFold, StratifiedKFold, GroupShuffleSplit],
Union[KFold, StratifiedKFold, GroupKFold, GroupShuffleSplit],
Generator[Tuple[np.ndarray, np.ndarray], None, None],
]:
"""
Expand All @@ -321,7 +362,12 @@ def _make_cv_split(
if stratified_feature is None:
if group_split_column is not None:
# GROUP SPLIT FUNCTIONALITY
cv = GroupShuffleSplit(n_splits=folds, random_state=random_state)
if use_shuffle_split:
# Use GroupShuffleSplit for flexible test_size
cv = GroupShuffleSplit(n_splits=folds, random_state=random_state)
else:
# Use GroupKFold for exhaustive testing (default)
cv = GroupKFold(n_splits=folds)
cv_func = cv.split(experiments, groups=experiments[group_split_column])
else:
cv = KFold(n_splits=folds, shuffle=True, random_state=random_state)
Expand Down
Loading
Loading