Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 100 additions & 0 deletions bofire/data_models/domain/domain.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,31 @@ def validate_constraints(self):
c.validate_inputs(self.inputs)
return self

@model_validator(mode="after")
def validate_timeseries_config(self):
    """Ensure at most one input feature is flagged as a timeseries axis.

    Raises:
        ValueError: If more than one numerical input has ``is_timeseries``
            set to True.

    Returns:
        self: The validated domain instance.
    """
    # Imported locally to avoid a circular import at module load time.
    from bofire.data_models.features.numerical import NumericalInput

    # Collect the keys of every numerical input flagged as a timeseries.
    flagged_keys = []
    for feature in self.inputs:
        is_numeric = isinstance(feature, NumericalInput)
        if is_numeric and getattr(feature, "is_timeseries", False):
            flagged_keys.append(feature.key)

    # Zero or one flagged feature is fine; more than one is ambiguous.
    if len(flagged_keys) <= 1:
        return self
    raise ValueError(
        f"Multiple features ({', '.join(flagged_keys)}) are marked as timeseries. Only one is allowed."
    )

# TODO: tidy this up
def get_nchoosek_combinations(self, exhaustive: bool = False):
"""Get all possible NChooseK combinations
Expand Down Expand Up @@ -379,6 +404,81 @@ def validate_experiments(
strict=strict,
)
experiments = self.outputs.validate_experiments(experiments=experiments)

# Check for _trajectory_id if timeseries features are present
from bofire.data_models.features.numerical import NumericalInput

timeseries_features = [
f.key
for f in self.inputs
if isinstance(f, NumericalInput) and getattr(f, "is_timeseries", False)
]

if len(timeseries_features) > 0:
trajectory_col = "_trajectory_id"
if trajectory_col not in experiments.columns:
raise ValueError(
f"Timeseries feature '{timeseries_features[0]}' detected, but required column '{trajectory_col}' "
f"is not present in the experiments. When using timeseries features, you must include a "
f"'{trajectory_col}' column that identifies which trajectory/experiment each row belongs to."
)

return experiments

def add_trajectory_id(
    self,
    experiments: pd.DataFrame,
    eps: float = 1e-6,
) -> pd.DataFrame:
    """Return a copy of ``experiments`` with an inferred ``_trajectory_id`` column.

    Rows are grouped into trajectories based on their non-timeseries input
    feature values: rows whose non-timeseries features agree (continuous
    values within ``eps`` of each other) are assigned the same trajectory ID.
    This is useful when data from several experimental runs lacks explicit
    trajectory labels.

    Args:
        experiments (pd.DataFrame): Dataframe with experimental data. Must
            contain columns for all input features defined in the domain.
            An existing ``_trajectory_id`` column is overwritten.
        eps (float, optional): Tolerance when comparing continuous values;
            two values are treated as equal if their absolute difference is
            below ``eps``. Does not apply to discrete, categorical, or
            molecular features. Defaults to 1e-6.

    Returns:
        pd.DataFrame: Copy of ``experiments`` with ``_trajectory_id`` added
            or updated.

    Raises:
        ValueError: If no timeseries feature is found in the domain.
        ValueError: If required input feature columns are missing from
            ``experiments``.

    Example:
        >>> domain = Domain(
        ...     inputs=Inputs(features=[
        ...         ContinuousInput(key="time", bounds=(0, 100), is_timeseries=True),
        ...         ContinuousInput(key="temperature", bounds=(20, 80)),
        ...     ]),
        ...     outputs=Outputs(features=[ContinuousOutput(key="yield")]),
        ... )
        >>> experiments = pd.DataFrame({
        ...     'time': [0, 10, 20, 0, 10, 20],
        ...     'temperature': [25, 25, 25, 30, 30, 30],
        ...     'yield': [0.1, 0.2, 0.3, 0.2, 0.3, 0.4],
        ...     'valid_yield': [1] * 6,
        ... })
        >>> experiments = domain.add_trajectory_id(experiments)
        >>> # Rows with temperature=25 get one ID, temperature=30 get another
    """
    # Imported locally to avoid a circular import at module load time.
    from bofire.utils.timeseries import infer_trajectory_id

    # Work on a copy so the caller's dataframe is never mutated.
    result = experiments.copy()
    inferred_ids = infer_trajectory_id(result, self.inputs, eps=eps)
    result["_trajectory_id"] = inferred_ids
    return result

def describe_experiments(self, experiments: pd.DataFrame) -> pd.DataFrame:
Expand Down
5 changes: 5 additions & 0 deletions bofire/data_models/features/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import numpy as np
import pandas as pd
from pydantic import Field

from bofire.data_models.features.feature import Input, TTransform

Expand All @@ -11,6 +12,10 @@ class NumericalInput(Input):
"""Abstract base class for all numerical (ordinal) input features."""

unit: Optional[str] = None
is_timeseries: bool = Field(
default=False,
description="Field to mark if this feature represents time in a timeseries",
)

@staticmethod
def valid_transform_types() -> List:
Expand Down
52 changes: 49 additions & 3 deletions bofire/surrogates/trainable.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,12 @@

import numpy as np
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit, KFold, StratifiedKFold
from sklearn.model_selection import (
GroupKFold,
GroupShuffleSplit,
KFold,
StratifiedKFold,
)

from bofire.data_models.enum import OutputFilteringEnum
from bofire.data_models.features.api import (
Expand Down Expand Up @@ -85,6 +90,7 @@ def cross_validate(
random_state: Optional[int] = None,
stratified_feature: Optional[str] = None,
group_split_column: Optional[str] = None,
use_shuffle_split: bool = False,
hooks: Optional[
Dict[
str,
Expand Down Expand Up @@ -118,6 +124,9 @@ def cross_validate(
training and testing sets. This is useful in scenarios where data points are related or dependent on each
other, and splitting them into different sets would violate the assumption of independence. The number of
unique groups must be greater than or equal to the number of folds. Defaults to None.
use_shuffle_split (bool, optional): When group_split_column is provided, use GroupShuffleSplit
instead of GroupKFold. GroupKFold (default) ensures each group is tested exactly once,
while GroupShuffleSplit allows flexible test_size but may not test all groups. Defaults to False.
hooks (Dict[str, Callable[[Model, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame], Any]], optional):
Dictionary of callable hooks that are called within the CV loop. The callable retrieves the current trained
model and the current CV folds in the following order: X_train, y_train, X_test, y_test. Defaults to {}.
Expand Down Expand Up @@ -156,6 +165,36 @@ def cross_validate(
"The feature to be stratified needs to be a DiscreteInput, CategoricalInput, CategoricalOutput, or ContinuousOutput",
)

# Auto-detect group split column
if group_split_column is None:
trajectory_col = "_trajectory_id"
# If _trajectory_id column is present, use it for group split
if trajectory_col in experiments.columns:
group_split_column = trajectory_col
else:
# Check if any input feature is marked as timeseries
timeseries_features = [
feat
for feat in self.inputs.get() # type: ignore
if hasattr(feat, "is_timeseries") and feat.is_timeseries
]
if len(timeseries_features) > 0:
# Auto-infer trajectory IDs from non-timeseries input features
from bofire.utils.timeseries import infer_trajectory_id

warnings.warn(
f"Timeseries feature '{timeseries_features[0].key}' detected but '{trajectory_col}' column "
f"is missing. Automatically inferring trajectory IDs from non-timeseries input features.",
UserWarning,
)

trajectory_ids = infer_trajectory_id(experiments, self.inputs) # type: ignore

# Add trajectory IDs to experiments
experiments = experiments.copy()
experiments[trajectory_col] = trajectory_ids
group_split_column = trajectory_col

if group_split_column is not None:
# check if the group split column is present in the experiments
if group_split_column not in experiments.columns:
Expand Down Expand Up @@ -187,6 +226,7 @@ def cross_validate(
stratified_feature=stratified_feature,
group_split_column=group_split_column,
random_state=random_state,
use_shuffle_split=use_shuffle_split,
)

key = self.outputs.get_keys()[0] # type: ignore
Expand Down Expand Up @@ -301,8 +341,9 @@ def _make_cv_split(
stratified_feature: Optional[str] = None,
group_split_column: Optional[str] = None,
random_state: Optional[int] = None,
use_shuffle_split: bool = False,
) -> Tuple[
Union[KFold, StratifiedKFold, GroupShuffleSplit],
Union[KFold, StratifiedKFold, GroupKFold, GroupShuffleSplit],
Generator[Tuple[np.ndarray, np.ndarray], None, None],
]:
"""
Expand All @@ -321,7 +362,12 @@ def _make_cv_split(
if stratified_feature is None:
if group_split_column is not None:
# GROUP SPLIT FUNCTIONALITY
cv = GroupShuffleSplit(n_splits=folds, random_state=random_state)
if use_shuffle_split:
# Use GroupShuffleSplit for flexible test_size
cv = GroupShuffleSplit(n_splits=folds, random_state=random_state)
else:
# Use GroupKFold for exhaustive testing (default)
cv = GroupKFold(n_splits=folds)
cv_func = cv.split(experiments, groups=experiments[group_split_column])
else:
cv = KFold(n_splits=folds, shuffle=True, random_state=random_state)
Expand Down
Loading
Loading