experimental-design · jduerholt · Feb 6, 2025
diff --git a/bofire/data_models/domain/features.py b/bofire/data_models/domain/features.py
@@ -366,19 +366,49 @@ def validate_candidates(self, candidates: pd.DataFrame) -> pd.DataFrame:
     def validate_experiments(
         self,
         experiments: pd.DataFrame,
-        strict=False,
+        strict: bool = False,
+        check_nan: bool = True,
+        check_missing_cols: bool = True,
     ) -> pd.DataFrame:
+        """Validate the input feature columns of an experiments dataframe.
+
+        Each feature column present in `experiments` is passed through the
+        feature's `validate_experimental` and written back in place.
+
+        Args:
+            experiments: Dataframe holding one column per input feature.
+            strict: Forwarded to each feature's `validate_experimental`.
+            check_nan: If True, raise when any present feature column holds
+                a null/NaN value.
+            check_missing_cols: If True, raise when a feature has no column;
+                if False, such features are skipped.
+
+        Returns:
+            The dataframe with the validated feature columns.
+
+        Raises:
+            ValueError: If a required column is missing, or if null values
+                are found while `check_nan` is True.
+        """
         for feature in self:
             if feature.key not in experiments:
-                raise ValueError(f"no col for input feature `{feature.key}`")
+                if check_missing_cols:
+                    raise ValueError(f"no col for input feature `{feature.key}`")
+                # Missing columns are tolerated in this mode; indexing the
+                # absent column below would raise a KeyError, so skip it.
+                continue
             experiments[feature.key] = feature.validate_experimental(
                 experiments[feature.key],
                 strict=strict,
             )
-        if experiments[self.get_keys()].isnull().to_numpy().any():
-            raise ValueError("there are null values")
-        if experiments[self.get_keys()].isna().to_numpy().any():
-            raise ValueError("there are na values")
+        if check_nan:
+            # Only inspect columns that exist: with check_missing_cols=False
+            # some feature columns may be absent from the dataframe.
+            present_keys = [key for key in self.get_keys() if key in experiments]
+            if experiments[present_keys].isnull().to_numpy().any():
+                raise ValueError("there are null values")
+            if experiments[present_keys].isna().to_numpy().any():
+                raise ValueError("there are na values")
         return experiments
 
     def get_categorical_combinations(

diff --git a/bofire/strategies/doe/utils_categorical_discrete.py b/bofire/strategies/doe/utils_categorical_discrete.py
@@ -15,7 +15,7 @@
 from bofire.data_models.features.categorical import CategoricalInput
 from bofire.data_models.features.continuous import ContinuousInput
 from bofire.data_models.features.discrete import DiscreteInput
-from bofire.data_models.features.feature import Feature, Output
+from bofire.data_models.features.feature import Feature, Output, get_encoded_name
 from bofire.data_models.types import DiscreteVals
 
 
@@ -57,8 +57,8 @@ def discrete_to_relaxable_domain_mapper(
     new_constraints = []
     categorical_groups: List[List[ContinuousInput]] = []
     for c_input in categorical_inputs:
-        current_group_keys = list(c_input.categories)  # type: ignore
-        pick_1_constraint, group_vars = generate_mixture_constraints(current_group_keys)
+        assert isinstance(c_input, CategoricalInput)
+        pick_1_constraint, group_vars = generate_mixture_constraints(c_input)
         categorical_groups.append(group_vars)
         relaxable_categorical_inputs.extend(group_vars)
         new_constraints.append(pick_1_constraint)
@@ -487,13 +487,18 @@ def NChooseKGroup(
 
 
 def generate_mixture_constraints(
-    keys: List[str],
+    feature: CategoricalInput,
 ) -> Tuple[LinearEqualityConstraint, List[ContinuousInput]]:
-    binary_vars = (ContinuousInput(key=x, bounds=[0, 1]) for x in keys)
+    binary_vars = (
+        ContinuousInput(key=get_encoded_name(feature.key, category), bounds=[0, 1])
+        for category in feature.categories
+    )
 
     mixture_constraint = LinearEqualityConstraint(
-        features=keys,
-        coefficients=[1 for x in range(len(keys))],
+        features=[
+            get_encoded_name(feature.key, category) for category in feature.categories
+        ],
+        coefficients=[1 for _ in range(len(feature.categories))],
         rhs=1,
     )
 

diff --git a/bofire/strategies/doe_strategy.py b/bofire/strategies/doe_strategy.py
@@ -4,7 +4,8 @@
 from pydantic.types import PositiveInt
 
 import bofire.data_models.strategies.api as data_models
-from bofire.data_models.features.api import CategoricalInput, Input
+from bofire.data_models.enum import CategoricalEncodingEnum
+from bofire.data_models.features.api import CategoricalInput
 from bofire.data_models.strategies.doe import (
     AnyDoEOptimalityCriterion,
     DoEOptimalityCriterion,
@@ -37,8 +38,7 @@ def __init__(
     ):
         super().__init__(data_model=data_model, **kwargs)
         self.data_model = data_model
-        self._partially_fixed_candidates = None
-        self._fixed_candidates = None
+        self._allow_partially_filled_candidates = True
 
     @property
     def formula(self):
@@ -48,28 +48,6 @@ def formula(self):
             )
         return None
 
-    def set_candidates(self, candidates: pd.DataFrame):
-        original_columns = self.domain.inputs.get_keys(includes=Input)
-        to_many_columns = []
-        for col in candidates.columns:
-            if col not in original_columns:
-                to_many_columns.append(col)
-        if len(to_many_columns) > 0:
-            raise AttributeError(
-                f"provided candidates have columns: {*to_many_columns,},  which do not exist in original domain",
-            )
-
-        to_few_columns = []
-        for col in original_columns:
-            if col not in candidates.columns:
-                to_few_columns.append(col)
-        if len(to_few_columns) > 0:
-            raise AttributeError(
-                f"provided candidates are missing columns: {*to_few_columns,} which exist in original domain",
-            )
-
-        self._candidates = candidates
-
     def _ask(self, candidate_count: PositiveInt) -> pd.DataFrame:  # type: ignore
         all_new_categories = []
 
@@ -88,16 +66,29 @@ def _ask(self, candidate_count: PositiveInt) -> pd.DataFrame:  # type: ignore
             all_new_categories.extend(new_categories)
 
         # here we adapt the (partially) fixed experiments to the new domain
-        fixed_experiments_count = 0
         _candidate_count = candidate_count
         adapted_partially_fixed_candidates = self._transform_candidates_to_new_domain(
             new_domain,
             self.candidates,
         )
-
+        # TODO: conditioning on self.experiments is not working yet;
+        # for now only self.candidates are taken into account.
         if self.candidates is not None:
             fixed_experiments_count = self.candidates.notnull().all(axis=1).sum()
             _candidate_count = candidate_count + fixed_experiments_count
+            adapted_partially_fixed_candidates = (
+                self._transform_candidates_to_new_domain(
+                    new_domain,
+                    self.candidates,
+                )
+            )
+
+        # TODO: also adapt the experiments; commented out for now to satisfy ruff
+        # if self.experiments is not None:
+        #     adapted_fixed_experiments = self._transform_candidates_to_new_domain(
+        #         new_domain,
+        #         self.experiments,
+        #     )
 
         num_binary_vars = len([var for group in new_categories for var in group])
         num_discrete_vars = len(new_discretes)
@@ -183,7 +174,7 @@ def _ask(self, candidate_count: PositiveInt) -> pd.DataFrame:  # type: ignore
                     ignore_index=True,
                 )
                 print(
-                    f"Status: {i+1} of {_candidate_count} experiments determined \n"
+                    f"Status: {i + 1} of {_candidate_count} experiments determined \n"
                     f"Current experimental plan:\n {design_from_new_to_original_domain(self.domain, design)}",
                 )
 
@@ -230,24 +221,34 @@ def _transform_candidates_to_new_domain(self, new_domain, candidates):
             for col in missing_columns:
                 intermediate_candidates.insert(0, col, None)
 
-            cat_columns = self.domain.inputs.get(includes=CategoricalInput)
-            for cat in cat_columns:
-                for row_index, c in enumerate(intermediate_candidates[cat.key].values):
-                    if pd.isnull(c):
-                        continue
-                    if c not in cat.categories:  # type: ignore
-                        raise AttributeError(
-                            f"provided value {c} for categorical variable {cat.key} "
-                            f"does not exist in the corresponding categories {cat.categories}",  # type: ignore
-                        )
-                    intermediate_candidates.loc[row_index, cat.categories] = 0  # type: ignore
-                    intermediate_candidates.loc[row_index, c] = 1
-
-            intermediate_candidates = intermediate_candidates.drop(
-                [cat.key for cat in cat_columns],
-                axis=1,
+            # this is doing the one-hot encoding in a well tested way
+            intermediate_candidates = self.domain.inputs.transform(
+                intermediate_candidates,
+                {
+                    key: CategoricalEncodingEnum.ONE_HOT
+                    for key in self.domain.inputs.get_keys(CategoricalInput)
+                },
             )
 
+            # cat_columns = self.domain.inputs.get(includes=CategoricalInput)
+            # for cat in cat_columns:
+            #     for row_index, c in enumerate(intermediate_candidates[cat.key].values):
+            #         if pd.isnull(c):
+            #             continue
+            #         if c not in cat.categories:  # type: ignore
+            #             raise AttributeError(
+            #                 f"provided value {c} for categorical variable {cat.key} "
+            #                 f"does not exist in the corresponding categories {cat.categories}",  # type: ignore
+            #             )
+            #         intermediate_candidates.loc[row_index, cat.categories] = 0  # type: ignore
+            #         intermediate_candidates.loc[row_index, c] = 1
+
+            # intermediate_candidates = intermediate_candidates.drop(
+            #     [cat.key for cat in cat_columns],
+            #     axis=1,
+            # )
+
+            # NOTE(review): intent unclear; rows without nulls are placed first.
             adapted_partially_fixed_candidates = pd.concat(
                 [
                     intermediate_candidates[candidates.notnull().all(axis=1)],

diff --git a/bofire/strategies/strategy.py b/bofire/strategies/strategy.py
@@ -25,6 +25,7 @@ def __init__(
         self.domain = data_model.domain
         self.seed = data_model.seed or np.random.default_rng().integers(1000)
         self.rng = np.random.default_rng(self.seed)
+        self._allow_partially_filled_candidates = False
         self._experiments = None
         self._candidates = None
 
@@ -195,6 +196,8 @@ def set_candidates(self, candidates: pd.DataFrame):
         candidates = self.domain.inputs.validate_experiments(
             candidates[self.domain.inputs.get_keys()],
             strict=False,
+            check_nan=self._allow_partially_filled_candidates is False,
+            check_missing_cols=self._allow_partially_filled_candidates is False,
         )
         self._candidates = candidates[self.domain.inputs.get_keys()]