Allow partially filled candidates in the DoE strategy (reviewed revision).

Review notes:
* Line breaks and indentation were restored from a whitespace-collapsed copy
  of this patch; confirm the context indentation before applying.
* The `index` lines keep the blob hashes of the unrevised patch.
* Inputs.validate_experiments: a missing column is now skipped when
  check_missing_cols is False (it used to fall through to a KeyError), and
  the NaN check only looks at columns that exist.
* DoEStrategy._ask: `fixed_experiments_count = 0` is kept so the name is
  bound when no candidates are set, and the duplicated call to
  _transform_candidates_to_new_domain is dropped.
* Commented-out code was replaced by short TODO/NOTE comments.

diff --git a/bofire/data_models/domain/features.py b/bofire/data_models/domain/features.py
index 34461e0a0..4344bf51d 100644
--- a/bofire/data_models/domain/features.py
+++ b/bofire/data_models/domain/features.py
@@ -366,19 +366,48 @@ def validate_candidates(self, candidates: pd.DataFrame) -> pd.DataFrame:
     def validate_experiments(
         self,
         experiments: pd.DataFrame,
-        strict=False,
+        strict: bool = False,
+        check_nan: bool = True,
+        check_missing_cols: bool = True,
     ) -> pd.DataFrame:
+        """Validate the input columns of `experiments` feature by feature.
+
+        The dataframe is modified in place: every present feature column is
+        replaced by the result of the feature's `validate_experimental`.
+
+        Args:
+            experiments: Dataframe with one column per input feature.
+            strict: Forwarded to each feature's `validate_experimental`.
+            check_nan: If True, raise when a present input column holds a
+                missing value.
+            check_missing_cols: If True, raise when a feature has no column
+                in `experiments`; if False, such features are skipped.
+
+        Returns:
+            The validated dataframe.
+
+        Raises:
+            ValueError: If a required column is absent, or if a missing
+                value is found while `check_nan` is True.
+        """
         for feature in self:
             if feature.key not in experiments:
-                raise ValueError(f"no col for input feature `{feature.key}`")
+                if check_missing_cols:
+                    raise ValueError(f"no col for input feature `{feature.key}`")
+                # An omitted column is tolerated here; skip it, because
+                # indexing it below would raise a KeyError.
+                continue
             experiments[feature.key] = feature.validate_experimental(
                 experiments[feature.key],
                 strict=strict,
             )
-        if experiments[self.get_keys()].isnull().to_numpy().any():
-            raise ValueError("there are null values")
-        if experiments[self.get_keys()].isna().to_numpy().any():
-            raise ValueError("there are na values")
+        if check_nan:
+            # Restrict the check to columns that exist so that omitted
+            # columns cannot raise a KeyError.
+            present_keys = [key for key in self.get_keys() if key in experiments]
+            # `isnull` is an alias of `isna`, so a single check covers both.
+            if experiments[present_keys].isna().to_numpy().any():
+                raise ValueError("there are null values")
         return experiments
 
     def get_categorical_combinations(
diff --git a/bofire/strategies/doe/utils_categorical_discrete.py b/bofire/strategies/doe/utils_categorical_discrete.py
index e93c4469c..f0bace6cb 100644
--- a/bofire/strategies/doe/utils_categorical_discrete.py
+++ b/bofire/strategies/doe/utils_categorical_discrete.py
@@ -15,7 +15,7 @@
 from bofire.data_models.features.categorical import CategoricalInput
 from bofire.data_models.features.continuous import ContinuousInput
 from bofire.data_models.features.discrete import DiscreteInput
-from bofire.data_models.features.feature import Feature, Output
+from bofire.data_models.features.feature import Feature, Output, get_encoded_name
 from bofire.data_models.types import DiscreteVals
 
 
@@ -57,8 +57,8 @@ def discrete_to_relaxable_domain_mapper(
     new_constraints = []
     categorical_groups: List[List[ContinuousInput]] = []
     for c_input in categorical_inputs:
-        current_group_keys = list(c_input.categories)  # type: ignore
-        pick_1_constraint, group_vars = generate_mixture_constraints(current_group_keys)
+        assert isinstance(c_input, CategoricalInput)
+        pick_1_constraint, group_vars = generate_mixture_constraints(c_input)
         categorical_groups.append(group_vars)
         relaxable_categorical_inputs.extend(group_vars)
         new_constraints.append(pick_1_constraint)
@@ -487,13 +487,15 @@ def NChooseKGroup(
 
 
 def generate_mixture_constraints(
-    keys: List[str],
+    feature: CategoricalInput,
 ) -> Tuple[LinearEqualityConstraint, List[ContinuousInput]]:
-    binary_vars = (ContinuousInput(key=x, bounds=[0, 1]) for x in keys)
+    # One relaxed [0, 1] variable per category, named by its encoded key.
+    keys = [get_encoded_name(feature.key, category) for category in feature.categories]
+    binary_vars = (ContinuousInput(key=key, bounds=[0, 1]) for key in keys)
 
     mixture_constraint = LinearEqualityConstraint(
         features=keys,
         coefficients=[1 for x in range(len(keys))],
         rhs=1,
     )
 
diff --git a/bofire/strategies/doe_strategy.py b/bofire/strategies/doe_strategy.py
index 20e51d0bc..b94e46ae9 100644
--- a/bofire/strategies/doe_strategy.py
+++ b/bofire/strategies/doe_strategy.py
@@ -4,7 +4,8 @@
 from pydantic.types import PositiveInt
 
 import bofire.data_models.strategies.api as data_models
-from bofire.data_models.features.api import CategoricalInput, Input
+from bofire.data_models.enum import CategoricalEncodingEnum
+from bofire.data_models.features.api import CategoricalInput
 from bofire.data_models.strategies.doe import (
     AnyDoEOptimalityCriterion,
     DoEOptimalityCriterion,
@@ -37,8 +38,7 @@ def __init__(
     ):
         super().__init__(data_model=data_model, **kwargs)
         self.data_model = data_model
-        self._partially_fixed_candidates = None
-        self._fixed_candidates = None
+        self._allow_partially_filled_candidates = True
 
     @property
     def formula(self):
@@ -48,28 +48,6 @@ def formula(self):
             )
         return None
 
-    def set_candidates(self, candidates: pd.DataFrame):
-        original_columns = self.domain.inputs.get_keys(includes=Input)
-        to_many_columns = []
-        for col in candidates.columns:
-            if col not in original_columns:
-                to_many_columns.append(col)
-        if len(to_many_columns) > 0:
-            raise AttributeError(
-                f"provided candidates have columns: {*to_many_columns,}, which do not exist in original domain",
-            )
-
-        to_few_columns = []
-        for col in original_columns:
-            if col not in candidates.columns:
-                to_few_columns.append(col)
-        if len(to_few_columns) > 0:
-            raise AttributeError(
-                f"provided candidates are missing columns: {*to_few_columns,} which exist in original domain",
-            )
-
-        self._candidates = candidates
-
     def _ask(self, candidate_count: PositiveInt) -> pd.DataFrame:  # type: ignore
         all_new_categories = []
 
@@ -88,16 +66,17 @@ def _ask(self, candidate_count: PositiveInt) -> pd.DataFrame:  # type: ignore
         all_new_categories.extend(new_categories)
 
         # here we adapt the (partially) fixed experiments to the new domain
         fixed_experiments_count = 0
         _candidate_count = candidate_count
         adapted_partially_fixed_candidates = self._transform_candidates_to_new_domain(
             new_domain,
             self.candidates,
         )
-
+        # TODO: only self.candidates are conditioned on so far; self.experiments
+        # still has to be adapted to the new domain and included as well.
         if self.candidates is not None:
             fixed_experiments_count = self.candidates.notnull().all(axis=1).sum()
             _candidate_count = candidate_count + fixed_experiments_count
 
         num_binary_vars = len([var for group in new_categories for var in group])
         num_discrete_vars = len(new_discretes)
@@ -183,7 +162,7 @@ def _ask(self, candidate_count: PositiveInt) -> pd.DataFrame:  # type: ignore
                     ignore_index=True,
                 )
                 print(
-                    f"Status: {i+1} of {_candidate_count} experiments determined \n"
+                    f"Status: {i + 1} of {_candidate_count} experiments determined \n"
                     f"Current experimental plan:\n {design_from_new_to_original_domain(self.domain, design)}",
                 )
 
@@ -230,24 +209,17 @@ def _transform_candidates_to_new_domain(self, new_domain, candidates):
             for col in missing_columns:
                 intermediate_candidates.insert(0, col, None)
 
-            cat_columns = self.domain.inputs.get(includes=CategoricalInput)
-            for cat in cat_columns:
-                for row_index, c in enumerate(intermediate_candidates[cat.key].values):
-                    if pd.isnull(c):
-                        continue
-                    if c not in cat.categories:  # type: ignore
-                        raise AttributeError(
-                            f"provided value {c} for categorical variable {cat.key} "
-                            f"does not exist in the corresponding categories {cat.categories}",  # type: ignore
-                        )
-                    intermediate_candidates.loc[row_index, cat.categories] = 0  # type: ignore
-                    intermediate_candidates.loc[row_index, c] = 1
-
-            intermediate_candidates = intermediate_candidates.drop(
-                [cat.key for cat in cat_columns],
-                axis=1,
+            # One-hot encode the categorical columns through the shared
+            # inputs transform instead of a hand-rolled loop.
+            intermediate_candidates = self.domain.inputs.transform(
+                intermediate_candidates,
+                {
+                    key: CategoricalEncodingEnum.ONE_HOT
+                    for key in self.domain.inputs.get_keys(CategoricalInput)
+                },
             )
 
+            # NOTE: rows of `candidates` without missing values are placed first.
             adapted_partially_fixed_candidates = pd.concat(
                 [
                     intermediate_candidates[candidates.notnull().all(axis=1)],
diff --git a/bofire/strategies/strategy.py b/bofire/strategies/strategy.py
index 951cf6379..bfb723911 100644
--- a/bofire/strategies/strategy.py
+++ b/bofire/strategies/strategy.py
@@ -25,6 +25,7 @@ def __init__(
         self.domain = data_model.domain
         self.seed = data_model.seed or np.random.default_rng().integers(1000)
         self.rng = np.random.default_rng(self.seed)
+        self._allow_partially_filled_candidates = False
 
         self._experiments = None
         self._candidates = None
@@ -195,6 +196,10 @@ def set_candidates(self, candidates: pd.DataFrame):
         candidates = self.domain.inputs.validate_experiments(
             candidates[self.domain.inputs.get_keys()],
             strict=False,
+            # Strategies that accept partially filled candidates skip both
+            # the NaN check and the missing-column check.
+            check_nan=not self._allow_partially_filled_candidates,
+            check_missing_cols=not self._allow_partially_filled_candidates,
         )
         self._candidates = candidates[self.domain.inputs.get_keys()]
 