diff --git a/bofire/data_models/constraints/nchoosek.py b/bofire/data_models/constraints/nchoosek.py index aa41d47ce..cb2bf5bbe 100644 --- a/bofire/data_models/constraints/nchoosek.py +++ b/bofire/data_models/constraints/nchoosek.py @@ -41,6 +41,14 @@ def validate_inputs(self, inputs: Inputs): assert isinstance( feature_, ContinuousInput ), f"Feature {f} is not a ContinuousInput." + if not ( + feature_.bounds[0] == 0 + or (feature_.bounds[0] > 0 and feature_.allow_zero) + ): + raise ValueError( + f"Feature {f} must have a lower bound of 0 or `allow_zero=True`, " + f"but has bounds[0]={feature_.bounds[0]} and allow_zero={feature_.allow_zero}", + ) @model_validator(mode="after") def validate_counts(self): diff --git a/bofire/data_models/domain/domain.py b/bofire/data_models/domain/domain.py index bef8a9230..8482a7e50 100644 --- a/bofire/data_models/domain/domain.py +++ b/bofire/data_models/domain/domain.py @@ -1,5 +1,4 @@ import collections.abc -import itertools import warnings from collections.abc import Sequence from typing import Any, Dict, Literal, Optional, Tuple, Union @@ -14,7 +13,6 @@ Constraint, ConstraintNotFulfilledError, InterpointConstraint, - NChooseKConstraint, ) from bofire.data_models.domain.constraints import Constraints from bofire.data_models.domain.features import Inputs, Outputs @@ -133,109 +131,6 @@ def validate_constraints(self): c.validate_inputs(self.inputs) return self - # TODO: tidy this up - def get_nchoosek_combinations(self, exhaustive: bool = False): - """Get all possible NChooseK combinations - - Args: - exhaustive (bool, optional): if True all combinations are returned. Defaults to False. - - Returns: - Tuple(used_features_list, unused_features_list): used_features_list is a list of lists containing features used in each NChooseK combination. - unused_features_list is a list of lists containing features unused in each NChooseK combination. - - """ - if len(self.constraints.get(NChooseKConstraint)) == 0: - used_continuous_features = self.inputs.get_keys(ContinuousInput) - return used_continuous_features, [] - - used_features_list_all = [] - - # loops through each NChooseK constraint - for con in self.constraints.get(NChooseKConstraint): - assert isinstance(con, NChooseKConstraint) - used_features_list = [] - - if exhaustive: - for n in range(con.min_count, con.max_count + 1): - used_features_list.extend(itertools.combinations(con.features, n)) - - if con.none_also_valid: - used_features_list.append(()) - else: - used_features_list.extend( - itertools.combinations(con.features, con.max_count), - ) - - used_features_list_all.append(used_features_list) - - used_features_list_all = list( - itertools.product(*used_features_list_all), - ) # product between NChooseK constraints - - # format into a list of used features - used_features_list_formatted = [] - for used_features_list in used_features_list_all: - used_features_list_flattened = [ - item for sublist in used_features_list for item in sublist - ] - used_features_list_formatted.append(list(set(used_features_list_flattened))) - - # sort lists - used_features_list_sorted = [] - for used_features in used_features_list_formatted: - used_features_list_sorted.append(sorted(used_features)) - - # drop duplicates - used_features_list_no_dup = [] - for used_features in used_features_list_sorted: - if used_features not in used_features_list_no_dup: - used_features_list_no_dup.append(used_features) - - # remove combinations not fulfilling constraints - used_features_list_final = [] - for combo in used_features_list_no_dup: - fulfil_constraints = [] # list of bools tracking if constraints are fulfilled - for con in self.constraints.get(NChooseKConstraint): - assert isinstance(con, NChooseKConstraint) - count = 0 # count of features in combo that are in con.features - for f in combo: - if f in con.features: - count += 1 - if ( - count >= con.min_count - and count <= con.max_count - or count == 0 - and con.none_also_valid - ): - fulfil_constraints.append(True) - else: - fulfil_constraints.append(False) - if np.all(fulfil_constraints): - used_features_list_final.append(combo) - - # features unused - features_in_cc = [] - for con in self.constraints.get(NChooseKConstraint): - assert isinstance(con, NChooseKConstraint) - features_in_cc.extend(con.features) - features_in_cc = list(set(features_in_cc)) - features_in_cc.sort() - unused_features_list = [] - for used_features in used_features_list_final: - unused_features_list.append( - [f_key for f_key in features_in_cc if f_key not in used_features], - ) - - # postprocess - # used_features_list_final2 = [] - # unused_features_list2 = [] - # for used, unused in zip(used_features_list_final,unused_features_list): - # if len(used) == 3: - # used_features_list_final2.append(used), unused_features_list2.append(unused) - - return used_features_list_final, unused_features_list - def coerce_invalids(self, experiments: pd.DataFrame) -> pd.DataFrame: """Coerces all invalid output measurements to np.nan diff --git a/bofire/data_models/strategies/random.py b/bofire/data_models/strategies/random.py index d1f0326d8..7e2284253 100644 --- a/bofire/data_models/strategies/random.py +++ b/bofire/data_models/strategies/random.py @@ -19,12 +19,61 @@ class RandomStrategy(Strategy): + """Strategy for drawing random samples from a domain subject to its constraints. + + Sampling proceeds in four regimes, picked automatically based on the + constraints present in the domain: + + 1. **Unconstrained / categorical-only domains.** Samples are drawn directly + from each input feature using ``fallback_sampling_method`` (uniform, + Sobol, or LHS). + 2. **Linear/interpoint-constrained domains.** A hit-and-run polytope sampler + (``botorch.sample_q_batches_from_polytope``) draws candidates that + satisfy linear equality, linear inequality, and interpoint equality + constraints. Categorical and discrete inputs are sampled independently + with the fallback method. + 3. **NChooseK and/or ``allow_zero`` features.** The strategy draws up to + ``max_combinations`` distinct active-feature subsets uniformly from all + valid subsets (one group per ``NChooseKConstraint``, plus one singleton + group per ``ContinuousInput`` with ``allow_zero=True`` outside any + NChooseK). For each drawn subset the unselected zeroable features are + pinned to ``0`` and the remaining features are sampled via the polytope + sampler. Final candidates are concatenated and uniformly subsampled. + 4. **Nonlinear or product constraints.** When constraints are present that + cannot be enforced directly by the polytope sampler, regimes 1-3 are + used as a proposal distribution and rejection sampling filters + candidates until ``candidate_count`` valid samples are found. + + Attributes: + fallback_sampling_method: Sampling method for unconstrained / fixed + inputs and for categorical and discrete features. + n_burnin: Burn-in length for the hit-and-run polytope sampler. + n_thinning: Thinning factor for the hit-and-run polytope sampler. + num_base_samples: Batch size used when drawing proposals during + rejection sampling. If ``None``, the requested ``candidate_count`` + is used. + max_iters: Maximum number of rejection-sampling iterations before the + strategy gives up. Each iteration draws ``num_base_samples`` + candidates. + max_combinations: Maximum number of distinct active-feature subsets to + draw per ``ask`` when NChooseK or ``allow_zero`` features are + present. Larger values give a better mix of subsets at the cost of + more polytope-sampler invocations. + nchoosek_max_iters: Maximum number of rejection-sampling attempts when + drawing a single valid active-feature subset under overlapping + ``NChooseKConstraint``s. Independent from ``max_iters``. + sampler_kwargs: Extra keyword arguments forwarded to the fallback + sampler (e.g. ``{"scramble": True}`` for Sobol). + """ + type: Literal["RandomStrategy"] = "RandomStrategy" fallback_sampling_method: SamplingMethodEnum = SamplingMethodEnum.UNIFORM n_burnin: Annotated[int, Field(ge=1)] = 1000 n_thinning: Annotated[int, Field(ge=1)] = 32 num_base_samples: Optional[Annotated[int, Field(gt=0)]] = None max_iters: Annotated[int, Field(gt=0)] = 1000 + max_combinations: Annotated[int, Field(gt=0)] = 64 + nchoosek_max_iters: Annotated[int, Field(gt=0)] = 1000 sampler_kwargs: Optional[dict] = None def is_constraint_implemented(self, my_type: Type[Constraint]) -> bool: diff --git a/bofire/strategies/random.py b/bofire/strategies/random.py index 46b38569b..2f22bbb3a 100644 --- a/bofire/strategies/random.py +++ b/bofire/strategies/random.py @@ -1,7 +1,9 @@ import math +import random import warnings +from collections import Counter from copy import deepcopy -from typing import Dict, Optional, cast +from typing import Dict, List, Optional, Tuple, cast import numpy as np import pandas as pd @@ -59,6 +61,8 @@ def __init__( self.fallback_sampling_method = data_model.fallback_sampling_method self.n_burnin = data_model.n_burnin self.n_thinning = data_model.n_thinning + self.max_combinations = data_model.max_combinations + self.nchoosek_max_iters = data_model.nchoosek_max_iters self.sampler_kwargs = data_model.sampler_kwargs def has_sufficient_experiments(self) -> bool: @@ -111,6 +115,30 @@ def _ask(self, candidate_count: PositiveInt) -> pd.DataFrame: n_iters += 1 return pd.concat(valid_samples, ignore_index=True).iloc[:candidate_count] + @staticmethod + def _get_zeroable_keys(domain: Domain) -> Tuple[set[str], set[str]]: + """Collect feature keys that can take the value zero. + + Returns: + A tuple ``(nchoosek_keys, allow_zero_singleton_keys)`` where + ``nchoosek_keys`` are all feature keys appearing in any + ``NChooseKConstraint`` and ``allow_zero_singleton_keys`` are the + keys of ``ContinuousInput``s with ``allow_zero=True`` that are + not already part of an ``NChooseKConstraint``. + """ + nchoosek_keys: set[str] = set() + for constraint in domain.constraints.get(NChooseKConstraint): + assert isinstance(constraint, NChooseKConstraint) + nchoosek_keys.update(constraint.features) + allow_zero_singleton_keys = { + feat.key + for feat in domain.inputs.get(ContinuousInput) + if isinstance(feat, ContinuousInput) + and feat.allow_zero + and feat.key not in nchoosek_keys + } + return nchoosek_keys, allow_zero_singleton_keys + def _sample_with_nchooseks( self, candidate_count: int, @@ -124,34 +152,39 @@ def _sample_with_nchooseks( pd.DataFrame: A DataFrame containing the sampled data. """ - if len(self.domain.constraints.get(NChooseKConstraint)) > 0: - _, unused = self.domain.get_nchoosek_combinations() - - if candidate_count <= len(unused): - sampled_combinations = [ - unused[i] - for i in np.random.default_rng(self._get_seed()).choice( - len(unused), - size=candidate_count, - replace=False, - ) - ] - num_samples_per_it = 1 - else: - sampled_combinations = unused - num_samples_per_it = math.ceil(candidate_count / len(unused)) + nchoosek_keys, allow_zero_keys = self._get_zeroable_keys(self.domain) + zeroable_keys = nchoosek_keys | allow_zero_keys + + if zeroable_keys: + # Draw a uniform sample of valid active-feature subsets (one per + # NChooseK constraint plus one per allow_zero singleton). We draw + # at most `max_combinations` distinct subsets; their multiplicities + # in `drawn` determine how many polytope samples each subset gets. + n_combos = min(self.max_combinations, candidate_count) + drawn = self.sample_valid_nchoosek_features( + domain=self.domain, + seed=self._get_seed(), + n=n_combos, + max_iters=self.nchoosek_max_iters, + ) + combinations = Counter(drawn) + + # Each sampled subset gets `count * sampling_multiplier` polytope + # samples, so the total before final resampling is at least + # `candidate_count`. + sampling_multiplier = math.ceil(candidate_count / n_combos) samples = [] - for u in sampled_combinations: - # create new domain without the nchoosekconstraints + for combo, count in combinations.items(): + # Clone the domain and turn the NChooseK problem into a plain + # polytope problem: drop the NChooseK constraints, then pin + # every zeroable feature that wasn't selected to bounds [0, 0]. domain = deepcopy(self.domain) domain.constraints = domain.constraints.get(excludes=NChooseKConstraint) - # fix the unused features - for key in u: + for key in zeroable_keys - set(combo): feat = domain.inputs.get_by_key(key=key) assert isinstance(feat, ContinuousInput) feat.bounds = [0.0, 0.0] - # setup then sampler for this situation samples.append( self._sample_from_polytope( domain=domain, @@ -159,7 +192,7 @@ def _sample_with_nchooseks( n_burnin=self.n_burnin, n_thinning=self.n_thinning, seed=self._get_seed(), - n=num_samples_per_it, + n=count * sampling_multiplier, sampler_kwargs=self.sampler_kwargs, ), ) @@ -181,6 +214,79 @@ def _sample_with_nchooseks( sampler_kwargs=self.sampler_kwargs, ) + @staticmethod + def sample_valid_nchoosek_features( + domain: Domain, + seed: Optional[int] = None, + n: int = 1, + max_iters: int = 1000, + ) -> List[Tuple[str, ...]]: + """Sample sets of active feature keys uniformly from all valid subsets. + + Includes (a) one group per ``NChooseKConstraint`` (respecting + ``min_count``, ``max_count``, and ``none_also_valid``) and (b) one + singleton group per ``ContinuousInput`` with ``allow_zero=True`` that + is not already part of any ``NChooseKConstraint``. + + Within each group the subset size ``k`` is drawn with probability + proportional to ``C(n, k)`` and ``k`` features are then chosen + uniformly, so the per-group distribution is uniform over all valid + subsets. When ``NChooseKConstraint``s share features, the per-group + union may violate one of the constraints; in that case rejection + sampling is used (up to ``max_iters`` attempts per drawn combination). + + Args: + domain: Domain to sample from. + seed: Random seed used to initialise the internal sampler. + Defaults to ``None`` (non-deterministic). + n: Number of combinations to draw. Defaults to 1. + max_iters: Maximum number of rejection-sampling attempts per + drawn combination. Defaults to 1000. + + Returns: + A list of ``n`` sorted tuples of active feature keys. + + Raises: + ValueError: If a valid combination is not found within + ``max_iters`` attempts. + """ + rng = random.Random(seed) + groups: List[Tuple[List[str], List[int], List[int]]] = [] + nchoosek_cons = list(domain.constraints.get(NChooseKConstraint)) + con_feature_sets: List[set[str]] = [] + for con in nchoosek_cons: + assert isinstance(con, NChooseKConstraint) + ks = list(range(con.min_count, con.max_count + 1)) + if con.none_also_valid and 0 not in ks: + ks.insert(0, 0) + weights = [math.comb(len(con.features), k) for k in ks] + groups.append((con.features, ks, weights)) + con_feature_sets.append(set(con.features)) + _, allow_zero_keys = RandomStrategy._get_zeroable_keys(domain) + for key in allow_zero_keys: + groups.append(([key], [0, 1], [1, 1])) + + results: List[Tuple[str, ...]] = [] + for _ in range(n): + for _ in range(max_iters): + active: set[str] = set() + for features, ks, weights in groups: + k = rng.choices(ks, weights=weights, k=1)[0] + active.update(rng.sample(features, k)) + if all( + (con.none_also_valid and len(active & fset) == 0) + or con.min_count <= len(active & fset) <= con.max_count + for con, fset in zip(nchoosek_cons, con_feature_sets) + ): + results.append(tuple(sorted(active))) + break + else: + raise ValueError( + f"Failed to sample a valid NChooseK combination after " + f"{max_iters} attempts.", + ) + return results + @staticmethod def _sample_from_polytope( domain: Domain, @@ -277,7 +383,7 @@ def _sample_from_polytope( samples = pd.DataFrame( data=np.nan, index=range(n), - columns=domain.inputs.get_keys(), + columns=domain.inputs.get_keys(ContinuousInput), ) else: bounds = torch.tensor([lower, upper]).to(**tkwargs) @@ -334,10 +440,13 @@ def _sample_from_polytope( ) # setup the categoricals and discrete ones as uniform sampled vals + # we have to make sure here that no fixed ones occur here samples = pd.concat( [ samples, - domain.inputs.get([CategoricalInput, DiscreteInput]).sample( + domain.inputs.get([CategoricalInput, DiscreteInput]) + .get_free() + .sample( n, method=fallback_sampling_method, seed=seed, @@ -350,6 +459,9 @@ def _sample_from_polytope( # setup the fixed continuous ones for key, value in fixed_features.items(): samples[key] = value + # setup the fixed discrete/categorical ones + for feat in domain.inputs.get([CategoricalInput, DiscreteInput]).get_fixed(): + samples[feat.key] = feat.fixed_value()[0] return samples[domain.inputs.get_keys()] @@ -362,6 +474,8 @@ def make( n_thinning: int | None = None, num_base_samples: int | None = None, max_iters: int | None = None, + max_combinations: int | None = None, + nchoosek_max_iters: int | None = None, seed: int | None = None, sampler_kwargs: Optional[dict] = None, ) -> Self: @@ -373,6 +487,8 @@ def make( n_thinning: The thinning factor for the polytope sampler. num_base_samples: The number of base samples for rejection sampling. max_iters: The maximum number of iterations for rejection sampling. + max_combinations: The maximum number of distinct NChooseK feature combinations to draw per ask. + nchoosek_max_iters: The maximum number of rejection-sampling attempts per drawn NChooseK combination. seed: The seed value for random number generation. sampler_kwargs: Additional arguments for the sampler. Defaults to None. Returns: diff --git a/bofire/surrogates/fully_bayesian.py b/bofire/surrogates/fully_bayesian.py index e39b2fffc..c515bdabd 100644 --- a/bofire/surrogates/fully_bayesian.py +++ b/bofire/surrogates/fully_bayesian.py @@ -3,7 +3,7 @@ import numpy as np import pandas as pd import torch -from botorch import fit_fully_bayesian_model_nuts +from botorch.fit import fit_fully_bayesian_model_nuts from botorch.models.fully_bayesian import ( FullyBayesianLinearSingleTaskGP, FullyBayesianSingleTaskGP, diff --git a/tests/bofire/data_models/domain/test_domain_nchoosek_combinatorics.py b/tests/bofire/data_models/domain/test_domain_nchoosek_combinatorics.py deleted file mode 100644 index d9017b198..000000000 --- a/tests/bofire/data_models/domain/test_domain_nchoosek_combinatorics.py +++ /dev/null @@ -1,249 +0,0 @@ -import unittest - -import numpy as np -import pandas as pd -import pytest - -from bofire.data_models.constraints.api import NChooseKConstraint -from bofire.data_models.domain.api import Constraints, Domain, Inputs -from bofire.data_models.features.api import ( - CategoricalDescriptorInput, - CategoricalInput, - ContinuousInput, - ContinuousOutput, -) - - -# NChooseKConstraint constraints 1 -cc1a = NChooseKConstraint( - features=["0", "1", "2", "3"], - min_count=2, - max_count=3, - none_also_valid=True, -) -cc2a = NChooseKConstraint( - features=["2", "3", "4", "5"], - min_count=1, - max_count=2, - none_also_valid=True, -) - -# NChooseKConstraint constraints 2 -cc1b = NChooseKConstraint( - features=["0", "1", "2", "3"], - min_count=2, - max_count=3, - none_also_valid=False, -) -cc2b = NChooseKConstraint( - features=["2", "3", "4", "5"], - min_count=1, - max_count=2, - none_also_valid=True, -) - -# NChooseKConstraint constraint 3 -cc3 = [ - NChooseKConstraint( - features=["0", "1", "2", "3"], - min_count=2, - max_count=3, - none_also_valid=True, - ), -] - -# input features -continuous_inputs = [] -for i in range(6): - f = ContinuousInput(key=str(i), bounds=(0, 1)) - continuous_inputs.append(f) -categorical_feature = CategoricalInput( - key="categorical_feature", - categories=["c1", "c2"], -) -categorical_descriptor_feature = CategoricalDescriptorInput( - key="categorical_descriptor_feature", - categories=["cd1", "cd2"], - descriptors=["d1", "d2"], - values=[[1.0, 1.0], [2.0, 2.0]], -) - -# output feature -outputs = [ContinuousOutput(key="y")] - - -""" -TEST CASES: - -CASE 1: 6 continuous features, 2 overlapping NChooseKConstraint constraints, none_also_valid: True, True -CASE 2: 6 continuous features, 2 overlapping NChooseKConstraint constraints, none_also_valid: False, True -""" - -# CASE 1 -test_features_used_1 = [ - ["0", "1", "2"], - ["0", "1", "3"], - ["0", "1", "4"], - ["0", "1", "5"], - ["0", "1", "2", "4"], - ["0", "1", "2", "5"], - ["0", "1", "3", "4"], - ["0", "1", "3", "5"], - ["0", "1", "4", "5"], - ["0", "1"], - ["0", "2"], - ["0", "2", "3"], - ["0", "2", "4"], - ["0", "2", "5"], - ["0", "3"], - ["0", "3", "4"], - ["0", "3", "5"], - ["1", "2"], - ["1", "2", "3"], - ["1", "2", "4"], - ["1", "2", "5"], - ["1", "3"], - ["1", "3", "4"], - ["1", "3", "5"], - ["2", "3"], - ["4"], - ["5"], - ["4", "5"], - [], -] -test_features_unused_1 = [] -for used in test_features_used_1: - unused = [f for f in ["0", "1", "2", "3", "4", "5"] if f not in used] - test_features_unused_1.append(unused) - -# CASE 2 -test_features_used_2 = [ - ["0", "1", "2"], - ["0", "1", "3"], - ["0", "1", "4"], - ["0", "1", "5"], - ["0", "1", "2", "4"], - ["0", "1", "2", "5"], - ["0", "1", "3", "4"], - ["0", "1", "3", "5"], - ["0", "1", "4", "5"], - ["0", "1"], - ["0", "2"], - ["0", "2", "3"], - ["0", "2", "4"], - ["0", "2", "5"], - ["0", "3"], - ["0", "3", "4"], - ["0", "3", "5"], - ["1", "2"], - ["1", "2", "3"], - ["1", "2", "4"], - ["1", "2", "5"], - ["1", "3"], - ["1", "3", "4"], - ["1", "3", "5"], - ["2", "3"], -] -test_features_unused_2 = [] -for used in test_features_used_2: - unused = [f for f in ["0", "1", "2", "3", "4", "5"] if f not in used] - test_features_unused_2.append(unused) - -# experiments -experiments = pd.DataFrame( - np.random.uniform(size=(24, 7)), - columns=["0", "1", "2", "3", "4", "5", "y"], -) -experiments["categorical_feature"] = ["c1"] * 12 + ["c2"] * 12 -experiments["categorical_descriptor_feature"] = (["cd1"] * 6 + ["cd2"] * 6) * 2 -experiments["valid_y"] = 1 - - -##### LIST OF TASTE CASES ##### - -test_cases = [] - -# CASE 1 -test_case = {} -domain = Domain.from_lists( - inputs=continuous_inputs, - outputs=outputs, - constraints=[cc1a, cc2a], -) -test_case["domain"] = domain -test_case["experiments"] = experiments -test_case["descriptor_method"] = None -test_case["categorical_method"] = None -test_case["descriptor_encoding"] = None -test_case["categorical_encoding"] = None -test_case["test_features_used"] = test_features_used_1 -test_case["test_features_unused"] = test_features_unused_1 -test_cases.append(test_case) - -# CASE 2 -test_case = {} -domain = Domain.from_lists( - continuous_inputs, - outputs, - constraints=[cc1b, cc2b], -) -test_case["domain"] = domain -test_case["experiments"] = experiments -test_case["descriptor_method"] = None -test_case["categorical_method"] = None -test_case["descriptor_encoding"] = None -test_case["categorical_encoding"] = None -test_case["test_features_used"] = test_features_used_2 -test_case["test_features_unused"] = test_features_unused_2 -test_cases.append(test_case) - - -@pytest.mark.parametrize("test_case", test_cases) -def test_nchoosek_combinations_completeness(test_case): - domain = test_case["domain"] - features_used, features_unused = domain.get_nchoosek_combinations(exhaustive=True) - for features in test_case["test_features_used"]: - assert features in features_used - for features in test_case["test_features_unused"]: - assert features in features_unused - - -def test_nchoosek_combinations_nonexhaustive(): - domain = Domain( - inputs=Inputs( - features=[ - ContinuousInput(key=f"if{i + 1}", bounds=(0, 1)) for i in range(6) - ], - ), - constraints=Constraints( - constraints=[ - NChooseKConstraint( - features=[f"if{i + 1}" for i in range(4)], - min_count=0, - max_count=2, - none_also_valid=True, - ), - ], - ), - ) - used, unused = domain.get_nchoosek_combinations(exhaustive=False) - expected_used = [ - ["if1", "if2"], - ["if1", "if3"], - ["if1", "if4"], - ["if2", "if3"], - ["if2", "if4"], - ["if3", "if4"], - ] - expected_unused = [ - ["if3", "if4"], - ["if2", "if4"], - ["if2", "if3"], - ["if1", "if4"], - ["if1", "if3"], - ["if1", "if2"], - ] - # print(combos, expected_combos) - c = unittest.TestCase() - c.assertCountEqual(used, expected_used) - c.assertCountEqual(unused, expected_unused) diff --git a/tests/bofire/data_models/specs/domain.py b/tests/bofire/data_models/specs/domain.py index 47bf3b036..3c8da9219 100644 --- a/tests/bofire/data_models/specs/domain.py +++ b/tests/bofire/data_models/specs/domain.py @@ -107,8 +107,8 @@ def create_spec(c): return lambda: { "inputs": Inputs( features=[ - features.valid(ContinuousInput).obj(key="i1"), - features.valid(ContinuousInput).obj(key="i2"), + features.valid(ContinuousInput).obj(key="i1", bounds=[0, 1]), + features.valid(ContinuousInput).obj(key="i2", bounds=[0, 1]), ], ), "outputs": Outputs( @@ -146,3 +146,34 @@ def create_spec(c): error=ValueError, message="Feature i3 is not a continuous input feature in the provided Inputs object.", ) + + +# NChooseK on a feature whose lower bound is non-zero and which is not allow_zero +specs.add_invalid( + Domain, + lambda: { + "inputs": Inputs( + features=[ + ContinuousInput(key="i1", bounds=(0.5, 1.0)), + ContinuousInput(key="i2", bounds=(0, 1)), + ], + ), + "outputs": Outputs( + features=[ + features.valid(ContinuousOutput).obj(key="o1"), + ], + ), + "constraints": Constraints( + constraints=[ + NChooseKConstraint( + features=["i1", "i2"], + min_count=1, + max_count=1, + none_also_valid=False, + ), + ], + ), + }, + error=ValueError, + message="Feature i1 must have a lower bound of 0 or `allow_zero=True`", +) diff --git a/tests/bofire/strategies/doe/test_design.py b/tests/bofire/strategies/doe/test_design.py index bef253261..e43a9d8e5 100644 --- a/tests/bofire/strategies/doe/test_design.py +++ b/tests/bofire/strategies/doe/test_design.py @@ -171,9 +171,9 @@ def test_find_local_max_ipopt_mixed_results(): ), NChooseKConstraint( features=[f"x{i + 1}" for i in range(3)], - min_count=0, + min_count=1, max_count=1, - none_also_valid=True, + none_also_valid=False, ), ], ) @@ -311,28 +311,6 @@ def test_find_local_max_ipopt_fixed_experiments(): ), ], ) - # np.random.seed(1) - # fixed_experiments = pd.DataFrame([[0.3, 0.5, 0.2]], columns=["x1", "x2", "x3"]) - # A = find_local_max_ipopt( - # domain, - # "linear", - # n_experiments=12, - # fixed_experiments=fixed_experiments, - # ) - # opt = np.array( - # [ - # [0.2, 0.2, 0.6], - # [0.3, 0.6, 0.1], - # [0.7, 0.1, 0.2], - # [0.3, 0.1, 0.6], - # [0.3, 0.5, 0.2], - # ] - # ) - # for row in A.to_numpy(): - # assert any([np.allclose(row, o, atol=1e-2) for o in opt]) - # for o in opt[:-1]: - # assert any([np.allclose(o, row, atol=1e-2) for row in A.to_numpy()]) - # assert np.allclose(A.to_numpy()[0, :], np.array([0.3, 0.5, 0.2])) # define domain: no NChooseK constraints, invalid proposal np.random.seed(1) @@ -376,9 +354,9 @@ def test_find_local_max_ipopt_fixed_experiments(): ), NChooseKConstraint( features=[f"x{i + 1}" for i in range(3)], - min_count=0, + min_count=1, max_count=1, - none_also_valid=True, + none_also_valid=False, ), ], ) diff --git a/tests/bofire/strategies/doe/test_utils.py b/tests/bofire/strategies/doe/test_utils.py index 112749c12..ee54a4981 100644 --- a/tests/bofire/strategies/doe/test_utils.py +++ b/tests/bofire/strategies/doe/test_utils.py @@ -649,7 +649,10 @@ def test_check_nchoosek_constraints_as_bounds(): # NChooseK with non-zero lower bounds should be allowed: inactive variables # are pinned to [0, 0] by the bounds formulation regardless of the original lb. domain = Domain.from_lists( - inputs=[ContinuousInput(key=f"x{i + 1}", bounds=(0.1, 1)) for i in range(4)], + inputs=[ + ContinuousInput(key=f"x{i + 1}", bounds=(0.1, 1), allow_zero=True) + for i in range(4) + ], outputs=[ContinuousOutput(key="y")], constraints=[ NChooseKConstraint( @@ -664,8 +667,8 @@ def test_check_nchoosek_constraints_as_bounds(): domain = Domain.from_lists( inputs=[ - ContinuousInput(key=f"x{1}", bounds=(0.1, 1.0)), - ContinuousInput(key=f"x{2}", bounds=(0.1, 1.0)), + ContinuousInput(key=f"x{1}", bounds=(0.1, 1.0), allow_zero=True), + ContinuousInput(key=f"x{2}", bounds=(0.1, 1.0), allow_zero=True), ContinuousInput(key=f"x{3}", bounds=(0.1, 1.0)), ContinuousInput(key=f"x{4}", bounds=(0.1, 1.0)), ], @@ -1017,7 +1020,8 @@ def test_nchoosek_bounds_none_also_valid(): # --- Case 1: none_also_valid=False with min_count=0 --- d_no_none = Domain.from_lists( inputs=[ - ContinuousInput(key=f"x{i}", bounds=(1.0, 2.0)) for i in range(n_features) + ContinuousInput(key=f"x{i}", bounds=(1.0, 2.0), allow_zero=True) + for i in range(n_features) ], outputs=[ContinuousOutput(key="y")], constraints=[ @@ -1059,7 +1063,8 @@ def test_nchoosek_bounds_none_also_valid(): # (it is handled at validation level by is_fulfilled / domain.py). d_with_none = Domain.from_lists( inputs=[ - ContinuousInput(key=f"x{i}", bounds=(1.0, 2.0)) for i in range(n_features) + ContinuousInput(key=f"x{i}", bounds=(1.0, 2.0), allow_zero=True) + for i in range(n_features) ], outputs=[ContinuousOutput(key="y")], constraints=[ @@ -1091,7 +1096,8 @@ def test_nchoosek_bounds_none_also_valid(): # get the C(3,1) = 3 patterns with exactly 2 active features. d_min_gt_zero = Domain.from_lists( inputs=[ - ContinuousInput(key=f"x{i}", bounds=(1.0, 2.0)) for i in range(n_features) + ContinuousInput(key=f"x{i}", bounds=(1.0, 2.0), allow_zero=True) + for i in range(n_features) ], outputs=[ContinuousOutput(key="y")], constraints=[ diff --git a/tests/bofire/strategies/test_random.py b/tests/bofire/strategies/test_random.py index 1a02f5911..ab2f736ff 100644 --- a/tests/bofire/strategies/test_random.py +++ b/tests/bofire/strategies/test_random.py @@ -213,8 +213,8 @@ def test_nchoosek(): If7 = ContinuousInput(bounds=(1, 1), key="If7") c2 = LinearInequalityConstraint.from_greater_equal( - features=["if1", "if2"], - coefficients=[1.0, 1.0], + features=["if1", "if2", "if3"], + coefficients=[1.0, 1.0, 1.0], rhs=0.2, ) @@ -225,8 +225,8 @@ def test_nchoosek(): none_also_valid=False, ) c7 = LinearEqualityConstraint( - features=["if1", "if2"], - coefficients=[1.0, 1.0], + features=["if1", "if2", "if3"], + coefficients=[1.0, 1.0, 1.0], rhs=1.0, ) domain = Domain.from_lists( @@ -239,6 +239,47 @@ def test_nchoosek(): assert len(samples) == 50 +def test_allow_zero_without_nchoosek(): + """Test random sampling with allow_zero features but no NChooseK constraint.""" + if1 = ContinuousInput(bounds=(0.1, 1), key="if1", allow_zero=True) + if2 = ContinuousInput(bounds=(0.1, 1), key="if2", allow_zero=True) + if3 = ContinuousInput(bounds=(0.1, 1), key="if3") + domain = Domain.from_lists(inputs=[if1, if2, if3]) + data_model = data_models.RandomStrategy(domain=domain) + sampler = strategies.RandomStrategy(data_model=data_model) + samples = sampler.ask(50) + assert len(samples) == 50 + # if3 should never be zero (not allow_zero) + assert (samples["if3"] != 0.0).all() + # if1 and if2 should have some zeros (allow_zero) + assert (samples["if1"] == 0.0).any() or (samples["if2"] == 0.0).any() + + +def test_allow_zero_with_nchoosek(): + """Test that allow_zero features already in NChooseK don't get duplicate groups.""" + if1 = ContinuousInput(bounds=(0, 1), key="if1") + if2 = ContinuousInput(bounds=(0, 1), key="if2") + if3 = ContinuousInput(bounds=(0, 1), key="if3") + if4 = ContinuousInput(bounds=(0.1, 1), key="if4", allow_zero=True) + c = NChooseKConstraint( + features=["if1", "if2", "if3"], + min_count=1, + max_count=2, + none_also_valid=False, + ) + domain = Domain.from_lists(inputs=[if1, if2, if3, if4], constraints=[c]) + data_model = data_models.RandomStrategy(domain=domain) + sampler = strategies.RandomStrategy(data_model=data_model) + samples = sampler.ask(50) + assert len(samples) == 50 + # At most 2 features should be non-zero per sample (from NChooseK) + nonzero_counts = (samples[["if1", "if2", "if3"]] != 0.0).sum(axis=1) + assert (nonzero_counts >= 1).all() + assert (nonzero_counts <= 2).all() + # if4 (allow_zero, not in NChooseK) should have some zeros + assert (samples["if4"] == 0.0).any() + + def test_sample_from_polytope(): if1 = ContinuousInput( bounds=(0, 1), @@ -297,3 +338,77 @@ def test_sampler_kwargs_various_methods(method, kwargs, n_samples): sampler_strategy = strategies.RandomStrategy(data_model=sampler_data_model) candidates = sampler_strategy.ask(n_samples) assert len(candidates) == n_samples + + +def test_sample_valid_nchoosek_features_uniform_over_subsets(): + """With one NChooseK on n=5 features and k in [1, 3], there are + C(5,1)+C(5,2)+C(5,3) = 25 valid subsets. With uniform sampling each + should appear with frequency ~1/25. + """ + inputs = [ContinuousInput(key=f"x{i}", bounds=(0, 1)) for i in range(5)] + constraint = NChooseKConstraint( + features=[f"x{i}" for i in range(5)], + min_count=1, + max_count=3, + none_also_valid=False, + ) + domain = Domain.from_lists(inputs=inputs, constraints=[constraint]) + n_samples = 25_000 + samples = strategies.RandomStrategy.sample_valid_nchoosek_features( + domain=domain, seed=0, n=n_samples + ) + counts: dict = {} + for s in samples: + counts[s] = counts.get(s, 0) + 1 + assert len(counts) == 25, f"Expected 25 unique subsets, got {len(counts)}" + expected = n_samples / 25 + for subset, count in counts.items(): + rel = abs(count - expected) / expected + assert ( + rel < 0.20 + ), f"Subset {subset} count {count} too far from expected {expected:.0f}" + + +def test_sample_valid_nchoosek_features_none_also_valid(): + """When none_also_valid=True, the empty subset is in the support.""" + inputs = [ContinuousInput(key=f"x{i}", bounds=(0, 1)) for i in range(3)] + constraint = NChooseKConstraint( + features=["x0", "x1", "x2"], + min_count=2, + max_count=3, + none_also_valid=True, + ) + domain = Domain.from_lists(inputs=inputs, constraints=[constraint]) + samples = strategies.RandomStrategy.sample_valid_nchoosek_features( + domain=domain, seed=1, n=2000 + ) + unique = set(samples) + # Valid subsets: () + C(3,2) + C(3,3) = 1 + 3 + 1 = 5 + assert len(unique) == 5 + assert () in unique + + +def test_sample_valid_nchoosek_features_allow_zero_singletons(): + """Without any NChooseK, allow_zero=True features form singleton groups.""" + inputs = [ + ContinuousInput(key="a", bounds=(0.1, 1), allow_zero=True), + ContinuousInput(key="b", bounds=(0.1, 1), allow_zero=True), + ContinuousInput(key="c", bounds=(0.1, 1)), + ] + domain = Domain.from_lists(inputs=inputs) + samples = strategies.RandomStrategy.sample_valid_nchoosek_features( + domain=domain, seed=2, n=2000 + ) + unique = set(samples) + # Each of {a, b} can be on or off independently -> 4 subsets + assert unique == {(), ("a",), ("b",), ("a", "b")} + + +def test_sample_valid_nchoosek_features_empty_returns_empty_tuple(): + """Domain without NChooseK and without allow_zero features yields ().""" + inputs = [ContinuousInput(key="x", bounds=(0, 1))] + domain = Domain.from_lists(inputs=inputs) + samples = strategies.RandomStrategy.sample_valid_nchoosek_features( + domain=domain, seed=3, n=4 + ) + assert samples == [(), (), (), ()]