Skip to content
4 changes: 4 additions & 0 deletions bofire/data_models/constraints/nchoosek.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@ def validate_inputs(self, inputs: Inputs):
assert isinstance(
feature_, ContinuousInput
), f"Feature {f} is not a ContinuousInput."
if feature_.bounds[0] < 0:
raise ValueError(
f"Feature {f} must have a lower bound of >=0, but has {feature_.bounds[0]}",
)
Comment thread
jduerholt marked this conversation as resolved.
Outdated

@model_validator(mode="after")
def validate_counts(self):
Expand Down
72 changes: 72 additions & 0 deletions bofire/data_models/domain/domain.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import collections.abc
import itertools
import math
import random
import warnings
from collections.abc import Sequence
from typing import Any, Dict, Literal, Optional, Tuple, Union
Expand Down Expand Up @@ -236,6 +238,76 @@ def get_nchoosek_combinations(self, exhaustive: bool = False):

return used_features_list_final, unused_features_list

def sample_valid_nchoosek_features(
    self,
    rng: random.Random,
    n: int = 1,
    max_iters: int = 1000,
) -> list[Tuple[str, ...]]:
    """Sample sets of active feature keys uniformly from all valid subsets.

    Includes (a) one group per ``NChooseKConstraint`` (respecting
    ``min_count``, ``max_count``, and ``none_also_valid``) and (b) one
    singleton group per ``ContinuousInput`` with ``allow_zero=True`` that
    is not already part of any ``NChooseKConstraint``.

    Within each group the subset size ``k`` is drawn with probability
    proportional to ``C(n, k)`` and ``k`` features are then chosen
    uniformly, so the per-group distribution is uniform over all valid
    subsets. When ``NChooseKConstraint``s share features, the per-group
    union may violate one of the constraints; in that case rejection
    sampling is used (up to ``max_iters`` attempts per drawn combination).

    Args:
        rng: Random number generator used for sampling.
        n: Number of combinations to draw. Defaults to 1.
        max_iters: Maximum number of rejection-sampling attempts per
            drawn combination. Defaults to 1000.

    Returns:
        A list of ``n`` sorted tuples of active feature keys.

    Raises:
        ValueError: If a valid combination is not found within
            ``max_iters`` attempts.
    """
    # One group per constraint: (feature keys, candidate subset sizes,
    # weights proportional to the number of subsets of each size).
    groups: list[Tuple[list[str], list[int], list[int]]] = []
    nchoosek_keys: set[str] = set()
    nchoosek_cons = list(self.constraints.get(NChooseKConstraint))
    for con in nchoosek_cons:
        assert isinstance(con, NChooseKConstraint)
        ks = list(range(con.min_count, con.max_count + 1))
        if con.none_also_valid and 0 not in ks:
            ks.insert(0, 0)
        # Drawing k with probability ~ C(len(features), k) and then k
        # features uniformly makes the group's distribution uniform over
        # all of its valid subsets.
        weights = [math.comb(len(con.features), k) for k in ks]
        groups.append((list(con.features), ks, weights))
        nchoosek_keys.update(con.features)
    for feat in self.inputs.get(ContinuousInput):
        assert isinstance(feat, ContinuousInput)
        if feat.allow_zero and feat.key not in nchoosek_keys:
            # Singleton group: the feature is either off (k=0) or on (k=1).
            groups.append(([feat.key], [0, 1], [1, 1]))

    # Hoisted out of the rejection loop: each constraint's feature set is
    # needed on every validation attempt; building it once avoids
    # reconstructing it up to n * max_iters times per constraint.
    con_feature_sets = [set(con.features) for con in nchoosek_cons]

    results: list[Tuple[str, ...]] = []
    for _ in range(n):
        for _ in range(max_iters):
            active: set[str] = set()
            for features, ks, weights in groups:
                k = rng.choices(ks, weights=weights, k=1)[0]
                active.update(rng.sample(features, k))
            # Overlapping constraints may make the union of per-group
            # draws invalid; accept only if every constraint holds.
            if all(
                (con.none_also_valid and len(active & feats) == 0)
                or con.min_count <= len(active & feats) <= con.max_count
                for con, feats in zip(nchoosek_cons, con_feature_sets)
            ):
                results.append(tuple(sorted(active)))
                break
        else:
            raise ValueError(
                f"Failed to sample a valid NChooseK combination after "
                f"{max_iters} attempts.",
            )
    return results

def coerce_invalids(self, experiments: pd.DataFrame) -> pd.DataFrame:
"""Coerces all invalid output measurements to np.nan

Expand Down
1 change: 1 addition & 0 deletions bofire/data_models/strategies/random.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ class RandomStrategy(Strategy):
n_thinning: Annotated[int, Field(ge=1)] = 32
num_base_samples: Optional[Annotated[int, Field(gt=0)]] = None
max_iters: Annotated[int, Field(gt=0)] = 1000
max_combinations: Annotated[int, Field(gt=0)] = 64
sampler_kwargs: Optional[dict] = None

def is_constraint_implemented(self, my_type: Type[Constraint]) -> bool:
Expand Down
58 changes: 36 additions & 22 deletions bofire/strategies/random.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import math
import random
import warnings
from copy import deepcopy
from typing import Dict, Optional, cast
Expand Down Expand Up @@ -59,6 +60,7 @@ def __init__(
self.fallback_sampling_method = data_model.fallback_sampling_method
self.n_burnin = data_model.n_burnin
self.n_thinning = data_model.n_thinning
self.max_combinations = data_model.max_combinations
self.sampler_kwargs = data_model.sampler_kwargs

def has_sufficient_experiments(self) -> bool:
Expand Down Expand Up @@ -124,32 +126,38 @@ def _sample_with_nchooseks(
pd.DataFrame: A DataFrame containing the sampled data.

"""
if len(self.domain.constraints.get(NChooseKConstraint)) > 0:
_, unused = self.domain.get_nchoosek_combinations()

if candidate_count <= len(unused):
sampled_combinations = [
unused[i]
for i in np.random.default_rng(self._get_seed()).choice(
len(unused),
size=candidate_count,
replace=False,
)
]
num_samples_per_it = 1
else:
sampled_combinations = unused
num_samples_per_it = math.ceil(candidate_count / len(unused))
nchoosek_feature_keys: set[str] = set()
for constraint in self.domain.constraints.get(NChooseKConstraint):
assert isinstance(constraint, NChooseKConstraint)
nchoosek_feature_keys.update(constraint.features)
allow_zero_feature_keys = {
feat.key
for feat in self.domain.inputs.get(ContinuousInput)
if isinstance(feat, ContinuousInput) and feat.allow_zero
}
zeroable_keys = nchoosek_feature_keys | allow_zero_feature_keys

if zeroable_keys:
Comment thread
jduerholt marked this conversation as resolved.
rng = random.Random(self._get_seed())
n_combos = min(self.max_combinations, candidate_count)
drawn = self.domain.sample_valid_nchoosek_features(rng, n=n_combos)
combinations: Dict[tuple, int] = {}
for combo in drawn:
Comment thread
jduerholt marked this conversation as resolved.
Outdated
combinations[combo] = combinations.get(combo, 0) + 1

sampling_multiplier = math.ceil(candidate_count / n_combos)

samples = []
for u in sampled_combinations:
for combo, count in combinations.items():
# create new domain without the nchoosekconstraints
domain = deepcopy(self.domain)
domain.constraints = domain.constraints.get(excludes=NChooseKConstraint)
# fix the unused features
for key in u:
# fix the unselected zeroable features
for key in zeroable_keys - set(combo):
feat = domain.inputs.get_by_key(key=key)
assert isinstance(feat, ContinuousInput)
if feat.allow_zero:
Comment thread
jduerholt marked this conversation as resolved.
Outdated
feat.allow_zero = False
feat.bounds = [0.0, 0.0]
# setup then sampler for this situation
samples.append(
Expand All @@ -159,7 +167,7 @@ def _sample_with_nchooseks(
n_burnin=self.n_burnin,
n_thinning=self.n_thinning,
seed=self._get_seed(),
n=num_samples_per_it,
n=count * sampling_multiplier,
sampler_kwargs=self.sampler_kwargs,
),
)
Expand Down Expand Up @@ -277,7 +285,7 @@ def _sample_from_polytope(
samples = pd.DataFrame(
data=np.nan,
index=range(n),
columns=domain.inputs.get_keys(),
columns=domain.inputs.get_keys(ContinuousInput),
)
else:
bounds = torch.tensor([lower, upper]).to(**tkwargs)
Expand Down Expand Up @@ -334,10 +342,13 @@ def _sample_from_polytope(
)

# setup the categoricals and discrete ones as uniform sampled vals
# we have to make sure here that no fixed ones occur here
samples = pd.concat(
[
samples,
domain.inputs.get([CategoricalInput, DiscreteInput]).sample(
domain.inputs.get([CategoricalInput, DiscreteInput])
.get_free()
.sample(
n,
method=fallback_sampling_method,
seed=seed,
Expand All @@ -350,6 +361,9 @@ def _sample_from_polytope(
# setup the fixed continuous ones
for key, value in fixed_features.items():
samples[key] = value
# setup the fixed discrete/categorical ones
for feat in domain.inputs.get([CategoricalInput, DiscreteInput]).get_fixed():
samples[feat.key] = feat.fixed_value()[0]

return samples[domain.inputs.get_keys()]

Expand Down
Comment thread
jduerholt marked this conversation as resolved.
Outdated
Original file line number Diff line number Diff line change
Expand Up @@ -247,3 +247,102 @@ def test_nchoosek_combinations_nonexhaustive():
c = unittest.TestCase()
c.assertCountEqual(used, expected_used)
c.assertCountEqual(unused, expected_unused)


def test_sample_valid_nchoosek_features_uniform_over_subsets():
    """With one NChooseK on n=5 features and k in [1, 3], there are
    C(5,1)+C(5,2)+C(5,3) = 25 valid subsets. With uniform sampling each
    should appear with frequency ~1/25.
    """
    import random
    from collections import Counter

    inputs = [ContinuousInput(key=f"x{i}", bounds=(0, 1)) for i in range(5)]
    constraint = NChooseKConstraint(
        features=[f"x{i}" for i in range(5)],
        min_count=1,
        max_count=3,
        none_also_valid=False,
    )
    domain = Domain(
        inputs=Inputs(features=inputs),
        constraints=Constraints(constraints=[constraint]),
    )
    n_samples = 25_000
    samples = domain.sample_valid_nchoosek_features(random.Random(0), n=n_samples)
    # Counter replaces the hand-rolled dict-based tally.
    counts = Counter(samples)
    assert len(counts) == 25, f"Expected 25 unique subsets, got {len(counts)}"
    expected = n_samples / 25
    for subset, count in counts.items():
        rel = abs(count - expected) / expected
        assert (
            rel < 0.20
        ), f"Subset {subset} count {count} too far from expected {expected:.0f}"


def test_sample_valid_nchoosek_features_none_also_valid():
    """When none_also_valid=True, the empty subset is in the support."""
    import random

    keys = ["x0", "x1", "x2"]
    feats = [ContinuousInput(key=k, bounds=(0, 1)) for k in keys]
    con = NChooseKConstraint(
        features=keys,
        min_count=2,
        max_count=3,
        none_also_valid=True,
    )
    dom = Domain(
        inputs=Inputs(features=feats),
        constraints=Constraints(constraints=[con]),
    )
    observed = set(dom.sample_valid_nchoosek_features(random.Random(1), n=2000))
    # Support: the empty subset plus C(3,2) + C(3,3) = 1 + 3 + 1 = 5 subsets.
    assert () in observed
    assert len(observed) == 5


def test_sample_valid_nchoosek_features_allow_zero_singletons():
    """Without any NChooseK, allow_zero=True features form singleton groups."""
    import random

    dom = Domain(
        inputs=Inputs(
            features=[
                ContinuousInput(key="a", bounds=(0.1, 1), allow_zero=True),
                ContinuousInput(key="b", bounds=(0.1, 1), allow_zero=True),
                ContinuousInput(key="c", bounds=(0.1, 1)),
            ]
        )
    )
    drawn = dom.sample_valid_nchoosek_features(random.Random(2), n=2000)
    # "a" and "b" toggle independently (4 subsets); "c" never appears.
    assert set(drawn) == {(), ("a",), ("b",), ("a", "b")}


def test_sample_valid_nchoosek_features_empty_returns_empty_tuple():
    """Domain without NChooseK and without allow_zero features yields ()."""
    import random

    dom = Domain(inputs=Inputs(features=[ContinuousInput(key="x", bounds=(0, 1))]))
    drawn = dom.sample_valid_nchoosek_features(random.Random(3), n=4)
    # No groups exist, so every draw is the empty combination.
    assert drawn == [()] * 4


def test_sample_valid_nchoosek_features_default_n_is_one():
    """Default returns a list of length 1."""
    import random

    feats = [ContinuousInput(key=f"x{i}", bounds=(0, 1)) for i in range(3)]
    con = NChooseKConstraint(
        features=["x0", "x1", "x2"],
        min_count=1,
        max_count=2,
        none_also_valid=False,
    )
    dom = Domain(
        inputs=Inputs(features=feats),
        constraints=Constraints(constraints=[con]),
    )
    drawn = dom.sample_valid_nchoosek_features(random.Random(0))
    assert len(drawn) == 1
Loading
Loading