From 704b7125cec640d32173b73937e7fe3d5e51c067 Mon Sep 17 00:00:00 2001
From: uttam12331
Date: Sun, 28 Jun 2026 16:14:57 +0530
Subject: [PATCH] feat(naive_bayes): add CategoricalNB classifier

---
 river/naive_bayes/__init__.py         |   3 +-
 river/naive_bayes/categorical.py      | 179 ++++++++++++++++++++++++++
 river/naive_bayes/test_naive_bayes.py | 115 +++++++++++++++++
 3 files changed, 296 insertions(+), 1 deletion(-)
 create mode 100644 river/naive_bayes/categorical.py

diff --git a/river/naive_bayes/__init__.py b/river/naive_bayes/__init__.py
index b1890dadc4..3f7f820736 100644
--- a/river/naive_bayes/__init__.py
+++ b/river/naive_bayes/__init__.py
@@ -3,8 +3,9 @@
 from __future__ import annotations
 
 from .bernoulli import BernoulliNB
+from .categorical import CategoricalNB
 from .complement import ComplementNB
 from .gaussian import GaussianNB
 from .multinomial import MultinomialNB
 
-__all__ = ["BernoulliNB", "ComplementNB", "GaussianNB", "MultinomialNB"]
+__all__ = ["BernoulliNB", "CategoricalNB", "ComplementNB", "GaussianNB", "MultinomialNB"]
diff --git a/river/naive_bayes/categorical.py b/river/naive_bayes/categorical.py
new file mode 100644
index 0000000000..66e13e5edb
--- /dev/null
+++ b/river/naive_bayes/categorical.py
@@ -0,0 +1,179 @@
+from __future__ import annotations
+
+import collections
+import math
+import typing
+
+from river import utils
+
+from . import base
+
+if typing.TYPE_CHECKING:
+    import pandas as pd
+
+__all__ = ["CategoricalNB"]
+
+
+def _safe_log(p: float) -> float:
+    """Return `log(p)`, mapping a zero probability to `-inf` instead of raising."""
+    return -math.inf if p == 0 else math.log(p)
+
+
+class CategoricalNB(base.BaseNB):
+    """Naive Bayes classifier for categorical features.
+
+    The input vector must contain categorical (discrete) feature values, for instance
+    strings such as `{"weather": "sunny", "wind": "strong"}`. Each feature is assumed to
+    follow a categorical distribution, conditioned on the class. This mirrors scikit-learn's
+    [`CategoricalNB`](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.CategoricalNB.html),
+    but learns incrementally: new feature values (categories) encountered after the first
+    observations are handled gracefully.
+
+    Parameters
+    ----------
+    alpha
+        Additive (Laplace/Lidstone) smoothing parameter. With `alpha=0` no smoothing is
+        applied, so a `(class, category)` pair that was never observed gets a probability of
+        zero, i.e. a log-likelihood of `-inf`.
+
+    Attributes
+    ----------
+    class_counts : collections.Counter
+        Number of times each class has been seen.
+    feature_counts : collections.defaultdict
+        Number of times each `(class, category)` pair has been seen, per feature.
+    category_counts : collections.defaultdict
+        Number of times each category has been seen, per feature. Used to count the
+        number of distinct categories of a feature, which determines the smoothing
+        denominator.
+    feature_class_counts : collections.defaultdict
+        Number of times each class has been seen with a given feature present, per feature.
+        This is the normalization constant of the per-class categorical distribution.
+
+    Examples
+    --------
+
+    >>> from river import naive_bayes
+
+    >>> dataset = [
+    ...     ({"weather": "sunny", "humidity": "high"}, "no"),
+    ...     ({"weather": "sunny", "humidity": "high"}, "no"),
+    ...     ({"weather": "overcast", "humidity": "high"}, "yes"),
+    ...     ({"weather": "rainy", "humidity": "normal"}, "yes"),
+    ...     ({"weather": "rainy", "humidity": "normal"}, "yes"),
+    ...     ({"weather": "overcast", "humidity": "normal"}, "yes"),
+    ... ]
+
+    >>> model = naive_bayes.CategoricalNB(alpha=1)
+
+    >>> for x, y in dataset:
+    ...     model.learn_one(x, y)
+
+    >>> model.p_class("yes")
+    0.666666...
+
+    >>> model.predict_proba_one({"weather": "overcast", "humidity": "normal"})
+    {'no': 0.08, 'yes': 0.92}
+
+    >>> model.predict_one({"weather": "overcast", "humidity": "normal"})
+    'yes'
+
+    References
+    ----------
+    [^1]: [scikit-learn CategoricalNB](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.CategoricalNB.html)
+
+    """
+
+    def __init__(self, alpha=1.0):
+        self.alpha = alpha
+        self.class_counts: collections.Counter = collections.Counter()
+        self.feature_counts: collections.defaultdict = collections.defaultdict(collections.Counter)
+        self.category_counts: collections.defaultdict = collections.defaultdict(collections.Counter)
+        self.feature_class_counts: collections.defaultdict = collections.defaultdict(
+            collections.Counter
+        )
+
+    def learn_one(self, x, y):
+        """Update the model with a single observation.
+
+        Parameters
+        ----------
+        x
+            Dictionary of categorical feature values.
+        y
+            Target class.
+
+        """
+        self.class_counts[y] += 1
+        for f, value in x.items():
+            self.feature_counts[f][(y, value)] += 1
+            self.category_counts[f][value] += 1
+            self.feature_class_counts[f][y] += 1
+
+    @property
+    def classes_(self):
+        return list(self.class_counts.keys())
+
+    def p_class(self, c) -> float:
+        """Prior probability of class `c`, or `0.0` if nothing has been learned yet."""
+        n_seen = sum(self.class_counts.values())
+        return self.class_counts[c] / n_seen if n_seen else 0.0
+
+    def p_feature_given_class(self, f, value, c) -> float:
+        """Probability of a category given a class for a feature, with smoothing.
+
+        The estimate is `(N_fcv + alpha) / (N_fc + alpha * K_f)`, where `N_fcv` is the number
+        of observations of class `c` in which feature `f` took the value `value`, `N_fc` is
+        the number of observations of class `c` in which `f` was present, and `K_f` is the
+        number of distinct categories seen for `f`.
+
+        """
+        n_categories = len(self.category_counts.get(f, ())) or 1
+        num = self.feature_counts.get(f, {}).get((c, value), 0.0) + self.alpha
+        # Normalize by how often `f` was present with class `c` rather than by the class
+        # count: inputs are sparse dicts, so dividing by the class count would under-weight
+        # features that are absent from some observations (and penalize frequent classes
+        # for features that were never seen at all).
+        den = self.feature_class_counts.get(f, {}).get(c, 0.0) + self.alpha * n_categories
+        # With `alpha=0` and no observation of `f` for class `c` there is nothing to
+        # estimate from; report zero instead of dividing by zero.
+        return num / den if den else 0.0
+
+    def joint_log_likelihood(self, x):
+        """Compute the unnormalized posterior log-likelihood of `x`.
+
+        The log-likelihood is `log P(c) + log P(x|c)`. Zero probabilities, which can only
+        occur without smoothing, contribute `-inf` rather than raising.
+
+        """
+        if not self.class_counts:
+            return {}
+        n_seen = sum(self.class_counts.values())
+        return {
+            c: _safe_log(n_c / n_seen)
+            + sum(_safe_log(self.p_feature_given_class(f, value, c)) for f, value in x.items())
+            for c, n_c in self.class_counts.items()
+        }
+
+    def learn_many(self, X: pd.DataFrame, y: pd.Series):
+        """Learn from a batch of observations.
+
+        Parameters
+        ----------
+        X
+            A dataframe of categorical feature values.
+        y
+            A series of target classes.
+
+        """
+        for (_, row), label in zip(X.iterrows(), y):
+            self.learn_one(row.to_dict(), label)
+
+    def joint_log_likelihood_many(self, X: pd.DataFrame) -> pd.DataFrame:
+        """Compute the unnormalized posterior log-likelihood of `X` in mini-batches."""
+        pd = utils.pandas.import_pandas()
+        index = X.index
+        if not self.class_counts:
+            return pd.DataFrame(index=index)
+        records = [self.joint_log_likelihood(row.to_dict()) for _, row in X.iterrows()]
+        return pd.DataFrame(records, index=index, columns=self.classes_)
diff --git a/river/naive_bayes/test_naive_bayes.py b/river/naive_bayes/test_naive_bayes.py
index 83a5acfca8..28fd8021da 100644
--- a/river/naive_bayes/test_naive_bayes.py
+++ b/river/naive_bayes/test_naive_bayes.py
@@ -360,3 +360,118 @@ def test_gaussian_learn_many_not_fit():
     assert model.predict_proba_many(X).equals(pd.DataFrame(index=["river", "rocks"]))
 
     assert model.predict_many(X).equals(pd.DataFrame(index=["river", "rocks"]))
+
+
+def _categorical_dataset(seed, n=80):
+    import random
+
+    rng = random.Random(seed)
+    weather = ["sunny", "overcast", "rainy"]
+    humidity = ["high", "normal"]
+    rows, ys = [], []
+    for _ in range(n):
+        w, h = rng.choice(weather), rng.choice(humidity)
+        rows.append({"weather": w, "humidity": h})
+        ys.append("yes" if (w == "overcast" or h == "normal") else "no")
+    return weather, humidity, rows, ys
+
+
+@pytest.mark.parametrize("alpha", [1.0, 2.0, 3.0])
+def test_categorical_vs_sklearn(alpha):
+    """river's CategoricalNB must match sklearn's CategoricalNB on categorical data."""
+    weather, humidity, rows, ys = _categorical_dataset(seed=42)
+    wmap = {c: i for i, c in enumerate(weather)}
+    hmap = {c: i for i, c in enumerate(humidity)}
+    ymap = {"no": 0, "yes": 1}
+    inv_y = {v: k for k, v in ymap.items()}
+    X = np.array([[wmap[r["weather"]], hmap[r["humidity"]]] for r in rows])
+    Y = np.array([ymap[y] for y in ys])
+
+    river_model = naive_bayes.CategoricalNB(alpha=alpha)
+    for r, y in zip(rows, ys):
+        river_model.learn_one(r, y)
+
+    sk = sk_naive_bayes.CategoricalNB(alpha=alpha).fit(X, Y)
+
+    for r, xrow in zip(rows, X):
+        river_proba = river_model.predict_proba_one(r)
+        sk_proba = sk.predict_proba(xrow.reshape(1, -1))[0]
+        for idx, cls in enumerate(sk.classes_):
+            assert river_proba[inv_y[cls]] == pytest.approx(sk_proba[idx])
+
+
+def test_categorical_learn_many_vs_learn_one():
+    """CategoricalNB.learn_many must yield the same model as repeated learn_one."""
+    _, _, rows, ys = _categorical_dataset(seed=7, n=40)
+
+    one = naive_bayes.CategoricalNB(alpha=1)
+    for r, y in zip(rows, ys):
+        one.learn_one(r, y)
+
+    many = naive_bayes.CategoricalNB(alpha=1)
+    many.learn_many(pd.DataFrame(rows), pd.Series(ys))
+
+    assert one.class_counts == many.class_counts
+    assert one.feature_counts == many.feature_counts
+    assert one.category_counts == many.category_counts
+    assert one.feature_class_counts == many.feature_class_counts
+
+    test_x = {"weather": "overcast", "humidity": "normal"}
+    assert one.predict_proba_one(test_x) == pytest.approx(many.predict_proba_one(test_x))
+
+
+def test_categorical_handles_unseen_feature_value():
+    """An unseen category at predict time must not raise and must stay normalized."""
+    model = naive_bayes.CategoricalNB(alpha=1)
+    for x, y in [
+        ({"weather": "sunny"}, "no"),
+        ({"weather": "rainy"}, "yes"),
+    ]:
+        model.learn_one(x, y)
+
+    proba = model.predict_proba_one({"weather": "snowy"})  # category never seen
+    assert set(proba) == {"no", "yes"}
+    assert sum(proba.values()) == pytest.approx(1.0)
+
+
+def test_categorical_ignores_unseen_feature():
+    """A feature never seen during training must not shift the posterior."""
+    model = naive_bayes.CategoricalNB(alpha=1)
+    for x, y in [
+        ({"weather": "sunny"}, "no"),
+        ({"weather": "sunny"}, "no"),
+        ({"weather": "rainy"}, "yes"),
+    ]:
+        model.learn_one(x, y)
+
+    known = model.predict_proba_one({"weather": "sunny"})
+    extra = model.predict_proba_one({"weather": "sunny", "wind": "strong"})
+    assert extra == pytest.approx(known)
+
+
+def test_categorical_sparse_features_are_normalized():
+    """Category probabilities must sum to one even when a feature is sometimes absent."""
+    model = naive_bayes.CategoricalNB(alpha=1)
+    for x, y in [
+        ({"weather": "sunny", "wind": "strong"}, "no"),
+        ({"weather": "rainy"}, "no"),
+        ({"weather": "rainy", "wind": "weak"}, "no"),
+    ]:
+        model.learn_one(x, y)
+
+    total = sum(model.p_feature_given_class("wind", v, "no") for v in ("strong", "weak"))
+    assert total == pytest.approx(1.0)
+
+
+def test_categorical_no_smoothing():
+    """`alpha=0` must yield zero probabilities for unseen pairs instead of raising."""
+    model = naive_bayes.CategoricalNB(alpha=0)
+    for x, y in [
+        ({"weather": "sunny"}, "no"),
+        ({"weather": "rainy"}, "yes"),
+    ]:
+        model.learn_one(x, y)
+
+    jll = model.joint_log_likelihood({"weather": "sunny"})
+    assert jll["no"] == pytest.approx(np.log(0.5))
+    assert jll["yes"] == float("-inf")