Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion river/naive_bayes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
from __future__ import annotations

from .bernoulli import BernoulliNB
from .categorical import CategoricalNB
from .complement import ComplementNB
from .gaussian import GaussianNB
from .multinomial import MultinomialNB

__all__ = ["BernoulliNB", "ComplementNB", "GaussianNB", "MultinomialNB"]
__all__ = ["BernoulliNB", "CategoricalNB", "ComplementNB", "GaussianNB", "MultinomialNB"]
148 changes: 148 additions & 0 deletions river/naive_bayes/categorical.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
from __future__ import annotations

import collections
import math
import typing

from river import utils

from . import base

if typing.TYPE_CHECKING:
import pandas as pd

__all__ = ["CategoricalNB"]


class CategoricalNB(base.BaseNB):
"""Naive Bayes classifier for categorical features.

The input vector must contain categorical (discrete) feature values, for instance
strings such as `{"weather": "sunny", "wind": "strong"}`. Each feature is assumed to
follow a categorical distribution, conditioned on the class. This mirrors scikit-learn's
[`CategoricalNB`](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.CategoricalNB.html),
but learns incrementally: new feature values (categories) encountered after the first
observations are handled gracefully.

Parameters
----------
alpha
Additive (Laplace/Lidstone) smoothing parameter (use 0 for no smoothing).

Attributes
----------
class_counts : collections.Counter
Number of times each class has been seen.
feature_counts : collections.defaultdict
Number of times each `(class, category)` pair has been seen, per feature.
category_counts : collections.defaultdict
Number of times each category has been seen, per feature. Used to count the
number of distinct categories of a feature, which determines the smoothing
denominator.

Examples
--------

>>> from river import naive_bayes

>>> dataset = [
... ({"weather": "sunny", "humidity": "high"}, "no"),
... ({"weather": "sunny", "humidity": "high"}, "no"),
... ({"weather": "overcast", "humidity": "high"}, "yes"),
... ({"weather": "rainy", "humidity": "normal"}, "yes"),
... ({"weather": "rainy", "humidity": "normal"}, "yes"),
... ({"weather": "overcast", "humidity": "normal"}, "yes"),
... ]

>>> model = naive_bayes.CategoricalNB(alpha=1)

>>> for x, y in dataset:
... model.learn_one(x, y)

>>> model.p_class("yes")
0.666666...

>>> model.predict_proba_one({"weather": "overcast", "humidity": "normal"})
{'no': 0.08, 'yes': 0.92}

>>> model.predict_one({"weather": "overcast", "humidity": "normal"})
'yes'

References
----------
[^1]: [scikit-learn CategoricalNB](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.CategoricalNB.html)

"""

def __init__(self, alpha=1.0):
    """Store the smoothing parameter and create the empty count tables.

    Parameters
    ----------
    alpha
        Additive smoothing parameter; stored unchanged on the instance.

    """
    self.alpha = alpha

    # Tally of observations per class.
    self.class_counts: collections.Counter = collections.Counter()

    # Both per-feature tables lazily create an empty Counter the first time a
    # feature name is indexed.
    counter_factory = collections.Counter
    self.feature_counts: collections.defaultdict = collections.defaultdict(counter_factory)
    self.category_counts: collections.defaultdict = collections.defaultdict(counter_factory)

def learn_one(self, x, y):
    """Record a single labelled observation in the count tables.

    Parameters
    ----------
    x
        Dictionary of categorical feature values.
    y
        Target class.

    """
    self.class_counts[y] += 1
    for feature, category in x.items():
        # Joint (class, category) tally for this feature.
        pair_counts = self.feature_counts[feature]
        pair_counts[(y, category)] += 1
        # Class-independent tally; its size is the number of known categories.
        seen_categories = self.category_counts[feature]
        seen_categories[category] += 1

@property
def classes_(self):
    """List of the classes seen so far, in the order `class_counts` holds them."""
    return [*self.class_counts.keys()]

def p_class(self, c) -> float:
    """Return the share of all counted observations that belong to class `c`."""
    n_for_class = self.class_counts[c]
    n_total = sum(self.class_counts.values())
    return n_for_class / n_total

def p_feature_given_class(self, f, value, c) -> float:
    """Smoothed probability of category `value` for feature `f` given class `c`.

    The estimate is `(count(c, value) + alpha) / (count(c) + alpha * n_categories)`,
    where `n_categories` is the number of distinct categories recorded for `f`
    (taken as 1 when the feature has no recorded category).
    """
    # `.get` keeps the lookup read-only: an unknown feature must not be inserted
    # into the defaultdict tables.
    known_categories = self.category_counts.get(f, ())
    n_categories = len(known_categories)
    if not n_categories:
        n_categories = 1

    pair_counts = self.feature_counts.get(f, {})
    smoothed_count = pair_counts.get((c, value), 0.0) + self.alpha
    smoothed_total = self.class_counts[c] + self.alpha * n_categories
    return smoothed_count / smoothed_total

def joint_log_likelihood(self, x):
    """Compute the unnormalized posterior log-likelihood of `x`.

    The log-likelihood is `log P(c) + log P(x|c)`.

    Parameters
    ----------
    x
        Dictionary of categorical feature values.

    Returns
    -------
    A dictionary mapping each known class to its log-likelihood, or an empty
    dictionary when no class has been observed yet. A class for which some feature
    value has probability zero (possible when `alpha` is 0) gets `-inf` rather than
    triggering a `math.log(0)` domain error.

    """
    if not self.class_counts:
        return {}

    def log_or_neg_inf(p):
        # Only an exact zero is special-cased; any other invalid value still
        # surfaces through math.log.
        return -math.inf if p == 0 else math.log(p)

    return {
        c: math.log(self.p_class(c))
        + sum(log_or_neg_inf(self.p_feature_given_class(f, value, c)) for f, value in x.items())
        for c in self.classes_
    }

def learn_many(self, X: pd.DataFrame, y: pd.Series):
    """Learn from a batch of observations.

    Counts are updated one column at a time rather than one row at a time. Rows of
    `X` and entries of `y` are paired by position, not by index label.

    Parameters
    ----------
    X
        A dataframe of categorical feature values.
    y
        A series of target classes.

    Raises
    ------
    ValueError
        If `X` and `y` do not hold the same number of observations.

    """
    # TODO: reviewers asked for Narwhals support here; mini-batch methods are being
    # moved from pandas to Narwhals.
    labels = list(y)
    if len(labels) != len(X):
        raise ValueError(
            f"X and y must have the same number of observations, got {len(X)} and {len(labels)}"
        )
    if not labels:
        # Indexing the defaultdict tables would register features with no data.
        return

    self.class_counts.update(labels)
    for f, column in X.items():
        values = column.tolist()
        # Counter.update tallies every element of the iterable it receives.
        self.feature_counts[f].update(zip(labels, values))
        self.category_counts[f].update(values)
Comment on lines +138 to +139

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do not want mini-batch methods to be for loops over the inputs. Mini-batch methods must use vectorization, else they're not bringing anything to the table.


def joint_log_likelihood_many(self, X: pd.DataFrame) -> pd.DataFrame:
    """Compute the unnormalized posterior log-likelihood of every row of `X`.

    Each row is converted to a dictionary and scored with `joint_log_likelihood`.
    The result is indexed like `X` and has one column per known class; when no class
    has been observed yet it has no columns.
    """
    pd = utils.pandas.import_pandas()
    index = X.index
    if self.class_counts:
        per_row = []
        for _, row in X.iterrows():
            per_row.append(self.joint_log_likelihood(row.to_dict()))
        return pd.DataFrame(per_row, index=index, columns=self.classes_)
    # Nothing learned yet: keep the index so callers can still align on it.
    return pd.DataFrame(index=index)
70 changes: 70 additions & 0 deletions river/naive_bayes/test_naive_bayes.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,3 +360,73 @@ def test_gaussian_learn_many_not_fit():

assert model.predict_proba_many(X).equals(pd.DataFrame(index=["river", "rocks"]))
assert model.predict_many(X).equals(pd.DataFrame(index=["river", "rocks"]))


def _categorical_dataset(seed, n=80):
    """Build a reproducible toy dataset of `n` weather/humidity observations.

    Returns the weather categories, the humidity categories, the feature dictionaries
    and the matching "yes"/"no" labels. A row is labelled "yes" when the weather is
    "overcast" or the humidity is "normal".
    """
    import random

    rng = random.Random(seed)
    weather = ["sunny", "overcast", "rainy"]
    humidity = ["high", "normal"]

    rows = []
    ys = []
    for _ in range(n):
        # Weather is drawn before humidity; the order of RNG calls fixes the data.
        sampled_weather = rng.choice(weather)
        sampled_humidity = rng.choice(humidity)
        rows.append({"weather": sampled_weather, "humidity": sampled_humidity})
        is_positive = sampled_weather == "overcast" or sampled_humidity == "normal"
        ys.append("yes" if is_positive else "no")
    return weather, humidity, rows, ys


@pytest.mark.parametrize("alpha", [1.0, 2.0, 3.0])
def test_categorical_vs_sklearn(alpha):
    """river's CategoricalNB must match sklearn's CategoricalNB on categorical data."""
    weather, humidity, rows, ys = _categorical_dataset(seed=42)

    # scikit-learn is fed integer codes: each category is encoded by its list position.
    weather_code = {category: code for code, category in enumerate(weather)}
    humidity_code = {category: code for code, category in enumerate(humidity)}
    label_code = {"no": 0, "yes": 1}
    label_of_code = {code: label for label, code in label_code.items()}

    X = np.array([[weather_code[row["weather"]], humidity_code[row["humidity"]]] for row in rows])
    Y = np.array([label_code[label] for label in ys])

    river_model = naive_bayes.CategoricalNB(alpha=alpha)
    for row, label in zip(rows, ys):
        river_model.learn_one(row, label)

    sk_model = sk_naive_bayes.CategoricalNB(alpha=alpha)
    sk_model = sk_model.fit(X, Y)

    # Compare the predicted probabilities class by class on every training row.
    for row, encoded_row in zip(rows, X):
        river_proba = river_model.predict_proba_one(row)
        sk_proba = sk_model.predict_proba(encoded_row.reshape(1, -1))[0]
        for position, cls in enumerate(sk_model.classes_):
            assert river_proba[label_of_code[cls]] == pytest.approx(sk_proba[position])


def test_categorical_learn_many_vs_learn_one():
    """CategoricalNB.learn_many must yield the same model as repeated learn_one."""
    _, _, rows, ys = _categorical_dataset(seed=7, n=40)

    one = naive_bayes.CategoricalNB(alpha=1)
    for r, y in zip(rows, ys):
        one.learn_one(r, y)

    many = naive_bayes.CategoricalNB(alpha=1)
    many.learn_many(pd.DataFrame(rows), pd.Series(ys))

    # All three count tables feed the probability estimates, so each must agree;
    # category_counts sets the smoothing denominator.
    assert one.class_counts == many.class_counts
    assert one.feature_counts == many.feature_counts
    assert one.category_counts == many.category_counts

    test_x = {"weather": "overcast", "humidity": "normal"}
    assert one.predict_proba_one(test_x) == pytest.approx(many.predict_proba_one(test_x))


def test_categorical_handles_unseen_feature_value():
    """An unseen category at predict time must not raise and must stay normalized."""
    model = naive_bayes.CategoricalNB(alpha=1)
    model.learn_one({"weather": "sunny"}, "no")
    model.learn_one({"weather": "rainy"}, "yes")

    # "snowy" was never observed for the "weather" feature.
    unseen = {"weather": "snowy"}
    proba = model.predict_proba_one(unseen)

    assert set(proba) == {"no", "yes"}
    total = sum(proba.values())
    assert total == pytest.approx(1.0)