Feature Categorization¶
Feature categorization is the first step in Pilz's algorithm. Each feature is binned into n_cat categories to enable multi-dimensional correlation analysis.
Why Categorization?¶
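Before we can build correlation tables for feature combinations, we need discrete bins: a correlation table counts rows per combination of bin labels, so every feature must first be reduced to a small, finite set of categories.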
Two Types of Binning¶
The feat_cater() dispatcher routes each feature to the appropriate binning method based on its type:
# src/pilz/service/train.py:204-213
def feat_cater(self, feat: Feature, train_df: TrainDataframes, n: int) -> CategorizedFeature:
    """Route one feature to the binning routine that matches its statistical type.

    Args:
        feat: Feature to bin; ``feat.statistical`` selects the routine.
        train_df: Training dataframes, forwarded unchanged to the routine.
        n: Requested number of categories, forwarded unchanged.

    Returns:
        The result of ``cat_cater_impl`` for categorical features or of
        ``cont_cater_impl`` for numerical features.
    """
    # NOTE(review): no fallback case is shown in this excerpt, so any other
    # FeatureType would fall through and return None implicitly.
    match feat.statistical:
        case FeatureType.CATEGORIAL:
            return self.cat_cater_impl(feat=feat, train_df=train_df, n=n)
        case FeatureType.NUMERICAL:
            return self.cont_cater_impl(feat=feat, train_df=train_df, n=n)
Numerical Features: Quantile Binning¶
For continuous values, cont_cater_impl() creates bins of (approximately) equal total weight by interpolating quantile boundaries from the cumulative weights:
# src/pilz/service/train.py:278-310
def cont_cater_impl(self, feat: Feature, train_df: TrainDataframes, n: int) -> CategorizedFeature:
    """Bin a numerical feature at weighted quantile boundaries.

    Sorts ``train_df.df_group`` by the feature value, accumulates the
    ``weight`` column, and derives up to ``n - 1`` cut points, one per
    quantile ``i / n``. Duplicate cut points are dropped, so fewer than
    ``n`` bins can result.

    Args:
        feat: Feature whose ``name`` column is binned.
        train_df: Source of ``df_group`` and of ``create_categorized_features``.
        n: Requested number of bins.

    Returns:
        Whatever ``train_df.create_categorized_features`` returns for the
        computed ``ContCats``.
    """
    df_sorted = train_df.df_group[[feat.name, "weight"]].sort(feat.name)
    cumulative_weights = df_sorted["weight"].cum_sum()
    res = []
    # Bin labels are "0", "1", ...; there is always one more label than cuts.
    label = ["0"]
    idx = 0
    for i in range(1, n):
        # NOTE(review): the quantile is compared directly with the raw
        # cumulative weights, which presumes the weights sum to 1 — confirm.
        quantile = i / n
        # First row whose cumulative weight reaches the quantile.
        index = (cumulative_weights >= quantile).arg_true()[0]
        if index > 0:
            # Interpolate linearly between the neighbouring feature values,
            # in proportion to where the quantile falls between their
            # cumulative weights.
            weight_below = cumulative_weights[index - 1]
            value_below = df_sorted[feat.name][index - 1]
            value_at_index = df_sorted[feat.name][index]
            fraction = (quantile - weight_below) / (
                cumulative_weights[index] - weight_below
            )
            value = value_below + fraction * (value_at_index - value_below)
        else:
            # The very first row already reaches the quantile; nothing below
            # it to interpolate with.
            value = df_sorted[feat.name][0]
        # Skip a boundary that was already emitted so cuts stay unique and
        # the label list stays in step with them.
        if value in res:
            continue
        idx += 1
        label.append(f"{idx}")
        res.append(value)
    return train_df.create_categorized_features(
        feat=feat,
        cuts=ContCats(cuts=res, labels=label, feat_name=feat.name),
    )
The quantile boundaries are stored in the ContCats class, which uses pl.Series.cut() to apply them to new data:
# src/pilz/model/dataframes.py:110-144
class ContCats(BaseModel):
    """Stored cut points and bin labels for one numerical feature."""

    # Break points handed to Series.cut.
    cuts: list[float]
    # Bin labels handed to Series.cut (polars expects one more label than break points).
    labels: list[str]
    # Name of the feature these cuts belong to.
    feat_name: str

    def cut(self, df: pl.Series) -> pl.Series:
        """Bin the values of ``df`` with the stored break points and labels."""
        return df.cut(self.cuts, labels=self.labels)
Categorical Features: Target Rate Grouping¶
For categorical values, cat_cater_impl() assigns each original category to a bin based on target rate similarity:
- Large categories first: Categories with weight > 1/n of total get their own bin
- Small categories grouped: Remaining categories are sorted by target rate and grouped by cumulative-weight quantiles
# src/pilz/service/train.py:215-276
def cat_cater_impl(self, feat: Feature, train_df: TrainDataframes, n: int) -> CategorizedFeature:
    """Map each original category of a categorical feature to a bin label.

    Phase 1 peels off the heaviest categories one at a time and gives each
    its own bin. Phase 2 sorts the remaining categories by target rate and
    bins them by cumulative weight.

    NOTE(review): ``mapping`` and ``offset`` are used below but not defined in
    this excerpt — presumably initialised in lines elided from the original
    source; confirm.

    Args:
        feat: Feature whose ``name`` column holds the original categories.
        train_df: Source of ``df_group`` and of ``create_categorized_features``.
        n: Requested number of bins.

    Returns:
        Whatever ``train_df.create_categorized_features`` returns for the
        computed ``CatCats``.
    """
    # Phase 1: large categories get individual bins
    # Total weight and target weight per original category.
    df = train_df.df_group.group_by(feat.name).agg(
        pl.col("weight").sum(),
        pl.col("target_weight").sum(),
    )
    for i in range(n):
        # NOTE(review): this tests the SUM of the remaining weight, whereas
        # the surrounding text describes a per-category threshold — confirm
        # against the source whether a max() was intended.
        if df["weight"].sum() < 1.0 / (n - i):
            break
        # Heaviest remaining category; if several tie, only the first row is used.
        max_row = df.filter(pl.col("weight") == pl.col("weight").max())
        mapping[max_row[feat.name][0]] = str(i)
        # Remove it so the next iteration considers the rest.
        df = df.filter(pl.col(feat.name) != max_row[feat.name][0])
    # Phase 2: small categories grouped by target rate quantiles
    df = df.with_columns(
        pl.col("target_weight").truediv(pl.col("weight")).alias("target_rate")
    ).sort("target_rate", descending=False)
    df = df.with_columns(pl.col("weight").cum_sum().alias("cum_weight"))
    # Labels left over after phase 1.
    labels = [str(i) for i in range(offset, n)]
    m = len(labels)
    # m - 1 evenly spaced break points in (0, 1) give m bins.
    # NOTE(review): these breaks are on an absolute [0, 1] scale while
    # cum_weight only covers the weight left after phase 1 — confirm whether
    # the source renormalises.
    breaks = [i / m for i in range(1, m)]
    df = df.with_columns(
        pl.col("cum_weight")
        .cut(breaks=breaks, labels=labels, left_closed=True)
        .alias("bin")
    )
    # Record the bin of every remaining category alongside the phase 1 entries.
    for row in df.iter_rows(named=True):
        mapping[row[feat.name]] = row["bin"]
    return train_df.create_categorized_features(
        feat=feat,
        cuts=CatCats(mapping=mapping, feat_name=feat.name),
    )
The mapping is stored in the CatCats class, which applies it using replace_strict:
# src/pilz/model/dataframes.py:55-107
class CatCats(BaseModel):
    """Stored category-to-bin mapping for one categorical feature."""

    # Original category value -> bin label.
    mapping: dict[str | int, str]
    # Name of the feature this mapping belongs to.
    feat_name: str
    # Label passed as the replace_strict default, i.e. used for values absent from the mapping.
    default: str = "-1"

    def cut(self, df: pl.Series) -> pl.Series:
        """Replace each value of ``df`` with its bin label, or ``default`` if unmapped."""
        return df.replace_strict(self.mapping, default=self.default)
The Code Path¶
The entry point is cater(), which iterates over all features in the datacard:
# src/pilz/service/train.py:192-202
def cater(self, train_df: TrainDataframes):
    """Categorize every feature of the datacard and keep the useful ones.

    Each feature in ``self.dc.train_features`` is binned via ``feat_cater``
    with ``self.settings.n_cat`` categories. A result is appended to
    ``train_df.train_features`` unless its ``is_diff_to_low()`` returns True.

    Args:
        train_df: Training dataframes; its ``train_features`` list is
            extended in place.
    """
    for feat in self.dc.train_features:
        categorized_feature = self.feat_cater(
            feat=feat, train_df=train_df, n=self.settings.n_cat
        )
        # Drop features whose binning is flagged as too weakly differentiating.
        if not categorized_feature.is_diff_to_low():
            train_df.train_features.append(categorized_feature)
Each categorized feature is scored via calc_diff():
# src/pilz/model/dataframes.py:153-167
def calc_diff(self) -> float:
    """Score the feature from the ``diff`` column of ``self.diff_df``.

    Returns:
        The larger of (a) the sum of all ``diff`` values above
        ``self.neutral_faktor`` and (b) the absolute sum of all ``diff``
        values below ``-self.neutral_faktor``.
    """
    # presumably "diff" is target proportion minus non-target proportion — confirm.
    target_diff = self.diff_df.filter(
        pl.col("diff") > self.neutral_faktor
    )["diff"].sum()
    # Values inside [-neutral_faktor, neutral_faktor] contribute to neither sum.
    non_target_diff = abs(
        self.diff_df.filter(pl.col("diff") < -self.neutral_faktor)["diff"].sum()
    )
    return max(target_diff, non_target_diff)
Features that don't differentiate well enough are filtered out by is_diff_to_low():
# src/pilz/model/dataframes.py:326-331
def is_diff_to_low(self, threshold: float = 0.90) -> bool:
    """Report whether the dominant bin holds more than ``threshold`` on both sides.

    Args:
        threshold: Proportion that the smaller of ``proportion`` and
            ``proportion_right`` must exceed for the feature to be flagged.

    Returns:
        True when, in the row with the largest ``max_proportion``, both
        ``proportion`` and ``proportion_right`` are above ``threshold``.
    """
    # Largest value of max_proportion across all bins.
    max_wert = self.diff_df["max_proportion"].max()
    # In that bin, take the smaller of the two proportions; if several rows
    # tie for the maximum, only the first is inspected.
    # presumably proportion / proportion_right are the target / non-target
    # sides of a join — confirm against set_diff_df.
    min_prop_of_max = self.diff_df.filter(
        pl.col("max_proportion") == max_wert
    )["proportion", "proportion_right"].min_horizontal()[0]
    return min_prop_of_max > threshold
Building Correlation Tables¶
After binning, Pilz builds correlation tables for feature combinations using the binned features:
# src/pilz/model/dataframes.py:334-362
class CombinedCategorizedFeature(CategorizedFeatureMixin):
    """Joint distribution of several binned features for target and non-target rows."""

    def __init__(self, train_features, non_target_size, target_size, neutral_faktor):
        """Build per-combination proportions and hand them to ``set_diff_df``.

        Args:
            train_features: Binned features; each provides ``feature.name``,
                ``non_target_sr`` and ``target_sr``.
            non_target_size: Divisor for the non-target combination counts.
            target_size: Divisor for the target combination counts.
            neutral_faktor: Not used in the lines shown here.
        """
        # One grouping column per participating feature.
        # assumes each *_sr series is named after its feature — TODO confirm.
        group_by = [train.feature.name for train in train_features]
        non_target_df = pl.DataFrame([train.non_target_sr for train in train_features])
        # Count rows per bin combination, then overwrite the count with
        # count / non_target_size.
        df_count_non_target = (
            non_target_df.group_by(group_by)
            .len(name="proportion")
            .with_columns((pl.col("proportion") / non_target_size))
        )
        target_df = pl.DataFrame([train.target_sr for train in train_features])
        # Same computation for the target rows.
        df_count_target = (
            target_df.group_by(group_by)
            .len(name="proportion")
            .with_columns((pl.col("proportion") / target_size))
        )
        self.set_diff_df(
            df_count_target=df_count_target,
            df_count_non_target=df_count_non_target,
            join_on=group_by,
        )
Configuration¶
The n_cat parameter controls the granularity of binning:
# src/pilz/model/settings.py:19
# Passed by cater() as the n argument of feat_cater() (self.settings.n_cat).
n_cat: int = Field(
    description="Number of categories in one feature",
    default=3,
)
Summary¶
| Feature Type | Binning Method | Key Function |
|---|---|---|
| Numerical | Quantile binning with cumulative-weight interpolation | cont_cater_impl() at train.py:278 |
| Categorical | Target-rate sorting + cumulative-weight quantiles | cat_cater_impl() at train.py:215 |
| Scoring | Computes diff between target/non-target proportions | calc_diff() at dataframes.py:153 |
| Filtering | Discards features with insufficient differentiation | is_diff_to_low() at dataframes.py:326 |
Next Steps¶
- Multi-Dimensional Splits — How these binned features are combined
- Three-Way Splits — How splits use the correlation tables
- How Pilz Works — Overview of the full algorithm