Skip to content

Feature Categorization

Feature categorization is the first step in Pilz's algorithm. Each feature is binned into up to n_cat categories (fewer when quantile boundaries coincide) to enable multi-dimensional correlation analysis.

Why Categorization?

Before we can build correlation tables for feature combinations, we need discrete bins:

flowchart TB subgraph Raw_Data R[Continuous values: 1.2, 3.5, 7.8, 12.4, ...] end subgraph Binning B[Bin into n_cat categories] end subgraph Correlation_Ready C[Category 0, 1, 2, ...] end R --> B B --> C style B fill:#e0f0ff

Two Types of Binning

The feat_cater() dispatcher routes each feature to the appropriate binning method based on its type:

# src/pilz/service/train.py:204-213
def feat_cater(self, feat: Feature, train_df: TrainDataframes, n: int) -> CategorizedFeature:
    match feat.statistical:
        case FeatureType.CATEGORIAL:
            return self.cat_cater_impl(feat=feat, train_df=train_df, n=n)
        case FeatureType.NUMERICAL:
            return self.cont_cater_impl(feat=feat, train_df=train_df, n=n)

Numerical Features: Quantile Binning

For continuous values, cont_cater_impl() creates equal-weight bins by interpolating quantile boundaries from cumulative weights:

# src/pilz/service/train.py:278-310
def cont_cater_impl(self, feat: Feature, train_df: TrainDataframes, n: int) -> CategorizedFeature:
    """Bin a numerical feature at weighted quantile boundaries.

    The rows of ``train_df.df_group`` are sorted by the feature value and
    their ``weight`` column is accumulated. For every quantile ``i / n``
    (``i = 1 .. n - 1``) a cut value is interpolated linearly between the two
    neighbouring feature values. Cut values that were already collected are
    skipped, so the result can contain fewer than ``n`` bins.

    Args:
        feat: Feature whose ``name`` column is binned.
        train_df: Provides ``df_group`` and ``create_categorized_features``.
        n: Requested number of bins.

    Returns:
        Whatever ``train_df.create_categorized_features`` returns for the
        ``ContCats`` built from the collected cuts and labels.
    """
    df_sorted = train_df.df_group[[feat.name, "weight"]].sort(feat.name)
    # NOTE(review): the comparison against i / n below presumably relies on
    # the weights summing to 1 -- confirm against how df_group is built.
    cumulative_weights = df_sorted["weight"].cum_sum()

    res = []  # distinct cut values, in ascending quantile order
    label = ["0"]  # one label per bin, i.e. always len(res) + 1 entries
    idx = 0  # label of the most recently opened bin
    for i in range(1, n):
        quantile = i / n
        # First row at which the accumulated weight reaches the quantile.
        # Indexing [0] raises if no row reaches it.
        index = (cumulative_weights >= quantile).arg_true()[0]
        if index > 0:
            # Interpolate between the previous row and the row at `index`,
            # proportionally to where the quantile falls in between their
            # cumulative weights.
            weight_below = cumulative_weights[index - 1]
            value_below = df_sorted[feat.name][index - 1]
            value_at_index = df_sorted[feat.name][index]
            fraction = (quantile - weight_below) / (
                cumulative_weights[index] - weight_below
            )
            value = value_below + fraction * (value_at_index - value_below)
        else:
            # The very first row already covers the quantile; there is no
            # previous row to interpolate from.
            value = df_sorted[feat.name][0]
        if value in res:
            # A repeated boundary would create an empty bin; skip it.
            continue
        idx += 1
        label.append(f"{idx}")
        res.append(value)

    return train_df.create_categorized_features(
        feat=feat,
        cuts=ContCats(cuts=res, labels=label, feat_name=feat.name),
    )
flowchart TB subgraph Input I[Values: 1, 2, 3, 5, 8, 13, 21, 34, 55, 89] end subgraph Sorted S[1, 2, 3, 5, 8, 13, 21, 34, 55, 89] end subgraph Cumulative C["Cumulative weight (equal weights): 0.1, 0.2, 0.3, ..., 1.0"] end subgraph Bins_n_cat_2 M1["Cut at quantile 0.5: 8"] B1["Bin 0: ≤ 8 (5 values)"] B2["Bin 1: > 8 (5 values)"] end I --> S S --> C C --> M1 M1 --> B1 M1 --> B2

The quantile boundaries are stored in the ContCats class, which uses pl.Series.cut() to apply them to new data:

# src/pilz/model/dataframes.py:110-144
class ContCats(BaseModel):
    """Cut points and bin labels for one binned numerical feature."""

    # Boundaries handed to the Polars ``cut`` call.
    cuts: list[float]
    # Labels assigned to the resulting bins.
    labels: list[str]
    # Name of the feature these cuts were built for.
    feat_name: str

    def cut(self, df: pl.Series) -> pl.Series:
        """Return ``df`` binned at ``self.cuts`` and labelled with ``self.labels``."""
        binned = df.cut(breaks=self.cuts, labels=self.labels)
        return binned

Categorical Features: Target Rate Grouping

For categorical values, cat_cater_impl() assigns each original category to a bin based on target rate similarity:

  1. Large categories first: Categories with weight > 1/n of total get their own bin
  2. Small categories grouped: Remaining categories are sorted by target rate and grouped by cumulative-weight quantiles
# src/pilz/service/train.py:215-276
def cat_cater_impl(self, feat: Feature, train_df: TrainDataframes, n: int) -> CategorizedFeature:
    """Map every original category of a categorical feature to a bin label.

    Phase 1 hands individual bins to the heaviest categories; phase 2 sorts
    the remaining categories by target rate and groups them by cumulative
    weight.

    NOTE(review): ``mapping`` and ``offset`` are read below but never assigned
    in the lines shown here -- presumably they are initialised in code that is
    not part of this excerpt; confirm before relying on this block.

    Args:
        feat: Feature whose ``name`` column holds the original categories.
        train_df: Provides ``df_group`` and ``create_categorized_features``.
        n: Requested number of bins.

    Returns:
        Whatever ``train_df.create_categorized_features`` returns for the
        ``CatCats`` built from the collected mapping.
    """
    # Phase 1: large categories get individual bins
    # Total weight and total target weight per original category.
    df = train_df.df_group.group_by(feat.name).agg(
        pl.col("weight").sum(),
        pl.col("target_weight").sum(),
    )
    for i in range(n):
        # NOTE(review): this guard compares the summed weight of all
        # remaining categories -- not the weight of the heaviest one --
        # against 1 / (n - i); confirm that this is the intended rule.
        if df["weight"].sum() < 1.0 / (n - i):
            break
        # Rows carrying the maximum weight; on ties the first one is used.
        max_row = df.filter(pl.col("weight") == pl.col("weight").max())
        mapping[max_row[feat.name][0]] = str(i)
        # Remove the category that was just assigned from the candidates.
        df = df.filter(pl.col(feat.name) != max_row[feat.name][0])

    # Phase 2: small categories grouped by target rate quantiles
    df = df.with_columns(
        pl.col("target_weight").truediv(pl.col("weight")).alias("target_rate")
    ).sort("target_rate", descending=False)

    # Running weight in ascending target-rate order.
    df = df.with_columns(pl.col("weight").cum_sum().alias("cum_weight"))
    # Labels left over for the grouped bins, and m - 1 evenly spaced breaks
    # at i / m to separate them.
    labels = [str(i) for i in range(offset, n)]
    m = len(labels)
    breaks = [i / m for i in range(1, m)]

    df = df.with_columns(
        pl.col("cum_weight")
        .cut(breaks=breaks, labels=labels, left_closed=True)
        .alias("bin")
    )
    # Record the bin label of every remaining category.
    for row in df.iter_rows(named=True):
        mapping[row[feat.name]] = row["bin"]

    return train_df.create_categorized_features(
        feat=feat,
        cuts=CatCats(mapping=mapping, feat_name=feat.name),
    )
flowchart TB subgraph "Step 1: Calculate Target Rate" T1[Category: admin, 100 samples, 80 target] --> TR1[80%] T2[Category: technician, 50 samples, 25 target] --> TR2[50%] T3[Category: blue-collar, 100 samples, 10 target] --> TR3[10%] end subgraph "Step 2: Sort by Rate" S1[Sort: 10%, 50%, 80%] end subgraph "Step 3: Create n_cat Bins" B1["Bin 2: admin (80%)"] B2["Bin 1: technician (50%)"] B3["Bin 0: blue-collar (10%)"] end TR1 --> S1 TR2 --> S1 TR3 --> S1 S1 --> B1 S1 --> B2 S1 --> B3 style S1 fill:#e0f0ff

The mapping is stored in the CatCats class, which applies it using replace_strict:

# src/pilz/model/dataframes.py:55-107
class CatCats(BaseModel):
    """Lookup table from original category values to bin labels."""

    # Original category value -> bin label.
    mapping: dict[str | int, str]
    # Name of the feature this mapping was built for.
    feat_name: str
    # Label used for values that have no entry in ``mapping``.
    default: str = "-1"

    def cut(self, df: pl.Series) -> pl.Series:
        """Return ``df`` with each value replaced by its bin label."""
        lookup = self.mapping
        fallback = self.default
        return df.replace_strict(lookup, default=fallback)

The Code Path

The entry point is cater(), which iterates over all features in the datacard:

# src/pilz/service/train.py:192-202
def cater(self, train_df: TrainDataframes):
    """Bin every feature of the datacard and keep the ones that pass the filter.

    Each feature in ``self.dc.train_features`` is binned through
    ``self.feat_cater`` with ``self.settings.n_cat`` categories. A binned
    feature is appended to ``train_df.train_features`` unless its
    ``is_diff_to_low()`` check reports true.
    """
    for feature in self.dc.train_features:
        binned = self.feat_cater(
            feat=feature,
            train_df=train_df,
            n=self.settings.n_cat,
        )
        if binned.is_diff_to_low():
            # Rejected by the differentiation check; do not keep it.
            continue
        train_df.train_features.append(binned)

Each categorized feature is scored via calc_diff():

# src/pilz/model/dataframes.py:153-167
def calc_diff(self) -> float:
    """Return the larger of the summed positive and summed negative diffs.

    Only rows of ``self.diff_df`` whose ``diff`` lies outside the band
    ``[-neutral_faktor, neutral_faktor]`` contribute; the negative side is
    compared by its absolute value.
    """
    diffs = self.diff_df
    cutoff = self.neutral_faktor

    above = diffs.filter(pl.col("diff") > cutoff)
    target_total = above["diff"].sum()

    below = diffs.filter(pl.col("diff") < -cutoff)
    non_target_total = abs(below["diff"].sum())

    return max(target_total, non_target_total)

Features that don't differentiate well enough are filtered out by is_diff_to_low():

# src/pilz/model/dataframes.py:326-331
def is_diff_to_low(self, threshold: float = 0.90) -> bool:
    """Tell whether the feature differentiates too little to be kept.

    Looks at the rows of ``self.diff_df`` with the largest ``max_proportion``
    and takes, for the first of them, the smaller of ``proportion`` and
    ``proportion_right``.

    Args:
        threshold: Limit that the smaller proportion must exceed.

    Returns:
        True when that smaller proportion is greater than ``threshold``.
    """
    table = self.diff_df
    peak = table["max_proportion"].max()
    peak_rows = table.filter(pl.col("max_proportion") == peak)
    smaller_side = peak_rows["proportion", "proportion_right"].min_horizontal()
    return smaller_side[0] > threshold

Building Correlation Tables

After binning, Pilz builds correlation tables for feature combinations using the binned features:

# src/pilz/model/dataframes.py:334-362
class CombinedCategorizedFeature(CategorizedFeatureMixin):
    """Correlation table over the bin combinations of several binned features."""

    def __init__(self, train_features, non_target_size, target_size, neutral_faktor):
        """Count bin combinations for both groups and hand them to ``set_diff_df``.

        Args:
            train_features: Binned features; each supplies ``feature.name``,
                ``non_target_sr`` and ``target_sr``.
            non_target_size: Divisor applied to the non-target counts.
            target_size: Divisor applied to the target counts.
            neutral_faktor: Not used in the lines shown here.
        """
        key_columns = [train.feature.name for train in train_features]

        def shares(series_list, total):
            # Rows per bin combination, scaled by the size of the group.
            counts = (
                pl.DataFrame(series_list)
                .group_by(key_columns)
                .len(name="proportion")
            )
            return counts.with_columns(pl.col("proportion") / total)

        non_target_shares = shares(
            [train.non_target_sr for train in train_features],
            non_target_size,
        )
        target_shares = shares(
            [train.target_sr for train in train_features],
            target_size,
        )

        self.set_diff_df(
            df_count_target=target_shares,
            df_count_non_target=non_target_shares,
            join_on=key_columns,
        )
flowchart TB subgraph Binned_Features F1[X: Bin 0, Bin 1] F2[Y: Bin 0, Bin 1] end subgraph Correlation_Table T1["X=0, Y=0: T=15, NT=85"] T2["X=0, Y=1: T=45, NT=55"] T3["X=1, Y=0: T=60, NT=40"] T4["X=1, Y=1: T=80, NT=20"] end F1 --> T1 & T2 F2 --> T1 & T3 style Correlation_Table fill:#ccffcc

Configuration

The n_cat parameter controls the granularity of binning:

# src/pilz/model/settings.py:19
# Number of bins requested per feature; defaults to 3.
n_cat: int = Field(default=3, description="Number of categories in one feature")

Summary

| Feature Type | Binning Method | Key Function |
| --- | --- | --- |
| Numerical | Quantile binning with cumulative-weight interpolation | cont_cater_impl() at train.py:278 |
| Categorical | Target-rate sorting + cumulative-weight quantiles | cat_cater_impl() at train.py:215 |
| Scoring | Computes diff between target/non-target proportions | calc_diff() at dataframes.py:153 |
| Filtering | Discards features with insufficient differentiation | is_diff_to_low() at dataframes.py:326 |

Next Steps