Skip to content

from cluster_experiments.cupac import *

CupacHandler

CupacHandler class. It handles operations related to the cupac model.

Its main goal is to call the add_covariates method, which adds the output of the cupac model to the experiment data; that output should then be used as a covariate in the regression method for the hypothesis test.

Source code in cluster_experiments/cupac.py
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
class CupacHandler:
    """
    CupacHandler class. It handles operations related to the cupac model.

    Its main goal is to call the add_covariates method, which adds the output of the cupac model
    to the experiment data; that output should be used as a covariate in the regression method
    for the hypothesis test.

    Args:
        cupac_model: Estimator fitted on pre-experiment data to predict the target. When None,
            an EmptyRegressor is used and no covariate is added.
        target_col: Name of the outcome column; it is dropped from both dataframes before
            fitting and predicting.
        features_cupac_model: Columns passed to the cupac model. When empty or None, every
            column except target_col is passed.
        cache_fit: When True, the model is only fitted if check_is_fitted reports it as not
            fitted. When False, it is re-fitted on every add_covariates call.
    """

    def __init__(
        self,
        cupac_model: Optional[BaseEstimator] = None,
        target_col: str = "target",
        features_cupac_model: Optional[List[str]] = None,
        cache_fit: bool = True,
    ):
        # Compare against None explicitly instead of relying on truthiness: an estimator
        # that defines __len__ (e.g. an ensemble or a pipeline) can be falsy, or can raise
        # while unfitted, when evaluated in a boolean context.
        self.cupac_model: BaseEstimator = (
            EmptyRegressor() if cupac_model is None else cupac_model
        )
        self.target_col = target_col
        self.cupac_outcome_name = f"estimate_{target_col}"
        self.features_cupac_model: List[str] = features_cupac_model or []
        self.is_cupac = not isinstance(self.cupac_model, EmptyRegressor)
        self.cache_fit = cache_fit

    def _prep_data_cupac(
        self, df: pd.DataFrame, pre_experiment_df: pd.DataFrame
    ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series]:
        """Prepares data for training and prediction.

        Args:
            df: Experiment dataframe, including the target column.
            pre_experiment_df: Pre-experiment dataframe, including the target column.

        Returns:
            Tuple of (features to predict on, pre-experiment features, pre-experiment target).
        """
        # drop() returns new dataframes, so the inputs are left untouched without
        # having to copy them in full first.
        df_predict = df.drop(columns=[self.target_col])
        # Split data into X and y
        pre_experiment_x = pre_experiment_df.drop(columns=[self.target_col])
        # Copy the target so the model never receives a view of the caller's dataframe.
        pre_experiment_y = pre_experiment_df[self.target_col].copy()

        # Keep only cupac features
        if self.features_cupac_model:
            pre_experiment_x = pre_experiment_x[self.features_cupac_model]
            df_predict = df_predict[self.features_cupac_model]

        return df_predict, pre_experiment_x, pre_experiment_y

    def add_covariates(
        self, df: pd.DataFrame, pre_experiment_df: Optional[pd.DataFrame] = None
    ) -> pd.DataFrame:
        """
        Train model to predict outcome variable (based on pre-experiment data)
        and add the prediction to the experiment dataframe. Only do this if
        we use cupac.

        Args:
            df: Dataframe with outcome and treatment variables.
            pre_experiment_df: Dataframe with pre-experiment data.

        Returns:
            ``df`` itself when no covariate is needed, otherwise a copy of ``df``
            with the prediction stored in the ``estimate_<target_col>`` column.

        Raises:
            ValueError: If pre_experiment_df is missing while cupac is used, or
                provided while cupac is not used.
        """
        self.check_cupac_inputs(pre_experiment_df)

        # Early return if no need to add covariates
        if not self.need_covariates(pre_experiment_df):
            return df

        df_predict, pre_experiment_x, pre_experiment_y = self._prep_data_cupac(
            df=df, pre_experiment_df=pre_experiment_df
        )

        # Fit model if it has not been fitted before
        self._fit_cupac_model(pre_experiment_x, pre_experiment_y)

        # Predict
        estimated_target = self._predict_cupac_model(df_predict)

        # Add the prediction to a copy, so the caller's dataframe is not modified
        df = df.copy()
        df[self.cupac_outcome_name] = estimated_target
        return df

    def _fit_cupac_model(
        self, pre_experiment_x: pd.DataFrame, pre_experiment_y: pd.Series
    ):
        """Fits the cupac model.
        Caches the fitted model in the object, so we only fit it once.
        We can disable this by setting cache_fit to False.
        """
        if not self.cache_fit:
            self.cupac_model.fit(pre_experiment_x, pre_experiment_y)
            return

        # NOTE(review): caching relies on check_is_fitted recognising the model as
        # fitted; presumably custom estimators must follow sklearn's fitted-state
        # conventions for this to skip the fit - confirm for non-sklearn models.
        try:
            check_is_fitted(self.cupac_model)
        except NotFittedError:
            self.cupac_model.fit(pre_experiment_x, pre_experiment_y)

    def _predict_cupac_model(self, df_predict: pd.DataFrame) -> ArrayLike:
        """Predicts with the cupac model.

        When the model exposes predict_proba, column 1 of its output is returned;
        otherwise the output of predict is returned.

        Raises:
            ValueError: If the model has neither predict_proba nor predict.
        """
        if hasattr(self.cupac_model, "predict_proba"):
            return self.cupac_model.predict_proba(df_predict)[:, 1]
        if hasattr(self.cupac_model, "predict"):
            return self.cupac_model.predict(df_predict)
        raise ValueError("cupac_model should have predict or predict_proba method.")

    def need_covariates(self, pre_experiment_df: Optional[pd.DataFrame] = None) -> bool:
        """Returns True when pre-experiment data is given and a cupac model is in use."""
        return pre_experiment_df is not None and self.is_cupac

    def check_cupac_inputs(self, pre_experiment_df: Optional[pd.DataFrame] = None):
        """Validates that pre_experiment_df is provided if and only if cupac is used.

        Raises:
            ValueError: If exactly one of "cupac is used" and "pre_experiment_df is
                provided" holds.
        """
        if self.is_cupac and pre_experiment_df is None:
            raise ValueError("If cupac is used, pre_experiment_df should be provided.")

        if not self.is_cupac and pre_experiment_df is not None:
            raise ValueError(
                "If cupac is not used, pre_experiment_df should not be provided - "
                "remove pre_experiment_df argument or set cupac_model to not None."
            )

add_covariates(df, pre_experiment_df=None)

Train a model to predict the outcome variable (based on pre-experiment data) and add the prediction to the experiment dataframe. This is only done if cupac is used.

Args:
- df: Dataframe with outcome and treatment variables.
- pre_experiment_df: Dataframe with pre-experiment data.

Source code in cluster_experiments/cupac.py
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
def add_covariates(
    self, df: pd.DataFrame, pre_experiment_df: Optional[pd.DataFrame] = None
) -> pd.DataFrame:
    """
    Train model to predict outcome variable (based on pre-experiment data)
    and add the prediction to the experiment dataframe. Only do this if
    we use cupac.

    Args:
        df: Dataframe with outcome and treatment variables.
        pre_experiment_df: Dataframe with pre-experiment data.

    Returns:
        ``df`` itself when no covariate is needed, otherwise a copy of ``df``
        with the prediction stored in the column named by cupac_outcome_name.
    """
    self.check_cupac_inputs(pre_experiment_df)

    # Early return if no need to add covariates
    if not self.need_covariates(pre_experiment_df):
        return df

    # The inputs are handed over as they are: they are only read here, so
    # copying both dataframes up front is unnecessary work.
    df_predict, pre_experiment_x, pre_experiment_y = self._prep_data_cupac(
        df=df, pre_experiment_df=pre_experiment_df
    )

    # Fit model if it has not been fitted before
    self._fit_cupac_model(pre_experiment_x, pre_experiment_y)

    # Predict
    estimated_target = self._predict_cupac_model(df_predict)

    # Add the prediction to a copy, so the caller's dataframe is not modified
    df = df.copy()
    df[self.cupac_outcome_name] = estimated_target
    return df

EmptyRegressor

Bases: BaseEstimator

Empty regressor class. It does not do anything, used to glue the code of other estimators and PowerAnalysis

Each Regressor should have:
- fit method: Uses pre-experiment data to fit some kind of model to be used as a covariate and reduce variance.
- predict method: Uses the fitted model to add the covariate to the experiment data.

It can add aggregates of the target in older data as a covariate, or a model (cupac) to predict the target.

Source code in cluster_experiments/cupac.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
class EmptyRegressor(BaseEstimator):
    """
    Placeholder regressor that does nothing. It is used to glue the code of other
    estimators and PowerAnalysis together.

    Each Regressor should have:
    - fit method: Uses pre-experiment data to fit some kind of model to be used as a
      covariate and reduce variance.
    - predict method: Uses the fitted model to add the covariate to the experiment data.

    A Regressor can add aggregates of the target in older data as a covariate, or a
    model (cupac) to predict the target.
    """

    @classmethod
    def from_config(cls, config):
        """Return a new instance; ``config`` is accepted but not read."""
        empty_regressor = cls()
        return empty_regressor

TargetAggregation

Bases: BaseEstimator

Adds average of target using pre-experiment data

Parameters:

Name Type Description Default
agg_col str

Column to group by to aggregate target

required
target_col str

Column to aggregate

'target'
smoothing_factor int

Smoothing factor for the smoothed mean

20

Usage:

import pandas as pd
from cluster_experiments.cupac import TargetAggregation

# Pre-experiment data: the grouping column plus the target to aggregate.
df = pd.DataFrame({"agg_col": ["a", "a", "b", "b", "c", "c"], "target_col": [1, 2, 3, 4, 5, 6]})
# The data to predict on only needs the grouping column.
new_df = pd.DataFrame({"agg_col": ["a", "a", "b", "b", "c", "c"]})
# smoothing_factor is not passed, so its default is used.
target_agg = TargetAggregation("agg_col", "target_col")
# fit takes the features and the target separately.
target_agg.fit(df.drop(columns="target_col"), df["target_col"])
# predict returns one value per row of new_df.
df_with_target_agg = target_agg.predict(new_df)
print(df_with_target_agg)

Source code in cluster_experiments/cupac.py
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
class TargetAggregation(BaseEstimator):
    """
    Adds average of target using pre-experiment data

    Args:
        agg_col: Column to group by to aggregate target
        target_col: Column to aggregate
        smoothing_factor: Smoothing factor for the smoothed mean
    Usage:
    ```python
    import pandas as pd
    from cluster_experiments.cupac import TargetAggregation

    df = pd.DataFrame({"agg_col": ["a", "a", "b", "b", "c", "c"], "target_col": [1, 2, 3, 4, 5, 6]})
    new_df = pd.DataFrame({"agg_col": ["a", "a", "b", "b", "c", "c"]})
    target_agg = TargetAggregation("agg_col", "target_col")
    target_agg.fit(df.drop(columns="target_col"), df["target_col"])
    df_with_target_agg = target_agg.predict(new_df)
    print(df_with_target_agg)
    ```
    """

    def __init__(
        self,
        agg_col: str,
        target_col: str = "target",
        smoothing_factor: int = 20,
    ):
        self.agg_col = agg_col
        self.target_col = target_col
        self.smoothing_factor = smoothing_factor
        self.is_empty = False
        # Names of the columns that fit() stores in pre_experiment_agg_df
        self.mean_target_col = f"{self.target_col}_mean"
        self.smooth_mean_target_col = f"{self.target_col}_smooth_mean"
        # Empty until fit() is called
        self.pre_experiment_agg_df = pd.DataFrame()

    def _get_pre_experiment_mean(self, pre_experiment_df: pd.DataFrame) -> float:
        """Returns the mean of the target column over all pre-experiment rows."""
        return pre_experiment_df[self.target_col].mean()

    def fit(self, X: pd.DataFrame, y: pd.Series) -> "TargetAggregation":
        """Fits "target encoder" model to pre-experiment data.

        Stores the overall target mean in pre_experiment_mean and, per value of
        agg_col, the plain mean and the smoothed mean
        (sum + smoothing_factor * overall mean) / (count + smoothing_factor)
        in pre_experiment_agg_df.

        Args:
            X: Pre-experiment features; must allow grouping by agg_col.
            y: Pre-experiment target values.

        Returns:
            The fitted instance.
        """
        pre_experiment_df = X.copy()
        pre_experiment_df[self.target_col] = y

        self.pre_experiment_mean = self._get_pre_experiment_mean(pre_experiment_df)
        self.pre_experiment_agg_df = (
            # "count" is a helper column used to count rows per group; an input
            # column with that name is overwritten in this working copy.
            pre_experiment_df.assign(count=1)
            .groupby(self.agg_col, as_index=False)
            .agg({self.target_col: "sum", "count": "sum"})
            .assign(
                **{
                    self.mean_target_col: lambda x: x[self.target_col] / x["count"],
                    self.smooth_mean_target_col: lambda x: (
                        x[self.target_col]
                        + self.smoothing_factor * self.pre_experiment_mean
                    )
                    / (x["count"] + self.smoothing_factor),
                }
            )
            .drop(columns=["count", self.target_col])
        )
        return self

    def predict(self, X: pd.DataFrame) -> ArrayLike:
        """Adds average target of pre-experiment data to experiment data.

        Args:
            X: Experiment data; rows are matched to the fitted groups on agg_col.

        Returns:
            One smoothed mean per row of X. Rows whose group was not seen in
            fit() get the overall pre-experiment mean.
        """
        # Only the key and the smoothed mean are needed from the fitted table.
        smooth_means = self.pre_experiment_agg_df[
            [self.agg_col, self.smooth_mean_target_col]
        ]
        # A column of X that is already named like the smoothed mean would be
        # suffixed by merge and break the lookup below, so leave it out.
        keys = X.drop(columns=[self.smooth_mean_target_col], errors="ignore")
        return (
            keys.merge(smooth_means, how="left", on=self.agg_col)[
                self.smooth_mean_target_col
            ]
            .fillna(self.pre_experiment_mean)
            .values
        )

    @classmethod
    def from_config(cls, config):
        """Creates TargetAggregation from PowerConfig"""
        return cls(
            agg_col=config.agg_col,
            target_col=config.target_col,
            smoothing_factor=config.smoothing_factor,
        )

fit(X, y)

Fits "target encoder" model to pre-experiment data

Source code in cluster_experiments/cupac.py
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
def fit(self, X: pd.DataFrame, y: pd.Series) -> "TargetAggregation":
    """Learn the per-group target averages from pre-experiment data.

    Stores the overall target mean in pre_experiment_mean and, for each value
    of agg_col, the plain mean and the smoothed mean in pre_experiment_agg_df.
    Returns the instance itself.
    """
    training = X.copy()
    training[self.target_col] = y

    self.pre_experiment_mean = self._get_pre_experiment_mean(training)

    # Per group: sum of the target and number of rows (via a helper column of ones)
    per_group = (
        training.assign(count=1)
        .groupby(self.agg_col, as_index=False)
        .agg({self.target_col: "sum", "count": "sum"})
    )
    group_sum = per_group[self.target_col]
    group_rows = per_group["count"]

    per_group[self.mean_target_col] = group_sum / group_rows
    # Smoothed mean: pull each group towards the overall mean
    smoothed_numerator = group_sum + self.smoothing_factor * self.pre_experiment_mean
    per_group[self.smooth_mean_target_col] = smoothed_numerator / (
        group_rows + self.smoothing_factor
    )

    self.pre_experiment_agg_df = per_group.drop(columns=["count", self.target_col])
    return self

from_config(config) classmethod

Creates TargetAggregation from PowerConfig

Source code in cluster_experiments/cupac.py
 98
 99
100
101
102
103
104
105
@classmethod
def from_config(cls, config):
    """Build a TargetAggregation from the matching fields of a PowerConfig."""
    init_kwargs = {
        "agg_col": config.agg_col,
        "target_col": config.target_col,
        "smoothing_factor": config.smoothing_factor,
    }
    return cls(**init_kwargs)

predict(X)

Adds average target of pre-experiment data to experiment data

Source code in cluster_experiments/cupac.py
88
89
90
91
92
93
94
95
96
def predict(self, X: pd.DataFrame) -> ArrayLike:
    """Adds average target of pre-experiment data to experiment data.

    Args:
        X: Experiment data; rows are matched to the fitted groups on agg_col.

    Returns:
        One smoothed mean per row of X. Rows whose group is missing from
        pre_experiment_agg_df get pre_experiment_mean.
    """
    # Only the key and the smoothed mean are needed from the fitted table.
    smooth_means = self.pre_experiment_agg_df[
        [self.agg_col, self.smooth_mean_target_col]
    ]
    # A column of X that is already named like the smoothed mean would be
    # suffixed by merge and break the lookup below, so leave it out.
    keys = X.drop(columns=[self.smooth_mean_target_col], errors="ignore")
    return (
        keys.merge(smooth_means, how="left", on=self.agg_col)[
            self.smooth_mean_target_col
        ]
        .fillna(self.pre_experiment_mean)
        .values
    )