`from cluster_experiments.cupac import *`¶

`CupacHandler` ¶

CupacHandler class. It handles operations related to the cupac model.

Its main goal is to call the add_covariates method, where it will add the output from the cupac model, and this should be used as covariates in the regression method for the hypothesis test.

Source code in cluster_experiments/cupac.py

class CupacHandler:
    """
    CupacHandler class. It handles operations related to the cupac model.

    Its main goal is to call the add_covariates method, where it will add the output from the cupac model,
    and this should be used as covariates in the regression method for the hypothesis test.

    Args:
        cupac_model: Estimator used to predict the target from pre-experiment data.
            When None, an EmptyRegressor is used and cupac is considered disabled.
        target_col: Name of the target column.
        scale_col: Optional column the target is divided by before fitting the model.
        features_cupac_model: Columns fed to the model. When empty, every column
            except target_col is used.
        cache_fit: When True, the model is only fitted if check_is_fitted reports it
            as not fitted. When False, it is re-fitted on every call.
    """

    def __init__(
        self,
        cupac_model: Optional[BaseEstimator] = None,
        target_col: str = "target",
        scale_col: Optional[str] = None,
        features_cupac_model: Optional[List[str]] = None,
        cache_fit: bool = True,
    ):
        # Compare against None explicitly instead of relying on truthiness:
        # an estimator that defines __len__ may evaluate as falsy (or raise)
        # before it is fitted, which would silently swap it for EmptyRegressor
        # or crash the constructor.
        self.cupac_model: BaseEstimator = (
            EmptyRegressor() if cupac_model is None else cupac_model
        )
        self.target_col = target_col
        # TODO: implement CUPAC with both target_col and scale_col,
        # right now it only supports target_col for delta method
        self.scale_col = scale_col
        # Name of the column that add_covariates appends to the experiment data
        self.cupac_outcome_name = f"estimate_{target_col}"
        self.features_cupac_model: List[str] = features_cupac_model or []
        # cupac is "on" whenever a real model (anything but EmptyRegressor) is set
        self.is_cupac = not isinstance(self.cupac_model, EmptyRegressor)
        self.cache_fit = cache_fit

        self.check_cupac_config()

    def get_pre_experiment_y(self, pre_experiment_df: pd.DataFrame) -> pd.Series:
        """Returns the pre-experiment target variable, scaled if scale_col is provided."""
        if self.scale_col is not None:
            return (
                pre_experiment_df[self.target_col] / pre_experiment_df[self.scale_col]
            )
        return pre_experiment_df[self.target_col]

    def _prep_data_cupac(
        self, df: pd.DataFrame, pre_experiment_df: pd.DataFrame
    ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series]:
        """Prepares data for training and prediction

        Args:
            df: Experiment dataframe; must contain target_col.
            pre_experiment_df: Pre-experiment dataframe; must contain target_col.

        Returns:
            Tuple of (features to predict on, training features, training target).
        """
        df = df.copy()
        pre_experiment_df = pre_experiment_df.copy()
        df_predict = df.drop(columns=[self.target_col])
        # Split data into X and y
        pre_experiment_x = pre_experiment_df.drop(columns=[self.target_col])
        pre_experiment_y = self.get_pre_experiment_y(pre_experiment_df)

        # Keep only cupac features
        if self.features_cupac_model:
            pre_experiment_x = pre_experiment_x[self.features_cupac_model]
            df_predict = df_predict[self.features_cupac_model]

        return df_predict, pre_experiment_x, pre_experiment_y

    def add_covariates(
        self, df: pd.DataFrame, pre_experiment_df: Optional[pd.DataFrame] = None
    ) -> pd.DataFrame:
        """
        Train model to predict outcome variable (based on pre-experiment data)
        and  add the prediction to the experiment dataframe. Only do this if
        we use cupac

        Args:
            pre_experiment_df: Dataframe with pre-experiment data.
            df: Dataframe with outcome and treatment variables.

        Returns:
            The input df unchanged when no covariate is needed, otherwise a copy
            of df with the model prediction stored in cupac_outcome_name.

        Raises:
            ValueError: If pre_experiment_df is inconsistent with the cupac
                configuration (see check_cupac_inputs).
        """
        self.check_cupac_inputs(pre_experiment_df)

        # Early return if no need to add covariates
        if not self.need_covariates(pre_experiment_df):
            return df

        df = df.copy()
        pre_experiment_df = pre_experiment_df.copy()
        df_predict, pre_experiment_x, pre_experiment_y = self._prep_data_cupac(
            df=df, pre_experiment_df=pre_experiment_df
        )

        # Fit model if it has not been fitted before
        self._fit_cupac_model(pre_experiment_x, pre_experiment_y)

        # Predict
        estimated_target = self._predict_cupac_model(df_predict)

        # Add cupac outcome name to df
        df[self.cupac_outcome_name] = estimated_target
        return df

    def _fit_cupac_model(
        self, pre_experiment_x: pd.DataFrame, pre_experiment_y: pd.Series
    ):
        """Fits the cupac model.
        Caches the fitted model in the object, so we only fit it once.
        We can disable this by setting cache_fit to False.
        """
        if not self.cache_fit:
            self.cupac_model.fit(pre_experiment_x, pre_experiment_y)
            return

        try:
            check_is_fitted(self.cupac_model)
        except NotFittedError:
            self.cupac_model.fit(pre_experiment_x, pre_experiment_y)

    def _predict_cupac_model(self, df_predict: pd.DataFrame) -> ArrayLike:
        """Predicts the cupac model

        predict_proba takes precedence over predict when the model has both;
        in that case the column at index 1 of its output is returned.

        Raises:
            ValueError: If the model has neither predict_proba nor predict.
        """
        if hasattr(self.cupac_model, "predict_proba"):
            return self.cupac_model.predict_proba(df_predict)[:, 1]
        if hasattr(self.cupac_model, "predict"):
            return self.cupac_model.predict(df_predict)
        raise ValueError("cupac_model should have predict or predict_proba method.")

    def need_covariates(self, pre_experiment_df: Optional[pd.DataFrame] = None) -> bool:
        """Returns True when pre-experiment data is given and a cupac model is set."""
        return pre_experiment_df is not None and self.is_cupac

    def check_cupac_inputs(self, pre_experiment_df: Optional[pd.DataFrame] = None):
        """Validates that pre_experiment_df is given if and only if cupac is used.

        Raises:
            ValueError: If cupac is used without pre_experiment_df, or
                pre_experiment_df is given while cupac is not used.
        """
        if self.is_cupac and pre_experiment_df is None:
            raise ValueError("If cupac is used, pre_experiment_df should be provided.")

        if not self.is_cupac and pre_experiment_df is not None:
            raise ValueError(
                "If cupac is not used, pre_experiment_df should not be provided - remove pre_experiment_df argument or set cupac_model to not None."
            )

    def check_cupac_config(self):
        """Validates that neither target_col nor scale_col is used as a model feature.

        Raises:
            ValueError: If cupac is used and target_col or scale_col is listed in
                features_cupac_model.
        """
        if self.is_cupac and self.target_col in self.features_cupac_model:
            raise ValueError(
                "If cupac is used, target_col should not be in features_cupac_model."
            )
        if self.is_cupac and self.scale_col in self.features_cupac_model:
            raise ValueError(
                "If cupac is used, scale_col should not be in features_cupac_model."
            )

`_fit_cupac_model(pre_experiment_x, pre_experiment_y)` ¶

Fits the cupac model. Caches the fitted model in the object, so we only fit it once. We can disable this by setting cache_fit to False.

Source code in cluster_experiments/cupac.py

def _fit_cupac_model(
    self, pre_experiment_x: pd.DataFrame, pre_experiment_y: pd.Series
):
    """Fit the cupac model on pre-experiment data, honouring ``cache_fit``.

    When ``cache_fit`` is False the model is re-fitted on every call.
    Otherwise the model is fitted only if ``check_is_fitted`` raises
    ``NotFittedError`` for it, so an already fitted model is reused.

    Args:
        pre_experiment_x: Training features.
        pre_experiment_y: Training target.
    """
    # Without caching we always fit; with caching we fit only when needed.
    must_fit = not self.cache_fit
    if not must_fit:
        try:
            check_is_fitted(self.cupac_model)
        except NotFittedError:
            must_fit = True

    if must_fit:
        self.cupac_model.fit(pre_experiment_x, pre_experiment_y)

`_predict_cupac_model(df_predict)` ¶

Predicts the cupac model

Source code in cluster_experiments/cupac.py

def _predict_cupac_model(self, df_predict: pd.DataFrame) -> ArrayLike:
    """Return the cupac model's prediction for ``df_predict``.

    ``predict_proba`` is preferred when the model has it; the column at
    index 1 of its output is returned (presumably the positive-class
    probability of a binary classifier - confirm for other model types).
    Otherwise ``predict`` is used.

    Raises:
        ValueError: If the model has neither ``predict_proba`` nor ``predict``.
    """
    model = self.cupac_model
    if hasattr(model, "predict_proba"):
        class_probabilities = model.predict_proba(df_predict)
        return class_probabilities[:, 1]
    if not hasattr(model, "predict"):
        raise ValueError("cupac_model should have predict or predict_proba method.")
    return model.predict(df_predict)

`_prep_data_cupac(df, pre_experiment_df)` ¶

Prepares data for training and prediction

Source code in cluster_experiments/cupac.py

def _prep_data_cupac(
    self, df: pd.DataFrame, pre_experiment_df: pd.DataFrame
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series]:
    """Build the prediction features, training features and training target.

    Both inputs are copied before use. The target column is removed from
    both feature frames; when ``features_cupac_model`` is non-empty, both
    feature frames are further restricted to those columns.

    Args:
        df: Experiment data; must contain ``target_col``.
        pre_experiment_df: Pre-experiment data; must contain ``target_col``.

    Returns:
        ``(df_predict, pre_experiment_x, pre_experiment_y)``.
    """
    target = self.target_col
    experiment = df.copy()
    history = pre_experiment_df.copy()

    # Features are everything but the target, for both frames
    features_to_score = experiment.drop(columns=[target])
    train_x = history.drop(columns=[target])
    train_y = self.get_pre_experiment_y(history)

    # An empty feature list means "use every remaining column"
    selected = self.features_cupac_model
    if selected:
        train_x = train_x[selected]
        features_to_score = features_to_score[selected]

    return features_to_score, train_x, train_y

`add_covariates(df, pre_experiment_df=None)` ¶

Train model to predict outcome variable (based on pre-experiment data) and add the prediction to the experiment dataframe. Only do this if we use cupac.

Args:

- `pre_experiment_df`: Dataframe with pre-experiment data.
- `df`: Dataframe with outcome and treatment variables.

Source code in cluster_experiments/cupac.py

def add_covariates(
    self, df: pd.DataFrame, pre_experiment_df: Optional[pd.DataFrame] = None
) -> pd.DataFrame:
    """
    Fit the cupac model on pre-experiment data and append its prediction
    to the experiment dataframe as a new column. This only happens when
    cupac is in use; otherwise the input dataframe is returned as-is.

    Args:
        pre_experiment_df: Dataframe with pre-experiment data.
        df: Dataframe with outcome and treatment variables.

    Returns:
        ``df`` itself when no covariate is needed, otherwise a copy of ``df``
        with the prediction stored under ``cupac_outcome_name``.
    """
    self.check_cupac_inputs(pre_experiment_df)

    if self.need_covariates(pre_experiment_df):
        # Work on copies so the caller's dataframes are never modified
        experiment = df.copy()
        features_to_score, train_x, train_y = self._prep_data_cupac(
            df=experiment, pre_experiment_df=pre_experiment_df.copy()
        )

        # Fitting is skipped by the helper when a cached fit can be reused
        self._fit_cupac_model(train_x, train_y)

        experiment[self.cupac_outcome_name] = self._predict_cupac_model(
            features_to_score
        )
        return experiment

    # Nothing to add: hand back the original object
    return df

`get_pre_experiment_y(pre_experiment_df)` ¶

Returns the pre-experiment target variable, scaled if scale_col is provided.

Source code in cluster_experiments/cupac.py

def get_pre_experiment_y(self, pre_experiment_df: pd.DataFrame) -> pd.Series:
    """Return the pre-experiment target, divided by ``scale_col`` when one is set."""
    target = pre_experiment_df[self.target_col]
    if self.scale_col is None:
        # No scale column configured: the raw target is the training target
        return target
    return target / pre_experiment_df[self.scale_col]

`EmptyRegressor` ¶

Bases: BaseEstimator

Empty regressor class. It does not do anything, used to glue the code of other estimators and PowerAnalysis

Each Regressor should have:

- a `fit` method: uses pre-experiment data to fit some kind of model to be used as a covariate and reduce variance.
- a `predict` method: uses the fitted model to add the covariate to the experiment data.

It can add aggregates of the target in older data as a covariate, or a model (cupac) to predict the target.

Source code in cluster_experiments/cupac.py

class EmptyRegressor(BaseEstimator):
    """
    Placeholder regressor that does nothing. It is used to glue the code of
    other estimators and PowerAnalysis.

    Each Regressor should have:
    - fit method: Uses pre experiment data to fit some kind of model to be used as a covariate and reduce variance.
    - predict method: Uses the fitted model to add the covariate on the experiment data.

    A regressor can add aggregates of the target in older data as a covariate,
    or a model (cupac) to predict the target. This class itself defines
    neither fit nor predict.
    """

    @classmethod
    def from_config(cls, config):
        """Creates an EmptyRegressor; the config argument is accepted but not read."""
        empty_regressor = cls()
        return empty_regressor

`TargetAggregation` ¶

Bases: BaseEstimator

Adds average of target using pre-experiment data

Parameters:

Name	Type	Description	Default
`agg_col`	`str`	Column to group by to aggregate target	required
`target_col`	`str`	Column to aggregate	`'target'`
`smoothing_factor`	`int`	Smoothing factor for the smoothed mean	`20`

Usage:

import pandas as pd
from cluster_experiments.cupac import TargetAggregation

# Pre-experiment data: one grouping column and the target to aggregate
pre_experiment_df = pd.DataFrame(
    {
        "agg_col": ["a", "a", "b", "b", "c", "c"],
        "target_col": [1, 2, 3, 4, 5, 6],
    }
)
# Experiment data only needs the grouping column
experiment_df = pd.DataFrame({"agg_col": ["a", "a", "b", "b", "c", "c"]})

aggregator = TargetAggregation("agg_col", "target_col")
aggregator.fit(
    pre_experiment_df.drop(columns="target_col"),
    pre_experiment_df["target_col"],
)

# predict returns one smoothed pre-experiment mean per row of experiment_df
smoothed_target_means = aggregator.predict(experiment_df)
print(smoothed_target_means)

Source code in cluster_experiments/cupac.py

class TargetAggregation(BaseEstimator):
    """
    Adds average of target using pre-experiment data

    Args:
        agg_col: Column to group by to aggregate target
        target_col: Column to aggregate
        smoothing_factor: Smoothing factor for the smoothed mean
    Usage:
    ```python
    import pandas as pd
    from cluster_experiments.cupac import TargetAggregation

    df = pd.DataFrame({"agg_col": ["a", "a", "b", "b", "c", "c"], "target_col": [1, 2, 3, 4, 5, 6]})
    new_df = pd.DataFrame({"agg_col": ["a", "a", "b", "b", "c", "c"]})
    target_agg = TargetAggregation("agg_col", "target_col")
    target_agg.fit(df.drop(columns="target_col"), df["target_col"])
    df_with_target_agg = target_agg.predict(new_df)
    print(df_with_target_agg)
    ```
    """

    def __init__(
        self,
        agg_col: str,
        target_col: str = "target",
        smoothing_factor: int = 20,
    ):
        self.agg_col = agg_col
        self.target_col = target_col
        self.smoothing_factor = smoothing_factor
        self.is_empty = False
        # Names of the aggregate columns produced by fit
        self.mean_target_col = f"{self.target_col}_mean"
        self.smooth_mean_target_col = f"{self.target_col}_smooth_mean"
        # Placeholder until fit stores the per-group aggregates
        self.pre_experiment_agg_df = pd.DataFrame()

    def _get_pre_experiment_mean(self, pre_experiment_df: pd.DataFrame) -> float:
        """Returns the overall mean of the target in the pre-experiment data."""
        return pre_experiment_df[self.target_col].mean()

    def fit(self, X: pd.DataFrame, y: pd.Series) -> "TargetAggregation":
        """Fits "target encoder" model to pre-experiment data

        Stores the overall target mean and, per agg_col group, the plain mean
        and the smoothed mean
        (group_sum + smoothing_factor * overall_mean) / (group_count + smoothing_factor).

        Returns:
            self
        """
        pre_experiment_df = X.copy()
        pre_experiment_df[self.target_col] = y

        self.pre_experiment_mean = self._get_pre_experiment_mean(pre_experiment_df)
        self.pre_experiment_agg_df = (
            pre_experiment_df.assign(count=1)
            .groupby(self.agg_col, as_index=False)
            .agg({self.target_col: "sum", "count": "sum"})
            .assign(
                **{
                    self.mean_target_col: lambda x: x[self.target_col] / x["count"],
                    self.smooth_mean_target_col: lambda x: (
                        x[self.target_col]
                        + self.smoothing_factor * self.pre_experiment_mean
                    )
                    / (x["count"] + self.smoothing_factor),
                }
            )
            .drop(columns=["count", self.target_col])
        )
        return self

    def predict(self, X: pd.DataFrame) -> ArrayLike:
        """Adds average target of pre-experiment data to experiment data

        Looks up the smoothed pre-experiment mean of each row's group. Rows
        whose group was not seen in fit get the overall pre-experiment mean.

        Args:
            X: Experiment data containing agg_col.

        Returns:
            Array with one value per row of X, in the row order of X.
        """
        agg_cols = (
            list(self.agg_col) if isinstance(self.agg_col, list) else [self.agg_col]
        )
        # Only the join key(s) and the smoothed mean are needed from the fit
        # output; leaving the other aggregate columns out avoids name clashes.
        lookup = self.pre_experiment_agg_df[[*agg_cols, self.smooth_mean_target_col]]

        # If X already carries a column with the output name, the merge would
        # suffix both sides (_x/_y) and the lookup below would raise KeyError.
        left = X
        if self.smooth_mean_target_col in X.columns:
            left = X.drop(columns=[self.smooth_mean_target_col])

        return (
            left.merge(lookup, how="left", on=self.agg_col)[
                self.smooth_mean_target_col
            ]
            .fillna(self.pre_experiment_mean)
            .values
        )

    @classmethod
    def from_config(cls, config):
        """Creates TargetAggregation from PowerConfig"""
        return cls(
            agg_col=config.agg_col,
            target_col=config.target_col,
            smoothing_factor=config.smoothing_factor,
        )

`fit(X, y)` ¶

Fits "target encoder" model to pre-experiment data

Source code in cluster_experiments/cupac.py

def fit(self, X: pd.DataFrame, y: pd.Series) -> "TargetAggregation":
    """Fit the "target encoder" on pre-experiment data.

    Stores the overall target mean in ``pre_experiment_mean`` and, in
    ``pre_experiment_agg_df``, one row per ``agg_col`` group with the plain
    group mean and the smoothed group mean
    ``(group_sum + smoothing_factor * overall_mean) / (group_count + smoothing_factor)``.

    Returns:
        The fitted object itself.
    """
    frame = X.copy()
    frame[self.target_col] = y

    overall_mean = self._get_pre_experiment_mean(frame)
    self.pre_experiment_mean = overall_mean

    # Per-group target sum and row count (the helper column counts rows)
    per_group = (
        frame.assign(count=1)
        .groupby(self.agg_col, as_index=False)
        .agg({self.target_col: "sum", "count": "sum"})
    )
    group_sum = per_group[self.target_col]
    group_size = per_group["count"]

    per_group[self.mean_target_col] = group_sum / group_size
    # Shrink each group mean towards the overall mean
    per_group[self.smooth_mean_target_col] = (
        group_sum + self.smoothing_factor * overall_mean
    ) / (group_size + self.smoothing_factor)

    self.pre_experiment_agg_df = per_group.drop(columns=["count", self.target_col])
    return self

`from_config(config)` `classmethod` ¶

Creates TargetAggregation from PowerConfig

Source code in cluster_experiments/cupac.py

@classmethod
def from_config(cls, config):
    """Build a TargetAggregation from a PowerConfig.

    Reads ``agg_col``, ``target_col`` and ``smoothing_factor`` from ``config``
    and passes them to the constructor as keyword arguments.
    """
    constructor_params = ("agg_col", "target_col", "smoothing_factor")
    return cls(**{name: getattr(config, name) for name in constructor_params})

`predict(X)` ¶

Adds average target of pre-experiment data to experiment data

Source code in cluster_experiments/cupac.py

def predict(self, X: pd.DataFrame) -> ArrayLike:
    """Adds average target of pre-experiment data to experiment data

    Looks up the smoothed pre-experiment mean of each row's group. Rows whose
    group was not seen in fit get the overall pre-experiment mean.

    Args:
        X: Experiment data containing agg_col.

    Returns:
        Array with one value per row of X, in the row order of X.
    """
    agg_cols = (
        list(self.agg_col) if isinstance(self.agg_col, list) else [self.agg_col]
    )
    # Only the join key(s) and the smoothed mean are needed from the fit
    # output; leaving the other aggregate columns out avoids name clashes.
    lookup = self.pre_experiment_agg_df[[*agg_cols, self.smooth_mean_target_col]]

    # If X already carries a column with the output name, the merge would
    # suffix both sides (_x/_y) and the lookup below would raise KeyError.
    left = X
    if self.smooth_mean_target_col in X.columns:
        left = X.drop(columns=[self.smooth_mean_target_col])

    return (
        left.merge(lookup, how="left", on=self.agg_col)[
            self.smooth_mean_target_col
        ]
        .fillna(self.pre_experiment_mean)
        .values
    )

from cluster_experiments.cupac import *¶

CupacHandler ¶

_fit_cupac_model(pre_experiment_x, pre_experiment_y) ¶

_predict_cupac_model(df_predict) ¶

_prep_data_cupac(df, pre_experiment_df) ¶

add_covariates(df, pre_experiment_df=None) ¶

get_pre_experiment_y(pre_experiment_df) ¶

EmptyRegressor ¶

TargetAggregation ¶

fit(X, y) ¶

from_config(config) classmethod ¶

predict(X) ¶

`from cluster_experiments.cupac import *`¶

`CupacHandler` ¶

`_fit_cupac_model(pre_experiment_x, pre_experiment_y)` ¶

`_predict_cupac_model(df_predict)` ¶

`_prep_data_cupac(df, pre_experiment_df)` ¶

`add_covariates(df, pre_experiment_df=None)` ¶

`get_pre_experiment_y(pre_experiment_df)` ¶

`EmptyRegressor` ¶

`TargetAggregation` ¶

`fit(X, y)` ¶

`from_config(config)` `classmethod` ¶

`predict(X)` ¶