Cupac tutorial
In this example we demonstrate the usage of CUPAC as a way to increase power.
We compare vanilla (non-cupac) GEE estimation with GEE estimation using cupac.
In [1]:
from datetime import date

import numpy as np
import pandas as pd
from cluster_experiments import GeeExperimentAnalysis
from cluster_experiments import ConstantPerturbator
from cluster_experiments import PowerAnalysis
from cluster_experiments import ClusteredSplitter
from sklearn.ensemble import HistGradientBoostingRegressor


def generate_random_data(clusters, dates, N):
    """Generates target as a non-linear function of the covariates, the cluster mean and some residual"""
    # Every cluster has a mean
    df_clusters = pd.DataFrame(
        {
            "cluster": clusters,
            "cluster_mean": np.random.normal(0, 0.1, size=len(clusters)),
        }
    )
    # The target is the sum of: covariates, cluster mean and random residual
    df = (
        pd.DataFrame(
            {
                "cluster": np.random.choice(clusters, size=N),
                "residual": np.random.normal(0, 1, size=N),
                "date": np.random.choice(dates, size=N),
                "x1": np.random.normal(0, 1, size=N),
                "x2": np.random.normal(0, 1, size=N),
                "x3": np.random.normal(0, 1, size=N),
                "x4": np.random.normal(0, 1, size=N),
            }
        )
        .merge(df_clusters, on="cluster")
        .assign(
            target=lambda x: x["x1"] * x["x2"]
            + x["x3"] ** 2
            + x["x4"]
            + x["cluster_mean"]
            + x["residual"]
        )
    )
    return df
Data generation
We have data that contains:
- Cluster
- Date
- Some covariates
- Outcome (target)
We want to run a switchback-clustered experiment for around 15 days.
We take data from day 15 to day 31 as the analysis data, and data from day 1 to day 14 as the pre-analysis data, which will be used to fit the cupac model.
In [2]:
clusters = [f"Cluster {i}" for i in range(100)]
dates = [f"{date(2022, 1, i):%Y-%m-%d}" for i in range(1, 32)]
experiment_dates = [f"{date(2022, 1, i):%Y-%m-%d}" for i in range(15, 32)]
N = 10_000
df = generate_random_data(clusters, dates, N).drop(columns=["residual", "cluster_mean"])
df_analysis = df.query(f"date.isin({experiment_dates})")
df_pre = df.query(f"~date.isin({experiment_dates})")
df
clusters = [f"Cluster {i}" for i in range(100)]
dates = [f"{date(2022, 1, i):%Y-%m-%d}" for i in range(1, 32)]
experiment_dates = [f"{date(2022, 1, i):%Y-%m-%d}" for i in range(15, 32)]
N = 10_000
df = generate_random_data(clusters, dates, N).drop(columns=["residual", "cluster_mean"])
df_analysis = df.query(f"date.isin({experiment_dates})")
df_pre = df.query(f"~date.isin({experiment_dates})")
df
Out[2]:
| | cluster | date | x1 | x2 | x3 | x4 | target |
|---|---|---|---|---|---|---|---|
| 0 | Cluster 64 | 2022-01-26 | 0.407857 | -0.821972 | 0.380603 | -0.435979 | 0.259322 |
| 1 | Cluster 64 | 2022-01-01 | -0.338506 | 1.259993 | -1.172503 | -0.638529 | 0.109807 |
| 2 | Cluster 64 | 2022-01-25 | 0.144844 | -0.647623 | 1.126136 | 0.339519 | 1.672764 |
| 3 | Cluster 64 | 2022-01-04 | -0.081028 | 2.257395 | 1.786203 | 0.608843 | 4.281907 |
| 4 | Cluster 64 | 2022-01-13 | -0.381064 | -0.636419 | 0.854746 | 1.072939 | 2.303696 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 9995 | Cluster 74 | 2022-01-05 | 0.781370 | -0.216738 | -0.786449 | -0.806634 | -0.552988 |
| 9996 | Cluster 74 | 2022-01-16 | -0.858006 | 0.801571 | -1.044879 | 0.105053 | 0.638271 |
| 9997 | Cluster 74 | 2022-01-22 | 0.943984 | -0.975810 | -0.839801 | 0.839736 | 1.674833 |
| 9998 | Cluster 74 | 2022-01-24 | 1.269351 | -0.037544 | -1.506677 | 1.209883 | 2.423682 |
| 9999 | Cluster 74 | 2022-01-30 | -1.141438 | 0.977891 | 0.143942 | -0.585707 | -0.835189 |

10000 rows × 7 columns
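As a quick sanity check, and as a small addition to the original walkthrough, we can confirm that df_pre and df_analysis cover the intended date ranges:

# Sanity check of the analysis/pre-analysis split (not part of the original notebook)
print(df_pre["date"].min(), "->", df_pre["date"].max())  # expected 2022-01-01 -> 2022-01-14
print(df_analysis["date"].min(), "->", df_analysis["date"].max())  # expected 2022-01-15 -> 2022-01-31
print(len(df_pre) + len(df_analysis) == len(df))  # expected True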
Vanilla method
We just run a regular GEE estimation, with no covariate adjustment.
In [3]:
# Splitter and perturbator
sw = ClusteredSplitter(
    cluster_cols=["cluster", "date"],
)

perturbator = ConstantPerturbator(
    average_effect=0.1,
)

# Vanilla GEE
analysis = GeeExperimentAnalysis(
    cluster_cols=["cluster", "date"],
)

pw_vanilla = PowerAnalysis(
    perturbator=perturbator,
    splitter=sw,
    analysis=analysis,
    n_simulations=50,
)

power = pw_vanilla.power_analysis(df_analysis)
print(f"Not using cupac: {power = }")
Not using cupac: power = 0.5
We can see that the power is not great.
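To get a feel for what this means, here is a minimal sketch (not part of the original notebook) that reruns the vanilla power analysis for a few hypothetical effect sizes; the list of effect sizes is an arbitrary choice for illustration.

# Sketch: how vanilla power scales with the assumed effect size.
# The effect sizes below are arbitrary illustration values.
for effect in [0.05, 0.1, 0.2]:
    pw = PowerAnalysis(
        perturbator=ConstantPerturbator(average_effect=effect),
        splitter=sw,
        analysis=GeeExperimentAnalysis(cluster_cols=["cluster", "date"]),
        n_simulations=50,
    )
    print(f"effect = {effect}: power = {pw.power_analysis(df_analysis)}")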
Cupac method
We fit a GBM model with covariates x1, x2, x3 and x4 on the pre-analysis data, and use its predictions on the analysis data as an extra covariate (estimate_target) to reduce variance.
In [4]:
# Cupac GEE
analysis = GeeExperimentAnalysis(
    cluster_cols=["cluster", "date"], covariates=["estimate_target"]
)

gbm = HistGradientBoostingRegressor()

pw_cupac = PowerAnalysis(
    perturbator=perturbator,
    splitter=sw,
    analysis=analysis,
    n_simulations=50,
    cupac_model=gbm,
    features_cupac_model=["x1", "x2", "x3", "x4"],
)

power = pw_cupac.power_analysis(df_analysis, df_pre)
print(f"Using cupac: {power = }")
Using cupac: power = 0.92
Power has increased!
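To build intuition for why this works, here is a minimal sketch of roughly what the cupac step does: fit the model on the pre-analysis data and use its out-of-sample predictions on the analysis data as the estimate_target covariate. This is an illustration rather than the library's internal code, and gbm_manual and df_check are names introduced here.

# Rough illustration of the cupac step (not the library internals):
# fit on pre-experiment data, then score the analysis data.
gbm_manual = HistGradientBoostingRegressor()
gbm_manual.fit(df_pre[["x1", "x2", "x3", "x4"]], df_pre["target"])

df_check = df_analysis.assign(
    estimate_target=gbm_manual.predict(df_analysis[["x1", "x2", "x3", "x4"]])
)

# The stronger the correlation between the prediction and the target,
# the larger the variance reduction we can expect from using it as a covariate.
print(df_check[["target", "estimate_target"]].corr())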