Cupac tutorial
In this example we demonstrate the usage of CUPAC as a way to increase power.
We compare vanilla (non-cupac) GEE estimation with GEE estimation using cupac.
In [1]:
from datetime import date

import numpy as np
import pandas as pd
from cluster_experiments import GeeExperimentAnalysis
from cluster_experiments import ConstantPerturbator
from cluster_experiments import PowerAnalysis
from cluster_experiments import ClusteredSplitter
from sklearn.ensemble import HistGradientBoostingRegressor


def generate_random_data(clusters, dates, N):
    """Generates target as a non-linear function of the covariates, the cluster mean and some residual"""
    # Every cluster has a mean
    df_clusters = pd.DataFrame(
        {
            "cluster": clusters,
            "cluster_mean": np.random.normal(0, 0.1, size=len(clusters)),
        }
    )
    # The target is the sum of: covariates, cluster mean and random residual
    df = (
        pd.DataFrame(
            {
                "cluster": np.random.choice(clusters, size=N),
                "residual": np.random.normal(0, 1, size=N),
                "date": np.random.choice(dates, size=N),
                "x1": np.random.normal(0, 1, size=N),
                "x2": np.random.normal(0, 1, size=N),
                "x3": np.random.normal(0, 1, size=N),
                "x4": np.random.normal(0, 1, size=N),
            }
        )
        .merge(df_clusters, on="cluster")
        .assign(
            target=lambda x: x["x1"] * x["x2"]
            + x["x3"] ** 2
            + x["x4"]
            + x["cluster_mean"]
            + x["residual"]
        )
    )
    return df
Data generation
We have data that contains:
- Cluster
- Date
- Some covariates
- Outcome (target)
We want to run a switchback-clustered experiment for around 15 days.
We take data from day 15 to day 31 as the analysis data, and data from day 1 to day 14 as the pre-analysis data, which will be used to fit the cupac model.
In [2]:
clusters = [f"Cluster {i}" for i in range(100)]
dates = [f"{date(2022, 1, i):%Y-%m-%d}" for i in range(1, 32)]
experiment_dates = [f"{date(2022, 1, i):%Y-%m-%d}" for i in range(15, 32)]
N = 10_000
df = generate_random_data(clusters, dates, N).drop(columns=["residual", "cluster_mean"])
df_analysis = df.query(f"date.isin({experiment_dates})")
df_pre = df.query(f"~date.isin({experiment_dates})")
df
clusters = [f"Cluster {i}" for i in range(100)]
dates = [f"{date(2022, 1, i):%Y-%m-%d}" for i in range(1, 32)]
experiment_dates = [f"{date(2022, 1, i):%Y-%m-%d}" for i in range(15, 32)]
N = 10_000
df = generate_random_data(clusters, dates, N).drop(columns=["residual", "cluster_mean"])
df_analysis = df.query(f"date.isin({experiment_dates})")
df_pre = df.query(f"~date.isin({experiment_dates})")
df
Out[2]:
| | cluster | date | x1 | x2 | x3 | x4 | target |
|---|---|---|---|---|---|---|---|
| 0 | Cluster 64 | 2022-01-26 | 0.407857 | -0.821972 | 0.380603 | -0.435979 | 0.259322 |
| 1 | Cluster 64 | 2022-01-01 | -0.338506 | 1.259993 | -1.172503 | -0.638529 | 0.109807 |
| 2 | Cluster 64 | 2022-01-25 | 0.144844 | -0.647623 | 1.126136 | 0.339519 | 1.672764 |
| 3 | Cluster 64 | 2022-01-04 | -0.081028 | 2.257395 | 1.786203 | 0.608843 | 4.281907 |
| 4 | Cluster 64 | 2022-01-13 | -0.381064 | -0.636419 | 0.854746 | 1.072939 | 2.303696 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 9995 | Cluster 74 | 2022-01-05 | 0.781370 | -0.216738 | -0.786449 | -0.806634 | -0.552988 |
| 9996 | Cluster 74 | 2022-01-16 | -0.858006 | 0.801571 | -1.044879 | 0.105053 | 0.638271 |
| 9997 | Cluster 74 | 2022-01-22 | 0.943984 | -0.975810 | -0.839801 | 0.839736 | 1.674833 |
| 9998 | Cluster 74 | 2022-01-24 | 1.269351 | -0.037544 | -1.506677 | 1.209883 | 2.423682 |
| 9999 | Cluster 74 | 2022-01-30 | -1.141438 | 0.977891 | 0.143942 | -0.585707 | -0.835189 |

10000 rows × 7 columns
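As a quick sanity check, and as a small addition to the original walkthrough, we can confirm that df_pre and df_analysis cover the intended date ranges:

# Sanity check of the analysis/pre-analysis split (not part of the original notebook)
print(df_pre["date"].min(), "->", df_pre["date"].max())  # expected 2022-01-01 -> 2022-01-14
print(df_analysis["date"].min(), "->", df_analysis["date"].max())  # expected 2022-01-15 -> 2022-01-31
print(len(df_pre) + len(df_analysis) == len(df))  # expected True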
Vanilla method
We just run a regular GEE estimation, with no covariate adjustment.
In [3]:
# Splitter and perturbator
sw = ClusteredSplitter(
    cluster_cols=["cluster", "date"],
)

perturbator = ConstantPerturbator(
    average_effect=0.1,
)

# Vanilla GEE
analysis = GeeExperimentAnalysis(
    cluster_cols=["cluster", "date"],
)

pw_vanilla = PowerAnalysis(
    perturbator=perturbator,
    splitter=sw,
    analysis=analysis,
    n_simulations=50,
)

power = pw_vanilla.power_analysis(df_analysis)
print(f"Not using cupac: {power = }")
Not using cupac: power = 0.5
We can see that the power is not great.
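To get a feel for what this means, here is a minimal sketch (not part of the original notebook) that reruns the vanilla power analysis for a few hypothetical effect sizes; the list of effect sizes is an arbitrary choice for illustration.

# Sketch: how vanilla power scales with the assumed effect size.
# The effect sizes below are arbitrary illustration values.
for effect in [0.05, 0.1, 0.2]:
    pw = PowerAnalysis(
        perturbator=ConstantPerturbator(average_effect=effect),
        splitter=sw,
        analysis=GeeExperimentAnalysis(cluster_cols=["cluster", "date"]),
        n_simulations=50,
    )
    print(f"effect = {effect}: power = {pw.power_analysis(df_analysis)}")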
Cupac method
We fit a GBM model with covariates x1, x2, x3 and x4 on the pre-analysis data, and use its predictions on the analysis data as an extra covariate (estimate_target) to reduce variance.
In [4]:
# Cupac GEE
analysis = GeeExperimentAnalysis(
    cluster_cols=["cluster", "date"], covariates=["estimate_target"]
)

gbm = HistGradientBoostingRegressor()

pw_cupac = PowerAnalysis(
    perturbator=perturbator,
    splitter=sw,
    analysis=analysis,
    n_simulations=50,
    cupac_model=gbm,
    features_cupac_model=["x1", "x2", "x3", "x4"],
)

power = pw_cupac.power_analysis(df_analysis, df_pre)
print(f"Using cupac: {power = }")
Using cupac: power = 0.92
Power has increased!
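To build intuition for why this works, here is a minimal sketch of roughly what the cupac step does: fit the model on the pre-analysis data and use its out-of-sample predictions on the analysis data as the estimate_target covariate. This is an illustration rather than the library's internal code, and gbm_manual and df_check are names introduced here.

# Rough illustration of the cupac step (not the library internals):
# fit on pre-experiment data, then score the analysis data.
gbm_manual = HistGradientBoostingRegressor()
gbm_manual.fit(df_pre[["x1", "x2", "x3", "x4"]], df_pre["target"])

df_check = df_analysis.assign(
    estimate_target=gbm_manual.predict(df_analysis[["x1", "x2", "x3", "x4"]])
)

# The stronger the correlation between the prediction and the target,
# the larger the variance reduction we can expect from using it as a covariate.
print(df_check[["target", "estimate_target"]].corr())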