Time-lines
This notebook uses the power-line and MDE-line functionality to show how to plot power and MDE as a function of the experiment length: power for a given set of average effects, and MDE for a given set of target powers.
In [47]:
Copied!
from datetime import date
import numpy as np
from cluster_experiments import NormalPowerAnalysis
import pandas as pd
import plotnine as p9
# Create fake data
# A dedicated, seeded generator makes the synthetic dataset (and every figure
# derived from it) reproducible under Restart Kernel -> Run All.
SEED = 20240922
rng = np.random.default_rng(SEED)

N = 10_000
clusters = [f"Cluster {i}" for i in range(10)]
dates = [f"{date(2022, 1, i):%Y-%m-%d}" for i in range(1, 15)]
df = pd.DataFrame(
    {
        "cluster": rng.choice(clusters, size=N),
        "date": rng.choice(dates, size=N),
    }
).assign(
    # Parse the date strings once; the columns below reuse the parsed values
    date=lambda d: pd.to_datetime(d["date"]),
    # Target is a linear combination of cluster and day of week, plus some noise
    cluster_id=lambda d: d["cluster"].astype("category").cat.codes,
    day_of_week=lambda d: d["date"].dt.dayofweek,
    # Size the noise from the frame itself rather than the global N
    target=lambda d: d["cluster_id"] + d["day_of_week"] + rng.normal(size=len(d)),
)
from datetime import date
import numpy as np
from cluster_experiments import NormalPowerAnalysis
import pandas as pd
import plotnine as p9
# Create fake data
# A dedicated, seeded generator makes the synthetic dataset (and every figure
# derived from it) reproducible under Restart Kernel -> Run All.
SEED = 20240922
rng = np.random.default_rng(SEED)

N = 10_000
clusters = [f"Cluster {i}" for i in range(10)]
dates = [f"{date(2022, 1, i):%Y-%m-%d}" for i in range(1, 15)]
df = pd.DataFrame(
    {
        "cluster": rng.choice(clusters, size=N),
        "date": rng.choice(dates, size=N),
    }
).assign(
    # Parse the date strings once; the columns below reuse the parsed values
    date=lambda d: pd.to_datetime(d["date"]),
    # Target is a linear combination of cluster and day of week, plus some noise
    cluster_id=lambda d: d["cluster"].astype("category").cat.codes,
    day_of_week=lambda d: d["date"].dt.dayofweek,
    # Size the noise from the frame itself rather than the global N
    target=lambda d: d["cluster_id"] + d["day_of_week"] + rng.normal(size=len(d)),
)
In [48]:
Copied!
# Power-analysis configuration for the switchback-style set-up: clusters are
# defined by the ("cluster", "date") column pair and "date" is the time column.
power_analysis_config = {
    "splitter": "clustered",
    "analysis": "clustered_ols",
    "cluster_cols": ["cluster", "date"],
    "n_simulations": 5,
    "hypothesis": "two-sided",
    "seed": 20240922,
    "time_col": "date",
}
# Build the analysis object from the plain config dict
pw_normal = NormalPowerAnalysis.from_dict(power_analysis_config)
# Power-analysis configuration for the switchback-style set-up: clusters are
# defined by the ("cluster", "date") column pair and "date" is the time column.
power_analysis_config = {
    "splitter": "clustered",
    "analysis": "clustered_ols",
    "cluster_cols": ["cluster", "date"],
    "n_simulations": 5,
    "hypothesis": "two-sided",
    "seed": 20240922,
    "time_col": "date",
}
# Build the analysis object from the plain config dict
pw_normal = NormalPowerAnalysis.from_dict(power_analysis_config)
In [49]:
Copied!
%%time
# Compute the power line for different experiment lengths and different
# average effects.
experiment_lengths = range(1, 14)  # lengths 1..13
effect_sizes = range(5)  # effects 0..4
power_line = pw_normal.power_time_line(
    df,
    experiment_length=experiment_lengths,
    average_effects=effect_sizes,
)
%%time
# Compute the power line for different experiment lengths and different
# average effects.
experiment_lengths = range(1, 14)  # lengths 1..13
effect_sizes = range(5)  # effects 0..4
power_line = pw_normal.power_time_line(
    df,
    experiment_length=experiment_lengths,
    average_effects=effect_sizes,
)
CPU times: total: 1.3 s Wall time: 1.32 s
In [50]:
Copied!
# Plot power against experiment length, one coloured line per effect.
power_df = pd.DataFrame(power_line)
(
    p9.ggplot(
        power_df,
        p9.aes(x="experiment_length", y="power", color="effect", group="effect"),
    )
    + p9.geom_line()
    + p9.geom_point()
    + p9.theme_minimal()
    + p9.labs(x="Experiment Length", y="Power")
    + p9.ggtitle("Power lines")
)
# Plot power against experiment length, one coloured line per effect.
power_df = pd.DataFrame(power_line)
(
    p9.ggplot(
        power_df,
        p9.aes(x="experiment_length", y="power", color="effect", group="effect"),
    )
    + p9.geom_line()
    + p9.geom_point()
    + p9.theme_minimal()
    + p9.labs(x="Experiment Length", y="Power")
    + p9.ggtitle("Power lines")
)
In [51]:
Copied!
%%time
# Compute the MDE line for different experiment lengths and different
# target powers.
experiment_lengths = range(1, 14)  # lengths 1..13
target_powers = [0.7, 0.8, 0.9]
mde_line = pw_normal.mde_time_line(
    df,
    experiment_length=experiment_lengths,
    powers=target_powers,
)
%%time
# Compute the MDE line for different experiment lengths and different
# target powers.
experiment_lengths = range(1, 14)  # lengths 1..13
target_powers = [0.7, 0.8, 0.9]
mde_line = pw_normal.mde_time_line(
    df,
    experiment_length=experiment_lengths,
    powers=target_powers,
)
CPU times: total: 1.33 s Wall time: 1.33 s
In [52]:
Copied!
# Plot the MDE against experiment length, one coloured line per power level.
mde_df = pd.DataFrame(mde_line)
(
    p9.ggplot(
        mde_df,
        p9.aes(x="experiment_length", y="mde", color="power", group="power"),
    )
    + p9.geom_line()
    + p9.geom_point()
    + p9.theme_minimal()
    + p9.labs(x="Experiment Length", y="Minimum Detectable Effect")
    + p9.ggtitle("MDE lines")
)
# Plot the MDE against experiment length, one coloured line per power level.
mde_df = pd.DataFrame(mde_line)
(
    p9.ggplot(
        mde_df,
        p9.aes(x="experiment_length", y="mde", color="power", group="power"),
    )
    + p9.geom_line()
    + p9.geom_point()
    + p9.theme_minimal()
    + p9.labs(x="Experiment Length", y="Minimum Detectable Effect")
    + p9.ggtitle("MDE lines")
)
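Now suppose we want to analyze how the target metric evolves over time (for instance, how the average user profit changes on a daily basis). To do this, we can compute the Minimum Detectable Effect over a rolling time window with `mde_rolling_time_line`, here aggregating with `agg_func='mean'`, and then plot the resulting relative MDE against the experiment length.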
In [53]:
Copied!
%%time
# Compute the MDE line on a rolling basis for different experiment lengths
# and different target powers, aggregating with the mean.
rolling_lengths = range(1, 15)  # lengths 1..14
rolling_powers = [0.7, 0.8, 0.9]
rolling_mde_line = pw_normal.mde_rolling_time_line(
    df=df,
    powers=rolling_powers,
    experiment_length=rolling_lengths,
    agg_func="mean",
)
%%time
# Compute the MDE line on a rolling basis for different experiment lengths
# and different target powers, aggregating with the mean.
rolling_lengths = range(1, 15)  # lengths 1..14
rolling_powers = [0.7, 0.8, 0.9]
rolling_mde_line = pw_normal.mde_rolling_time_line(
    df=df,
    powers=rolling_powers,
    experiment_length=rolling_lengths,
    agg_func="mean",
)
CPU times: total: 719 ms Wall time: 696 ms
In [54]:
Copied!
# Plot the rolling relative MDE against experiment length, one coloured line
# per power level.
rolling_mde_df = pd.DataFrame(rolling_mde_line)
(
    p9.ggplot(
        rolling_mde_df,
        p9.aes(x="experiment_length", y="relative_mde", color="power", group="power"),
    )
    + p9.geom_line()
    + p9.geom_point()
    + p9.theme_minimal()
    + p9.labs(x="Experiment Length", y="Relative Minimum Detectable Effect")
    + p9.ggtitle("Rolling relative MDE lines")
)
# Plot the rolling relative MDE against experiment length, one coloured line
# per power level.
rolling_mde_df = pd.DataFrame(rolling_mde_line)
(
    p9.ggplot(
        rolling_mde_df,
        p9.aes(x="experiment_length", y="relative_mde", color="power", group="power"),
    )
    + p9.geom_line()
    + p9.geom_point()
    + p9.theme_minimal()
    + p9.labs(x="Experiment Length", y="Relative Minimum Detectable Effect")
    + p9.ggtitle("Rolling relative MDE lines")
)