Stratified switchback
This notebook shows how to use the switchback module. In particular, it shows how to create a PowerAnalysis object with a stratified switchback splitter, using a time column and 30-minute switches.
It then uses the splitter of the PowerAnalysis object to simulate the treatment assignment and shows how the stratification of the clusters works.
Finally, it shows how to run the power analysis.
In [1]:
from cluster_experiments import PowerAnalysis
import pandas as pd
import numpy as np
np.random.seed(42)
In [2]:
# Define half-hourly (30 min) switchback splitter
config = {
    "time_col": "time",
    "switch_frequency": "30min",
    "perturbator": "constant",
    "analysis": "ols_clustered",
    "splitter": "switchback_stratified",
    "cluster_cols": ["time", "city"],
    "strata_cols": ["day_of_week", "hour_of_day", "city"],
    "target_col": "y",
}
power = PowerAnalysis.from_dict(config)
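With this config, each (30-minute window, city) pair is a randomization unit: cluster_cols tells the splitter to switch treatment per city every 30 minutes, and strata_cols asks it to balance assignments across day of week, hour of day, and city. As a rough illustration of the time bucketing with plain pandas (an assumed stand-in for the splitter's internal windowing, not the library's actual code):

# Illustration only: map raw timestamps to 30-minute window starts.
# pandas flooring is an assumption about the bucketing, not
# cluster_experiments' internal implementation.
ts = pd.Series(pd.to_datetime(["2021-01-01 00:10", "2021-01-01 00:40"]))
print(ts.dt.floor("30min"))  # -> 2021-01-01 00:00:00 and 2021-01-01 00:30:00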
In [3]:
# Define data with random dates
df_raw = pd.DataFrame(
    {
        # Sample 7 * 24 * 60 = 10,080 random minute-level timestamps
        # between 2021-01-01 and 2021-01-08
        "time": pd.date_range("2021-01-01", "2021-01-08", freq="1min")[
            np.random.randint(7 * 24 * 60, size=7 * 24 * 60)
        ],
        "y": np.random.randn(7 * 24 * 60),
    }
).assign(
    day_of_week=lambda df: df.time.dt.dayofweek,
    hour_of_day=lambda df: df.time.dt.hour,
)
df = pd.concat([df_raw.assign(city=city) for city in ("TGN", "NYC", "LON", "REU")])
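Each city receives an identical copy of the 7 × 24 × 60 = 10,080 simulated rows, so the concatenated frame has 40,320 rows in total. A quick illustrative check (the assertion is not part of the original notebook):

# Sanity check: 10,080 rows per city times 4 cities
assert len(df) == 4 * 7 * 24 * 60  # 40,320 rows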
In [4]:
df.head(10)
Out[4]:
|   | time                | y         | day_of_week | hour_of_day | city |
|---|---------------------|-----------|-------------|-------------|------|
| 0 | 2021-01-06 01:10:00 | -0.216104 | 2           | 1           | TGN  |
| 1 | 2021-01-01 14:20:00 | -1.016524 | 4           | 14          | TGN  |
| 2 | 2021-01-04 17:50:00 | -2.326362 | 0           | 17          | TGN  |
| 3 | 2021-01-04 14:31:00 | -0.358456 | 0           | 14          | TGN  |
| 4 | 2021-01-04 23:34:00 | -0.490571 | 0           | 23          | TGN  |
| 5 | 2021-01-05 08:25:00 | -0.149901 | 1           | 8           | TGN  |
| 6 | 2021-01-01 07:46:00 | -0.628898 | 4           | 7           | TGN  |
| 7 | 2021-01-04 01:46:00 | 1.829330  | 0           | 1           | TGN  |
| 8 | 2021-01-04 20:58:00 | 0.517337  | 0           | 20          | TGN  |
| 9 | 2021-01-06 18:42:00 | -0.499613 | 2           | 18          | TGN  |
In [5]:
treatments = power.splitter.assign_treatment_df(df)
In [6]:
# For every city, we have a balanced AB split
(
    treatments
    .loc[:, ["city", "treatment", "time"]]
    .drop_duplicates()
    .groupby(["city", "treatment"])
    .size()
    .head(10)
)
Out[6]:
city  treatment
LON   A            168
      B            168
NYC   A            168
      B            168
REU   A            168
      B            168
TGN   A            168
      B            168
dtype: int64
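These counts match the design: 7 days × 48 half-hour windows per day gives 336 windows per city, which a balanced split divides into 168 per arm. A small illustrative check, assuming (as the counts above suggest) that the splitter has already bucketed the time column into its 30-minute windows:

# 7 days x 48 half-hour windows = 336 windows per city,
# split evenly into 168 per treatment arm.
windows_per_city = treatments[["city", "time"]].drop_duplicates().groupby("city").size()
assert (windows_per_city == 7 * 48).all()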
In [7]:
# For every hour of day, we have a balanced AB split
(
    treatments
    .loc[:, ["city", "treatment", "time", "hour_of_day"]]
    .drop_duplicates()
    .groupby(["hour_of_day", "treatment"])
    .size()
    .head(10)
)
Out[7]:
hour_of_day  treatment
0            A            28
             B            28
1            A            28
             B            28
2            A            28
             B            28
3            A            28
             B            28
4            A            28
             B            28
dtype: int64
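The same arithmetic applies here: each hour of day spans two 30-minute windows, so across 7 days and 4 cities there are 2 × 7 × 4 = 56 windows per hour, split evenly into 28 per arm.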
In [8]:
# For every day of week, we have a balanced AB split
(
    treatments
    .loc[:, ["city", "treatment", "time", "day_of_week"]]
    .drop_duplicates()
    .groupby(["day_of_week", "treatment"])
    .size()
    .head(10)
)
Out[8]:
day_of_week  treatment
0            A            96
             B            96
1            A            96
             B            96
2            A            96
             B            96
3            A            96
             B            96
4            A            96
             B            96
dtype: int64
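Likewise, each day of week contributes 48 windows × 4 cities = 192 windows, split into 96 per arm.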
In [9]:
# In the first 30 minutes of the day, LON, NYC, REU, and TGN have a constant treatment
treatments.query("time < '2021-01-01 00:30:00'").groupby(["city", "treatment"]).size()
Out[9]:
city  treatment
LON   B            36
NYC   B            36
REU   A            36
TGN   B            36
dtype: int64
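Each city shows up under a single arm because treatment is constant within a switchback window; the 36 rows per city are simply the simulated observations that happen to fall in that first half hour.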
In [10]:
# Finally, we can run the power analysis
power.power_analysis(df, average_effect=0.01)
Out[10]:
0.17
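The estimated power for an average effect of 0.01 is low (0.17). A natural next step is to scan larger effect sizes; the loop below is a sketch that reuses only the power_analysis call shown above, with arbitrary effect sizes:

# Sketch: estimate power at a few (arbitrary) effect sizes,
# reusing the power_analysis call demonstrated above.
for effect in (0.05, 0.1, 0.2):
    pw = power.power_analysis(df, average_effect=effect)
    print(f"average_effect={effect}: power={pw}")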