AA test with clustered data
This notebook shows that, when treatment is assigned with a clustered splitter and the clusters explain part of the variance of the outcome, analyzing the data as if observations were independent leads to a higher false positive rate than the nominal one: observations within a cluster are correlated, so the effective sample size is smaller than the row count and iid standard errors are too small.
In particular, we use a clustered splitter and:
- An OLS estimator with cluster-robust standard errors: we see that it passes the AA test.
- A plain OLS (without clustered standard errors): it fails the AA test, returning a false positive rate far above alpha.
In [1]:
from datetime import date
import numpy as np
from cluster_experiments import PowerAnalysis, ConstantPerturbator, BalancedClusteredSplitter, ExperimentAnalysis, ClusteredOLSAnalysis
import pandas as pd
import statsmodels.api as sm
# Create fake data
N = 10_000
clusters = [f"Cluster {i}" for i in range(10)]
dates = [f"{date(2022, 1, i):%Y-%m-%d}" for i in range(1, 15)]
df = pd.DataFrame(
    {
        "cluster": np.random.choice(clusters, size=N),
        "date": np.random.choice(dates, size=N),
    }
).assign(
    # Target is a linear combination of cluster and day of week, plus some noise
    cluster_id=lambda df: df["cluster"].astype("category").cat.codes,
    day_of_week=lambda df: pd.to_datetime(df["date"]).dt.dayofweek,
    target=lambda df: df["cluster_id"] + df["day_of_week"] + np.random.normal(size=N),
)
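Although there are N = 10,000 rows, randomization will happen over cluster-date cells, so the effective number of experimental units is much smaller; a quick count, as a minimal sketch using the df just built:

# The splitter below randomizes cluster-date cells, not individual rows:
# 10 clusters x 14 dates = 140 units, far fewer than the 10,000 rows.
n_units = df[["cluster", "date"]].drop_duplicates().shape[0]
print(f"{len(df)} rows, but only {n_units} randomization units")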
In [2]:
df.head()
Out[2]:
|   | cluster | date | cluster_id | day_of_week | target |
|---|---|---|---|---|---|
| 0 | Cluster 3 | 2022-01-08 | 3 | 5 | 7.534487 | 
| 1 | Cluster 2 | 2022-01-06 | 2 | 3 | 5.039041 | 
| 2 | Cluster 1 | 2022-01-14 | 1 | 4 | 5.341845 | 
| 3 | Cluster 7 | 2022-01-12 | 7 | 2 | 9.468617 | 
| 4 | Cluster 0 | 2022-01-10 | 0 | 0 | -0.644678 | 
As expected from the data-generating process, some clusters have a higher average outcome than others:
In [3]:
df.groupby("cluster").agg({"target": ["mean", "std"]})
Out[3]:
| cluster | target mean | target std |
|---|---|---|
| Cluster 0 | 3.027335 | 2.223308 | 
| Cluster 1 | 3.907833 | 2.211297 | 
| Cluster 2 | 4.895215 | 2.270596 | 
| Cluster 3 | 6.045043 | 2.269786 | 
| Cluster 4 | 6.902209 | 2.224554 | 
| Cluster 5 | 8.028794 | 2.313159 | 
| Cluster 6 | 9.046213 | 2.253462 | 
| Cluster 7 | 10.055748 | 2.226720 | 
| Cluster 8 | 11.048716 | 2.273583 | 
| Cluster 9 | 11.939075 | 2.216478 | 
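To quantify how much of the variance the clusters explain, one can regress the target on cluster dummies and look at the R². A minimal sketch; given the data-generating process above (cluster variance ≈ 8.25 out of a total ≈ 13.25), we would expect roughly 0.6:

# Share of target variance explained by the cluster alone. From the DGP above,
# Var(cluster_id) = 99/12 = 8.25 and total variance ~= 8.25 + 4 + 1 = 13.25,
# so expect R^2 ~= 0.6.
cluster_only = sm.OLS.from_formula("target ~ C(cluster)", data=df).fit()
print(f"R^2 of target ~ cluster: {cluster_only.rsquared:.2f}")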
In [4]:
# Plain OLS analysis with iid (non-clustered) standard errors
class NonClusteredOLS(ExperimentAnalysis):
    def analysis_pvalue(self, df: pd.DataFrame, verbose: bool = False) -> float:
        """Returns the p-value of the analysis.

        Arguments:
            df: dataframe containing the data to analyze
            verbose (Optional): bool, prints the regression summary if True
        """
        results_ols = sm.OLS.from_formula("target ~ treatment", data=df).fit()
        if verbose:
            print(results_ols.summary())
        return results_ols.pvalues[self.treatment_col]
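For contrast, the clustered analysis fits the same regression but with cluster-robust standard errors. A minimal sketch of that idea, assuming the base class exposes the cluster columns as self.cluster_cols (the real ClusteredOLSAnalysis in cluster_experiments may differ in its details):

# Hedged sketch of a cluster-robust analysis: same point estimate as plain OLS,
# but the covariance (and hence the p-value) accounts for within-cluster correlation.
class SketchClusteredOLS(ExperimentAnalysis):
    def analysis_pvalue(self, df: pd.DataFrame, verbose: bool = False) -> float:
        # Build one group label per cluster-date cell (assumed handling of
        # multiple cluster columns; for illustration only)
        groups = df[self.cluster_cols].astype(str).agg("_".join, axis=1)
        results_ols = sm.OLS.from_formula("target ~ treatment", data=df).fit(
            cov_type="cluster", cov_kwds={"groups": groups}
        )
        if verbose:
            print(results_ols.summary())
        return results_ols.pvalues[self.treatment_col]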
In [5]:
cluster_cols = ["cluster", "date"]
splitter = BalancedClusteredSplitter(
    cluster_cols=cluster_cols,
)
perturbator = ConstantPerturbator()
alpha = 0.05
n_simulations = 100
# Correct setup: clustered splitter with cluster-robust OLS analysis
pw_right = PowerAnalysis(
    splitter=splitter,
    perturbator=perturbator,
    alpha=alpha,
    n_simulations=n_simulations,
    analysis=ClusteredOLSAnalysis(
        cluster_cols=cluster_cols,
    ),
)
# Incorrect setup: clustered splitter but plain (non-clustered) OLS
pw_wrong = PowerAnalysis(
    splitter=splitter,
    perturbator=perturbator,
    alpha=alpha,
    n_simulations=n_simulations,
    analysis=NonClusteredOLS(
        # cluster_cols is required by the interface, but this analysis ignores it!
        cluster_cols=cluster_cols,
    ),
)
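Conceptually, each simulation splits the clusters at random, applies a zero average effect, runs the analysis, and counts how often p < alpha. A minimal sketch of that loop, with method names assumed from the library's splitter/analysis interfaces (PowerAnalysis implements this internally):

# Hedged sketch of an AA-test loop. With average_effect=0.0 the perturbation
# is a no-op, so the measured "power" is just the false positive rate.
def aa_false_positive_rate(df, splitter, analysis, alpha=0.05, n_simulations=100):
    rejections = 0
    for _ in range(n_simulations):
        treated_df = splitter.assign_treatment_df(df)  # fresh random cluster split
        rejections += analysis.get_pvalue(treated_df) < alpha
    return rejections / n_simulations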
The right way: in the AA test, the estimated power is close to the nominal type I error (alpha = 0.05), exactly as it should be when there is no true effect.
In [6]:
pw_right.power_analysis(df, average_effect=0.0)
Out[6]:
0.06
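With only 100 simulations the estimate is noisy; a quick check that 0.06 is well within sampling error of the nominal 0.05:

# One standard error of the estimated false positive rate under the null:
# sqrt(alpha * (1 - alpha) / n_simulations) ~= 0.022, so 0.06 is ~0.5 SE away.
se = (alpha * (1 - alpha) / n_simulations) ** 0.5
print(f"nominal rate: {alpha}, one SE of the estimate: {se:.3f}")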
The wrong way: the AA test fails; even though there is no true effect, we reject far more often than alpha.
In [7]:
pw_wrong.power_analysis(df, average_effect=0.0)
Out[7]:
0.79
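The mechanism is visible on a single split: the point estimate is the same, but the iid standard error is far smaller than the cluster-robust one, so plain OLS rejects far too often. A minimal sketch with a hand-rolled split over the same cluster-date cells (illustrative, not using the library's splitter):

# One random cluster-level split by hand, then the same regression with iid
# vs cluster-robust covariance. Only the standard errors (and p-values) differ.
rng = np.random.default_rng(0)
cells = df[["cluster", "date"]].drop_duplicates().assign(
    treatment=lambda d: rng.integers(0, 2, size=len(d))
)
treated_df = df.merge(cells, on=["cluster", "date"])

model = sm.OLS.from_formula("target ~ treatment", data=treated_df)
groups = treated_df["cluster"] + "_" + treated_df["date"]
naive = model.fit()
robust = model.fit(cov_type="cluster", cov_kwds={"groups": groups})
print(f"iid SE: {naive.bse['treatment']:.3f}, clustered SE: {robust.bse['treatment']:.3f}")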