Skip to content

from cluster_experiments.inference.hypothesis_test import *

HypothesisTest

A class used to represent a Hypothesis Test with a metric, analysis, optional analysis configuration, and optional dimensions.

Attributes

- metric : Metric — An instance of the Metric class.
- analysis_type : str — String mapping to an ExperimentAnalysis class. Must be either in the built-in analysis_mapping or in the custom_analysis_type_mapper if provided.
- analysis_config : Optional[dict] — An optional dictionary representing the configuration for the analysis.
- dimensions : Optional[List[Dimension]] — An optional list of Dimension instances.
- cupac_config : Optional[dict] — An optional dictionary representing the configuration for the cupac model.
- custom_analysis_type_mapper : Optional[Dict[str, ExperimentAnalysis]] — An optional dictionary mapping the names of custom analysis types to the corresponding ExperimentAnalysis classes.

Source code in cluster_experiments/inference/hypothesis_test.py
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
class HypothesisTest:
    """
    A class used to represent a Hypothesis Test with a metric, analysis, optional analysis configuration, and optional dimensions.

    Attributes
    ----------
    metric : Metric
        An instance of the Metric class
    analysis_type : str
        string mapping to an ExperimentAnalysis class. Must be either in the built-in analysis_mapping or in the custom_analysis_type_mapper if provided.
    analysis_config : dict
        The configuration for the analysis (an empty dict when none was provided)
    dimensions : List[Dimension]
        A DefaultDimension followed by the user-provided Dimension instances, if any
    cupac_config : dict
        The configuration for the cupac model (an empty dict when none was provided)
    custom_analysis_type_mapper : Dict[str, ExperimentAnalysis]
        Mapping from custom analysis type names to ExperimentAnalysis classes (an empty dict when none was provided)
    analysis_type_mapper : dict
        The mapping used to resolve analysis_type: the custom mapper when it is non-empty, otherwise the built-in analysis_mapping
    analysis_class
        The entry of analysis_type_mapper that analysis_type resolves to
    is_cupac : bool
        Whether a non-empty cupac_config was provided
    cupac_handler
        A CupacHandler built from cupac_config when is_cupac, otherwise None
    cupac_covariate_col
        cupac_handler.cupac_outcome_name when is_cupac, otherwise None
    new_analysis_config : Optional[dict]
        Set by _prepare_analysis_config; None until then
    experiment_analysis
        Set by get_inference_results; None until then
    """

    def __init__(
        self,
        metric: Metric,
        analysis_type: str,
        analysis_config: Optional[dict] = None,
        dimensions: Optional[List[Dimension]] = None,
        cupac_config: Optional[dict] = None,
        custom_analysis_type_mapper: Optional[Dict[str, ExperimentAnalysis]] = None,
    ):
        """
        Parameters
        ----------
        metric : Metric
            An instance of the Metric class
        analysis_type : str
            string mapping to an ExperimentAnalysis class. Must be either in the built-in analysis_mapping or in the custom_analysis_type_mapper if provided.
        analysis_config : Optional[dict]
            An optional dictionary representing the configuration for the analysis
        dimensions : Optional[List[Dimension]]
            An optional list of Dimension instances
        cupac_config : Optional[dict]
            An optional dictionary representing the configuration for the cupac model
        custom_analysis_type_mapper : Optional[Dict[str, ExperimentAnalysis]]
            An optional dictionary mapping the names of custom analysis types to the corresponding ExperimentAnalysis classes

        Raises
        ------
        TypeError
            If any argument has the wrong type (see _validate_inputs)
        ValueError
            If analysis_type is not a key of the mapping in use
        """
        self._validate_inputs(
            metric,
            analysis_type,
            analysis_config,
            dimensions,
            cupac_config,
            custom_analysis_type_mapper,
        )
        self.metric = metric
        self.analysis_type = analysis_type
        self.analysis_config = analysis_config or {}
        # The default dimension always comes first, ahead of user dimensions
        self.dimensions = [DefaultDimension()] + (dimensions or [])
        self.cupac_config = cupac_config or {}
        self.custom_analysis_type_mapper = custom_analysis_type_mapper or {}

        # A non-empty custom mapper replaces (does not extend) the built-in mapping
        self.analysis_type_mapper = self.custom_analysis_type_mapper or analysis_mapping
        self.analysis_class = self.analysis_type_mapper[self.analysis_type]
        self.is_cupac = bool(cupac_config)
        self.cupac_handler = (
            CupacHandler(**self.cupac_config) if self.is_cupac else None
        )
        self.cupac_covariate_col = (
            self.cupac_handler.cupac_outcome_name if self.is_cupac else None
        )

        # Both are filled in lazily while running get_test_results
        self.new_analysis_config = None
        self.experiment_analysis = None

    @staticmethod
    def _validate_inputs(
        metric: Metric,
        analysis_type: str,
        analysis_config: Optional[dict],
        dimensions: Optional[List[Dimension]],
        cupac_config: Optional[dict] = None,
        custom_analysis_type_mapper: Optional[Dict[str, ExperimentAnalysis]] = None,
    ):
        """
        Validates the inputs for the HypothesisTest class.

        Parameters
        ----------
        metric : Metric
            An instance of the Metric class
        analysis_type : str
            string mapper to an ExperimentAnalysis
        analysis_config : Optional[dict]
            An optional dictionary representing the configuration for the analysis
        dimensions : Optional[List[Dimension]]
            An optional list of Dimension instances
        cupac_config : Optional[dict]
            An optional dictionary representing the configuration for the cupac model
        custom_analysis_type_mapper : Optional[dict[str, ExperimentAnalysis]]
            An optional dictionary mapping the names of custom analysis types to the corresponding ExperimentAnalysis classes

        Raises
        ------
        TypeError
            If an argument (or a key/value of the custom mapper) has the wrong type
        ValueError
            If analysis_type is not a key of the custom mapper when a non-empty one is given, or of analysis_mapping otherwise
        """
        # Check if metric is a valid Metric instance
        if not isinstance(metric, Metric):
            raise TypeError("Metric must be an instance of Metric")

        # Check if analysis_type is a string
        if not isinstance(analysis_type, str):
            raise TypeError("Analysis must be a string")

        # Check if analysis_config is a dictionary when provided
        if analysis_config is not None and not isinstance(analysis_config, dict):
            raise TypeError("analysis_config must be a dictionary if provided")

        # Check if cupac_config is a dictionary when provided
        if cupac_config is not None and not isinstance(cupac_config, dict):
            raise TypeError("cupac_config must be a dictionary if provided")

        # Check if dimensions is a list of Dimension instances when provided
        if dimensions is not None and (
            not isinstance(dimensions, list)
            or not all(isinstance(dim, Dimension) for dim in dimensions)
        ):
            raise TypeError(
                "Dimensions must be a list of Dimension instances if provided"
            )

        # Validate custom_analysis_type_mapper if provided
        if custom_analysis_type_mapper:
            # Ensure it's a dictionary
            if not isinstance(custom_analysis_type_mapper, dict):
                raise TypeError(
                    "custom_analysis_type_mapper must be a dictionary if provided"
                )

            # Ensure all keys are strings and values are ExperimentAnalysis classes
            for key, value in custom_analysis_type_mapper.items():
                if not isinstance(key, str):
                    raise TypeError(
                        f"Key '{key}' in custom_analysis_type_mapper must be a string"
                    )
                # issubclass() itself raises an unhelpful TypeError for non-class
                # values (e.g. an instance), so rule those out first and report
                # them with the same descriptive message.
                if not (
                    isinstance(value, type) and issubclass(value, ExperimentAnalysis)
                ):
                    raise TypeError(
                        f"Value '{value}' for key '{key}' in custom_analysis_type_mapper must be a subclass of ExperimentAnalysis"
                    )

            # Ensure the analysis_type is in the custom mapper if a custom mapper is provided
            if analysis_type not in custom_analysis_type_mapper:
                raise ValueError(
                    f"Analysis type '{analysis_type}' not found in the provided custom_analysis_type_mapper"
                )

        # If no custom_analysis_type_mapper, check if analysis_type exists in the default mapping
        elif analysis_type not in analysis_mapping:
            raise ValueError(
                f"Analysis type '{analysis_type}' not found in analysis_mapping"
            )

    def get_inference_results(self, df: pd.DataFrame, alpha: float) -> InferenceResults:
        """
        Performs inference analysis on the provided DataFrame using the analysis class.

        The analysis class is instantiated from new_analysis_config, so
        _prepare_analysis_config must have been called beforehand. The created
        instance is stored in experiment_analysis.

        Parameters
        ----------
        df : pd.DataFrame
            The dataframe containing the data for analysis.
        alpha : float
            The significance level to be used in the inference analysis.

        Returns
        -------
        InferenceResults
            The results containing the statistics of the inference procedure.
        """

        self.experiment_analysis = self.analysis_class(**self.new_analysis_config)
        inference_results = self.experiment_analysis.get_inference_results(
            df=df, alpha=alpha
        )

        return inference_results

    def _prepare_analysis_config(
        self, target_col: str, treatment_col: str, treatment: str
    ) -> None:
        """
        Extends the analysis_config provided by the user, by adding or overriding the following keys:
        - target_col
        - treatment_col
        - treatment

        Also checks that the cupac covariate, when there is one, is listed in the
        "covariates" entry of the configuration.

        The result is stored in new_analysis_config (nothing is returned); the
        user-provided analysis_config is left untouched.

        Parameters
        ----------
        target_col : str
            Value stored under the "target_col" key
        treatment_col : str
            Value stored under the "treatment_col" key
        treatment : str
            Value stored under the "treatment" key

        Raises
        ------
        ValueError
            If a cupac covariate is configured but missing from "covariates"
        """
        # Deep copy so per-test overrides never leak into the user's config
        new_analysis_config = copy.deepcopy(self.analysis_config)

        new_analysis_config["target_col"] = target_col
        new_analysis_config["treatment_col"] = treatment_col
        new_analysis_config["treatment"] = treatment

        # A missing or explicitly-None "covariates" entry means "no covariates"
        covariates = new_analysis_config.get("covariates")
        if covariates is None:
            covariates = []

        if self.cupac_covariate_col and self.cupac_covariate_col not in covariates:
            raise ValueError(
                f"You provided a cupac configuration but did not provide the cupac covariate called {self.cupac_covariate_col} in the analysis_config"
            )

        self.new_analysis_config = new_analysis_config

    @staticmethod
    def prepare_data(
        data: pd.DataFrame,
        variant_col: str,
        treatment_variant: Variant,
        control_variant: Variant,
        dimension_name: str,
        dimension_value: str,
    ) -> pd.DataFrame:
        """
        Prepares the data for the experiment analysis pipeline

        Adds a constant "__total_dimension" column with value "total", then keeps
        only the rows that belong to the treatment or control variant and whose
        dimension column equals the requested value.

        Parameters
        ----------
        data : pd.DataFrame
            The input data; it is not modified
        variant_col : str
            The column name representing the variant
        treatment_variant : Variant
            The treatment variant (only its name is used)
        control_variant : Variant
            The control variant (only its name is used)
        dimension_name : str
            The column to filter on
        dimension_value : str
            The value the dimension column must be equal to

        Returns
        -------
        pd.DataFrame
            A new, filtered dataframe
        """
        # Values are handed to query() through @-references rather than being
        # pasted into quoted literals, so names containing quotes or backslashes
        # are matched literally. str() keeps the comparison string-based.
        variant_names = [str(treatment_variant.name), str(control_variant.name)]
        wanted_value = str(dimension_value)

        # assign() already returns a new frame, so no explicit copy is needed
        prepared_df = (
            data.assign(__total_dimension="total")
            .query(f"{variant_col}.isin(@variant_names)")
            .query(f"{dimension_name} == @wanted_value")
        )

        return prepared_df

    def add_covariates(
        self, exp_data: pd.DataFrame, pre_exp_data: pd.DataFrame
    ) -> pd.DataFrame:
        """
        If the test is a cupac test, adds the covariates to the experimental data.

        Parameters
        ----------
        exp_data : pd.DataFrame
            The experimental data
        pre_exp_data : pd.DataFrame
            The pre-experiment data passed to the cupac handler

        Returns
        -------
        pd.DataFrame
            The output of the cupac handler for cupac tests, otherwise exp_data as received
        """
        if self.is_cupac:
            exp_data = self.cupac_handler.add_covariates(
                df=exp_data, pre_experiment_df=pre_exp_data
            )

        return exp_data

    def get_test_results(
        self,
        control_variant: Variant,
        treatment_variant: Variant,
        variant_col: str,
        exp_data: pd.DataFrame,
        dimension: Dimension,
        dimension_value: str,
        alpha: float,
    ) -> AnalysisPlanResults:
        """
        Performs the hypothesis test on the provided data, for the given dimension value.

        Parameters
        ----------
        control_variant : Variant
            The control variant
        treatment_variant : Variant
            The treatment variant
        variant_col : str
            The column name representing the variant
        exp_data : pd.DataFrame
            The dataframe containing the data for analysis.
        dimension : Dimension
            The dimension instance
        dimension_value : str
            The value of the dimension
        alpha : float
            The significance level to be used in the inference analysis.

        Returns
        -------
        AnalysisPlanResults
            The results of the hypothesis test
        """
        self._prepare_analysis_config(
            target_col=self.metric.target_column,
            treatment_col=variant_col,
            treatment=treatment_variant.name,
        )

        prepared_df = self.prepare_data(
            data=exp_data,
            variant_col=variant_col,
            treatment_variant=treatment_variant,
            control_variant=control_variant,
            dimension_name=dimension.name,
            dimension_value=dimension_value,
        )

        inference_results = self.get_inference_results(df=prepared_df, alpha=alpha)

        # @-references keep variant names with quotes from breaking the query
        control_name = str(control_variant.name)
        treatment_name = str(treatment_variant.name)
        control_variant_mean = self.metric.get_mean(
            prepared_df.query(f"{variant_col} == @control_name")
        )
        treatment_variant_mean = self.metric.get_mean(
            prepared_df.query(f"{variant_col} == @treatment_name")
        )

        # Every field is a one-element list: one row of results per test
        test_results = AnalysisPlanResults(
            metric_alias=[self.metric.alias],
            control_variant_name=[control_variant.name],
            treatment_variant_name=[treatment_variant.name],
            control_variant_mean=[control_variant_mean],
            treatment_variant_mean=[treatment_variant_mean],
            analysis_type=[self.analysis_type],
            ate=[inference_results.ate],
            ate_ci_lower=[inference_results.conf_int.lower],
            ate_ci_upper=[inference_results.conf_int.upper],
            p_value=[inference_results.p_value],
            std_error=[inference_results.std_error],
            dimension_name=[dimension.name],
            dimension_value=[dimension_value],
            alpha=[alpha],
        )

        return test_results

__init__(metric, analysis_type, analysis_config=None, dimensions=None, cupac_config=None, custom_analysis_type_mapper=None)

Parameters

- metric : Metric — An instance of the Metric class.
- analysis_type : str — String mapping to an ExperimentAnalysis class. Must be either in the built-in analysis_mapping or in the custom_analysis_type_mapper if provided.
- analysis_config : Optional[dict] — An optional dictionary representing the configuration for the analysis.
- dimensions : Optional[List[Dimension]] — An optional list of Dimension instances.
- cupac_config : Optional[dict] — An optional dictionary representing the configuration for the cupac model.
- custom_analysis_type_mapper : Optional[Dict[str, ExperimentAnalysis]] — An optional dictionary mapping the names of custom analysis types to the corresponding ExperimentAnalysis classes.

Source code in cluster_experiments/inference/hypothesis_test.py
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
def __init__(
    self,
    metric: Metric,
    analysis_type: str,
    analysis_config: Optional[dict] = None,
    dimensions: Optional[List[Dimension]] = None,
    cupac_config: Optional[dict] = None,
    custom_analysis_type_mapper: Optional[Dict[str, ExperimentAnalysis]] = None,
):
    """
    Validate the arguments, store them, and derive the analysis class and cupac handler.

    Parameters
    ----------
    metric : Metric
        An instance of the Metric class
    analysis_type : str
        string mapping to an ExperimentAnalysis class. Must be either in the built-in analysis_mapping or in the custom_analysis_type_mapper if provided.
    analysis_config : Optional[dict]
        An optional dictionary representing the configuration for the analysis
    dimensions : Optional[List[Dimension]]
        An optional list of Dimension instances
    cupac_config : Optional[dict]
        An optional dictionary representing the configuration for the cupac model
    custom_analysis_type_mapper : Optional[Dict[str, ExperimentAnalysis]]
        An optional dictionary mapping the names of custom analysis types to the corresponding ExperimentAnalysis classes
    """
    self._validate_inputs(
        metric,
        analysis_type,
        analysis_config,
        dimensions,
        cupac_config,
        custom_analysis_type_mapper,
    )

    # Store the inputs, replacing any missing/empty optional with an empty container
    self.metric = metric
    self.analysis_type = analysis_type
    self.analysis_config = analysis_config if analysis_config else {}
    user_dimensions = dimensions if dimensions else []
    # The default dimension always comes first
    self.dimensions = [DefaultDimension()] + user_dimensions
    self.cupac_config = cupac_config if cupac_config else {}
    self.custom_analysis_type_mapper = (
        custom_analysis_type_mapper if custom_analysis_type_mapper else {}
    )

    # A non-empty custom mapper takes the place of the built-in mapping
    if self.custom_analysis_type_mapper:
        self.analysis_type_mapper = self.custom_analysis_type_mapper
    else:
        self.analysis_type_mapper = analysis_mapping
    self.analysis_class = self.analysis_type_mapper[self.analysis_type]

    # Cupac pieces only exist when a non-empty cupac_config was given
    self.is_cupac = bool(cupac_config)
    if self.is_cupac:
        self.cupac_handler = CupacHandler(**self.cupac_config)
        self.cupac_covariate_col = self.cupac_handler.cupac_outcome_name
    else:
        self.cupac_handler = None
        self.cupac_covariate_col = None

    # Populated later, while a test is being run
    self.new_analysis_config = None
    self.experiment_analysis = None

add_covariates(exp_data, pre_exp_data)

If the test is a cupac test, adds the covariates to the experimental data.

Source code in cluster_experiments/inference/hypothesis_test.py
248
249
250
251
252
253
254
255
256
257
258
259
def add_covariates(
    self, exp_data: pd.DataFrame, pre_exp_data: pd.DataFrame
) -> pd.DataFrame:
    """
    Return the experimental data, enriched by the cupac handler for cupac tests.

    For non-cupac tests the experimental data is returned exactly as received.
    """
    # Nothing to add unless a cupac configuration was supplied
    if not self.is_cupac:
        return exp_data

    return self.cupac_handler.add_covariates(
        df=exp_data, pre_experiment_df=pre_exp_data
    )

get_inference_results(df, alpha)

Performs inference analysis on the provided DataFrame using the analysis class.

Parameters

- df : pd.DataFrame — The dataframe containing the data for analysis.
- alpha : float — The significance level to be used in the inference analysis.

Returns

InferenceResults The results containing the statistics of the inference procedure.

Source code in cluster_experiments/inference/hypothesis_test.py
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
def get_inference_results(self, df: pd.DataFrame, alpha: float) -> InferenceResults:
    """
    Instantiate the analysis class and run its inference on the given data.

    The analysis instance is built from new_analysis_config and kept in
    experiment_analysis.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe containing the data for analysis.
    alpha : float
        The significance level to be used in the inference analysis.

    Returns
    -------
    InferenceResults
        The results containing the statistics of the inference procedure.
    """
    analysis = self.analysis_class(**self.new_analysis_config)
    # Keep a handle on the analysis that produced the results
    self.experiment_analysis = analysis
    return analysis.get_inference_results(df=df, alpha=alpha)

get_test_results(control_variant, treatment_variant, variant_col, exp_data, dimension, dimension_value, alpha)

Performs the hypothesis test on the provided data, for the given dimension value.

Parameters

- control_variant : Variant — The control variant.
- treatment_variant : Variant — The treatment variant.
- variant_col : str — The column name representing the variant.
- exp_data : pd.DataFrame — The dataframe containing the data for analysis.
- dimension : Dimension — The dimension instance.
- dimension_value : str — The value of the dimension.
- alpha : float — The significance level to be used in the inference analysis.

Returns

AnalysisPlanResults The results of the hypothesis test

Source code in cluster_experiments/inference/hypothesis_test.py
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
def get_test_results(
    self,
    control_variant: Variant,
    treatment_variant: Variant,
    variant_col: str,
    exp_data: pd.DataFrame,
    dimension: Dimension,
    dimension_value: str,
    alpha: float,
) -> AnalysisPlanResults:
    """
    Performs the hypothesis test on the provided data, for the given dimension value.

    Parameters
    ----------
    control_variant : Variant
        The control variant
    treatment_variant : Variant
        The treatment variant
    variant_col : str
        The column name representing the variant
    exp_data : pd.DataFrame
        The dataframe containing the data for analysis.
    dimension : Dimension
        The dimension instance
    dimension_value : str
        The value of the dimension
    alpha : float
        The significance level to be used in the inference analysis.

    Returns
    -------
    AnalysisPlanResults
        The results of the hypothesis test
    """
    self._prepare_analysis_config(
        target_col=self.metric.target_column,
        treatment_col=variant_col,
        treatment=treatment_variant.name,
    )

    prepared_df = self.prepare_data(
        data=exp_data,
        variant_col=variant_col,
        treatment_variant=treatment_variant,
        control_variant=control_variant,
        dimension_name=dimension.name,
        dimension_value=dimension_value,
    )

    inference_results = self.get_inference_results(df=prepared_df, alpha=alpha)

    # Variant names reach query() through @-references instead of being pasted
    # into quoted literals, so a name containing a quote or backslash is matched
    # literally. str() keeps the comparison string-based.
    control_name = str(control_variant.name)
    treatment_name = str(treatment_variant.name)
    control_variant_mean = self.metric.get_mean(
        prepared_df.query(f"{variant_col} == @control_name")
    )
    treatment_variant_mean = self.metric.get_mean(
        prepared_df.query(f"{variant_col} == @treatment_name")
    )

    # Every field is a one-element list: one row of results per test
    test_results = AnalysisPlanResults(
        metric_alias=[self.metric.alias],
        control_variant_name=[control_variant.name],
        treatment_variant_name=[treatment_variant.name],
        control_variant_mean=[control_variant_mean],
        treatment_variant_mean=[treatment_variant_mean],
        analysis_type=[self.analysis_type],
        ate=[inference_results.ate],
        ate_ci_lower=[inference_results.conf_int.lower],
        ate_ci_upper=[inference_results.conf_int.upper],
        p_value=[inference_results.p_value],
        std_error=[inference_results.std_error],
        dimension_name=[dimension.name],
        dimension_value=[dimension_value],
        alpha=[alpha],
    )

    return test_results

prepare_data(data, variant_col, treatment_variant, control_variant, dimension_name, dimension_value) staticmethod

Prepares the data for the experiment analysis pipeline

Source code in cluster_experiments/inference/hypothesis_test.py
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
@staticmethod
def prepare_data(
    data: pd.DataFrame,
    variant_col: str,
    treatment_variant: Variant,
    control_variant: Variant,
    dimension_name: str,
    dimension_value: str,
) -> pd.DataFrame:
    """
    Prepares the data for the experiment analysis pipeline

    Adds a constant "__total_dimension" column with value "total", then keeps
    only the rows that belong to the treatment or control variant and whose
    dimension column equals the requested value.

    Parameters
    ----------
    data : pd.DataFrame
        The input data; it is not modified
    variant_col : str
        The column name representing the variant
    treatment_variant : Variant
        The treatment variant (only its name is used)
    control_variant : Variant
        The control variant (only its name is used)
    dimension_name : str
        The column to filter on
    dimension_value : str
        The value the dimension column must be equal to

    Returns
    -------
    pd.DataFrame
        A new, filtered dataframe
    """
    # Values are handed to query() through @-references rather than being
    # pasted into quoted literals, so names containing quotes or backslashes
    # are matched literally. str() keeps the comparison string-based.
    variant_names = [str(treatment_variant.name), str(control_variant.name)]
    wanted_value = str(dimension_value)

    # assign() already returns a new frame, so no explicit copy is needed
    prepared_df = (
        data.assign(__total_dimension="total")
        .query(f"{variant_col}.isin(@variant_names)")
        .query(f"{dimension_name} == @wanted_value")
    )

    return prepared_df