Washover
This notebook shows how to use the washover module. In particular, it shows how to apply a ConstantWashover object with a 30-minute time delta, which drops the orders activated within the first 30 minutes after a change of treatment.
In [1]:
Copied!
## Import Libraries
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from cluster_experiments import ConstantWashover
## Import Libraries
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from cluster_experiments import ConstantWashover
In [2]:
Copied!
## Generate a dummy dataset of 4 time periods of 2 hours with 5 orders each
np.random.seed(42)  # fixed seed so the randomly generated orders are reproducible
num_rows = 5  # number of orders generated per time period
def random_timestamp(start_time, end_time):
    """Return a random datetime in [start_time, end_time), at whole-second resolution.

    The offset is drawn from NumPy's global random state, so the result is
    reproducible after a call to np.random.seed.

    Args:
        start_time: lower bound (inclusive) of the interval.
        end_time: upper bound (exclusive) of the interval.

    Raises:
        ValueError: raised by np.random.randint when the interval is shorter
            than one second (there is no whole-second offset to draw).
    """
    time_delta = end_time - start_time
    # total_seconds() returns a float while randint expects integer bounds,
    # so truncate explicitly instead of relying on implicit coercion.
    random_seconds = np.random.randint(0, int(time_delta.total_seconds()))
    return start_time + timedelta(seconds=random_seconds)
def generate_data(start_time, end_time, treatment):
    """Build a DataFrame with `num_rows` dummy orders for a single time bin.

    Args:
        start_time: start of the time bin; also stored in 'bin_start_time_local'.
        end_time: end of the time bin (exclusive bound for the activation times).
        treatment: treatment label assigned to every order of the bin.

    Returns:
        A DataFrame with the columns 'order_id', 'city_code',
        'activation_time_local', 'bin_start_time_local' and 'treatment'.
        The number of rows is taken from the module-level `num_rows`.
    """
    data = {
        # dtype is explicit because 10**10 does not fit the 32-bit default
        # integer used by NumPy < 2 on Windows, where this call would fail.
        'order_id': np.random.randint(10**9, 10**10, size=num_rows, dtype=np.int64),
        'city_code': 'VAL',
        'activation_time_local': [random_timestamp(start_time, end_time) for _ in range(num_rows)],
        'bin_start_time_local': start_time,
        'treatment': treatment
    }
    return pd.DataFrame(data)
# Four consecutive 2-hour bins starting at 09:00, each with its assigned treatment
start_times = [datetime(2024, 1, 22, hour, 0) for hour in (9, 11, 13, 15)]
treatments = ['control', 'variation', 'variation', 'control']
dataframes = [
    generate_data(bin_start, bin_start + timedelta(hours=2), arm)
    for bin_start, arm in zip(start_times, treatments)
]
# Stack the bins and order all the orders chronologically
df = (
    pd.concat(dataframes)
    .sort_values(by='activation_time_local')
    .reset_index(drop=True)
)
df.head(20)
## Generate a dummy dataset of 4 time periods of 2 hours with 5 orders each
np.random.seed(42)  # fixed seed so the randomly generated orders are reproducible
num_rows = 5  # number of orders generated per time period
def random_timestamp(start_time, end_time):
    """Return a random datetime in [start_time, end_time), at whole-second resolution.

    The offset is drawn from NumPy's global random state, so the result is
    reproducible after a call to np.random.seed.

    Args:
        start_time: lower bound (inclusive) of the interval.
        end_time: upper bound (exclusive) of the interval.

    Raises:
        ValueError: raised by np.random.randint when the interval is shorter
            than one second (there is no whole-second offset to draw).
    """
    time_delta = end_time - start_time
    # total_seconds() returns a float while randint expects integer bounds,
    # so truncate explicitly instead of relying on implicit coercion.
    random_seconds = np.random.randint(0, int(time_delta.total_seconds()))
    return start_time + timedelta(seconds=random_seconds)
def generate_data(start_time, end_time, treatment):
    """Build a DataFrame with `num_rows` dummy orders for a single time bin.

    Args:
        start_time: start of the time bin; also stored in 'bin_start_time_local'.
        end_time: end of the time bin (exclusive bound for the activation times).
        treatment: treatment label assigned to every order of the bin.

    Returns:
        A DataFrame with the columns 'order_id', 'city_code',
        'activation_time_local', 'bin_start_time_local' and 'treatment'.
        The number of rows is taken from the module-level `num_rows`.
    """
    data = {
        # dtype is explicit because 10**10 does not fit the 32-bit default
        # integer used by NumPy < 2 on Windows, where this call would fail.
        'order_id': np.random.randint(10**9, 10**10, size=num_rows, dtype=np.int64),
        'city_code': 'VAL',
        'activation_time_local': [random_timestamp(start_time, end_time) for _ in range(num_rows)],
        'bin_start_time_local': start_time,
        'treatment': treatment
    }
    return pd.DataFrame(data)
# Four consecutive 2-hour bins starting at 09:00, each with its assigned treatment
start_times = [datetime(2024, 1, 22, hour, 0) for hour in (9, 11, 13, 15)]
treatments = ['control', 'variation', 'variation', 'control']
dataframes = [
    generate_data(bin_start, bin_start + timedelta(hours=2), arm)
    for bin_start, arm in zip(start_times, treatments)
]
# Stack the bins and order all the orders chronologically
df = (
    pd.concat(dataframes)
    .sort_values(by='activation_time_local')
    .reset_index(drop=True)
)
df.head(20)
Out[2]:
order_id | city_code | activation_time_local | bin_start_time_local | treatment | |
---|---|---|---|---|---|
0 | 8395928407 | VAL | 2024-01-22 09:19:44 | 2024-01-22 09:00:00 | control |
1 | 5298312065 | VAL | 2024-01-22 10:15:55 | 2024-01-22 09:00:00 | control |
2 | 3563451924 | VAL | 2024-01-22 10:24:11 | 2024-01-22 09:00:00 | control |
3 | 1787846414 | VAL | 2024-01-22 10:28:31 | 2024-01-22 09:00:00 | control |
4 | 5537253172 | VAL | 2024-01-22 10:47:00 | 2024-01-22 09:00:00 | control |
5 | 2855189739 | VAL | 2024-01-22 11:21:07 | 2024-01-22 11:00:00 | variation |
6 | 8667272366 | VAL | 2024-01-22 11:25:28 | 2024-01-22 11:00:00 | variation |
7 | 7548779029 | VAL | 2024-01-22 11:31:39 | 2024-01-22 11:00:00 | variation |
8 | 6152559666 | VAL | 2024-01-22 11:53:22 | 2024-01-22 11:00:00 | variation |
9 | 2250819632 | VAL | 2024-01-22 12:56:15 | 2024-01-22 11:00:00 | variation |
10 | 8767007473 | VAL | 2024-01-22 13:22:43 | 2024-01-22 13:00:00 | variation |
11 | 3609385266 | VAL | 2024-01-22 13:34:01 | 2024-01-22 13:00:00 | variation |
12 | 9370399619 | VAL | 2024-01-22 13:43:32 | 2024-01-22 13:00:00 | variation |
13 | 1279394470 | VAL | 2024-01-22 13:47:04 | 2024-01-22 13:00:00 | variation |
14 | 5147358011 | VAL | 2024-01-22 14:57:21 | 2024-01-22 13:00:00 | variation |
15 | 7643070057 | VAL | 2024-01-22 15:02:41 | 2024-01-22 15:00:00 | control |
16 | 5164334270 | VAL | 2024-01-22 16:11:37 | 2024-01-22 15:00:00 | control |
17 | 7528642437 | VAL | 2024-01-22 16:34:35 | 2024-01-22 15:00:00 | control |
18 | 2111451555 | VAL | 2024-01-22 16:49:05 | 2024-01-22 15:00:00 | control |
19 | 8140478823 | VAL | 2024-01-22 16:54:33 | 2024-01-22 15:00:00 | control |
In [3]:
Copied!
## Define a washover with a 30-minute duration
washover = ConstantWashover(washover_time_delta=timedelta(minutes=30))
## Apply the washover to the dataframe, clustering by city and 2-hour time bin.
## Orders whose activation time falls within the first 30 minutes after a change
## in the treatment column are dropped.
## NOTE(review): in Out[4] the 09:19:44 order of the very first bin is dropped as well,
## so the first bin appears to be handled like a treatment switch; confirm against the library.
df_analysis_washover = washover.washover(
df=df,
truncated_time_col='bin_start_time_local',
treatment_col='treatment',
cluster_cols=['city_code','bin_start_time_local'],
original_time_col='activation_time_local',
)
## Define a washover with a 30-minute duration
washover = ConstantWashover(washover_time_delta=timedelta(minutes=30))
## Apply the washover to the dataframe, clustering by city and 2-hour time bin.
## Orders whose activation time falls within the first 30 minutes after a change
## in the treatment column are dropped.
## NOTE(review): in Out[4] the 09:19:44 order of the very first bin is dropped as well,
## so the first bin appears to be handled like a treatment switch; confirm against the library.
df_analysis_washover = washover.washover(
df=df,
truncated_time_col='bin_start_time_local',
treatment_col='treatment',
cluster_cols=['city_code','bin_start_time_local'],
original_time_col='activation_time_local',
)
In [4]:
Copied!
## Show the rows that have been dropped: anti-join of df against the washover result
anti_joined_df = (
    df.merge(df_analysis_washover['order_id'], how='left', indicator=True, on='order_id')
    # 'left_only' marks the orders of df that are absent from the washover result
    .loc[lambda merged: merged['_merge'] == 'left_only']
    .drop(columns=['_merge'])
)
anti_joined_df.head(10)
## Show the rows that have been dropped: anti-join of df against the washover result
anti_joined_df = (
    df.merge(df_analysis_washover['order_id'], how='left', indicator=True, on='order_id')
    # 'left_only' marks the orders of df that are absent from the washover result
    .loc[lambda merged: merged['_merge'] == 'left_only']
    .drop(columns=['_merge'])
)
anti_joined_df.head(10)
Out[4]:
order_id | city_code | activation_time_local | bin_start_time_local | treatment | |
---|---|---|---|---|---|
0 | 8395928407 | VAL | 2024-01-22 09:19:44 | 2024-01-22 09:00:00 | control |
5 | 2855189739 | VAL | 2024-01-22 11:21:07 | 2024-01-22 11:00:00 | variation |
6 | 8667272366 | VAL | 2024-01-22 11:25:28 | 2024-01-22 11:00:00 | variation |
15 | 7643070057 | VAL | 2024-01-22 15:02:41 | 2024-01-22 15:00:00 | control |
In [5]:
Copied!
## Compare the shapes of the dataframe before and after the washover
for label, frame in (('df:', df), ('df_analysis_washover:', df_analysis_washover)):
    print(label, frame.shape)
## Compare the shapes of the dataframe before and after the washover
for label, frame in (('df:', df), ('df_analysis_washover:', df_analysis_washover)):
    print(label, frame.shape)
df: (20, 5) df_analysis_washover: (16, 5)