from kungfu_pandas.kungfu import *

agg_by_col(df, by=None, col=None, agg='sum', asc=False)

Show source code in kungfu_pandas/kungfu.py
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
def agg_by_col(
    df: pd.DataFrame,
    by: str = None,
    col: str = None,
    agg: str = 'sum',
    asc: bool = False
) -> pd.DataFrame:
    """
    Aggregates column 'col' with 'agg', optionally per group of column 'by',
    and orders the result by the aggregated values, ascending or descending.

    Arguments:
        df: dataframe to aggregate.
        by: column name of the dataframe to group by. When it is falsy the
            whole column is aggregated into a single row.
        col: column name of the dataframe to summarise.
        agg: aggregation function to summarise col.
        asc: sort by aggregation result, ascending (True) or descending (False).

    Returns:
        A dataframe with the column 'by' (only when grouping) and the
        aggregated column 'col'. For a dataframe with 0 rows a single
        placeholder row filled with None is returned.

    Usage:

    ```python
    import pandas as pd
    from kungfu_pandas import agg_by_col

    df = pd.DataFrame({
            'x': [1, 2, 3, 0, 0, 1],
            'group': ['a', 'a', 'a', 'b', 'b', 'b']
    })

    (
        df
        .pipe(agg_by_col, by='group', col='x', agg='mean')
    )
    ```
    """
    grouped = bool(by)

    # Edge case -> 0 rows: hand back one placeholder row instead of aggregating
    if len(df.index) == 0:
        placeholder = {by: [None], col: [None]} if grouped else {col: [None]}
        return pd.DataFrame(placeholder)

    if grouped:
        # One row per group, with the grouping key kept as a regular column
        summary = df.groupby(by, as_index=False)[col].agg(agg)
    else:
        # No grouping: the whole column collapses into a one-row dataframe
        total = df[col].agg(agg)
        summary = pd.DataFrame({col: [total]})

    return summary.sort_values(by=col, ascending=asc)

Groups by column 'by', aggregates column 'col' with 'agg' and orders the result by the aggregated values, ascending or descending.

Parameters

Name Type Description Default
df DataFrame dataframe to aggregate. required
by str column name of the dataframe to group by. None
col str column name of the dataframe to summarise. None
agg str aggregation function to summarise col. 'sum'
asc bool sort by aggregation result, ascending or descending. False

Usage:

import pandas as pd
from kungfu_pandas import agg_by_col

df = pd.DataFrame({
        'x': [1, 2, 3, 0, 0, 1],
        'group': ['a', 'a', 'a', 'b', 'b', 'b']
})

(
    df
    .pipe(agg_by_col, by='group', col='x', agg='mean')
)

case_when(df, cases)

Show source code in kungfu_pandas/kungfu.py
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
def case_when(
    df: pd.DataFrame,
    cases: Union[Dict[Callable, Any], List[Tuple[Callable, Any]]],
) -> pd.Series:
    """
    This is the pandas equivalent of SQL case when. If no cases match, NaN is returned.

    Arguments:
        df: dataframe to apply case when to.
        cases: dictionary of functions and their output values. It can also be
            a list (or any other sequence) of tuples where the first element
            is the function and the second the value. Each function receives
            the dataframe and must return a boolean mask. The cases are
            ordered as in a SQL case when: for every row the first matching
            case wins.

    Returns:
        A series indexed like df. Its dtype is derived from the type of the
        value of the first case. Boolean values are stored in an object
        series whenever some row is left unmatched, because a bool series
        cannot represent NaN.

    Raises:
        ValueError: if cases is empty.

    Usage:

    ```python
    import pandas as pd
    from kungfu_pandas import case_when

    df = pd.DataFrame({
            'x': [1, 2, 3, 0, 0, 1],
            'group': ['a', 'a', 'a', 'b', 'b', 'b']
    })

    (
        df
        .pipe(case_when, [
            (lambda d: d['x'] == 0, 0),
            (lambda d: (d['x'] == 1) & (d['group'] == 'a'), 1),
            (lambda d: (d['x'] == 1) & (d['group'] == 'b'), 2),
            (lambda d: d['x'] >= 3, 3),
        ])
    )

    (
        df
        .assign(
            new_x=lambda old_df:
            case_when(old_df, {
                lambda d: d['x'] == 0: 0,
                lambda d: (d['x'] == 1) & (d['group'] == 'a'): 1,
                lambda d: (d['x'] == 1) & (d['group'] == 'b'): 2,
                lambda d: d['x'] >= 3: 3,
            })
        )
    )
    ```
    """
    if not cases:
        raise ValueError(
            'Empty condition: value dictionary is passed to case_when')

    # Normalise to a fresh list of (function, value) pairs. Building a new
    # list (instead of calling .copy()) leaves the caller's object untouched
    # and also accepts tuples and other iterables of pairs.
    if isinstance(cases, dict):
        pairs = list(cases.items())
    else:
        pairs = list(cases)
    if not pairs:
        # e.g. an iterator that yielded nothing
        raise ValueError(
            'Empty condition: value dictionary is passed to case_when')

    # Output type is driven by the value of the first case
    first_value = pairs[0][1]
    bool_types = (bool, np.bool_)
    first_is_bool = isinstance(first_value, bool_types)
    # A bool series cannot hold NaN, so unmatched rows would not be reported
    # as NaN; use an object buffer for boolean outputs instead.
    type_out = object if first_is_bool else type(first_value)

    # Initialize out_s as a series full of NaN
    out_s = pd.Series(np.nan, index=df.index, dtype=type_out)

    # We need to reverse the order of the cases to have the same logic as case when in sql
    # The idea is that the first condition rules out the rest of the conditions,
    # the second rules out the latter ones, etc.
    for condition, value in reversed(pairs):
        out_s[condition(df)] = value

    # When every row matched and all outputs are booleans, NaN is not needed
    # any more: hand back a plain bool series.
    if (
        first_is_bool
        and all(isinstance(value, bool_types) for _, value in pairs)
        and not out_s.isna().any()
    ):
        out_s = out_s.astype(bool)
    return out_s

This is the pandas equivalent of SQL case when. If no cases match, NaN is returned.

Parameters

Name Type Description Default
df DataFrame dataframe to apply case when to. required
cases Union[Dict[Callable, Any], List[Tuple[Callable, Any]]] dictionary of functions and their output values. It can also be a list of tuples where the first element is the function and the second the value. The cases are evaluated in order, as in a SQL case when: the first matching case wins. required

Usage:

import pandas as pd
from kungfu_pandas import case_when

df = pd.DataFrame({
        'x': [1, 2, 3, 0, 0, 1],
        'group': ['a', 'a', 'a', 'b', 'b', 'b']
})

(
    df
    .pipe(case_when, [
        (lambda d: d['x'] == 0, 0),
        (lambda d: (d['x'] == 1) & (d['group'] == 'a'), 1),
        (lambda d: (d['x'] == 1) & (d['group'] == 'b'), 2),
        (lambda d: d['x'] >= 3, 3),
    ])
)

(
    df
    .assign(
        new_x=lambda old_df:
        case_when(old_df, {
            lambda d: d['x'] == 0: 0,
            lambda d: (d['x'] == 1) & (d['group'] == 'a'): 1,
            lambda d: (d['x'] == 1) & (d['group'] == 'b'): 2,
            lambda d: d['x'] >= 3: 3,
        })
    )
)

count(df, by=None)

Show source code in kungfu_pandas/kungfu.py
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
def count(df: pd.DataFrame, by: str = None) -> pd.DataFrame:
    """
    Counts rows per value of a column; without a column it returns the
    total number of rows.

    Arguments:
        df: dataframe to count.
        by: column name of the dataframe to group and count by.

    Returns:
        A dataframe with the counts in column 'n', preceded by the column
        'by' when grouping.

    Usage:

    ```python
    import pandas as pd
    from kungfu_pandas import count

    df = pd.DataFrame({
            'x': [1, 2, 3, 0, 0, 1],
            'group': ['a', 'a', 'a', 'b', 'b', 'b']
    })

    (
        df
        .pipe(count, by='group')
    )
    ```
    """
    n_rows = df.shape[0]

    if by:
        if n_rows == 0:
            # Edge case -> nothing to group: one placeholder row with a zero count
            return pd.DataFrame({by: [None], 'n': [0]})

        # Regular case: group sizes come back as a series whose frame
        # column is labelled 0, which is then renamed to 'n'
        sizes = df.groupby(by).size()
        counted = sizes.to_frame().reset_index()
        return counted.rename(columns={0: 'n'})

    # Not grouping, the answer is just the number of rows
    return pd.DataFrame({'n': [n_rows]})

Counts by column, if no column is given just gives total count

Parameters

Name Type Description Default
df DataFrame dataframe to count. required
by str column name of the dataframe to group and count by. None

Usage:

import pandas as pd
from kungfu_pandas import count

df = pd.DataFrame({
        'x': [1, 2, 3, 0, 0, 1],
        'group': ['a', 'a', 'a', 'b', 'b', 'b']
})

(
    df
    .pipe(count, by='group')
)

mask(df, key, function)

Show source code in kungfu_pandas/kungfu.py
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
def mask(df: pd.DataFrame, key: str, function: Callable) -> pd.DataFrame:
    """
    Filters a dataframe, keeping the rows selected by applying function to
    the column key

    Arguments:
        df: dataframe to be masked.
        key: column name of the dataframe to apply function to.
        function: function applied to the key for filtering. It receives the
            whole column and its result is used to index the dataframe.

    Returns:
        The dataframe indexed with the result of function.

    Usage:

    ```python
    import pandas as pd
    from kungfu_pandas import mask

    df = pd.DataFrame({
            'x': [1, 2, 3, 0, 0, 1],
            'group': ['a', 'a', 'a', 'b', 'b', 'b']
    })

    def is_zero(x):
        return x == 0

    (
        df
        .pipe(mask, 'x', is_zero)
    )

    ```
    """
    column = df[key]
    # The function sees the full column at once, not one element at a time
    selector = function(column)
    return df[selector]

Returns a filtered dataframe, by applying function to key

Parameters

Name Type Description Default
df DataFrame dataframe to be masked. required
key str column name of the dataframe to apply function to. required
function Callable function applied to the key for filtering. required

Usage:

import pandas as pd
from kungfu_pandas import mask

df = pd.DataFrame({
        'x': [1, 2, 3, 0, 0, 1],
        'group': ['a', 'a', 'a', 'b', 'b', 'b']
})

def is_zero(x):
    return x == 0

(
    df
    .pipe(mask, 'x', is_zero)
)