import fast_causal_inference.lib.tools as ais_tools
from fast_causal_inference.dataframe import readClickHouse
from fast_causal_inference.dataframe.functions import (
DfFnColWrapper,
register_fn,
define_args,
FnArg,
DfFunction,
OlapEngineType,
)
class OneHotEncoder:
"""
This class implements the OneHotEncoder method for causal inference.
Parameters
----------
cols : list, default=None
The columns to be one-hot encoded.
Methods
-------
fit(dataframe):
Apply the OneHotEncoder method to the input dataframe.
Example
-------
.. code-block:: python
import fast_causal_inference
import fast_causal_inference.dataframe.features as Features
df = fast_causal_inference.readClickHouse('test_data_small')
one_hot = Features.OneHotEncoder()
df_new = one_hot.fit(df, cols=['x_cat1'])
df_new.printSchema()
"""
    def __init__(self):
        pass

    def fit(self, df, cols):
        # Materialize the dataframe as a temporary view so the engine-side
        # onehot transform can operate on a concrete table, then read the
        # resulting table back as a new dataframe.
        new_df = df.materializedView(is_temp=True)
        new_table_name = ais_tools.onehot(new_df.getTableName(), cols)
        return readClickHouse(new_table_name[0])
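

# Illustrative sketch (not part of the library API, never called): the
# engine-side onehot transform is conceptually the same as pandas'
# get_dummies. The helper below shows that local equivalent on a toy frame;
# the exact output column names produced by ais_tools.onehot are an
# assumption and may differ.
def _onehot_local_sketch():
    import pandas as pd

    toy = pd.DataFrame({"x_cat1": ["a", "b", "a", "c"]})
    # One indicator column per distinct category, e.g. x_cat1_a, x_cat1_b, x_cat1_c.
    return pd.get_dummies(toy, columns=["x_cat1"])
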
@register_fn(engine=OlapEngineType.CLICKHOUSE, name="cutbins")
@register_fn(engine=OlapEngineType.STARROCKS, name="cutbins")
@define_args(
    FnArg(name="column"), FnArg(name="bins"), FnArg(name="if_string", default="True")
)
class CutbinsDfFunction(DfFunction):
    pass

def cut_bins(column, bins, if_string=True):
    # Serialize the bin boundaries: accept either a ready-made string or a
    # Python list, which is rendered as "[b1,b2,...]" for the engine-side
    # cutbins function.
    if isinstance(bins, str):
        bins_str = bins
    elif isinstance(bins, list):
        bins_str = "[" + ",".join([str(x) for x in bins]) + "]"
    else:
        raise ValueError(f"bins({bins}) must be a str or a list")
    return DfFnColWrapper(CutbinsDfFunction(), {}, [column, bins_str, if_string])
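

# Usage sketch (illustrative, never called): cut_bins builds a DfFnColWrapper
# that DataFrame.withColumn resolves into an engine-side "cutbins" call, which
# is exactly how Bucketizer.fit below composes it. A list such as [1, 3] is
# serialized to the string "[1,3]" before being sent to the engine. The table
# name is taken from the docstring examples in this module.
def _cut_bins_usage_sketch():
    df = readClickHouse("test_data_small")
    return df.withColumn("x1_buckets", cut_bins("x1", [1, 3], if_string=True))
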
class Bucketizer:
"""
This class is used for bucketizing continuous variables into discrete bins.
"""
def __init__(self):
pass
    def fit(self, df, inputCols, splitsArray, outputCols=[], if_string=True):
"""
This function applies the bucketizing transformation to the specified columns of the input dataframe.
Parameters
:param df: The input dataframe to be transformed.
:type df: DataFrame
:param inputCols: A list of column names in the dataframe to be bucketized.
:type inputCols: list
:param splitsArray: A list of lists, where each inner list contains the split points for bucketizing the corresponding column in inputCols.
:type splitsArray: list
:param outputCols: A list of output column names after bucketizing. If not provided, '_buckets' will be appended to the original column names.
:type outputCols: list, optional
:param if_string: A flag indicating whether the bin values should be treated as strings. Default is True.
:type if_string: bool, optional
:return: The transformed dataframe with bucketized columns.
:rtype: DataFrame
Example
-------
.. code-block:: python
>>> import fast_causal_inference
>>> import fast_causal_inference.dataframe.features as Features
>>> df = fast_causal_inference.readClickHouse('test_data_small')
>>> bucketizer = Features.Bucketizer()
>>> df_new = bucketizer.fit(df,['x1','x2'],[[1,3],[0,2]],if_string=True)
>>> df_new.select('x1','x2','x1_buckets','x2_buckets').head(5).show()
x1 x2 x1_buckets x2_buckets
0 -0.131301907 -3.152383354 1 0
1 -0.966931088 -0.427920835 1 0
2 1.257744217 -2.050358546 [1,3) 0
3 -0.777228042 -2.621604715 1 0
4 -0.669571385 0.606404768 1 [0,2)
>>> df_new = bucketizer.fit(df,['x1','x2'],[[1,3],[0,2]],if_string=False)
>>> df_new.select('x1','x2','x1_buckets','x2_buckets').head(5).show()
x1 x2 x1_buckets x2_buckets
0 -0.131301907 -3.152383354 1 1
1 -0.966931088 -0.427920835 1 1
2 1.257744217 -2.050358546 2 1
3 -0.777228042 -2.621604715 1 1
4 -0.669571385 0.606404768 1 2
"""
        # Default output names: append "_buckets" to each input column name.
        if len(outputCols) == 0:
            outputCols = [i + "_buckets" for i in inputCols]
        # Apply the engine-side cutbins transform to each column with its
        # corresponding split points.
        for i in range(len(inputCols)):
            df = df.withColumn(
                outputCols[i], cut_bins(inputCols[i], splitsArray[i], if_string)
            )
        return df
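

# Local sketch of the numeric bucket indexing (if_string=False) shown in the
# docstring example above: with splits [1, 3], values below 1 fall in bucket 1,
# values in [1, 3) in bucket 2, and so on. This reproduces the numbers in the
# example output; the engine-side string labels (e.g. "[1,3)") follow their own
# convention and are not reproduced here. Illustrative only, never called.
def _bucket_index_sketch(value, splits):
    import bisect

    # Number of split points <= value, shifted so the first bucket is 1.
    return bisect.bisect_right(splits, value) + 1


# e.g. _bucket_index_sketch(-0.131301907, [1, 3]) == 1
#      _bucket_index_sketch(1.257744217, [1, 3]) == 2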