diff --git a/docs/api_doc/encoding/StringListBinarizer.rst b/docs/api_doc/encoding/StringListBinarizer.rst new file mode 100644 index 000000000..5b7112cfe --- /dev/null +++ b/docs/api_doc/encoding/StringListBinarizer.rst @@ -0,0 +1,5 @@ +StringListBinarizer +=================== + +.. autoclass:: feature_engine.encoding.StringListBinarizer + :members: diff --git a/docs/api_doc/encoding/index.rst b/docs/api_doc/encoding/index.rst index 5256753c4..ab448c4d1 100644 --- a/docs/api_doc/encoding/index.rst +++ b/docs/api_doc/encoding/index.rst @@ -37,6 +37,7 @@ input. DecisionTreeEncoder RareLabelEncoder StringSimilarityEncoder + StringListBinarizer Other categorical encoding libraries ------------------------------------ diff --git a/docs/user_guide/encoding/StringListBinarizer.rst b/docs/user_guide/encoding/StringListBinarizer.rst new file mode 100644 index 000000000..54dd538a9 --- /dev/null +++ b/docs/user_guide/encoding/StringListBinarizer.rst @@ -0,0 +1,79 @@ +.. _string_list_binarizer: + +.. currentmodule:: feature_engine.encoding + +StringListBinarizer +=================== + +:class:`StringListBinarizer()` replaces categorical variables containing lists of strings +or comma-delimited strings with a set of binary variables (dummy variables) representing +each one of the unique tags or categories present across all observations. + +This transformer is particularly useful for handling multi-label categorical columns +where each row might have multiple values, such as ``"action, comedy"`` or +``"romance, thriller, action"``. The transformer splits these strings by a specified separator, +collects all unique tags, and then applies one-hot encoding on them. It can also natively +handle columns structured as Python lists, like ``["action", "comedy"]``. + +Python example +-------------- + +Let's look at an example. We generate a toy dataset with multi-label genre information +stored as comma-delimited strings: + +.. 
code:: python

    import pandas as pd
    from feature_engine.encoding import StringListBinarizer

    X = pd.DataFrame(dict(
        user_id = [1, 2, 3],
        genres = ["action, comedy", "comedy", "action, thriller"]
    ))

    print(X)

.. code:: python

       user_id            genres
    0        1    action, comedy
    1        2            comedy
    2        3  action, thriller

Now, we set up the :class:`StringListBinarizer()`. Since our strings are separated by a
comma and a space, we specify ``separator=", "``.

.. code:: python

    slb = StringListBinarizer(
        variables=["genres"],
        separator=", "
    )

    slb.fit(X)

During `fit`, the encoder splits the strings, identifies the unique categories across
the entire dataset, and saves them in its `encoder_dict_` attribute.

.. code:: python

    print(slb.encoder_dict_)
    # {'genres': ['action', 'comedy', 'thriller']}

We can now use `transform` to get the dummy variables. The original column is dropped by default.

.. code:: python

    X_encoded = slb.transform(X)
    print(X_encoded)

.. code:: python

       user_id  genres_action  genres_comedy  genres_thriller
    0        1              1              1                0
    1        2              0              1                0
    2        3              1              0                1

As we see, each row now has a 1 in the columns corresponding to the genres it originally contained,
and 0 otherwise. Unseen categories encountered during transform will simply be ignored (i.e. all
dummy columns will be 0 for those extra tags).
# Authors: Ankit Hemant Lade (contributor)
# License: BSD 3 clause

from typing import List, Optional, Union

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted

from feature_engine._base_transformers.mixins import GetFeatureNamesOutMixin
from feature_engine._check_init_parameters.check_variables import (
    _check_variables_input_value,
)
from feature_engine.dataframe_checks import (
    _check_optional_contains_na,
    _check_X_matches_training_df,
    check_X,
)
from feature_engine.tags import _return_tags
from feature_engine.variable_handling import (
    check_all_variables,
    find_all_variables,
)


class StringListBinarizer(TransformerMixin, BaseEstimator, GetFeatureNamesOutMixin):
    """
    StringListBinarizer() takes categorical variables that contain a list of strings
    or a delimited string, and creates binary variables representing each of the
    unique categories across all observations.

    This is especially useful for columns containing multiple tags per row, such as
    `["action", "comedy"]` or `"action, comedy"`.

    The transformer takes a list of variables to encode, or automatically selects
    all object/categorical columns if none are provided.

    The encodings are created by splitting the strings on a specified `separator`
    (or parsing the lists directly), identifying the unique tags in the dataset,
    and then adding a new boolean column `varname_tag` for each unique tag.

    Original columns are dropped after transformation by default.

    More details in the :ref:`User Guide <string_list_binarizer>`.

    Parameters
    ----------
    variables : list, default=None
        The list of categorical variables to encode. If None, the encoder will find and
        select all categorical variables.

    separator : str, default=","
        The separator used to split the strings in the variable.
        If the variable contains Python lists instead of strings,
        this parameter is ignored.

    ignore_format : bool, default=False
        Whether to format check the variables in `fit`. If `True`, the encoder will
        ignore the variable types and proceed with encoding, provided the variables are
        entered by the user. If `variables` is None, the target variables are all those
        in the dataset regardless of type. If `False`, the encoder will select and
        encode only categorical variables (type 'object' or 'categorical').

    Attributes
    ----------
    variables_:
        The list of variables to be transformed.

    encoder_dict_:
        A dictionary mapping the variables to the sorted list of their unique tags.

    feature_names_in_:
        List with the names of features seen during `fit`.

    n_features_in_:
        The number of features in the train set used in fit.

    Methods
    -------
    fit:
        Learn the unique tags per variable.

    fit_transform:
        Fit to data, then transform it.

    transform:
        Replace the original variable with the binary encoded variables.

    Examples
    --------
    >>> import pandas as pd
    >>> from feature_engine.encoding import StringListBinarizer
    >>> X = pd.DataFrame(dict(tags=["action, comedy", "comedy", "action, thriller"]))
    >>> slb = StringListBinarizer(variables=["tags"], separator=", ")
    >>> slb.fit(X)
    >>> slb.transform(X)
       tags_action  tags_comedy  tags_thriller
    0            1            1              0
    1            0            1              0
    2            1            0              1
    """

    def __init__(
        self,
        variables: Union[None, int, str, List[Union[str, int]]] = None,
        separator: str = ",",
        ignore_format: bool = False,
    ) -> None:

        # Validate simple init parameters eagerly; `variables` is validated by
        # the shared feature_engine helper for consistency with other encoders.
        if not isinstance(separator, str):
            raise ValueError(
                f"separator takes only strings. Got {type(separator).__name__} instead."
            )

        if not isinstance(ignore_format, bool):
            raise ValueError(
                "ignore_format takes only booleans True and False. "
                f"Got {ignore_format} instead."
            )

        self.variables = _check_variables_input_value(variables)
        self.separator = separator
        self.ignore_format = ignore_format

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Learn the unique tags present in each categorical variable.

        Parameters
        ----------

        X: pandas dataframe of shape = [n_samples, n_features]
            The training input samples.

        y: pandas series, default=None
            Target. It is not needed in this encoder. You can pass y or
            None.
        """
        X = check_X(X)

        # select variables to encode
        if self.ignore_format is True:
            if self.variables is None:
                self.variables_ = find_all_variables(X)
            else:
                self.variables_ = check_all_variables(X, self.variables)
        else:
            if self.variables is None:
                # Select typical categorical/string-like variables
                self.variables_ = X.select_dtypes(
                    include=["object", "category", "string"]
                ).columns.to_list()
                if len(self.variables_) == 0:
                    raise ValueError(
                        "No categorical variables found in the dataframe. Please check "
                        "the variables format or set `ignore_format=True`."
                    )
            else:
                # Validate that the user-entered variables exist in X. This
                # raises an informative error for missing columns instead of
                # letting them slip through to a KeyError downstream.
                self.variables_ = check_all_variables(X, self.variables)

                # Check that the specified variables are object/categorical/string.
                non_cat = [
                    var
                    for var in self.variables_
                    if not (
                        pd.api.types.is_object_dtype(X[var])
                        or isinstance(X[var].dtype, pd.CategoricalDtype)
                        or pd.api.types.is_string_dtype(X[var])
                    )
                ]
                if non_cat:
                    raise TypeError(
                        "Some of the variables are not categorical. Please cast them "
                        "as object or categorical before calling fit, or set "
                        "`ignore_format=True`. Variables: "
                        f"{non_cat}"
                    )

        # NA values are not supported; raise if any are present.
        _check_optional_contains_na(X, self.variables_)

        self.encoder_dict_ = {}

        for var in self.variables_:
            unique_tags = set()
            for row in X[var]:
                if isinstance(row, str):
                    tags = [t.strip() for t in row.split(self.separator)]
                elif isinstance(row, list):
                    tags = [str(t).strip() for t in row]
                else:
                    # Non-string, non-list scalars (e.g. numbers when
                    # ignore_format=True) are treated as a single tag.
                    tags = [str(row).strip()]
                unique_tags.update(tags)

            # Remove empty strings from tags (often caused by trailing separators)
            unique_tags.discard("")

            self.encoder_dict_[var] = sorted(list(unique_tags))

        self.feature_names_in_ = X.columns.tolist()
        self.n_features_in_ = X.shape[1]

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Replace the categorical variables by the binary encoded variables.

        Parameters
        ----------
        X: pandas dataframe of shape = [n_samples, n_features]
            The data to transform.

        Returns
        -------
        X_new: pandas dataframe.
            The transformed dataframe. The shape of the dataframe will differ from
            the original, as it replaces the original list/string columns with multiple
            dummy columns.
        """
        check_is_fitted(self)

        X = check_X(X)
        _check_X_matches_training_df(X, self.n_features_in_)
        _check_optional_contains_na(X, self.variables_)

        X_transformed = X[self.feature_names_in_].copy()

        for feature in self.variables_:
            categories = self.encoder_dict_[feature]

            # Pre-allocate one int array per dummy column; faster than
            # building the dummies row by row with pandas.
            dummy_data = {
                f"{feature}_{category}": np.zeros(len(X), dtype=int)
                for category in categories
            }

            for i, row in enumerate(X[feature]):
                if isinstance(row, str):
                    tags = [t.strip() for t in row.split(self.separator)]
                elif isinstance(row, list):
                    tags = [str(t).strip() for t in row]
                else:
                    tags = [str(row).strip()]

                # Tags unseen during fit are silently ignored.
                for t in tags:
                    if t in categories:
                        dummy_data[f"{feature}_{t}"][i] = 1

            dummy_df = pd.DataFrame(dummy_data, index=X.index)
            X_transformed = pd.concat([X_transformed, dummy_df], axis=1)

        # drop original variables
        X_transformed.drop(labels=self.variables_, axis=1, inplace=True)

        return X_transformed

    def get_feature_names_out(self, input_features=None) -> List[str]:
        """Get output feature names for transformation.

        `input_features` is accepted for scikit-learn API compatibility
        and is ignored: the names are derived from the fitted state.
        """
        check_is_fitted(self)

        # Untouched columns first (in training order), then the dummy columns.
        feature_names = list(self.feature_names_in_)
        feature_names = [f for f in feature_names if f not in self.variables_]

        for feature in self.variables_:
            for category in self.encoder_dict_[feature]:
                feature_names.append(f"{feature}_{category}")

        return feature_names

    def _more_tags(self):
        # Mark the transformer as categorical and exempt it from the sklearn
        # NaN/inf estimator check (NA handling is validated explicitly in fit).
        tags_dict = _return_tags()
        tags_dict["variables"] = "categorical"
        tags_dict["_xfail_checks"]["check_estimators_nan_inf"] = "transformer allows NA"
        return tags_dict
import pandas as pd
import pytest
from sklearn.exceptions import NotFittedError

from feature_engine.encoding import StringListBinarizer


def test_string_list_binarizer_delimited_strings():
    """Encode a comma-delimited string column into sorted dummy columns."""
    df = pd.DataFrame(
        {
            "tags": ["action, comedy", "comedy", "action, thriller"],
            "other": [1, 2, 3],
        }
    )

    expected_df = pd.DataFrame(
        {
            "other": [1, 2, 3],
            "tags_action": [1, 0, 1],
            "tags_comedy": [1, 1, 0],
            "tags_thriller": [0, 0, 1],
        }
    )

    # separator="," still works with ", "-delimited data because tags are stripped.
    encoder = StringListBinarizer(variables=["tags"], separator=",")
    X = encoder.fit_transform(df)

    assert encoder.variables_ == ["tags"]
    assert encoder.encoder_dict_ == {"tags": ["action", "comedy", "thriller"]}
    pd.testing.assert_frame_equal(X, expected_df)


def test_string_list_binarizer_python_lists():
    """Encode a column whose rows are Python lists of tags (separator unused)."""
    df = pd.DataFrame(
        {
            "tags": [["action", "comedy"], ["comedy"], ["action", "thriller"]],
            "other": [1, 2, 3],
        }
    )

    expected_df = pd.DataFrame(
        {
            "other": [1, 2, 3],
            "tags_action": [1, 0, 1],
            "tags_comedy": [1, 1, 0],
            "tags_thriller": [0, 0, 1],
        }
    )

    encoder = StringListBinarizer(variables=["tags"])
    X = encoder.fit_transform(df)

    assert encoder.variables_ == ["tags"]
    assert encoder.encoder_dict_ == {"tags": ["action", "comedy", "thriller"]}
    pd.testing.assert_frame_equal(X, expected_df)


def test_find_categorical_variables():
    """With variables=None, only the object column is selected, not the numeric one."""
    df = pd.DataFrame({"tags": ["A,B", "C"], "num": [1, 2]})

    encoder = StringListBinarizer(variables=None, separator=",")
    encoder.fit(df)

    assert encoder.variables_ == ["tags"]


def test_ignore_format():
    """ignore_format=True allows encoding a user-specified non-categorical column."""
    df = pd.DataFrame(
        {
            "tags": ["A,B", "C"],
            "num": ["1", "2"],  # Treated as object but maybe we want to encode it
        }
    )

    encoder = StringListBinarizer(variables=["num"], ignore_format=True)
    encoder.fit(df)

    assert encoder.variables_ == ["num"]
    assert encoder.encoder_dict_ == {"num": ["1", "2"]}


def test_error_if_not_categorical():
    """Fitting a numeric column without ignore_format raises TypeError."""
    df = pd.DataFrame({"num": [1, 2]})
    encoder = StringListBinarizer(variables=["num"])
    with pytest.raises(TypeError):
        encoder.fit(df)


def test_missing_values_error():
    """NA values in the variables to encode raise during fit."""
    df = pd.DataFrame({"tags": ["A,B", float("nan")]})
    encoder = StringListBinarizer(variables=["tags"])
    with pytest.raises(ValueError):
        encoder.fit(df)


def test_not_fitted_error():
    """transform before fit raises NotFittedError."""
    df = pd.DataFrame({"tags": ["A,B"]})
    encoder = StringListBinarizer()
    with pytest.raises(NotFittedError):
        encoder.transform(df)


def test_unseen_categories():
    """Tags unseen during fit are ignored at transform time."""
    df_train = pd.DataFrame({"tags": ["A,B", "C"]})
    df_test = pd.DataFrame({"tags": ["A,D", "B,C,E"]})

    encoder = StringListBinarizer(variables=["tags"], separator=",")
    encoder.fit(df_train)
    X = encoder.transform(df_test)

    # Expect D and E to be ignored (columns for A, B, C only)
    expected_df = pd.DataFrame(
        {"tags_A": [1, 0], "tags_B": [0, 1], "tags_C": [0, 1]}
    )

    pd.testing.assert_frame_equal(X, expected_df)


def test_init_separator_not_str():
    """Non-string separator raises at init."""
    with pytest.raises(ValueError, match="separator takes only strings"):
        StringListBinarizer(variables=["tags"], separator=123)


def test_init_ignore_format_not_bool():
    """Non-boolean ignore_format raises at init."""
    with pytest.raises(ValueError, match="ignore_format takes only booleans"):
        StringListBinarizer(variables=["tags"], ignore_format="yes")


def test_ignore_format_true_variables_none():
    """Fit with ignore_format=True and variables=None uses find_all_variables."""
    df = pd.DataFrame(
        {"tags": ["a,b", "c"], "num": [1, 2], "other": ["x", "y"]}
    )
    encoder = StringListBinarizer(separator=",", ignore_format=True)
    encoder.fit(df)
    assert set(encoder.variables_) == {"tags", "num", "other"}
    X = encoder.transform(df)
    assert list(X.columns) == encoder.get_feature_names_out()


def test_no_categorical_variables_raises():
    """Raise when variables=None and no object/category/string columns."""
    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    encoder = StringListBinarizer(variables=None)
    with pytest.raises(ValueError, match="No categorical variables found"):
        encoder.fit(df)


def test_fit_row_not_str_or_list():
    """Fit with a row that is neither str nor list (e.g. number) uses else branch."""
    df = pd.DataFrame({"tags": ["A,B", 42]})
    encoder = StringListBinarizer(variables=["tags"], separator=",")
    encoder.fit(df)
    assert "A" in encoder.encoder_dict_["tags"]
    assert "B" in encoder.encoder_dict_["tags"]
    assert "42" in encoder.encoder_dict_["tags"]


def test_transform_row_not_str_or_list():
    """Transform with non-str non-list row uses else branch."""
    df_train = pd.DataFrame({"tags": ["A", "B"]})
    encoder = StringListBinarizer(variables=["tags"])
    encoder.fit(df_train)
    df_test = pd.DataFrame({"tags": [123]})
    X = encoder.transform(df_test)
    assert "tags_A" in X.columns
    assert "tags_B" in X.columns


def test_get_feature_names_out():
    """get_feature_names_out returns binarized feature names in order."""
    df = pd.DataFrame(
        {"x": [1, 2], "tags": ["a,b", "c"], "y": [3, 4]}
    )
    encoder = StringListBinarizer(variables=["tags"], separator=",")
    encoder.fit(df)
    names = encoder.get_feature_names_out()
    assert names == ["x", "y", "tags_a", "tags_b", "tags_c"]


def test_more_tags():
    """_more_tags returns expected sklearn config."""
    encoder = StringListBinarizer(variables=["tags"])
    tags = encoder._more_tags()
    assert tags["variables"] == "categorical"
    assert "check_estimators_nan_inf" in tags["_xfail_checks"]