diff --git a/.gitignore b/.gitignore index 3ba72acd9..399a7473b 100644 --- a/.gitignore +++ b/.gitignore @@ -111,4 +111,4 @@ venv.bak/ *.csv *.DS_Store *.db -*.pptx \ No newline at end of file +*.pptx diff --git a/feature_engine/imputation/categorical.py b/feature_engine/imputation/categorical.py index 8c4000a0c..e7200287c 100644 --- a/feature_engine/imputation/categorical.py +++ b/feature_engine/imputation/categorical.py @@ -1,6 +1,7 @@ # Authors: Soledad Galli # License: BSD 3 clause +import warnings from typing import List, Optional, Union import pandas as pd @@ -12,7 +13,7 @@ _feature_names_in_docstring, _imputer_dict_docstring, _n_features_in_docstring, - _variables_attribute_docstring, + _variables_attribute_docstring ) from feature_engine._docstrings.methods import ( _fit_transform_docstring, @@ -88,6 +89,18 @@ class CategoricalImputer(BaseImputer): type object or categorical. If True, the imputer will select all variables or accept all variables entered by the user, including those cast as numeric. + multimodal : str, default='raise' + Indicates what to do when imputation_method='frequent' + and a variable has more than 1 mode. + + If 'raise', raises a ValueError and stops the fit. + + If 'warn', issues a UserWarning and continues the imputation using the + first most frequent category found. + + If 'ignore', continues without warnings, imputing using the first + most frequent category found. 
+ Attributes ---------- {imputer_dict_} @@ -135,6 +148,7 @@ def __init__( variables: Union[None, int, str, List[Union[str, int]]] = None, return_object: bool = False, ignore_format: bool = False, + multimodal: str = "raise", ) -> None: if imputation_method not in ["missing", "frequent"]: raise ValueError( @@ -144,11 +158,18 @@ def __init__( if not isinstance(ignore_format, bool): raise ValueError("ignore_format takes only booleans True and False") + if multimodal not in ["raise", "warn", "ignore"]: + raise ValueError( + "multimodal takes only values 'raise', 'warn', or 'ignore'. " + f"Got {multimodal} instead." + ) + self.imputation_method = imputation_method self.fill_value = fill_value self.variables = _check_variables_input_value(variables) self.return_object = return_object self.ignore_format = ignore_format + self.multimodal = multimodal def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ @@ -163,10 +184,8 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): y is not needed in this imputation. You can pass None or y. """ - # check input dataframe X = check_X(X) - # select variables to encode if self.ignore_format is True: if self.variables is None: self.variables_ = find_all_variables(X) @@ -182,54 +201,64 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): self.imputer_dict_ = {var: self.fill_value for var in self.variables_} elif self.imputation_method == "frequent": - # if imputing only 1 variable: if len(self.variables_) == 1: var = self.variables_[0] mode_vals = X[var].mode() - # Some variables may contain more than 1 mode: if len(mode_vals) > 1: - raise ValueError( - f"The variable {var} contains multiple frequent categories." - ) + if self.multimodal == "raise": + raise ValueError( + f"The variable {var} contains multiple " + f"frequent categories. Set multimodal='warn' or " + f"multimodal='ignore' to allow imputation using " + f"the first most frequent category found." 
+ ) + elif self.multimodal == "warn": + warnings.warn( + f"Variable {var} has multiple frequent " + f"categories. The first category found, " + f"{mode_vals[0]}, will be used for imputation.", + UserWarning, + ) self.imputer_dict_ = {var: mode_vals[0]} - # imputing multiple variables: else: - # Returns a dataframe with 1 row if there is one mode per - # variable, or more rows if there are more modes: mode_vals = X[self.variables_].mode() - # Careful: some variables contain multiple modes if len(mode_vals) > 1: varnames = mode_vals.dropna(axis=1).columns.to_list() if len(varnames) > 1: varnames_str = ", ".join(varnames) else: varnames_str = varnames[0] - raise ValueError( - f"The variable(s) {varnames_str} contain(s) multiple frequent " - f"categories." - ) + + if self.multimodal == "raise": + raise ValueError( + f"The variable(s) {varnames_str} contain(s) " + f"multiple frequent categories. Set " + f"multimodal='warn' or multimodal='ignore' to allow " + f"imputation using the first most frequent " + f"category found." + ) + elif self.multimodal == "warn": + warnings.warn( + f"Variable(s) {varnames_str} have multiple " + f"frequent categories. 
The first category " + f"found will be used for imputation.", + UserWarning, + ) self.imputer_dict_ = mode_vals.iloc[0].to_dict() self._get_feature_names_in(X) - return self def transform(self, X: pd.DataFrame) -> pd.DataFrame: - # Frequent category imputation if self.imputation_method == "frequent": X = super().transform(X) - - # Imputation with string else: X = self._transform(X) - - # if variable is of type category, we need to add the new - # category, before filling in the nan add_cats = {} for variable in self.variables_: if X[variable].dtype.name == "category": @@ -243,13 +272,10 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: X = X.assign(**add_cats).fillna(self.imputer_dict_) - # add additional step to return variables cast as object if self.return_object: X[self.variables_] = X[self.variables_].astype("O") - return X - # Get docstring from BaseClass transform.__doc__ = BaseImputer.transform.__doc__ def _more_tags(self): diff --git a/tests/test_imputation/test_categorical_imputer.py b/tests/test_imputation/test_categorical_imputer.py index 182e8826b..b1d36da8c 100644 --- a/tests/test_imputation/test_categorical_imputer.py +++ b/tests/test_imputation/test_categorical_imputer.py @@ -1,25 +1,39 @@ +import warnings + +import numpy as np import pandas as pd import pytest from feature_engine.imputation import CategoricalImputer +@pytest.fixture +def multimodal_df(): + return pd.DataFrame( + { + "city": [ + "London", "London", "Paris", "Paris", "Berlin", "Berlin", "Madrid" + ], + "country": ["UK", "UK", "FR", "FR", "DE", "DE", "ES"], + "one_mode": [ + "London", "London", "London", "Paris", "Paris", "Berlin", "Berlin" + ], + } + ) + + def test_impute_with_string_missing_and_automatically_find_variables(df_na): - # set up transformer imputer = CategoricalImputer(imputation_method="missing", variables=None) X_transformed = imputer.fit_transform(df_na) - # set up expected output X_reference = df_na.copy() X_reference["Name"] = 
X_reference["Name"].fillna("Missing") X_reference["City"] = X_reference["City"].fillna("Missing") X_reference["Studies"] = X_reference["Studies"].fillna("Missing") - # test init params assert imputer.imputation_method == "missing" assert imputer.variables is None - # test fit attributes assert imputer.variables_ == ["Name", "City", "Studies"] assert imputer.n_features_in_ == 6 assert imputer.imputer_dict_ == { @@ -28,33 +42,26 @@ def test_impute_with_string_missing_and_automatically_find_variables(df_na): "Studies": "Missing", } - # test transform output - # selected columns should have no NA - # non selected columns should still have NA assert X_transformed[["Name", "City", "Studies"]].isnull().sum().sum() == 0 assert X_transformed[["Age", "Marks"]].isnull().sum().sum() > 0 pd.testing.assert_frame_equal(X_transformed, X_reference) def test_user_defined_string_and_automatically_find_variables(df_na): - # set up imputer imputer = CategoricalImputer( imputation_method="missing", fill_value="Unknown", variables=None ) X_transformed = imputer.fit_transform(df_na) - # set up expected output X_reference = df_na.copy() X_reference["Name"] = X_reference["Name"].fillna("Unknown") X_reference["City"] = X_reference["City"].fillna("Unknown") X_reference["Studies"] = X_reference["Studies"].fillna("Unknown") - # test init params assert imputer.imputation_method == "missing" assert imputer.fill_value == "Unknown" assert imputer.variables is None - # tes fit attributes assert imputer.variables_ == ["Name", "City", "Studies"] assert imputer.n_features_in_ == 6 assert imputer.imputer_dict_ == { @@ -63,22 +70,18 @@ def test_user_defined_string_and_automatically_find_variables(df_na): "Studies": "Unknown", } - # test transform output: assert X_transformed[["Name", "City", "Studies"]].isnull().sum().sum() == 0 assert X_transformed[["Age", "Marks"]].isnull().sum().sum() > 0 pd.testing.assert_frame_equal(X_transformed, X_reference) def test_mode_imputation_and_single_variable(df_na): - # 
set up imputer imputer = CategoricalImputer(imputation_method="frequent", variables="City") X_transformed = imputer.fit_transform(df_na) - # set up expected result X_reference = df_na.copy() X_reference["City"] = X_reference["City"].fillna("London") - # test init, fit and transform params, attr and output assert imputer.imputation_method == "frequent" assert imputer.variables == "City" assert imputer.variables_ == ["City"] @@ -90,24 +93,20 @@ def test_mode_imputation_and_single_variable(df_na): def test_mode_imputation_with_multiple_variables(df_na): - # set up imputer imputer = CategoricalImputer( imputation_method="frequent", variables=["Studies", "City"] ) X_transformed = imputer.fit_transform(df_na) - # set up expected output X_reference = df_na.copy() X_reference["City"] = X_reference["City"].fillna("London") X_reference["Studies"] = X_reference["Studies"].fillna("Bachelor") - # test fit attr and transform output assert imputer.imputer_dict_ == {"Studies": "Bachelor", "City": "London"} pd.testing.assert_frame_equal(X_transformed, X_reference) def test_imputation_of_numerical_vars_cast_as_object_and_returned_as_numerical(df_na): - # test case: imputing of numerical variables cast as object + return numeric df_na = df_na.copy() df_na["Marks"] = df_na["Marks"].astype("O") imputer = CategoricalImputer( @@ -131,8 +130,6 @@ def test_imputation_of_numerical_vars_cast_as_object_and_returned_as_numerical(d def test_imputation_of_numerical_vars_cast_as_object_and_returned_as_object(df_na): - # test case 6: imputing of numerical variables cast as object + return as object - # after imputation df_na = df_na.copy() df_na["Marks"] = df_na["Marks"].astype("O") imputer = CategoricalImputer( @@ -145,37 +142,31 @@ def test_imputation_of_numerical_vars_cast_as_object_and_returned_as_object(df_n def test_error_when_imputation_method_not_frequent_or_missing(): - with pytest.raises(ValueError): + msg = "imputation_method takes only values 'missing' or 'frequent'" + with 
pytest.raises(ValueError, match=msg): CategoricalImputer(imputation_method="arbitrary") def test_error_when_variable_contains_multiple_modes(df_na): - msg = "The variable Name contains multiple frequent categories." imputer = CategoricalImputer(imputation_method="frequent", variables="Name") - with pytest.raises(ValueError) as record: + msg = "The variable Name contains multiple frequent categories" + with pytest.raises(ValueError, match=msg): imputer.fit(df_na) - # check that error message matches - assert str(record.value) == msg - msg = "The variable(s) Name contain(s) multiple frequent categories." imputer = CategoricalImputer(imputation_method="frequent") - with pytest.raises(ValueError) as record: + msg = r"The variable\(s\) Name contain\(s\) multiple frequent categories" + with pytest.raises(ValueError, match=msg): imputer.fit(df_na) - # check that error message matches - assert str(record.value) == msg df_ = df_na.copy() df_["Name_dup"] = df_["Name"] - msg = "The variable(s) Name, Name_dup contain(s) multiple frequent categories." 
imputer = CategoricalImputer(imputation_method="frequent") - with pytest.raises(ValueError) as record: + msg = r"The variable\(s\) Name, Name_dup contain\(s\) multiple frequent categories" + with pytest.raises(ValueError, match=msg): imputer.fit(df_) - # check that error message matches - assert str(record.value) == msg def test_impute_numerical_variables(df_na): - # set up transformer imputer = CategoricalImputer( imputation_method="missing", fill_value=0, @@ -184,24 +175,19 @@ def test_impute_numerical_variables(df_na): ) X_transformed = imputer.fit_transform(df_na) - # set up expected output X_reference = df_na.copy() X_reference = X_reference.fillna(0) - # test init params assert imputer.imputation_method == "missing" assert imputer.variables == ["Name", "City", "Studies", "Age", "Marks"] - # test fit attributes assert imputer.variables_ == ["Name", "City", "Studies", "Age", "Marks"] assert imputer.n_features_in_ == 6 - # test transform params pd.testing.assert_frame_equal(X_transformed, X_reference) def test_impute_numerical_variables_with_mode(df_na): - # set up transformer imputer = CategoricalImputer( imputation_method="frequent", variables=["City", "Studies", "Marks"], @@ -209,16 +195,13 @@ def test_impute_numerical_variables_with_mode(df_na): ) X_transformed = imputer.fit_transform(df_na) - # set up expected output X_reference = df_na.copy() X_reference["City"] = X_reference["City"].fillna("London") X_reference["Studies"] = X_reference["Studies"].fillna("Bachelor") X_reference["Marks"] = X_reference["Marks"].fillna(0.8) - # test init params assert imputer.variables == ["City", "Studies", "Marks"] - # test fit attributes assert imputer.variables_ == ["City", "Studies", "Marks"] assert imputer.n_features_in_ == 6 assert imputer.imputer_dict_ == { @@ -227,7 +210,6 @@ def test_impute_numerical_variables_with_mode(df_na): "Marks": 0.8, } - # test transform output pd.testing.assert_frame_equal(X_transformed, X_reference) @@ -239,7 +221,6 @@ def 
test_variables_cast_as_category_missing(df_na): imputer = CategoricalImputer(imputation_method="missing", variables=None) X_transformed = imputer.fit_transform(df_na) - # set up expected output X_reference = df_na.copy() X_reference["Name"] = X_reference["Name"].fillna("Missing") X_reference["Studies"] = X_reference["Studies"].fillna("Missing") @@ -248,7 +229,6 @@ def test_variables_cast_as_category_missing(df_na): X_reference["City"].cat.add_categories("Missing").fillna("Missing") ) - # test fit attributes assert imputer.variables_ == ["Name", "City", "Studies"] assert imputer.imputer_dict_ == { "Name": "Missing", @@ -256,9 +236,6 @@ def test_variables_cast_as_category_missing(df_na): "Studies": "Missing", } - # test transform output - # selected columns should have no NA - # non selected columns should still have NA assert X_transformed[["Name", "City", "Studies"]].isnull().sum().sum() == 0 assert X_transformed[["Age", "Marks"]].isnull().sum().sum() > 0 pd.testing.assert_frame_equal(X_transformed, X_reference) @@ -268,27 +245,21 @@ def test_variables_cast_as_category_frequent(df_na): df_na = df_na.copy() df_na["City"] = df_na["City"].astype("category") - # this variable does not have a mode, so drop df_na.drop(labels=["Name"], axis=1, inplace=True) imputer = CategoricalImputer(imputation_method="frequent", variables=None) X_transformed = imputer.fit_transform(df_na) - # set up expected output X_reference = df_na.copy() X_reference["Studies"] = X_reference["Studies"].fillna("Bachelor") X_reference["City"] = X_reference["City"].fillna("London") - # test fit attributes assert imputer.variables_ == ["City", "Studies"] assert imputer.imputer_dict_ == { "City": "London", "Studies": "Bachelor", } - # test transform output - # selected columns should have no NA - # non selected columns should still have NA assert X_transformed[["City", "Studies"]].isnull().sum().sum() == 0 assert X_transformed[["Age", "Marks"]].isnull().sum().sum() > 0 
pd.testing.assert_frame_equal(X_transformed, X_reference) @@ -300,8 +271,89 @@ def test_variables_cast_as_category_frequent(df_na): ) def test_error_when_ignore_format_is_not_boolean(ignore_format): msg = "ignore_format takes only booleans True and False" - with pytest.raises(ValueError) as record: + with pytest.raises(ValueError, match=msg): CategoricalImputer(imputation_method="missing", ignore_format=ignore_format) - # check that error message matches - assert str(record.value) == msg + +def test_multimodal_raises_errors(multimodal_df): + imputer = CategoricalImputer(imputation_method="frequent") + with pytest.raises(ValueError, match="multiple frequent categories"): + imputer.fit(multimodal_df) + + +@pytest.mark.parametrize("multimodal", ["warn", "ignore"]) +def test_multimodal_imputation_result(multimodal_df, multimodal): + imputer = CategoricalImputer(imputation_method="frequent", multimodal=multimodal) + if multimodal == "warn": + with pytest.warns(UserWarning, match="multiple frequent categories"): + imputer.fit(multimodal_df) + else: + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + imputer.fit(multimodal_df) + matching_warnings = [ + msg for msg in w if "multiple frequent categories" in str(msg.message) + ] + assert len(matching_warnings) == 0 + + +@pytest.mark.parametrize("multimodal", ["bad_value", 1, True]) +def test_multimodal_invalid_value_raises(multimodal): + with pytest.raises(ValueError, match="multimodal takes only values"): + CategoricalImputer(imputation_method="frequent", multimodal=multimodal) + + +def test_multimodal_param_ignored_when_imputation_method_is_missing(): + df = pd.DataFrame({"city": ["London", np.nan, "Paris"]}) + imputer = CategoricalImputer(imputation_method="missing", multimodal="warn") + # Should fit without warnings since there's no mode computation + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + imputer.fit(df) + matching_warnings = [ + msg for msg 
in w if "multiple frequent categories" in str(msg.message) + ] + assert len(matching_warnings) == 0 + + +def test_warning_when_single_variable_is_multimodal(multimodal_df): + imputer = CategoricalImputer( + imputation_method="frequent", variables="city", multimodal="warn" + ) + with pytest.warns(UserWarning, match="multiple frequent categories"): + imputer.fit(multimodal_df) + assert imputer.imputer_dict_["city"] == multimodal_df["city"].mode()[0] + + +def test_warning_when_single_variable_in_list_is_multimodal(multimodal_df): + imputer = CategoricalImputer( + imputation_method="frequent", variables=["city"], multimodal="warn" + ) + with pytest.warns(UserWarning) as record: + imputer.fit(multimodal_df) + + assert len(record) == 1 + assert "Variable city has multiple frequent categories" in str(record[0].message) + assert imputer.imputer_dict_["city"] == multimodal_df["city"].mode()[0] + + +def test_ignore_when_single_variable_is_multimodal(multimodal_df): + imputer = CategoricalImputer( + imputation_method="frequent", variables="city", multimodal="ignore" + ) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + imputer.fit(multimodal_df) + matching_warnings = [ + msg for msg in w if "multiple frequent categories" in str(msg.message) + ] + assert len(matching_warnings) == 0 + assert imputer.imputer_dict_["city"] == multimodal_df["city"].mode()[0] + + +def test_multimodal_raise_when_only_one_variable_is_multimodal(multimodal_df): + imputer = CategoricalImputer( + imputation_method="frequent", variables=["city", "one_mode"], multimodal="raise" + ) + with pytest.raises(ValueError, match="city"): + imputer.fit(multimodal_df)