From e452f4266db7b0cf45f3d8faa3871aca533db2b1 Mon Sep 17 00:00:00 2001 From: snehagahlot3 Date: Sun, 22 Mar 2026 17:20:06 +0530 Subject: [PATCH 1/2] docs: replace houseprice dataset with sklearn's fetch_california_housing (#692) --- .../GeometricWidthDiscretiser.rst | 18 +++++------ docs/user_guide/wrappers/Wrapper.rst | 30 +++++++++++-------- 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/docs/user_guide/discretisation/GeometricWidthDiscretiser.rst b/docs/user_guide/discretisation/GeometricWidthDiscretiser.rst index ce9626595..fc13e8466 100644 --- a/docs/user_guide/discretisation/GeometricWidthDiscretiser.rst +++ b/docs/user_guide/discretisation/GeometricWidthDiscretiser.rst @@ -54,15 +54,15 @@ Let's load the house prices dataset and separate it into train and test sets: from sklearn.model_selection import train_test_split from feature_engine.discretisation import GeometricWidthDiscretiser + + from sklearn.datasets import fetch_california_housing + data = fetch_california_housing(as_frame=True).frame - # Load dataset - data = pd.read_csv('houseprice.csv') - - # Separate into train and test sets - X_train, X_test, y_train, y_test = train_test_split( - data.drop(['Id', 'SalePrice'], axis=1), - data['SalePrice'], test_size=0.3, random_state=0) - +# Separate into train and test sets +X_train, X_test, y_train, y_test = train_test_split( + data.drop(['MedHouseVal'], axis=1), + data['MedHouseVal'], test_size=0.3, random_state=0) + Now, we want to discretise the 2 variables indicated below into 10 intervals of increasing width: @@ -70,7 +70,7 @@ width: .. code:: python # set up the discretisation transformer - disc = GeometricWidthDiscretiser(bins=10, variables=['LotArea', 'GrLivArea']) + disc = GeometricWidthDiscretiser(bins=10, variables=['MedInc', 'AveRooms']) # fit the transformer disc.fit(X_train) diff --git a/docs/user_guide/wrappers/Wrapper.rst b/docs/user_guide/wrappers/Wrapper.rst index 48e9c91f6..0635ed018 100644 --- a/docs/user_guide/wrappers/Wrapper.rst +++ b/docs/user_guide/wrappers/Wrapper.rst @@ -33,16 +33,20 @@ impute only the selected variables. from feature_engine.wrappers import SklearnTransformerWrapper # Load dataset - data = pd.read_csv('houseprice.csv') + from sklearn.datasets import fetch_california_housing + data = fetch_california_housing(as_frame=True).frame + # Separate into train and test sets X_train, X_test, y_train, y_test = train_test_split( - data.drop(['Id', 'SalePrice'], axis=1), - data['SalePrice'], test_size=0.3, random_state=0) + data.drop(['MedHouseVal'], axis=1), + data['MedHouseVal'], test_size=0.3, random_state=0) + # set up the wrapper with the SimpleImputer imputer = SklearnTransformerWrapper(transformer = SimpleImputer(strategy='mean'), - variables = ['LotFrontage', 'MasVnrArea']) + variables = ['MedInc', 'AveRooms']) + # fit the wrapper + SimpleImputer imputer.fit(X_train) @@ -64,16 +68,17 @@ to standardize only the selected variables. from feature_engine.wrappers import SklearnTransformerWrapper # Load dataset - data = pd.read_csv('houseprice.csv') + from sklearn.datasets import fetch_california_housing + data = fetch_california_housing(as_frame=True).frame # Separate into train and test sets X_train, X_test, y_train, y_test = train_test_split( - data.drop(['Id', 'SalePrice'], axis=1), - data['SalePrice'], test_size=0.3, random_state=0) + data.drop(['MedHouseVal'], axis=1), + data['MedHouseVal'], test_size=0.3, random_state=0) # set up the wrapper with the StandardScaler scaler = SklearnTransformerWrapper(transformer = StandardScaler(), - variables = ['LotFrontage', 'MasVnrArea']) + variables = ['MedInc', 'AveRooms']) # fit the wrapper + StandardScaler scaler.fit(X_train) @@ -95,12 +100,13 @@ to select only a subset of the variables. from feature_engine.wrappers import SklearnTransformerWrapper # Load dataset - data = pd.read_csv('houseprice.csv') - + from sklearn.datasets import fetch_california_housing + data = fetch_california_housing(as_frame=True).frame + # Separate into train and test sets X_train, X_test, y_train, y_test = train_test_split( - data.drop(['Id', 'SalePrice'], axis=1), - data['SalePrice'], test_size=0.3, random_state=0) + data.drop(['MedHouseVal'], axis=1), + data['MedHouseVal'], test_size=0.3, random_state=0) cols = [var for var in X_train.columns if X_train[var].dtypes !='O'] From f1ed2f475a620b4640ffe5d0e8f07c8299f6af6c Mon Sep 17 00:00:00 2001 From: snehagahlot3 Date: Sun, 22 Mar 2026 17:41:43 +0530 Subject: [PATCH 2/2] fix: correct indentation in GeometricWidthDiscretiser.rst --- .../discretisation/GeometricWidthDiscretiser.rst | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/user_guide/discretisation/GeometricWidthDiscretiser.rst b/docs/user_guide/discretisation/GeometricWidthDiscretiser.rst index fc13e8466..70153a3b7 100644 --- a/docs/user_guide/discretisation/GeometricWidthDiscretiser.rst +++ b/docs/user_guide/discretisation/GeometricWidthDiscretiser.rst @@ -54,12 +54,11 @@ Let's load the house prices dataset and separate it into train and test sets: from sklearn.model_selection import train_test_split from feature_engine.discretisation import GeometricWidthDiscretiser - + from sklearn.datasets import fetch_california_housing data = fetch_california_housing(as_frame=True).frame - -# Separate into train and test sets -X_train, X_test, y_train, y_test = train_test_split( + # Separate into train and test sets + X_train, X_test, y_train, y_test = train_test_split( data.drop(['MedHouseVal'], axis=1), data['MedHouseVal'], test_size=0.3, random_state=0)