From e452f4266db7b0cf45f3d8faa3871aca533db2b1 Mon Sep 17 00:00:00 2001
From: snehagahlot3 <snehagahlot3@gmail.com>
Date: Sun, 22 Mar 2026 17:20:06 +0530
Subject: [PATCH 1/2] docs: replace houseprice dataset with sklearn's
 fetch_california_housing (#692)

---
 .../GeometricWidthDiscretiser.rst             | 18 +++++------
 docs/user_guide/wrappers/Wrapper.rst          | 30 +++++++++++--------
 2 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/docs/user_guide/discretisation/GeometricWidthDiscretiser.rst b/docs/user_guide/discretisation/GeometricWidthDiscretiser.rst
index ce9626595..fc13e8466 100644
--- a/docs/user_guide/discretisation/GeometricWidthDiscretiser.rst
+++ b/docs/user_guide/discretisation/GeometricWidthDiscretiser.rst
@@ -54,15 +54,15 @@ Let's load the house prices dataset and separate it into train and test sets:
 	from sklearn.model_selection import train_test_split
 
 	from feature_engine.discretisation import GeometricWidthDiscretiser
+    
+    from sklearn.datasets import fetch_california_housing
+    data = fetch_california_housing(as_frame=True).frame
 
-	# Load dataset
-	data = pd.read_csv('houseprice.csv')
-
-	# Separate into train and test sets
-	X_train, X_test, y_train, y_test =  train_test_split(
-		    data.drop(['Id', 'SalePrice'], axis=1),
-		    data['SalePrice'], test_size=0.3, random_state=0)
-
+# Separate into train and test sets
+X_train, X_test, y_train, y_test = train_test_split(
+        data.drop(['MedHouseVal'], axis=1),
+        data['MedHouseVal'], test_size=0.3, random_state=0)
+	
 
 Now, we want to discretise the 2 variables indicated below into 10 intervals of increasing
 width:
@@ -70,7 +70,7 @@ width:
 .. code:: python
 
 	# set up the discretisation transformer
-	disc = GeometricWidthDiscretiser(bins=10, variables=['LotArea', 'GrLivArea'])
+	disc = GeometricWidthDiscretiser(bins=10, variables=['MedInc', 'AveRooms'])
 
 	# fit the transformer
 	disc.fit(X_train)
diff --git a/docs/user_guide/wrappers/Wrapper.rst b/docs/user_guide/wrappers/Wrapper.rst
index 48e9c91f6..0635ed018 100644
--- a/docs/user_guide/wrappers/Wrapper.rst
+++ b/docs/user_guide/wrappers/Wrapper.rst
@@ -33,16 +33,20 @@ impute only the selected variables.
     from feature_engine.wrappers import SklearnTransformerWrapper
 	
     # Load dataset
-    data = pd.read_csv('houseprice.csv')
+    from sklearn.datasets import fetch_california_housing
+    data = fetch_california_housing(as_frame=True).frame
+
     
     # Separate into train and test sets
     X_train, X_test, y_train, y_test = train_test_split(
-    	data.drop(['Id', 'SalePrice'], axis=1),
-    	data['SalePrice'], test_size=0.3, random_state=0)
+        data.drop(['MedHouseVal'], axis=1),
+        data['MedHouseVal'], test_size=0.3, random_state=0)
+    # NOTE: this dataset has no missing values; the imputer is shown for illustration.
     	
     # set up the wrapper with the SimpleImputer
     imputer = SklearnTransformerWrapper(transformer = SimpleImputer(strategy='mean'),
-                                        variables = ['LotFrontage', 'MasVnrArea'])
+                                        variables = ['MedInc', 'AveRooms'])
+
     
     # fit the wrapper + SimpleImputer                              
     imputer.fit(X_train)
@@ -64,16 +68,17 @@ to standardize only the selected variables.
     from feature_engine.wrappers import SklearnTransformerWrapper
 
     # Load dataset
-    data = pd.read_csv('houseprice.csv')
+    from sklearn.datasets import fetch_california_housing
+    data = fetch_california_housing(as_frame=True).frame
 
     # Separate into train and test sets
     X_train, X_test, y_train, y_test = train_test_split(
-    	data.drop(['Id', 'SalePrice'], axis=1),
-    	data['SalePrice'], test_size=0.3, random_state=0)
+        data.drop(['MedHouseVal'], axis=1),
+        data['MedHouseVal'], test_size=0.3, random_state=0)
 
     # set up the wrapper with the StandardScaler
     scaler = SklearnTransformerWrapper(transformer = StandardScaler(),
-                                        variables = ['LotFrontage', 'MasVnrArea'])
+                                        variables = ['MedInc', 'AveRooms'])
 
     # fit the wrapper + StandardScaler
     scaler.fit(X_train)
@@ -95,12 +100,13 @@ to select only a subset of the variables.
     from feature_engine.wrappers import SklearnTransformerWrapper
 
     # Load dataset
-    data = pd.read_csv('houseprice.csv')
-
+    from sklearn.datasets import fetch_california_housing
+    data = fetch_california_housing(as_frame=True).frame
+
     # Separate into train and test sets
     X_train, X_test, y_train, y_test = train_test_split(
-    	data.drop(['Id', 'SalePrice'], axis=1),
-    	data['SalePrice'], test_size=0.3, random_state=0)
+        data.drop(['MedHouseVal'], axis=1),
+        data['MedHouseVal'], test_size=0.3, random_state=0)
 
     cols = [var for var in X_train.columns if X_train[var].dtypes !='O']
 

From f1ed2f475a620b4640ffe5d0e8f07c8299f6af6c Mon Sep 17 00:00:00 2001
From: snehagahlot3 <snehagahlot3@gmail.com>
Date: Sun, 22 Mar 2026 17:41:43 +0530
Subject: [PATCH 2/2] fix: correct indentation in GeometricWidthDiscretiser.rst

---
 .../discretisation/GeometricWidthDiscretiser.rst           | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/docs/user_guide/discretisation/GeometricWidthDiscretiser.rst b/docs/user_guide/discretisation/GeometricWidthDiscretiser.rst
index fc13e8466..70153a3b7 100644
--- a/docs/user_guide/discretisation/GeometricWidthDiscretiser.rst
+++ b/docs/user_guide/discretisation/GeometricWidthDiscretiser.rst
@@ -54,12 +54,11 @@ Let's load the house prices dataset and separate it into train and test sets:
 	from sklearn.model_selection import train_test_split
 
 	from feature_engine.discretisation import GeometricWidthDiscretiser
-    
+
     from sklearn.datasets import fetch_california_housing
     data = fetch_california_housing(as_frame=True).frame
-
-# Separate into train and test sets
-X_train, X_test, y_train, y_test = train_test_split(
+    # Separate into train and test sets
+    X_train, X_test, y_train, y_test = train_test_split(
         data.drop(['MedHouseVal'], axis=1),
         data['MedHouseVal'], test_size=0.3, random_state=0)