openml · rachit23tech · Mar 18, 2026 · Mar 18, 2026
diff --git a/openml/datasets/data_feature.py b/openml/datasets/data_feature.py
@@ -15,24 +15,33 @@ class OpenMLDataFeature:  # noqa: PLW1641
     Parameters
     ----------
     index : int
-        The index of this feature
+        The index of this feature. Must be non-negative.
     name : str
-        Name of the feature
+        Name of the feature. Must be a non-empty string.
     data_type : str
         can be nominal, numeric, string, date (corresponds to arff)
     nominal_values : list(str)
-        list of the possible values, in case of nominal attribute
+        list of the possible values, in case of nominal attribute.
+        Must be a non-empty list for nominal data types.
     number_missing_values : int
         Number of rows that have a missing value for this feature.
-    ontologies : list(str)
+        Must be non-negative.
+    ontologies : list(str), optional
         list of ontologies attached to this feature. An ontology describes the
         concept that are described in a feature. An ontology is defined by an
         URL where the information is provided.
+
+    Raises
+    ------
+    TypeError
+        If types are incorrect for any parameter.
+    ValueError
+        If values are invalid (e.g., negative counts, empty name, invalid data_type).
     """
 
     LEGAL_DATA_TYPES: ClassVar[Sequence[str]] = ["nominal", "numeric", "string", "date"]
 
-    def __init__(  # noqa: PLR0913
+    def __init__(  # noqa: PLR0913, C901, PLR0912
         self,
         index: int,
         name: str,
@@ -41,38 +50,95 @@ def __init__(  # noqa: PLR0913
         number_missing_values: int,
         ontologies: list[str] | None = None,
     ):
+        # Validate index
         if not isinstance(index, int):
-            raise TypeError(f"Index must be `int` but is {type(index)}")
+            raise TypeError(
+                f"Parameter 'index' must be int, but got {type(index).__name__}. Value: {index!r}"
+            )
+        if index < 0:
+            raise ValueError(
+                f"Parameter 'index' must be non-negative, but got {index}. "
+                "Feature indices cannot be negative."
+            )
 
+        # Validate name
+        if not isinstance(name, str):
+            raise TypeError(
+                f"Parameter 'name' must be str, but got {type(name).__name__}. Value: {name!r}"
+            )
+        if not name.strip():
+            raise ValueError(f"Parameter 'name' cannot be empty or whitespace-only. Got: {name!r}")
+
+        # Validate data_type
+        if not isinstance(data_type, str):
+            raise TypeError(
+                f"Parameter 'data_type' must be str, but got "
+                f"{type(data_type).__name__}. Value: {data_type!r}"
+            )
         if data_type not in self.LEGAL_DATA_TYPES:
             raise ValueError(
-                f"data type should be in {self.LEGAL_DATA_TYPES!s}, found: {data_type}",
+                f"Parameter 'data_type' must be one of {list(self.LEGAL_DATA_TYPES)}, "
+                f"but got {data_type!r}."
             )
 
+        # Validate nominal_values
         if data_type == "nominal":
             if nominal_values is None:
                 raise TypeError(
-                    "Dataset features require attribute `nominal_values` for nominal feature type.",
+                    "Parameter 'nominal_values' is required for nominal data types, "
+                    "but got None. Please provide a list of nominal values."
                 )
-
             if not isinstance(nominal_values, list):
                 raise TypeError(
-                    "Argument `nominal_values` is of wrong datatype, should be list, "
-                    f"but is {type(nominal_values)}",
+                    f"Parameter 'nominal_values' must be list, but got "
+                    f"{type(nominal_values).__name__}. Value: {nominal_values!r}"
+                )
+            if not nominal_values:
+                raise ValueError(
+                    "Parameter 'nominal_values' cannot be empty for nominal data "
+                    "types. Please provide at least one nominal value."
+                )
+            # Validate that all elements are strings
+            non_string_values = [v for v in nominal_values if not isinstance(v, str)]
+            if non_string_values:
+                raise TypeError(
+                    f"All elements in 'nominal_values' must be str, but found "
+                    f"non-string values: {non_string_values}."
                 )
         elif nominal_values is not None:
-            raise TypeError("Argument `nominal_values` must be None for non-nominal feature.")
+            raise TypeError(
+                f"Parameter 'nominal_values' must be None for non-nominal data "
+                f"types (got {data_type!r}), but got {type(nominal_values).__name__}. "
+                f"Value: {nominal_values!r}"
+            )
 
+        # Validate number_missing_values
         if not isinstance(number_missing_values, int):
-            msg = f"number_missing_values must be int but is {type(number_missing_values)}"
-            raise TypeError(msg)
+            raise TypeError(
+                f"Parameter 'number_missing_values' must be int, but got "
+                f"{type(number_missing_values).__name__}. "
+                f"Value: {number_missing_values!r}"
+            )
+        if number_missing_values < 0:
+            raise ValueError(
+                f"Parameter 'number_missing_values' must be non-negative, but got "
+                f"{number_missing_values}. Cannot have negative missing values."
+            )
+
+        # Validate ontologies
+        if ontologies is not None and not isinstance(ontologies, list):
+            raise TypeError(
+                f"Parameter 'ontologies' must be list or None, but got "
+                f"{type(ontologies).__name__}. Value: {ontologies!r}"
+            )
 
+        # All validations passed, assign attributes
         self.index = index
-        self.name = str(name)
-        self.data_type = str(data_type)
+        self.name = name
+        self.data_type = data_type
         self.nominal_values = nominal_values
         self.number_missing_values = number_missing_values
-        self.ontologies = ontologies
+        self.ontologies = ontologies if ontologies is not None else []
 
     def __repr__(self) -> str:
         return f"[{self.index} - {self.name} ({self.data_type})]"

diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
@@ -1022,14 +1022,20 @@ def _parse_features_xml(features_xml_string: str) -> dict[int, OpenMLDataFeature
 
     features: dict[int, OpenMLDataFeature] = {}
     for idx, xmlfeature in enumerate(features_xml["oml:feature"]):
-        nr_missing = xmlfeature.get("oml:number_of_missing_values", 0)
+        nr_missing = xmlfeature.get("oml:number_of_missing_values", "0")
+
+        # FIX: Convert ontology string to list for consistency
+        ontologies = xmlfeature.get("oml:ontology")
+        if isinstance(ontologies, str):
+            ontologies = [ontologies]
+
         feature = OpenMLDataFeature(
             int(xmlfeature["oml:index"]),
             xmlfeature["oml:name"],
             xmlfeature["oml:data_type"],
             xmlfeature.get("oml:nominal_value"),
             int(nr_missing),
-            xmlfeature.get("oml:ontology"),
+            ontologies,  # Now it's always list or None
         )
         if idx != feature.index:
             raise ValueError("Data features not provided in right order")