Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 83 additions & 17 deletions openml/datasets/data_feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,24 +15,33 @@ class OpenMLDataFeature: # noqa: PLW1641
Parameters
----------
index : int
The index of this feature
The index of this feature. Must be non-negative.
name : str
Name of the feature
Name of the feature. Must be a non-empty string.
data_type : str
One of nominal, numeric, string, or date (corresponds to the ARFF format).
nominal_values : list(str)
list of the possible values, in case of nominal attribute
list of the possible values, in case of nominal attribute.
Must be a non-empty list for nominal data types.
number_missing_values : int
Number of rows that have a missing value for this feature.
ontologies : list(str)
Must be non-negative.
ontologies : list(str), optional
list of ontologies attached to this feature. An ontology describes the
concepts that are described in a feature. An ontology is defined by a
URL where the information is provided.

Raises
------
TypeError
If types are incorrect for any parameter.
ValueError
If values are invalid (e.g., negative counts, empty name, invalid data_type).
"""

LEGAL_DATA_TYPES: ClassVar[Sequence[str]] = ["nominal", "numeric", "string", "date"]

def __init__( # noqa: PLR0913
def __init__( # noqa: PLR0913, C901, PLR0912
self,
index: int,
name: str,
Expand All @@ -41,38 +50,95 @@ def __init__( # noqa: PLR0913
number_missing_values: int,
ontologies: list[str] | None = None,
):
# Validate index
if not isinstance(index, int):
raise TypeError(f"Index must be `int` but is {type(index)}")
raise TypeError(
f"Parameter 'index' must be int, but got {type(index).__name__}. Value: {index!r}"
)
if index < 0:
raise ValueError(
f"Parameter 'index' must be non-negative, but got {index}. "
"Feature indices cannot be negative."
)

# Validate name
if not isinstance(name, str):
raise TypeError(
f"Parameter 'name' must be str, but got {type(name).__name__}. Value: {name!r}"
)
if not name.strip():
raise ValueError(f"Parameter 'name' cannot be empty or whitespace-only. Got: {name!r}")

# Validate data_type
if not isinstance(data_type, str):
raise TypeError(
f"Parameter 'data_type' must be str, but got "
f"{type(data_type).__name__}. Value: {data_type!r}"
)
if data_type not in self.LEGAL_DATA_TYPES:
raise ValueError(
f"data type should be in {self.LEGAL_DATA_TYPES!s}, found: {data_type}",
f"Parameter 'data_type' must be one of {list(self.LEGAL_DATA_TYPES)}, "
f"but got {data_type!r}."
)

# Validate nominal_values
if data_type == "nominal":
if nominal_values is None:
raise TypeError(
"Dataset features require attribute `nominal_values` for nominal feature type.",
"Parameter 'nominal_values' is required for nominal data types, "
"but got None. Please provide a list of nominal values."
)

if not isinstance(nominal_values, list):
raise TypeError(
"Argument `nominal_values` is of wrong datatype, should be list, "
f"but is {type(nominal_values)}",
f"Parameter 'nominal_values' must be list, but got "
f"{type(nominal_values).__name__}. Value: {nominal_values!r}"
)
if not nominal_values:
raise ValueError(
"Parameter 'nominal_values' cannot be empty for nominal data "
"types. Please provide at least one nominal value."
)
# Validate that all elements are strings
non_string_values = [v for v in nominal_values if not isinstance(v, str)]
if non_string_values:
raise TypeError(
f"All elements in 'nominal_values' must be str, but found "
f"non-string values: {non_string_values}."
)
elif nominal_values is not None:
raise TypeError("Argument `nominal_values` must be None for non-nominal feature.")
raise TypeError(
f"Parameter 'nominal_values' must be None for non-nominal data "
f"types (got {data_type!r}), but got {type(nominal_values).__name__}. "
f"Value: {nominal_values!r}"
)

# Validate number_missing_values
if not isinstance(number_missing_values, int):
msg = f"number_missing_values must be int but is {type(number_missing_values)}"
raise TypeError(msg)
raise TypeError(
f"Parameter 'number_missing_values' must be int, but got "
f"{type(number_missing_values).__name__}. "
f"Value: {number_missing_values!r}"
)
if number_missing_values < 0:
raise ValueError(
f"Parameter 'number_missing_values' must be non-negative, but got "
f"{number_missing_values}. Cannot have negative missing values."
)

# Validate ontologies
if ontologies is not None and not isinstance(ontologies, list):
raise TypeError(
f"Parameter 'ontologies' must be list or None, but got "
f"{type(ontologies).__name__}. Value: {ontologies!r}"
)

# All validations passed, assign attributes
self.index = index
self.name = str(name)
self.data_type = str(data_type)
self.name = name
self.data_type = data_type
self.nominal_values = nominal_values
self.number_missing_values = number_missing_values
self.ontologies = ontologies
self.ontologies = ontologies if ontologies is not None else []

def __repr__(self) -> str:
    """Return a short bracketed summary of the form ``[index - name (data_type)]``."""
    return f"[{self.index} - {self.name} ({self.data_type})]"
Expand Down
10 changes: 8 additions & 2 deletions openml/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1022,14 +1022,20 @@ def _parse_features_xml(features_xml_string: str) -> dict[int, OpenMLDataFeature

features: dict[int, OpenMLDataFeature] = {}
for idx, xmlfeature in enumerate(features_xml["oml:feature"]):
nr_missing = xmlfeature.get("oml:number_of_missing_values", 0)
nr_missing = xmlfeature.get("oml:number_of_missing_values", "0")

# Wrap a single ontology string in a one-element list so the feature receives a list or None
ontologies = xmlfeature.get("oml:ontology")
if isinstance(ontologies, str):
ontologies = [ontologies]

feature = OpenMLDataFeature(
int(xmlfeature["oml:index"]),
xmlfeature["oml:name"],
xmlfeature["oml:data_type"],
xmlfeature.get("oml:nominal_value"),
int(nr_missing),
xmlfeature.get("oml:ontology"),
ontologies, # Now it's always list or None
)
if idx != feature.index:
raise ValueError("Data features not provided in right order")
Expand Down