From bb4b6c481b5dad0ba705accfead6f28350c5e551 Mon Sep 17 00:00:00 2001 From: Emphor <022dakshy@gmail.com> Date: Tue, 17 Mar 2026 17:12:16 +0530 Subject: [PATCH 1/2] Fix incorrect error message for invalid cache_format in get_dataset --- openml/datasets/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 432938520..3f482612c 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -456,7 +456,7 @@ def get_dataset( # noqa: C901, PLR0912 if cache_format not in ["feather", "pickle"]: raise ValueError( - "cache_format must be one of 'feather' or 'pickle. " + "cache_format must be one of 'feather' or 'pickle'. " f"Invalid format specified: {cache_format}", ) From a3bf3c50aefb5e7e0f6186a0f76a103840f95a2a Mon Sep 17 00:00:00 2001 From: Emphor <022dakshy@gmail.com> Date: Wed, 18 Mar 2026 00:45:35 +0530 Subject: [PATCH 2/2] ENH: Add TTL-based cache invalidation for dataset description cache --- openml/datasets/functions.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 3f482612c..9a63a944a 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -2,8 +2,10 @@ # ruff: noqa: PLR0913 from __future__ import annotations +import contextlib import logging import os +import time import warnings from collections import OrderedDict from functools import partial @@ -1071,6 +1073,9 @@ def _topic_delete_dataset(data_id: int, topic: str) -> int: return int(data_id) +DESCRIPTION_CACHE_TTL = 86400 # 24 hours + + def _get_dataset_description(did_cache_dir: Path, dataset_id: int) -> dict[str, Any]: """Get the dataset description as xml dictionary. @@ -1090,11 +1095,15 @@ def _get_dataset_description(did_cache_dir: Path, dataset_id: int) -> dict[str, XML Dataset description parsed to a dict. """ - # TODO implement a cache for this that invalidates itself after some time - # This can be saved on disk, but cannot be cached properly, because - # it contains the information on whether a dataset is active. description_file = did_cache_dir / "description.xml" + # Invalidate stale cache + if description_file.is_file(): + file_age = time.time() - description_file.stat().st_mtime + if file_age >= DESCRIPTION_CACHE_TTL: + with contextlib.suppress(OSError): + description_file.unlink() + try: with description_file.open(encoding="utf8") as fh: dataset_xml = fh.read()