class SpecialFormHandler:
    """
    Handler for ``typing._SpecialForm`` objects such as ``typing.Union`` and
    ``typing.ClassVar``.

    These appear as the ``__origin__`` of typing generics — for example,
    ``Optional[int]`` is ``Union[int, None]``, whose ``__origin__`` is
    ``typing.Union``.  Returns a stable string of the form
    ``"special_form:typing.<name>"`` so they can be safely embedded as the
    origin component inside a ``GenericAliasHandler`` result.
    """

    def handle(self, obj: Any, hasher: "SemanticHasherProtocol") -> Any:
        """
        Return the stable string identifying the special form *obj*.

        Args:
            obj: The object to describe.  Its ``_name`` attribute is used
                when present and non-empty; otherwise ``repr(obj)`` is used.
            hasher: Unused; accepted so the signature matches the other
                handlers in this module.

        Returns:
            A string of the form ``"special_form:typing.<name>"``.
        """
        name = getattr(obj, "_name", None)
        if not name:
            # The repr of typing objects already starts with "typing."; strip
            # it so the fallback does not yield "special_form:typing.typing.X".
            name = repr(obj).removeprefix("typing.")
        return f"special_form:typing.{name}"


class GenericAliasHandler:
    """
    Handler for generic alias type annotations such as ``dict[int, list[int]]``
    (``types.GenericAlias``) and ``typing`` generics (``typing._GenericAlias``).

    Produces a stable dict containing the origin type and a list of hashed
    argument types so that structurally identical generic annotations always
    yield the same hash, and structurally different ones yield different hashes.
    Arguments are hashed in declaration order.
    """

    def handle(self, obj: Any, hasher: "SemanticHasherProtocol") -> Any:
        """
        Convert the generic alias *obj* into a hashable structure.

        Args:
            obj: The annotation object.  Its ``__origin__``, ``__args__`` and
                (when present) ``__metadata__`` attributes are read.
            hasher: Used to hash the origin, every argument and every
                metadata item via ``hash_object(...).to_string()``.

        Returns:
            ``"generic_alias:<repr>"`` when *obj* has no ``__origin__``;
            otherwise a dict with the keys ``"__type__"``, ``"origin"`` and
            ``"args"``, plus ``"metadata"`` when *obj* carries a non-empty
            ``__metadata__`` tuple (as ``typing.Annotated`` aliases do).
        """
        origin = getattr(obj, "__origin__", None)
        if origin is None:
            return f"generic_alias:{obj!r}"

        # NOTE(review): args are handed to the hasher as-is, including values
        # such as Ellipsis (``tuple[int, ...]``) -- confirm the hasher accepts
        # them.
        args = getattr(obj, "__args__", None) or ()
        payload = {
            "__type__": "generic_alias",
            "origin": hasher.hash_object(origin).to_string(),
            "args": [hasher.hash_object(arg).to_string() for arg in args],
        }

        # ``Annotated[T, m1, ...]`` exposes T as both ``__origin__`` and the
        # only entry of ``__args__``; the annotations themselves live in
        # ``__metadata__``.  Without them, ``Annotated[int, "a"]`` and
        # ``Annotated[int, "b"]`` would produce identical payloads.
        metadata = getattr(obj, "__metadata__", None)
        if isinstance(metadata, tuple) and metadata:
            payload["metadata"] = [
                hasher.hash_object(item).to_string() for item in metadata
            ]
        return payload
class TestGenericAliasHandler:
    """Semantic hashing of generic alias annotations (builtin and ``typing`` forms)."""

    def test_builtin_generic_alias_hashed(self, hasher):
        """A ``types.GenericAlias`` such as ``dict[str, int]`` hashes to a ContentHash."""
        digest = hasher.hash_object(dict[str, int])
        assert isinstance(digest, ContentHash)

    def test_typing_generic_alias_hashed(self, hasher):
        """A ``typing`` generic such as ``Dict[str, int]`` hashes to a ContentHash."""
        import typing

        digest = hasher.hash_object(typing.Dict[str, int])
        assert isinstance(digest, ContentHash)

    def test_optional_hashed(self, hasher):
        """``Optional[int]`` hashes to a ContentHash."""
        import typing

        digest = hasher.hash_object(typing.Optional[int])
        assert isinstance(digest, ContentHash)

    def test_same_annotation_same_hash(self, hasher):
        """Hashing the same annotation twice gives equal results."""
        first = hasher.hash_object(list[int])
        second = hasher.hash_object(list[int])
        assert first == second

    def test_different_annotations_differ(self, hasher):
        """Swapping the arguments of ``dict[str, int]`` changes the hash."""
        str_to_int = hasher.hash_object(dict[str, int])
        int_to_str = hasher.hash_object(dict[int, str])
        assert str_to_int != int_to_str

    def test_different_containers_differ(self, hasher):
        """The same argument under ``list`` and ``tuple`` hashes differently."""
        as_list = hasher.hash_object(list[int])
        as_tuple = hasher.hash_object(tuple[int])
        assert as_list != as_tuple

    def test_nested_generic_hashed(self, hasher):
        """A nested generic such as ``dict[str, list[int]]`` hashes to a ContentHash."""
        digest = hasher.hash_object(dict[str, list[int]])
        assert isinstance(digest, ContentHash)

    def test_nested_generic_discrimination(self, hasher):
        """A difference in an inner argument changes the outer hash."""
        inner_int = hasher.hash_object(dict[str, list[int]])
        inner_str = hasher.hash_object(dict[str, list[str]])
        assert inner_int != inner_str

    def test_builtin_and_typing_equivalent(self, hasher):
        """``dict[str, int]`` and ``typing.Dict[str, int]`` hash to the same value."""
        import typing

        builtin_form = hasher.hash_object(dict[str, int])
        typing_form = hasher.hash_object(typing.Dict[str, int])
        assert builtin_form == typing_form

    def test_optional_equals_union_with_none(self, hasher):
        """``Optional[int]`` and ``Union[int, None]`` must hash identically."""
        import typing

        optional_form = hasher.hash_object(typing.Optional[int])
        union_form = hasher.hash_object(typing.Union[int, None])
        assert optional_form == union_form

    def test_union_arg_order_matters(self, hasher):
        """``Union[int, str]`` and ``Union[str, int]`` hash differently.

        Python's type system treats the two as equivalent, but orcapod hashes
        the arguments in declaration order, so users should spell unions
        consistently.
        """
        import typing

        int_first = hasher.hash_object(typing.Union[int, str])
        str_first = hasher.hash_object(typing.Union[str, int])
        assert int_first != str_first