Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/orcapod/contexts/data/v0.1.json
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
[{"_type": "types.BuiltinFunctionType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionHandler", "_config": {"function_info_extractor": {"_ref": "function_info_extractor"}}}],
[{"_type": "types.MethodType"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.FunctionHandler", "_config": {"function_info_extractor": {"_ref": "function_info_extractor"}}}],
[{"_type": "builtins.type"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.TypeObjectHandler", "_config": {}}],
[{"_type": "types.GenericAlias"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.GenericAliasHandler", "_config": {}}],
[{"_type": "pyarrow.Table"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.ArrowTableHandler", "_config": {"arrow_hasher": {"_ref": "arrow_hasher"}}}],
[{"_type": "pyarrow.RecordBatch"}, {"_class": "orcapod.hashing.semantic_hashing.builtin_handlers.ArrowTableHandler", "_config": {"arrow_hasher": {"_ref": "arrow_hasher"}}}]
Comment on lines 60 to 65
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The `type_handler_registry` in this context only registers `types.GenericAlias`, whereas `register_builtin_handlers` (as the PR description also states) additionally registers `typing._GenericAlias`. Because the default `semantic_hasher` here uses this JSON-constructed registry, `typing` generics such as `typing.Dict[int, str]` or `typing.Optional[int]` can still raise in strict mode. Consider adding a `typing._GenericAlias` entry (or switching this context to a registry implementation that conditionally registers it).

Copilot uses AI. Check for mistakes.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@brian-arnold this is a good point. Shall we register _GenericAlias as well?

]
Expand Down
35 changes: 35 additions & 0 deletions src/orcapod/hashing/semantic_hashing/builtin_handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,28 @@ def handle(self, obj: Any, hasher: "SemanticHasherProtocol") -> Any:
return f"type:{module}.{qualname}"


class GenericAliasHandler:
    """
    Handler for generic alias type annotations such as ``dict[int, list[int]]``
    (``types.GenericAlias``) and ``typing`` generics (``typing._GenericAlias``).

    Produces a stable dict containing the hashed origin type and a list of
    hashed argument types so that structurally identical generic annotations
    always yield the same payload, and structurally different ones yield
    different payloads.

    For ``typing.Union`` (which includes ``Optional``) and ``typing.Literal``
    the argument hashes are sorted: Python compares those aliases without
    regard to argument order, so equal annotations must map to equal payloads.
    """

    def handle(self, obj: Any, hasher: "SemanticHasherProtocol") -> Any:
        """
        Convert a generic alias into a hashable, order-stable structure.

        Args:
            obj: The alias object; its ``__origin__`` and ``__args__``
                attributes are read if present.
            hasher: Used to hash the origin and every argument via
                ``hasher.hash_object(...).to_string()``.

        Returns:
            ``"generic_alias:<repr>"`` when ``obj`` has no ``__origin__``
            (or it is ``None``); otherwise a dict with the keys
            ``"__type__"``, ``"origin"`` and ``"args"``.
        """
        # Function-scope import, matching the local-import style used by the
        # registration code in this module.
        import typing as _typing

        origin = getattr(obj, "__origin__", None)
        if origin is None:
            # Not a parameterized alias: fall back to the textual form.
            return f"generic_alias:{obj!r}"

        args = getattr(obj, "__args__", None) or ()
        arg_hashes = [hasher.hash_object(arg).to_string() for arg in args]

        # Union[int, str] == Union[str, int] (likewise for Literal), so the
        # argument order must not leak into the hashed structure.
        if origin is _typing.Union or origin is _typing.Literal:
            arg_hashes.sort()

        return {
            "__type__": "generic_alias",
            "origin": hasher.hash_object(origin).to_string(),
            "args": arg_hashes,
        }
Comment on lines +175 to +194
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are extensive semantic hasher tests, but none appear to cover parameterized generics (e.g. `dict[int, str]`, `list[int]`, `typing.Dict[int, str]`). Adding a regression test that hashes these types in strict mode (and asserts stable equality/inequality across different args) would prevent this handler from silently breaking in future refactors.

Copilot uses AI. Check for mistakes.


class ArrowTableHandler:
"""
Handler for ``pa.Table`` and ``pa.RecordBatch`` objects.
Expand Down Expand Up @@ -321,6 +343,19 @@ def register_builtin_handlers(
# type objects (classes used as values, e.g. passed in a dict)
registry.register(type, TypeObjectHandler())

# generic alias type annotations: dict[int, str], list[str], etc.
import types as _types
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`register_builtin_handlers` runs `import types as _types` twice (once for functions, again for `GenericAlias`). This is redundant and makes the section a bit harder to read; you can reuse the existing `_types` import from the functions block instead of re-importing it.

Suggested change
import types as _types

Copilot uses AI. Check for mistakes.

generic_alias_handler = GenericAliasHandler()
registry.register(_types.GenericAlias, generic_alias_handler)
# typing._GenericAlias covers Optional[X], Union[X, Y], Dict[K, V], etc.
try:
import typing as _typing

registry.register(_typing._GenericAlias, generic_alias_handler) # type: ignore[attr-defined]
except AttributeError:
pass

# Schema objects -- must come after type handler so Schema is matched
# specifically rather than falling through to the Mapping expansion path
registry.register(Schema, SchemaHandler())
Expand Down
4 changes: 2 additions & 2 deletions src/orcapod/semantic_types/universal_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -632,7 +632,7 @@ def _create_python_to_arrow_converter(
element_converter = self.get_python_to_arrow_converter(args[0])
return (
lambda value: [element_converter(item) for item in value]
if value
if value is not None
else []
)

Expand All @@ -644,7 +644,7 @@ def _create_python_to_arrow_converter(
{"key": key_converter(k), "value": value_converter(v)}
for k, v in value.items()
]
if value
if value is not None
else []
)

Expand Down