From 7333d089805c64c81f995dab229d4e155287ff22 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 5 Mar 2026 11:17:16 +0000
Subject: [PATCH 1/6] refactor(hashing): consolidate file hashing via
 PathStructConverter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Inject file_hasher into PathStructConverter so both the Arrow hasher
path (via SemanticHashingVisitor → hash_struct_dict) and the semantic
hasher path (via PathContentHandler) delegate to the same
FileContentHasherProtocol instance.

Changes:
- PathStructConverter now requires file_hasher in __init__
- Implement hash_struct_dict() delegating to injected file_hasher
- Reorder v0.1.json so file_hasher is instantiated before
  semantic_registry (ref resolution is sequential)
- Add file_hasher ref to PathStructConverter config in v0.1.json
- Update versioned_hashers.py factory to pass BasicFileHasher
- Update and uncomment PathStructConverter tests
- Add integration tests for cross-path hash consistency

https://claude.ai/code/session_01PBeavr3pTFu5sPhWUxNx2r
---
 src/orcapod/contexts/data/v0.1.json           |  16 +-
 src/orcapod/hashing/versioned_hashers.py      |   4 +-
 .../semantic_struct_converters.py             |  34 ++-
 .../test_file_hashing_consistency.py          | 212 ++++++++++++++++++
 .../test_path_struct_converter.py             | 141 ++++++------
 5 files changed, 333 insertions(+), 74 deletions(-)
 create mode 100644 tests/test_hashing/test_file_hashing_consistency.py

diff --git a/src/orcapod/contexts/data/v0.1.json b/src/orcapod/contexts/data/v0.1.json
index 41a1aa03..0b3fee9d 100644
--- a/src/orcapod/contexts/data/v0.1.json
+++ b/src/orcapod/contexts/data/v0.1.json
@@ -2,13 +2,21 @@
     "context_key": "std:v0.1:default",
     "version": "v0.1",
     "description": "Initial stable release with basic Path semantic type support",
+    "file_hasher": {
+        "_class": "orcapod.hashing.file_hashers.BasicFileHasher",
+        "_config": {
+            "algorithm": "sha256"
+        }
+    },
     "semantic_registry": {
         "_class": "orcapod.semantic_types.semantic_registry.SemanticTypeRegistry",
         "_config": {
             "converters": {
                 "path": {
                     "_class": "orcapod.semantic_types.semantic_struct_converters.PathStructConverter",
-                    "_config": {}
+                    "_config": {
+                        "file_hasher": {"_ref": "file_hasher"}
+                    }
                 }
             }
         }
@@ -33,12 +41,6 @@
             }
         }
     },
-    "file_hasher": {
-        "_class": "orcapod.hashing.file_hashers.BasicFileHasher",
-        "_config": {
-            "algorithm": "sha256"
-        }
-    },
     "function_info_extractor": {
         "_class": "orcapod.hashing.semantic_hashing.function_info_extractors.FunctionSignatureExtractor",
         "_config": {
diff --git a/src/orcapod/hashing/versioned_hashers.py b/src/orcapod/hashing/versioned_hashers.py
index fa76bb11..8adce44d 100644
--- a/src/orcapod/hashing/versioned_hashers.py
+++ b/src/orcapod/hashing/versioned_hashers.py
@@ -124,6 +124,7 @@ def get_versioned_semantic_arrow_hasher(
         A fully configured SemanticArrowHasher instance.
     """
     from orcapod.hashing.arrow_hashers import SemanticArrowHasher
+    from orcapod.hashing.file_hashers import BasicFileHasher
     from orcapod.semantic_types.semantic_registry import SemanticTypeRegistry
     from orcapod.semantic_types.semantic_struct_converters import PathStructConverter
 
@@ -132,7 +133,8 @@ def get_versioned_semantic_arrow_hasher(
     # that arise from the protocol definition of SemanticStructConverterProtocol having
     # a slightly different hash_struct_dict signature than the concrete class.
     registry: Any = SemanticTypeRegistry()
-    path_converter: Any = PathStructConverter()
+    file_hasher = BasicFileHasher(algorithm="sha256")
+    path_converter: Any = PathStructConverter(file_hasher=file_hasher)
     registry.register_converter("path", path_converter)
 
     logger.debug(
diff --git a/src/orcapod/semantic_types/semantic_struct_converters.py b/src/orcapod/semantic_types/semantic_struct_converters.py
index a7effd1f..63d1e236 100644
--- a/src/orcapod/semantic_types/semantic_struct_converters.py
+++ b/src/orcapod/semantic_types/semantic_struct_converters.py
@@ -13,6 +13,8 @@
 
 if TYPE_CHECKING:
     import pyarrow as pa
+
+    from orcapod.protocols.hashing_protocols import FileContentHasherProtocol
 else:
     pa = LazyModule("pyarrow")
 
@@ -76,9 +78,10 @@ def _compute_content_hash(self, content: bytes) -> ContentHash:
 class PathStructConverter(SemanticStructConverterBase):
     """Converter for pathlib.Path objects to/from semantic structs of form { path: "/value/of/path"}"""
 
-    def __init__(self):
+    def __init__(self, file_hasher: "FileContentHasherProtocol"):
         super().__init__("path")
         self._python_type = Path
+        self._file_hasher = file_hasher
 
         # Define the Arrow struct type for paths
         self._arrow_struct_type = pa.struct(
@@ -134,3 +137,32 @@ def is_semantic_struct(self, struct_dict: dict[str, Any]) -> bool:
         return set(struct_dict.keys()) == {"path"} and isinstance(
             struct_dict["path"], str
         )
+
+    def hash_struct_dict(
+        self, struct_dict: dict[str, Any], add_prefix: bool = False
+    ) -> str:
+        """Compute hash of a path semantic type by hashing the file content.
+
+        Args:
+            struct_dict: Dict with a "path" key containing a file path string.
+            add_prefix: If True, prefix with "path:sha256:...".
+
+        Returns:
+            Hash string of the file content.
+
+        Raises:
+            FileNotFoundError: If the path does not exist.
+            IsADirectoryError: If the path is a directory.
+        """
+        path_str = struct_dict.get("path")
+        if path_str is None:
+            raise ValueError("Missing 'path' field in struct dict")
+
+        path = Path(path_str)
+        if not path.exists():
+            raise FileNotFoundError(f"Path does not exist: {path}")
+        if path.is_dir():
+            raise IsADirectoryError(f"Path is a directory: {path}")
+
+        content_hash = self._file_hasher.hash_file(path)
+        return self._format_hash_string(content_hash.digest, add_prefix=add_prefix)
diff --git a/tests/test_hashing/test_file_hashing_consistency.py b/tests/test_hashing/test_file_hashing_consistency.py
new file mode 100644
index 00000000..339298b8
--- /dev/null
+++ b/tests/test_hashing/test_file_hashing_consistency.py
@@ -0,0 +1,212 @@
+"""
+Integration tests verifying that file hashing is consistent across both paths:
+
+1. **Arrow hasher path**: SemanticArrowHasher processes an Arrow table containing a
+   path struct column → calls PathStructConverter.hash_struct_dict → file_hasher.
+2. **Semantic hasher path**: BaseSemanticHasher hashes a Python Path object →
+   calls PathContentHandler.handle → file_hasher.
+
+Both paths must delegate to the same FileContentHasherProtocol so that identical
+file content always produces identical hashes, regardless of entry point.
+"""
+
+from pathlib import Path
+
+import pyarrow as pa
+import pytest
+
+from orcapod.hashing.arrow_hashers import SemanticArrowHasher
+from orcapod.hashing.file_hashers import BasicFileHasher
+from orcapod.hashing.semantic_hashing.builtin_handlers import (
+    PathContentHandler,
+    register_builtin_handlers,
+)
+from orcapod.hashing.semantic_hashing.semantic_hasher import BaseSemanticHasher
+from orcapod.hashing.semantic_hashing.type_handler_registry import TypeHandlerRegistry
+from orcapod.semantic_types.semantic_registry import SemanticTypeRegistry
+from orcapod.semantic_types.semantic_struct_converters import PathStructConverter
+
+
+# ---------------------------------------------------------------------------
+# Shared fixtures
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def file_hasher():
+    """Single file hasher instance shared by both paths."""
+    return BasicFileHasher(algorithm="sha256")
+
+
+@pytest.fixture
+def path_converter(file_hasher):
+    return PathStructConverter(file_hasher=file_hasher)
+
+
+@pytest.fixture
+def arrow_hasher(path_converter):
+    """SemanticArrowHasher wired with the shared file_hasher via PathStructConverter."""
+    registry = SemanticTypeRegistry()
+    registry.register_converter("path", path_converter)
+    return SemanticArrowHasher(semantic_registry=registry)
+
+
+@pytest.fixture
+def semantic_hasher(file_hasher):
+    """BaseSemanticHasher wired with the shared file_hasher via PathContentHandler."""
+    registry = TypeHandlerRegistry()
+    register_builtin_handlers(registry, file_hasher=file_hasher)
+    return BaseSemanticHasher(
+        hasher_id="test_v1", type_handler_registry=registry, strict=True
+    )
+
+
+# ---------------------------------------------------------------------------
+# Arrow struct hasher: path column tests
+# ---------------------------------------------------------------------------
+
+
+class TestArrowStructPathHashing:
+    """Tests for file hashing through the Arrow hasher path."""
+
+    def test_same_content_different_paths_same_hash(
+        self, arrow_hasher, tmp_path
+    ):
+        """Two distinct files with identical content produce the same table hash."""
+        file1 = tmp_path / "a.txt"
+        file2 = tmp_path / "b.txt"
+        file1.write_text("identical content")
+        file2.write_text("identical content")
+
+        table1 = pa.table(
+            {"file": [{"path": str(file1)}]},
+            schema=pa.schema([pa.field("file", pa.struct([pa.field("path", pa.large_string())]))]),
+        )
+        table2 = pa.table(
+            {"file": [{"path": str(file2)}]},
+            schema=pa.schema([pa.field("file", pa.struct([pa.field("path", pa.large_string())]))]),
+        )
+
+        hash1 = arrow_hasher.hash_table(table1)
+        hash2 = arrow_hasher.hash_table(table2)
+        assert hash1.digest == hash2.digest
+
+    def test_modified_content_different_hash(self, arrow_hasher, tmp_path):
+        """Same path with modified content between hashes yields different hash."""
+        file = tmp_path / "mutable.txt"
+        file.write_text("version 1")
+
+        schema = pa.schema([pa.field("file", pa.struct([pa.field("path", pa.large_string())]))])
+        table_v1 = pa.table({"file": [{"path": str(file)}]}, schema=schema)
+        hash1 = arrow_hasher.hash_table(table_v1)
+
+        file.write_text("version 2")
+        table_v2 = pa.table({"file": [{"path": str(file)}]}, schema=schema)
+        hash2 = arrow_hasher.hash_table(table_v2)
+
+        assert hash1.digest != hash2.digest
+
+    def test_different_content_different_hash(self, arrow_hasher, tmp_path):
+        """Two files with different content produce different table hashes."""
+        file1 = tmp_path / "x.txt"
+        file2 = tmp_path / "y.txt"
+        file1.write_text("content A")
+        file2.write_text("content B")
+
+        schema = pa.schema([pa.field("file", pa.struct([pa.field("path", pa.large_string())]))])
+        table1 = pa.table({"file": [{"path": str(file1)}]}, schema=schema)
+        table2 = pa.table({"file": [{"path": str(file2)}]}, schema=schema)
+
+        hash1 = arrow_hasher.hash_table(table1)
+        hash2 = arrow_hasher.hash_table(table2)
+        assert hash1.digest != hash2.digest
+
+
+# ---------------------------------------------------------------------------
+# Semantic hasher: Path object tests
+# ---------------------------------------------------------------------------
+
+
+class TestSemanticPathHashing:
+    """Tests for file hashing through the semantic hasher path."""
+
+    def test_same_content_different_paths_same_hash(
+        self, semantic_hasher, tmp_path
+    ):
+        """Two distinct Path objects pointing to files with identical content."""
+        file1 = tmp_path / "a.txt"
+        file2 = tmp_path / "b.txt"
+        file1.write_text("identical content")
+        file2.write_text("identical content")
+
+        hash1 = semantic_hasher.hash_object(Path(file1))
+        hash2 = semantic_hasher.hash_object(Path(file2))
+        assert hash1.digest == hash2.digest
+
+    def test_modified_content_different_hash(self, semantic_hasher, tmp_path):
+        """Same Path with modified content between hashes."""
+        file = tmp_path / "mutable.txt"
+        file.write_text("version 1")
+        hash1 = semantic_hasher.hash_object(Path(file))
+
+        file.write_text("version 2")
+        hash2 = semantic_hasher.hash_object(Path(file))
+        assert hash1.digest != hash2.digest
+
+    def test_different_content_different_hash(self, semantic_hasher, tmp_path):
+        """Two Paths pointing to different content produce different hashes."""
+        file1 = tmp_path / "x.txt"
+        file2 = tmp_path / "y.txt"
+        file1.write_text("content A")
+        file2.write_text("content B")
+
+        hash1 = semantic_hasher.hash_object(Path(file1))
+        hash2 = semantic_hasher.hash_object(Path(file2))
+        assert hash1.digest != hash2.digest
+
+
+# ---------------------------------------------------------------------------
+# Cross-path consistency
+# ---------------------------------------------------------------------------
+
+
+class TestCrossPathConsistency:
+    """Verify that the arrow hasher and semantic hasher use the same file_hasher
+    and produce equivalent file content hashes for the same underlying file."""
+
+    def test_arrow_and_semantic_hash_same_file_content(
+        self, path_converter, semantic_hasher, file_hasher, tmp_path
+    ):
+        """The file content hash extracted by PathStructConverter.hash_struct_dict
+        must match the ContentHash produced by PathContentHandler.handle (which
+        the semantic hasher uses internally for Path objects).
+
+        We compare at the file_hasher level: both paths ultimately call
+        file_hasher.hash_file(path), so the raw digest must be identical.
+        """
+        file = tmp_path / "shared.txt"
+        file.write_text("shared content for both paths")
+
+        # Arrow path: PathStructConverter.hash_struct_dict (no prefix)
+        arrow_hash_hex = path_converter.hash_struct_dict({"path": str(file)})
+
+        # Semantic path: file_hasher.hash_file directly (same as PathContentHandler)
+        semantic_content_hash = file_hasher.hash_file(file)
+
+        assert arrow_hash_hex == semantic_content_hash.digest.hex()
+
+    def test_arrow_and_semantic_same_content_two_files(
+        self, path_converter, file_hasher, tmp_path
+    ):
+        """Two files with identical content: arrow struct hash_struct_dict and
+        direct file_hasher.hash_file produce the same digest."""
+        file1 = tmp_path / "file_arrow.txt"
+        file2 = tmp_path / "file_semantic.txt"
+        content = "same content for cross-path test"
+        file1.write_text(content)
+        file2.write_text(content)
+
+        arrow_hex = path_converter.hash_struct_dict({"path": str(file1)})
+        semantic_hex = file_hasher.hash_file(file2).digest.hex()
+
+        assert arrow_hex == semantic_hex
diff --git a/tests/test_semantic_types/test_path_struct_converter.py b/tests/test_semantic_types/test_path_struct_converter.py
index d6e12644..73ae46e6 100644
--- a/tests/test_semantic_types/test_path_struct_converter.py
+++ b/tests/test_semantic_types/test_path_struct_converter.py
@@ -1,14 +1,23 @@
 from pathlib import Path
 from typing import cast
-from unittest.mock import patch
 
 import pytest
 
+from orcapod.hashing.file_hashers import BasicFileHasher
 from orcapod.semantic_types.semantic_struct_converters import PathStructConverter
 
 
-def test_path_to_struct_and_back():
-    converter = PathStructConverter()
+@pytest.fixture
+def file_hasher():
+    return BasicFileHasher(algorithm="sha256")
+
+
+@pytest.fixture
+def converter(file_hasher):
+    return PathStructConverter(file_hasher=file_hasher)
+
+
+def test_path_to_struct_and_back(converter):
     path_obj = Path("/tmp/test.txt")
     struct_dict = converter.python_to_struct_dict(path_obj)
     assert struct_dict["path"] == str(path_obj)
@@ -16,26 +25,22 @@ def test_path_to_struct_and_back():
     assert restored == path_obj
 
 
-def test_path_to_struct_invalid_type():
-    converter = PathStructConverter()
+def test_path_to_struct_invalid_type(converter):
     with pytest.raises(TypeError):
         converter.python_to_struct_dict("not_a_path")  # type: ignore
 
 
-def test_struct_to_python_missing_field():
-    converter = PathStructConverter()
+def test_struct_to_python_missing_field(converter):
     with pytest.raises(ValueError):
         converter.struct_dict_to_python({})
 
 
-def test_can_handle_python_type():
-    converter = PathStructConverter()
+def test_can_handle_python_type(converter):
     assert converter.can_handle_python_type(Path)
     assert not converter.can_handle_python_type(str)
 
 
-def test_can_handle_struct_type():
-    converter = PathStructConverter()
+def test_can_handle_struct_type(converter):
     struct_type = converter.arrow_struct_type
     assert converter.can_handle_struct_type(struct_type)
 
@@ -60,62 +65,68 @@ def names(self):
     assert not converter.can_handle_struct_type(fake_struct)
 
 
-def test_is_semantic_struct():
-    converter = PathStructConverter()
+def test_is_semantic_struct(converter):
     assert converter.is_semantic_struct({"path": "/tmp/test.txt"})
     assert not converter.is_semantic_struct({"not_path": "value"})
     assert not converter.is_semantic_struct({"path": 123})
 
 
-# def test_hash_struct_dict_file_not_found(tmp_path):
-#     converter = PathStructConverter()
-#     struct_dict = {"path": str(tmp_path / "does_not_exist.txt")}
-#     with pytest.raises(FileNotFoundError):
-#         converter.hash_struct_dict(struct_dict)
-
-
-# def test_hash_struct_dict_permission_error(tmp_path):
-#     converter = PathStructConverter()
-#     file_path = tmp_path / "file.txt"
-#     file_path.write_text("data")
-#     with patch("pathlib.Path.read_bytes", side_effect=PermissionError):
-#         struct_dict = {"path": str(file_path)}
-#         with pytest.raises(PermissionError):
-#             converter.hash_struct_dict(struct_dict)
-
-
-# def test_hash_struct_dict_is_directory(tmp_path):
-#     converter = PathStructConverter()
-#     struct_dict = {"path": str(tmp_path)}
-#     with pytest.raises(ValueError):
-#         converter.hash_struct_dict(struct_dict)
-
-
-# def test_hash_struct_dict_content_based(tmp_path):
-#     converter = PathStructConverter()
-#     file1 = tmp_path / "file1.txt"
-#     file2 = tmp_path / "file2.txt"
-#     content = "identical content"
-#     file1.write_text(content)
-#     file2.write_text(content)
-#     struct_dict1 = {"path": str(file1)}
-#     struct_dict2 = {"path": str(file2)}
-#     hash1 = converter.hash_struct_dict(struct_dict1)
-#     hash2 = converter.hash_struct_dict(struct_dict2)
-#     assert hash1 == hash2
-
-
-# def test_hash_path_objects_content_based(tmp_path):
-#     converter = PathStructConverter()
-#     file1 = tmp_path / "fileA.txt"
-#     file2 = tmp_path / "fileB.txt"
-#     content = "same file content"
-#     file1.write_text(content)
-#     file2.write_text(content)
-#     path_obj1 = Path(file1)
-#     path_obj2 = Path(file2)
-#     struct_dict1 = converter.python_to_struct_dict(path_obj1)
-#     struct_dict2 = converter.python_to_struct_dict(path_obj2)
-#     hash1 = converter.hash_struct_dict(struct_dict1)
-#     hash2 = converter.hash_struct_dict(struct_dict2)
-#     assert hash1 == hash2
+def test_hash_struct_dict_file_not_found(converter, tmp_path):
+    struct_dict = {"path": str(tmp_path / "does_not_exist.txt")}
+    with pytest.raises(FileNotFoundError):
+        converter.hash_struct_dict(struct_dict)
+
+
+def test_hash_struct_dict_is_directory(converter, tmp_path):
+    struct_dict = {"path": str(tmp_path)}
+    with pytest.raises(IsADirectoryError):
+        converter.hash_struct_dict(struct_dict)
+
+
+def test_hash_struct_dict_content_based(converter, tmp_path):
+    """Two distinct files with identical content produce the same hash."""
+    file1 = tmp_path / "file1.txt"
+    file2 = tmp_path / "file2.txt"
+    content = "identical content"
+    file1.write_text(content)
+    file2.write_text(content)
+    hash1 = converter.hash_struct_dict({"path": str(file1)})
+    hash2 = converter.hash_struct_dict({"path": str(file2)})
+    assert hash1 == hash2
+
+
+def test_hash_path_objects_content_based(converter, tmp_path):
+    """Round-trip through python_to_struct_dict then hash_struct_dict."""
+    file1 = tmp_path / "fileA.txt"
+    file2 = tmp_path / "fileB.txt"
+    content = "same file content"
+    file1.write_text(content)
+    file2.write_text(content)
+    struct_dict1 = converter.python_to_struct_dict(Path(file1))
+    struct_dict2 = converter.python_to_struct_dict(Path(file2))
+    hash1 = converter.hash_struct_dict(struct_dict1)
+    hash2 = converter.hash_struct_dict(struct_dict2)
+    assert hash1 == hash2
+
+
+def test_hash_struct_dict_with_prefix(converter, tmp_path):
+    """Prefixed hash starts with 'path:sha256:'."""
+    file = tmp_path / "file.txt"
+    file.write_text("hello")
+    hash_str = converter.hash_struct_dict({"path": str(file)}, add_prefix=True)
+    assert hash_str.startswith("path:sha256:")
+
+
+def test_hash_struct_dict_different_content(converter, tmp_path):
+    """Same path with modified content yields a different hash."""
+    file = tmp_path / "mutable.txt"
+    file.write_text("version 1")
+    hash1 = converter.hash_struct_dict({"path": str(file)})
+    file.write_text("version 2")
+    hash2 = converter.hash_struct_dict({"path": str(file)})
+    assert hash1 != hash2
+
+
+def test_hash_struct_dict_missing_path_field(converter):
+    with pytest.raises(ValueError, match="Missing 'path' field"):
+        converter.hash_struct_dict({})

From 298520022a582b432338dc47565125584b5b0df7 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 5 Mar 2026 11:20:37 +0000
Subject: [PATCH 2/6] fix(hashing): handle BasicFileHasher returning raw bytes
 in hash_struct_dict

BasicFileHasher.hash_file() returns raw bytes despite the protocol
declaring ContentHash. Handle both cases defensively in
PathStructConverter.hash_struct_dict() and in the integration tests.

https://claude.ai/code/session_01PBeavr3pTFu5sPhWUxNx2r
---
 src/orcapod/semantic_types/semantic_struct_converters.py | 5 ++++-
 tests/test_hashing/test_file_hashing_consistency.py      | 7 +++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/orcapod/semantic_types/semantic_struct_converters.py b/src/orcapod/semantic_types/semantic_struct_converters.py
index 63d1e236..b634214a 100644
--- a/src/orcapod/semantic_types/semantic_struct_converters.py
+++ b/src/orcapod/semantic_types/semantic_struct_converters.py
@@ -165,4 +165,7 @@ def hash_struct_dict(
             raise IsADirectoryError(f"Path is a directory: {path}")
 
         content_hash = self._file_hasher.hash_file(path)
-        return self._format_hash_string(content_hash.digest, add_prefix=add_prefix)
+        # BasicFileHasher.hash_file returns raw bytes despite the protocol
+        # declaring ContentHash. Handle both cases defensively.
+        digest = content_hash.digest if hasattr(content_hash, "digest") else content_hash
+        return self._format_hash_string(digest, add_prefix=add_prefix)
diff --git a/tests/test_hashing/test_file_hashing_consistency.py b/tests/test_hashing/test_file_hashing_consistency.py
index 339298b8..99311fef 100644
--- a/tests/test_hashing/test_file_hashing_consistency.py
+++ b/tests/test_hashing/test_file_hashing_consistency.py
@@ -191,9 +191,11 @@ def test_arrow_and_semantic_hash_same_file_content(
         arrow_hash_hex = path_converter.hash_struct_dict({"path": str(file)})
 
         # Semantic path: file_hasher.hash_file directly (same as PathContentHandler)
+        # BasicFileHasher.hash_file returns raw bytes
         semantic_content_hash = file_hasher.hash_file(file)
+        semantic_hex = semantic_content_hash.hex() if isinstance(semantic_content_hash, bytes) else semantic_content_hash.digest.hex()
 
-        assert arrow_hash_hex == semantic_content_hash.digest.hex()
+        assert arrow_hash_hex == semantic_hex
 
     def test_arrow_and_semantic_same_content_two_files(
         self, path_converter, file_hasher, tmp_path
@@ -207,6 +209,7 @@ def test_arrow_and_semantic_same_content_two_files(
         file2.write_text(content)
 
         arrow_hex = path_converter.hash_struct_dict({"path": str(file1)})
-        semantic_hex = file_hasher.hash_file(file2).digest.hex()
+        raw = file_hasher.hash_file(file2)
+        semantic_hex = raw.hex() if isinstance(raw, bytes) else raw.digest.hex()
 
         assert arrow_hex == semantic_hex

From 0d1d135c65e34f2f5687e6fc3467d13c80626b98 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 5 Mar 2026 11:29:28 +0000
Subject: [PATCH 3/6] fix(hashing): make hash_file return ContentHash instead
 of raw bytes

hash_utils.hash_file() declared -> bytes but the protocol and all
callers expected ContentHash. Now it properly returns
ContentHash(method=algorithm, digest=...).

- Update hash_utils.hash_file to return ContentHash
- Fix CachedFileHasher cache hit/miss to use .digest
- Update generate_file_hashes.py to use .to_hex()
- Remove defensive hasattr workarounds added in previous commit

https://claude.ai/code/session_01PBeavr3pTFu5sPhWUxNx2r
---
 src/orcapod/hashing/file_hashers.py           |  8 ++---
 src/orcapod/hashing/hash_utils.py             | 31 +++++++++++--------
 .../semantic_struct_converters.py             |  5 +--
 tests/test_hashing/generate_file_hashes.py    | 12 +++----
 .../test_file_hashing_consistency.py          |  7 ++---
 5 files changed, 31 insertions(+), 32 deletions(-)

diff --git a/src/orcapod/hashing/file_hashers.py b/src/orcapod/hashing/file_hashers.py
index 7dddfcc3..3dff6224 100644
--- a/src/orcapod/hashing/file_hashers.py
+++ b/src/orcapod/hashing/file_hashers.py
@@ -38,8 +38,8 @@ def hash_file(self, file_path: PathLike) -> ContentHash:
         cache_key = f"file:{file_path}"
         cached_value = self.string_cacher.get_cached(cache_key)
         if cached_value is not None:
-            return bytes.fromhex(cached_value)
+            return ContentHash(method="cached", digest=bytes.fromhex(cached_value))
 
-        value = self.file_hasher.hash_file(file_path)
-        self.string_cacher.set_cached(cache_key, value.hex())
-        return value
+        result = self.file_hasher.hash_file(file_path)
+        self.string_cacher.set_cached(cache_key, result.digest.hex())
+        return result
diff --git a/src/orcapod/hashing/hash_utils.py b/src/orcapod/hashing/hash_utils.py
index 2fc43d8b..a42f44d1 100644
--- a/src/orcapod/hashing/hash_utils.py
+++ b/src/orcapod/hashing/hash_utils.py
@@ -7,6 +7,8 @@
 
 import xxhash
 
+from orcapod.types import ContentHash
+
 logger = logging.getLogger(__name__)
 
 
@@ -41,18 +43,18 @@ def combine_hashes(
     return combined_hash
 
 
-def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> bytes:
-    """
-    Calculate the hash of a file using the specified algorithm.
+def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> ContentHash:
+    """Calculate the hash of a file using the specified algorithm.
 
-    Parameters:
-        file_path (str): Path to the file to hash
-        algorithm (str): Hash algorithm to use - options include:
-                         'md5', 'sha1', 'sha256', 'sha512', 'xxh64', 'crc32', 'hash_path'
-        buffer_size (int): Size of chunks to read from the file at a time
+    Args:
+        file_path: Path to the file to hash.
+        algorithm: Hash algorithm to use — options include:
+            'md5', 'sha1', 'sha256', 'sha512', 'xxh64', 'crc32', 'hash_path'.
+        buffer_size: Size of chunks to read from the file at a time.
 
     Returns:
-        bytes: Raw digest bytes of the hash
+        A ContentHash with method set to the algorithm name and digest
+        containing the raw hash bytes.
     """
     if not Path(file_path).is_file():
         raise FileNotFoundError(f"The file {file_path} does not exist")
@@ -61,7 +63,7 @@ def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> bytes:
     if algorithm == "hash_path":
         hasher = hashlib.sha256()
         hasher.update(file_path.encode("utf-8"))
-        return hasher.digest()
+        return ContentHash(method=algorithm, digest=hasher.digest())
 
     if algorithm == "xxh64":
         hasher = xxhash.xxh64()
@@ -71,7 +73,7 @@ def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> bytes:
                 if not data:
                     break
                 hasher.update(data)
-        return hasher.digest()
+        return ContentHash(method=algorithm, digest=hasher.digest())
 
     if algorithm == "crc32":
         crc = 0
@@ -81,7 +83,10 @@ def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> bytes:
                 if not data:
                     break
                 crc = zlib.crc32(data, crc)
-        return (crc & 0xFFFFFFFF).to_bytes(4, byteorder="big")
+        return ContentHash(
+            method=algorithm,
+            digest=(crc & 0xFFFFFFFF).to_bytes(4, byteorder="big"),
+        )
 
     try:
         hasher = hashlib.new(algorithm)
@@ -98,7 +103,7 @@ def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> bytes:
                 break
             hasher.update(data)
 
-    return hasher.digest()
+    return ContentHash(method=algorithm, digest=hasher.digest())
 
 
 def _is_in_string(line: str, pos: int) -> bool:
diff --git a/src/orcapod/semantic_types/semantic_struct_converters.py b/src/orcapod/semantic_types/semantic_struct_converters.py
index b634214a..63d1e236 100644
--- a/src/orcapod/semantic_types/semantic_struct_converters.py
+++ b/src/orcapod/semantic_types/semantic_struct_converters.py
@@ -165,7 +165,4 @@ def hash_struct_dict(
             raise IsADirectoryError(f"Path is a directory: {path}")
 
         content_hash = self._file_hasher.hash_file(path)
-        # BasicFileHasher.hash_file returns raw bytes despite the protocol
-        # declaring ContentHash. Handle both cases defensively.
-        digest = content_hash.digest if hasattr(content_hash, "digest") else content_hash
-        return self._format_hash_string(digest, add_prefix=add_prefix)
+        return self._format_hash_string(content_hash.digest, add_prefix=add_prefix)
diff --git a/tests/test_hashing/generate_file_hashes.py b/tests/test_hashing/generate_file_hashes.py
index 0beb66c5..270c2894 100644
--- a/tests/test_hashing/generate_file_hashes.py
+++ b/tests/test_hashing/generate_file_hashes.py
@@ -63,12 +63,12 @@ def create_sample_files():
         files_info.append(
             {
                 "file": str(rel_filepath),
-                "hash": file_hash,
+                "hash": file_hash.to_hex(),
                 "size_kb": size,
                 "type": "text",
             }
         )
-        print(f"Created text file: {filename} ({size} KB), Hash: {file_hash}")
+        print(f"Created text file: {filename} ({size} KB), Hash: {file_hash.to_hex()}")
 
     # Generate binary files of various sizes
     binary_sizes = [1, 5, 10, 50, 100]  # sizes in KB
@@ -88,12 +88,12 @@ def create_sample_files():
         files_info.append(
             {
                 "file": str(rel_filepath),
-                "hash": file_hash,
+                "hash": file_hash.to_hex(),
                 "size_kb": size,
                 "type": "binary",
             }
         )
-        print(f"Created binary file: {filename} ({size} KB), Hash: {file_hash}")
+        print(f"Created binary file: {filename} ({size} KB), Hash: {file_hash.to_hex()}")
 
     # Create a structured file (JSON)
     json_filename = "sample_structured.json"
@@ -116,8 +116,8 @@ def create_sample_files():
     # Compute the hash
     json_hash = hash_file(json_filepath)
 
-    files_info.append({"file": str(rel_filepath), "hash": json_hash, "type": "json"})
-    print(f"Created JSON file: {json_filename}, Hash: {json_hash}")
+    files_info.append({"file": str(rel_filepath), "hash": json_hash.to_hex(), "type": "json"})
+    print(f"Created JSON file: {json_filename}, Hash: {json_hash.to_hex()}")
 
     return files_info
 
diff --git a/tests/test_hashing/test_file_hashing_consistency.py b/tests/test_hashing/test_file_hashing_consistency.py
index 99311fef..339298b8 100644
--- a/tests/test_hashing/test_file_hashing_consistency.py
+++ b/tests/test_hashing/test_file_hashing_consistency.py
@@ -191,11 +191,9 @@ def test_arrow_and_semantic_hash_same_file_content(
         arrow_hash_hex = path_converter.hash_struct_dict({"path": str(file)})
 
         # Semantic path: file_hasher.hash_file directly (same as PathContentHandler)
-        # BasicFileHasher.hash_file returns raw bytes
         semantic_content_hash = file_hasher.hash_file(file)
-        semantic_hex = semantic_content_hash.hex() if isinstance(semantic_content_hash, bytes) else semantic_content_hash.digest.hex()
 
-        assert arrow_hash_hex == semantic_hex
+        assert arrow_hash_hex == semantic_content_hash.digest.hex()
 
     def test_arrow_and_semantic_same_content_two_files(
         self, path_converter, file_hasher, tmp_path
@@ -209,7 +207,6 @@ def test_arrow_and_semantic_same_content_two_files(
         file2.write_text(content)
 
         arrow_hex = path_converter.hash_struct_dict({"path": str(file1)})
-        raw = file_hasher.hash_file(file2)
-        semantic_hex = raw.hex() if isinstance(raw, bytes) else raw.digest.hex()
+        semantic_hex = file_hasher.hash_file(file2).digest.hex()
 
         assert arrow_hex == semantic_hex

From 01c90f460047b65f46b4c1ae09fd40714deafccf Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 5 Mar 2026 11:30:53 +0000
Subject: [PATCH 4/6] fix(hashing): store full ContentHash string in
 CachedFileHasher cache

Use ContentHash.to_string() / from_string() so the cached value
preserves both the method name and digest bytes, instead of losing the
method on cache hit.

https://claude.ai/code/session_01PBeavr3pTFu5sPhWUxNx2r
---
 src/orcapod/hashing/file_hashers.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/orcapod/hashing/file_hashers.py b/src/orcapod/hashing/file_hashers.py
index 3dff6224..870dc47c 100644
--- a/src/orcapod/hashing/file_hashers.py
+++ b/src/orcapod/hashing/file_hashers.py
@@ -38,8 +38,8 @@ def hash_file(self, file_path: PathLike) -> ContentHash:
         cache_key = f"file:{file_path}"
         cached_value = self.string_cacher.get_cached(cache_key)
         if cached_value is not None:
-            return ContentHash(method="cached", digest=bytes.fromhex(cached_value))
+            return ContentHash.from_string(cached_value)
 
         result = self.file_hasher.hash_file(file_path)
-        self.string_cacher.set_cached(cache_key, result.digest.hex())
+        self.string_cacher.set_cached(cache_key, result.to_string())
         return result

From 8863a6f597eee095a2d783e1b87fb0c86389fe3b Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 5 Mar 2026 11:36:44 +0000
Subject: [PATCH 5/6] test(hashing): add file hasher return type and
 CachedFileHasher tests

Add tests that would have caught the pre-existing bugs:
- hash_file() returning raw bytes instead of ContentHash
- CachedFileHasher losing the method name on cache hit
- hash_path algorithm failing on Path objects (str() missing)

Test coverage includes:
- hash_utils.hash_file returns ContentHash for all algorithms
- BasicFileHasher return type, determinism, content sensitivity
- CachedFileHasher: cache miss delegates, cache hit preserves method
  and digest, cache stores full to_string() format, clear_cache
  forces rehash, stale cache behavior documented

Also fixes hash_path algorithm to str()-ify Path before encoding.

https://claude.ai/code/session_01PBeavr3pTFu5sPhWUxNx2r
---
 src/orcapod/hashing/hash_utils.py       |   2 +-
 tests/test_hashing/test_file_hashers.py | 270 ++++++++++++++++++++++++
 2 files changed, 271 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_hashing/test_file_hashers.py

diff --git a/src/orcapod/hashing/hash_utils.py b/src/orcapod/hashing/hash_utils.py
index a42f44d1..0addcb77 100644
--- a/src/orcapod/hashing/hash_utils.py
+++ b/src/orcapod/hashing/hash_utils.py
@@ -62,7 +62,7 @@ def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> ContentHash:
     # Hash the path string itself rather than file content
     if algorithm == "hash_path":
         hasher = hashlib.sha256()
-        hasher.update(file_path.encode("utf-8"))
+        hasher.update(str(file_path).encode("utf-8"))
         return ContentHash(method=algorithm, digest=hasher.digest())
 
     if algorithm == "xxh64":
diff --git a/tests/test_hashing/test_file_hashers.py b/tests/test_hashing/test_file_hashers.py
new file mode 100644
index 00000000..90081f26
--- /dev/null
+++ b/tests/test_hashing/test_file_hashers.py
@@ -0,0 +1,270 @@
+"""
+Tests for file hashing return types and CachedFileHasher behavior.
+
+These tests would have caught the pre-existing bug where hash_utils.hash_file()
+returned raw bytes instead of ContentHash, and verify the CachedFileHasher
+correctly round-trips ContentHash through the string cache.
+"""
+
+from pathlib import Path
+
+import pytest
+
+from orcapod.hashing.file_hashers import BasicFileHasher, CachedFileHasher
+from orcapod.hashing.hash_utils import hash_file
+from orcapod.hashing.string_cachers import InMemoryCacher
+from orcapod.types import ContentHash
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def sample_file(tmp_path):
+    """Create a small sample file for hashing."""
+    f = tmp_path / "sample.txt"
+    f.write_text("hello world")
+    return f
+
+
+@pytest.fixture
+def file_hasher():
+    return BasicFileHasher(algorithm="sha256")
+
+
+@pytest.fixture
+def cached_file_hasher(file_hasher):
+    cacher = InMemoryCacher()
+    return CachedFileHasher(file_hasher=file_hasher, string_cacher=cacher)
+
+
+# ---------------------------------------------------------------------------
+# hash_utils.hash_file returns ContentHash
+# ---------------------------------------------------------------------------
+
+
+class TestHashFileReturnType:
+    """Tests that would have caught hash_file returning raw bytes."""
+
+    def test_hash_file_returns_content_hash(self, sample_file):
+        result = hash_file(sample_file)
+        assert isinstance(result, ContentHash), (
+            f"hash_file should return ContentHash, got {type(result).__name__}"
+        )
+
+    def test_hash_file_has_method(self, sample_file):
+        result = hash_file(sample_file)
+        assert result.method == "sha256"
+
+    def test_hash_file_has_digest_bytes(self, sample_file):
+        result = hash_file(sample_file)
+        assert isinstance(result.digest, bytes)
+        assert len(result.digest) == 32  # SHA-256 produces 32 bytes
+
+    def test_hash_file_xxh64_returns_content_hash(self, sample_file):
+        result = hash_file(sample_file, algorithm="xxh64")
+        assert isinstance(result, ContentHash)
+        assert result.method == "xxh64"
+
+    def test_hash_file_crc32_returns_content_hash(self, sample_file):
+        result = hash_file(sample_file, algorithm="crc32")
+        assert isinstance(result, ContentHash)
+        assert result.method == "crc32"
+        assert len(result.digest) == 4  # CRC32 produces 4 bytes
+
+    def test_hash_file_hash_path_returns_content_hash(self, sample_file):
+        result = hash_file(sample_file, algorithm="hash_path")
+        assert isinstance(result, ContentHash)
+        assert result.method == "hash_path"
+
+    def test_hash_file_to_string_round_trips(self, sample_file):
+        """ContentHash.to_string() / from_string() preserves method and digest."""
+        original = hash_file(sample_file)
+        serialized = original.to_string()
+        restored = ContentHash.from_string(serialized)
+        assert restored.method == original.method
+        assert restored.digest == original.digest
+
+
+# ---------------------------------------------------------------------------
+# BasicFileHasher returns ContentHash
+# ---------------------------------------------------------------------------
+
+
+class TestBasicFileHasherReturnType:
+    """Tests that would have caught BasicFileHasher returning raw bytes."""
+
+    def test_returns_content_hash(self, file_hasher, sample_file):
+        result = file_hasher.hash_file(sample_file)
+        assert isinstance(result, ContentHash)
+
+    def test_method_matches_algorithm(self, sample_file):
+        for algo in ("sha256", "md5"):
+            hasher = BasicFileHasher(algorithm=algo)
+            result = hasher.hash_file(sample_file)
+            assert result.method == algo
+
+    def test_digest_is_bytes(self, file_hasher, sample_file):
+        result = file_hasher.hash_file(sample_file)
+        assert isinstance(result.digest, bytes)
+
+    def test_deterministic(self, file_hasher, sample_file):
+        h1 = file_hasher.hash_file(sample_file)
+        h2 = file_hasher.hash_file(sample_file)
+        assert h1 == h2
+
+    def test_different_content_different_hash(self, file_hasher, tmp_path):
+        f1 = tmp_path / "a.txt"
+        f2 = tmp_path / "b.txt"
+        f1.write_text("aaa")
+        f2.write_text("bbb")
+        assert file_hasher.hash_file(f1) != file_hasher.hash_file(f2)
+
+    def test_same_content_different_paths_same_hash(self, file_hasher, tmp_path):
+        f1 = tmp_path / "a.txt"
+        f2 = tmp_path / "b.txt"
+        f1.write_text("same")
+        f2.write_text("same")
+        assert file_hasher.hash_file(f1) == file_hasher.hash_file(f2)
+
+
+# ---------------------------------------------------------------------------
+# CachedFileHasher with InMemoryCacher
+# ---------------------------------------------------------------------------
+
+
+class TestCachedFileHasher:
+    """Tests for CachedFileHasher caching behavior and ContentHash preservation."""
+
+    def test_returns_content_hash(self, cached_file_hasher, sample_file):
+        result = cached_file_hasher.hash_file(sample_file)
+        assert isinstance(result, ContentHash)
+
+    def test_cache_miss_delegates_to_inner_hasher(self, sample_file):
+        """On cache miss, result must match the inner BasicFileHasher."""
+        inner = BasicFileHasher(algorithm="sha256")
+        cacher = InMemoryCacher()
+        cached = CachedFileHasher(file_hasher=inner, string_cacher=cacher)
+
+        expected = inner.hash_file(sample_file)
+        actual = cached.hash_file(sample_file)
+
+        assert actual.method == expected.method
+        assert actual.digest == expected.digest
+
+    def test_cache_hit_returns_correct_content_hash(self, sample_file):
+        """On cache hit, the returned ContentHash must have correct method and digest."""
+        inner = BasicFileHasher(algorithm="sha256")
+        cacher = InMemoryCacher()
+        cached = CachedFileHasher(file_hasher=inner, string_cacher=cacher)
+
+        # First call populates cache
+        first = cached.hash_file(sample_file)
+
+        # Second call should hit cache
+        second = cached.hash_file(sample_file)
+
+        assert second.method == first.method
+        assert second.digest == first.digest
+
+    def test_cache_stores_to_string_format(self, sample_file):
+        """The cache must store the full 'method:hex_digest' string."""
+        inner = BasicFileHasher(algorithm="sha256")
+        cacher = InMemoryCacher()
+        cached = CachedFileHasher(file_hasher=inner, string_cacher=cacher)
+
+        result = cached.hash_file(sample_file)
+
+        # Inspect the raw cached value
+        cache_key = f"file:{sample_file}"
+        cached_value = cacher.get_cached(cache_key)
+
+        assert cached_value is not None
+        assert cached_value == result.to_string()
+        # Should be in "method:hex_digest" format
+        assert ":" in cached_value
+        method, hex_digest = cached_value.split(":", 1)
+        assert method == "sha256"
+        assert hex_digest == result.digest.hex()
+
+    def test_cache_hit_preserves_method_not_cached(self, sample_file):
+        """Cache hit must return the original method, not 'cached' or similar."""
+        inner = BasicFileHasher(algorithm="sha256")
+        cacher = InMemoryCacher()
+        cached = CachedFileHasher(file_hasher=inner, string_cacher=cacher)
+
+        # Populate cache
+        cached.hash_file(sample_file)
+
+        # Cache hit
+        result = cached.hash_file(sample_file)
+        assert result.method == "sha256", (
+            f"Cache hit should preserve original method, got '{result.method}'"
+        )
+
+    def test_cache_round_trip_with_from_string(self, sample_file):
+        """Manually verify the to_string / from_string round-trip used by the cache."""
+        inner = BasicFileHasher(algorithm="sha256")
+        original = inner.hash_file(sample_file)
+
+        serialized = original.to_string()
+        restored = ContentHash.from_string(serialized)
+
+        assert restored.method == original.method
+        assert restored.digest == original.digest
+
+    def test_different_algorithms_cached_independently(self, tmp_path):
+        """Two CachedFileHashers with different algorithms produce different cached results."""
+        f = tmp_path / "file.txt"
+        f.write_text("test content")
+
+        cacher = InMemoryCacher()
+
+        sha_inner = BasicFileHasher(algorithm="sha256")
+        sha_cached = CachedFileHasher(file_hasher=sha_inner, string_cacher=cacher)
+        sha_result = sha_cached.hash_file(f)
+
+        md5_inner = BasicFileHasher(algorithm="md5")
+        md5_cached = CachedFileHasher(file_hasher=md5_inner, string_cacher=cacher)
+        # Same cache key "file:<path>" — the second call hits the cached sha256 value.
+        # This is a known limitation: the cache key doesn't include the algorithm.
+        # We test that the cached value at least round-trips correctly.
+        md5_result = md5_cached.hash_file(f)
+
+        # Since same cache key, md5_cached gets the sha256 result from cache
+        # This documents current behavior — cache key should ideally include algorithm
+        assert md5_result.method == "sha256"  # gets cached sha256 result
+
+    def test_modified_content_after_cache(self, cached_file_hasher, tmp_path):
+        """Cache is NOT invalidated when file content changes (documents behavior).
+
+        CachedFileHasher caches by path string, so a modified file still
+        returns the stale cached hash until the cache is cleared.
+        """
+        f = tmp_path / "mutable.txt"
+        f.write_text("version 1")
+        first = cached_file_hasher.hash_file(f)
+
+        f.write_text("version 2")
+        second = cached_file_hasher.hash_file(f)
+
+        # Same because cached — documents expected caching behavior
+        assert first.digest == second.digest
+
+    def test_clear_cache_forces_rehash(self, tmp_path):
+        """After clearing the cache, a modified file produces a new hash."""
+        inner = BasicFileHasher(algorithm="sha256")
+        cacher = InMemoryCacher()
+        cached = CachedFileHasher(file_hasher=inner, string_cacher=cacher)
+
+        f = tmp_path / "mutable.txt"
+        f.write_text("version 1")
+        first = cached.hash_file(f)
+
+        f.write_text("version 2")
+        cacher.clear_cache()
+        second = cached.hash_file(f)
+
+        assert first.digest != second.digest

From 5e7e689eb03d4f0b6b5ca32679647b7e8fe854b3 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 5 Mar 2026 11:53:46 +0000
Subject: [PATCH 6/6] feat(hashing): add mtime+size cache busting to
 CachedFileHasher

Include st_mtime_ns and st_size in the cache key so that modified files
automatically produce a cache miss instead of returning stale hashes.

Update tests to verify the new auto-invalidation behavior and fix
flaky test by using different-length content to guarantee size changes.

https://claude.ai/code/session_01PBeavr3pTFu5sPhWUxNx2r
---
 src/orcapod/hashing/file_hashers.py     |  5 ++-
 tests/test_hashing/test_file_hashers.py | 46 ++++++++++++++-----------
 2 files changed, 30 insertions(+), 21 deletions(-)

diff --git a/src/orcapod/hashing/file_hashers.py b/src/orcapod/hashing/file_hashers.py
index 870dc47c..7e82c063 100644
--- a/src/orcapod/hashing/file_hashers.py
+++ b/src/orcapod/hashing/file_hashers.py
@@ -1,3 +1,5 @@
+import os
+
 from orcapod.hashing.hash_utils import hash_file
 from orcapod.protocols.hashing_protocols import (
     FileContentHasherProtocol,
@@ -35,7 +37,8 @@ def __init__(
         self.string_cacher = string_cacher
 
     def hash_file(self, file_path: PathLike) -> ContentHash:
-        cache_key = f"file:{file_path}"
+        stat = os.stat(file_path)
+        cache_key = f"file:{file_path}:{stat.st_mtime_ns}:{stat.st_size}"
         cached_value = self.string_cacher.get_cached(cache_key)
         if cached_value is not None:
             return ContentHash.from_string(cached_value)
diff --git a/tests/test_hashing/test_file_hashers.py b/tests/test_hashing/test_file_hashers.py
index 90081f26..606c92e6 100644
--- a/tests/test_hashing/test_file_hashers.py
+++ b/tests/test_hashing/test_file_hashers.py
@@ -171,14 +171,17 @@ def test_cache_hit_returns_correct_content_hash(self, sample_file):
 
     def test_cache_stores_to_string_format(self, sample_file):
         """The cache must store the full 'method:hex_digest' string."""
+        import os
+
         inner = BasicFileHasher(algorithm="sha256")
         cacher = InMemoryCacher()
         cached = CachedFileHasher(file_hasher=inner, string_cacher=cacher)
 
         result = cached.hash_file(sample_file)
 
-        # Inspect the raw cached value
-        cache_key = f"file:{sample_file}"
+        # Inspect the raw cached value — key includes mtime+size
+        stat = os.stat(sample_file)
+        cache_key = f"file:{sample_file}:{stat.st_mtime_ns}:{stat.st_size}"
         cached_value = cacher.get_cached(cache_key)
 
         assert cached_value is not None
@@ -215,8 +218,12 @@ def test_cache_round_trip_with_from_string(self, sample_file):
         assert restored.method == original.method
         assert restored.digest == original.digest
 
-    def test_different_algorithms_cached_independently(self, tmp_path):
-        """Two CachedFileHashers with different algorithms produce different cached results."""
+    def test_different_algorithms_share_cache_key(self, tmp_path):
+        """Two CachedFileHashers with different algorithms but same path+mtime+size
+        share the same cache key. The second hasher gets the first's cached result.
+
+        This documents a known limitation: the cache key doesn't include the algorithm.
+        """
         f = tmp_path / "file.txt"
         f.write_text("test content")
 
@@ -228,30 +235,29 @@ def test_different_algorithms_cached_independently(self, tmp_path):
 
         md5_inner = BasicFileHasher(algorithm="md5")
         md5_cached = CachedFileHasher(file_hasher=md5_inner, string_cacher=cacher)
-        # Same cache key "file:<path>" — the second call hits the cached sha256 value.
-        # This is a known limitation: the cache key doesn't include the algorithm.
-        # We test that the cached value at least round-trips correctly.
         md5_result = md5_cached.hash_file(f)
 
-        # Since same cache key, md5_cached gets the sha256 result from cache
-        # This documents current behavior — cache key should ideally include algorithm
-        assert md5_result.method == "sha256"  # gets cached sha256 result
+        # Same cache key (same file, same mtime+size), so md5 gets sha256 result
+        assert md5_result.method == "sha256"
+        assert md5_result.digest == sha_result.digest
 
-    def test_modified_content_after_cache(self, cached_file_hasher, tmp_path):
-        """Cache is NOT invalidated when file content changes (documents behavior).
+    def test_modified_content_invalidates_cache(self, cached_file_hasher, tmp_path):
+        """Cache is automatically invalidated when file content changes.
 
-        CachedFileHasher caches by path string, so a modified file still
-        returns the stale cached hash until the cache is cleared.
+        CachedFileHasher includes mtime_ns and file size in the cache key,
+        so writing new content produces a cache miss and a fresh hash.
         """
         f = tmp_path / "mutable.txt"
-        f.write_text("version 1")
+        f.write_text("short")
         first = cached_file_hasher.hash_file(f)
 
-        f.write_text("version 2")
+        # Use different-length content so file size changes even if mtime_ns
+        # doesn't advance (can happen on fast filesystems).
+        f.write_text("much longer content here")
         second = cached_file_hasher.hash_file(f)
 
-        # Same because cached — documents expected caching behavior
-        assert first.digest == second.digest
+        # Different because size changed → cache miss → rehashed
+        assert first.digest != second.digest
 
     def test_clear_cache_forces_rehash(self, tmp_path):
         """After clearing the cache, a modified file produces a new hash."""
@@ -260,10 +266,10 @@ def test_clear_cache_forces_rehash(self, tmp_path):
         cached = CachedFileHasher(file_hasher=inner, string_cacher=cacher)
 
         f = tmp_path / "mutable.txt"
-        f.write_text("version 1")
+        f.write_text("short")
         first = cached.hash_file(f)
 
-        f.write_text("version 2")
+        f.write_text("much longer content here")
         cacher.clear_cache()
         second = cached.hash_file(f)