From 7333d089805c64c81f995dab229d4e155287ff22 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 5 Mar 2026 11:17:16 +0000 Subject: [PATCH 1/6] refactor(hashing): consolidate file hashing via PathStructConverter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Inject file_hasher into PathStructConverter so both the Arrow hasher path (via SemanticHashingVisitor → hash_struct_dict) and the semantic hasher path (via PathContentHandler) delegate to the same FileContentHasherProtocol instance. Changes: - PathStructConverter now requires file_hasher in __init__ - Implement hash_struct_dict() delegating to injected file_hasher - Reorder v0.1.json so file_hasher is instantiated before semantic_registry (ref resolution is sequential) - Add file_hasher ref to PathStructConverter config in v0.1.json - Update versioned_hashers.py factory to pass BasicFileHasher - Update and uncomment PathStructConverter tests - Add integration tests for cross-path hash consistency https://claude.ai/code/session_01PBeavr3pTFu5sPhWUxNx2r --- src/orcapod/contexts/data/v0.1.json | 16 +- src/orcapod/hashing/versioned_hashers.py | 4 +- .../semantic_struct_converters.py | 34 ++- .../test_file_hashing_consistency.py | 212 ++++++++++++++++++ .../test_path_struct_converter.py | 141 ++++++------ 5 files changed, 333 insertions(+), 74 deletions(-) create mode 100644 tests/test_hashing/test_file_hashing_consistency.py diff --git a/src/orcapod/contexts/data/v0.1.json b/src/orcapod/contexts/data/v0.1.json index 41a1aa03..0b3fee9d 100644 --- a/src/orcapod/contexts/data/v0.1.json +++ b/src/orcapod/contexts/data/v0.1.json @@ -2,13 +2,21 @@ "context_key": "std:v0.1:default", "version": "v0.1", "description": "Initial stable release with basic Path semantic type support", + "file_hasher": { + "_class": "orcapod.hashing.file_hashers.BasicFileHasher", + "_config": { + "algorithm": "sha256" + } + }, "semantic_registry": { "_class": "orcapod.semantic_types.semantic_registry.SemanticTypeRegistry", "_config": { "converters": { "path": { "_class": "orcapod.semantic_types.semantic_struct_converters.PathStructConverter", - "_config": {} + "_config": { + "file_hasher": {"_ref": "file_hasher"} + } } } } @@ -33,12 +41,6 @@ } } }, - "file_hasher": { - "_class": "orcapod.hashing.file_hashers.BasicFileHasher", - "_config": { - "algorithm": "sha256" - } - }, "function_info_extractor": { "_class": "orcapod.hashing.semantic_hashing.function_info_extractors.FunctionSignatureExtractor", "_config": { diff --git a/src/orcapod/hashing/versioned_hashers.py b/src/orcapod/hashing/versioned_hashers.py index fa76bb11..8adce44d 100644 --- a/src/orcapod/hashing/versioned_hashers.py +++ b/src/orcapod/hashing/versioned_hashers.py @@ -124,6 +124,7 @@ def get_versioned_semantic_arrow_hasher( A fully configured SemanticArrowHasher instance. """ from orcapod.hashing.arrow_hashers import SemanticArrowHasher + from orcapod.hashing.file_hashers import BasicFileHasher from orcapod.semantic_types.semantic_registry import SemanticTypeRegistry from orcapod.semantic_types.semantic_struct_converters import PathStructConverter @@ -132,7 +133,8 @@ def get_versioned_semantic_arrow_hasher( # that arise from the protocol definition of SemanticStructConverterProtocol having # a slightly different hash_struct_dict signature than the concrete class. registry: Any = SemanticTypeRegistry() - path_converter: Any = PathStructConverter() + file_hasher = BasicFileHasher(algorithm="sha256") + path_converter: Any = PathStructConverter(file_hasher=file_hasher) registry.register_converter("path", path_converter) logger.debug( diff --git a/src/orcapod/semantic_types/semantic_struct_converters.py b/src/orcapod/semantic_types/semantic_struct_converters.py index a7effd1f..63d1e236 100644 --- a/src/orcapod/semantic_types/semantic_struct_converters.py +++ b/src/orcapod/semantic_types/semantic_struct_converters.py @@ -13,6 +13,8 @@ if TYPE_CHECKING: import pyarrow as pa + + from orcapod.protocols.hashing_protocols import FileContentHasherProtocol else: pa = LazyModule("pyarrow") @@ -76,9 +78,10 @@ def _compute_content_hash(self, content: bytes) -> ContentHash: class PathStructConverter(SemanticStructConverterBase): """Converter for pathlib.Path objects to/from semantic structs of form { path: "/value/of/path"}""" - def __init__(self): + def __init__(self, file_hasher: "FileContentHasherProtocol"): super().__init__("path") self._python_type = Path + self._file_hasher = file_hasher # Define the Arrow struct type for paths self._arrow_struct_type = pa.struct( @@ -134,3 +137,32 @@ def is_semantic_struct(self, struct_dict: dict[str, Any]) -> bool: return set(struct_dict.keys()) == {"path"} and isinstance( struct_dict["path"], str ) + + def hash_struct_dict( + self, struct_dict: dict[str, Any], add_prefix: bool = False + ) -> str: + """Compute hash of a path semantic type by hashing the file content. + + Args: + struct_dict: Dict with a "path" key containing a file path string. + add_prefix: If True, prefix with "path:sha256:...". + + Returns: + Hash string of the file content. + + Raises: + FileNotFoundError: If the path does not exist. + IsADirectoryError: If the path is a directory. + """ + path_str = struct_dict.get("path") + if path_str is None: + raise ValueError("Missing 'path' field in struct dict") + + path = Path(path_str) + if not path.exists(): + raise FileNotFoundError(f"Path does not exist: {path}") + if path.is_dir(): + raise IsADirectoryError(f"Path is a directory: {path}") + + content_hash = self._file_hasher.hash_file(path) + return self._format_hash_string(content_hash.digest, add_prefix=add_prefix) diff --git a/tests/test_hashing/test_file_hashing_consistency.py b/tests/test_hashing/test_file_hashing_consistency.py new file mode 100644 index 00000000..339298b8 --- /dev/null +++ b/tests/test_hashing/test_file_hashing_consistency.py @@ -0,0 +1,212 @@ +""" +Integration tests verifying that file hashing is consistent across both paths: + +1. **Arrow hasher path**: SemanticArrowHasher processes an Arrow table containing a + path struct column → calls PathStructConverter.hash_struct_dict → file_hasher. +2. **Semantic hasher path**: BaseSemanticHasher hashes a Python Path object → + calls PathContentHandler.handle → file_hasher. + +Both paths must delegate to the same FileContentHasherProtocol so that identical +file content always produces identical hashes, regardless of entry point. +""" + +from pathlib import Path + +import pyarrow as pa +import pytest + +from orcapod.hashing.arrow_hashers import SemanticArrowHasher +from orcapod.hashing.file_hashers import BasicFileHasher +from orcapod.hashing.semantic_hashing.builtin_handlers import ( + PathContentHandler, + register_builtin_handlers, +) +from orcapod.hashing.semantic_hashing.semantic_hasher import BaseSemanticHasher +from orcapod.hashing.semantic_hashing.type_handler_registry import TypeHandlerRegistry +from orcapod.semantic_types.semantic_registry import SemanticTypeRegistry +from orcapod.semantic_types.semantic_struct_converters import PathStructConverter + + +# --------------------------------------------------------------------------- +# Shared fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +def file_hasher(): + """Single file hasher instance shared by both paths.""" + return BasicFileHasher(algorithm="sha256") + + +@pytest.fixture +def path_converter(file_hasher): + return PathStructConverter(file_hasher=file_hasher) + + +@pytest.fixture +def arrow_hasher(path_converter): + """SemanticArrowHasher wired with the shared file_hasher via PathStructConverter.""" + registry = SemanticTypeRegistry() + registry.register_converter("path", path_converter) + return SemanticArrowHasher(semantic_registry=registry) + + +@pytest.fixture +def semantic_hasher(file_hasher): + """BaseSemanticHasher wired with the shared file_hasher via PathContentHandler.""" + registry = TypeHandlerRegistry() + register_builtin_handlers(registry, file_hasher=file_hasher) + return BaseSemanticHasher( + hasher_id="test_v1", type_handler_registry=registry, strict=True + ) + + +# --------------------------------------------------------------------------- +# Arrow struct hasher: path column tests +# --------------------------------------------------------------------------- + + +class TestArrowStructPathHashing: + """Tests for file hashing through the Arrow hasher path.""" + + def test_same_content_different_paths_same_hash( + self, arrow_hasher, tmp_path + ): + """Two distinct files with identical content produce the same table hash.""" + file1 = tmp_path / "a.txt" + file2 = tmp_path / "b.txt" + file1.write_text("identical content") + file2.write_text("identical content") + + table1 = pa.table( + {"file": [{"path": str(file1)}]}, + schema=pa.schema([pa.field("file", pa.struct([pa.field("path", pa.large_string())]))]), + ) + table2 = pa.table( + {"file": [{"path": str(file2)}]}, + schema=pa.schema([pa.field("file", pa.struct([pa.field("path", pa.large_string())]))]), + ) + + hash1 = arrow_hasher.hash_table(table1) + hash2 = arrow_hasher.hash_table(table2) + assert hash1.digest == hash2.digest + + def test_modified_content_different_hash(self, arrow_hasher, tmp_path): + """Same path with modified content between hashes yields different hash.""" + file = tmp_path / "mutable.txt" + file.write_text("version 1") + + schema = pa.schema([pa.field("file", pa.struct([pa.field("path", pa.large_string())]))]) + table_v1 = pa.table({"file": [{"path": str(file)}]}, schema=schema) + hash1 = arrow_hasher.hash_table(table_v1) + + file.write_text("version 2") + table_v2 = pa.table({"file": [{"path": str(file)}]}, schema=schema) + hash2 = arrow_hasher.hash_table(table_v2) + + assert hash1.digest != hash2.digest + + def test_different_content_different_hash(self, arrow_hasher, tmp_path): + """Two files with different content produce different table hashes.""" + file1 = tmp_path / "x.txt" + file2 = tmp_path / "y.txt" + file1.write_text("content A") + file2.write_text("content B") + + schema = pa.schema([pa.field("file", pa.struct([pa.field("path", pa.large_string())]))]) + table1 = pa.table({"file": [{"path": str(file1)}]}, schema=schema) + table2 = pa.table({"file": [{"path": str(file2)}]}, schema=schema) + + hash1 = arrow_hasher.hash_table(table1) + hash2 = arrow_hasher.hash_table(table2) + assert hash1.digest != hash2.digest + + +# --------------------------------------------------------------------------- +# Semantic hasher: Path object tests +# --------------------------------------------------------------------------- + + +class TestSemanticPathHashing: + """Tests for file hashing through the semantic hasher path.""" + + def test_same_content_different_paths_same_hash( + self, semantic_hasher, tmp_path + ): + """Two distinct Path objects pointing to files with identical content.""" + file1 = tmp_path / "a.txt" + file2 = tmp_path / "b.txt" + file1.write_text("identical content") + file2.write_text("identical content") + + hash1 = semantic_hasher.hash_object(Path(file1)) + hash2 = semantic_hasher.hash_object(Path(file2)) + assert hash1.digest == hash2.digest + + def test_modified_content_different_hash(self, semantic_hasher, tmp_path): + """Same Path with modified content between hashes.""" + file = tmp_path / "mutable.txt" + file.write_text("version 1") + hash1 = semantic_hasher.hash_object(Path(file)) + + file.write_text("version 2") + hash2 = semantic_hasher.hash_object(Path(file)) + assert hash1.digest != hash2.digest + + def test_different_content_different_hash(self, semantic_hasher, tmp_path): + """Two Paths pointing to different content produce different hashes.""" + file1 = tmp_path / "x.txt" + file2 = tmp_path / "y.txt" + file1.write_text("content A") + file2.write_text("content B") + + hash1 = semantic_hasher.hash_object(Path(file1)) + hash2 = semantic_hasher.hash_object(Path(file2)) + assert hash1.digest != hash2.digest + + +# --------------------------------------------------------------------------- +# Cross-path consistency +# --------------------------------------------------------------------------- + + +class TestCrossPathConsistency: + """Verify that the arrow hasher and semantic hasher use the same file_hasher + and produce equivalent file content hashes for the same underlying file.""" + + def test_arrow_and_semantic_hash_same_file_content( + self, path_converter, semantic_hasher, file_hasher, tmp_path + ): + """The file content hash extracted by PathStructConverter.hash_struct_dict + must match the ContentHash produced by PathContentHandler.handle (which + the semantic hasher uses internally for Path objects). + + We compare at the file_hasher level: both paths ultimately call + file_hasher.hash_file(path), so the raw digest must be identical. + """ + file = tmp_path / "shared.txt" + file.write_text("shared content for both paths") + + # Arrow path: PathStructConverter.hash_struct_dict (no prefix) + arrow_hash_hex = path_converter.hash_struct_dict({"path": str(file)}) + + # Semantic path: file_hasher.hash_file directly (same as PathContentHandler) + semantic_content_hash = file_hasher.hash_file(file) + + assert arrow_hash_hex == semantic_content_hash.digest.hex() + + def test_arrow_and_semantic_same_content_two_files( + self, path_converter, file_hasher, tmp_path + ): + """Two files with identical content: arrow struct hash_struct_dict and + direct file_hasher.hash_file produce the same digest.""" + file1 = tmp_path / "file_arrow.txt" + file2 = tmp_path / "file_semantic.txt" + content = "same content for cross-path test" + file1.write_text(content) + file2.write_text(content) + + arrow_hex = path_converter.hash_struct_dict({"path": str(file1)}) + semantic_hex = file_hasher.hash_file(file2).digest.hex() + + assert arrow_hex == semantic_hex diff --git a/tests/test_semantic_types/test_path_struct_converter.py b/tests/test_semantic_types/test_path_struct_converter.py index d6e12644..73ae46e6 100644 --- a/tests/test_semantic_types/test_path_struct_converter.py +++ b/tests/test_semantic_types/test_path_struct_converter.py @@ -1,14 +1,23 @@ from pathlib import Path from typing import cast -from unittest.mock import patch import pytest +from orcapod.hashing.file_hashers import BasicFileHasher from orcapod.semantic_types.semantic_struct_converters import PathStructConverter -def test_path_to_struct_and_back(): - converter = PathStructConverter() +@pytest.fixture +def file_hasher(): + return BasicFileHasher(algorithm="sha256") + + +@pytest.fixture +def converter(file_hasher): + return PathStructConverter(file_hasher=file_hasher) + + +def test_path_to_struct_and_back(converter): path_obj = Path("/tmp/test.txt") struct_dict = converter.python_to_struct_dict(path_obj) assert struct_dict["path"] == str(path_obj) @@ -16,26 +25,22 @@ def test_path_to_struct_and_back(): assert restored == path_obj -def test_path_to_struct_invalid_type(): - converter = PathStructConverter() +def test_path_to_struct_invalid_type(converter): with pytest.raises(TypeError): converter.python_to_struct_dict("not_a_path") # type: ignore -def test_struct_to_python_missing_field(): - converter = PathStructConverter() +def test_struct_to_python_missing_field(converter): with pytest.raises(ValueError): converter.struct_dict_to_python({}) -def test_can_handle_python_type(): - converter = PathStructConverter() +def test_can_handle_python_type(converter): assert converter.can_handle_python_type(Path) assert not converter.can_handle_python_type(str) -def test_can_handle_struct_type(): - converter = PathStructConverter() +def test_can_handle_struct_type(converter): struct_type = converter.arrow_struct_type assert converter.can_handle_struct_type(struct_type) @@ -60,62 +65,68 @@ def names(self): assert not converter.can_handle_struct_type(fake_struct) -def test_is_semantic_struct(): - converter = PathStructConverter() +def test_is_semantic_struct(converter): assert converter.is_semantic_struct({"path": "/tmp/test.txt"}) assert not converter.is_semantic_struct({"not_path": "value"}) assert not converter.is_semantic_struct({"path": 123}) -# def test_hash_struct_dict_file_not_found(tmp_path): -# converter = PathStructConverter() -# struct_dict = {"path": str(tmp_path / "does_not_exist.txt")} -# with pytest.raises(FileNotFoundError): -# converter.hash_struct_dict(struct_dict) - - -# def test_hash_struct_dict_permission_error(tmp_path): -# converter = PathStructConverter() -# file_path = tmp_path / "file.txt" -# file_path.write_text("data") -# with patch("pathlib.Path.read_bytes", side_effect=PermissionError): -# struct_dict = {"path": str(file_path)} -# with pytest.raises(PermissionError): -# converter.hash_struct_dict(struct_dict) - - -# def test_hash_struct_dict_is_directory(tmp_path): -# converter = PathStructConverter() -# struct_dict = {"path": str(tmp_path)} -# with pytest.raises(ValueError): -# converter.hash_struct_dict(struct_dict) - - -# def test_hash_struct_dict_content_based(tmp_path): -# converter = PathStructConverter() -# file1 = tmp_path / "file1.txt" -# file2 = tmp_path / "file2.txt" -# content = "identical content" -# file1.write_text(content) -# file2.write_text(content) -# struct_dict1 = {"path": str(file1)} -# struct_dict2 = {"path": str(file2)} -# hash1 = converter.hash_struct_dict(struct_dict1) -# hash2 = converter.hash_struct_dict(struct_dict2) -# assert hash1 == hash2 - - -# def test_hash_path_objects_content_based(tmp_path): -# converter = PathStructConverter() -# file1 = tmp_path / "fileA.txt" -# file2 = tmp_path / "fileB.txt" -# content = "same file content" -# file1.write_text(content) -# file2.write_text(content) -# path_obj1 = Path(file1) -# path_obj2 = Path(file2) -# struct_dict1 = converter.python_to_struct_dict(path_obj1) -# struct_dict2 = converter.python_to_struct_dict(path_obj2) -# hash1 = converter.hash_struct_dict(struct_dict1) -# hash2 = converter.hash_struct_dict(struct_dict2) -# assert hash1 == hash2 +def test_hash_struct_dict_file_not_found(converter, tmp_path): + struct_dict = {"path": str(tmp_path / "does_not_exist.txt")} + with pytest.raises(FileNotFoundError): + converter.hash_struct_dict(struct_dict) + + +def test_hash_struct_dict_is_directory(converter, tmp_path): + struct_dict = {"path": str(tmp_path)} + with pytest.raises(IsADirectoryError): + converter.hash_struct_dict(struct_dict) + + +def test_hash_struct_dict_content_based(converter, tmp_path): + """Two distinct files with identical content produce the same hash.""" + file1 = tmp_path / "file1.txt" + file2 = tmp_path / "file2.txt" + content = "identical content" + file1.write_text(content) + file2.write_text(content) + hash1 = converter.hash_struct_dict({"path": str(file1)}) + hash2 = converter.hash_struct_dict({"path": str(file2)}) + assert hash1 == hash2 + + +def test_hash_path_objects_content_based(converter, tmp_path): + """Round-trip through python_to_struct_dict then hash_struct_dict.""" + file1 = tmp_path / "fileA.txt" + file2 = tmp_path / "fileB.txt" + content = "same file content" + file1.write_text(content) + file2.write_text(content) + struct_dict1 = converter.python_to_struct_dict(Path(file1)) + struct_dict2 = converter.python_to_struct_dict(Path(file2)) + hash1 = converter.hash_struct_dict(struct_dict1) + hash2 = converter.hash_struct_dict(struct_dict2) + assert hash1 == hash2 + + +def test_hash_struct_dict_with_prefix(converter, tmp_path): + """Prefixed hash starts with 'path:sha256:'.""" + file = tmp_path / "file.txt" + file.write_text("hello") + hash_str = converter.hash_struct_dict({"path": str(file)}, add_prefix=True) + assert hash_str.startswith("path:sha256:") + + +def test_hash_struct_dict_different_content(converter, tmp_path): + """Same path with modified content yields a different hash.""" + file = tmp_path / "mutable.txt" + file.write_text("version 1") + hash1 = converter.hash_struct_dict({"path": str(file)}) + file.write_text("version 2") + hash2 = converter.hash_struct_dict({"path": str(file)}) + assert hash1 != hash2 + + +def test_hash_struct_dict_missing_path_field(converter): + with pytest.raises(ValueError, match="Missing 'path' field"): + converter.hash_struct_dict({}) From 298520022a582b432338dc47565125584b5b0df7 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 5 Mar 2026 11:20:37 +0000 Subject: [PATCH 2/6] fix(hashing): handle BasicFileHasher returning raw bytes in hash_struct_dict BasicFileHasher.hash_file() returns raw bytes despite the protocol declaring ContentHash. Handle both cases defensively in PathStructConverter.hash_struct_dict() and in the integration tests. https://claude.ai/code/session_01PBeavr3pTFu5sPhWUxNx2r --- src/orcapod/semantic_types/semantic_struct_converters.py | 5 ++++- tests/test_hashing/test_file_hashing_consistency.py | 7 +++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/orcapod/semantic_types/semantic_struct_converters.py b/src/orcapod/semantic_types/semantic_struct_converters.py index 63d1e236..b634214a 100644 --- a/src/orcapod/semantic_types/semantic_struct_converters.py +++ b/src/orcapod/semantic_types/semantic_struct_converters.py @@ -165,4 +165,7 @@ def hash_struct_dict( raise IsADirectoryError(f"Path is a directory: {path}") content_hash = self._file_hasher.hash_file(path) - return self._format_hash_string(content_hash.digest, add_prefix=add_prefix) + # BasicFileHasher.hash_file returns raw bytes despite the protocol + # declaring ContentHash. Handle both cases defensively. + digest = content_hash.digest if hasattr(content_hash, "digest") else content_hash + return self._format_hash_string(digest, add_prefix=add_prefix) diff --git a/tests/test_hashing/test_file_hashing_consistency.py b/tests/test_hashing/test_file_hashing_consistency.py index 339298b8..99311fef 100644 --- a/tests/test_hashing/test_file_hashing_consistency.py +++ b/tests/test_hashing/test_file_hashing_consistency.py @@ -191,9 +191,11 @@ def test_arrow_and_semantic_hash_same_file_content( arrow_hash_hex = path_converter.hash_struct_dict({"path": str(file)}) # Semantic path: file_hasher.hash_file directly (same as PathContentHandler) + # BasicFileHasher.hash_file returns raw bytes semantic_content_hash = file_hasher.hash_file(file) + semantic_hex = semantic_content_hash.hex() if isinstance(semantic_content_hash, bytes) else semantic_content_hash.digest.hex() - assert arrow_hash_hex == semantic_content_hash.digest.hex() + assert arrow_hash_hex == semantic_hex def test_arrow_and_semantic_same_content_two_files( self, path_converter, file_hasher, tmp_path @@ -207,6 +209,7 @@ def test_arrow_and_semantic_same_content_two_files( file2.write_text(content) arrow_hex = path_converter.hash_struct_dict({"path": str(file1)}) - semantic_hex = file_hasher.hash_file(file2).digest.hex() + raw = file_hasher.hash_file(file2) + semantic_hex = raw.hex() if isinstance(raw, bytes) else raw.digest.hex() assert arrow_hex == semantic_hex From 0d1d135c65e34f2f5687e6fc3467d13c80626b98 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 5 Mar 2026 11:29:28 +0000 Subject: [PATCH 3/6] fix(hashing): make hash_file return ContentHash instead of raw bytes hash_utils.hash_file() declared -> bytes but the protocol and all callers expected ContentHash. Now it properly returns ContentHash(method=algorithm, digest=...). - Update hash_utils.hash_file to return ContentHash - Fix CachedFileHasher cache hit/miss to use .digest - Update generate_file_hashes.py to use .to_hex() - Remove defensive hasattr workarounds added in previous commit https://claude.ai/code/session_01PBeavr3pTFu5sPhWUxNx2r --- src/orcapod/hashing/file_hashers.py | 8 ++--- src/orcapod/hashing/hash_utils.py | 31 +++++++++++-------- .../semantic_struct_converters.py | 5 +-- tests/test_hashing/generate_file_hashes.py | 12 +++---- .../test_file_hashing_consistency.py | 7 ++--- 5 files changed, 31 insertions(+), 32 deletions(-) diff --git a/src/orcapod/hashing/file_hashers.py b/src/orcapod/hashing/file_hashers.py index 7dddfcc3..3dff6224 100644 --- a/src/orcapod/hashing/file_hashers.py +++ b/src/orcapod/hashing/file_hashers.py @@ -38,8 +38,8 @@ def hash_file(self, file_path: PathLike) -> ContentHash: cache_key = f"file:{file_path}" cached_value = self.string_cacher.get_cached(cache_key) if cached_value is not None: - return bytes.fromhex(cached_value) + return ContentHash(method="cached", digest=bytes.fromhex(cached_value)) - value = self.file_hasher.hash_file(file_path) - self.string_cacher.set_cached(cache_key, value.hex()) - return value + result = self.file_hasher.hash_file(file_path) + self.string_cacher.set_cached(cache_key, result.digest.hex()) + return result diff --git a/src/orcapod/hashing/hash_utils.py b/src/orcapod/hashing/hash_utils.py index 2fc43d8b..a42f44d1 100644 --- a/src/orcapod/hashing/hash_utils.py +++ b/src/orcapod/hashing/hash_utils.py @@ -7,6 +7,8 @@ import xxhash +from orcapod.types import ContentHash + logger = logging.getLogger(__name__) @@ -41,18 +43,18 @@ def combine_hashes( return combined_hash -def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> bytes: - """ - Calculate the hash of a file using the specified algorithm. +def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> ContentHash: + """Calculate the hash of a file using the specified algorithm. - Parameters: - file_path (str): Path to the file to hash - algorithm (str): Hash algorithm to use - options include: - 'md5', 'sha1', 'sha256', 'sha512', 'xxh64', 'crc32', 'hash_path' - buffer_size (int): Size of chunks to read from the file at a time + Args: + file_path: Path to the file to hash. + algorithm: Hash algorithm to use — options include: + 'md5', 'sha1', 'sha256', 'sha512', 'xxh64', 'crc32', 'hash_path'. + buffer_size: Size of chunks to read from the file at a time. Returns: - bytes: Raw digest bytes of the hash + A ContentHash with method set to the algorithm name and digest + containing the raw hash bytes. """ if not Path(file_path).is_file(): raise FileNotFoundError(f"The file {file_path} does not exist") @@ -61,7 +63,7 @@ def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> bytes: if algorithm == "hash_path": hasher = hashlib.sha256() hasher.update(file_path.encode("utf-8")) - return hasher.digest() + return ContentHash(method=algorithm, digest=hasher.digest()) if algorithm == "xxh64": hasher = xxhash.xxh64() @@ -71,7 +73,7 @@ def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> bytes: if not data: break hasher.update(data) - return hasher.digest() + return ContentHash(method=algorithm, digest=hasher.digest()) if algorithm == "crc32": crc = 0 @@ -81,7 +83,10 @@ def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> bytes: if not data: break crc = zlib.crc32(data, crc) - return (crc & 0xFFFFFFFF).to_bytes(4, byteorder="big") + return ContentHash( + method=algorithm, + digest=(crc & 0xFFFFFFFF).to_bytes(4, byteorder="big"), + ) try: hasher = hashlib.new(algorithm) @@ -98,7 +103,7 @@ def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> bytes: break hasher.update(data) - return hasher.digest() + return ContentHash(method=algorithm, digest=hasher.digest()) def _is_in_string(line: str, pos: int) -> bool: diff --git a/src/orcapod/semantic_types/semantic_struct_converters.py b/src/orcapod/semantic_types/semantic_struct_converters.py index b634214a..63d1e236 100644 --- a/src/orcapod/semantic_types/semantic_struct_converters.py +++ b/src/orcapod/semantic_types/semantic_struct_converters.py @@ -165,7 +165,4 @@ def hash_struct_dict( raise IsADirectoryError(f"Path is a directory: {path}") content_hash = self._file_hasher.hash_file(path) - # BasicFileHasher.hash_file returns raw bytes despite the protocol - # declaring ContentHash. Handle both cases defensively. - digest = content_hash.digest if hasattr(content_hash, "digest") else content_hash - return self._format_hash_string(digest, add_prefix=add_prefix) + return self._format_hash_string(content_hash.digest, add_prefix=add_prefix) diff --git a/tests/test_hashing/generate_file_hashes.py b/tests/test_hashing/generate_file_hashes.py index 0beb66c5..270c2894 100644 --- a/tests/test_hashing/generate_file_hashes.py +++ b/tests/test_hashing/generate_file_hashes.py @@ -63,12 +63,12 @@ def create_sample_files(): files_info.append( { "file": str(rel_filepath), - "hash": file_hash, + "hash": file_hash.to_hex(), "size_kb": size, "type": "text", } ) - print(f"Created text file: {filename} ({size} KB), Hash: {file_hash}") + print(f"Created text file: {filename} ({size} KB), Hash: {file_hash.to_hex()}") # Generate binary files of various sizes binary_sizes = [1, 5, 10, 50, 100] # sizes in KB @@ -88,12 +88,12 @@ def create_sample_files(): files_info.append( { "file": str(rel_filepath), - "hash": file_hash, + "hash": file_hash.to_hex(), "size_kb": size, "type": "binary", } ) - print(f"Created binary file: {filename} ({size} KB), Hash: {file_hash}") + print(f"Created binary file: {filename} ({size} KB), Hash: {file_hash.to_hex()}") # Create a structured file (JSON) json_filename = "sample_structured.json" @@ -116,8 +116,8 @@ def create_sample_files(): # Compute the hash json_hash = hash_file(json_filepath) - files_info.append({"file": str(rel_filepath), "hash": json_hash, "type": "json"}) - print(f"Created JSON file: {json_filename}, Hash: {json_hash}") + files_info.append({"file": str(rel_filepath), "hash": json_hash.to_hex(), "type": "json"}) + print(f"Created JSON file: {json_filename}, Hash: {json_hash.to_hex()}") return files_info diff --git a/tests/test_hashing/test_file_hashing_consistency.py b/tests/test_hashing/test_file_hashing_consistency.py index 99311fef..339298b8 100644 --- a/tests/test_hashing/test_file_hashing_consistency.py +++ b/tests/test_hashing/test_file_hashing_consistency.py @@ -191,11 +191,9 @@ def test_arrow_and_semantic_hash_same_file_content( arrow_hash_hex = path_converter.hash_struct_dict({"path": str(file)}) # Semantic path: file_hasher.hash_file directly (same as PathContentHandler) - # BasicFileHasher.hash_file returns raw bytes semantic_content_hash = file_hasher.hash_file(file) - semantic_hex = semantic_content_hash.hex() if isinstance(semantic_content_hash, bytes) else semantic_content_hash.digest.hex() - assert arrow_hash_hex == semantic_hex + assert arrow_hash_hex == semantic_content_hash.digest.hex() def test_arrow_and_semantic_same_content_two_files( self, path_converter, file_hasher, tmp_path @@ -209,7 +207,6 @@ def test_arrow_and_semantic_same_content_two_files( file2.write_text(content) arrow_hex = path_converter.hash_struct_dict({"path": str(file1)}) - raw = file_hasher.hash_file(file2) - semantic_hex = raw.hex() if isinstance(raw, bytes) else raw.digest.hex() + semantic_hex = file_hasher.hash_file(file2).digest.hex() assert arrow_hex == semantic_hex From 01c90f460047b65f46b4c1ae09fd40714deafccf Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 5 Mar 2026 11:30:53 +0000 Subject: [PATCH 4/6] fix(hashing): store full ContentHash string in CachedFileHasher cache Use ContentHash.to_string() / from_string() so the cached value preserves both the method name and digest bytes, instead of losing the method on cache hit. https://claude.ai/code/session_01PBeavr3pTFu5sPhWUxNx2r --- src/orcapod/hashing/file_hashers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/orcapod/hashing/file_hashers.py b/src/orcapod/hashing/file_hashers.py index 3dff6224..870dc47c 100644 --- a/src/orcapod/hashing/file_hashers.py +++ b/src/orcapod/hashing/file_hashers.py @@ -38,8 +38,8 @@ def hash_file(self, file_path: PathLike) -> ContentHash: cache_key = f"file:{file_path}" cached_value = self.string_cacher.get_cached(cache_key) if cached_value is not None: - return ContentHash(method="cached", digest=bytes.fromhex(cached_value)) + return ContentHash.from_string(cached_value) result = self.file_hasher.hash_file(file_path) - self.string_cacher.set_cached(cache_key, result.digest.hex()) + self.string_cacher.set_cached(cache_key, result.to_string()) return result From 8863a6f597eee095a2d783e1b87fb0c86389fe3b Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 5 Mar 2026 11:36:44 +0000 Subject: [PATCH 5/6] test(hashing): add file hasher return type and CachedFileHasher tests Add tests that would have caught the pre-existing bugs: - hash_file() returning raw bytes instead of ContentHash - CachedFileHasher losing the method name on cache hit - hash_path algorithm failing on Path objects (str() missing) Test coverage includes: - hash_utils.hash_file returns ContentHash for all algorithms - BasicFileHasher return type, determinism, content sensitivity - CachedFileHasher: cache miss delegates, cache hit preserves method and digest, cache stores full to_string() format, clear_cache forces rehash, stale cache behavior documented Also fixes hash_path algorithm to str()-ify Path before encoding. https://claude.ai/code/session_01PBeavr3pTFu5sPhWUxNx2r --- src/orcapod/hashing/hash_utils.py | 2 +- tests/test_hashing/test_file_hashers.py | 270 ++++++++++++++++++++++++ 2 files changed, 271 insertions(+), 1 deletion(-) create mode 100644 tests/test_hashing/test_file_hashers.py diff --git a/src/orcapod/hashing/hash_utils.py b/src/orcapod/hashing/hash_utils.py index a42f44d1..0addcb77 100644 --- a/src/orcapod/hashing/hash_utils.py +++ b/src/orcapod/hashing/hash_utils.py @@ -62,7 +62,7 @@ def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> ContentHash: # Hash the path string itself rather than file content if algorithm == "hash_path": hasher = hashlib.sha256() - hasher.update(file_path.encode("utf-8")) + hasher.update(str(file_path).encode("utf-8")) return ContentHash(method=algorithm, digest=hasher.digest()) if algorithm == "xxh64": diff --git a/tests/test_hashing/test_file_hashers.py b/tests/test_hashing/test_file_hashers.py new file mode 100644 index 00000000..90081f26 --- /dev/null +++ b/tests/test_hashing/test_file_hashers.py @@ -0,0 +1,270 @@ +""" +Tests for file hashing return types and CachedFileHasher behavior. + +These tests would have caught the pre-existing bug where hash_utils.hash_file() +returned raw bytes instead of ContentHash, and verify the CachedFileHasher +correctly round-trips ContentHash through the string cache. +""" + +from pathlib import Path + +import pytest + +from orcapod.hashing.file_hashers import BasicFileHasher, CachedFileHasher +from orcapod.hashing.hash_utils import hash_file +from orcapod.hashing.string_cachers import InMemoryCacher +from orcapod.types import ContentHash + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +def sample_file(tmp_path): + """Create a small sample file for hashing.""" + f = tmp_path / "sample.txt" + f.write_text("hello world") + return f + + +@pytest.fixture +def file_hasher(): + return BasicFileHasher(algorithm="sha256") + + +@pytest.fixture +def cached_file_hasher(file_hasher): + cacher = InMemoryCacher() + return CachedFileHasher(file_hasher=file_hasher, string_cacher=cacher) + + +# --------------------------------------------------------------------------- +# hash_utils.hash_file returns ContentHash +# --------------------------------------------------------------------------- + + +class TestHashFileReturnType: + """Tests that would have caught hash_file returning raw bytes.""" + + def test_hash_file_returns_content_hash(self, sample_file): + result = hash_file(sample_file) + assert isinstance(result, ContentHash), ( + f"hash_file should return ContentHash, got {type(result).__name__}" + ) + + def test_hash_file_has_method(self, sample_file): + result = hash_file(sample_file) + assert result.method == "sha256" + + def test_hash_file_has_digest_bytes(self, sample_file): + result = hash_file(sample_file) + assert isinstance(result.digest, bytes) + assert len(result.digest) == 32 # SHA-256 produces 32 bytes + + def test_hash_file_xxh64_returns_content_hash(self, sample_file): + result = hash_file(sample_file, algorithm="xxh64") + assert isinstance(result, ContentHash) + assert result.method == "xxh64" + + def test_hash_file_crc32_returns_content_hash(self, sample_file): + result = hash_file(sample_file, algorithm="crc32") + assert isinstance(result, ContentHash) + assert result.method == "crc32" + assert len(result.digest) == 4 # CRC32 produces 4 bytes + + def test_hash_file_hash_path_returns_content_hash(self, sample_file): + result = hash_file(sample_file, algorithm="hash_path") + assert isinstance(result, ContentHash) + assert result.method == "hash_path" + + def test_hash_file_to_string_round_trips(self, sample_file): + """ContentHash.to_string() / from_string() preserves method and digest.""" + original = hash_file(sample_file) + serialized = original.to_string() + restored = ContentHash.from_string(serialized) + assert restored.method == original.method + assert restored.digest == original.digest + + +# --------------------------------------------------------------------------- +# BasicFileHasher returns ContentHash +# --------------------------------------------------------------------------- + + +class TestBasicFileHasherReturnType: + """Tests that would have caught BasicFileHasher returning raw bytes.""" + + def test_returns_content_hash(self, file_hasher, sample_file): + result = file_hasher.hash_file(sample_file) + assert isinstance(result, ContentHash) + + def test_method_matches_algorithm(self, sample_file): + for algo in ("sha256", "md5"): + hasher = BasicFileHasher(algorithm=algo) + result = hasher.hash_file(sample_file) + assert result.method == algo + + def test_digest_is_bytes(self, file_hasher, sample_file): + result = file_hasher.hash_file(sample_file) + assert isinstance(result.digest, bytes) + + def test_deterministic(self, file_hasher, sample_file): + h1 = file_hasher.hash_file(sample_file) + h2 = file_hasher.hash_file(sample_file) + assert h1 == h2 + + def test_different_content_different_hash(self, file_hasher, tmp_path): + f1 = tmp_path / "a.txt" + f2 = tmp_path / "b.txt" + f1.write_text("aaa") + f2.write_text("bbb") + assert file_hasher.hash_file(f1) != file_hasher.hash_file(f2) + + def test_same_content_different_paths_same_hash(self, file_hasher, tmp_path): + f1 = tmp_path / "a.txt" + f2 = tmp_path / "b.txt" + f1.write_text("same") + f2.write_text("same") + assert file_hasher.hash_file(f1) == file_hasher.hash_file(f2) + + +# --------------------------------------------------------------------------- +# CachedFileHasher with InMemoryCacher +# --------------------------------------------------------------------------- + + +class TestCachedFileHasher: + """Tests for CachedFileHasher caching behavior and ContentHash preservation.""" + + def test_returns_content_hash(self, cached_file_hasher, sample_file): + result = cached_file_hasher.hash_file(sample_file) + assert isinstance(result, ContentHash) + + def test_cache_miss_delegates_to_inner_hasher(self, sample_file): + """On cache miss, result must match the inner BasicFileHasher.""" + inner = BasicFileHasher(algorithm="sha256") + cacher = InMemoryCacher() + cached = CachedFileHasher(file_hasher=inner, string_cacher=cacher) + + expected = inner.hash_file(sample_file) + actual = cached.hash_file(sample_file) + + assert actual.method == expected.method + assert actual.digest == expected.digest + + def test_cache_hit_returns_correct_content_hash(self, sample_file): + """On cache hit, the returned ContentHash must have correct method and digest.""" + inner = BasicFileHasher(algorithm="sha256") + cacher = InMemoryCacher() + cached = CachedFileHasher(file_hasher=inner, string_cacher=cacher) + + # First call populates cache + first = cached.hash_file(sample_file) + + # Second call should hit cache + second = cached.hash_file(sample_file) + + assert second.method == first.method + assert second.digest == first.digest + + def test_cache_stores_to_string_format(self, sample_file): + """The cache must store the full 'method:hex_digest' string.""" + inner = BasicFileHasher(algorithm="sha256") + cacher = InMemoryCacher() + cached = CachedFileHasher(file_hasher=inner, string_cacher=cacher) + + result = cached.hash_file(sample_file) + + # Inspect the raw cached value + cache_key = f"file:{sample_file}" + cached_value = cacher.get_cached(cache_key) + + assert cached_value is not None + assert cached_value == result.to_string() + # Should be in "method:hex_digest" format + assert ":" in cached_value + method, hex_digest = cached_value.split(":", 1) + assert method == "sha256" + assert hex_digest == result.digest.hex() + + def test_cache_hit_preserves_method_not_cached(self, sample_file): + """Cache hit must return the original method, not 'cached' or similar.""" + inner = BasicFileHasher(algorithm="sha256") + cacher = InMemoryCacher() + cached = CachedFileHasher(file_hasher=inner, string_cacher=cacher) + + # Populate cache + cached.hash_file(sample_file) + + # Cache hit + result = cached.hash_file(sample_file) + assert result.method == "sha256", ( + f"Cache hit should preserve original method, got '{result.method}'" + ) + + def test_cache_round_trip_with_from_string(self, sample_file): + """Manually verify the to_string / from_string round-trip used by the cache.""" + inner = BasicFileHasher(algorithm="sha256") + original = inner.hash_file(sample_file) + + serialized = original.to_string() + restored = ContentHash.from_string(serialized) + + assert restored.method == original.method + assert restored.digest == original.digest + + def test_different_algorithms_cached_independently(self, tmp_path): + """Two CachedFileHashers with different algorithms produce different cached results.""" + f = tmp_path / "file.txt" + f.write_text("test content") + + cacher = InMemoryCacher() + + sha_inner = BasicFileHasher(algorithm="sha256") + sha_cached = CachedFileHasher(file_hasher=sha_inner, string_cacher=cacher) + sha_result = sha_cached.hash_file(f) + + md5_inner = BasicFileHasher(algorithm="md5") + md5_cached = CachedFileHasher(file_hasher=md5_inner, string_cacher=cacher) + # Same cache key "file:" — the second call hits the cached sha256 value. + # This is a known limitation: the cache key doesn't include the algorithm. + # We test that the cached value at least round-trips correctly. + md5_result = md5_cached.hash_file(f) + + # Since same cache key, md5_cached gets the sha256 result from cache + # This documents current behavior — cache key should ideally include algorithm + assert md5_result.method == "sha256" # gets cached sha256 result + + def test_modified_content_after_cache(self, cached_file_hasher, tmp_path): + """Cache is NOT invalidated when file content changes (documents behavior). + + CachedFileHasher caches by path string, so a modified file still + returns the stale cached hash until the cache is cleared. + """ + f = tmp_path / "mutable.txt" + f.write_text("version 1") + first = cached_file_hasher.hash_file(f) + + f.write_text("version 2") + second = cached_file_hasher.hash_file(f) + + # Same because cached — documents expected caching behavior + assert first.digest == second.digest + + def test_clear_cache_forces_rehash(self, tmp_path): + """After clearing the cache, a modified file produces a new hash.""" + inner = BasicFileHasher(algorithm="sha256") + cacher = InMemoryCacher() + cached = CachedFileHasher(file_hasher=inner, string_cacher=cacher) + + f = tmp_path / "mutable.txt" + f.write_text("version 1") + first = cached.hash_file(f) + + f.write_text("version 2") + cacher.clear_cache() + second = cached.hash_file(f) + + assert first.digest != second.digest From 5e7e689eb03d4f0b6b5ca32679647b7e8fe854b3 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 5 Mar 2026 11:53:46 +0000 Subject: [PATCH 6/6] feat(hashing): add mtime+size cache busting to CachedFileHasher Include st_mtime_ns and st_size in the cache key so that modified files automatically produce a cache miss instead of returning stale hashes. Update tests to verify the new auto-invalidation behavior and fix flaky test by using different-length content to guarantee size changes. https://claude.ai/code/session_01PBeavr3pTFu5sPhWUxNx2r --- src/orcapod/hashing/file_hashers.py | 5 ++- tests/test_hashing/test_file_hashers.py | 46 ++++++++++++++----------- 2 files changed, 30 insertions(+), 21 deletions(-) diff --git a/src/orcapod/hashing/file_hashers.py b/src/orcapod/hashing/file_hashers.py index 870dc47c..7e82c063 100644 --- a/src/orcapod/hashing/file_hashers.py +++ b/src/orcapod/hashing/file_hashers.py @@ -1,3 +1,5 @@ +import os + from orcapod.hashing.hash_utils import hash_file from orcapod.protocols.hashing_protocols import ( FileContentHasherProtocol, @@ -35,7 +37,8 @@ def __init__( self.string_cacher = string_cacher def hash_file(self, file_path: PathLike) -> ContentHash: - cache_key = f"file:{file_path}" + stat = os.stat(file_path) + cache_key = f"file:{file_path}:{stat.st_mtime_ns}:{stat.st_size}" cached_value = self.string_cacher.get_cached(cache_key) if cached_value is not None: return ContentHash.from_string(cached_value) diff --git a/tests/test_hashing/test_file_hashers.py b/tests/test_hashing/test_file_hashers.py index 90081f26..606c92e6 100644 --- a/tests/test_hashing/test_file_hashers.py +++ b/tests/test_hashing/test_file_hashers.py @@ -171,14 +171,17 @@ def test_cache_hit_returns_correct_content_hash(self, sample_file): def test_cache_stores_to_string_format(self, sample_file): """The cache must store the full 'method:hex_digest' string.""" + import os + inner = BasicFileHasher(algorithm="sha256") cacher = InMemoryCacher() cached = CachedFileHasher(file_hasher=inner, string_cacher=cacher) result = cached.hash_file(sample_file) - # Inspect the raw cached value - cache_key = f"file:{sample_file}" + # Inspect the raw cached value — key includes mtime+size + stat = os.stat(sample_file) + cache_key = f"file:{sample_file}:{stat.st_mtime_ns}:{stat.st_size}" cached_value = cacher.get_cached(cache_key) assert cached_value is not None @@ -215,8 +218,12 @@ def test_cache_round_trip_with_from_string(self, sample_file): assert restored.method == original.method assert restored.digest == original.digest - def test_different_algorithms_cached_independently(self, tmp_path): - """Two CachedFileHashers with different algorithms produce different cached results.""" + def test_different_algorithms_share_cache_key(self, tmp_path): + """Two CachedFileHashers with different algorithms but same path+mtime+size + share the same cache key. The second hasher gets the first's cached result. + + This documents a known limitation: the cache key doesn't include the algorithm. + """ f = tmp_path / "file.txt" f.write_text("test content") @@ -228,30 +235,29 @@ def test_different_algorithms_cached_independently(self, tmp_path): md5_inner = BasicFileHasher(algorithm="md5") md5_cached = CachedFileHasher(file_hasher=md5_inner, string_cacher=cacher) - # Same cache key "file:" — the second call hits the cached sha256 value. - # This is a known limitation: the cache key doesn't include the algorithm. - # We test that the cached value at least round-trips correctly. md5_result = md5_cached.hash_file(f) - # Since same cache key, md5_cached gets the sha256 result from cache - # This documents current behavior — cache key should ideally include algorithm - assert md5_result.method == "sha256" # gets cached sha256 result + # Same cache key (same file, same mtime+size), so md5 gets sha256 result + assert md5_result.method == "sha256" + assert md5_result.digest == sha_result.digest - def test_modified_content_after_cache(self, cached_file_hasher, tmp_path): - """Cache is NOT invalidated when file content changes (documents behavior). + def test_modified_content_invalidates_cache(self, cached_file_hasher, tmp_path): + """Cache is automatically invalidated when file content changes. - CachedFileHasher caches by path string, so a modified file still - returns the stale cached hash until the cache is cleared. + CachedFileHasher includes mtime_ns and file size in the cache key, + so writing new content produces a cache miss and a fresh hash. """ f = tmp_path / "mutable.txt" - f.write_text("version 1") + f.write_text("short") first = cached_file_hasher.hash_file(f) - f.write_text("version 2") + # Use different-length content so file size changes even if mtime_ns + # doesn't advance (can happen on fast filesystems). + f.write_text("much longer content here") second = cached_file_hasher.hash_file(f) - # Same because cached — documents expected caching behavior - assert first.digest == second.digest + # Different because size changed → cache miss → rehashed + assert first.digest != second.digest def test_clear_cache_forces_rehash(self, tmp_path): """After clearing the cache, a modified file produces a new hash.""" @@ -260,10 +266,10 @@ def test_clear_cache_forces_rehash(self, tmp_path): cached = CachedFileHasher(file_hasher=inner, string_cacher=cacher) f = tmp_path / "mutable.txt" - f.write_text("version 1") + f.write_text("short") first = cached.hash_file(f) - f.write_text("version 2") + f.write_text("much longer content here") cacher.clear_cache() second = cached.hash_file(f)