diff --git a/src/orcapod/contexts/data/v0.1.json b/src/orcapod/contexts/data/v0.1.json index 41a1aa03..0b3fee9d 100644 --- a/src/orcapod/contexts/data/v0.1.json +++ b/src/orcapod/contexts/data/v0.1.json @@ -2,13 +2,21 @@ "context_key": "std:v0.1:default", "version": "v0.1", "description": "Initial stable release with basic Path semantic type support", + "file_hasher": { + "_class": "orcapod.hashing.file_hashers.BasicFileHasher", + "_config": { + "algorithm": "sha256" + } + }, "semantic_registry": { "_class": "orcapod.semantic_types.semantic_registry.SemanticTypeRegistry", "_config": { "converters": { "path": { "_class": "orcapod.semantic_types.semantic_struct_converters.PathStructConverter", - "_config": {} + "_config": { + "file_hasher": {"_ref": "file_hasher"} + } } } } @@ -33,12 +41,6 @@ } } }, - "file_hasher": { - "_class": "orcapod.hashing.file_hashers.BasicFileHasher", - "_config": { - "algorithm": "sha256" - } - }, "function_info_extractor": { "_class": "orcapod.hashing.semantic_hashing.function_info_extractors.FunctionSignatureExtractor", "_config": { diff --git a/src/orcapod/hashing/file_hashers.py b/src/orcapod/hashing/file_hashers.py index 7dddfcc3..7e82c063 100644 --- a/src/orcapod/hashing/file_hashers.py +++ b/src/orcapod/hashing/file_hashers.py @@ -1,3 +1,5 @@ +import os + from orcapod.hashing.hash_utils import hash_file from orcapod.protocols.hashing_protocols import ( FileContentHasherProtocol, @@ -35,11 +37,12 @@ def __init__( self.string_cacher = string_cacher def hash_file(self, file_path: PathLike) -> ContentHash: - cache_key = f"file:{file_path}" + stat = os.stat(file_path) + cache_key = f"file:{file_path}:{stat.st_mtime_ns}:{stat.st_size}" cached_value = self.string_cacher.get_cached(cache_key) if cached_value is not None: - return bytes.fromhex(cached_value) + return ContentHash.from_string(cached_value) - value = self.file_hasher.hash_file(file_path) - self.string_cacher.set_cached(cache_key, value.hex()) - return value + result = self.file_hasher.hash_file(file_path) + self.string_cacher.set_cached(cache_key, result.to_string()) + return result diff --git a/src/orcapod/hashing/hash_utils.py b/src/orcapod/hashing/hash_utils.py index 2fc43d8b..0addcb77 100644 --- a/src/orcapod/hashing/hash_utils.py +++ b/src/orcapod/hashing/hash_utils.py @@ -7,6 +7,8 @@ import xxhash +from orcapod.types import ContentHash + logger = logging.getLogger(__name__) @@ -41,18 +43,18 @@ def combine_hashes( return combined_hash -def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> bytes: - """ - Calculate the hash of a file using the specified algorithm. +def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> ContentHash: + """Calculate the hash of a file using the specified algorithm. - Parameters: - file_path (str): Path to the file to hash - algorithm (str): Hash algorithm to use - options include: - 'md5', 'sha1', 'sha256', 'sha512', 'xxh64', 'crc32', 'hash_path' - buffer_size (int): Size of chunks to read from the file at a time + Args: + file_path: Path to the file to hash. + algorithm: Hash algorithm to use — options include: + 'md5', 'sha1', 'sha256', 'sha512', 'xxh64', 'crc32', 'hash_path'. + buffer_size: Size of chunks to read from the file at a time. Returns: - bytes: Raw digest bytes of the hash + A ContentHash with method set to the algorithm name and digest + containing the raw hash bytes. """ if not Path(file_path).is_file(): raise FileNotFoundError(f"The file {file_path} does not exist") @@ -60,8 +62,8 @@ def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> bytes: # Hash the path string itself rather than file content if algorithm == "hash_path": hasher = hashlib.sha256() - hasher.update(file_path.encode("utf-8")) - return hasher.digest() + hasher.update(str(file_path).encode("utf-8")) + return ContentHash(method=algorithm, digest=hasher.digest()) if algorithm == "xxh64": hasher = xxhash.xxh64() @@ -71,7 +73,7 @@ def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> bytes: if not data: break hasher.update(data) - return hasher.digest() + return ContentHash(method=algorithm, digest=hasher.digest()) if algorithm == "crc32": crc = 0 @@ -81,7 +83,10 @@ def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> bytes: if not data: break crc = zlib.crc32(data, crc) - return (crc & 0xFFFFFFFF).to_bytes(4, byteorder="big") + return ContentHash( + method=algorithm, + digest=(crc & 0xFFFFFFFF).to_bytes(4, byteorder="big"), + ) try: hasher = hashlib.new(algorithm) @@ -98,7 +103,7 @@ def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> bytes: break hasher.update(data) - return hasher.digest() + return ContentHash(method=algorithm, digest=hasher.digest()) def _is_in_string(line: str, pos: int) -> bool: diff --git a/src/orcapod/hashing/versioned_hashers.py b/src/orcapod/hashing/versioned_hashers.py index fa76bb11..8adce44d 100644 --- a/src/orcapod/hashing/versioned_hashers.py +++ b/src/orcapod/hashing/versioned_hashers.py @@ -124,6 +124,7 @@ def get_versioned_semantic_arrow_hasher( A fully configured SemanticArrowHasher instance. """ from orcapod.hashing.arrow_hashers import SemanticArrowHasher + from orcapod.hashing.file_hashers import BasicFileHasher from orcapod.semantic_types.semantic_registry import SemanticTypeRegistry from orcapod.semantic_types.semantic_struct_converters import PathStructConverter @@ -132,7 +133,8 @@ def get_versioned_semantic_arrow_hasher( # that arise from the protocol definition of SemanticStructConverterProtocol having # a slightly different hash_struct_dict signature than the concrete class. registry: Any = SemanticTypeRegistry() - path_converter: Any = PathStructConverter() + file_hasher = BasicFileHasher(algorithm="sha256") + path_converter: Any = PathStructConverter(file_hasher=file_hasher) registry.register_converter("path", path_converter) logger.debug( diff --git a/src/orcapod/semantic_types/semantic_struct_converters.py b/src/orcapod/semantic_types/semantic_struct_converters.py index a7effd1f..63d1e236 100644 --- a/src/orcapod/semantic_types/semantic_struct_converters.py +++ b/src/orcapod/semantic_types/semantic_struct_converters.py @@ -13,6 +13,8 @@ if TYPE_CHECKING: import pyarrow as pa + + from orcapod.protocols.hashing_protocols import FileContentHasherProtocol else: pa = LazyModule("pyarrow") @@ -76,9 +78,10 @@ def _compute_content_hash(self, content: bytes) -> ContentHash: class PathStructConverter(SemanticStructConverterBase): """Converter for pathlib.Path objects to/from semantic structs of form { path: "/value/of/path"}""" - def __init__(self): + def __init__(self, file_hasher: "FileContentHasherProtocol"): super().__init__("path") self._python_type = Path + self._file_hasher = file_hasher # Define the Arrow struct type for paths self._arrow_struct_type = pa.struct( @@ -134,3 +137,32 @@ def is_semantic_struct(self, struct_dict: dict[str, Any]) -> bool: return set(struct_dict.keys()) == {"path"} and isinstance( struct_dict["path"], str ) + + def hash_struct_dict( + self, struct_dict: dict[str, Any], add_prefix: bool = False + ) -> str: + """Compute hash of a path semantic type by hashing the file content. + + Args: + struct_dict: Dict with a "path" key containing a file path string. + add_prefix: If True, prefix with "path:sha256:...". + + Returns: + Hash string of the file content. + + Raises: + FileNotFoundError: If the path does not exist. + IsADirectoryError: If the path is a directory. + """ + path_str = struct_dict.get("path") + if path_str is None: + raise ValueError("Missing 'path' field in struct dict") + + path = Path(path_str) + if not path.exists(): + raise FileNotFoundError(f"Path does not exist: {path}") + if path.is_dir(): + raise IsADirectoryError(f"Path is a directory: {path}") + + content_hash = self._file_hasher.hash_file(path) + return self._format_hash_string(content_hash.digest, add_prefix=add_prefix) diff --git a/tests/test_hashing/generate_file_hashes.py b/tests/test_hashing/generate_file_hashes.py index 0beb66c5..270c2894 100644 --- a/tests/test_hashing/generate_file_hashes.py +++ b/tests/test_hashing/generate_file_hashes.py @@ -63,12 +63,12 @@ def create_sample_files(): files_info.append( { "file": str(rel_filepath), - "hash": file_hash, + "hash": file_hash.to_hex(), "size_kb": size, "type": "text", } ) - print(f"Created text file: {filename} ({size} KB), Hash: {file_hash}") + print(f"Created text file: {filename} ({size} KB), Hash: {file_hash.to_hex()}") # Generate binary files of various sizes binary_sizes = [1, 5, 10, 50, 100] # sizes in KB @@ -88,12 +88,12 @@ def create_sample_files(): files_info.append( { "file": str(rel_filepath), - "hash": file_hash, + "hash": file_hash.to_hex(), "size_kb": size, "type": "binary", } ) - print(f"Created binary file: {filename} ({size} KB), Hash: {file_hash}") + print(f"Created binary file: {filename} ({size} KB), Hash: {file_hash.to_hex()}") # Create a structured file (JSON) json_filename = "sample_structured.json" @@ -116,8 +116,8 @@ def create_sample_files(): # Compute the hash json_hash = hash_file(json_filepath) - files_info.append({"file": str(rel_filepath), "hash": json_hash, "type": "json"}) - print(f"Created JSON file: {json_filename}, Hash: {json_hash}") + files_info.append({"file": str(rel_filepath), "hash": json_hash.to_hex(), "type": "json"}) + print(f"Created JSON file: {json_filename}, Hash: {json_hash.to_hex()}") return files_info diff --git a/tests/test_hashing/test_file_hashers.py b/tests/test_hashing/test_file_hashers.py new file mode 100644 index 00000000..606c92e6 --- /dev/null +++ b/tests/test_hashing/test_file_hashers.py @@ -0,0 +1,276 @@ +""" +Tests for file hashing return types and CachedFileHasher behavior. + +These tests would have caught the pre-existing bug where hash_utils.hash_file() +returned raw bytes instead of ContentHash, and verify the CachedFileHasher +correctly round-trips ContentHash through the string cache. +""" + +from pathlib import Path + +import pytest + +from orcapod.hashing.file_hashers import BasicFileHasher, CachedFileHasher +from orcapod.hashing.hash_utils import hash_file +from orcapod.hashing.string_cachers import InMemoryCacher +from orcapod.types import ContentHash + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +def sample_file(tmp_path): + """Create a small sample file for hashing.""" + f = tmp_path / "sample.txt" + f.write_text("hello world") + return f + + +@pytest.fixture +def file_hasher(): + return BasicFileHasher(algorithm="sha256") + + +@pytest.fixture +def cached_file_hasher(file_hasher): + cacher = InMemoryCacher() + return CachedFileHasher(file_hasher=file_hasher, string_cacher=cacher) + + +# --------------------------------------------------------------------------- +# hash_utils.hash_file returns ContentHash +# --------------------------------------------------------------------------- + + +class TestHashFileReturnType: + """Tests that would have caught hash_file returning raw bytes.""" + + def test_hash_file_returns_content_hash(self, sample_file): + result = hash_file(sample_file) + assert isinstance(result, ContentHash), ( + f"hash_file should return ContentHash, got {type(result).__name__}" + ) + + def test_hash_file_has_method(self, sample_file): + result = hash_file(sample_file) + assert result.method == "sha256" + + def test_hash_file_has_digest_bytes(self, sample_file): + result = hash_file(sample_file) + assert isinstance(result.digest, bytes) + assert len(result.digest) == 32 # SHA-256 produces 32 bytes + + def test_hash_file_xxh64_returns_content_hash(self, sample_file): + result = hash_file(sample_file, algorithm="xxh64") + assert isinstance(result, ContentHash) + assert result.method == "xxh64" + + def test_hash_file_crc32_returns_content_hash(self, sample_file): + result = hash_file(sample_file, algorithm="crc32") + assert isinstance(result, ContentHash) + assert result.method == "crc32" + assert len(result.digest) == 4 # CRC32 produces 4 bytes + + def test_hash_file_hash_path_returns_content_hash(self, sample_file): + result = hash_file(sample_file, algorithm="hash_path") + assert isinstance(result, ContentHash) + assert result.method == "hash_path" + + def test_hash_file_to_string_round_trips(self, sample_file): + """ContentHash.to_string() / from_string() preserves method and digest.""" + original = hash_file(sample_file) + serialized = original.to_string() + restored = ContentHash.from_string(serialized) + assert restored.method == original.method + assert restored.digest == original.digest + + +# --------------------------------------------------------------------------- +# BasicFileHasher returns ContentHash +# --------------------------------------------------------------------------- + + +class TestBasicFileHasherReturnType: + """Tests that would have caught BasicFileHasher returning raw bytes.""" + + def test_returns_content_hash(self, file_hasher, sample_file): + result = file_hasher.hash_file(sample_file) + assert isinstance(result, ContentHash) + + def test_method_matches_algorithm(self, sample_file): + for algo in ("sha256", "md5"): + hasher = BasicFileHasher(algorithm=algo) + result = hasher.hash_file(sample_file) + assert result.method == algo + + def test_digest_is_bytes(self, file_hasher, sample_file): + result = file_hasher.hash_file(sample_file) + assert isinstance(result.digest, bytes) + + def test_deterministic(self, file_hasher, sample_file): + h1 = file_hasher.hash_file(sample_file) + h2 = file_hasher.hash_file(sample_file) + assert h1 == h2 + + def test_different_content_different_hash(self, file_hasher, tmp_path): + f1 = tmp_path / "a.txt" + f2 = tmp_path / "b.txt" + f1.write_text("aaa") + f2.write_text("bbb") + assert file_hasher.hash_file(f1) != file_hasher.hash_file(f2) + + def test_same_content_different_paths_same_hash(self, file_hasher, tmp_path): + f1 = tmp_path / "a.txt" + f2 = tmp_path / "b.txt" + f1.write_text("same") + f2.write_text("same") + assert file_hasher.hash_file(f1) == file_hasher.hash_file(f2) + + +# --------------------------------------------------------------------------- +# CachedFileHasher with InMemoryCacher +# --------------------------------------------------------------------------- + + +class TestCachedFileHasher: + """Tests for CachedFileHasher caching behavior and ContentHash preservation.""" + + def test_returns_content_hash(self, cached_file_hasher, sample_file): + result = cached_file_hasher.hash_file(sample_file) + assert isinstance(result, ContentHash) + + def test_cache_miss_delegates_to_inner_hasher(self, sample_file): + """On cache miss, result must match the inner BasicFileHasher.""" + inner = BasicFileHasher(algorithm="sha256") + cacher = InMemoryCacher() + cached = CachedFileHasher(file_hasher=inner, string_cacher=cacher) + + expected = inner.hash_file(sample_file) + actual = cached.hash_file(sample_file) + + assert actual.method == expected.method + assert actual.digest == expected.digest + + def test_cache_hit_returns_correct_content_hash(self, sample_file): + """On cache hit, the returned ContentHash must have correct method and digest.""" + inner = BasicFileHasher(algorithm="sha256") + cacher = InMemoryCacher() + cached = CachedFileHasher(file_hasher=inner, string_cacher=cacher) + + # First call populates cache + first = cached.hash_file(sample_file) + + # Second call should hit cache + second = cached.hash_file(sample_file) + + assert second.method == first.method + assert second.digest == first.digest + + def test_cache_stores_to_string_format(self, sample_file): + """The cache must store the full 'method:hex_digest' string.""" + import os + + inner = BasicFileHasher(algorithm="sha256") + cacher = InMemoryCacher() + cached = CachedFileHasher(file_hasher=inner, string_cacher=cacher) + + result = cached.hash_file(sample_file) + + # Inspect the raw cached value — key includes mtime+size + stat = os.stat(sample_file) + cache_key = f"file:{sample_file}:{stat.st_mtime_ns}:{stat.st_size}" + cached_value = cacher.get_cached(cache_key) + + assert cached_value is not None + assert cached_value == result.to_string() + # Should be in "method:hex_digest" format + assert ":" in cached_value + method, hex_digest = cached_value.split(":", 1) + assert method == "sha256" + assert hex_digest == result.digest.hex() + + def test_cache_hit_preserves_method_not_cached(self, sample_file): + """Cache hit must return the original method, not 'cached' or similar.""" + inner = BasicFileHasher(algorithm="sha256") + cacher = InMemoryCacher() + cached = CachedFileHasher(file_hasher=inner, string_cacher=cacher) + + # Populate cache + cached.hash_file(sample_file) + + # Cache hit + result = cached.hash_file(sample_file) + assert result.method == "sha256", ( + f"Cache hit should preserve original method, got '{result.method}'" + ) + + def test_cache_round_trip_with_from_string(self, sample_file): + """Manually verify the to_string / from_string round-trip used by the cache.""" + inner = BasicFileHasher(algorithm="sha256") + original = inner.hash_file(sample_file) + + serialized = original.to_string() + restored = ContentHash.from_string(serialized) + + assert restored.method == original.method + assert restored.digest == original.digest + + def test_different_algorithms_share_cache_key(self, tmp_path): + """Two CachedFileHashers with different algorithms but same path+mtime+size + share the same cache key. The second hasher gets the first's cached result. + + This documents a known limitation: the cache key doesn't include the algorithm. + """ + f = tmp_path / "file.txt" + f.write_text("test content") + + cacher = InMemoryCacher() + + sha_inner = BasicFileHasher(algorithm="sha256") + sha_cached = CachedFileHasher(file_hasher=sha_inner, string_cacher=cacher) + sha_result = sha_cached.hash_file(f) + + md5_inner = BasicFileHasher(algorithm="md5") + md5_cached = CachedFileHasher(file_hasher=md5_inner, string_cacher=cacher) + md5_result = md5_cached.hash_file(f) + + # Same cache key (same file, same mtime+size), so md5 gets sha256 result + assert md5_result.method == "sha256" + assert md5_result.digest == sha_result.digest + + def test_modified_content_invalidates_cache(self, cached_file_hasher, tmp_path): + """Cache is automatically invalidated when file content changes. + + CachedFileHasher includes mtime_ns and file size in the cache key, + so writing new content produces a cache miss and a fresh hash. + """ + f = tmp_path / "mutable.txt" + f.write_text("short") + first = cached_file_hasher.hash_file(f) + + # Use different-length content so file size changes even if mtime_ns + # doesn't advance (can happen on fast filesystems). + f.write_text("much longer content here") + second = cached_file_hasher.hash_file(f) + + # Different because size changed → cache miss → rehashed + assert first.digest != second.digest + + def test_clear_cache_forces_rehash(self, tmp_path): + """After clearing the cache, a modified file produces a new hash.""" + inner = BasicFileHasher(algorithm="sha256") + cacher = InMemoryCacher() + cached = CachedFileHasher(file_hasher=inner, string_cacher=cacher) + + f = tmp_path / "mutable.txt" + f.write_text("short") + first = cached.hash_file(f) + + f.write_text("much longer content here") + cacher.clear_cache() + second = cached.hash_file(f) + + assert first.digest != second.digest diff --git a/tests/test_hashing/test_file_hashing_consistency.py b/tests/test_hashing/test_file_hashing_consistency.py new file mode 100644 index 00000000..339298b8 --- /dev/null +++ b/tests/test_hashing/test_file_hashing_consistency.py @@ -0,0 +1,212 @@ +""" +Integration tests verifying that file hashing is consistent across both paths: + +1. **Arrow hasher path**: SemanticArrowHasher processes an Arrow table containing a + path struct column → calls PathStructConverter.hash_struct_dict → file_hasher. +2. **Semantic hasher path**: BaseSemanticHasher hashes a Python Path object → + calls PathContentHandler.handle → file_hasher. + +Both paths must delegate to the same FileContentHasherProtocol so that identical +file content always produces identical hashes, regardless of entry point. +""" + +from pathlib import Path + +import pyarrow as pa +import pytest + +from orcapod.hashing.arrow_hashers import SemanticArrowHasher +from orcapod.hashing.file_hashers import BasicFileHasher +from orcapod.hashing.semantic_hashing.builtin_handlers import ( + PathContentHandler, + register_builtin_handlers, +) +from orcapod.hashing.semantic_hashing.semantic_hasher import BaseSemanticHasher +from orcapod.hashing.semantic_hashing.type_handler_registry import TypeHandlerRegistry +from orcapod.semantic_types.semantic_registry import SemanticTypeRegistry +from orcapod.semantic_types.semantic_struct_converters import PathStructConverter + + +# --------------------------------------------------------------------------- +# Shared fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +def file_hasher(): + """Single file hasher instance shared by both paths.""" + return BasicFileHasher(algorithm="sha256") + + +@pytest.fixture +def path_converter(file_hasher): + return PathStructConverter(file_hasher=file_hasher) + + +@pytest.fixture +def arrow_hasher(path_converter): + """SemanticArrowHasher wired with the shared file_hasher via PathStructConverter.""" + registry = SemanticTypeRegistry() + registry.register_converter("path", path_converter) + return SemanticArrowHasher(semantic_registry=registry) + + +@pytest.fixture +def semantic_hasher(file_hasher): + """BaseSemanticHasher wired with the shared file_hasher via PathContentHandler.""" + registry = TypeHandlerRegistry() + register_builtin_handlers(registry, file_hasher=file_hasher) + return BaseSemanticHasher( + hasher_id="test_v1", type_handler_registry=registry, strict=True + ) + + +# --------------------------------------------------------------------------- +# Arrow struct hasher: path column tests +# --------------------------------------------------------------------------- + + +class TestArrowStructPathHashing: + """Tests for file hashing through the Arrow hasher path.""" + + def test_same_content_different_paths_same_hash( + self, arrow_hasher, tmp_path + ): + """Two distinct files with identical content produce the same table hash.""" + file1 = tmp_path / "a.txt" + file2 = tmp_path / "b.txt" + file1.write_text("identical content") + file2.write_text("identical content") + + table1 = pa.table( + {"file": [{"path": str(file1)}]}, + schema=pa.schema([pa.field("file", pa.struct([pa.field("path", pa.large_string())]))]), + ) + table2 = pa.table( + {"file": [{"path": str(file2)}]}, + schema=pa.schema([pa.field("file", pa.struct([pa.field("path", pa.large_string())]))]), + ) + + hash1 = arrow_hasher.hash_table(table1) + hash2 = arrow_hasher.hash_table(table2) + assert hash1.digest == hash2.digest + + def test_modified_content_different_hash(self, arrow_hasher, tmp_path): + """Same path with modified content between hashes yields different hash.""" + file = tmp_path / "mutable.txt" + file.write_text("version 1") + + schema = pa.schema([pa.field("file", pa.struct([pa.field("path", pa.large_string())]))]) + table_v1 = pa.table({"file": [{"path": str(file)}]}, schema=schema) + hash1 = arrow_hasher.hash_table(table_v1) + + file.write_text("version 2") + table_v2 = pa.table({"file": [{"path": str(file)}]}, schema=schema) + hash2 = arrow_hasher.hash_table(table_v2) + + assert hash1.digest != hash2.digest + + def test_different_content_different_hash(self, arrow_hasher, tmp_path): + """Two files with different content produce different table hashes.""" + file1 = tmp_path / "x.txt" + file2 = tmp_path / "y.txt" + file1.write_text("content A") + file2.write_text("content B") + + schema = pa.schema([pa.field("file", pa.struct([pa.field("path", pa.large_string())]))]) + table1 = pa.table({"file": [{"path": str(file1)}]}, schema=schema) + table2 = pa.table({"file": [{"path": str(file2)}]}, schema=schema) + + hash1 = arrow_hasher.hash_table(table1) + hash2 = arrow_hasher.hash_table(table2) + assert hash1.digest != hash2.digest + + +# --------------------------------------------------------------------------- +# Semantic hasher: Path object tests +# --------------------------------------------------------------------------- + + +class TestSemanticPathHashing: + """Tests for file hashing through the semantic hasher path.""" + + def test_same_content_different_paths_same_hash( + self, semantic_hasher, tmp_path + ): + """Two distinct Path objects pointing to files with identical content.""" + file1 = tmp_path / "a.txt" + file2 = tmp_path / "b.txt" + file1.write_text("identical content") + file2.write_text("identical content") + + hash1 = semantic_hasher.hash_object(Path(file1)) + hash2 = semantic_hasher.hash_object(Path(file2)) + assert hash1.digest == hash2.digest + + def test_modified_content_different_hash(self, semantic_hasher, tmp_path): + """Same Path with modified content between hashes.""" + file = tmp_path / "mutable.txt" + file.write_text("version 1") + hash1 = semantic_hasher.hash_object(Path(file)) + + file.write_text("version 2") + hash2 = semantic_hasher.hash_object(Path(file)) + assert hash1.digest != hash2.digest + + def test_different_content_different_hash(self, semantic_hasher, tmp_path): + """Two Paths pointing to different content produce different hashes.""" + file1 = tmp_path / "x.txt" + file2 = tmp_path / "y.txt" + file1.write_text("content A") + file2.write_text("content B") + + hash1 = semantic_hasher.hash_object(Path(file1)) + hash2 = semantic_hasher.hash_object(Path(file2)) + assert hash1.digest != hash2.digest + + +# --------------------------------------------------------------------------- +# Cross-path consistency +# --------------------------------------------------------------------------- + + +class TestCrossPathConsistency: + """Verify that the arrow hasher and semantic hasher use the same file_hasher + and produce equivalent file content hashes for the same underlying file.""" + + def test_arrow_and_semantic_hash_same_file_content( + self, path_converter, semantic_hasher, file_hasher, tmp_path + ): + """The file content hash extracted by PathStructConverter.hash_struct_dict + must match the ContentHash produced by PathContentHandler.handle (which + the semantic hasher uses internally for Path objects). + + We compare at the file_hasher level: both paths ultimately call + file_hasher.hash_file(path), so the raw digest must be identical. + """ + file = tmp_path / "shared.txt" + file.write_text("shared content for both paths") + + # Arrow path: PathStructConverter.hash_struct_dict (no prefix) + arrow_hash_hex = path_converter.hash_struct_dict({"path": str(file)}) + + # Semantic path: file_hasher.hash_file directly (same as PathContentHandler) + semantic_content_hash = file_hasher.hash_file(file) + + assert arrow_hash_hex == semantic_content_hash.digest.hex() + + def test_arrow_and_semantic_same_content_two_files( + self, path_converter, file_hasher, tmp_path + ): + """Two files with identical content: arrow struct hash_struct_dict and + direct file_hasher.hash_file produce the same digest.""" + file1 = tmp_path / "file_arrow.txt" + file2 = tmp_path / "file_semantic.txt" + content = "same content for cross-path test" + file1.write_text(content) + file2.write_text(content) + + arrow_hex = path_converter.hash_struct_dict({"path": str(file1)}) + semantic_hex = file_hasher.hash_file(file2).digest.hex() + + assert arrow_hex == semantic_hex diff --git a/tests/test_semantic_types/test_path_struct_converter.py b/tests/test_semantic_types/test_path_struct_converter.py index d6e12644..73ae46e6 100644 --- a/tests/test_semantic_types/test_path_struct_converter.py +++ b/tests/test_semantic_types/test_path_struct_converter.py @@ -1,14 +1,23 @@ from pathlib import Path from typing import cast -from unittest.mock import patch import pytest +from orcapod.hashing.file_hashers import BasicFileHasher from orcapod.semantic_types.semantic_struct_converters import PathStructConverter -def test_path_to_struct_and_back(): - converter = PathStructConverter() +@pytest.fixture +def file_hasher(): + return BasicFileHasher(algorithm="sha256") + + +@pytest.fixture +def converter(file_hasher): + return PathStructConverter(file_hasher=file_hasher) + + +def test_path_to_struct_and_back(converter): path_obj = Path("/tmp/test.txt") struct_dict = converter.python_to_struct_dict(path_obj) assert struct_dict["path"] == str(path_obj) @@ -16,26 +25,22 @@ def test_path_to_struct_and_back(): assert restored == path_obj -def test_path_to_struct_invalid_type(): - converter = PathStructConverter() +def test_path_to_struct_invalid_type(converter): with pytest.raises(TypeError): converter.python_to_struct_dict("not_a_path") # type: ignore -def test_struct_to_python_missing_field(): - converter = PathStructConverter() +def test_struct_to_python_missing_field(converter): with pytest.raises(ValueError): converter.struct_dict_to_python({}) -def test_can_handle_python_type(): - converter = PathStructConverter() +def test_can_handle_python_type(converter): assert converter.can_handle_python_type(Path) assert not converter.can_handle_python_type(str) -def test_can_handle_struct_type(): - converter = PathStructConverter() +def test_can_handle_struct_type(converter): struct_type = converter.arrow_struct_type assert converter.can_handle_struct_type(struct_type) @@ -60,62 +65,68 @@ def names(self): assert not converter.can_handle_struct_type(fake_struct) -def test_is_semantic_struct(): - converter = PathStructConverter() +def test_is_semantic_struct(converter): assert converter.is_semantic_struct({"path": "/tmp/test.txt"}) assert not converter.is_semantic_struct({"not_path": "value"}) assert not converter.is_semantic_struct({"path": 123}) -# def test_hash_struct_dict_file_not_found(tmp_path): -# converter = PathStructConverter() -# struct_dict = {"path": str(tmp_path / "does_not_exist.txt")} -# with pytest.raises(FileNotFoundError): -# converter.hash_struct_dict(struct_dict) - - -# def test_hash_struct_dict_permission_error(tmp_path): -# converter = PathStructConverter() -# file_path = tmp_path / "file.txt" -# file_path.write_text("data") -# with patch("pathlib.Path.read_bytes", side_effect=PermissionError): -# struct_dict = {"path": str(file_path)} -# with pytest.raises(PermissionError): -# converter.hash_struct_dict(struct_dict) - - -# def test_hash_struct_dict_is_directory(tmp_path): -# converter = PathStructConverter() -# struct_dict = {"path": str(tmp_path)} -# with pytest.raises(ValueError): -# converter.hash_struct_dict(struct_dict) - - -# def test_hash_struct_dict_content_based(tmp_path): -# converter = PathStructConverter() -# file1 = tmp_path / "file1.txt" -# file2 = tmp_path / "file2.txt" -# content = "identical content" -# file1.write_text(content) -# file2.write_text(content) -# struct_dict1 = {"path": str(file1)} -# struct_dict2 = {"path": str(file2)} -# hash1 = converter.hash_struct_dict(struct_dict1) -# hash2 = converter.hash_struct_dict(struct_dict2) -# assert hash1 == hash2 - - -# def test_hash_path_objects_content_based(tmp_path): -# converter = PathStructConverter() -# file1 = tmp_path / "fileA.txt" -# file2 = tmp_path / "fileB.txt" -# content = "same file content" -# file1.write_text(content) -# file2.write_text(content) -# path_obj1 = Path(file1) -# path_obj2 = Path(file2) -# struct_dict1 = converter.python_to_struct_dict(path_obj1) -# struct_dict2 = converter.python_to_struct_dict(path_obj2) -# hash1 = converter.hash_struct_dict(struct_dict1) -# hash2 = converter.hash_struct_dict(struct_dict2) -# assert hash1 == hash2 +def test_hash_struct_dict_file_not_found(converter, tmp_path): + struct_dict = {"path": str(tmp_path / "does_not_exist.txt")} + with pytest.raises(FileNotFoundError): + converter.hash_struct_dict(struct_dict) + + +def test_hash_struct_dict_is_directory(converter, tmp_path): + struct_dict = {"path": str(tmp_path)} + with pytest.raises(IsADirectoryError): + converter.hash_struct_dict(struct_dict) + + +def test_hash_struct_dict_content_based(converter, tmp_path): + """Two distinct files with identical content produce the same hash.""" + file1 = tmp_path / "file1.txt" + file2 = tmp_path / "file2.txt" + content = "identical content" + file1.write_text(content) + file2.write_text(content) + hash1 = converter.hash_struct_dict({"path": str(file1)}) + hash2 = converter.hash_struct_dict({"path": str(file2)}) + assert hash1 == hash2 + + +def test_hash_path_objects_content_based(converter, tmp_path): + """Round-trip through python_to_struct_dict then hash_struct_dict.""" + file1 = tmp_path / "fileA.txt" + file2 = tmp_path / "fileB.txt" + content = "same file content" + file1.write_text(content) + file2.write_text(content) + struct_dict1 = converter.python_to_struct_dict(Path(file1)) + struct_dict2 = converter.python_to_struct_dict(Path(file2)) + hash1 = converter.hash_struct_dict(struct_dict1) + hash2 = converter.hash_struct_dict(struct_dict2) + assert hash1 == hash2 + + +def test_hash_struct_dict_with_prefix(converter, tmp_path): + """Prefixed hash starts with 'path:sha256:'.""" + file = tmp_path / "file.txt" + file.write_text("hello") + hash_str = converter.hash_struct_dict({"path": str(file)}, add_prefix=True) + assert hash_str.startswith("path:sha256:") + + +def test_hash_struct_dict_different_content(converter, tmp_path): + """Same path with modified content yields a different hash.""" + file = tmp_path / "mutable.txt" + file.write_text("version 1") + hash1 = converter.hash_struct_dict({"path": str(file)}) + file.write_text("version 2") + hash2 = converter.hash_struct_dict({"path": str(file)}) + assert hash1 != hash2 + + +def test_hash_struct_dict_missing_path_field(converter): + with pytest.raises(ValueError, match="Missing 'path' field"): + converter.hash_struct_dict({})