Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 9 additions & 7 deletions src/orcapod/contexts/data/v0.1.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,21 @@
"context_key": "std:v0.1:default",
"version": "v0.1",
"description": "Initial stable release with basic Path semantic type support",
"file_hasher": {
"_class": "orcapod.hashing.file_hashers.BasicFileHasher",
"_config": {
"algorithm": "sha256"
}
},
"semantic_registry": {
"_class": "orcapod.semantic_types.semantic_registry.SemanticTypeRegistry",
"_config": {
"converters": {
"path": {
"_class": "orcapod.semantic_types.semantic_struct_converters.PathStructConverter",
"_config": {}
"_config": {
"file_hasher": {"_ref": "file_hasher"}
}
}
}
}
Expand All @@ -33,12 +41,6 @@
}
}
},
"file_hasher": {
"_class": "orcapod.hashing.file_hashers.BasicFileHasher",
"_config": {
"algorithm": "sha256"
}
},
"function_info_extractor": {
"_class": "orcapod.hashing.semantic_hashing.function_info_extractors.FunctionSignatureExtractor",
"_config": {
Expand Down
13 changes: 8 additions & 5 deletions src/orcapod/hashing/file_hashers.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import os

from orcapod.hashing.hash_utils import hash_file
from orcapod.protocols.hashing_protocols import (
FileContentHasherProtocol,
Expand Down Expand Up @@ -35,11 +37,12 @@ def __init__(
self.string_cacher = string_cacher

def hash_file(self, file_path: PathLike) -> ContentHash:
cache_key = f"file:{file_path}"
stat = os.stat(file_path)
cache_key = f"file:{file_path}:{stat.st_mtime_ns}:{stat.st_size}"
cached_value = self.string_cacher.get_cached(cache_key)
if cached_value is not None:
return bytes.fromhex(cached_value)
return ContentHash.from_string(cached_value)

value = self.file_hasher.hash_file(file_path)
self.string_cacher.set_cached(cache_key, value.hex())
return value
result = self.file_hasher.hash_file(file_path)
self.string_cacher.set_cached(cache_key, result.to_string())
return result
Comment on lines +40 to +48
Copy link

Copilot AI Mar 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CachedFileHasher cache key does not include the hashing algorithm / hasher identity, so two CachedFileHasher instances with different algorithms but the same backing StringCacher can return an incorrect cached result (e.g., md5 hasher receiving a sha256 ContentHash). Include something like the underlying hasher's algorithm/method (or a stable hasher_id) in the cache_key to prevent cross-algorithm collisions.

Copilot uses AI. Check for mistakes.
33 changes: 19 additions & 14 deletions src/orcapod/hashing/hash_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@

import xxhash

from orcapod.types import ContentHash

logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -41,27 +43,27 @@ def combine_hashes(
return combined_hash


def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> bytes:
"""
Calculate the hash of a file using the specified algorithm.
def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> ContentHash:
"""Calculate the hash of a file using the specified algorithm.

Parameters:
file_path (str): Path to the file to hash
algorithm (str): Hash algorithm to use - options include:
'md5', 'sha1', 'sha256', 'sha512', 'xxh64', 'crc32', 'hash_path'
buffer_size (int): Size of chunks to read from the file at a time
Args:
file_path: Path to the file to hash.
algorithm: Hash algorithm to use; options include:
'md5', 'sha1', 'sha256', 'sha512', 'xxh64', 'crc32', 'hash_path'.
buffer_size: Size of chunks to read from the file at a time.

Returns:
bytes: Raw digest bytes of the hash
A ContentHash with method set to the algorithm name and digest
containing the raw hash bytes.
"""
if not Path(file_path).is_file():
raise FileNotFoundError(f"The file {file_path} does not exist")

# Hash the path string itself rather than file content
if algorithm == "hash_path":
hasher = hashlib.sha256()
hasher.update(file_path.encode("utf-8"))
return hasher.digest()
hasher.update(str(file_path).encode("utf-8"))
return ContentHash(method=algorithm, digest=hasher.digest())
Comment on lines 59 to +66
Copy link

Copilot AI Mar 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hash_file() checks Path(file_path).is_file() before handling algorithm=='hash_path'. That makes the 'hash_path' mode unusable for non-existent paths (even though it’s supposed to hash the path string itself) and it also raises a misleading FileNotFoundError for directories. Consider moving the 'hash_path' branch before the is_file() check, and splitting validation so directories raise IsADirectoryError (or a clearer error) while missing paths raise FileNotFoundError.

Copilot uses AI. Check for mistakes.

if algorithm == "xxh64":
hasher = xxhash.xxh64()
Expand All @@ -71,7 +73,7 @@ def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> bytes:
if not data:
break
hasher.update(data)
return hasher.digest()
return ContentHash(method=algorithm, digest=hasher.digest())

if algorithm == "crc32":
crc = 0
Expand All @@ -81,7 +83,10 @@ def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> bytes:
if not data:
break
crc = zlib.crc32(data, crc)
return (crc & 0xFFFFFFFF).to_bytes(4, byteorder="big")
return ContentHash(
method=algorithm,
digest=(crc & 0xFFFFFFFF).to_bytes(4, byteorder="big"),
)

try:
hasher = hashlib.new(algorithm)
Expand All @@ -98,7 +103,7 @@ def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> bytes:
break
hasher.update(data)

return hasher.digest()
return ContentHash(method=algorithm, digest=hasher.digest())


def _is_in_string(line: str, pos: int) -> bool:
Expand Down
4 changes: 3 additions & 1 deletion src/orcapod/hashing/versioned_hashers.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ def get_versioned_semantic_arrow_hasher(
A fully configured SemanticArrowHasher instance.
"""
from orcapod.hashing.arrow_hashers import SemanticArrowHasher
from orcapod.hashing.file_hashers import BasicFileHasher
from orcapod.semantic_types.semantic_registry import SemanticTypeRegistry
from orcapod.semantic_types.semantic_struct_converters import PathStructConverter

Expand All @@ -132,7 +133,8 @@ def get_versioned_semantic_arrow_hasher(
# that arise from the protocol definition of SemanticStructConverterProtocol having
# a slightly different hash_struct_dict signature than the concrete class.
registry: Any = SemanticTypeRegistry()
path_converter: Any = PathStructConverter()
file_hasher = BasicFileHasher(algorithm="sha256")
path_converter: Any = PathStructConverter(file_hasher=file_hasher)
registry.register_converter("path", path_converter)

logger.debug(
Expand Down
34 changes: 33 additions & 1 deletion src/orcapod/semantic_types/semantic_struct_converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@

if TYPE_CHECKING:
import pyarrow as pa

from orcapod.protocols.hashing_protocols import FileContentHasherProtocol
else:
pa = LazyModule("pyarrow")

Expand Down Expand Up @@ -76,9 +78,10 @@ def _compute_content_hash(self, content: bytes) -> ContentHash:
class PathStructConverter(SemanticStructConverterBase):
"""Converter for pathlib.Path objects to/from semantic structs of form { path: "/value/of/path"}"""

def __init__(self):
def __init__(self, file_hasher: "FileContentHasherProtocol"):
super().__init__("path")
self._python_type = Path
self._file_hasher = file_hasher

# Define the Arrow struct type for paths
self._arrow_struct_type = pa.struct(
Expand Down Expand Up @@ -134,3 +137,32 @@ def is_semantic_struct(self, struct_dict: dict[str, Any]) -> bool:
return set(struct_dict.keys()) == {"path"} and isinstance(
struct_dict["path"], str
)

def hash_struct_dict(
self, struct_dict: dict[str, Any], add_prefix: bool = False
) -> str:
"""Compute hash of a path semantic type by hashing the file content.

Args:
struct_dict: Dict with a "path" key containing a file path string.
add_prefix: If True, prefix with "path:sha256:...".

Returns:
Hash string of the file content.

Raises:
FileNotFoundError: If the path does not exist.
IsADirectoryError: If the path is a directory.
"""
path_str = struct_dict.get("path")
if path_str is None:
raise ValueError("Missing 'path' field in struct dict")

path = Path(path_str)
if not path.exists():
raise FileNotFoundError(f"Path does not exist: {path}")
if path.is_dir():
raise IsADirectoryError(f"Path is a directory: {path}")

content_hash = self._file_hasher.hash_file(path)
Copy link

Copilot AI Mar 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PathStructConverter.hash_struct_dict() delegates to an injected file_hasher, but the prefix formatting is hard-coded to 'sha256' via _format_hash_string() and the docstring says 'path:sha256:...'. If a non-sha256 file_hasher is injected, add_prefix will lie about the algorithm. Either enforce/validate that file_hasher is sha256 here, or build the prefix from the returned ContentHash.method (and adjust _format_hash_string accordingly).

Suggested change
content_hash = self._file_hasher.hash_file(path)
content_hash = self._file_hasher.hash_file(path)
if add_prefix and content_hash.method != "sha256":
raise ValueError(
f"add_prefix=True requires a SHA-256 file hasher, but got '{content_hash.method}'"
)

Copilot uses AI. Check for mistakes.
return self._format_hash_string(content_hash.digest, add_prefix=add_prefix)
12 changes: 6 additions & 6 deletions tests/test_hashing/generate_file_hashes.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,12 @@ def create_sample_files():
files_info.append(
{
"file": str(rel_filepath),
"hash": file_hash,
"hash": file_hash.to_hex(),
"size_kb": size,
"type": "text",
}
)
print(f"Created text file: {filename} ({size} KB), Hash: {file_hash}")
print(f"Created text file: {filename} ({size} KB), Hash: {file_hash.to_hex()}")

# Generate binary files of various sizes
binary_sizes = [1, 5, 10, 50, 100] # sizes in KB
Expand All @@ -88,12 +88,12 @@ def create_sample_files():
files_info.append(
{
"file": str(rel_filepath),
"hash": file_hash,
"hash": file_hash.to_hex(),
"size_kb": size,
"type": "binary",
}
)
print(f"Created binary file: {filename} ({size} KB), Hash: {file_hash}")
print(f"Created binary file: {filename} ({size} KB), Hash: {file_hash.to_hex()}")

# Create a structured file (JSON)
json_filename = "sample_structured.json"
Expand All @@ -116,8 +116,8 @@ def create_sample_files():
# Compute the hash
json_hash = hash_file(json_filepath)

files_info.append({"file": str(rel_filepath), "hash": json_hash, "type": "json"})
print(f"Created JSON file: {json_filename}, Hash: {json_hash}")
files_info.append({"file": str(rel_filepath), "hash": json_hash.to_hex(), "type": "json"})
print(f"Created JSON file: {json_filename}, Hash: {json_hash.to_hex()}")

return files_info

Expand Down
Loading