Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 9 additions & 7 deletions src/orcapod/contexts/data/v0.1.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,21 @@
"context_key": "std:v0.1:default",
"version": "v0.1",
"description": "Initial stable release with basic Path semantic type support",
"file_hasher": {
"_class": "orcapod.hashing.file_hashers.BasicFileHasher",
"_config": {
"algorithm": "sha256"
}
},
"semantic_registry": {
"_class": "orcapod.semantic_types.semantic_registry.SemanticTypeRegistry",
"_config": {
"converters": {
"path": {
"_class": "orcapod.semantic_types.semantic_struct_converters.PathStructConverter",
"_config": {}
"_config": {
"file_hasher": {"_ref": "file_hasher"}
}
}
}
}
Expand All @@ -33,12 +41,6 @@
}
}
},
"file_hasher": {
"_class": "orcapod.hashing.file_hashers.BasicFileHasher",
"_config": {
"algorithm": "sha256"
}
},
"function_info_extractor": {
"_class": "orcapod.hashing.semantic_hashing.function_info_extractors.FunctionSignatureExtractor",
"_config": {
Expand Down
13 changes: 8 additions & 5 deletions src/orcapod/hashing/file_hashers.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import os

from orcapod.hashing.hash_utils import hash_file
from orcapod.protocols.hashing_protocols import (
FileContentHasherProtocol,
Expand Down Expand Up @@ -35,11 +37,12 @@ def __init__(
self.string_cacher = string_cacher

def hash_file(self, file_path: PathLike) -> ContentHash:
cache_key = f"file:{file_path}"
stat = os.stat(file_path)
cache_key = f"file:{file_path}:{stat.st_mtime_ns}:{stat.st_size}"
cached_value = self.string_cacher.get_cached(cache_key)
if cached_value is not None:
return bytes.fromhex(cached_value)
return ContentHash.from_string(cached_value)

value = self.file_hasher.hash_file(file_path)
self.string_cacher.set_cached(cache_key, value.hex())
return value
result = self.file_hasher.hash_file(file_path)
self.string_cacher.set_cached(cache_key, result.to_string())
return result
Comment on lines +40 to +48
Copy link

Copilot AI Mar 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CachedFileHasher cache key does not include the hashing algorithm / hasher identity, so two CachedFileHasher instances with different algorithms but the same backing StringCacher can return an incorrect cached result (e.g., md5 hasher receiving a sha256 ContentHash). Include something like the underlying hasher's algorithm/method (or a stable hasher_id) in the cache_key to prevent cross-algorithm collisions.

Copilot uses AI. Check for mistakes.
33 changes: 19 additions & 14 deletions src/orcapod/hashing/hash_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@

import xxhash

from orcapod.types import ContentHash

logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -41,27 +43,27 @@ def combine_hashes(
return combined_hash


def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> bytes:
"""
Calculate the hash of a file using the specified algorithm.
def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> ContentHash:
"""Calculate the hash of a file using the specified algorithm.

Parameters:
file_path (str): Path to the file to hash
algorithm (str): Hash algorithm to use - options include:
'md5', 'sha1', 'sha256', 'sha512', 'xxh64', 'crc32', 'hash_path'
buffer_size (int): Size of chunks to read from the file at a time
Args:
file_path: Path to the file to hash.
algorithm: Hash algorithm to use; options include:
'md5', 'sha1', 'sha256', 'sha512', 'xxh64', 'crc32', 'hash_path'.
buffer_size: Size of chunks to read from the file at a time.

Returns:
bytes: Raw digest bytes of the hash
A ContentHash with method set to the algorithm name and digest
containing the raw hash bytes.
"""
if not Path(file_path).is_file():
raise FileNotFoundError(f"The file {file_path} does not exist")

# Hash the path string itself rather than file content
if algorithm == "hash_path":
hasher = hashlib.sha256()
hasher.update(file_path.encode("utf-8"))
return hasher.digest()
hasher.update(str(file_path).encode("utf-8"))
return ContentHash(method=algorithm, digest=hasher.digest())
Comment on lines 59 to +66
Copy link

Copilot AI Mar 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hash_file() checks Path(file_path).is_file() before handling algorithm=='hash_path'. That makes the 'hash_path' mode unusable for non-existent paths (even though it’s supposed to hash the path string itself) and it also raises a misleading FileNotFoundError for directories. Consider moving the 'hash_path' branch before the is_file() check, and splitting validation so directories raise IsADirectoryError (or a clearer error) while missing paths raise FileNotFoundError.

Copilot uses AI. Check for mistakes.

if algorithm == "xxh64":
hasher = xxhash.xxh64()
Expand All @@ -71,7 +73,7 @@ def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> bytes:
if not data:
break
hasher.update(data)
return hasher.digest()
return ContentHash(method=algorithm, digest=hasher.digest())

if algorithm == "crc32":
crc = 0
Expand All @@ -81,7 +83,10 @@ def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> bytes:
if not data:
break
crc = zlib.crc32(data, crc)
return (crc & 0xFFFFFFFF).to_bytes(4, byteorder="big")
return ContentHash(
method=algorithm,
digest=(crc & 0xFFFFFFFF).to_bytes(4, byteorder="big"),
)

try:
hasher = hashlib.new(algorithm)
Expand All @@ -98,7 +103,7 @@ def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> bytes:
break
hasher.update(data)

return hasher.digest()
return ContentHash(method=algorithm, digest=hasher.digest())


def _is_in_string(line: str, pos: int) -> bool:
Expand Down
4 changes: 3 additions & 1 deletion src/orcapod/hashing/versioned_hashers.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ def get_versioned_semantic_arrow_hasher(
A fully configured SemanticArrowHasher instance.
"""
from orcapod.hashing.arrow_hashers import SemanticArrowHasher
from orcapod.hashing.file_hashers import BasicFileHasher
from orcapod.semantic_types.semantic_registry import SemanticTypeRegistry
from orcapod.semantic_types.semantic_struct_converters import PathStructConverter

Expand All @@ -132,7 +133,8 @@ def get_versioned_semantic_arrow_hasher(
# that arise from the protocol definition of SemanticStructConverterProtocol having
# a slightly different hash_struct_dict signature than the concrete class.
registry: Any = SemanticTypeRegistry()
path_converter: Any = PathStructConverter()
file_hasher = BasicFileHasher(algorithm="sha256")
path_converter: Any = PathStructConverter(file_hasher=file_hasher)
registry.register_converter("path", path_converter)

logger.debug(
Expand Down
34 changes: 33 additions & 1 deletion src/orcapod/semantic_types/semantic_struct_converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@

if TYPE_CHECKING:
import pyarrow as pa

from orcapod.protocols.hashing_protocols import FileContentHasherProtocol
else:
pa = LazyModule("pyarrow")

Expand Down Expand Up @@ -76,9 +78,10 @@ def _compute_content_hash(self, content: bytes) -> ContentHash:
class PathStructConverter(SemanticStructConverterBase):
"""Converter for pathlib.Path objects to/from semantic structs of form { path: "/value/of/path"}"""

def __init__(self):
def __init__(self, file_hasher: "FileContentHasherProtocol"):
super().__init__("path")
self._python_type = Path
self._file_hasher = file_hasher

# Define the Arrow struct type for paths
self._arrow_struct_type = pa.struct(
Expand Down Expand Up @@ -134,3 +137,32 @@ def is_semantic_struct(self, struct_dict: dict[str, Any]) -> bool:
return set(struct_dict.keys()) == {"path"} and isinstance(
struct_dict["path"], str
)

def hash_struct_dict(
self, struct_dict: dict[str, Any], add_prefix: bool = False
) -> str:
"""Compute hash of a path semantic type by hashing the file content.

Args:
struct_dict: Dict with a "path" key containing a file path string.
add_prefix: If True, prefix with "path:sha256:...".

Returns:
Hash string of the file content.

Raises:
FileNotFoundError: If the path does not exist.
IsADirectoryError: If the path is a directory.
"""
path_str = struct_dict.get("path")
if path_str is None:
raise ValueError("Missing 'path' field in struct dict")

path = Path(path_str)
if not path.exists():
raise FileNotFoundError(f"Path does not exist: {path}")
if path.is_dir():
raise IsADirectoryError(f"Path is a directory: {path}")

content_hash = self._file_hasher.hash_file(path)
Copy link

Copilot AI Mar 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PathStructConverter.hash_struct_dict() delegates to an injected file_hasher, but the prefix formatting is hard-coded to 'sha256' via _format_hash_string() and the docstring says 'path:sha256:...'. If a non-sha256 file_hasher is injected, add_prefix will lie about the algorithm. Either enforce/validate that file_hasher is sha256 here, or build the prefix from the returned ContentHash.method (and adjust _format_hash_string accordingly).

Suggested change
content_hash = self._file_hasher.hash_file(path)
content_hash = self._file_hasher.hash_file(path)
if add_prefix and content_hash.method != "sha256":
raise ValueError(
f"add_prefix=True requires a SHA-256 file hasher, but got '{content_hash.method}'"
)

Copilot uses AI. Check for mistakes.
return self._format_hash_string(content_hash.digest, add_prefix=add_prefix)
12 changes: 6 additions & 6 deletions tests/test_hashing/generate_file_hashes.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,12 @@ def create_sample_files():
files_info.append(
{
"file": str(rel_filepath),
"hash": file_hash,
"hash": file_hash.to_hex(),
"size_kb": size,
"type": "text",
}
)
print(f"Created text file: {filename} ({size} KB), Hash: {file_hash}")
print(f"Created text file: {filename} ({size} KB), Hash: {file_hash.to_hex()}")

# Generate binary files of various sizes
binary_sizes = [1, 5, 10, 50, 100] # sizes in KB
Expand All @@ -88,12 +88,12 @@ def create_sample_files():
files_info.append(
{
"file": str(rel_filepath),
"hash": file_hash,
"hash": file_hash.to_hex(),
"size_kb": size,
"type": "binary",
}
)
print(f"Created binary file: {filename} ({size} KB), Hash: {file_hash}")
print(f"Created binary file: {filename} ({size} KB), Hash: {file_hash.to_hex()}")

# Create a structured file (JSON)
json_filename = "sample_structured.json"
Expand All @@ -116,8 +116,8 @@ def create_sample_files():
# Compute the hash
json_hash = hash_file(json_filepath)

files_info.append({"file": str(rel_filepath), "hash": json_hash, "type": "json"})
print(f"Created JSON file: {json_filename}, Hash: {json_hash}")
files_info.append({"file": str(rel_filepath), "hash": json_hash.to_hex(), "type": "json"})
print(f"Created JSON file: {json_filename}, Hash: {json_hash.to_hex()}")

return files_info

Expand Down
Loading