-
Notifications
You must be signed in to change notification settings - Fork 5
Fix hash_file return type and wire file_hasher through converters #74
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
7333d08
2985200
0d1d135
01c90f4
8863a6f
5e7e689
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7,6 +7,8 @@ | |
|
|
||
| import xxhash | ||
|
|
||
| from orcapod.types import ContentHash | ||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
||
|
|
||
|
|
@@ -41,27 +43,27 @@ def combine_hashes( | |
| return combined_hash | ||
|
|
||
|
|
||
| def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> bytes: | ||
| """ | ||
| Calculate the hash of a file using the specified algorithm. | ||
| def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> ContentHash: | ||
| """Calculate the hash of a file using the specified algorithm. | ||
|
|
||
| Parameters: | ||
| file_path (str): Path to the file to hash | ||
| algorithm (str): Hash algorithm to use - options include: | ||
| 'md5', 'sha1', 'sha256', 'sha512', 'xxh64', 'crc32', 'hash_path' | ||
| buffer_size (int): Size of chunks to read from the file at a time | ||
| Args: | ||
| file_path: Path to the file to hash. | ||
| algorithm: Hash algorithm to use — options include: | ||
| 'md5', 'sha1', 'sha256', 'sha512', 'xxh64', 'crc32', 'hash_path'. | ||
| buffer_size: Size of chunks to read from the file at a time. | ||
|
|
||
| Returns: | ||
| bytes: Raw digest bytes of the hash | ||
| A ContentHash with method set to the algorithm name and digest | ||
| containing the raw hash bytes. | ||
| """ | ||
| if not Path(file_path).is_file(): | ||
| raise FileNotFoundError(f"The file {file_path} does not exist") | ||
|
|
||
| # Hash the path string itself rather than file content | ||
| if algorithm == "hash_path": | ||
| hasher = hashlib.sha256() | ||
| hasher.update(file_path.encode("utf-8")) | ||
| return hasher.digest() | ||
| hasher.update(str(file_path).encode("utf-8")) | ||
| return ContentHash(method=algorithm, digest=hasher.digest()) | ||
|
Comment on lines
59
to
+66
|
||
|
|
||
| if algorithm == "xxh64": | ||
| hasher = xxhash.xxh64() | ||
|
|
@@ -71,7 +73,7 @@ def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> bytes: | |
| if not data: | ||
| break | ||
| hasher.update(data) | ||
| return hasher.digest() | ||
| return ContentHash(method=algorithm, digest=hasher.digest()) | ||
|
|
||
| if algorithm == "crc32": | ||
| crc = 0 | ||
|
|
@@ -81,7 +83,10 @@ def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> bytes: | |
| if not data: | ||
| break | ||
| crc = zlib.crc32(data, crc) | ||
| return (crc & 0xFFFFFFFF).to_bytes(4, byteorder="big") | ||
| return ContentHash( | ||
| method=algorithm, | ||
| digest=(crc & 0xFFFFFFFF).to_bytes(4, byteorder="big"), | ||
| ) | ||
|
|
||
| try: | ||
| hasher = hashlib.new(algorithm) | ||
|
|
@@ -98,7 +103,7 @@ def hash_file(file_path, algorithm="sha256", buffer_size=65536) -> bytes: | |
| break | ||
| hasher.update(data) | ||
|
|
||
| return hasher.digest() | ||
| return ContentHash(method=algorithm, digest=hasher.digest()) | ||
|
|
||
|
|
||
| def _is_in_string(line: str, pos: int) -> bool: | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -13,6 +13,8 @@ | |||||||||||||
|
|
||||||||||||||
| if TYPE_CHECKING: | ||||||||||||||
| import pyarrow as pa | ||||||||||||||
|
|
||||||||||||||
| from orcapod.protocols.hashing_protocols import FileContentHasherProtocol | ||||||||||||||
| else: | ||||||||||||||
| pa = LazyModule("pyarrow") | ||||||||||||||
|
|
||||||||||||||
|
|
@@ -76,9 +78,10 @@ def _compute_content_hash(self, content: bytes) -> ContentHash: | |||||||||||||
| class PathStructConverter(SemanticStructConverterBase): | ||||||||||||||
| """Converter for pathlib.Path objects to/from semantic structs of form { path: "/value/of/path"}""" | ||||||||||||||
|
|
||||||||||||||
| def __init__(self): | ||||||||||||||
| def __init__(self, file_hasher: "FileContentHasherProtocol"): | ||||||||||||||
| super().__init__("path") | ||||||||||||||
| self._python_type = Path | ||||||||||||||
| self._file_hasher = file_hasher | ||||||||||||||
|
|
||||||||||||||
| # Define the Arrow struct type for paths | ||||||||||||||
| self._arrow_struct_type = pa.struct( | ||||||||||||||
|
|
@@ -134,3 +137,32 @@ def is_semantic_struct(self, struct_dict: dict[str, Any]) -> bool: | |||||||||||||
| return set(struct_dict.keys()) == {"path"} and isinstance( | ||||||||||||||
| struct_dict["path"], str | ||||||||||||||
| ) | ||||||||||||||
|
|
||||||||||||||
| def hash_struct_dict( | ||||||||||||||
| self, struct_dict: dict[str, Any], add_prefix: bool = False | ||||||||||||||
| ) -> str: | ||||||||||||||
| """Compute hash of a path semantic type by hashing the file content. | ||||||||||||||
|
|
||||||||||||||
| Args: | ||||||||||||||
| struct_dict: Dict with a "path" key containing a file path string. | ||||||||||||||
| add_prefix: If True, prefix with "path:sha256:...". | ||||||||||||||
|
|
||||||||||||||
| Returns: | ||||||||||||||
| Hash string of the file content. | ||||||||||||||
|
|
||||||||||||||
| Raises: | ||||||||||||||
| FileNotFoundError: If the path does not exist. | ||||||||||||||
| IsADirectoryError: If the path is a directory. | ||||||||||||||
| """ | ||||||||||||||
| path_str = struct_dict.get("path") | ||||||||||||||
| if path_str is None: | ||||||||||||||
| raise ValueError("Missing 'path' field in struct dict") | ||||||||||||||
|
|
||||||||||||||
| path = Path(path_str) | ||||||||||||||
| if not path.exists(): | ||||||||||||||
| raise FileNotFoundError(f"Path does not exist: {path}") | ||||||||||||||
| if path.is_dir(): | ||||||||||||||
| raise IsADirectoryError(f"Path is a directory: {path}") | ||||||||||||||
|
|
||||||||||||||
| content_hash = self._file_hasher.hash_file(path) | ||||||||||||||
|
||||||||||||||
| content_hash = self._file_hasher.hash_file(path) | |
| content_hash = self._file_hasher.hash_file(path) | |
| if add_prefix and content_hash.method != "sha256": | |
| raise ValueError( | |
| f"add_prefix=True requires a SHA-256 file hasher, but got '{content_hash.method}'" | |
| ) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
CachedFileHasher cache key does not include the hashing algorithm / hasher identity, so two CachedFileHasher instances with different algorithms but the same backing StringCacher can return an incorrect cached result (e.g., md5 hasher receiving a sha256 ContentHash). Include something like the underlying hasher's algorithm/method (or a stable hasher_id) in the cache_key to prevent cross-algorithm collisions.