Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,33 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).


## [0.17.12](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.17.12) - 2026-02-23

### Added
- `Dataset.deduplicate()` method to deduplicate images using perceptual hashing. Accepts optional `reference_ids` to deduplicate specific items, or deduplicates the entire dataset when only `threshold` is provided. Required `threshold` parameter (0-64) controls similarity matching (lower = stricter, 0 = exact matches only).
- `Dataset.deduplicate_by_ids()` method for deduplication using internal `dataset_item_ids` directly, avoiding the reference ID to item ID mapping for improved efficiency.
- `DeduplicationResult` and `DeduplicationStats` dataclasses for structured deduplication results.

Example usage:

```python
dataset = client.get_dataset("ds_...")

# Deduplicate entire dataset
result = dataset.deduplicate(threshold=10)

# Deduplicate specific items by reference IDs
result = dataset.deduplicate(threshold=10, reference_ids=["ref_1", "ref_2", "ref_3"])

# Deduplicate by internal item IDs (more efficient if you have them)
result = dataset.deduplicate_by_ids(threshold=10, dataset_item_ids=["item_1", "item_2"])

# Access results
print(f"Threshold: {result.stats.threshold}")
print(f"Original: {result.stats.original_count}, Unique: {result.stats.deduplicated_count}")
print(result.unique_reference_ids)
```

## [0.17.11](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.17.11) - 2025-11-03

### Added
Expand Down
3 changes: 3 additions & 0 deletions nucleus/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
"AsyncJob",
"EmbeddingsExportJob",
"BoxAnnotation",
"DeduplicationResult",
"DeduplicationStats",
"BoxPrediction",
"CameraParams",
"CategoryAnnotation",
Expand Down Expand Up @@ -128,6 +130,7 @@
from .data_transfer_object.job_status import JobInfoRequestPayload
from .dataset import Dataset
from .dataset_item import DatasetItem
from .deduplication import DeduplicationResult, DeduplicationStats
from .deprecation_warning import deprecated
from .errors import (
DatasetItemRetrievalError,
Expand Down
1 change: 1 addition & 0 deletions nucleus/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@
SLICE_TAGS_KEY = "slice_tags"
TAXONOMY_NAME_KEY = "taxonomy_name"
TASK_ID_KEY = "task_id"
THRESHOLD_KEY = "threshold"
TRACK_REFERENCE_ID_KEY = "track_reference_id"
TRACK_REFERENCE_IDS_KEY = "track_reference_ids"
TRACKS_KEY = "tracks"
Expand Down
112 changes: 112 additions & 0 deletions nucleus/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@
REQUEST_ID_KEY,
SCENE_IDS_KEY,
SLICE_ID_KEY,
THRESHOLD_KEY,
TRACK_REFERENCE_IDS_KEY,
TRACKS_KEY,
TRAINED_SLICE_ID_KEY,
Expand All @@ -83,6 +84,7 @@
check_items_have_dimensions,
)
from .dataset_item_uploader import DatasetItemUploader
from .deduplication import DeduplicationResult, DeduplicationStats
from .deprecation_warning import deprecated
from .errors import NotFoundError, NucleusAPIError
from .job import CustomerJobTypes, jobs_status_overview
Expand Down Expand Up @@ -1006,6 +1008,116 @@ def create_slice_by_ids(
)
return Slice(response[SLICE_ID_KEY], self._client)

def deduplicate(
    self,
    threshold: int,
    reference_ids: Optional[List[str]] = None,
) -> DeduplicationResult:
    """Find duplicate images or frames among items addressed by reference ID.

    With ``reference_ids`` omitted (or ``None``) the entire dataset is
    deduplicated; otherwise only the listed items are considered. The IDs
    accepted here are the user-defined reference IDs assigned at upload
    time (e.g. "image_001", "frame_xyz"). To address items by internal
    Nucleus item IDs instead, use :meth:`deduplicate_by_ids`.

    Parameters:
        threshold: Hamming distance threshold between 0 and 64 inclusive.
            Lower values are stricter; 0 keeps exact matches only.
        reference_ids: Optional user-defined reference IDs restricting the
            run to a subset of items. ``None`` (the default) means the
            whole dataset. An empty list is rejected — pass ``None`` to
            deduplicate everything.

    Returns:
        DeduplicationResult carrying ``unique_reference_ids``,
        ``unique_item_ids``, and ``stats``.

    Raises:
        ValueError: If ``reference_ids`` is an empty list (use ``None``
            for the entire dataset).
        NucleusAPIError: If ``threshold`` is not an integer in [0, 64],
            if any reference ID is not found in the dataset, or if any
            item is missing a perceptual hash (pHash) — contact Scale
            support in that last case.

    Note:
        - For scene datasets this operates on the underlying scene
          frames, not the scenes themselves; supply frame-level
          reference IDs or dataset item IDs.
        - Very large datasets may take significant time.
    """
    # Reject an explicit empty list up front; None means "whole dataset".
    if reference_ids is not None and not reference_ids:
        raise ValueError(
            "reference_ids cannot be empty. Omit reference_ids parameter to deduplicate entire dataset."
        )

    request_payload: Dict[str, Any] = {THRESHOLD_KEY: threshold}
    if reference_ids is not None:
        request_payload[REFERENCE_IDS_KEY] = reference_ids

    response = self._client.make_request(
        request_payload, f"dataset/{self.id}/deduplicate"
    )

    response_stats = response["stats"]
    stats = DeduplicationStats(
        threshold=threshold,
        original_count=response_stats["original_count"],
        deduplicated_count=response_stats["deduplicated_count"],
    )
    return DeduplicationResult(
        unique_item_ids=response["unique_item_ids"],
        unique_reference_ids=response["unique_reference_ids"],
        stats=stats,
    )

def deduplicate_by_ids(
    self,
    threshold: int,
    dataset_item_ids: List[str],
) -> DeduplicationResult:
    """Find duplicate images or frames addressed by internal Nucleus item IDs.

    Items are identified by the system-assigned Nucleus dataset item IDs
    (e.g. "di_abc123..."), not by user-defined reference IDs. To use your
    own reference IDs, or to deduplicate the entire dataset, call
    :meth:`deduplicate` instead.

    Parameters:
        threshold: Hamming distance threshold between 0 and 64 inclusive.
            Lower values are stricter; 0 keeps exact matches only.
        dataset_item_ids: Internal Nucleus dataset item IDs to
            deduplicate. Generated by Nucleus at upload time; these are
            not user-defined reference IDs. Must be non-empty.

    Returns:
        DeduplicationResult carrying ``unique_item_ids``,
        ``unique_reference_ids``, and ``stats``.

    Raises:
        ValueError: If ``dataset_item_ids`` is empty.
        NucleusAPIError: If ``threshold`` is not an integer in [0, 64],
            if any dataset item ID is not found in the dataset, or if any
            item is missing a perceptual hash (pHash) — contact Scale
            support in that last case.
    """
    # An empty selection is a caller error; deduplicate() handles the
    # whole-dataset case.
    if not dataset_item_ids:
        raise ValueError(
            "dataset_item_ids must be non-empty. Use deduplicate() for entire dataset."
        )

    request_payload = {
        THRESHOLD_KEY: threshold,
        DATASET_ITEM_IDS_KEY: dataset_item_ids,
    }
    response = self._client.make_request(
        request_payload, f"dataset/{self.id}/deduplicate"
    )

    response_stats = response["stats"]
    stats = DeduplicationStats(
        threshold=threshold,
        original_count=response_stats["original_count"],
        deduplicated_count=response_stats["deduplicated_count"],
    )
    return DeduplicationResult(
        unique_item_ids=response["unique_item_ids"],
        unique_reference_ids=response["unique_reference_ids"],
        stats=stats,
    )

def build_slice(
self,
name: str,
Expand Down
16 changes: 16 additions & 0 deletions nucleus/deduplication.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from dataclasses import dataclass
from typing import List


@dataclass
class DeduplicationStats:
    """Summary statistics for a single deduplication run."""

    threshold: int  # Hamming distance threshold (0-64) the run was executed with
    original_count: int  # number of items considered before deduplication
    deduplicated_count: int  # number of unique items remaining after deduplication


@dataclass
class DeduplicationResult:
    """Result of a deduplication run: the surviving (unique) items plus stats."""

    unique_item_ids: List[str]  # Internal dataset item IDs
    unique_reference_ids: List[str]  # User-defined reference IDs
    stats: DeduplicationStats  # counts and the threshold used for this run
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ ignore = ["E501", "E741", "E731", "F401"] # Easy ignore for getting it running

[tool.poetry]
name = "scale-nucleus"
version = "0.17.11"
version = "0.17.12"
description = "The official Python client library for Nucleus, the Data Platform for AI"
license = "MIT"
authors = ["Scale AI Nucleus Team <nucleusapi@scaleapi.com>"]
Expand Down
2 changes: 2 additions & 0 deletions tests/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
EVAL_FUNCTION_THRESHOLD = 0.5
EVAL_FUNCTION_COMPARISON = ThresholdComparison.GREATER_THAN_EQUAL_TO

DEDUP_DEFAULT_TEST_THRESHOLD = 10


TEST_IMG_URLS = [
"https://github.com/scaleapi/nucleus-python-client/raw/master/tests/testdata/airplane.jpeg",
Expand Down
Loading