From b8ff5162d787a173c0a8739fc3be1090964063bf Mon Sep 17 00:00:00 2001 From: Edwin Pavlovsky Date: Mon, 23 Feb 2026 15:50:34 -0500 Subject: [PATCH 01/19] Enable deduplication in nucleus sdk --- CHANGELOG.md | 27 +++++++++++ nucleus/__init__.py | 3 ++ nucleus/constants.py | 1 + nucleus/dataset.py | 102 +++++++++++++++++++++++++++++++++++++++ nucleus/deduplication.py | 16 ++++++ pyproject.toml | 2 +- 6 files changed, 150 insertions(+), 1 deletion(-) create mode 100644 nucleus/deduplication.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 29f59d10..1a4305fd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,33 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.17.12](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.17.12) - 2026-02-23 + +### Added +- `Dataset.deduplicate()` method to deduplicate images using perceptual hashing. Accepts optional `reference_ids` to deduplicate specific items, or deduplicates the entire dataset when only `threshold` is provided. Required `threshold` parameter (0-64) controls similarity matching (lower = stricter, 0 = exact matches only). +- `Dataset.deduplicate_by_ids()` method for deduplication using internal `dataset_item_ids` directly, avoiding the reference ID to item ID mapping for improved efficiency. +- `DeduplicationResult` and `DeduplicationStats` dataclasses for structured deduplication results. 
+ +Example usage: + +```python +dataset = client.get_dataset("ds_...") + +# Deduplicate entire dataset +result = dataset.deduplicate(threshold=10) + +# Deduplicate specific items by reference IDs +result = dataset.deduplicate(threshold=10, reference_ids=["ref_1", "ref_2", "ref_3"]) + +# Deduplicate by internal item IDs (more efficient if you have them) +result = dataset.deduplicate_by_ids(threshold=10, dataset_item_ids=["item_1", "item_2"]) + +# Access results +print(f"Threshold: {result.stats.threshold}") +print(f"Original: {result.stats.original_count}, Unique: {result.stats.deduplicated_count}") +print(result.unique_reference_ids) +``` + ## [0.17.11](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.17.11) - 2025-11-03 ### Added diff --git a/nucleus/__init__.py b/nucleus/__init__.py index 3f970c2b..df97ddec 100644 --- a/nucleus/__init__.py +++ b/nucleus/__init__.py @@ -4,6 +4,8 @@ "AsyncJob", "EmbeddingsExportJob", "BoxAnnotation", + "DeduplicationResult", + "DeduplicationStats", "BoxPrediction", "CameraParams", "CategoryAnnotation", @@ -128,6 +130,7 @@ from .data_transfer_object.job_status import JobInfoRequestPayload from .dataset import Dataset from .dataset_item import DatasetItem +from .deduplication import DeduplicationResult, DeduplicationStats from .deprecation_warning import deprecated from .errors import ( DatasetItemRetrievalError, diff --git a/nucleus/constants.py b/nucleus/constants.py index 0a2bbf46..ebad94f5 100644 --- a/nucleus/constants.py +++ b/nucleus/constants.py @@ -149,6 +149,7 @@ SLICE_TAGS_KEY = "slice_tags" TAXONOMY_NAME_KEY = "taxonomy_name" TASK_ID_KEY = "task_id" +THRESHOLD_KEY = "threshold" TRACK_REFERENCE_ID_KEY = "track_reference_id" TRACK_REFERENCE_IDS_KEY = "track_reference_ids" TRACKS_KEY = "tracks" diff --git a/nucleus/dataset.py b/nucleus/dataset.py index ea95f840..ff1d421e 100644 --- a/nucleus/dataset.py +++ b/nucleus/dataset.py @@ -67,6 +67,7 @@ REQUEST_ID_KEY, SCENE_IDS_KEY, SLICE_ID_KEY, + THRESHOLD_KEY, 
TRACK_REFERENCE_IDS_KEY, TRACKS_KEY, TRAINED_SLICE_ID_KEY, @@ -74,6 +75,7 @@ VIDEO_URL_KEY, ) from .data_transfer_object.dataset_info import DatasetInfo +from .deduplication import DeduplicationResult, DeduplicationStats from .data_transfer_object.dataset_size import DatasetSize from .data_transfer_object.scenes_list import ScenesList, ScenesListEntry from .dataset_item import ( @@ -1006,6 +1008,106 @@ def create_slice_by_ids( ) return Slice(response[SLICE_ID_KEY], self._client) + def deduplicate( + self, + threshold: int, + reference_ids: Optional[List[str]] = None, + ) -> DeduplicationResult: + """Deduplicate images or frames in this dataset. + + Parameters: + threshold: Hamming distance threshold (0-64). Lower = stricter. + 0 = exact matches only. + reference_ids: Optional list of reference IDs to deduplicate. + If not provided (or None), deduplicates the entire dataset. + Cannot be an empty list - use None for entire dataset. + + Returns: + DeduplicationResult with unique_reference_ids, unique_item_ids, and stats. + + Raises: + ValueError: If reference_ids is an empty list (use None for entire dataset). + NucleusAPIError: If threshold is not an integer between 0 and 64 inclusive. + NucleusAPIError: If any reference_id is not found in the dataset. + NucleusAPIError: If any item is missing a perceptual hash (pHash). + Contact Scale support if this occurs. + + Note: + - For scene datasets, this deduplicates the underlying scene frames, + not the scenes themselves. Frame reference IDs or dataset item IDs + should be provided for scene datasets. + - For very large datasets, this operation may take significant time. + """ + # Client-side validation + if reference_ids is not None and len(reference_ids) == 0: + raise ValueError( + "reference_ids cannot be empty. Omit reference_ids parameter to deduplicate entire dataset." 
+ ) + + payload: Dict[str, Any] = {THRESHOLD_KEY: threshold} + if reference_ids is not None: + payload[REFERENCE_IDS_KEY] = reference_ids + + response = self._client.make_request( + payload, f"dataset/{self.id}/deduplicate" + ) + return DeduplicationResult( + unique_item_ids=response["unique_item_ids"], + unique_reference_ids=response["unique_reference_ids"], + stats=DeduplicationStats( + threshold=threshold, + original_count=response["stats"]["original_count"], + deduplicated_count=response["stats"]["deduplicated_count"], + ), + ) + + def deduplicate_by_ids( + self, + threshold: int, + dataset_item_ids: List[str], + ) -> DeduplicationResult: + """Deduplicate images or frames by internal dataset item IDs. + + Parameters: + threshold: Hamming distance threshold (0-64). Lower = stricter. + 0 = exact matches only. + dataset_item_ids: List of internal dataset item IDs to deduplicate. + Must be non-empty. To deduplicate the entire dataset, refer to + the documentation for `deduplicate()` instead. + + Returns: + DeduplicationResult with unique_item_ids, unique_reference_ids, and stats. + + Raises: + ValueError: If dataset_item_ids is empty. + NucleusAPIError: If threshold is not an integer between 0 and 64 inclusive. + NucleusAPIError: If any dataset_item_id is not found in the dataset. + NucleusAPIError: If any item is missing a perceptual hash (pHash). + Contact Scale support if this occurs. + """ + # Client-side validation + if not dataset_item_ids: + raise ValueError( + "dataset_item_ids must be non-empty. Use deduplicate() for entire dataset." 
+ ) + + payload = { + DATASET_ITEM_IDS_KEY: dataset_item_ids, + THRESHOLD_KEY: threshold, + } + response = self._client.make_request( + payload, f"dataset/{self.id}/deduplicate" + ) + return DeduplicationResult( + unique_item_ids=response["unique_item_ids"], + unique_reference_ids=response["unique_reference_ids"], + stats=DeduplicationStats( + threshold=threshold, + original_count=response["stats"]["original_count"], + deduplicated_count=response["stats"]["deduplicated_count"], + ), + ) + def build_slice( self, name: str, diff --git a/nucleus/deduplication.py b/nucleus/deduplication.py new file mode 100644 index 00000000..f427c004 --- /dev/null +++ b/nucleus/deduplication.py @@ -0,0 +1,16 @@ +from dataclasses import dataclass +from typing import List + + +@dataclass +class DeduplicationStats: + threshold: int + original_count: int + deduplicated_count: int + + +@dataclass +class DeduplicationResult: + unique_item_ids: List[str] # Internal dataset item IDs + unique_reference_ids: List[str] # User-defined reference IDs + stats: DeduplicationStats diff --git a/pyproject.toml b/pyproject.toml index 4fe1aaa2..6622dcd6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ ignore = ["E501", "E741", "E731", "F401"] # Easy ignore for getting it running [tool.poetry] name = "scale-nucleus" -version = "0.17.11" +version = "0.17.12" description = "The official Python client library for Nucleus, the Data Platform for AI" license = "MIT" authors = ["Scale AI Nucleus Team "] From 436fbafceb86adf1936e7ac9811425a96ed0b418 Mon Sep 17 00:00:00 2001 From: Edwin Pavlovsky Date: Mon, 23 Feb 2026 15:54:22 -0500 Subject: [PATCH 02/19] Lint fixes --- nucleus/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nucleus/dataset.py b/nucleus/dataset.py index ff1d421e..030d334f 100644 --- a/nucleus/dataset.py +++ b/nucleus/dataset.py @@ -1034,7 +1034,7 @@ def deduplicate( Note: - For scene datasets, this deduplicates the underlying scene frames, - not the 
scenes themselves. Frame reference IDs or dataset item IDs + not the scenes themselves. Frame reference IDs or dataset item IDs should be provided for scene datasets. - For very large datasets, this operation may take significant time. """ From 0a1c8d23a46281c7f6c6c4aa55c061947439f32e Mon Sep 17 00:00:00 2001 From: Edwin Pavlovsky Date: Mon, 23 Feb 2026 16:00:04 -0500 Subject: [PATCH 03/19] Fix import order --- nucleus/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nucleus/dataset.py b/nucleus/dataset.py index 030d334f..be1c9242 100644 --- a/nucleus/dataset.py +++ b/nucleus/dataset.py @@ -75,7 +75,6 @@ VIDEO_URL_KEY, ) from .data_transfer_object.dataset_info import DatasetInfo -from .deduplication import DeduplicationResult, DeduplicationStats from .data_transfer_object.dataset_size import DatasetSize from .data_transfer_object.scenes_list import ScenesList, ScenesListEntry from .dataset_item import ( @@ -85,6 +84,7 @@ check_items_have_dimensions, ) from .dataset_item_uploader import DatasetItemUploader +from .deduplication import DeduplicationResult, DeduplicationStats from .deprecation_warning import deprecated from .errors import NotFoundError, NucleusAPIError from .job import CustomerJobTypes, jobs_status_overview From 4b2a1d4112c965ddc966b6ed602c1281c1c605e3 Mon Sep 17 00:00:00 2001 From: Edwin Pavlovsky Date: Fri, 27 Feb 2026 23:07:50 -0500 Subject: [PATCH 04/19] Add tests for deduplication sdk --- tests/test_deduplication.py | 263 ++++++++++++++++++++++++++++++++++++ 1 file changed, 263 insertions(+) create mode 100644 tests/test_deduplication.py diff --git a/tests/test_deduplication.py b/tests/test_deduplication.py new file mode 100644 index 00000000..4ddd2851 --- /dev/null +++ b/tests/test_deduplication.py @@ -0,0 +1,263 @@ +import pytest + +from nucleus import DatasetItem, Dataset, NucleusClient, VideoScene +from nucleus.deduplication import DeduplicationResult +from nucleus.errors import NucleusAPIError + +from .helpers 
import ( + TEST_DATASET_ITEMS, + TEST_DATASET_NAME, + TEST_IMG_URLS, + TEST_VIDEO_DATASET_NAME, + TEST_VIDEO_SCENES, +) + + +def test_deduplicate_empty_reference_ids_raises_error(): + fake_dataset = Dataset("fake", NucleusClient("fake")) + with pytest.raises(ValueError, match="reference_ids cannot be empty"): + fake_dataset.deduplicate(threshold=10, reference_ids=[]) + + +def test_deduplicate_by_ids_empty_list_raises_error(): + fake_dataset = Dataset("fake", NucleusClient("fake")) + with pytest.raises(ValueError, match="dataset_item_ids must be non-empty"): + fake_dataset.deduplicate_by_ids(threshold=10, dataset_item_ids=[]) + + +@pytest.mark.integration +def test_deduplicate_entire_dataset(dataset): + result = dataset.deduplicate(threshold=10) + assert isinstance(result, DeduplicationResult) + assert len(result.unique_reference_ids) > 0 + assert len(result.unique_item_ids) > 0 + assert result.stats.original_count == len(TEST_DATASET_ITEMS) + + +@pytest.mark.integration +def test_deduplicate_with_reference_ids(dataset): + reference_ids = [item.reference_id for item in TEST_DATASET_ITEMS[:2]] + result = dataset.deduplicate(threshold=10, reference_ids=reference_ids) + assert isinstance(result, DeduplicationResult) + assert result.stats.original_count == len(reference_ids) + assert len(result.unique_reference_ids) <= len(reference_ids) + assert len(result.unique_item_ids) <= len(reference_ids) + + +@pytest.mark.integration +def test_deduplicate_by_ids(dataset): + initial_result = dataset.deduplicate(threshold=10) + item_ids = initial_result.unique_item_ids + assert len(item_ids) > 0 + + result = dataset.deduplicate_by_ids(threshold=10, dataset_item_ids=item_ids) + assert isinstance(result, DeduplicationResult) + assert result.stats.original_count == len(item_ids) + assert result.unique_item_ids == initial_result.unique_item_ids + + +@pytest.fixture(scope="module") +def dataset_video_scene(CLIENT): + """Scene dataset with scene_1 (frame IDs: video_frame_0, 
video_frame_1).""" + ds = CLIENT.create_dataset(TEST_VIDEO_DATASET_NAME + " dedup", is_scene=True) + scene_1 = TEST_VIDEO_SCENES["scenes"][0] + scenes = [VideoScene.from_json(scene_1)] + job = ds.append(scenes, asynchronous=True) + job.sleep_until_complete() + yield ds + CLIENT.delete_dataset(ds.id) + + +def _get_scene_frame_ref_ids(): + """Extract frame reference IDs from TEST_VIDEO_SCENES scene_1.""" + return [frame["reference_id"] for frame in TEST_VIDEO_SCENES["scenes"][0]["frames"]] + + +@pytest.mark.integration +def test_deduplicate_video_scene_entire_dataset(dataset_video_scene): + result = dataset_video_scene.deduplicate(threshold=10) + assert isinstance(result, DeduplicationResult) + assert len(result.unique_reference_ids) > 0 + assert len(result.unique_item_ids) > 0 + assert result.stats.original_count == len(_get_scene_frame_ref_ids()) + + +@pytest.mark.integration +def test_deduplicate_video_scene_with_frame_reference_ids(dataset_video_scene): + frame_ref_ids = _get_scene_frame_ref_ids() + result = dataset_video_scene.deduplicate(threshold=10, reference_ids=frame_ref_ids) + assert isinstance(result, DeduplicationResult) + assert result.stats.original_count == len(frame_ref_ids) + assert len(result.unique_reference_ids) <= len(frame_ref_ids) + assert len(result.unique_item_ids) <= len(frame_ref_ids) + + +@pytest.mark.integration +def test_deduplicate_video_scene_by_ids(dataset_video_scene): + initial_result = dataset_video_scene.deduplicate(threshold=10) + item_ids = initial_result.unique_item_ids + assert len(item_ids) > 0 + + result = dataset_video_scene.deduplicate_by_ids( + threshold=10, dataset_item_ids=item_ids + ) + assert isinstance(result, DeduplicationResult) + assert result.stats.original_count == len(item_ids) + assert result.unique_item_ids == initial_result.unique_item_ids + + +# Edge case tests + + +@pytest.mark.integration +def test_deduplicate_threshold_zero(dataset): + """Threshold=0 means exact matches only.""" + result = 
dataset.deduplicate(threshold=0) + assert isinstance(result, DeduplicationResult) + assert result.stats.threshold == 0 + + +@pytest.mark.integration +def test_deduplicate_threshold_max(dataset): + """Threshold=64 is the maximum allowed value.""" + result = dataset.deduplicate(threshold=64) + assert isinstance(result, DeduplicationResult) + assert result.stats.threshold == 64 + + +@pytest.mark.integration +def test_deduplicate_threshold_negative(dataset): + """Threshold must be >= 0.""" + with pytest.raises(NucleusAPIError): + dataset.deduplicate(threshold=-1) + + +@pytest.mark.integration +def test_deduplicate_threshold_too_high(dataset): + """Threshold must be <= 64.""" + with pytest.raises(NucleusAPIError): + dataset.deduplicate(threshold=65) + + +@pytest.mark.integration +def test_deduplicate_threshold_non_integer(dataset): + """Threshold must be an integer.""" + with pytest.raises(NucleusAPIError): + dataset.deduplicate(threshold=10.5) + + +@pytest.mark.integration +def test_deduplicate_nonexistent_reference_id(dataset): + with pytest.raises(NucleusAPIError): + dataset.deduplicate(threshold=10, reference_ids=["nonexistent_ref_id"]) + + +@pytest.mark.integration +def test_deduplicate_by_ids_nonexistent_id(dataset): + with pytest.raises(NucleusAPIError): + dataset.deduplicate_by_ids(threshold=10, dataset_item_ids=["di_nonexistent"]) + + +@pytest.mark.integration +def test_deduplicate_idempotency(dataset): + result1 = dataset.deduplicate(threshold=10) + result2 = dataset.deduplicate(threshold=10) + + assert result1.unique_item_ids == result2.unique_item_ids + assert result1.unique_reference_ids == result2.unique_reference_ids + assert result1.stats.original_count == result2.stats.original_count + assert result1.stats.deduplicated_count == result2.stats.deduplicated_count + + +@pytest.mark.integration +def test_deduplicate_response_invariants(dataset): + result = dataset.deduplicate(threshold=10) + + assert len(result.unique_item_ids) == 
len(result.unique_reference_ids) + assert result.stats.deduplicated_count == len(result.unique_item_ids) + assert result.stats.deduplicated_count <= result.stats.original_count + assert result.stats.threshold == 10 + + +@pytest.mark.integration +def test_deduplicate_by_ids_threshold_negative(dataset): + """deduplicate_by_ids should enforce the same threshold constraints.""" + initial_result = dataset.deduplicate(threshold=10) + item_ids = initial_result.unique_item_ids + + with pytest.raises(NucleusAPIError): + dataset.deduplicate_by_ids(threshold=-1, dataset_item_ids=item_ids) + + +@pytest.mark.integration +def test_deduplicate_by_ids_threshold_too_high(dataset): + """deduplicate_by_ids should enforce the same threshold constraints.""" + initial_result = dataset.deduplicate(threshold=10) + item_ids = initial_result.unique_item_ids + + with pytest.raises(NucleusAPIError): + dataset.deduplicate_by_ids(threshold=65, dataset_item_ids=item_ids) + + +@pytest.mark.integration +def test_deduplicate_single_item(dataset): + """Single item should always be unique.""" + reference_ids = [TEST_DATASET_ITEMS[0].reference_id] + result = dataset.deduplicate(threshold=10, reference_ids=reference_ids) + + assert result.stats.original_count == 1 + assert result.stats.deduplicated_count == 1 + assert len(result.unique_reference_ids) == 1 + + +@pytest.fixture() +def dataset_empty(CLIENT): + """Empty dataset with no items.""" + ds = CLIENT.create_dataset(TEST_DATASET_NAME + " empty", is_scene=False) + yield ds + CLIENT.delete_dataset(ds.id) + + +@pytest.mark.integration +def test_deduplicate_empty_dataset(dataset_empty): + """Empty dataset should return zero counts.""" + result = dataset_empty.deduplicate(threshold=10) + + assert result.stats.original_count == 0 + assert result.stats.deduplicated_count == 0 + assert len(result.unique_reference_ids) == 0 + assert len(result.unique_item_ids) == 0 + + +@pytest.fixture() +def dataset_with_duplicates(CLIENT): + """Dataset with duplicate 
images (same image uploaded twice).""" + ds = CLIENT.create_dataset(TEST_DATASET_NAME + " duplicates", is_scene=False) + items = [ + DatasetItem(image_url=TEST_IMG_URLS[0], reference_id="img_original"), + DatasetItem(image_url=TEST_IMG_URLS[0], reference_id="img_duplicate"), + DatasetItem(image_url=TEST_IMG_URLS[1], reference_id="img_different"), + ] + ds.append(items) + yield ds + CLIENT.delete_dataset(ds.id) + + +@pytest.mark.integration +def test_deduplicate_identifies_duplicates(dataset_with_duplicates): + """Verify deduplication actually identifies duplicate images.""" + result = dataset_with_duplicates.deduplicate(threshold=0) + + assert result.stats.original_count == 3 + # With threshold=0, the two identical images should be deduplicated to one + assert result.stats.deduplicated_count == 2 + assert len(result.unique_reference_ids) == 2 + + +@pytest.mark.integration +def test_deduplicate_distinct_images_all_unique(dataset): + """Distinct images should all remain after deduplication.""" + result = dataset.deduplicate(threshold=0) + + # With threshold=0 (exact match only), all distinct images should be unique + assert result.stats.deduplicated_count == result.stats.original_count From 6545d0264f12885e25c66d76f2beb46b7b399c18 Mon Sep 17 00:00:00 2001 From: Edwin Pavlovsky Date: Fri, 27 Feb 2026 23:11:22 -0500 Subject: [PATCH 05/19] Fix isort import formatting errors --- tests/test_deduplication.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_deduplication.py b/tests/test_deduplication.py index 4ddd2851..f84d4190 100644 --- a/tests/test_deduplication.py +++ b/tests/test_deduplication.py @@ -1,6 +1,6 @@ import pytest -from nucleus import DatasetItem, Dataset, NucleusClient, VideoScene +from nucleus import Dataset, DatasetItem, NucleusClient, VideoScene from nucleus.deduplication import DeduplicationResult from nucleus.errors import NucleusAPIError From 019a31ab5fe6b84059bbfa71a48db06328f0ed67 Mon Sep 17 00:00:00 2001 From: Edwin 
Pavlovsky Date: Fri, 27 Feb 2026 23:30:34 -0500 Subject: [PATCH 06/19] Add fixture for image dataset specifically for dedup --- tests/test_deduplication.py | 82 +++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 36 deletions(-) diff --git a/tests/test_deduplication.py b/tests/test_deduplication.py index f84d4190..99ce52f7 100644 --- a/tests/test_deduplication.py +++ b/tests/test_deduplication.py @@ -25,9 +25,19 @@ def test_deduplicate_by_ids_empty_list_raises_error(): fake_dataset.deduplicate_by_ids(threshold=10, dataset_item_ids=[]) +@pytest.fixture(scope="module") +def dataset_image(CLIENT): + """Image dataset with TEST_DATASET_ITEMS (waits for phash calculation).""" + ds = CLIENT.create_dataset(TEST_DATASET_NAME + " dedup", is_scene=False) + job = ds.append(TEST_DATASET_ITEMS, asynchronous=True) + job.sleep_until_complete() + yield ds + CLIENT.delete_dataset(ds.id) + + @pytest.mark.integration -def test_deduplicate_entire_dataset(dataset): - result = dataset.deduplicate(threshold=10) +def test_deduplicate_entire_dataset(dataset_image): + result = dataset_image.deduplicate(threshold=10) assert isinstance(result, DeduplicationResult) assert len(result.unique_reference_ids) > 0 assert len(result.unique_item_ids) > 0 @@ -35,9 +45,9 @@ def test_deduplicate_entire_dataset(dataset): @pytest.mark.integration -def test_deduplicate_with_reference_ids(dataset): +def test_deduplicate_with_reference_ids(dataset_image): reference_ids = [item.reference_id for item in TEST_DATASET_ITEMS[:2]] - result = dataset.deduplicate(threshold=10, reference_ids=reference_ids) + result = dataset_image.deduplicate(threshold=10, reference_ids=reference_ids) assert isinstance(result, DeduplicationResult) assert result.stats.original_count == len(reference_ids) assert len(result.unique_reference_ids) <= len(reference_ids) @@ -45,12 +55,12 @@ def test_deduplicate_with_reference_ids(dataset): @pytest.mark.integration -def test_deduplicate_by_ids(dataset): - initial_result = 
dataset.deduplicate(threshold=10) +def test_deduplicate_by_ids(dataset_image): + initial_result = dataset_image.deduplicate(threshold=10) item_ids = initial_result.unique_item_ids assert len(item_ids) > 0 - result = dataset.deduplicate_by_ids(threshold=10, dataset_item_ids=item_ids) + result = dataset_image.deduplicate_by_ids(threshold=10, dataset_item_ids=item_ids) assert isinstance(result, DeduplicationResult) assert result.stats.original_count == len(item_ids) assert result.unique_item_ids == initial_result.unique_item_ids @@ -110,58 +120,58 @@ def test_deduplicate_video_scene_by_ids(dataset_video_scene): @pytest.mark.integration -def test_deduplicate_threshold_zero(dataset): +def test_deduplicate_threshold_zero(dataset_image): """Threshold=0 means exact matches only.""" - result = dataset.deduplicate(threshold=0) + result = dataset_image.deduplicate(threshold=0) assert isinstance(result, DeduplicationResult) assert result.stats.threshold == 0 @pytest.mark.integration -def test_deduplicate_threshold_max(dataset): +def test_deduplicate_threshold_max(dataset_image): """Threshold=64 is the maximum allowed value.""" - result = dataset.deduplicate(threshold=64) + result = dataset_image.deduplicate(threshold=64) assert isinstance(result, DeduplicationResult) assert result.stats.threshold == 64 @pytest.mark.integration -def test_deduplicate_threshold_negative(dataset): +def test_deduplicate_threshold_negative(dataset_image): """Threshold must be >= 0.""" with pytest.raises(NucleusAPIError): - dataset.deduplicate(threshold=-1) + dataset_image.deduplicate(threshold=-1) @pytest.mark.integration -def test_deduplicate_threshold_too_high(dataset): +def test_deduplicate_threshold_too_high(dataset_image): """Threshold must be <= 64.""" with pytest.raises(NucleusAPIError): - dataset.deduplicate(threshold=65) + dataset_image.deduplicate(threshold=65) @pytest.mark.integration -def test_deduplicate_threshold_non_integer(dataset): +def 
test_deduplicate_threshold_non_integer(dataset_image): """Threshold must be an integer.""" with pytest.raises(NucleusAPIError): - dataset.deduplicate(threshold=10.5) + dataset_image.deduplicate(threshold=10.5) @pytest.mark.integration -def test_deduplicate_nonexistent_reference_id(dataset): +def test_deduplicate_nonexistent_reference_id(dataset_image): with pytest.raises(NucleusAPIError): - dataset.deduplicate(threshold=10, reference_ids=["nonexistent_ref_id"]) + dataset_image.deduplicate(threshold=10, reference_ids=["nonexistent_ref_id"]) @pytest.mark.integration -def test_deduplicate_by_ids_nonexistent_id(dataset): +def test_deduplicate_by_ids_nonexistent_id(dataset_image): with pytest.raises(NucleusAPIError): - dataset.deduplicate_by_ids(threshold=10, dataset_item_ids=["di_nonexistent"]) + dataset_image.deduplicate_by_ids(threshold=10, dataset_item_ids=["di_nonexistent"]) @pytest.mark.integration -def test_deduplicate_idempotency(dataset): - result1 = dataset.deduplicate(threshold=10) - result2 = dataset.deduplicate(threshold=10) +def test_deduplicate_idempotency(dataset_image): + result1 = dataset_image.deduplicate(threshold=10) + result2 = dataset_image.deduplicate(threshold=10) assert result1.unique_item_ids == result2.unique_item_ids assert result1.unique_reference_ids == result2.unique_reference_ids @@ -170,8 +180,8 @@ def test_deduplicate_idempotency(dataset): @pytest.mark.integration -def test_deduplicate_response_invariants(dataset): - result = dataset.deduplicate(threshold=10) +def test_deduplicate_response_invariants(dataset_image): + result = dataset_image.deduplicate(threshold=10) assert len(result.unique_item_ids) == len(result.unique_reference_ids) assert result.stats.deduplicated_count == len(result.unique_item_ids) @@ -180,30 +190,30 @@ def test_deduplicate_response_invariants(dataset): @pytest.mark.integration -def test_deduplicate_by_ids_threshold_negative(dataset): +def test_deduplicate_by_ids_threshold_negative(dataset_image): 
"""deduplicate_by_ids should enforce the same threshold constraints.""" - initial_result = dataset.deduplicate(threshold=10) + initial_result = dataset_image.deduplicate(threshold=10) item_ids = initial_result.unique_item_ids with pytest.raises(NucleusAPIError): - dataset.deduplicate_by_ids(threshold=-1, dataset_item_ids=item_ids) + dataset_image.deduplicate_by_ids(threshold=-1, dataset_item_ids=item_ids) @pytest.mark.integration -def test_deduplicate_by_ids_threshold_too_high(dataset): +def test_deduplicate_by_ids_threshold_too_high(dataset_image): """deduplicate_by_ids should enforce the same threshold constraints.""" - initial_result = dataset.deduplicate(threshold=10) + initial_result = dataset_image.deduplicate(threshold=10) item_ids = initial_result.unique_item_ids with pytest.raises(NucleusAPIError): - dataset.deduplicate_by_ids(threshold=65, dataset_item_ids=item_ids) + dataset_image.deduplicate_by_ids(threshold=65, dataset_item_ids=item_ids) @pytest.mark.integration -def test_deduplicate_single_item(dataset): +def test_deduplicate_single_item(dataset_image): """Single item should always be unique.""" reference_ids = [TEST_DATASET_ITEMS[0].reference_id] - result = dataset.deduplicate(threshold=10, reference_ids=reference_ids) + result = dataset_image.deduplicate(threshold=10, reference_ids=reference_ids) assert result.stats.original_count == 1 assert result.stats.deduplicated_count == 1 @@ -255,9 +265,9 @@ def test_deduplicate_identifies_duplicates(dataset_with_duplicates): @pytest.mark.integration -def test_deduplicate_distinct_images_all_unique(dataset): +def test_deduplicate_distinct_images_all_unique(dataset_image): """Distinct images should all remain after deduplication.""" - result = dataset.deduplicate(threshold=0) + result = dataset_image.deduplicate(threshold=0) # With threshold=0 (exact match only), all distinct images should be unique assert result.stats.deduplicated_count == result.stats.original_count From 
ed67d5b5c693b9bcab496033b2981fd9feead558 Mon Sep 17 00:00:00 2001 From: Edwin Pavlovsky Date: Sat, 28 Feb 2026 00:18:54 -0500 Subject: [PATCH 07/19] Fix image dataset creation syntax --- tests/test_deduplication.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/test_deduplication.py b/tests/test_deduplication.py index 99ce52f7..634d5ffe 100644 --- a/tests/test_deduplication.py +++ b/tests/test_deduplication.py @@ -244,11 +244,12 @@ def dataset_with_duplicates(CLIENT): """Dataset with duplicate images (same image uploaded twice).""" ds = CLIENT.create_dataset(TEST_DATASET_NAME + " duplicates", is_scene=False) items = [ - DatasetItem(image_url=TEST_IMG_URLS[0], reference_id="img_original"), - DatasetItem(image_url=TEST_IMG_URLS[0], reference_id="img_duplicate"), - DatasetItem(image_url=TEST_IMG_URLS[1], reference_id="img_different"), + DatasetItem(TEST_IMG_URLS[0], reference_id="img_original"), + DatasetItem(TEST_IMG_URLS[0], reference_id="img_duplicate"), + DatasetItem(TEST_IMG_URLS[1], reference_id="img_different"), ] - ds.append(items) + job = ds.append(items, asynchronous=True) + job.sleep_until_complete() yield ds CLIENT.delete_dataset(ds.id) From 6330be2e229fc7e658333fdb022abb6be2354c86 Mon Sep 17 00:00:00 2001 From: Edwin Pavlovsky Date: Sat, 28 Feb 2026 00:57:24 -0500 Subject: [PATCH 08/19] Create image dataset synchronously --- tests/test_deduplication.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_deduplication.py b/tests/test_deduplication.py index 634d5ffe..0813f6dc 100644 --- a/tests/test_deduplication.py +++ b/tests/test_deduplication.py @@ -29,8 +29,7 @@ def test_deduplicate_by_ids_empty_list_raises_error(): def dataset_image(CLIENT): """Image dataset with TEST_DATASET_ITEMS (waits for phash calculation).""" ds = CLIENT.create_dataset(TEST_DATASET_NAME + " dedup", is_scene=False) - job = ds.append(TEST_DATASET_ITEMS, asynchronous=True) - job.sleep_until_complete() + 
ds.append(TEST_DATASET_ITEMS) yield ds CLIENT.delete_dataset(ds.id) From 6d6a0ceeb0124903ccce7fb855786b9c7eb4297e Mon Sep 17 00:00:00 2001 From: Edwin Pavlovsky Date: Sat, 28 Feb 2026 01:31:42 -0500 Subject: [PATCH 09/19] Make dataset_with_duplicates fixture sync --- tests/test_deduplication.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_deduplication.py b/tests/test_deduplication.py index 0813f6dc..d55b77d6 100644 --- a/tests/test_deduplication.py +++ b/tests/test_deduplication.py @@ -247,8 +247,7 @@ def dataset_with_duplicates(CLIENT): DatasetItem(TEST_IMG_URLS[0], reference_id="img_duplicate"), DatasetItem(TEST_IMG_URLS[1], reference_id="img_different"), ] - job = ds.append(items, asynchronous=True) - job.sleep_until_complete() + ds.append(items) yield ds CLIENT.delete_dataset(ds.id) From 9ec043a5f42dc26cb89945e53de024a756e63933 Mon Sep 17 00:00:00 2001 From: Edwin Pavlovsky Date: Sat, 28 Feb 2026 02:14:11 -0500 Subject: [PATCH 10/19] Add dedup test for scene made with video url --- tests/test_deduplication.py | 41 +++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/test_deduplication.py b/tests/test_deduplication.py index d55b77d6..75947296 100644 --- a/tests/test_deduplication.py +++ b/tests/test_deduplication.py @@ -10,6 +10,7 @@ TEST_IMG_URLS, TEST_VIDEO_DATASET_NAME, TEST_VIDEO_SCENES, + TEST_VIDEO_URL, ) @@ -115,6 +116,46 @@ def test_deduplicate_video_scene_by_ids(dataset_video_scene): assert result.unique_item_ids == initial_result.unique_item_ids +@pytest.fixture(scope="module") +def dataset_video_url(CLIENT): + """Scene dataset created from a video URL (not a list of frames).""" + ds = CLIENT.create_dataset(TEST_VIDEO_DATASET_NAME + " video_url dedup", is_scene=True) + scene = VideoScene.from_json({ + "reference_id": "video_url_scene", + "video_url": TEST_VIDEO_URL, + "metadata": {"test": "video_url_dedup"}, + }) + job = ds.append([scene], asynchronous=True) + 
job.sleep_until_complete() + yield ds + CLIENT.delete_dataset(ds.id) + + +@pytest.mark.integration +def test_deduplicate_video_url_entire_dataset(dataset_video_url): + """Test deduplication on a dataset created from a video URL.""" + result = dataset_video_url.deduplicate(threshold=10) + assert isinstance(result, DeduplicationResult) + assert len(result.unique_reference_ids) > 0 + assert len(result.unique_item_ids) > 0 + assert result.stats.original_count > 0 + + +@pytest.mark.integration +def test_deduplicate_video_url_by_ids(dataset_video_url): + """Test deduplicate_by_ids on a dataset created from a video URL.""" + initial_result = dataset_video_url.deduplicate(threshold=10) + item_ids = initial_result.unique_item_ids + assert len(item_ids) > 0 + + result = dataset_video_url.deduplicate_by_ids( + threshold=10, dataset_item_ids=item_ids + ) + assert isinstance(result, DeduplicationResult) + assert result.stats.original_count == len(item_ids) + assert result.unique_item_ids == initial_result.unique_item_ids + + # Edge case tests From 538c864e6ec8bcb420f44e878ab7d12c4671003b Mon Sep 17 00:00:00 2001 From: Edwin Pavlovsky Date: Mon, 2 Mar 2026 14:03:38 -0500 Subject: [PATCH 11/19] Document difference between deduplicate and deduplicate_by_ids better in docstring --- nucleus/dataset.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/nucleus/dataset.py b/nucleus/dataset.py index be1c9242..224444e2 100644 --- a/nucleus/dataset.py +++ b/nucleus/dataset.py @@ -1013,12 +1013,17 @@ def deduplicate( threshold: int, reference_ids: Optional[List[str]] = None, ) -> DeduplicationResult: - """Deduplicate images or frames in this dataset. + """Deduplicate images or frames using user-defined reference IDs. + + This method can deduplicate an entire dataset (when reference_ids is omitted) + or a specific subset of items identified by the reference_id you assigned + when uploading (e.g., "image_001", "frame_xyz"). 
To deduplicate using + internal Nucleus item IDs instead, use `deduplicate_by_ids()`. Parameters: threshold: Hamming distance threshold (0-64). Lower = stricter. 0 = exact matches only. - reference_ids: Optional list of reference IDs to deduplicate. + reference_ids: Optional list of user-defined reference IDs to deduplicate. If not provided (or None), deduplicates the entire dataset. Cannot be an empty list - use None for entire dataset. @@ -1066,14 +1071,19 @@ def deduplicate_by_ids( threshold: int, dataset_item_ids: List[str], ) -> DeduplicationResult: - """Deduplicate images or frames by internal dataset item IDs. + """Deduplicate images or frames using internal Nucleus dataset item IDs. + + This method identifies items by internal Nucleus IDs (e.g., "di_abc123...") + which are system-assigned when items are uploaded. To deduplicate using + your own user-defined reference IDs instead, or to deduplicate the entire + dataset, use `deduplicate()`. Parameters: threshold: Hamming distance threshold (0-64). Lower = stricter. 0 = exact matches only. - dataset_item_ids: List of internal dataset item IDs to deduplicate. - Must be non-empty. To deduplicate the entire dataset, refer to - the documentation for `deduplicate()` instead. + dataset_item_ids: List of internal Nucleus dataset item IDs to deduplicate. + These IDs are generated by Nucleus; they are not + user-defined reference IDs. Must be non-empty. Returns: DeduplicationResult with unique_item_ids, unique_reference_ids, and stats. 
From dc3f61cf63f8a96c53e61c048432b7201350006d Mon Sep 17 00:00:00 2001 From: Edwin Pavlovsky Date: Mon, 2 Mar 2026 14:17:56 -0500 Subject: [PATCH 12/19] Add tests to cover all ingestion forms --- tests/test_deduplication.py | 258 +++++++++++++++++++++++++++--------- 1 file changed, 197 insertions(+), 61 deletions(-) diff --git a/tests/test_deduplication.py b/tests/test_deduplication.py index 75947296..d27c0ea5 100644 --- a/tests/test_deduplication.py +++ b/tests/test_deduplication.py @@ -27,17 +27,28 @@ def test_deduplicate_by_ids_empty_list_raises_error(): @pytest.fixture(scope="module") -def dataset_image(CLIENT): - """Image dataset with TEST_DATASET_ITEMS (waits for phash calculation).""" - ds = CLIENT.create_dataset(TEST_DATASET_NAME + " dedup", is_scene=False) +def dataset_image_sync(CLIENT): + """Image dataset uploaded synchronously.""" + ds = CLIENT.create_dataset(TEST_DATASET_NAME + " dedup sync", is_scene=False) ds.append(TEST_DATASET_ITEMS) yield ds CLIENT.delete_dataset(ds.id) +@pytest.fixture(scope="module") +def dataset_image_async(CLIENT): + """Image dataset uploaded asynchronously.""" + ds = CLIENT.create_dataset(TEST_DATASET_NAME + " dedup async", is_scene=False) + job = ds.append(TEST_DATASET_ITEMS, asynchronous=True) + job.sleep_until_complete() + yield ds + CLIENT.delete_dataset(ds.id) + + @pytest.mark.integration -def test_deduplicate_entire_dataset(dataset_image): - result = dataset_image.deduplicate(threshold=10) +def test_deduplicate_image_sync_entire_dataset(dataset_image_sync): + """Test deduplication on image dataset uploaded synchronously.""" + result = dataset_image_sync.deduplicate(threshold=10) assert isinstance(result, DeduplicationResult) assert len(result.unique_reference_ids) > 0 assert len(result.unique_item_ids) > 0 @@ -45,9 +56,10 @@ def test_deduplicate_entire_dataset(dataset_image): @pytest.mark.integration -def test_deduplicate_with_reference_ids(dataset_image): +def 
test_deduplicate_image_sync_with_reference_ids(dataset_image_sync): + """Test deduplication with reference IDs on image dataset uploaded synchronously.""" reference_ids = [item.reference_id for item in TEST_DATASET_ITEMS[:2]] - result = dataset_image.deduplicate(threshold=10, reference_ids=reference_ids) + result = dataset_image_sync.deduplicate(threshold=10, reference_ids=reference_ids) assert isinstance(result, DeduplicationResult) assert result.stats.original_count == len(reference_ids) assert len(result.unique_reference_ids) <= len(reference_ids) @@ -55,21 +67,67 @@ def test_deduplicate_with_reference_ids(dataset_image): @pytest.mark.integration -def test_deduplicate_by_ids(dataset_image): - initial_result = dataset_image.deduplicate(threshold=10) +def test_deduplicate_image_sync_by_ids(dataset_image_sync): + """Test deduplicate_by_ids on image dataset uploaded synchronously.""" + initial_result = dataset_image_sync.deduplicate(threshold=10) item_ids = initial_result.unique_item_ids assert len(item_ids) > 0 - result = dataset_image.deduplicate_by_ids(threshold=10, dataset_item_ids=item_ids) + result = dataset_image_sync.deduplicate_by_ids(threshold=10, dataset_item_ids=item_ids) assert isinstance(result, DeduplicationResult) assert result.stats.original_count == len(item_ids) assert result.unique_item_ids == initial_result.unique_item_ids +@pytest.mark.integration +def test_deduplicate_image_async_entire_dataset(dataset_image_async): + """Test deduplication on image dataset uploaded asynchronously.""" + result = dataset_image_async.deduplicate(threshold=10) + assert isinstance(result, DeduplicationResult) + assert len(result.unique_reference_ids) > 0 + assert len(result.unique_item_ids) > 0 + assert result.stats.original_count == len(TEST_DATASET_ITEMS) + + +@pytest.mark.integration +def test_deduplicate_image_async_with_reference_ids(dataset_image_async): + """Test deduplication with reference IDs on image dataset uploaded asynchronously.""" + reference_ids = 
[item.reference_id for item in TEST_DATASET_ITEMS[:2]] + result = dataset_image_async.deduplicate(threshold=10, reference_ids=reference_ids) + assert isinstance(result, DeduplicationResult) + assert result.stats.original_count == len(reference_ids) + assert len(result.unique_reference_ids) <= len(reference_ids) + assert len(result.unique_item_ids) <= len(reference_ids) + + +@pytest.mark.integration +def test_deduplicate_image_async_by_ids(dataset_image_async): + """Test deduplicate_by_ids on image dataset uploaded asynchronously.""" + initial_result = dataset_image_async.deduplicate(threshold=10) + item_ids = initial_result.unique_item_ids + assert len(item_ids) > 0 + + result = dataset_image_async.deduplicate_by_ids(threshold=10, dataset_item_ids=item_ids) + assert isinstance(result, DeduplicationResult) + assert result.stats.original_count == len(item_ids) + assert result.unique_item_ids == initial_result.unique_item_ids + + +@pytest.fixture(scope="module") +def dataset_video_scene_sync(CLIENT): + """Video scene dataset (with frames) uploaded synchronously.""" + ds = CLIENT.create_dataset(TEST_VIDEO_DATASET_NAME + " dedup sync", is_scene=True) + scene_1 = TEST_VIDEO_SCENES["scenes"][0] + scenes = [VideoScene.from_json(scene_1)] + ds.append(scenes) + yield ds + CLIENT.delete_dataset(ds.id) + + @pytest.fixture(scope="module") -def dataset_video_scene(CLIENT): - """Scene dataset with scene_1 (frame IDs: video_frame_0, video_frame_1).""" - ds = CLIENT.create_dataset(TEST_VIDEO_DATASET_NAME + " dedup", is_scene=True) +def dataset_video_scene_async(CLIENT): + """Video scene dataset (with frames) uploaded asynchronously.""" + ds = CLIENT.create_dataset(TEST_VIDEO_DATASET_NAME + " dedup async", is_scene=True) scene_1 = TEST_VIDEO_SCENES["scenes"][0] scenes = [VideoScene.from_json(scene_1)] job = ds.append(scenes, asynchronous=True) @@ -84,8 +142,9 @@ def _get_scene_frame_ref_ids(): @pytest.mark.integration -def 
test_deduplicate_video_scene_entire_dataset(dataset_video_scene): - result = dataset_video_scene.deduplicate(threshold=10) +def test_deduplicate_video_scene_sync_entire_dataset(dataset_video_scene_sync): + """Test deduplication on video scene dataset uploaded synchronously.""" + result = dataset_video_scene_sync.deduplicate(threshold=10) assert isinstance(result, DeduplicationResult) assert len(result.unique_reference_ids) > 0 assert len(result.unique_item_ids) > 0 @@ -93,9 +152,10 @@ def test_deduplicate_video_scene_entire_dataset(dataset_video_scene): @pytest.mark.integration -def test_deduplicate_video_scene_with_frame_reference_ids(dataset_video_scene): +def test_deduplicate_video_scene_sync_with_frame_reference_ids(dataset_video_scene_sync): + """Test deduplication with frame reference IDs on video scene dataset uploaded synchronously.""" frame_ref_ids = _get_scene_frame_ref_ids() - result = dataset_video_scene.deduplicate(threshold=10, reference_ids=frame_ref_ids) + result = dataset_video_scene_sync.deduplicate(threshold=10, reference_ids=frame_ref_ids) assert isinstance(result, DeduplicationResult) assert result.stats.original_count == len(frame_ref_ids) assert len(result.unique_reference_ids) <= len(frame_ref_ids) @@ -103,12 +163,13 @@ def test_deduplicate_video_scene_with_frame_reference_ids(dataset_video_scene): @pytest.mark.integration -def test_deduplicate_video_scene_by_ids(dataset_video_scene): - initial_result = dataset_video_scene.deduplicate(threshold=10) +def test_deduplicate_video_scene_sync_by_ids(dataset_video_scene_sync): + """Test deduplicate_by_ids on video scene dataset uploaded synchronously.""" + initial_result = dataset_video_scene_sync.deduplicate(threshold=10) item_ids = initial_result.unique_item_ids assert len(item_ids) > 0 - result = dataset_video_scene.deduplicate_by_ids( + result = dataset_video_scene_sync.deduplicate_by_ids( threshold=10, dataset_item_ids=item_ids ) assert isinstance(result, DeduplicationResult) @@ -116,14 
+177,64 @@ def test_deduplicate_video_scene_by_ids(dataset_video_scene): assert result.unique_item_ids == initial_result.unique_item_ids +@pytest.mark.integration +def test_deduplicate_video_scene_async_entire_dataset(dataset_video_scene_async): + """Test deduplication on video scene dataset uploaded asynchronously.""" + result = dataset_video_scene_async.deduplicate(threshold=10) + assert isinstance(result, DeduplicationResult) + assert len(result.unique_reference_ids) > 0 + assert len(result.unique_item_ids) > 0 + assert result.stats.original_count == len(_get_scene_frame_ref_ids()) + + +@pytest.mark.integration +def test_deduplicate_video_scene_async_with_frame_reference_ids(dataset_video_scene_async): + """Test deduplication with frame reference IDs on video scene dataset uploaded asynchronously.""" + frame_ref_ids = _get_scene_frame_ref_ids() + result = dataset_video_scene_async.deduplicate(threshold=10, reference_ids=frame_ref_ids) + assert isinstance(result, DeduplicationResult) + assert result.stats.original_count == len(frame_ref_ids) + assert len(result.unique_reference_ids) <= len(frame_ref_ids) + assert len(result.unique_item_ids) <= len(frame_ref_ids) + + +@pytest.mark.integration +def test_deduplicate_video_scene_async_by_ids(dataset_video_scene_async): + """Test deduplicate_by_ids on video scene dataset uploaded asynchronously.""" + initial_result = dataset_video_scene_async.deduplicate(threshold=10) + item_ids = initial_result.unique_item_ids + assert len(item_ids) > 0 + + result = dataset_video_scene_async.deduplicate_by_ids( + threshold=10, dataset_item_ids=item_ids + ) + assert isinstance(result, DeduplicationResult) + assert result.stats.original_count == len(item_ids) + assert result.unique_item_ids == initial_result.unique_item_ids + + +@pytest.fixture(scope="module") +def dataset_video_url_sync(CLIENT): + """Video URL dataset uploaded synchronously.""" + ds = CLIENT.create_dataset(TEST_VIDEO_DATASET_NAME + " video_url dedup sync", 
is_scene=True) + scene = VideoScene.from_json({ + "reference_id": "video_url_scene_sync", + "video_url": TEST_VIDEO_URL, + "metadata": {"test": "video_url_dedup_sync"}, + }) + ds.append([scene]) + yield ds + CLIENT.delete_dataset(ds.id) + + @pytest.fixture(scope="module") -def dataset_video_url(CLIENT): - """Scene dataset created from a video URL (not a list of frames).""" - ds = CLIENT.create_dataset(TEST_VIDEO_DATASET_NAME + " video_url dedup", is_scene=True) +def dataset_video_url_async(CLIENT): + """Video URL dataset uploaded asynchronously.""" + ds = CLIENT.create_dataset(TEST_VIDEO_DATASET_NAME + " video_url dedup async", is_scene=True) scene = VideoScene.from_json({ - "reference_id": "video_url_scene", + "reference_id": "video_url_scene_async", "video_url": TEST_VIDEO_URL, - "metadata": {"test": "video_url_dedup"}, + "metadata": {"test": "video_url_dedup_async"}, }) job = ds.append([scene], asynchronous=True) job.sleep_until_complete() @@ -132,9 +243,34 @@ def dataset_video_url(CLIENT): @pytest.mark.integration -def test_deduplicate_video_url_entire_dataset(dataset_video_url): - """Test deduplication on a dataset created from a video URL.""" - result = dataset_video_url.deduplicate(threshold=10) +def test_deduplicate_video_url_sync_entire_dataset(dataset_video_url_sync): + """Test deduplication on video URL dataset uploaded synchronously.""" + result = dataset_video_url_sync.deduplicate(threshold=10) + assert isinstance(result, DeduplicationResult) + assert len(result.unique_reference_ids) > 0 + assert len(result.unique_item_ids) > 0 + assert result.stats.original_count > 0 + + +@pytest.mark.integration +def test_deduplicate_video_url_sync_by_ids(dataset_video_url_sync): + """Test deduplicate_by_ids on video URL dataset uploaded synchronously.""" + initial_result = dataset_video_url_sync.deduplicate(threshold=10) + item_ids = initial_result.unique_item_ids + assert len(item_ids) > 0 + + result = dataset_video_url_sync.deduplicate_by_ids( + threshold=10, 
dataset_item_ids=item_ids + ) + assert isinstance(result, DeduplicationResult) + assert result.stats.original_count == len(item_ids) + assert result.unique_item_ids == initial_result.unique_item_ids + + +@pytest.mark.integration +def test_deduplicate_video_url_async_entire_dataset(dataset_video_url_async): + """Test deduplication on video URL dataset uploaded asynchronously.""" + result = dataset_video_url_async.deduplicate(threshold=10) assert isinstance(result, DeduplicationResult) assert len(result.unique_reference_ids) > 0 assert len(result.unique_item_ids) > 0 @@ -142,13 +278,13 @@ def test_deduplicate_video_url_entire_dataset(dataset_video_url): @pytest.mark.integration -def test_deduplicate_video_url_by_ids(dataset_video_url): - """Test deduplicate_by_ids on a dataset created from a video URL.""" - initial_result = dataset_video_url.deduplicate(threshold=10) +def test_deduplicate_video_url_async_by_ids(dataset_video_url_async): + """Test deduplicate_by_ids on video URL dataset uploaded asynchronously.""" + initial_result = dataset_video_url_async.deduplicate(threshold=10) item_ids = initial_result.unique_item_ids assert len(item_ids) > 0 - result = dataset_video_url.deduplicate_by_ids( + result = dataset_video_url_async.deduplicate_by_ids( threshold=10, dataset_item_ids=item_ids ) assert isinstance(result, DeduplicationResult) @@ -160,58 +296,58 @@ def test_deduplicate_video_url_by_ids(dataset_video_url): @pytest.mark.integration -def test_deduplicate_threshold_zero(dataset_image): +def test_deduplicate_threshold_zero(dataset_image_sync): """Threshold=0 means exact matches only.""" - result = dataset_image.deduplicate(threshold=0) + result = dataset_image_sync.deduplicate(threshold=0) assert isinstance(result, DeduplicationResult) assert result.stats.threshold == 0 @pytest.mark.integration -def test_deduplicate_threshold_max(dataset_image): +def test_deduplicate_threshold_max(dataset_image_sync): """Threshold=64 is the maximum allowed value.""" - result = 
dataset_image.deduplicate(threshold=64) + result = dataset_image_sync.deduplicate(threshold=64) assert isinstance(result, DeduplicationResult) assert result.stats.threshold == 64 @pytest.mark.integration -def test_deduplicate_threshold_negative(dataset_image): +def test_deduplicate_threshold_negative(dataset_image_sync): """Threshold must be >= 0.""" with pytest.raises(NucleusAPIError): - dataset_image.deduplicate(threshold=-1) + dataset_image_sync.deduplicate(threshold=-1) @pytest.mark.integration -def test_deduplicate_threshold_too_high(dataset_image): +def test_deduplicate_threshold_too_high(dataset_image_sync): """Threshold must be <= 64.""" with pytest.raises(NucleusAPIError): - dataset_image.deduplicate(threshold=65) + dataset_image_sync.deduplicate(threshold=65) @pytest.mark.integration -def test_deduplicate_threshold_non_integer(dataset_image): +def test_deduplicate_threshold_non_integer(dataset_image_sync): """Threshold must be an integer.""" with pytest.raises(NucleusAPIError): - dataset_image.deduplicate(threshold=10.5) + dataset_image_sync.deduplicate(threshold=10.5) @pytest.mark.integration -def test_deduplicate_nonexistent_reference_id(dataset_image): +def test_deduplicate_nonexistent_reference_id(dataset_image_sync): with pytest.raises(NucleusAPIError): - dataset_image.deduplicate(threshold=10, reference_ids=["nonexistent_ref_id"]) + dataset_image_sync.deduplicate(threshold=10, reference_ids=["nonexistent_ref_id"]) @pytest.mark.integration -def test_deduplicate_by_ids_nonexistent_id(dataset_image): +def test_deduplicate_by_ids_nonexistent_id(dataset_image_sync): with pytest.raises(NucleusAPIError): - dataset_image.deduplicate_by_ids(threshold=10, dataset_item_ids=["di_nonexistent"]) + dataset_image_sync.deduplicate_by_ids(threshold=10, dataset_item_ids=["di_nonexistent"]) @pytest.mark.integration -def test_deduplicate_idempotency(dataset_image): - result1 = dataset_image.deduplicate(threshold=10) - result2 = dataset_image.deduplicate(threshold=10) 
+def test_deduplicate_idempotency(dataset_image_sync): + result1 = dataset_image_sync.deduplicate(threshold=10) + result2 = dataset_image_sync.deduplicate(threshold=10) assert result1.unique_item_ids == result2.unique_item_ids assert result1.unique_reference_ids == result2.unique_reference_ids @@ -220,8 +356,8 @@ def test_deduplicate_idempotency(dataset_image): @pytest.mark.integration -def test_deduplicate_response_invariants(dataset_image): - result = dataset_image.deduplicate(threshold=10) +def test_deduplicate_response_invariants(dataset_image_sync): + result = dataset_image_sync.deduplicate(threshold=10) assert len(result.unique_item_ids) == len(result.unique_reference_ids) assert result.stats.deduplicated_count == len(result.unique_item_ids) @@ -230,30 +366,30 @@ def test_deduplicate_response_invariants(dataset_image): @pytest.mark.integration -def test_deduplicate_by_ids_threshold_negative(dataset_image): +def test_deduplicate_by_ids_threshold_negative(dataset_image_sync): """deduplicate_by_ids should enforce the same threshold constraints.""" - initial_result = dataset_image.deduplicate(threshold=10) + initial_result = dataset_image_sync.deduplicate(threshold=10) item_ids = initial_result.unique_item_ids with pytest.raises(NucleusAPIError): - dataset_image.deduplicate_by_ids(threshold=-1, dataset_item_ids=item_ids) + dataset_image_sync.deduplicate_by_ids(threshold=-1, dataset_item_ids=item_ids) @pytest.mark.integration -def test_deduplicate_by_ids_threshold_too_high(dataset_image): +def test_deduplicate_by_ids_threshold_too_high(dataset_image_sync): """deduplicate_by_ids should enforce the same threshold constraints.""" - initial_result = dataset_image.deduplicate(threshold=10) + initial_result = dataset_image_sync.deduplicate(threshold=10) item_ids = initial_result.unique_item_ids with pytest.raises(NucleusAPIError): - dataset_image.deduplicate_by_ids(threshold=65, dataset_item_ids=item_ids) + dataset_image_sync.deduplicate_by_ids(threshold=65, 
dataset_item_ids=item_ids) @pytest.mark.integration -def test_deduplicate_single_item(dataset_image): +def test_deduplicate_single_item(dataset_image_sync): """Single item should always be unique.""" reference_ids = [TEST_DATASET_ITEMS[0].reference_id] - result = dataset_image.deduplicate(threshold=10, reference_ids=reference_ids) + result = dataset_image_sync.deduplicate(threshold=10, reference_ids=reference_ids) assert result.stats.original_count == 1 assert result.stats.deduplicated_count == 1 @@ -305,9 +441,9 @@ def test_deduplicate_identifies_duplicates(dataset_with_duplicates): @pytest.mark.integration -def test_deduplicate_distinct_images_all_unique(dataset_image): +def test_deduplicate_distinct_images_all_unique(dataset_image_sync): """Distinct images should all remain after deduplication.""" - result = dataset_image.deduplicate(threshold=0) + result = dataset_image_sync.deduplicate(threshold=0) # With threshold=0 (exact match only), all distinct images should be unique assert result.stats.deduplicated_count == result.stats.original_count From 2753603c98765db810cb2af0ea88884e849b151a Mon Sep 17 00:00:00 2001 From: Edwin Pavlovsky Date: Mon, 2 Mar 2026 14:19:41 -0500 Subject: [PATCH 13/19] Refactor tests to use DEDUP_DEFAULT_TEST_THRESHOLD constant --- tests/helpers.py | 2 + tests/test_deduplication.py | 75 ++++++++++++++++++++----------------- 2 files changed, 43 insertions(+), 34 deletions(-) diff --git a/tests/helpers.py b/tests/helpers.py index bef9cc50..03076e6f 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -24,6 +24,8 @@ EVAL_FUNCTION_THRESHOLD = 0.5 EVAL_FUNCTION_COMPARISON = ThresholdComparison.GREATER_THAN_EQUAL_TO +DEDUP_DEFAULT_TEST_THRESHOLD = 10 + TEST_IMG_URLS = [ "https://github.com/scaleapi/nucleus-python-client/raw/master/tests/testdata/airplane.jpeg", diff --git a/tests/test_deduplication.py b/tests/test_deduplication.py index d27c0ea5..f23a7331 100644 --- a/tests/test_deduplication.py +++ b/tests/test_deduplication.py @@ -5,6 
+5,7 @@ from nucleus.errors import NucleusAPIError from .helpers import ( + DEDUP_DEFAULT_TEST_THRESHOLD, TEST_DATASET_ITEMS, TEST_DATASET_NAME, TEST_IMG_URLS, @@ -17,13 +18,13 @@ def test_deduplicate_empty_reference_ids_raises_error(): fake_dataset = Dataset("fake", NucleusClient("fake")) with pytest.raises(ValueError, match="reference_ids cannot be empty"): - fake_dataset.deduplicate(threshold=10, reference_ids=[]) + fake_dataset.deduplicate(threshold=DEDUP_DEFAULT_TEST_THRESHOLD, reference_ids=[]) def test_deduplicate_by_ids_empty_list_raises_error(): fake_dataset = Dataset("fake", NucleusClient("fake")) with pytest.raises(ValueError, match="dataset_item_ids must be non-empty"): - fake_dataset.deduplicate_by_ids(threshold=10, dataset_item_ids=[]) + fake_dataset.deduplicate_by_ids(threshold=DEDUP_DEFAULT_TEST_THRESHOLD, dataset_item_ids=[]) @pytest.fixture(scope="module") @@ -48,7 +49,7 @@ def dataset_image_async(CLIENT): @pytest.mark.integration def test_deduplicate_image_sync_entire_dataset(dataset_image_sync): """Test deduplication on image dataset uploaded synchronously.""" - result = dataset_image_sync.deduplicate(threshold=10) + result = dataset_image_sync.deduplicate(threshold=DEDUP_DEFAULT_TEST_THRESHOLD) assert isinstance(result, DeduplicationResult) assert len(result.unique_reference_ids) > 0 assert len(result.unique_item_ids) > 0 @@ -59,7 +60,7 @@ def test_deduplicate_image_sync_entire_dataset(dataset_image_sync): def test_deduplicate_image_sync_with_reference_ids(dataset_image_sync): """Test deduplication with reference IDs on image dataset uploaded synchronously.""" reference_ids = [item.reference_id for item in TEST_DATASET_ITEMS[:2]] - result = dataset_image_sync.deduplicate(threshold=10, reference_ids=reference_ids) + result = dataset_image_sync.deduplicate(threshold=DEDUP_DEFAULT_TEST_THRESHOLD, reference_ids=reference_ids) assert isinstance(result, DeduplicationResult) assert result.stats.original_count == len(reference_ids) assert 
len(result.unique_reference_ids) <= len(reference_ids) @@ -69,11 +70,11 @@ def test_deduplicate_image_sync_with_reference_ids(dataset_image_sync): @pytest.mark.integration def test_deduplicate_image_sync_by_ids(dataset_image_sync): """Test deduplicate_by_ids on image dataset uploaded synchronously.""" - initial_result = dataset_image_sync.deduplicate(threshold=10) + initial_result = dataset_image_sync.deduplicate(threshold=DEDUP_DEFAULT_TEST_THRESHOLD) item_ids = initial_result.unique_item_ids assert len(item_ids) > 0 - result = dataset_image_sync.deduplicate_by_ids(threshold=10, dataset_item_ids=item_ids) + result = dataset_image_sync.deduplicate_by_ids(threshold=DEDUP_DEFAULT_TEST_THRESHOLD, dataset_item_ids=item_ids) assert isinstance(result, DeduplicationResult) assert result.stats.original_count == len(item_ids) assert result.unique_item_ids == initial_result.unique_item_ids @@ -82,7 +83,7 @@ def test_deduplicate_image_sync_by_ids(dataset_image_sync): @pytest.mark.integration def test_deduplicate_image_async_entire_dataset(dataset_image_async): """Test deduplication on image dataset uploaded asynchronously.""" - result = dataset_image_async.deduplicate(threshold=10) + result = dataset_image_async.deduplicate(threshold=DEDUP_DEFAULT_TEST_THRESHOLD) assert isinstance(result, DeduplicationResult) assert len(result.unique_reference_ids) > 0 assert len(result.unique_item_ids) > 0 @@ -93,7 +94,7 @@ def test_deduplicate_image_async_entire_dataset(dataset_image_async): def test_deduplicate_image_async_with_reference_ids(dataset_image_async): """Test deduplication with reference IDs on image dataset uploaded asynchronously.""" reference_ids = [item.reference_id for item in TEST_DATASET_ITEMS[:2]] - result = dataset_image_async.deduplicate(threshold=10, reference_ids=reference_ids) + result = dataset_image_async.deduplicate(threshold=DEDUP_DEFAULT_TEST_THRESHOLD, reference_ids=reference_ids) assert isinstance(result, DeduplicationResult) assert 
result.stats.original_count == len(reference_ids) assert len(result.unique_reference_ids) <= len(reference_ids) @@ -103,11 +104,11 @@ def test_deduplicate_image_async_with_reference_ids(dataset_image_async): @pytest.mark.integration def test_deduplicate_image_async_by_ids(dataset_image_async): """Test deduplicate_by_ids on image dataset uploaded asynchronously.""" - initial_result = dataset_image_async.deduplicate(threshold=10) + initial_result = dataset_image_async.deduplicate(threshold=DEDUP_DEFAULT_TEST_THRESHOLD) item_ids = initial_result.unique_item_ids assert len(item_ids) > 0 - result = dataset_image_async.deduplicate_by_ids(threshold=10, dataset_item_ids=item_ids) + result = dataset_image_async.deduplicate_by_ids(threshold=DEDUP_DEFAULT_TEST_THRESHOLD, dataset_item_ids=item_ids) assert isinstance(result, DeduplicationResult) assert result.stats.original_count == len(item_ids) assert result.unique_item_ids == initial_result.unique_item_ids @@ -144,7 +145,7 @@ def _get_scene_frame_ref_ids(): @pytest.mark.integration def test_deduplicate_video_scene_sync_entire_dataset(dataset_video_scene_sync): """Test deduplication on video scene dataset uploaded synchronously.""" - result = dataset_video_scene_sync.deduplicate(threshold=10) + result = dataset_video_scene_sync.deduplicate(threshold=DEDUP_DEFAULT_TEST_THRESHOLD) assert isinstance(result, DeduplicationResult) assert len(result.unique_reference_ids) > 0 assert len(result.unique_item_ids) > 0 @@ -155,7 +156,7 @@ def test_deduplicate_video_scene_sync_entire_dataset(dataset_video_scene_sync): def test_deduplicate_video_scene_sync_with_frame_reference_ids(dataset_video_scene_sync): """Test deduplication with frame reference IDs on video scene dataset uploaded synchronously.""" frame_ref_ids = _get_scene_frame_ref_ids() - result = dataset_video_scene_sync.deduplicate(threshold=10, reference_ids=frame_ref_ids) + result = dataset_video_scene_sync.deduplicate(threshold=DEDUP_DEFAULT_TEST_THRESHOLD, 
reference_ids=frame_ref_ids) assert isinstance(result, DeduplicationResult) assert result.stats.original_count == len(frame_ref_ids) assert len(result.unique_reference_ids) <= len(frame_ref_ids) @@ -165,12 +166,12 @@ def test_deduplicate_video_scene_sync_with_frame_reference_ids(dataset_video_sce @pytest.mark.integration def test_deduplicate_video_scene_sync_by_ids(dataset_video_scene_sync): """Test deduplicate_by_ids on video scene dataset uploaded synchronously.""" - initial_result = dataset_video_scene_sync.deduplicate(threshold=10) + initial_result = dataset_video_scene_sync.deduplicate(threshold=DEDUP_DEFAULT_TEST_THRESHOLD) item_ids = initial_result.unique_item_ids assert len(item_ids) > 0 result = dataset_video_scene_sync.deduplicate_by_ids( - threshold=10, dataset_item_ids=item_ids + threshold=DEDUP_DEFAULT_TEST_THRESHOLD, dataset_item_ids=item_ids ) assert isinstance(result, DeduplicationResult) assert result.stats.original_count == len(item_ids) @@ -180,7 +181,7 @@ def test_deduplicate_video_scene_sync_by_ids(dataset_video_scene_sync): @pytest.mark.integration def test_deduplicate_video_scene_async_entire_dataset(dataset_video_scene_async): """Test deduplication on video scene dataset uploaded asynchronously.""" - result = dataset_video_scene_async.deduplicate(threshold=10) + result = dataset_video_scene_async.deduplicate(threshold=DEDUP_DEFAULT_TEST_THRESHOLD) assert isinstance(result, DeduplicationResult) assert len(result.unique_reference_ids) > 0 assert len(result.unique_item_ids) > 0 @@ -191,7 +192,7 @@ def test_deduplicate_video_scene_async_entire_dataset(dataset_video_scene_async) def test_deduplicate_video_scene_async_with_frame_reference_ids(dataset_video_scene_async): """Test deduplication with frame reference IDs on video scene dataset uploaded asynchronously.""" frame_ref_ids = _get_scene_frame_ref_ids() - result = dataset_video_scene_async.deduplicate(threshold=10, reference_ids=frame_ref_ids) + result = 
dataset_video_scene_async.deduplicate(threshold=DEDUP_DEFAULT_TEST_THRESHOLD, reference_ids=frame_ref_ids) assert isinstance(result, DeduplicationResult) assert result.stats.original_count == len(frame_ref_ids) assert len(result.unique_reference_ids) <= len(frame_ref_ids) @@ -201,12 +202,12 @@ def test_deduplicate_video_scene_async_with_frame_reference_ids(dataset_video_sc @pytest.mark.integration def test_deduplicate_video_scene_async_by_ids(dataset_video_scene_async): """Test deduplicate_by_ids on video scene dataset uploaded asynchronously.""" - initial_result = dataset_video_scene_async.deduplicate(threshold=10) + initial_result = dataset_video_scene_async.deduplicate(threshold=DEDUP_DEFAULT_TEST_THRESHOLD) item_ids = initial_result.unique_item_ids assert len(item_ids) > 0 result = dataset_video_scene_async.deduplicate_by_ids( - threshold=10, dataset_item_ids=item_ids + threshold=DEDUP_DEFAULT_TEST_THRESHOLD, dataset_item_ids=item_ids ) assert isinstance(result, DeduplicationResult) assert result.stats.original_count == len(item_ids) @@ -245,7 +246,7 @@ def dataset_video_url_async(CLIENT): @pytest.mark.integration def test_deduplicate_video_url_sync_entire_dataset(dataset_video_url_sync): """Test deduplication on video URL dataset uploaded synchronously.""" - result = dataset_video_url_sync.deduplicate(threshold=10) + result = dataset_video_url_sync.deduplicate(threshold=DEDUP_DEFAULT_TEST_THRESHOLD) assert isinstance(result, DeduplicationResult) assert len(result.unique_reference_ids) > 0 assert len(result.unique_item_ids) > 0 @@ -255,12 +256,12 @@ def test_deduplicate_video_url_sync_entire_dataset(dataset_video_url_sync): @pytest.mark.integration def test_deduplicate_video_url_sync_by_ids(dataset_video_url_sync): """Test deduplicate_by_ids on video URL dataset uploaded synchronously.""" - initial_result = dataset_video_url_sync.deduplicate(threshold=10) + initial_result = dataset_video_url_sync.deduplicate(threshold=DEDUP_DEFAULT_TEST_THRESHOLD) item_ids = 
initial_result.unique_item_ids assert len(item_ids) > 0 result = dataset_video_url_sync.deduplicate_by_ids( - threshold=10, dataset_item_ids=item_ids + threshold=DEDUP_DEFAULT_TEST_THRESHOLD, dataset_item_ids=item_ids ) assert isinstance(result, DeduplicationResult) assert result.stats.original_count == len(item_ids) @@ -270,7 +271,7 @@ def test_deduplicate_video_url_sync_by_ids(dataset_video_url_sync): @pytest.mark.integration def test_deduplicate_video_url_async_entire_dataset(dataset_video_url_async): """Test deduplication on video URL dataset uploaded asynchronously.""" - result = dataset_video_url_async.deduplicate(threshold=10) + result = dataset_video_url_async.deduplicate(threshold=DEDUP_DEFAULT_TEST_THRESHOLD) assert isinstance(result, DeduplicationResult) assert len(result.unique_reference_ids) > 0 assert len(result.unique_item_ids) > 0 @@ -280,12 +281,12 @@ def test_deduplicate_video_url_async_entire_dataset(dataset_video_url_async): @pytest.mark.integration def test_deduplicate_video_url_async_by_ids(dataset_video_url_async): """Test deduplicate_by_ids on video URL dataset uploaded asynchronously.""" - initial_result = dataset_video_url_async.deduplicate(threshold=10) + initial_result = dataset_video_url_async.deduplicate(threshold=DEDUP_DEFAULT_TEST_THRESHOLD) item_ids = initial_result.unique_item_ids assert len(item_ids) > 0 result = dataset_video_url_async.deduplicate_by_ids( - threshold=10, dataset_item_ids=item_ids + threshold=DEDUP_DEFAULT_TEST_THRESHOLD, dataset_item_ids=item_ids ) assert isinstance(result, DeduplicationResult) assert result.stats.original_count == len(item_ids) @@ -335,19 +336,23 @@ def test_deduplicate_threshold_non_integer(dataset_image_sync): @pytest.mark.integration def test_deduplicate_nonexistent_reference_id(dataset_image_sync): with pytest.raises(NucleusAPIError): - dataset_image_sync.deduplicate(threshold=10, reference_ids=["nonexistent_ref_id"]) + dataset_image_sync.deduplicate( + 
threshold=DEDUP_DEFAULT_TEST_THRESHOLD, reference_ids=["nonexistent_ref_id"] + ) @pytest.mark.integration def test_deduplicate_by_ids_nonexistent_id(dataset_image_sync): with pytest.raises(NucleusAPIError): - dataset_image_sync.deduplicate_by_ids(threshold=10, dataset_item_ids=["di_nonexistent"]) + dataset_image_sync.deduplicate_by_ids( + threshold=DEDUP_DEFAULT_TEST_THRESHOLD, dataset_item_ids=["di_nonexistent"] + ) @pytest.mark.integration def test_deduplicate_idempotency(dataset_image_sync): - result1 = dataset_image_sync.deduplicate(threshold=10) - result2 = dataset_image_sync.deduplicate(threshold=10) + result1 = dataset_image_sync.deduplicate(threshold=DEDUP_DEFAULT_TEST_THRESHOLD) + result2 = dataset_image_sync.deduplicate(threshold=DEDUP_DEFAULT_TEST_THRESHOLD) assert result1.unique_item_ids == result2.unique_item_ids assert result1.unique_reference_ids == result2.unique_reference_ids @@ -357,18 +362,18 @@ def test_deduplicate_idempotency(dataset_image_sync): @pytest.mark.integration def test_deduplicate_response_invariants(dataset_image_sync): - result = dataset_image_sync.deduplicate(threshold=10) + result = dataset_image_sync.deduplicate(threshold=DEDUP_DEFAULT_TEST_THRESHOLD) assert len(result.unique_item_ids) == len(result.unique_reference_ids) assert result.stats.deduplicated_count == len(result.unique_item_ids) assert result.stats.deduplicated_count <= result.stats.original_count - assert result.stats.threshold == 10 + assert result.stats.threshold == DEDUP_DEFAULT_TEST_THRESHOLD @pytest.mark.integration def test_deduplicate_by_ids_threshold_negative(dataset_image_sync): """deduplicate_by_ids should enforce the same threshold constraints.""" - initial_result = dataset_image_sync.deduplicate(threshold=10) + initial_result = dataset_image_sync.deduplicate(threshold=DEDUP_DEFAULT_TEST_THRESHOLD) item_ids = initial_result.unique_item_ids with pytest.raises(NucleusAPIError): @@ -378,7 +383,7 @@ def 
test_deduplicate_by_ids_threshold_negative(dataset_image_sync): @pytest.mark.integration def test_deduplicate_by_ids_threshold_too_high(dataset_image_sync): """deduplicate_by_ids should enforce the same threshold constraints.""" - initial_result = dataset_image_sync.deduplicate(threshold=10) + initial_result = dataset_image_sync.deduplicate(threshold=DEDUP_DEFAULT_TEST_THRESHOLD) item_ids = initial_result.unique_item_ids with pytest.raises(NucleusAPIError): @@ -389,7 +394,9 @@ def test_deduplicate_by_ids_threshold_too_high(dataset_image_sync): def test_deduplicate_single_item(dataset_image_sync): """Single item should always be unique.""" reference_ids = [TEST_DATASET_ITEMS[0].reference_id] - result = dataset_image_sync.deduplicate(threshold=10, reference_ids=reference_ids) + result = dataset_image_sync.deduplicate( + threshold=DEDUP_DEFAULT_TEST_THRESHOLD, reference_ids=reference_ids + ) assert result.stats.original_count == 1 assert result.stats.deduplicated_count == 1 @@ -407,7 +414,7 @@ def dataset_empty(CLIENT): @pytest.mark.integration def test_deduplicate_empty_dataset(dataset_empty): """Empty dataset should return zero counts.""" - result = dataset_empty.deduplicate(threshold=10) + result = dataset_empty.deduplicate(threshold=DEDUP_DEFAULT_TEST_THRESHOLD) assert result.stats.original_count == 0 assert result.stats.deduplicated_count == 0 From 8309469b21dac812b2fa1355b3433818fbde5876 Mon Sep 17 00:00:00 2001 From: Edwin Pavlovsky Date: Mon, 2 Mar 2026 14:20:37 -0500 Subject: [PATCH 14/19] Use try-finally for dataset creation and deletion --- tests/test_deduplication.py | 110 +++++++++++++++++++++--------------- 1 file changed, 63 insertions(+), 47 deletions(-) diff --git a/tests/test_deduplication.py b/tests/test_deduplication.py index f23a7331..5165d1b3 100644 --- a/tests/test_deduplication.py +++ b/tests/test_deduplication.py @@ -31,19 +31,23 @@ def test_deduplicate_by_ids_empty_list_raises_error(): def dataset_image_sync(CLIENT): """Image dataset uploaded 
synchronously.""" ds = CLIENT.create_dataset(TEST_DATASET_NAME + " dedup sync", is_scene=False) - ds.append(TEST_DATASET_ITEMS) - yield ds - CLIENT.delete_dataset(ds.id) + try: + ds.append(TEST_DATASET_ITEMS) + yield ds + finally: + CLIENT.delete_dataset(ds.id) @pytest.fixture(scope="module") def dataset_image_async(CLIENT): """Image dataset uploaded asynchronously.""" ds = CLIENT.create_dataset(TEST_DATASET_NAME + " dedup async", is_scene=False) - job = ds.append(TEST_DATASET_ITEMS, asynchronous=True) - job.sleep_until_complete() - yield ds - CLIENT.delete_dataset(ds.id) + try: + job = ds.append(TEST_DATASET_ITEMS, asynchronous=True) + job.sleep_until_complete() + yield ds + finally: + CLIENT.delete_dataset(ds.id) @pytest.mark.integration @@ -118,23 +122,27 @@ def test_deduplicate_image_async_by_ids(dataset_image_async): def dataset_video_scene_sync(CLIENT): """Video scene dataset (with frames) uploaded synchronously.""" ds = CLIENT.create_dataset(TEST_VIDEO_DATASET_NAME + " dedup sync", is_scene=True) - scene_1 = TEST_VIDEO_SCENES["scenes"][0] - scenes = [VideoScene.from_json(scene_1)] - ds.append(scenes) - yield ds - CLIENT.delete_dataset(ds.id) + try: + scene_1 = TEST_VIDEO_SCENES["scenes"][0] + scenes = [VideoScene.from_json(scene_1)] + ds.append(scenes) + yield ds + finally: + CLIENT.delete_dataset(ds.id) @pytest.fixture(scope="module") def dataset_video_scene_async(CLIENT): """Video scene dataset (with frames) uploaded asynchronously.""" ds = CLIENT.create_dataset(TEST_VIDEO_DATASET_NAME + " dedup async", is_scene=True) - scene_1 = TEST_VIDEO_SCENES["scenes"][0] - scenes = [VideoScene.from_json(scene_1)] - job = ds.append(scenes, asynchronous=True) - job.sleep_until_complete() - yield ds - CLIENT.delete_dataset(ds.id) + try: + scene_1 = TEST_VIDEO_SCENES["scenes"][0] + scenes = [VideoScene.from_json(scene_1)] + job = ds.append(scenes, asynchronous=True) + job.sleep_until_complete() + yield ds + finally: + CLIENT.delete_dataset(ds.id) def 
_get_scene_frame_ref_ids(): @@ -218,29 +226,33 @@ def test_deduplicate_video_scene_async_by_ids(dataset_video_scene_async): def dataset_video_url_sync(CLIENT): """Video URL dataset uploaded synchronously.""" ds = CLIENT.create_dataset(TEST_VIDEO_DATASET_NAME + " video_url dedup sync", is_scene=True) - scene = VideoScene.from_json({ - "reference_id": "video_url_scene_sync", - "video_url": TEST_VIDEO_URL, - "metadata": {"test": "video_url_dedup_sync"}, - }) - ds.append([scene]) - yield ds - CLIENT.delete_dataset(ds.id) + try: + scene = VideoScene.from_json({ + "reference_id": "video_url_scene_sync", + "video_url": TEST_VIDEO_URL, + "metadata": {"test": "video_url_dedup_sync"}, + }) + ds.append([scene]) + yield ds + finally: + CLIENT.delete_dataset(ds.id) @pytest.fixture(scope="module") def dataset_video_url_async(CLIENT): """Video URL dataset uploaded asynchronously.""" ds = CLIENT.create_dataset(TEST_VIDEO_DATASET_NAME + " video_url dedup async", is_scene=True) - scene = VideoScene.from_json({ - "reference_id": "video_url_scene_async", - "video_url": TEST_VIDEO_URL, - "metadata": {"test": "video_url_dedup_async"}, - }) - job = ds.append([scene], asynchronous=True) - job.sleep_until_complete() - yield ds - CLIENT.delete_dataset(ds.id) + try: + scene = VideoScene.from_json({ + "reference_id": "video_url_scene_async", + "video_url": TEST_VIDEO_URL, + "metadata": {"test": "video_url_dedup_async"}, + }) + job = ds.append([scene], asynchronous=True) + job.sleep_until_complete() + yield ds + finally: + CLIENT.delete_dataset(ds.id) @pytest.mark.integration @@ -403,12 +415,14 @@ def test_deduplicate_single_item(dataset_image_sync): assert len(result.unique_reference_ids) == 1 -@pytest.fixture() +@pytest.fixture(scope="function") def dataset_empty(CLIENT): """Empty dataset with no items.""" ds = CLIENT.create_dataset(TEST_DATASET_NAME + " empty", is_scene=False) - yield ds - CLIENT.delete_dataset(ds.id) + try: + yield ds + finally: + CLIENT.delete_dataset(ds.id) 
@pytest.mark.integration @@ -422,18 +436,20 @@ def test_deduplicate_empty_dataset(dataset_empty): assert len(result.unique_item_ids) == 0 -@pytest.fixture() +@pytest.fixture(scope="function") def dataset_with_duplicates(CLIENT): """Dataset with duplicate images (same image uploaded twice).""" ds = CLIENT.create_dataset(TEST_DATASET_NAME + " duplicates", is_scene=False) - items = [ - DatasetItem(TEST_IMG_URLS[0], reference_id="img_original"), - DatasetItem(TEST_IMG_URLS[0], reference_id="img_duplicate"), - DatasetItem(TEST_IMG_URLS[1], reference_id="img_different"), - ] - ds.append(items) - yield ds - CLIENT.delete_dataset(ds.id) + try: + items = [ + DatasetItem(TEST_IMG_URLS[0], reference_id="img_original"), + DatasetItem(TEST_IMG_URLS[0], reference_id="img_duplicate"), + DatasetItem(TEST_IMG_URLS[1], reference_id="img_different"), + ] + ds.append(items) + yield ds + finally: + CLIENT.delete_dataset(ds.id) @pytest.mark.integration From 5a0e896073eb0c970da57e99a6a5da5196552c85 Mon Sep 17 00:00:00 2001 From: Edwin Pavlovsky Date: Mon, 2 Mar 2026 14:21:00 -0500 Subject: [PATCH 15/19] Make edge case test docstrings more detailed --- tests/test_deduplication.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/tests/test_deduplication.py b/tests/test_deduplication.py index 5165d1b3..a4144c0b 100644 --- a/tests/test_deduplication.py +++ b/tests/test_deduplication.py @@ -310,7 +310,7 @@ def test_deduplicate_video_url_async_by_ids(dataset_video_url_async): @pytest.mark.integration def test_deduplicate_threshold_zero(dataset_image_sync): - """Threshold=0 means exact matches only.""" + """Verify threshold=0 (exact match only) succeeds and returns correct stats.""" result = dataset_image_sync.deduplicate(threshold=0) assert isinstance(result, DeduplicationResult) assert result.stats.threshold == 0 @@ -318,7 +318,7 @@ def test_deduplicate_threshold_zero(dataset_image_sync): @pytest.mark.integration def 
test_deduplicate_threshold_max(dataset_image_sync): - """Threshold=64 is the maximum allowed value.""" + """Verify threshold=64 (maximum allowed) succeeds and returns correct stats.""" result = dataset_image_sync.deduplicate(threshold=64) assert isinstance(result, DeduplicationResult) assert result.stats.threshold == 64 @@ -326,27 +326,28 @@ def test_deduplicate_threshold_max(dataset_image_sync): @pytest.mark.integration def test_deduplicate_threshold_negative(dataset_image_sync): - """Threshold must be >= 0.""" + """Verify negative threshold raises NucleusAPIError (must be >= 0).""" with pytest.raises(NucleusAPIError): dataset_image_sync.deduplicate(threshold=-1) @pytest.mark.integration def test_deduplicate_threshold_too_high(dataset_image_sync): - """Threshold must be <= 64.""" + """Verify threshold > 64 raises NucleusAPIError (must be <= 64).""" with pytest.raises(NucleusAPIError): dataset_image_sync.deduplicate(threshold=65) @pytest.mark.integration def test_deduplicate_threshold_non_integer(dataset_image_sync): - """Threshold must be an integer.""" + """Verify non-integer threshold raises NucleusAPIError.""" with pytest.raises(NucleusAPIError): dataset_image_sync.deduplicate(threshold=10.5) @pytest.mark.integration def test_deduplicate_nonexistent_reference_id(dataset_image_sync): + """Verify nonexistent reference_id raises NucleusAPIError.""" with pytest.raises(NucleusAPIError): dataset_image_sync.deduplicate( threshold=DEDUP_DEFAULT_TEST_THRESHOLD, reference_ids=["nonexistent_ref_id"] @@ -355,6 +356,7 @@ def test_deduplicate_nonexistent_reference_id(dataset_image_sync): @pytest.mark.integration def test_deduplicate_by_ids_nonexistent_id(dataset_image_sync): + """Verify nonexistent dataset_item_id raises NucleusAPIError.""" with pytest.raises(NucleusAPIError): dataset_image_sync.deduplicate_by_ids( threshold=DEDUP_DEFAULT_TEST_THRESHOLD, dataset_item_ids=["di_nonexistent"] @@ -363,6 +365,7 @@ def test_deduplicate_by_ids_nonexistent_id(dataset_image_sync): 
@pytest.mark.integration def test_deduplicate_idempotency(dataset_image_sync): + """Verify repeated deduplication calls return consistent results.""" result1 = dataset_image_sync.deduplicate(threshold=DEDUP_DEFAULT_TEST_THRESHOLD) result2 = dataset_image_sync.deduplicate(threshold=DEDUP_DEFAULT_TEST_THRESHOLD) @@ -374,6 +377,7 @@ def test_deduplicate_idempotency(dataset_image_sync): @pytest.mark.integration def test_deduplicate_response_invariants(dataset_image_sync): + """Verify response maintains expected invariants between fields.""" result = dataset_image_sync.deduplicate(threshold=DEDUP_DEFAULT_TEST_THRESHOLD) assert len(result.unique_item_ids) == len(result.unique_reference_ids) @@ -384,7 +388,7 @@ def test_deduplicate_response_invariants(dataset_image_sync): @pytest.mark.integration def test_deduplicate_by_ids_threshold_negative(dataset_image_sync): - """deduplicate_by_ids should enforce the same threshold constraints.""" + """Verify deduplicate_by_ids rejects negative threshold.""" initial_result = dataset_image_sync.deduplicate(threshold=DEDUP_DEFAULT_TEST_THRESHOLD) item_ids = initial_result.unique_item_ids @@ -394,7 +398,7 @@ def test_deduplicate_by_ids_threshold_negative(dataset_image_sync): @pytest.mark.integration def test_deduplicate_by_ids_threshold_too_high(dataset_image_sync): - """deduplicate_by_ids should enforce the same threshold constraints.""" + """Verify deduplicate_by_ids rejects threshold > 64.""" initial_result = dataset_image_sync.deduplicate(threshold=DEDUP_DEFAULT_TEST_THRESHOLD) item_ids = initial_result.unique_item_ids @@ -404,7 +408,7 @@ def test_deduplicate_by_ids_threshold_too_high(dataset_image_sync): @pytest.mark.integration def test_deduplicate_single_item(dataset_image_sync): - """Single item should always be unique.""" + """Verify single item deduplication returns that item as unique.""" reference_ids = [TEST_DATASET_ITEMS[0].reference_id] result = dataset_image_sync.deduplicate( threshold=DEDUP_DEFAULT_TEST_THRESHOLD, 
reference_ids=reference_ids @@ -427,7 +431,7 @@ def dataset_empty(CLIENT): @pytest.mark.integration def test_deduplicate_empty_dataset(dataset_empty): - """Empty dataset should return zero counts.""" + """Verify deduplication on empty dataset returns zero counts.""" result = dataset_empty.deduplicate(threshold=DEDUP_DEFAULT_TEST_THRESHOLD) assert result.stats.original_count == 0 From 14c751f12c089b68bff8daf4cf29c6edab194b26 Mon Sep 17 00:00:00 2001 From: Edwin Pavlovsky Date: Mon, 2 Mar 2026 14:28:30 -0500 Subject: [PATCH 16/19] Remove deprecated video sync upload tests --- tests/test_deduplication.py | 90 ------------------------------------- 1 file changed, 90 deletions(-) diff --git a/tests/test_deduplication.py b/tests/test_deduplication.py index a4144c0b..857cd126 100644 --- a/tests/test_deduplication.py +++ b/tests/test_deduplication.py @@ -118,19 +118,6 @@ def test_deduplicate_image_async_by_ids(dataset_image_async): assert result.unique_item_ids == initial_result.unique_item_ids -@pytest.fixture(scope="module") -def dataset_video_scene_sync(CLIENT): - """Video scene dataset (with frames) uploaded synchronously.""" - ds = CLIENT.create_dataset(TEST_VIDEO_DATASET_NAME + " dedup sync", is_scene=True) - try: - scene_1 = TEST_VIDEO_SCENES["scenes"][0] - scenes = [VideoScene.from_json(scene_1)] - ds.append(scenes) - yield ds - finally: - CLIENT.delete_dataset(ds.id) - - @pytest.fixture(scope="module") def dataset_video_scene_async(CLIENT): """Video scene dataset (with frames) uploaded asynchronously.""" @@ -150,42 +137,6 @@ def _get_scene_frame_ref_ids(): return [frame["reference_id"] for frame in TEST_VIDEO_SCENES["scenes"][0]["frames"]] -@pytest.mark.integration -def test_deduplicate_video_scene_sync_entire_dataset(dataset_video_scene_sync): - """Test deduplication on video scene dataset uploaded synchronously.""" - result = dataset_video_scene_sync.deduplicate(threshold=DEDUP_DEFAULT_TEST_THRESHOLD) - assert isinstance(result, DeduplicationResult) - assert 
len(result.unique_reference_ids) > 0 - assert len(result.unique_item_ids) > 0 - assert result.stats.original_count == len(_get_scene_frame_ref_ids()) - - -@pytest.mark.integration -def test_deduplicate_video_scene_sync_with_frame_reference_ids(dataset_video_scene_sync): - """Test deduplication with frame reference IDs on video scene dataset uploaded synchronously.""" - frame_ref_ids = _get_scene_frame_ref_ids() - result = dataset_video_scene_sync.deduplicate(threshold=DEDUP_DEFAULT_TEST_THRESHOLD, reference_ids=frame_ref_ids) - assert isinstance(result, DeduplicationResult) - assert result.stats.original_count == len(frame_ref_ids) - assert len(result.unique_reference_ids) <= len(frame_ref_ids) - assert len(result.unique_item_ids) <= len(frame_ref_ids) - - -@pytest.mark.integration -def test_deduplicate_video_scene_sync_by_ids(dataset_video_scene_sync): - """Test deduplicate_by_ids on video scene dataset uploaded synchronously.""" - initial_result = dataset_video_scene_sync.deduplicate(threshold=DEDUP_DEFAULT_TEST_THRESHOLD) - item_ids = initial_result.unique_item_ids - assert len(item_ids) > 0 - - result = dataset_video_scene_sync.deduplicate_by_ids( - threshold=DEDUP_DEFAULT_TEST_THRESHOLD, dataset_item_ids=item_ids - ) - assert isinstance(result, DeduplicationResult) - assert result.stats.original_count == len(item_ids) - assert result.unique_item_ids == initial_result.unique_item_ids - - @pytest.mark.integration def test_deduplicate_video_scene_async_entire_dataset(dataset_video_scene_async): """Test deduplication on video scene dataset uploaded asynchronously.""" @@ -222,22 +173,6 @@ def test_deduplicate_video_scene_async_by_ids(dataset_video_scene_async): assert result.unique_item_ids == initial_result.unique_item_ids -@pytest.fixture(scope="module") -def dataset_video_url_sync(CLIENT): - """Video URL dataset uploaded synchronously.""" - ds = CLIENT.create_dataset(TEST_VIDEO_DATASET_NAME + " video_url dedup sync", is_scene=True) - try: - scene = 
VideoScene.from_json({ - "reference_id": "video_url_scene_sync", - "video_url": TEST_VIDEO_URL, - "metadata": {"test": "video_url_dedup_sync"}, - }) - ds.append([scene]) - yield ds - finally: - CLIENT.delete_dataset(ds.id) - - @pytest.fixture(scope="module") def dataset_video_url_async(CLIENT): """Video URL dataset uploaded asynchronously.""" @@ -255,31 +190,6 @@ def dataset_video_url_async(CLIENT): CLIENT.delete_dataset(ds.id) -@pytest.mark.integration -def test_deduplicate_video_url_sync_entire_dataset(dataset_video_url_sync): - """Test deduplication on video URL dataset uploaded synchronously.""" - result = dataset_video_url_sync.deduplicate(threshold=DEDUP_DEFAULT_TEST_THRESHOLD) - assert isinstance(result, DeduplicationResult) - assert len(result.unique_reference_ids) > 0 - assert len(result.unique_item_ids) > 0 - assert result.stats.original_count > 0 - - -@pytest.mark.integration -def test_deduplicate_video_url_sync_by_ids(dataset_video_url_sync): - """Test deduplicate_by_ids on video URL dataset uploaded synchronously.""" - initial_result = dataset_video_url_sync.deduplicate(threshold=DEDUP_DEFAULT_TEST_THRESHOLD) - item_ids = initial_result.unique_item_ids - assert len(item_ids) > 0 - - result = dataset_video_url_sync.deduplicate_by_ids( - threshold=DEDUP_DEFAULT_TEST_THRESHOLD, dataset_item_ids=item_ids - ) - assert isinstance(result, DeduplicationResult) - assert result.stats.original_count == len(item_ids) - assert result.unique_item_ids == initial_result.unique_item_ids - - @pytest.mark.integration def test_deduplicate_video_url_async_entire_dataset(dataset_video_url_async): """Test deduplication on video URL dataset uploaded asynchronously.""" From 4861691c4644809c04b79d486ea382f56a9ba735 Mon Sep 17 00:00:00 2001 From: Edwin Pavlovsky Date: Mon, 2 Mar 2026 14:38:37 -0500 Subject: [PATCH 17/19] Update test_jobs to be deterministic --- tests/test_jobs.py | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff 
--git a/tests/test_jobs.py b/tests/test_jobs.py index fb5a631a..3172c8b1 100644 --- a/tests/test_jobs.py +++ b/tests/test_jobs.py @@ -1,10 +1,9 @@ -import time -from pathlib import Path - import pytest from nucleus import AsyncJob, NucleusClient +from .helpers import TEST_DATASET_ITEMS, TEST_DATASET_NAME + def test_reprs(): # Have to define here in order to have access to all relevant objects @@ -23,11 +22,28 @@ def test_repr(test_object: any): ) -def test_job_listing_and_retrieval(CLIENT): +@pytest.fixture(scope="module") +def job_from_dataset_upload(CLIENT): + """Create a job by doing an async dataset upload.""" + ds = CLIENT.create_dataset(TEST_DATASET_NAME + " job test", is_scene=False) + try: + job = ds.append(TEST_DATASET_ITEMS, asynchronous=True) + job.sleep_until_complete() + yield job + finally: + CLIENT.delete_dataset(ds.id) + + +@pytest.mark.integration +def test_job_listing_and_retrieval(CLIENT, job_from_dataset_upload): + """Test that a job we created can be listed and retrieved.""" + known_job_id = job_from_dataset_upload.job_id + + # Verify the job appears in the list jobs = CLIENT.list_jobs() - assert len(jobs) > 0, "No jobs found" - fetch_id = jobs[0].job_id - fetched_job = CLIENT.get_job(fetch_id) - # job_last_known_status can change - fetched_job.job_last_known_status = jobs[0].job_last_known_status - assert fetched_job == jobs[0] + job_ids = [j.job_id for j in jobs] + assert known_job_id in job_ids, f"Created job {known_job_id} not in job list" + + # Verify we can fetch it by ID + fetched_job = CLIENT.get_job(known_job_id) + assert fetched_job.job_id == known_job_id From 07a87a758c8f266c2e131f49cf6c4c8c9c458a55 Mon Sep 17 00:00:00 2001 From: Edwin Pavlovsky Date: Mon, 2 Mar 2026 14:47:27 -0500 Subject: [PATCH 18/19] Split jobs tests into listing and retrieval separately --- tests/test_jobs.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/tests/test_jobs.py b/tests/test_jobs.py index 3172c8b1..3b600665 
100644 --- a/tests/test_jobs.py +++ b/tests/test_jobs.py @@ -35,15 +35,19 @@ def job_from_dataset_upload(CLIENT): @pytest.mark.integration -def test_job_listing_and_retrieval(CLIENT, job_from_dataset_upload): - """Test that a job we created can be listed and retrieved.""" - known_job_id = job_from_dataset_upload.job_id - - # Verify the job appears in the list +def test_job_listing(CLIENT): + """Test that list_jobs returns results.""" jobs = CLIENT.list_jobs() - job_ids = [j.job_id for j in jobs] - assert known_job_id in job_ids, f"Created job {known_job_id} not in job list" + assert isinstance(jobs, list) + # Just verify the API works and returns AsyncJob objects + if len(jobs) > 0: + assert hasattr(jobs[0], "job_id") + + +@pytest.mark.integration +def test_job_retrieval(CLIENT, job_from_dataset_upload): + """Test that we can retrieve a job we created by ID.""" + known_job_id = job_from_dataset_upload.job_id - # Verify we can fetch it by ID fetched_job = CLIENT.get_job(known_job_id) assert fetched_job.job_id == known_job_id From 4cfc129e89bee0bc44bc80e24ef30d487d563e9f Mon Sep 17 00:00:00 2001 From: Edwin Pavlovsky Date: Mon, 2 Mar 2026 14:49:43 -0500 Subject: [PATCH 19/19] Fix docstring typo --- nucleus/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nucleus/dataset.py b/nucleus/dataset.py index 224444e2..bc1f244f 100644 --- a/nucleus/dataset.py +++ b/nucleus/dataset.py @@ -1082,7 +1082,7 @@ def deduplicate_by_ids( threshold: Hamming distance threshold (0-64). Lower = stricter. 0 = exact matches only. dataset_item_ids: List of internal Nucleus dataset item IDs to deduplicate. - These are IDs are generated by Nucleus; they are not + These IDs are generated by Nucleus; they are not user-defined reference IDs. Must be non-empty. Returns: