Skip to content

Commit 878ca05

Browse files
authored
[DE-6999] Enable image deduplication within nucleus sdk (#452)
* Enable deduplication in nucleus sdk * Lint fixes * Fix import order * Add tests for deduplication sdk * Fix isort import formatting errors * Add fixture for image dataset specifically for dedup * Fix image dataset creation syntax * Create image dataset synchronously * Make dataset_with_duplicates fixture sync * Add dedup test for scene made with video url * Document difference between deduplicate and deduplicate_by_ids better in docstring * Add tests to cover all ingestion forms * Refactor tests to use DEDUP_DEFAULT_TEST_THRESHOLD constant * Use try-finally for dataset creation and deletion * Make edge case test docstrings more detailed * Remove deprecated video sync upload tests * Update test_jobs to be deterministic * Split jobs tests into listing and retrieval separately * Fix docstring typo
1 parent 671f475 commit 878ca05

9 files changed

Lines changed: 578 additions & 11 deletions

File tree

CHANGELOG.md

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,33 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

88

9+
## [0.17.12](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.17.12) - 2026-02-23
10+
11+
### Added
12+
- `Dataset.deduplicate()` method to deduplicate images using perceptual hashing. Accepts optional `reference_ids` to deduplicate specific items, or deduplicates the entire dataset when only `threshold` is provided. Required `threshold` parameter (0-64) controls similarity matching (lower = stricter, 0 = exact matches only).
13+
- `Dataset.deduplicate_by_ids()` method for deduplication using internal `dataset_item_ids` directly, avoiding the reference ID to item ID mapping for improved efficiency.
14+
- `DeduplicationResult` and `DeduplicationStats` dataclasses for structured deduplication results.
15+
16+
Example usage:
17+
18+
```python
19+
dataset = client.get_dataset("ds_...")
20+
21+
# Deduplicate entire dataset
22+
result = dataset.deduplicate(threshold=10)
23+
24+
# Deduplicate specific items by reference IDs
25+
result = dataset.deduplicate(threshold=10, reference_ids=["ref_1", "ref_2", "ref_3"])
26+
27+
# Deduplicate by internal item IDs (more efficient if you have them)
28+
result = dataset.deduplicate_by_ids(threshold=10, dataset_item_ids=["item_1", "item_2"])
29+
30+
# Access results
31+
print(f"Threshold: {result.stats.threshold}")
32+
print(f"Original: {result.stats.original_count}, Unique: {result.stats.deduplicated_count}")
33+
print(result.unique_reference_ids)
34+
```
35+
936
## [0.17.11](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.17.11) - 2025-11-03
1037

1138
### Added

nucleus/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
"AsyncJob",
55
"EmbeddingsExportJob",
66
"BoxAnnotation",
7+
"DeduplicationResult",
8+
"DeduplicationStats",
79
"BoxPrediction",
810
"CameraParams",
911
"CategoryAnnotation",
@@ -128,6 +130,7 @@
128130
from .data_transfer_object.job_status import JobInfoRequestPayload
129131
from .dataset import Dataset
130132
from .dataset_item import DatasetItem
133+
from .deduplication import DeduplicationResult, DeduplicationStats
131134
from .deprecation_warning import deprecated
132135
from .errors import (
133136
DatasetItemRetrievalError,

nucleus/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,7 @@
149149
SLICE_TAGS_KEY = "slice_tags"
150150
TAXONOMY_NAME_KEY = "taxonomy_name"
151151
TASK_ID_KEY = "task_id"
152+
THRESHOLD_KEY = "threshold"
152153
TRACK_REFERENCE_ID_KEY = "track_reference_id"
153154
TRACK_REFERENCE_IDS_KEY = "track_reference_ids"
154155
TRACKS_KEY = "tracks"

nucleus/dataset.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@
6767
REQUEST_ID_KEY,
6868
SCENE_IDS_KEY,
6969
SLICE_ID_KEY,
70+
THRESHOLD_KEY,
7071
TRACK_REFERENCE_IDS_KEY,
7172
TRACKS_KEY,
7273
TRAINED_SLICE_ID_KEY,
@@ -83,6 +84,7 @@
8384
check_items_have_dimensions,
8485
)
8586
from .dataset_item_uploader import DatasetItemUploader
87+
from .deduplication import DeduplicationResult, DeduplicationStats
8688
from .deprecation_warning import deprecated
8789
from .errors import NotFoundError, NucleusAPIError
8890
from .job import CustomerJobTypes, jobs_status_overview
@@ -1006,6 +1008,116 @@ def create_slice_by_ids(
10061008
)
10071009
return Slice(response[SLICE_ID_KEY], self._client)
10081010

1011+
def deduplicate(
1012+
self,
1013+
threshold: int,
1014+
reference_ids: Optional[List[str]] = None,
1015+
) -> DeduplicationResult:
1016+
"""Deduplicate images or frames using user-defined reference IDs.
1017+
1018+
This method can deduplicate an entire dataset (when reference_ids is omitted)
1019+
or a specific subset of items identified by the reference_id you assigned
1020+
when uploading (e.g., "image_001", "frame_xyz"). To deduplicate using
1021+
internal Nucleus item IDs instead, use `deduplicate_by_ids()`.
1022+
1023+
Parameters:
1024+
threshold: Hamming distance threshold (0-64). Lower = stricter.
1025+
0 = exact matches only.
1026+
reference_ids: Optional list of user-defined reference IDs to deduplicate.
1027+
If not provided (or None), deduplicates the entire dataset.
1028+
Cannot be an empty list - use None for entire dataset.
1029+
1030+
Returns:
1031+
DeduplicationResult with unique_reference_ids, unique_item_ids, and stats.
1032+
1033+
Raises:
1034+
ValueError: If reference_ids is an empty list (use None for entire dataset).
1035+
NucleusAPIError: If threshold is not an integer between 0 and 64 inclusive.
1036+
NucleusAPIError: If any reference_id is not found in the dataset.
1037+
NucleusAPIError: If any item is missing a perceptual hash (pHash).
1038+
Contact Scale support if this occurs.
1039+
1040+
Note:
1041+
- For scene datasets, this deduplicates the underlying scene frames,
1042+
not the scenes themselves. Frame reference IDs or dataset item IDs
1043+
should be provided for scene datasets.
1044+
- For very large datasets, this operation may take significant time.
1045+
"""
1046+
# Client-side validation
1047+
if reference_ids is not None and len(reference_ids) == 0:
1048+
raise ValueError(
1049+
"reference_ids cannot be empty. Omit reference_ids parameter to deduplicate entire dataset."
1050+
)
1051+
1052+
payload: Dict[str, Any] = {THRESHOLD_KEY: threshold}
1053+
if reference_ids is not None:
1054+
payload[REFERENCE_IDS_KEY] = reference_ids
1055+
1056+
response = self._client.make_request(
1057+
payload, f"dataset/{self.id}/deduplicate"
1058+
)
1059+
return DeduplicationResult(
1060+
unique_item_ids=response["unique_item_ids"],
1061+
unique_reference_ids=response["unique_reference_ids"],
1062+
stats=DeduplicationStats(
1063+
threshold=threshold,
1064+
original_count=response["stats"]["original_count"],
1065+
deduplicated_count=response["stats"]["deduplicated_count"],
1066+
),
1067+
)
1068+
1069+
def deduplicate_by_ids(
1070+
self,
1071+
threshold: int,
1072+
dataset_item_ids: List[str],
1073+
) -> DeduplicationResult:
1074+
"""Deduplicate images or frames using internal Nucleus dataset item IDs.
1075+
1076+
This method identifies items by internal Nucleus IDs (e.g., "di_abc123...")
1077+
which are system-assigned when items are uploaded. To deduplicate using
1078+
your own user-defined reference IDs instead, or to deduplicate the entire
1079+
dataset, use `deduplicate()`.
1080+
1081+
Parameters:
1082+
threshold: Hamming distance threshold (0-64). Lower = stricter.
1083+
0 = exact matches only.
1084+
dataset_item_ids: List of internal Nucleus dataset item IDs to deduplicate.
1085+
These IDs are generated by Nucleus; they are not
1086+
user-defined reference IDs. Must be non-empty.
1087+
1088+
Returns:
1089+
DeduplicationResult with unique_item_ids, unique_reference_ids, and stats.
1090+
1091+
Raises:
1092+
ValueError: If dataset_item_ids is empty.
1093+
NucleusAPIError: If threshold is not an integer between 0 and 64 inclusive.
1094+
NucleusAPIError: If any dataset_item_id is not found in the dataset.
1095+
NucleusAPIError: If any item is missing a perceptual hash (pHash).
1096+
Contact Scale support if this occurs.
1097+
"""
1098+
# Client-side validation
1099+
if not dataset_item_ids:
1100+
raise ValueError(
1101+
"dataset_item_ids must be non-empty. Use deduplicate() for entire dataset."
1102+
)
1103+
1104+
payload = {
1105+
DATASET_ITEM_IDS_KEY: dataset_item_ids,
1106+
THRESHOLD_KEY: threshold,
1107+
}
1108+
response = self._client.make_request(
1109+
payload, f"dataset/{self.id}/deduplicate"
1110+
)
1111+
return DeduplicationResult(
1112+
unique_item_ids=response["unique_item_ids"],
1113+
unique_reference_ids=response["unique_reference_ids"],
1114+
stats=DeduplicationStats(
1115+
threshold=threshold,
1116+
original_count=response["stats"]["original_count"],
1117+
deduplicated_count=response["stats"]["deduplicated_count"],
1118+
),
1119+
)
1120+
10091121
def build_slice(
10101122
self,
10111123
name: str,

nucleus/deduplication.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
from dataclasses import dataclass
2+
from typing import List
3+
4+
5+
@dataclass
6+
class DeduplicationStats:
7+
threshold: int
8+
original_count: int
9+
deduplicated_count: int
10+
11+
12+
@dataclass
13+
class DeduplicationResult:
14+
unique_item_ids: List[str] # Internal dataset item IDs
15+
unique_reference_ids: List[str] # User-defined reference IDs
16+
stats: DeduplicationStats

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ ignore = ["E501", "E741", "E731", "F401"] # Easy ignore for getting it running
2525

2626
[tool.poetry]
2727
name = "scale-nucleus"
28-
version = "0.17.11"
28+
version = "0.17.12"
2929
description = "The official Python client library for Nucleus, the Data Platform for AI"
3030
license = "MIT"
3131
authors = ["Scale AI Nucleus Team <nucleusapi@scaleapi.com>"]

tests/helpers.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
EVAL_FUNCTION_THRESHOLD = 0.5
2525
EVAL_FUNCTION_COMPARISON = ThresholdComparison.GREATER_THAN_EQUAL_TO
2626

27+
DEDUP_DEFAULT_TEST_THRESHOLD = 10
28+
2729

2830
TEST_IMG_URLS = [
2931
"https://github.com/scaleapi/nucleus-python-client/raw/master/tests/testdata/airplane.jpeg",

0 commit comments

Comments
 (0)