Skip to content

Commit 878ca05

Browse files
authored
[DE-6999] Enable image deduplication within nucleus sdk (#452)
* Enable deduplication in nucleus sdk * Lint fixes * Fix import order * Add tests for deduplication sdk * Fix isort import formatting errors * Add fixture for image dataset specifically for dedup * Fix image dataset creation syntax * Create image dataset synchronously * Make dataset_with_duplicates fixture sync * Add dedup test for scene made with video url * Document difference between deduplicate and deduplicate_by_ids better in docstring * Add tests to cover all ingestion forms * Refactor tests to use DEDUP_DEFAULT_TEST_THRESHOLD constant * Use try-finally for dataset creation and deletion * Make edge case test docstrings more detailed * Remove deprecated video sync upload tests * Update test_jobs to be deterministic * Split jobs tests into listing and retrieval separately * Fix docstring typo
1 parent 671f475 commit 878ca05

9 files changed

Lines changed: 578 additions & 11 deletions

File tree

CHANGELOG.md

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,33 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

88

9+
## [0.17.12](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.17.12) - 2026-02-23
10+
11+
### Added
12+
- `Dataset.deduplicate()` method to deduplicate images using perceptual hashing. Accepts optional `reference_ids` to deduplicate specific items, or deduplicates the entire dataset when only `threshold` is provided. Required `threshold` parameter (0-64) controls similarity matching (lower = stricter, 0 = exact matches only).
13+
- `Dataset.deduplicate_by_ids()` method for deduplication using internal `dataset_item_ids` directly, avoiding the reference ID to item ID mapping for improved efficiency.
14+
- `DeduplicationResult` and `DeduplicationStats` dataclasses for structured deduplication results.
15+
16+
Example usage:
17+
18+
```python
19+
dataset = client.get_dataset("ds_...")
20+
21+
# Deduplicate entire dataset
22+
result = dataset.deduplicate(threshold=10)
23+
24+
# Deduplicate specific items by reference IDs
25+
result = dataset.deduplicate(threshold=10, reference_ids=["ref_1", "ref_2", "ref_3"])
26+
27+
# Deduplicate by internal item IDs (more efficient if you have them)
28+
result = dataset.deduplicate_by_ids(threshold=10, dataset_item_ids=["item_1", "item_2"])
29+
30+
# Access results
31+
print(f"Threshold: {result.stats.threshold}")
32+
print(f"Original: {result.stats.original_count}, Unique: {result.stats.deduplicated_count}")
33+
print(result.unique_reference_ids)
34+
```
35+
936
## [0.17.11](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.17.11) - 2025-11-03
1037

1138
### Added

nucleus/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
"AsyncJob",
55
"EmbeddingsExportJob",
66
"BoxAnnotation",
7+
"DeduplicationResult",
8+
"DeduplicationStats",
79
"BoxPrediction",
810
"CameraParams",
911
"CategoryAnnotation",
@@ -128,6 +130,7 @@
128130
from .data_transfer_object.job_status import JobInfoRequestPayload
129131
from .dataset import Dataset
130132
from .dataset_item import DatasetItem
133+
from .deduplication import DeduplicationResult, DeduplicationStats
131134
from .deprecation_warning import deprecated
132135
from .errors import (
133136
DatasetItemRetrievalError,

nucleus/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,7 @@
149149
SLICE_TAGS_KEY = "slice_tags"
150150
TAXONOMY_NAME_KEY = "taxonomy_name"
151151
TASK_ID_KEY = "task_id"
152+
THRESHOLD_KEY = "threshold"
152153
TRACK_REFERENCE_ID_KEY = "track_reference_id"
153154
TRACK_REFERENCE_IDS_KEY = "track_reference_ids"
154155
TRACKS_KEY = "tracks"

nucleus/dataset.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@
6767
REQUEST_ID_KEY,
6868
SCENE_IDS_KEY,
6969
SLICE_ID_KEY,
70+
THRESHOLD_KEY,
7071
TRACK_REFERENCE_IDS_KEY,
7172
TRACKS_KEY,
7273
TRAINED_SLICE_ID_KEY,
@@ -83,6 +84,7 @@
8384
check_items_have_dimensions,
8485
)
8586
from .dataset_item_uploader import DatasetItemUploader
87+
from .deduplication import DeduplicationResult, DeduplicationStats
8688
from .deprecation_warning import deprecated
8789
from .errors import NotFoundError, NucleusAPIError
8890
from .job import CustomerJobTypes, jobs_status_overview
@@ -1006,6 +1008,116 @@ def create_slice_by_ids(
10061008
)
10071009
return Slice(response[SLICE_ID_KEY], self._client)
10081010

1011+
def deduplicate(
1012+
self,
1013+
threshold: int,
1014+
reference_ids: Optional[List[str]] = None,
1015+
) -> DeduplicationResult:
1016+
"""Deduplicate images or frames using user-defined reference IDs.
1017+
1018+
This method can deduplicate an entire dataset (when reference_ids is omitted)
1019+
or a specific subset of items identified by the reference_id you assigned
1020+
when uploading (e.g., "image_001", "frame_xyz"). To deduplicate using
1021+
internal Nucleus item IDs instead, use `deduplicate_by_ids()`.
1022+
1023+
Parameters:
1024+
threshold: Hamming distance threshold (0-64). Lower = stricter.
1025+
0 = exact matches only.
1026+
reference_ids: Optional list of user-defined reference IDs to deduplicate.
1027+
If not provided (or None), deduplicates the entire dataset.
1028+
Cannot be an empty list - use None for entire dataset.
1029+
1030+
Returns:
1031+
DeduplicationResult with unique_reference_ids, unique_item_ids, and stats.
1032+
1033+
Raises:
1034+
ValueError: If reference_ids is an empty list (use None for entire dataset).
1035+
NucleusAPIError: If threshold is not an integer between 0 and 64 inclusive.
1036+
NucleusAPIError: If any reference_id is not found in the dataset.
1037+
NucleusAPIError: If any item is missing a perceptual hash (pHash).
1038+
Contact Scale support if this occurs.
1039+
1040+
Note:
1041+
- For scene datasets, this deduplicates the underlying scene frames,
1042+
not the scenes themselves. Frame reference IDs or dataset item IDs
1043+
should be provided for scene datasets.
1044+
- For very large datasets, this operation may take significant time.
1045+
"""
1046+
# Client-side validation
1047+
if reference_ids is not None and len(reference_ids) == 0:
1048+
raise ValueError(
1049+
"reference_ids cannot be empty. Omit reference_ids parameter to deduplicate entire dataset."
1050+
)
1051+
1052+
payload: Dict[str, Any] = {THRESHOLD_KEY: threshold}
1053+
if reference_ids is not None:
1054+
payload[REFERENCE_IDS_KEY] = reference_ids
1055+
1056+
response = self._client.make_request(
1057+
payload, f"dataset/{self.id}/deduplicate"
1058+
)
1059+
return DeduplicationResult(
1060+
unique_item_ids=response["unique_item_ids"],
1061+
unique_reference_ids=response["unique_reference_ids"],
1062+
stats=DeduplicationStats(
1063+
threshold=threshold,
1064+
original_count=response["stats"]["original_count"],
1065+
deduplicated_count=response["stats"]["deduplicated_count"],
1066+
),
1067+
)
1068+
1069+
def deduplicate_by_ids(
1070+
self,
1071+
threshold: int,
1072+
dataset_item_ids: List[str],
1073+
) -> DeduplicationResult:
1074+
"""Deduplicate images or frames using internal Nucleus dataset item IDs.
1075+
1076+
This method identifies items by internal Nucleus IDs (e.g., "di_abc123...")
1077+
which are system-assigned when items are uploaded. To deduplicate using
1078+
your own user-defined reference IDs instead, or to deduplicate the entire
1079+
dataset, use `deduplicate()`.
1080+
1081+
Parameters:
1082+
threshold: Hamming distance threshold (0-64). Lower = stricter.
1083+
0 = exact matches only.
1084+
dataset_item_ids: List of internal Nucleus dataset item IDs to deduplicate.
1085+
These IDs are generated by Nucleus; they are not
1086+
user-defined reference IDs. Must be non-empty.
1087+
1088+
Returns:
1089+
DeduplicationResult with unique_item_ids, unique_reference_ids, and stats.
1090+
1091+
Raises:
1092+
ValueError: If dataset_item_ids is empty.
1093+
NucleusAPIError: If threshold is not an integer between 0 and 64 inclusive.
1094+
NucleusAPIError: If any dataset_item_id is not found in the dataset.
1095+
NucleusAPIError: If any item is missing a perceptual hash (pHash).
1096+
Contact Scale support if this occurs.
1097+
"""
1098+
# Client-side validation
1099+
if not dataset_item_ids:
1100+
raise ValueError(
1101+
"dataset_item_ids must be non-empty. Use deduplicate() for entire dataset."
1102+
)
1103+
1104+
payload = {
1105+
DATASET_ITEM_IDS_KEY: dataset_item_ids,
1106+
THRESHOLD_KEY: threshold,
1107+
}
1108+
response = self._client.make_request(
1109+
payload, f"dataset/{self.id}/deduplicate"
1110+
)
1111+
return DeduplicationResult(
1112+
unique_item_ids=response["unique_item_ids"],
1113+
unique_reference_ids=response["unique_reference_ids"],
1114+
stats=DeduplicationStats(
1115+
threshold=threshold,
1116+
original_count=response["stats"]["original_count"],
1117+
deduplicated_count=response["stats"]["deduplicated_count"],
1118+
),
1119+
)
1120+
10091121
def build_slice(
10101122
self,
10111123
name: str,

nucleus/deduplication.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
from dataclasses import dataclass
2+
from typing import List
3+
4+
5+
@dataclass
6+
class DeduplicationStats:
7+
threshold: int
8+
original_count: int
9+
deduplicated_count: int
10+
11+
12+
@dataclass
13+
class DeduplicationResult:
14+
unique_item_ids: List[str] # Internal dataset item IDs
15+
unique_reference_ids: List[str] # User-defined reference IDs
16+
stats: DeduplicationStats

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ ignore = ["E501", "E741", "E731", "F401"] # Easy ignore for getting it running
2525

2626
[tool.poetry]
2727
name = "scale-nucleus"
28-
version = "0.17.11"
28+
version = "0.17.12"
2929
description = "The official Python client library for Nucleus, the Data Platform for AI"
3030
license = "MIT"
3131
authors = ["Scale AI Nucleus Team <nucleusapi@scaleapi.com>"]

tests/helpers.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
EVAL_FUNCTION_THRESHOLD = 0.5
2525
EVAL_FUNCTION_COMPARISON = ThresholdComparison.GREATER_THAN_EQUAL_TO
2626

27+
DEDUP_DEFAULT_TEST_THRESHOLD = 10
28+
2729

2830
TEST_IMG_URLS = [
2931
"https://github.com/scaleapi/nucleus-python-client/raw/master/tests/testdata/airplane.jpeg",

0 commit comments

Comments
 (0)