|
67 | 67 | REQUEST_ID_KEY, |
68 | 68 | SCENE_IDS_KEY, |
69 | 69 | SLICE_ID_KEY, |
| 70 | + THRESHOLD_KEY, |
70 | 71 | TRACK_REFERENCE_IDS_KEY, |
71 | 72 | TRACKS_KEY, |
72 | 73 | TRAINED_SLICE_ID_KEY, |
|
83 | 84 | check_items_have_dimensions, |
84 | 85 | ) |
85 | 86 | from .dataset_item_uploader import DatasetItemUploader |
| 87 | +from .deduplication import DeduplicationResult, DeduplicationStats |
86 | 88 | from .deprecation_warning import deprecated |
87 | 89 | from .errors import NotFoundError, NucleusAPIError |
88 | 90 | from .job import CustomerJobTypes, jobs_status_overview |
@@ -1006,6 +1008,116 @@ def create_slice_by_ids( |
1006 | 1008 | ) |
1007 | 1009 | return Slice(response[SLICE_ID_KEY], self._client) |
1008 | 1010 |
|
def deduplicate(
    self,
    threshold: int,
    reference_ids: Optional[List[str]] = None,
) -> DeduplicationResult:
    """Drop near-duplicate images or frames, addressed by reference ID.

    With ``reference_ids`` left as ``None`` the whole dataset is
    deduplicated. Otherwise only the items carrying the given
    user-assigned reference IDs (the ones chosen at upload time, e.g.
    "image_001", "frame_xyz") are considered. Callers holding internal
    Nucleus item IDs should call `deduplicate_by_ids()` instead.

    Parameters:
        threshold: Hamming distance threshold (0-64). A lower value is
            stricter; 0 keeps only exact matches together.
        reference_ids: User-defined reference IDs to restrict the
            operation to. ``None`` (the default) means the entire
            dataset. An empty list is rejected; pass ``None`` instead.

    Returns:
        DeduplicationResult holding ``unique_item_ids``,
        ``unique_reference_ids`` and ``stats``. ``stats.threshold`` is
        the value passed in by the caller; the two counts are read from
        the API response.

    Raises:
        ValueError: If ``reference_ids`` is an empty list.
        NucleusAPIError: If threshold is not an integer between 0 and 64
            inclusive.
        NucleusAPIError: If any reference_id is not found in the dataset.
        NucleusAPIError: If any item is missing a perceptual hash (pHash).
            Contact Scale support if this occurs.

    Note:
        - On scene datasets the underlying scene frames are deduplicated,
          not the scenes themselves, so frame reference IDs should be
          supplied here.
        - For very large datasets, this operation may take significant
          time.
    """
    # Reject an explicit empty selection before any request is built, so
    # "nothing selected" is never confused with "whole dataset".
    if reference_ids is not None:
        if len(reference_ids) == 0:
            raise ValueError(
                "reference_ids cannot be empty. Omit reference_ids parameter to deduplicate entire dataset."
            )

    # The reference-ID filter is only sent when the caller supplied one.
    request_body: Dict[str, Any] = {THRESHOLD_KEY: threshold}
    if reference_ids is not None:
        request_body[REFERENCE_IDS_KEY] = reference_ids

    send = self._client.make_request
    api_result = send(request_body, f"dataset/{self.id}/deduplicate")

    kept_item_ids = api_result["unique_item_ids"]
    kept_reference_ids = api_result["unique_reference_ids"]
    counts = api_result["stats"]
    summary = DeduplicationStats(
        threshold=threshold,
        original_count=counts["original_count"],
        deduplicated_count=counts["deduplicated_count"],
    )
    return DeduplicationResult(
        unique_item_ids=kept_item_ids,
        unique_reference_ids=kept_reference_ids,
        stats=summary,
    )
| 1068 | + |
def deduplicate_by_ids(
    self,
    threshold: int,
    dataset_item_ids: List[str],
) -> DeduplicationResult:
    """Drop near-duplicate images or frames, addressed by Nucleus item ID.

    Items are selected by their internal, system-assigned Nucleus IDs
    (e.g., "di_abc123..."). To select by your own reference IDs, or to
    process the entire dataset, call `deduplicate()` instead.

    Parameters:
        threshold: Hamming distance threshold (0-64). A lower value is
            stricter; 0 keeps only exact matches together.
        dataset_item_ids: Internal Nucleus dataset item IDs to
            deduplicate. These are generated by Nucleus and are not the
            user-defined reference IDs. Must be non-empty.

    Returns:
        DeduplicationResult holding ``unique_item_ids``,
        ``unique_reference_ids`` and ``stats``. ``stats.threshold`` is
        the value passed in by the caller; the two counts are read from
        the API response.

    Raises:
        ValueError: If ``dataset_item_ids`` is empty.
        NucleusAPIError: If threshold is not an integer between 0 and 64
            inclusive.
        NucleusAPIError: If any dataset_item_id is not found in the
            dataset.
        NucleusAPIError: If any item is missing a perceptual hash (pHash).
            Contact Scale support if this occurs.
    """
    # An empty selection is refused up front; whole-dataset runs go
    # through deduplicate().
    if not dataset_item_ids:
        raise ValueError(
            "dataset_item_ids must be non-empty. Use deduplicate() for entire dataset."
        )

    # Key order (item IDs, then threshold) is kept as in the request body
    # this method has always sent.
    request_body = {DATASET_ITEM_IDS_KEY: dataset_item_ids}
    request_body[THRESHOLD_KEY] = threshold

    send = self._client.make_request
    api_result = send(request_body, f"dataset/{self.id}/deduplicate")

    kept_item_ids = api_result["unique_item_ids"]
    kept_reference_ids = api_result["unique_reference_ids"]
    counts = api_result["stats"]
    summary = DeduplicationStats(
        threshold=threshold,
        original_count=counts["original_count"],
        deduplicated_count=counts["deduplicated_count"],
    )
    return DeduplicationResult(
        unique_item_ids=kept_item_ids,
        unique_reference_ids=kept_reference_ids,
        stats=summary,
    )
| 1120 | + |
1009 | 1121 | def build_slice( |
1010 | 1122 | self, |
1011 | 1123 | name: str, |
|
0 commit comments