
Commit 61a9ee1

Abel Milash and claude committed
Auto-chunk *Multiple operations at 1,000 records (issue #156)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 78cd852 commit 61a9ee1

6 files changed

Lines changed: 730 additions & 40 deletions


.claude/skills/dataverse-sdk-use/SKILL.md

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ Use the PowerPlatform Dataverse Client Python SDK to interact with Microsoft Dat
 - `client.batch` -- batch multiple operations into a single HTTP request
 
 ### Bulk Operations
-The SDK supports Dataverse's native bulk operations: Pass lists to `create()`, `update()` for automatic bulk processing, for `delete()`, set `use_bulk_delete` when passing lists to use bulk operation
+The SDK supports Dataverse's native bulk operations: Pass lists to `create()`, `update()`, or `upsert()` for automatic bulk processing; for `delete()`, set `use_bulk_delete=True`. Lists exceeding 1,000 records are automatically split into sequential 1,000-record chunks — no manual pre-splitting needed. Operations across chunks are **not atomic**: a failure mid-way may leave earlier chunks applied.
 
 ### Paging
 - Control page size with `page_size` parameter
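
A minimal usage sketch of the committed behavior (editor's illustration, not part of this commit), assuming a connected `client` per the README and a hypothetical list of 2,500 `account` payloads:

```python
# Hedged sketch: bulk operations with auto-chunking. The 2,500 payloads below
# are illustrative; the calls match the README examples in this commit.
records = [{"name": f"Account {i}"} for i in range(2500)]

# One call; the SDK splits this into three sequential chunks (1000/1000/500).
ids = client.records.create("account", records)

# The same auto-chunking applies to bulk update and bulk delete.
client.records.update("account", ids, {"industry": "Technology"})
client.records.delete("account", ids, use_bulk_delete=True)
```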

README.md

Lines changed: 4 additions & 0 deletions
@@ -186,6 +186,10 @@ client.records.update("account", ids, {"industry": "Technology"})
 client.records.delete("account", ids, use_bulk_delete=True)
 ```
 
+> **Large batches**: Lists exceeding 1,000 records are automatically split into sequential
+> 1,000-record chunks — no manual pre-splitting needed. Note that chunked operations are
+> **not atomic**: a failure mid-way may leave earlier chunks applied.
+
 ### Upsert operations
 
 Use `client.records.upsert()` to create or update records identified by alternate keys. When the
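
Because chunks commit independently, a caller should reconcile before retrying a failed large batch. A hedged sketch (editor's illustration, not from the commit), reusing `client` from the README example above:

```python
# Hedged sketch: a failure mid-way through an auto-chunked create can leave
# earlier 1,000-record chunks committed even though the call raised.
records = [{"name": f"Account {i}"} for i in range(1500)]
try:
    ids = client.records.create("account", records)
except Exception:
    # The first chunk may already be committed; query for the partially
    # created rows and reconcile rather than retrying the full list blindly.
    raise
```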

examples/advanced/walkthrough.py

Lines changed: 32 additions & 0 deletions
@@ -257,6 +257,37 @@ def _run_walkthrough(client):
         record_ids = [r.get("new_walkthroughdemoid")[:8] + "..." for r in page]
         print(f" Page {page_num}: {len(page)} records - IDs: {record_ids}")
 
+    # ============================================================================
+    # 6b. LARGE BATCH (AUTO-CHUNKING)
+    # The SDK automatically splits lists > 1,000 records into sequential chunks,
+    # each dispatched as a separate CreateMultiple / UpdateMultiple / UpsertMultiple
+    # request. No manual pre-splitting needed.
+    # Note: chunked operations are NOT atomic — a failure mid-way leaves earlier
+    # chunks applied.
+    # ============================================================================
+    print("\n" + "=" * 80)
+    print("6b. Large Batch (Auto-Chunking)")
+    print("=" * 80)
+
+    LARGE_BATCH_SIZE = 1200  # spans 2 chunks: first 1000 + remaining 200
+    log_call(f"client.records.create('{table_name}', [{LARGE_BATCH_SIZE} records])  # auto-chunked")
+    large_batch_records = [
+        {
+            "new_Title": f"Batch item {i}",
+            "new_Quantity": i % 100,
+            "new_Amount": float(i),
+            "new_Completed": False,
+            "new_Priority": Priority.LOW,
+        }
+        for i in range(LARGE_BATCH_SIZE)
+    ]
+    large_batch_ids = backoff(lambda: client.records.create(table_name, large_batch_records))
+    print(f"[OK] Created {len(large_batch_ids)} records across 2 auto-chunks (1000 + 200)")
+
+    log_call(f"client.records.update('{table_name}', [{LARGE_BATCH_SIZE} IDs], {{...}})  # auto-chunked")
+    backoff(lambda: client.records.update(table_name, large_batch_ids, {"new_Completed": True}))
+    print(f"[OK] Updated {len(large_batch_ids)} records across 2 auto-chunks")
+
     # ============================================================================
     # 7. QUERYBUILDER - FLUENT QUERIES
     # ============================================================================
@@ -602,6 +633,7 @@ def _run_walkthrough(client):
     print(" [OK] Reading records by ID and with filters")
     print(" [OK] Single and multiple record updates")
     print(" [OK] Paging through large result sets")
+    print(" [OK] Large batch auto-chunking (1,200 records split into 2 chunks)")
     print(" [OK] QueryBuilder fluent queries (filter_eq, filter_in, filter_between, where, to_dataframe)")
     print(" [OK] Expand navigation properties (simple + nested ExpandOption)")
     print(" [OK] SQL queries")

src/PowerPlatform/Dataverse/data/_odata.py

Lines changed: 74 additions & 33 deletions
@@ -52,6 +52,7 @@
 _GUID_RE = re.compile(r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}")
 _CALL_SCOPE_CORRELATION_ID: ContextVar[Optional[str]] = ContextVar("_CALL_SCOPE_CORRELATION_ID", default=None)
 _DEFAULT_EXPECTED_STATUSES: tuple[int, ...] = (200, 201, 202, 204)
+_MULTIPLE_BATCH_SIZE = 1000
 
 
 @dataclass
@@ -331,9 +332,17 @@ def _create(self, entity_set: str, table_schema_name: str, record: Dict[str, Any
                 f"Create response missing GUID in OData-EntityId/Location headers (status={getattr(r,'status_code', '?')}). Headers: {header_keys}"
             )
 
-    def _create_multiple(self, entity_set: str, table_schema_name: str, records: List[Dict[str, Any]]) -> List[str]:
+    def _create_multiple(
+        self,
+        entity_set: str,
+        table_schema_name: str,
+        records: List[Dict[str, Any]],
+    ) -> List[str]:
         """Create multiple records using the collection-bound ``CreateMultiple`` action.
 
+        Large record lists are automatically split into chunks of up to
+        ``_MULTIPLE_BATCH_SIZE`` records and dispatched sequentially.
+
         :param entity_set: Resolved entity set (plural) name.
         :type entity_set: ``str``
         :param table_schema_name: Schema name of the table.
@@ -345,35 +354,42 @@ def _create_multiple(self, entity_set: str, table_schema_name: str, records: Lis
         :rtype: ``list[str]``
 
         .. note::
-            Logical type stamping: if any payload omits ``@odata.type`` the client injects ``Microsoft.Dynamics.CRM.<table_logical_name>``. If all payloads already include ``@odata.type`` no modification occurs.
+            Logical type stamping: if any payload omits ``@odata.type`` the client
+            injects ``Microsoft.Dynamics.CRM.<table_logical_name>``. If all payloads
+            already include ``@odata.type`` no modification occurs.
+
+        .. warning::
+            When input exceeds ``_MULTIPLE_BATCH_SIZE`` records, the operation is
+            split into multiple requests and is **not atomic**. If a later batch
+            fails, earlier batches are already committed. Callers that require
+            atomicity should limit input to ``<= _MULTIPLE_BATCH_SIZE`` records.
         """
         if not all(isinstance(r, dict) for r in records):
             raise TypeError("All items for multi-create must be dicts")
-        r = self._execute_raw(self._build_create_multiple(entity_set, table_schema_name, records))
-        try:
-            body = r.json() if r.text else {}
-        except ValueError:
-            body = {}
-        if not isinstance(body, dict):
-            return []
-        # Expected: { "Ids": [guid, ...] }
-        ids = body.get("Ids")
-        if isinstance(ids, list):
-            return [i for i in ids if isinstance(i, str)]
-
-        value = body.get("value")
-        if isinstance(value, list):
-            # Extract IDs if possible
-            out: List[str] = []
-            for item in value:
-                if isinstance(item, dict):
-                    # Heuristic: look for a property ending with 'id'
-                    for k, v in item.items():
-                        if isinstance(k, str) and k.lower().endswith("id") and isinstance(v, str) and len(v) >= 32:
-                            out.append(v)
-                            break
-            return out
-        return []
+
+        all_ids: List[str] = []
+        for i in range(0, len(records), _MULTIPLE_BATCH_SIZE):
+            chunk = records[i : i + _MULTIPLE_BATCH_SIZE]
+            r = self._execute_raw(self._build_create_multiple(entity_set, table_schema_name, chunk))
+            try:
+                body = r.json() if r.text else {}
+            except ValueError:
+                body = {}
+            if not isinstance(body, dict):
+                continue
+            ids = body.get("Ids")
+            if isinstance(ids, list):
+                all_ids.extend(i for i in ids if isinstance(i, str))
+                continue
+            value = body.get("value")
+            if isinstance(value, list):
+                for item in value:
+                    if isinstance(item, dict):
+                        for k, v in item.items():
+                            if isinstance(k, str) and k.lower().endswith("id") and isinstance(v, str) and len(v) >= 32:
+                                all_ids.append(v)
+                                break
+        return all_ids
 
     def _build_alternate_key_str(self, alternate_key: Dict[str, Any]) -> str:
         """Build an OData alternate key segment from a mapping of key names to values.
@@ -467,6 +483,10 @@ def _upsert_multiple(
 
         :raises ValueError: If ``alternate_keys`` and ``records`` differ in length, or if
             any record payload contains an alternate key field with a conflicting value.
+
+        .. warning::
+            When input exceeds ``_MULTIPLE_BATCH_SIZE`` records, the operation is
+            split into multiple requests and is **not atomic** across batches.
         """
         if len(alternate_keys) != len(records):
             raise ValueError(
@@ -488,9 +508,12 @@
             key_str = self._build_alternate_key_str(alt_key)
             record_processed["@odata.id"] = f"{entity_set}({key_str})"
             targets.append(record_processed)
-        payload = {"Targets": targets}
+
         url = f"{self.api}/{entity_set}/Microsoft.Dynamics.CRM.UpsertMultiple"
-        self._request("post", url, json=payload, expected=(200, 201, 204))
+        for i in range(0, len(targets), _MULTIPLE_BATCH_SIZE):
+            chunk = targets[i : i + _MULTIPLE_BATCH_SIZE]
+            self._request("post", url, json={"Targets": chunk}, expected=(200, 201, 204))
+        return None
 
     # --- Derived helpers for high-level client ergonomics ---
     def _primary_id_attr(self, table_schema_name: str) -> str:
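
How the `Targets` list maps onto sequential request bodies, as a standalone sketch (editor's illustration, with 2,500 hypothetical upsert targets):

```python
# Sketch: 2,500 upsert targets sliced at the 1,000-record batch size become
# three sequential POST bodies of 1000, 1000, and 500 targets respectively.
_MULTIPLE_BATCH_SIZE = 1000
targets = [{"@odata.id": f"accounts(key='{i}')", "name": f"Account {i}"} for i in range(2500)]
payloads = [
    {"Targets": targets[i : i + _MULTIPLE_BATCH_SIZE]}
    for i in range(0, len(targets), _MULTIPLE_BATCH_SIZE)
]
print([len(p["Targets"]) for p in payloads])  # [1000, 1000, 500]
```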
@@ -509,7 +532,10 @@ def _primary_id_attr(self, table_schema_name: str) -> str:
         )
 
     def _update_by_ids(
-        self, table_schema_name: str, ids: List[str], changes: Union[Dict[str, Any], List[Dict[str, Any]]]
+        self,
+        table_schema_name: str,
+        ids: List[str],
+        changes: Union[Dict[str, Any], List[Dict[str, Any]]],
     ) -> None:
         """Update many records by GUID list using the collection-bound ``UpdateMultiple`` action.
 
@@ -607,9 +633,17 @@ def _update(self, table_schema_name: str, key: str, data: Dict[str, Any]) -> Non
         """
         self._execute_raw(self._build_update(table_schema_name, key, data))
 
-    def _update_multiple(self, entity_set: str, table_schema_name: str, records: List[Dict[str, Any]]) -> None:
+    def _update_multiple(
+        self,
+        entity_set: str,
+        table_schema_name: str,
+        records: List[Dict[str, Any]],
+    ) -> None:
         """Bulk update existing records via the collection-bound ``UpdateMultiple`` action.
 
+        Large record lists are automatically split into chunks of up to
+        ``_MULTIPLE_BATCH_SIZE`` records and dispatched sequentially.
+
         :param entity_set: Resolved entity set (plural) name.
         :type entity_set: ``str``
         :param table_schema_name: Schema name of the table, e.g. "new_MyTestTable".
@@ -621,13 +655,20 @@ def _update_multiple(self, entity_set: str, table_schema_name: str, records: Lis
 
         .. note::
             - Endpoint: ``POST /{entity_set}/Microsoft.Dynamics.CRM.UpdateMultiple`` with body ``{"Targets": [...]}``.
-            - Transactional semantics: if any individual update fails, the entire request rolls back.
+            - Transactional semantics apply within each batch; if a batch fails it rolls back, but earlier batches are already committed.
            - Response content is ignored; no stable contract for returned IDs/representations.
            - Caller must supply the correct primary key attribute (e.g. ``accountid``) in every record.
+
+        .. warning::
+            When input exceeds ``_MULTIPLE_BATCH_SIZE`` records, the operation is
+            split into multiple requests and is **not atomic** across batches.
         """
         if not isinstance(records, list) or not records or not all(isinstance(r, dict) for r in records):
             raise TypeError("records must be a non-empty list[dict]")
-        self._execute_raw(self._build_update_multiple_from_records(entity_set, table_schema_name, records))
+
+        for i in range(0, len(records), _MULTIPLE_BATCH_SIZE):
+            chunk = records[i : i + _MULTIPLE_BATCH_SIZE]
+            self._execute_raw(self._build_update_multiple_from_records(entity_set, table_schema_name, chunk))
         return None
 
     def _delete(self, table_schema_name: str, key: str) -> None:
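
Given the per-batch semantics documented above, a caller that needs checkpoint control can chunk manually at the same boundary so each request stays transactional. A hedged sketch (editor's illustration, reusing `client` and `ids` from the README examples):

```python
# Hedged sketch: manual chunking at the 1,000-record boundary restores the
# checkpoint control that a single auto-chunked call gives up. Each batch at
# or below the batch size is one transactional UpdateMultiple request.
CHUNK = 1000
offset = 0
while offset < len(ids):
    batch = ids[offset : offset + CHUNK]
    client.records.update("account", batch, {"industry": "Technology"})
    offset += CHUNK  # records before `offset` are durably committed
```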

src/PowerPlatform/Dataverse/operations/dataframe.py

Lines changed: 10 additions & 6 deletions
@@ -178,9 +178,11 @@ def create(
         IDs does not match the number of input rows.
 
     .. tip::
-        All rows are sent in a single ``CreateMultiple`` request. For very
-        large DataFrames, consider splitting into smaller batches to avoid
-        request timeouts.
+        The SDK automatically splits large DataFrames into sequential
+        1,000-row chunks before sending to ``CreateMultiple``. You do not
+        need to pre-split large DataFrames. Note that chunked operations
+        are **not atomic** — a failure mid-way may leave earlier chunks
+        applied.
 
     Example:
         Create records from a DataFrame::
@@ -253,9 +255,11 @@ def update(
         rows are never skipped.
 
     .. tip::
-        All rows are sent in a single ``UpdateMultiple`` request (or a
-        single PATCH for one row). For very large DataFrames, consider
-        splitting into smaller batches to avoid request timeouts.
+        The SDK automatically splits large DataFrames into sequential
+        1,000-row chunks before sending to ``UpdateMultiple`` (or a single
+        PATCH for one row). You do not need to pre-split large DataFrames.
+        Note that chunked operations are **not atomic** — a failure
+        mid-way may leave earlier chunks applied.
 
     Example:
         Update records with different values per row::
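
A hedged sketch of the DataFrame path (editor's illustration; the DataFrame-native entry point in operations/dataframe.py is not shown in this diff, so this routes through the known `client.records.create` instead):

```python
import pandas as pd

# Hedged sketch: a 2,500-row DataFrame converted to record dicts and sent
# through the bulk create path, which auto-chunks it into 1000/1000/500.
# `client` is assumed connected; column names are illustrative.
df = pd.DataFrame({
    "name": [f"Account {i}" for i in range(2500)],
    "industry": ["Technology"] * 2500,
})
records = df.to_dict(orient="records")
ids = client.records.create("account", records)
assert len(ids) == len(df)
```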
