Fix silent data truncation in client.query.sql() — add @odata.nextLink pagination (#157) (#159)

sagebree · Samson Gebre · claude · web-flow · commit ecc4d079e6c7 · 2026-04-10T14:33:40.000-07:00
- client.query.sql() silently returned only the first 5,000 rows
regardless of result set size. The method now follows @odata.nextLink
until all pages are exhausted.
 
- Added _extract_pagingcookie() helper to detect a confirmed server-side
bug where the Dataverse SQL endpoint returns successive @odata.nextLink
responses with pagenumber incrementing but the pagingcookie GUIDs
(keyset cursor) never advancing — causing an infinite pagination loop.
The SDK now detects and breaks out of this condition and emits a
RuntimeWarning.

- Pagination is guarded against three failure modes: exact URL cycles,
stuck pagingcookie (server bug), and failed or non-JSON next-page
responses. All three emit RuntimeWarning with the partial row count and
actionable guidance.

---------

Co-authored-by: Samson Gebre <sagebree@microsoft.com>
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,23 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [Unreleased]
+
+### Added
+- Batch API: `client.batch` namespace for deferred-execution batch operations that pack multiple Dataverse Web API calls into a single `POST $batch` HTTP request (#129)
+- Batch DataFrame integration: `client.batch.dataframe` namespace with pandas DataFrame wrappers for batch operations (#129)
+- `client.records.upsert()` and `client.batch.records.upsert()` backed by the `UpsertMultiple` bound action with alternate-key support (#129)
+- QueryBuilder: `client.query.builder("table")` with a fluent API, 20+ chainable methods (`select`, `filter_eq`, `filter_contains`, `order_by`, `expand`, etc.), and composable filter expressions using Python operators (`&`, `|`, `~`) (#118)
+- Memo/multiline column type support: `"memo"` (or `"multiline"`) can now be passed as a column type in `client.tables.create()` and `client.tables.add_columns()` (#155)
+
+### Changed
+- Picklist label-to-integer resolution now uses a single bulk `PicklistAttributeMetadata` API call for the entire table instead of per-attribute requests, with a 1-hour TTL cache (#154)
+
+### Fixed
+- `client.query.sql()` silently truncated results at 5,000 rows. The method now follows `@odata.nextLink` pagination and returns all matching rows (#157).
+- Alternate key fields were incorrectly merged into the `UpsertMultiple` request body, causing `400 Bad Request` on the create path (#129)
+- Docstring type annotations corrected for Microsoft Learn API reference compatibility (#153)
+
 ## [0.1.0b7] - 2026-03-17
 
 ### Added
@@ -91,6 +108,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Comprehensive error handling with specific exception types (`DataverseError`, `AuthenticationError`, etc.) (#22, #24)
 - HTTP retry logic with exponential backoff for resilient operations (#72)
 
+[Unreleased]: https://github.com/microsoft/PowerPlatform-DataverseClient-Python/compare/v0.1.0b7...HEAD
 [0.1.0b7]: https://github.com/microsoft/PowerPlatform-DataverseClient-Python/compare/v0.1.0b6...v0.1.0b7
 [0.1.0b6]: https://github.com/microsoft/PowerPlatform-DataverseClient-Python/compare/v0.1.0b5...v0.1.0b6
 [0.1.0b5]: https://github.com/microsoft/PowerPlatform-DataverseClient-Python/compare/v0.1.0b4...v0.1.0b5
diff --git a/src/PowerPlatform/Dataverse/data/_odata.py b/src/PowerPlatform/Dataverse/data/_odata.py
@@ -13,12 +13,13 @@
 import re
 import json
 import uuid
+import warnings
 from datetime import datetime, timezone
 import importlib.resources as ir
 from contextlib import contextmanager
 from contextvars import ContextVar
 
-from urllib.parse import quote as _url_quote
+from urllib.parse import quote as _url_quote, parse_qs, urlparse
 
 from ..core._http import _HttpClient
 from ._upload import _FileUploadMixin
@@ -54,6 +55,34 @@
 _DEFAULT_EXPECTED_STATUSES: tuple[int, ...] = (200, 201, 202, 204)
 
 
+def _extract_pagingcookie(next_link: str) -> Optional[str]:
+    """Extract the raw pagingcookie value from a SQL ``@odata.nextLink`` URL.
+
+    The Dataverse SQL endpoint has a server-side bug where the pagingcookie
+    (containing first/last record GUIDs) does not advance between pages even
+    though ``pagenumber`` increments. Detecting a repeated cookie lets the
+    pagination loop break instead of looping indefinitely.
+
+    Returns the pagingcookie string if present, or ``None`` if not found.
+    """
+    try:
+        qs = parse_qs(urlparse(next_link).query)
+        skiptoken = qs.get("$skiptoken", [None])[0]
+        if not skiptoken:
+            return None
+        # parse_qs already URL-decodes the value once, giving the outer XML with
+        # pagingcookie still percent-encoded (e.g. pagingcookie="%3ccookie...").
+        # A second decode is intentionally omitted: decoding again would turn %22
+        # into " inside the cookie XML, breaking the regex and causing every page
+        # to extract the same truncated prefix regardless of the actual GUIDs.
+        m = re.search(r'pagingcookie="([^"]+)"', skiptoken)
+        if m:
+            return m.group(1)
+    except Exception:
+        pass
+    return None
+
+
 @dataclass
 class _RequestContext:
     """Structured request context used by ``_request`` to clarify payload and metadata."""
@@ -776,15 +805,86 @@ def _query_sql(self, sql: str) -> list[dict[str, Any]]:
             body = r.json()
         except ValueError:
             return []
-        if isinstance(body, dict):
-            value = body.get("value")
-            if isinstance(value, list):
-                # Ensure dict rows only
-                return [row for row in value if isinstance(row, dict)]
-        # Fallbacks: if body itself is a list
+
+        # Collect first page
+        results: list[dict[str, Any]] = []
         if isinstance(body, list):
             return [row for row in body if isinstance(row, dict)]
-        return []
+        if not isinstance(body, dict):
+            return results
+
+        value = body.get("value")
+        if isinstance(value, list):
+            results = [row for row in value if isinstance(row, dict)]
+
+        # Follow pagination links until exhausted
+        raw_link = body.get("@odata.nextLink") or body.get("odata.nextLink")
+        next_link: str | None = raw_link if isinstance(raw_link, str) else None
+        visited: set[str] = set()
+        seen_cookies: set[str] = set()
+        while next_link:
+            # Guard 1: exact URL cycle (same next_link returned twice)
+            if next_link in visited:
+                warnings.warn(
+                    f"SQL pagination stopped after {len(results)} rows — "
+                    "the Dataverse server returned the same nextLink URL twice, "
+                    "indicating an infinite pagination cycle. "
+                    "Returning the rows collected so far. "
+                    "To avoid pagination entirely, add a TOP clause to your query.",
+                    RuntimeWarning,
+                    stacklevel=4,
+                )
+                break
+            visited.add(next_link)
+            # Guard 2: server-side bug where pagingcookie does not advance between
+            # pages (pagenumber increments but cookie GUIDs stay the same), which
+            # causes an infinite loop even though URLs differ.
+            cookie = _extract_pagingcookie(next_link)
+            if cookie is not None:
+                if cookie in seen_cookies:
+                    warnings.warn(
+                        f"SQL pagination stopped after {len(results)} rows — "
+                        "the Dataverse server returned the same pagingcookie twice "
+                        "(pagenumber incremented but the paging position did not advance). "
+                        "This is a server-side bug. Returning the rows collected so far. "
+                        "To avoid pagination entirely, add a TOP clause to your query.",
+                        RuntimeWarning,
+                        stacklevel=4,
+                    )
+                    break
+                seen_cookies.add(cookie)
+            try:
+                page_resp = self._request("get", next_link)
+            except Exception as exc:
+                warnings.warn(
+                    f"SQL pagination stopped after {len(results)} rows — "
+                    f"the next-page request failed: {exc}. "
+                    "Add a TOP clause to your query to limit results to a single page.",
+                    RuntimeWarning,
+                    stacklevel=5,
+                )
+                break
+            try:
+                page_body = page_resp.json()
+            except ValueError as exc:
+                warnings.warn(
+                    f"SQL pagination stopped after {len(results)} rows — "
+                    f"the next-page response was not valid JSON: {exc}. "
+                    "Add a TOP clause to your query to limit results to a single page.",
+                    RuntimeWarning,
+                    stacklevel=5,
+                )
+                break
+            if not isinstance(page_body, dict):
+                break
+            page_value = page_body.get("value")
+            if not isinstance(page_value, list) or not page_value:
+                break
+            results.extend(row for row in page_value if isinstance(row, dict))
+            raw_link = page_body.get("@odata.nextLink") or page_body.get("odata.nextLink")
+            next_link = raw_link if isinstance(raw_link, str) else None
+
+        return results
 
     @staticmethod
     def _extract_logical_table(sql: str) -> str:
diff --git a/tests/unit/data/test_sql_parse.py b/tests/unit/data/test_sql_parse.py