feat: implement pagination handling in SQL queries

Samson Gebre · Samson Gebre · commit 63bdd679af6b · 2026-04-08T15:32:46.000-07:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,11 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [Unreleased]
+
+### Fixed
+- `client.query.sql()` silently truncated results at 5,000 rows. The method now follows `@odata.nextLink` pagination and returns all matching rows (#157).
+
 ## [0.1.0b5] - 2026-02-27
 
 ### Fixed
diff --git a/src/PowerPlatform/Dataverse/data/_odata.py b/src/PowerPlatform/Dataverse/data/_odata.py
@@ -13,12 +13,13 @@
 import re
 import json
 import uuid
+import warnings
 from datetime import datetime, timezone
 import importlib.resources as ir
 from contextlib import contextmanager
 from contextvars import ContextVar
 
-from urllib.parse import quote as _url_quote
+from urllib.parse import quote as _url_quote, parse_qs, urlparse
 
 from ..core._http import _HttpClient
 from ._upload import _FileUploadMixin
@@ -46,6 +47,34 @@
 _DEFAULT_EXPECTED_STATUSES: tuple[int, ...] = (200, 201, 202, 204)
 
 
+def _extract_pagingcookie(next_link: str) -> Optional[str]:
+    """Extract the raw pagingcookie value from a SQL ``@odata.nextLink`` URL.
+
+    The Dataverse SQL endpoint has a server-side bug where the pagingcookie
+    (containing first/last record GUIDs) does not advance between pages even
+    though ``pagenumber`` increments. Detecting a repeated cookie lets the
+    pagination loop break instead of looping indefinitely.
+
+    Returns the pagingcookie string if present, or ``None`` if not found.
+    """
+    try:
+        qs = parse_qs(urlparse(next_link).query)
+        skiptoken = qs.get("$skiptoken", [None])[0]
+        if not skiptoken:
+            return None
+        # parse_qs already URL-decodes the value once, giving the outer XML with
+        # pagingcookie still percent-encoded (e.g. pagingcookie="%3ccookie...").
+        # A second decode is intentionally omitted: decoding again would turn %22
+        # into " inside the cookie XML, breaking the regex and causing every page
+        # to extract the same truncated prefix regardless of the actual GUIDs.
+        m = re.search(r'pagingcookie="([^"]+)"', skiptoken)
+        if m:
+            return m.group(1)
+    except Exception:
+        pass
+    return None
+
+
 @dataclass
 class _RequestContext:
     """Structured request context used by ``_request`` to clarify payload and metadata."""
@@ -733,15 +762,77 @@ def _query_sql(self, sql: str) -> list[dict[str, Any]]:
             body = r.json()
         except ValueError:
             return []
-        if isinstance(body, dict):
-            value = body.get("value")
-            if isinstance(value, list):
-                # Ensure dict rows only
-                return [row for row in value if isinstance(row, dict)]
-        # Fallbacks: if body itself is a list
+
+        # Collect first page
+        results: list[dict[str, Any]] = []
         if isinstance(body, list):
             return [row for row in body if isinstance(row, dict)]
-        return []
+        if not isinstance(body, dict):
+            return results
+
+        value = body.get("value")
+        if isinstance(value, list):
+            results = [row for row in value if isinstance(row, dict)]
+
+        # Follow pagination links until exhausted
+        raw_link = body.get("@odata.nextLink") or body.get("odata.nextLink")
+        next_link: str | None = raw_link if isinstance(raw_link, str) else None
+        visited: set[str] = set()
+        seen_cookies: set[str] = set()
+        while next_link:
+            # Guard 1: exact URL cycle (same next_link returned twice)
+            if next_link in visited:
+                break
+            visited.add(next_link)
+            # Guard 2: server-side bug where pagingcookie does not advance between
+            # pages (pagenumber increments but cookie GUIDs stay the same), which
+            # causes an infinite loop even though URLs differ.
+            cookie = _extract_pagingcookie(next_link)
+            if cookie is not None:
+                if cookie in seen_cookies:
+                    warnings.warn(
+                        f"SQL pagination stopped after {len(results)} rows — "
+                        "the Dataverse server returned the same pagingcookie twice "
+                        "(pagenumber incremented but the paging position did not advance). "
+                        "This is a server-side bug. Returning the rows collected so far. "
+                        "To avoid pagination entirely, add a TOP clause to your query.",
+                        RuntimeWarning,
+                        stacklevel=4,
+                    )
+                    break
+                seen_cookies.add(cookie)
+            try:
+                page_resp = self._request("get", next_link)
+            except Exception as exc:
+                warnings.warn(
+                    f"SQL pagination stopped after {len(results)} rows — "
+                    f"the next-page request failed: {exc}. "
+                    "Add a TOP clause to your query to limit results to a single page.",
+                    RuntimeWarning,
+                    stacklevel=5,
+                )
+                break
+            try:
+                page_body = page_resp.json()
+            except ValueError as exc:
+                warnings.warn(
+                    f"SQL pagination stopped after {len(results)} rows — "
+                    f"the next-page response was not valid JSON: {exc}. "
+                    "Add a TOP clause to your query to limit results to a single page.",
+                    RuntimeWarning,
+                    stacklevel=5,
+                )
+                break
+            if not isinstance(page_body, dict):
+                break
+            page_value = page_body.get("value")
+            if not isinstance(page_value, list) or not page_value:
+                break
+            results.extend(row for row in page_value if isinstance(row, dict))
+            raw_link = page_body.get("@odata.nextLink") or page_body.get("odata.nextLink")
+            next_link = raw_link if isinstance(raw_link, str) else None
+
+        return results
 
     @staticmethod
     def _extract_logical_table(sql: str) -> str:
diff --git a/tests/unit/data/test_sql_parse.py b/tests/unit/data/test_sql_parse.py