|
13 | 13 | import re |
14 | 14 | import json |
15 | 15 | import uuid |
| 16 | +import warnings |
16 | 17 | from datetime import datetime, timezone |
17 | 18 | import importlib.resources as ir |
18 | 19 | from contextlib import contextmanager |
19 | 20 | from contextvars import ContextVar |
20 | 21 |
|
21 | | -from urllib.parse import quote as _url_quote |
| 22 | +from urllib.parse import quote as _url_quote, parse_qs, urlparse |
22 | 23 |
|
23 | 24 | from ..core._http import _HttpClient |
24 | 25 | from ._upload import _FileUploadMixin |
|
54 | 55 | _DEFAULT_EXPECTED_STATUSES: tuple[int, ...] = (200, 201, 202, 204) |
55 | 56 |
|
56 | 57 |
|
| 58 | +def _extract_pagingcookie(next_link: str) -> Optional[str]: |
| 59 | + """Extract the raw pagingcookie value from a SQL ``@odata.nextLink`` URL. |
| 60 | +
|
| 61 | + The Dataverse SQL endpoint has a server-side bug where the pagingcookie |
| 62 | + (containing first/last record GUIDs) does not advance between pages even |
| 63 | + though ``pagenumber`` increments. Detecting a repeated cookie lets the |
| 64 | + pagination loop break instead of looping indefinitely. |
| 65 | +
|
| 66 | + Returns the pagingcookie string if present, or ``None`` if not found. |
| 67 | + """ |
| 68 | + try: |
| 69 | + qs = parse_qs(urlparse(next_link).query) |
| 70 | + skiptoken = qs.get("$skiptoken", [None])[0] |
| 71 | + if not skiptoken: |
| 72 | + return None |
| 73 | + # parse_qs already URL-decodes the value once, giving the outer XML with |
| 74 | + # pagingcookie still percent-encoded (e.g. pagingcookie="%3ccookie..."). |
| 75 | + # A second decode is intentionally omitted: decoding again would turn %22 |
| 76 | + # into " inside the cookie XML, breaking the regex and causing every page |
| 77 | + # to extract the same truncated prefix regardless of the actual GUIDs. |
| 78 | + m = re.search(r'pagingcookie="([^"]+)"', skiptoken) |
| 79 | + if m: |
| 80 | + return m.group(1) |
| 81 | + except Exception: |
| 82 | + pass |
| 83 | + return None |
| 84 | + |
| 85 | + |
57 | 86 | @dataclass |
58 | 87 | class _RequestContext: |
59 | 88 | """Structured request context used by ``_request`` to clarify payload and metadata.""" |
@@ -776,15 +805,86 @@ def _query_sql(self, sql: str) -> list[dict[str, Any]]: |
776 | 805 | body = r.json() |
777 | 806 | except ValueError: |
778 | 807 | return [] |
779 | | - if isinstance(body, dict): |
780 | | - value = body.get("value") |
781 | | - if isinstance(value, list): |
782 | | - # Ensure dict rows only |
783 | | - return [row for row in value if isinstance(row, dict)] |
784 | | - # Fallbacks: if body itself is a list |
| 808 | + |
| 809 | + # Collect first page |
| 810 | + results: list[dict[str, Any]] = [] |
785 | 811 | if isinstance(body, list): |
786 | 812 | return [row for row in body if isinstance(row, dict)] |
787 | | - return [] |
| 813 | + if not isinstance(body, dict): |
| 814 | + return results |
| 815 | + |
| 816 | + value = body.get("value") |
| 817 | + if isinstance(value, list): |
| 818 | + results = [row for row in value if isinstance(row, dict)] |
| 819 | + |
| 820 | + # Follow pagination links until exhausted |
| 821 | + raw_link = body.get("@odata.nextLink") or body.get("odata.nextLink") |
| 822 | + next_link: str | None = raw_link if isinstance(raw_link, str) else None |
| 823 | + visited: set[str] = set() |
| 824 | + seen_cookies: set[str] = set() |
| 825 | + while next_link: |
| 826 | + # Guard 1: exact URL cycle (same next_link returned twice) |
| 827 | + if next_link in visited: |
| 828 | + warnings.warn( |
| 829 | + f"SQL pagination stopped after {len(results)} rows — " |
| 830 | + "the Dataverse server returned the same nextLink URL twice, " |
| 831 | + "indicating an infinite pagination cycle. " |
| 832 | + "Returning the rows collected so far. " |
| 833 | + "To avoid pagination entirely, add a TOP clause to your query.", |
| 834 | + RuntimeWarning, |
| 835 | + stacklevel=4, |
| 836 | + ) |
| 837 | + break |
| 838 | + visited.add(next_link) |
| 839 | + # Guard 2: server-side bug where pagingcookie does not advance between |
| 840 | + # pages (pagenumber increments but cookie GUIDs stay the same), which |
| 841 | + # causes an infinite loop even though URLs differ. |
| 842 | + cookie = _extract_pagingcookie(next_link) |
| 843 | + if cookie is not None: |
| 844 | + if cookie in seen_cookies: |
| 845 | + warnings.warn( |
| 846 | + f"SQL pagination stopped after {len(results)} rows — " |
| 847 | + "the Dataverse server returned the same pagingcookie twice " |
| 848 | + "(pagenumber incremented but the paging position did not advance). " |
| 849 | + "This is a server-side bug. Returning the rows collected so far. " |
| 850 | + "To avoid pagination entirely, add a TOP clause to your query.", |
| 851 | + RuntimeWarning, |
| 852 | + stacklevel=4, |
| 853 | + ) |
| 854 | + break |
| 855 | + seen_cookies.add(cookie) |
| 856 | + try: |
| 857 | + page_resp = self._request("get", next_link) |
| 858 | + except Exception as exc: |
| 859 | + warnings.warn( |
| 860 | + f"SQL pagination stopped after {len(results)} rows — " |
| 861 | + f"the next-page request failed: {exc}. " |
| 862 | + "Add a TOP clause to your query to limit results to a single page.", |
| 863 | + RuntimeWarning, |
| 864 | + stacklevel=5, |
| 865 | + ) |
| 866 | + break |
| 867 | + try: |
| 868 | + page_body = page_resp.json() |
| 869 | + except ValueError as exc: |
| 870 | + warnings.warn( |
| 871 | + f"SQL pagination stopped after {len(results)} rows — " |
| 872 | + f"the next-page response was not valid JSON: {exc}. " |
| 873 | + "Add a TOP clause to your query to limit results to a single page.", |
| 874 | + RuntimeWarning, |
| 875 | + stacklevel=5, |
| 876 | + ) |
| 877 | + break |
| 878 | + if not isinstance(page_body, dict): |
| 879 | + break |
| 880 | + page_value = page_body.get("value") |
| 881 | + if not isinstance(page_value, list) or not page_value: |
| 882 | + break |
| 883 | + results.extend(row for row in page_value if isinstance(row, dict)) |
| 884 | + raw_link = page_body.get("@odata.nextLink") or page_body.get("odata.nextLink") |
| 885 | + next_link = raw_link if isinstance(raw_link, str) else None |
| 886 | + |
| 887 | + return results |
788 | 888 |
|
789 | 889 | @staticmethod |
790 | 890 | def _extract_logical_table(sql: str) -> str: |
|
0 commit comments