|
13 | 13 | import re |
14 | 14 | import json |
15 | 15 | import uuid |
| 16 | +import warnings |
16 | 17 | from datetime import datetime, timezone |
17 | 18 | import importlib.resources as ir |
18 | 19 | from contextlib import contextmanager |
19 | 20 | from contextvars import ContextVar |
20 | 21 |
|
21 | | -from urllib.parse import quote as _url_quote |
| 22 | +from urllib.parse import quote as _url_quote, parse_qs, urlparse |
22 | 23 |
|
23 | 24 | from ..core._http import _HttpClient |
24 | 25 | from ._upload import _FileUploadMixin |
|
55 | 56 | _MULTIPLE_BATCH_SIZE = 1000 |
56 | 57 |
|
57 | 58 |
|
def _extract_pagingcookie(next_link: str) -> Optional[str]:
    """Extract the raw pagingcookie value from a SQL ``@odata.nextLink`` URL.

    The Dataverse SQL endpoint has a server-side bug where the pagingcookie
    (containing first/last record GUIDs) does not advance between pages even
    though ``pagenumber`` increments. Detecting a repeated cookie lets the
    pagination loop break instead of looping indefinitely.

    This helper is best-effort: it never raises for an unusable link, it
    simply reports that no cookie could be found.

    :param next_link: The ``@odata.nextLink`` URL returned with a result page.
    :return: The pagingcookie string, still percent-encoded exactly as it
        appears inside the decoded ``$skiptoken``, or ``None`` when the link
        is empty, not a string, malformed, or carries no pagingcookie.
    """
    # Guard explicitly instead of relying on a catch-all handler to absorb
    # the errors that non-string input would trigger further down.
    if not isinstance(next_link, str) or not next_link:
        return None

    try:
        qs = parse_qs(urlparse(next_link).query)
    except ValueError:
        # urllib.parse raises ValueError for malformed URLs (for example an
        # unbalanced "[" in the host). Treat that as "no cookie" so the
        # caller's pagination loop keeps its remaining safeguards.
        return None

    skiptoken = qs.get("$skiptoken", [None])[0]
    if not skiptoken:
        return None

    # parse_qs already URL-decodes the value once, giving the outer XML with
    # pagingcookie still percent-encoded (e.g. pagingcookie="%3ccookie...").
    # A second decode is intentionally omitted: decoding again would turn %22
    # into " inside the cookie XML, breaking the regex and causing every page
    # to extract the same truncated prefix regardless of the actual GUIDs.
    m = re.search(r'pagingcookie="([^"]+)"', skiptoken)
    return m.group(1) if m else None
| 85 | + |
| 86 | + |
58 | 87 | @dataclass |
59 | 88 | class _RequestContext: |
60 | 89 | """Structured request context used by ``_request`` to clarify payload and metadata.""" |
@@ -804,15 +833,86 @@ def _query_sql(self, sql: str) -> list[dict[str, Any]]: |
804 | 833 | body = r.json() |
805 | 834 | except ValueError: |
806 | 835 | return [] |
807 | | - if isinstance(body, dict): |
808 | | - value = body.get("value") |
809 | | - if isinstance(value, list): |
810 | | - # Ensure dict rows only |
811 | | - return [row for row in value if isinstance(row, dict)] |
812 | | - # Fallbacks: if body itself is a list |
| 836 | + |
| 837 | + # Collect first page |
| 838 | + results: list[dict[str, Any]] = [] |
813 | 839 | if isinstance(body, list): |
814 | 840 | return [row for row in body if isinstance(row, dict)] |
815 | | - return [] |
| 841 | + if not isinstance(body, dict): |
| 842 | + return results |
| 843 | + |
| 844 | + value = body.get("value") |
| 845 | + if isinstance(value, list): |
| 846 | + results = [row for row in value if isinstance(row, dict)] |
| 847 | + |
| 848 | + # Follow pagination links until exhausted |
| 849 | + raw_link = body.get("@odata.nextLink") or body.get("odata.nextLink") |
| 850 | + next_link: str | None = raw_link if isinstance(raw_link, str) else None |
| 851 | + visited: set[str] = set() |
| 852 | + seen_cookies: set[str] = set() |
| 853 | + while next_link: |
| 854 | + # Guard 1: exact URL cycle (same next_link returned twice) |
| 855 | + if next_link in visited: |
| 856 | + warnings.warn( |
| 857 | + f"SQL pagination stopped after {len(results)} rows — " |
| 858 | + "the Dataverse server returned the same nextLink URL twice, " |
| 859 | + "indicating an infinite pagination cycle. " |
| 860 | + "Returning the rows collected so far. " |
| 861 | + "To avoid pagination entirely, add a TOP clause to your query.", |
| 862 | + RuntimeWarning, |
| 863 | + stacklevel=4, |
| 864 | + ) |
| 865 | + break |
| 866 | + visited.add(next_link) |
| 867 | + # Guard 2: server-side bug where pagingcookie does not advance between |
| 868 | + # pages (pagenumber increments but cookie GUIDs stay the same), which |
| 869 | + # causes an infinite loop even though URLs differ. |
| 870 | + cookie = _extract_pagingcookie(next_link) |
| 871 | + if cookie is not None: |
| 872 | + if cookie in seen_cookies: |
| 873 | + warnings.warn( |
| 874 | + f"SQL pagination stopped after {len(results)} rows — " |
| 875 | + "the Dataverse server returned the same pagingcookie twice " |
| 876 | + "(pagenumber incremented but the paging position did not advance). " |
| 877 | + "This is a server-side bug. Returning the rows collected so far. " |
| 878 | + "To avoid pagination entirely, add a TOP clause to your query.", |
| 879 | + RuntimeWarning, |
| 880 | + stacklevel=4, |
| 881 | + ) |
| 882 | + break |
| 883 | + seen_cookies.add(cookie) |
| 884 | + try: |
| 885 | + page_resp = self._request("get", next_link) |
| 886 | + except Exception as exc: |
| 887 | + warnings.warn( |
| 888 | + f"SQL pagination stopped after {len(results)} rows — " |
| 889 | + f"the next-page request failed: {exc}. " |
| 890 | + "Add a TOP clause to your query to limit results to a single page.", |
| 891 | + RuntimeWarning, |
| 892 | + stacklevel=5, |
| 893 | + ) |
| 894 | + break |
| 895 | + try: |
| 896 | + page_body = page_resp.json() |
| 897 | + except ValueError as exc: |
| 898 | + warnings.warn( |
| 899 | + f"SQL pagination stopped after {len(results)} rows — " |
| 900 | + f"the next-page response was not valid JSON: {exc}. " |
| 901 | + "Add a TOP clause to your query to limit results to a single page.", |
| 902 | + RuntimeWarning, |
| 903 | + stacklevel=5, |
| 904 | + ) |
| 905 | + break |
| 906 | + if not isinstance(page_body, dict): |
| 907 | + break |
| 908 | + page_value = page_body.get("value") |
| 909 | + if not isinstance(page_value, list) or not page_value: |
| 910 | + break |
| 911 | + results.extend(row for row in page_value if isinstance(row, dict)) |
| 912 | + raw_link = page_body.get("@odata.nextLink") or page_body.get("odata.nextLink") |
| 913 | + next_link = raw_link if isinstance(raw_link, str) else None |
| 914 | + |
| 915 | + return results |
816 | 916 |
|
817 | 917 | @staticmethod |
818 | 918 | def _extract_logical_table(sql: str) -> str: |
|
0 commit comments