From 0ad568c851e8f9ed0e378f0a1184f270665bc3a9 Mon Sep 17 00:00:00 2001
From: thodson-usgs <thodson@usgs.gov>
Date: Sun, 3 May 2026 15:23:27 -0500
Subject: [PATCH 1/6] =?UTF-8?q?waterdata:=20tighten=20stats=20and=20OGC=20?=
 =?UTF-8?q?pagination=20=E2=80=94=20geometry,=20KeyError,=20non-200?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three correctness fixes to the two pagination loops in
dataretrieval.waterdata.utils.

* `get_stats_data` honored `GEOPANDAS` for the first page but
  hard-coded `geopd=False` on every continuation page. With geopandas
  installed, a multi-page stats response started as a `GeoDataFrame`
  and pages 2..N came back as plain `DataFrame`s; `pd.concat` then
  silently downgraded the result and the caller lost geometry / CRS.
  Use `geopd=GEOPANDAS` on every page.

* `get_stats_data` indexed `body["next"]` directly, raising `KeyError`
  on responses without that key (some terminal responses simply omit
  it). Switch to `body.get("next")`, which produces `None` and exits
  the loop cleanly.

* Both `get_stats_data`'s in-loop request and `_walk_pages`'s in-loop
  request returned the response without checking `status_code`. A 4xx
  or 5xx page whose body happened to JSON-decode could be appended as
  data, then pagination quietly stopped — the caller got a partial
  result with no warning. Add an explicit `if status_code != 200`
  raise inside each loop so the existing log-and-truncate handler
  fires deliberately rather than incidentally.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 dataretrieval/waterdata/utils.py |  10 ++-
 tests/waterdata_utils_test.py    | 120 +++++++++++++++++++++++++++++++
 2 files changed, 127 insertions(+), 3 deletions(-)

diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py
index 7070da50..d1cb1254 100644
--- a/dataretrieval/waterdata/utils.py
+++ b/dataretrieval/waterdata/utils.py
@@ -606,6 +606,8 @@ def _walk_pages(
                     headers=headers,
                     data=content if method == "POST" else None,
                 )
+                if resp.status_code != 200:
+                    raise RuntimeError(_error_body(resp))
                 dfs.append(_get_resp_data(resp, geopd=geopd))
                 curr_url = _next_req_url(resp)
             except Exception:  # noqa: BLE001
@@ -1058,7 +1060,7 @@ def get_stats_data(
         all_dfs = [_handle_stats_nesting(body, geopd=GEOPANDAS)]
 
         # Look for a next code in the response body
-        next_token = body["next"]
+        next_token = body.get("next")
 
         while next_token:
             args["next_token"] = next_token
@@ -1070,9 +1072,11 @@ def get_stats_data(
                     params=args,
                     headers=headers,
                 )
+                if resp.status_code != 200:
+                    raise RuntimeError(_error_body(resp))
                 body = resp.json()
-                all_dfs.append(_handle_stats_nesting(body, geopd=False))
-                next_token = body["next"]
+                all_dfs.append(_handle_stats_nesting(body, geopd=GEOPANDAS))
+                next_token = body.get("next")
             except Exception:  # noqa: BLE001
                 error_text = _error_body(resp)
                 logger.error("Request incomplete. %s", error_text)
diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py
index 36150be8..4b5da19f 100644
--- a/tests/waterdata_utils_test.py
+++ b/tests/waterdata_utils_test.py
@@ -1,10 +1,12 @@
 from unittest import mock
 
+import pandas as pd
 import requests
 
 from dataretrieval.waterdata.utils import (
     _get_args,
     _walk_pages,
+    get_stats_data,
 )
 
 
@@ -80,3 +82,121 @@ def test_walk_pages_multiple_mocked():
     assert mock_client.send.called
     assert mock_client.request.called
     assert mock_client.request.call_args[0][1] == "https://example.com/page2"
+
+
+def test_walk_pages_raises_on_non_200_in_loop():
+    """`_walk_pages` must surface a non-200 mid-loop, not silently truncate.
+
+    Regression: previously any non-200 page was appended (with whatever
+    body it had) and pagination quietly stopped because `_get_resp_data`
+    or `_next_req_url` raised inside the bare except. The user got a
+    partial result with no warning.
+    """
+    resp1 = mock.MagicMock()
+    resp1.json.return_value = {
+        "numberReturned": 1,
+        "features": [{"id": "1", "properties": {"val": "a"}}],
+        "links": [],
+    }
+    resp1.headers = {}
+    resp1.links = {"next": {"url": "https://example.com/page2"}}
+    resp1.status_code = 200
+
+    resp2 = mock.MagicMock()
+    resp2.status_code = 500
+    resp2.text = "<html>error</html>"
+
+    mock_client = mock.MagicMock(spec=requests.Session)
+    mock_client.send.return_value = resp1
+    mock_client.request.return_value = resp2
+
+    mock_req = mock.MagicMock(spec=requests.PreparedRequest)
+    mock_req.method = "GET"
+    mock_req.headers = {}
+    mock_req.url = "https://example.com/page1"
+
+    df, _ = _walk_pages(geopd=False, req=mock_req, client=mock_client)
+
+    # Page 1 still returned; page 2 logged-and-stopped after the explicit
+    # status check raised. The contract here is "log + truncate", same
+    # as the pre-fix bare-except behavior, but now the raise inside the
+    # loop is intentional rather than incidental.
+    assert len(df) == 1
+
+
+# --- get_stats_data pagination ----------------------------------------------
+
+
+def _stats_feature():
+    """Build a single feature shaped to satisfy ``_handle_stats_nesting``."""
+    return {
+        "type": "Feature",
+        "id": "USGS-1",
+        "geometry": None,
+        "properties": {
+            "monitoring_location_id": "USGS-1",
+            "data": [
+                {
+                    "parameter_code": "00060",
+                    "unit_of_measure": "ft^3/s",
+                    "parent_time_series_id": "abc",
+                    "values": [{"value": 1.0}],
+                }
+            ],
+        },
+    }
+
+
+def _stats_body(features, next_token=None):
+    body = {
+        "type": "FeatureCollection",
+        "features": features,
+        "numberReturned": len(features),
+    }
+    if next_token is not None:
+        body["next"] = next_token
+    return body
+
+
+def test_get_stats_data_handles_missing_next_key():
+    """A response without a ``next`` key must not raise KeyError.
+
+    Regression: ``body["next"]`` raised when the key was absent. Now
+    uses ``body.get("next")`` so a missing key means "no more pages".
+    """
+    resp = mock.MagicMock()
+    resp.status_code = 200
+    resp.json.return_value = _stats_body([_stats_feature()])
+    # No "next" key at all.
+
+    client = mock.MagicMock(spec=requests.Session)
+    client.send.return_value = resp
+
+    df, _ = get_stats_data(
+        args={}, service="observationNormals", expand_percentiles=False, client=client
+    )
+    assert isinstance(df, pd.DataFrame)
+    assert len(df) >= 1
+
+
+def test_get_stats_data_truncates_on_non_200_continuation():
+    """A 4xx/5xx on a continuation page must log and stop, not crash."""
+    resp1 = mock.MagicMock()
+    resp1.status_code = 200
+    resp1.json.return_value = _stats_body([_stats_feature()], next_token="abc")
+
+    resp2 = mock.MagicMock()
+    resp2.status_code = 503
+    resp2.text = "Service Unavailable"
+    resp2.url = "https://example.com/page2"
+
+    client = mock.MagicMock(spec=requests.Session)
+    client.send.return_value = resp1
+    client.request.return_value = resp2
+
+    df, _ = get_stats_data(
+        args={}, service="observationNormals", expand_percentiles=False, client=client
+    )
+    # Page 1 still surfaces; page 2 was caught by the in-loop status check.
+    assert isinstance(df, pd.DataFrame)
+    assert len(df) >= 1

From 0fa758ea2f967a03183c95126f43cc435f84bd2b Mon Sep 17 00:00:00 2001
From: thodson-usgs <thodson@usgs.gov>
Date: Sun, 3 May 2026 15:29:53 -0500
Subject: [PATCH 2/6] Unify status-check sites behind a small _raise_if_not_ok
 helper

Both pagination loops now had four call sites repeating
``if resp.status_code != 200: raise RuntimeError(_error_body(resp))``.
Move that into a one-line helper alongside ``_error_body`` and call
it from every site.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 dataretrieval/waterdata/utils.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py
index d1cb1254..443d7adf 100644
--- a/dataretrieval/waterdata/utils.py
+++ b/dataretrieval/waterdata/utils.py
@@ -339,6 +339,12 @@ def _error_body(resp: requests.Response):
     )
 
 
+def _raise_if_not_ok(resp: requests.Response) -> None:
+    """Raise ``RuntimeError(_error_body(resp))`` for any non-200 response."""
+    if resp.status_code != 200:
+        raise RuntimeError(_error_body(resp))
+
+
 def _construct_api_requests(
     service: str,
     properties: list[str] | None = None,
@@ -583,8 +589,7 @@ def _walk_pages(
     client = client or requests.Session()
     try:
         resp = client.send(req)
-        if resp.status_code != 200:
-            raise RuntimeError(_error_body(resp))
+        _raise_if_not_ok(resp)
 
         # Store the initial response for metadata
         initial_response = resp
@@ -606,8 +611,7 @@ def _walk_pages(
                     headers=headers,
                     data=content if method == "POST" else None,
                 )
-                if resp.status_code != 200:
-                    raise RuntimeError(_error_body(resp))
+                _raise_if_not_ok(resp)
                 dfs.append(_get_resp_data(resp, geopd=geopd))
                 curr_url = _next_req_url(resp)
             except Exception:  # noqa: BLE001
@@ -1045,8 +1049,7 @@ def get_stats_data(
 
     try:
         resp = client.send(req)
-        if resp.status_code != 200:
-            raise RuntimeError(_error_body(resp))
+        _raise_if_not_ok(resp)
 
         # Store the initial response for metadata
         initial_response = resp
@@ -1072,8 +1075,7 @@ def get_stats_data(
                     params=args,
                     headers=headers,
                 )
-                if resp.status_code != 200:
-                    raise RuntimeError(_error_body(resp))
+                _raise_if_not_ok(resp)
                 body = resp.json()
                 all_dfs.append(_handle_stats_nesting(body, geopd=GEOPANDAS))
                 next_token = body.get("next")

From af007051498f2de0fb556f2eee475c97602371ba Mon Sep 17 00:00:00 2001
From: thodson-usgs <thodson@usgs.gov>
Date: Mon, 4 May 2026 10:12:38 -0500
Subject: [PATCH 3/6] Harden _error_body and add geopandas continuation
 regression test

Per copilot review on PR #255:

- _error_body: catch JSONDecodeError when a 4xx/5xx returns plain text
  or HTML. Previously, _raise_if_not_ok -> _error_body -> resp.json()
  raised JSONDecodeError on non-JSON bodies, defeating the in-loop
  status check.
- Tests:
  - Rename test_walk_pages_raises_on_non_200_in_loop to
    test_walk_pages_truncates_on_non_200_continuation; the assertion
    verifies log-and-truncate, not raise.
  - New test_get_stats_data_preserves_geometry_across_pages exercises
    the GEOPANDAS=True continuation path so a regression to geopd=False
    on page 2..N is caught.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 dataretrieval/waterdata/utils.py |  9 ++++-
 tests/waterdata_utils_test.py    | 58 +++++++++++++++++++++++++++++---
 2 files changed, 62 insertions(+), 5 deletions(-)

diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py
index 443d7adf..3122220e 100644
--- a/dataretrieval/waterdata/utils.py
+++ b/dataretrieval/waterdata/utils.py
@@ -332,7 +332,14 @@ def _error_body(resp: requests.Response):
             "403: Query request denied. Possible reasons include "
             "query exceeding server limits."
         )
-    j_txt = resp.json()
+    try:
+        j_txt = resp.json()
+    except (json.JSONDecodeError, ValueError):
+        # Non-JSON 4xx/5xx bodies (HTML error pages, plain text) — fall back
+        # to the raw text so callers still get a useful message instead of
+        # the helper crashing with JSONDecodeError.
+        snippet = (resp.text or "").strip()[:500]
+        return f"{status}: {resp.reason or 'Error'}. {snippet}"
     return (
         f"{status}: {j_txt.get('code', 'Unknown type')}. "
         f"{j_txt.get('description', 'No description provided')}."
diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py
index 4b5da19f..25266347 100644
--- a/tests/waterdata_utils_test.py
+++ b/tests/waterdata_utils_test.py
@@ -1,14 +1,19 @@
 from unittest import mock
 
 import pandas as pd
+import pytest
 import requests
 
 from dataretrieval.waterdata.utils import (
+    GEOPANDAS,
     _get_args,
     _walk_pages,
     get_stats_data,
 )
 
+if GEOPANDAS:
+    import geopandas as gpd
+
 
 def test_get_args_basic():
     local_vars = {
@@ -84,13 +89,15 @@ def test_walk_pages_multiple_mocked():
     assert mock_client.request.call_args[0][1] == "https://example.com/page2"
 
 
-def test_walk_pages_raises_on_non_200_in_loop():
-    """`_walk_pages` must surface a non-200 mid-loop, not silently truncate.
+def test_walk_pages_truncates_on_non_200_continuation():
+    """`_walk_pages` must truncate (not silently extend) on a non-200 page.
 
     Regression: previously any non-200 page was appended (with whatever
     body it had) and pagination quietly stopped because `_get_resp_data`
-    or `_next_req_url` raised inside the bare except. The user got a
-    partial result with no warning.
+    or `_next_req_url` raised inside the bare except. Now the explicit
+    status check raises inside the loop, and the existing log-and-truncate
+    handler converts that to a clean partial result. Page 1 is still
+    returned; page 2 is dropped.
     """
     resp1 = mock.MagicMock()
     resp1.json.return_value = {
@@ -200,3 +207,46 @@ def test_get_stats_data_truncates_on_non_200_continuation():
     # Page 1 still surfaces; page 2 was caught by the in-loop status check.
     assert isinstance(df, pd.DataFrame)
     assert len(df) >= 1
+
+
+def _stats_feature_with_geometry(loc_id, lon, lat):
+    """Stats feature with a real point geometry."""
+    feat = _stats_feature()
+    feat["id"] = loc_id
+    feat["geometry"] = {"type": "Point", "coordinates": [lon, lat]}
+    feat["properties"]["monitoring_location_id"] = loc_id
+    return feat
+
+
+@pytest.mark.skipif(not GEOPANDAS, reason="geopandas not installed")
+def test_get_stats_data_preserves_geometry_across_pages():
+    """Pages 2..N must use ``geopd=GEOPANDAS`` so a multi-page response
+    stays a GeoDataFrame and doesn't lose geometry/CRS at the page-1 boundary.
+
+    Regression: previously pages 2..N hard-coded ``geopd=False``, producing
+    plain DataFrames. ``pd.concat`` then silently downgraded the result to
+    a plain DataFrame.
+    """
+    resp1 = mock.MagicMock()
+    resp1.status_code = 200
+    resp1.json.return_value = _stats_body(
+        [_stats_feature_with_geometry("USGS-1", -89.0, 43.0)],
+        next_token="abc",
+    )
+
+    resp2 = mock.MagicMock()
+    resp2.status_code = 200
+    resp2.json.return_value = _stats_body(
+        [_stats_feature_with_geometry("USGS-2", -90.0, 44.0)]
+    )
+
+    client = mock.MagicMock(spec=requests.Session)
+    client.send.return_value = resp1
+    client.request.return_value = resp2
+
+    df, _ = get_stats_data(
+        args={}, service="observationNormals", expand_percentiles=False, client=client
+    )
+    assert isinstance(df, gpd.GeoDataFrame)
+    assert len(df) == 2
+    assert df.geometry.notna().all()

From 1bfea67e11a7426f7ee1328c8171889d378609b3 Mon Sep 17 00:00:00 2001
From: thodson-usgs <thodson@usgs.gov>
Date: Tue, 5 May 2026 09:39:26 -0500
Subject: [PATCH 4/6] Address Copilot review on PR #255
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Update `_error_body` docstring to describe the new JSON / text-snippet
  fallback behavior (was: "returns the raw response text" — out of date
  since the helper now formats `code` + `description` from the JSON body
  and only falls back to a text snippet when JSON parsing fails).
- Add `test_error_body_falls_back_to_text_for_non_json_responses` to
  exercise the JSONDecodeError fallback branch (the existing pagination
  tests use unconstrained `resp.json` mocks, so the fallback was never
  reached).
- Tighten `test_get_stats_data_handles_missing_next_key`: pin to
  `len(df) == 1` and assert `client.request` was never called, so a
  regression that kept fetching after a missing next-token would be
  caught.
- Tighten `test_get_stats_data_truncates_on_non_200_continuation`: pin
  to exactly 1 row and assert `client.request.call_count == 1`, so a
  regression that silently appended page-2 data or duplicated page-1
  would still fail.

Live-API verification of the dropped-geometry bug: with
`state=US:42, parameter=00060, page_size=1`, page-2 contributes 1890
rows; the buggy main concat leaves 0/1890 of those rows with geometry
(all NaN) while the fixed concat preserves 1890/1890.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 dataretrieval/waterdata/utils.py | 11 +++++---
 tests/waterdata_utils_test.py    | 44 ++++++++++++++++++++++++++++----
 2 files changed, 46 insertions(+), 9 deletions(-)

diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py
index f4c3f949..633cdf54 100644
--- a/dataretrieval/waterdata/utils.py
+++ b/dataretrieval/waterdata/utils.py
@@ -316,10 +316,13 @@ def _error_body(resp: requests.Response):
     Returns
     -------
     str
-        The extracted error message. For status code 429, returns the 'message'
-        field from the JSON error object. For status code 403, returns a
-        predefined message indicating possible reasons for denial. For other
-        status codes, returns the raw response text.
+        The extracted error message. Status 429 returns a fixed
+        'too many requests' message; status 403 returns a fixed
+        'query exceeding server limits' message. For all other statuses,
+        the response body is parsed as JSON and the ``code`` /
+        ``description`` fields are formatted into a one-line message; if
+        the body is not JSON (HTML error page, plain text), the helper
+        falls back to ``"<status>: <reason>. <first 500 chars of body>"``.
     """
     status = resp.status_code
     if status == 429:
diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py
index e5d54a12..9acb8bf4 100644
--- a/tests/waterdata_utils_test.py
+++ b/tests/waterdata_utils_test.py
@@ -6,6 +6,7 @@
 
 from dataretrieval.waterdata.utils import (
     GEOPANDAS,
+    _error_body,
     _get_args,
     _handle_stats_nesting,
     _walk_pages,
@@ -132,6 +133,29 @@ def test_walk_pages_truncates_on_non_200_continuation():
     assert len(df) == 1
 
 
+# --- _error_body -------------------------------------------------------------
+
+
+def test_error_body_falls_back_to_text_for_non_json_responses():
+    """`_error_body` must not crash on plain-text/HTML 4xx-5xx bodies.
+
+    Regression: previously ``resp.json()`` was unconditionally called,
+    so an HTML error page raised ``JSONDecodeError`` from inside the
+    helper. Now non-JSON bodies fall back to the raw text snippet.
+    """
+    import json as json_mod
+
+    resp = mock.MagicMock()
+    resp.status_code = 502
+    resp.reason = "Bad Gateway"
+    resp.text = "<html><body>upstream timeout</body></html>"
+    resp.json.side_effect = json_mod.JSONDecodeError("expecting value", "", 0)
+
+    msg = _error_body(resp)
+    assert msg.startswith("502: Bad Gateway")
+    assert "upstream timeout" in msg
+
+
 # --- get_stats_data pagination ----------------------------------------------
 
 
@@ -167,7 +191,8 @@ def _stats_body(features, next_token=None):
 
 
 def test_get_stats_data_handles_missing_next_key():
-    """A response without a ``next`` key must not raise KeyError.
+    """A response without a ``next`` key must not raise KeyError, and
+    pagination must stop without firing a continuation request.
 
     Regression: ``body["next"]`` raised when the key was absent. Now
     uses ``body.get("next")`` so a missing key means "no more pages".
@@ -184,11 +209,18 @@ def test_get_stats_data_handles_missing_next_key():
         args={}, service="observationNormals", expand_percentiles=False, client=client
     )
     assert isinstance(df, pd.DataFrame)
-    assert len(df) >= 1
+    # The single feature flattens to exactly 1 stats row; no continuation
+    # request should have been made.
+    assert len(df) == 1
+    client.request.assert_not_called()
 
 
 def test_get_stats_data_truncates_on_non_200_continuation():
-    """A 4xx/5xx on a continuation page must log and stop, not crash."""
+    """A 4xx/5xx on a continuation page must log and stop, not crash.
+
+    Pin the result to exactly the page-1 row so a regression that silently
+    appended page-2 data (or duplicated page-1) would still fail.
+    """
     resp1 = mock.MagicMock()
     resp1.status_code = 200
     resp1.json.return_value = _stats_body([_stats_feature()], next_token="abc")
@@ -205,9 +237,11 @@ def test_get_stats_data_truncates_on_non_200_continuation():
     df, _ = get_stats_data(
         args={}, service="observationNormals", expand_percentiles=False, client=client
     )
-    # Page 1 still surfaces; page 2 was caught by the in-loop status check.
+    # Page 1's single feature flattens to exactly 1 row; page 2 must not
+    # contribute anything and the loop must not have looped past resp2.
     assert isinstance(df, pd.DataFrame)
-    assert len(df) >= 1
+    assert len(df) == 1
+    assert client.request.call_count == 1
 
 
 def _stats_feature_with_geometry(loc_id, lon, lat):

From 22c60de952d960381aba77e605afdd9b2b2cf232 Mon Sep 17 00:00:00 2001
From: thodson-usgs <thodson@usgs.gov>
Date: Tue, 5 May 2026 09:48:34 -0500
Subject: [PATCH 5/6] Drop defensive fixes; keep only the real geometry bug

Live-API testing (state=US:42, parameter=00060, page_size=1 against
observationNormals) confirmed only the geometry-on-continuation bug:
the buggy path leaves 0/1890 page-2 rows with geometry; the fix
preserves 1890/1890.

The other two candidate fixes were defensive-only and could not be
reproduced against the current live API, so drop them and the
associated tests:
- `body.get("next")` -- the live endpoint always includes the `next`
  key (value `null` on terminal pages), so the original `body["next"]`
  never KeyErrors.
- Explicit non-200 raise inside the continuation loops (plus the
  `_error_body` JSON-decode fallback and the `_raise_if_not_ok` helper
  extraction) -- real failure modes already raise inside the existing
  `try/except` at `_handle_stats_nesting` or `body["next"]`. The
  silent-append window required a non-200 response whose body both
  JSON-decodes AND has a `features` key, which the API doesn't produce.

Net diff is now one line of production code (`geopd=False` ->
`geopd=GEOPANDAS`) plus one regression test.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 dataretrieval/waterdata/utils.py |  38 +++-------
 tests/waterdata_utils_test.py    | 121 -------------------------------
 2 files changed, 11 insertions(+), 148 deletions(-)

diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py
index 633cdf54..35a29b33 100644
--- a/dataretrieval/waterdata/utils.py
+++ b/dataretrieval/waterdata/utils.py
@@ -316,13 +316,10 @@ def _error_body(resp: requests.Response):
     Returns
     -------
     str
-        The extracted error message. Status 429 returns a fixed
-        'too many requests' message; status 403 returns a fixed
-        'query exceeding server limits' message. For all other statuses,
-        the response body is parsed as JSON and the ``code`` /
-        ``description`` fields are formatted into a one-line message; if
-        the body is not JSON (HTML error page, plain text), the helper
-        falls back to ``"<status>: <reason>. <first 500 chars of body>"``.
+        The extracted error message. For status code 429, returns the 'message'
+        field from the JSON error object. For status code 403, returns a
+        predefined message indicating possible reasons for denial. For other
+        status codes, returns the raw response text.
     """
     status = resp.status_code
     if status == 429:
@@ -335,26 +332,13 @@ def _error_body(resp: requests.Response):
             "403: Query request denied. Possible reasons include "
             "query exceeding server limits."
         )
-    try:
-        j_txt = resp.json()
-    except (json.JSONDecodeError, ValueError):
-        # Non-JSON 4xx/5xx bodies (HTML error pages, plain text) — fall back
-        # to the raw text so callers still get a useful message instead of
-        # the helper crashing with JSONDecodeError.
-        snippet = (resp.text or "").strip()[:500]
-        return f"{status}: {resp.reason or 'Error'}. {snippet}"
+    j_txt = resp.json()
     return (
         f"{status}: {j_txt.get('code', 'Unknown type')}. "
         f"{j_txt.get('description', 'No description provided')}."
     )
 
 
-def _raise_if_not_ok(resp: requests.Response) -> None:
-    """Raise ``RuntimeError(_error_body(resp))`` for any non-200 response."""
-    if resp.status_code != 200:
-        raise RuntimeError(_error_body(resp))
-
-
 def _construct_api_requests(
     service: str,
     properties: list[str] | None = None,
@@ -599,7 +583,8 @@ def _walk_pages(
     client = client or requests.Session()
     try:
         resp = client.send(req)
-        _raise_if_not_ok(resp)
+        if resp.status_code != 200:
+            raise RuntimeError(_error_body(resp))
 
         # Store the initial response for metadata
         initial_response = resp
@@ -621,7 +606,6 @@ def _walk_pages(
                     headers=headers,
                     data=content if method == "POST" else None,
                 )
-                _raise_if_not_ok(resp)
                 dfs.append(_get_resp_data(resp, geopd=geopd))
                 curr_url = _next_req_url(resp)
             except Exception:  # noqa: BLE001
@@ -1061,7 +1045,8 @@ def get_stats_data(
 
     try:
         resp = client.send(req)
-        _raise_if_not_ok(resp)
+        if resp.status_code != 200:
+            raise RuntimeError(_error_body(resp))
 
         # Store the initial response for metadata
         initial_response = resp
@@ -1075,7 +1060,7 @@ def get_stats_data(
         all_dfs = [_handle_stats_nesting(body, geopd=GEOPANDAS)]
 
         # Look for a next code in the response body
-        next_token = body.get("next")
+        next_token = body["next"]
 
         while next_token:
             args["next_token"] = next_token
@@ -1087,10 +1072,9 @@ def get_stats_data(
                     params=args,
                     headers=headers,
                 )
-                _raise_if_not_ok(resp)
                 body = resp.json()
                 all_dfs.append(_handle_stats_nesting(body, geopd=GEOPANDAS))
-                next_token = body.get("next")
+                next_token = body["next"]
             except Exception:  # noqa: BLE001
                 error_text = _error_body(resp)
                 logger.error("Request incomplete. %s", error_text)
diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py
index 9acb8bf4..e06b20ab 100644
--- a/tests/waterdata_utils_test.py
+++ b/tests/waterdata_utils_test.py
@@ -1,12 +1,10 @@
 from unittest import mock
 
-import pandas as pd
 import pytest
 import requests
 
 from dataretrieval.waterdata.utils import (
     GEOPANDAS,
-    _error_body,
     _get_args,
     _handle_stats_nesting,
     _walk_pages,
@@ -91,71 +89,6 @@ def test_walk_pages_multiple_mocked():
     assert mock_client.request.call_args[0][1] == "https://example.com/page2"
 
 
-def test_walk_pages_truncates_on_non_200_continuation():
-    """`_walk_pages` must truncate (not silently extend) on a non-200 page.
-
-    Regression: previously any non-200 page was appended (with whatever
-    body it had) and pagination quietly stopped because `_get_resp_data`
-    or `_next_req_url` raised inside the bare except. Now the explicit
-    status check raises inside the loop, and the existing log-and-truncate
-    handler converts that to a clean partial result. Page 1 is still
-    returned; page 2 is dropped.
-    """
-    resp1 = mock.MagicMock()
-    resp1.json.return_value = {
-        "numberReturned": 1,
-        "features": [{"id": "1", "properties": {"val": "a"}}],
-        "links": [],
-    }
-    resp1.headers = {}
-    resp1.links = {"next": {"url": "https://example.com/page2"}}
-    resp1.status_code = 200
-
-    resp2 = mock.MagicMock()
-    resp2.status_code = 500
-    resp2.text = "<html>error</html>"
-
-    mock_client = mock.MagicMock(spec=requests.Session)
-    mock_client.send.return_value = resp1
-    mock_client.request.return_value = resp2
-
-    mock_req = mock.MagicMock(spec=requests.PreparedRequest)
-    mock_req.method = "GET"
-    mock_req.headers = {}
-    mock_req.url = "https://example.com/page1"
-
-    df, _ = _walk_pages(geopd=False, req=mock_req, client=mock_client)
-
-    # Page 1 still returned; page 2 logged-and-stopped after the explicit
-    # status check raised. The contract here is "log + truncate", same
-    # as the pre-fix bare-except behavior, but now the raise inside the
-    # loop is intentional rather than incidental.
-    assert len(df) == 1
-
-
-# --- _error_body -------------------------------------------------------------
-
-
-def test_error_body_falls_back_to_text_for_non_json_responses():
-    """`_error_body` must not crash on plain-text/HTML 4xx-5xx bodies.
-
-    Regression: previously ``resp.json()`` was unconditionally called,
-    so an HTML error page raised ``JSONDecodeError`` from inside the
-    helper. Now non-JSON bodies fall back to the raw text snippet.
-    """
-    import json as json_mod
-
-    resp = mock.MagicMock()
-    resp.status_code = 502
-    resp.reason = "Bad Gateway"
-    resp.text = "<html><body>upstream timeout</body></html>"
-    resp.json.side_effect = json_mod.JSONDecodeError("expecting value", "", 0)
-
-    msg = _error_body(resp)
-    assert msg.startswith("502: Bad Gateway")
-    assert "upstream timeout" in msg
-
-
 # --- get_stats_data pagination ----------------------------------------------
 
 
@@ -190,60 +123,6 @@ def _stats_body(features, next_token=None):
     return body
 
 
-def test_get_stats_data_handles_missing_next_key():
-    """A response without a ``next`` key must not raise KeyError, and
-    pagination must stop without firing a continuation request.
-
-    Regression: ``body["next"]`` raised when the key was absent. Now
-    uses ``body.get("next")`` so a missing key means "no more pages".
-    """
-    resp = mock.MagicMock()
-    resp.status_code = 200
-    resp.json.return_value = _stats_body([_stats_feature()])
-    # No "next" key at all.
-
-    client = mock.MagicMock(spec=requests.Session)
-    client.send.return_value = resp
-
-    df, _ = get_stats_data(
-        args={}, service="observationNormals", expand_percentiles=False, client=client
-    )
-    assert isinstance(df, pd.DataFrame)
-    # The single feature flattens to exactly 1 stats row; no continuation
-    # request should have been made.
-    assert len(df) == 1
-    client.request.assert_not_called()
-
-
-def test_get_stats_data_truncates_on_non_200_continuation():
-    """A 4xx/5xx on a continuation page must log and stop, not crash.
-
-    Pin the result to exactly the page-1 row so a regression that silently
-    appended page-2 data (or duplicated page-1) would still fail.
-    """
-    resp1 = mock.MagicMock()
-    resp1.status_code = 200
-    resp1.json.return_value = _stats_body([_stats_feature()], next_token="abc")
-
-    resp2 = mock.MagicMock()
-    resp2.status_code = 503
-    resp2.text = "Service Unavailable"
-    resp2.url = "https://example.com/page2"
-
-    client = mock.MagicMock(spec=requests.Session)
-    client.send.return_value = resp1
-    client.request.return_value = resp2
-
-    df, _ = get_stats_data(
-        args={}, service="observationNormals", expand_percentiles=False, client=client
-    )
-    # Page 1's single feature flattens to exactly 1 row; page 2 must not
-    # contribute anything and the loop must not have looped past resp2.
-    assert isinstance(df, pd.DataFrame)
-    assert len(df) == 1
-    assert client.request.call_count == 1
-
-
 def _stats_feature_with_geometry(loc_id, lon, lat):
     """Stats feature with a real point geometry."""
     feat = _stats_feature()

From b7285e6c476167699131c445fd76d06770295737 Mon Sep 17 00:00:00 2001
From: thodson-usgs <thodson@usgs.gov>
Date: Tue, 5 May 2026 09:55:26 -0500
Subject: [PATCH 6/6] Drop trivial regression test from stats-pagination PR

The fix is a one-line `geopd=False` -> `geopd=GEOPANDAS` change that's
self-evident from the diff and verified against the live API in the
PR description. Drop `test_get_stats_data_preserves_geometry_across_pages`
along with its helpers (`_stats_feature`, `_stats_body`,
`_stats_feature_with_geometry`) and the now-unused `GEOPANDAS`,
`get_stats_data`, `pytest`, and `gpd` imports.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 tests/waterdata_utils_test.py | 83 -----------------------------------
 1 file changed, 83 deletions(-)

diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py
index 9ff04d3b..d8d654b4 100644
--- a/tests/waterdata_utils_test.py
+++ b/tests/waterdata_utils_test.py
@@ -1,20 +1,14 @@
 from unittest import mock
 
-import pytest
 import requests
 
 from dataretrieval.waterdata.utils import (
-    GEOPANDAS,
     _format_api_dates,
     _get_args,
     _handle_stats_nesting,
     _walk_pages,
-    get_stats_data,
 )
 
-if GEOPANDAS:
-    import geopandas as gpd
-
 
 def test_get_args_basic():
     local_vars = {
@@ -90,83 +84,6 @@ def test_walk_pages_multiple_mocked():
     assert mock_client.request.call_args[0][1] == "https://example.com/page2"
 
 
-# --- get_stats_data pagination ----------------------------------------------
-
-
-def _stats_feature():
-    """Build a single feature shaped to satisfy ``_handle_stats_nesting``."""
-    return {
-        "type": "Feature",
-        "id": "USGS-1",
-        "geometry": None,
-        "properties": {
-            "monitoring_location_id": "USGS-1",
-            "data": [
-                {
-                    "parameter_code": "00060",
-                    "unit_of_measure": "ft^3/s",
-                    "parent_time_series_id": "abc",
-                    "values": [{"value": 1.0}],
-                }
-            ],
-        },
-    }
-
-
-def _stats_body(features, next_token=None):
-    body = {
-        "type": "FeatureCollection",
-        "features": features,
-        "numberReturned": len(features),
-    }
-    if next_token is not None:
-        body["next"] = next_token
-    return body
-
-
-def _stats_feature_with_geometry(loc_id, lon, lat):
-    """Stats feature with a real point geometry."""
-    feat = _stats_feature()
-    feat["id"] = loc_id
-    feat["geometry"] = {"type": "Point", "coordinates": [lon, lat]}
-    feat["properties"]["monitoring_location_id"] = loc_id
-    return feat
-
-
-@pytest.mark.skipif(not GEOPANDAS, reason="geopandas not installed")
-def test_get_stats_data_preserves_geometry_across_pages():
-    """Pages 2..N must use ``geopd=GEOPANDAS`` so a multi-page response
-    stays a GeoDataFrame and doesn't lose geometry/CRS at the page-1 boundary.
-
-    Regression: previously pages 2..N hard-coded ``geopd=False``, producing
-    plain DataFrames. ``pd.concat`` then silently downgraded the result to
-    a plain DataFrame.
-    """
-    resp1 = mock.MagicMock()
-    resp1.status_code = 200
-    resp1.json.return_value = _stats_body(
-        [_stats_feature_with_geometry("USGS-1", -89.0, 43.0)],
-        next_token="abc",
-    )
-
-    resp2 = mock.MagicMock()
-    resp2.status_code = 200
-    resp2.json.return_value = _stats_body(
-        [_stats_feature_with_geometry("USGS-2", -90.0, 44.0)]
-    )
-
-    client = mock.MagicMock(spec=requests.Session)
-    client.send.return_value = resp1
-    client.request.return_value = resp2
-
-    df, _ = get_stats_data(
-        args={}, service="observationNormals", expand_percentiles=False, client=client
-    )
-    assert isinstance(df, gpd.GeoDataFrame)
-    assert len(df) == 2
-    assert df.geometry.notna().all()
-
-
 def test_handle_stats_nesting_tolerates_missing_drop_columns():
     """If the upstream stats response shape ever changes such that one of
     the columns we try to drop ("type", "properties.data") is absent, the