From 0ad568c851e8f9ed0e378f0a1184f270665bc3a9 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sun, 3 May 2026 15:23:27 -0500 Subject: [PATCH 1/6] =?UTF-8?q?waterdata:=20tighten=20stats=20and=20OGC=20?= =?UTF-8?q?pagination=20=E2=80=94=20geometry,=20KeyError,=20non-200?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three correctness fixes to the two pagination loops in dataretrieval.waterdata.utils. * `get_stats_data` honored `GEOPANDAS` for the first page but hard-coded `geopd=False` on every continuation page. With geopandas installed, a multi-page stats response started as a `GeoDataFrame` and pages 2..N came back as plain `DataFrame`s; `pd.concat` then silently downgraded the result and the caller lost geometry / CRS. Use `geopd=GEOPANDAS` on every page. * `get_stats_data` indexed `body["next"]` directly, raising `KeyError` on responses without that key (some terminal responses simply omit it). Switch to `body.get("next")`, which produces `None` and exits the loop cleanly. * Both `get_stats_data`'s in-loop request and `_walk_pages`'s in-loop request returned the response without checking `status_code`. A 4xx or 5xx page whose body happened to JSON-decode could be appended as data, then pagination quietly stopped — the caller got a partial result with no warning. Add an explicit `if status_code != 200` raise inside each loop so the existing log-and-truncate handler fires deliberately rather than incidentally. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/utils.py | 10 ++- tests/waterdata_utils_test.py | 120 +++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+), 3 deletions(-) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 7070da50..d1cb1254 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -606,6 +606,8 @@ def _walk_pages( headers=headers, data=content if method == "POST" else None, ) + if resp.status_code != 200: + raise RuntimeError(_error_body(resp)) dfs.append(_get_resp_data(resp, geopd=geopd)) curr_url = _next_req_url(resp) except Exception: # noqa: BLE001 @@ -1058,7 +1060,7 @@ def get_stats_data( all_dfs = [_handle_stats_nesting(body, geopd=GEOPANDAS)] # Look for a next code in the response body - next_token = body["next"] + next_token = body.get("next") while next_token: args["next_token"] = next_token @@ -1070,9 +1072,11 @@ def get_stats_data( params=args, headers=headers, ) + if resp.status_code != 200: + raise RuntimeError(_error_body(resp)) body = resp.json() - all_dfs.append(_handle_stats_nesting(body, geopd=False)) - next_token = body["next"] + all_dfs.append(_handle_stats_nesting(body, geopd=GEOPANDAS)) + next_token = body.get("next") except Exception: # noqa: BLE001 error_text = _error_body(resp) logger.error("Request incomplete. %s", error_text) diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index 36150be8..4b5da19f 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -1,10 +1,12 @@ from unittest import mock +import pandas as pd import requests from dataretrieval.waterdata.utils import ( _get_args, _walk_pages, + get_stats_data, ) @@ -80,3 +82,121 @@ def test_walk_pages_multiple_mocked(): assert mock_client.send.called assert mock_client.request.called assert mock_client.request.call_args[0][1] == "https://example.com/page2" + + +def test_walk_pages_raises_on_non_200_in_loop(): + """`_walk_pages` must surface a non-200 mid-loop, not silently truncate. + + Regression: previously any non-200 page was appended (with whatever + body it had) and pagination quietly stopped because `_get_resp_data` + or `_next_req_url` raised inside the bare except. The user got a + partial result with no warning. + """ + resp1 = mock.MagicMock() + resp1.json.return_value = { + "numberReturned": 1, + "features": [{"id": "1", "properties": {"val": "a"}}], + "links": [], + } + resp1.headers = {} + resp1.links = {"next": {"url": "https://example.com/page2"}} + resp1.status_code = 200 + + resp2 = mock.MagicMock() + resp2.status_code = 500 + resp2.text = "error" + + mock_client = mock.MagicMock(spec=requests.Session) + mock_client.send.return_value = resp1 + mock_client.request.return_value = resp2 + + mock_req = mock.MagicMock(spec=requests.PreparedRequest) + mock_req.method = "GET" + mock_req.headers = {} + mock_req.url = "https://example.com/page1" + + df, _ = _walk_pages(geopd=False, req=mock_req, client=mock_client) + + # Page 1 still returned; page 2 logged-and-stopped after the explicit + # status check raised. The contract here is "log + truncate", same + # as the pre-fix bare-except behavior, but now the raise inside the + # loop is intentional rather than incidental. + assert len(df) == 1 + + +# --- get_stats_data pagination ---------------------------------------------- + + +def _stats_feature(): + """Build a single feature shaped to satisfy ``_handle_stats_nesting``.""" + return { + "type": "Feature", + "id": "USGS-1", + "geometry": None, + "properties": { + "monitoring_location_id": "USGS-1", + "data": [ + { + "parameter_code": "00060", + "unit_of_measure": "ft^3/s", + "parent_time_series_id": "abc", + "values": [{"value": 1.0}], + } + ], + }, + } + + +def _stats_body(features, next_token=None): + body = { + "type": "FeatureCollection", + "features": features, + "numberReturned": len(features), + } + if next_token is not None: + body["next"] = next_token + return body + + +def test_get_stats_data_handles_missing_next_key(): + """A response without a ``next`` key must not raise KeyError. + + Regression: ``body["next"]`` raised when the key was absent. Now + uses ``body.get("next")`` so a missing key means "no more pages". + """ + resp = mock.MagicMock() + resp.status_code = 200 + resp.json.return_value = _stats_body([_stats_feature()]) + # No "next" key at all. + + client = mock.MagicMock(spec=requests.Session) + client.send.return_value = resp + + df, _ = get_stats_data( + args={}, service="observationNormals", expand_percentiles=False, client=client + ) + assert isinstance(df, pd.DataFrame) + assert len(df) >= 1 + + +def test_get_stats_data_truncates_on_non_200_continuation(): + """A 4xx/5xx on a continuation page must log and stop, not crash.""" + resp1 = mock.MagicMock() + resp1.status_code = 200 + resp1.json.return_value = _stats_body([_stats_feature()], next_token="abc") + + resp2 = mock.MagicMock() + resp2.status_code = 503 + resp2.text = "Service Unavailable" + resp2.url = "https://example.com/page2" + + client = mock.MagicMock(spec=requests.Session) + client.send.return_value = resp1 + client.request.return_value = resp2 + + df, _ = get_stats_data( + args={}, service="observationNormals", expand_percentiles=False, client=client + ) + # Page 1 still surfaces; page 2 was caught by the in-loop status check. + assert isinstance(df, pd.DataFrame) + assert len(df) >= 1 From 0fa758ea2f967a03183c95126f43cc435f84bd2b Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sun, 3 May 2026 15:29:53 -0500 Subject: [PATCH 2/6] Unify status-check sites behind a small _raise_if_not_ok helper Both pagination loops now had four call sites repeating ``if resp.status_code != 200: raise RuntimeError(_error_body(resp))``. Move that into a one-line helper alongside ``_error_body`` and call it from every site. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/utils.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index d1cb1254..443d7adf 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -339,6 +339,12 @@ def _error_body(resp: requests.Response): ) +def _raise_if_not_ok(resp: requests.Response) -> None: + """Raise ``RuntimeError(_error_body(resp))`` for any non-200 response.""" + if resp.status_code != 200: + raise RuntimeError(_error_body(resp)) + + def _construct_api_requests( service: str, properties: list[str] | None = None, @@ -583,8 +589,7 @@ def _walk_pages( client = client or requests.Session() try: resp = client.send(req) - if resp.status_code != 200: - raise RuntimeError(_error_body(resp)) + _raise_if_not_ok(resp) # Store the initial response for metadata initial_response = resp @@ -606,8 +611,7 @@ def _walk_pages( headers=headers, data=content if method == "POST" else None, ) - if resp.status_code != 200: - raise RuntimeError(_error_body(resp)) + _raise_if_not_ok(resp) dfs.append(_get_resp_data(resp, geopd=geopd)) curr_url = _next_req_url(resp) except Exception: # noqa: BLE001 @@ -1045,8 +1049,7 @@ def get_stats_data( try: resp = client.send(req) - if resp.status_code != 200: - raise RuntimeError(_error_body(resp)) + _raise_if_not_ok(resp) # Store the initial response for metadata initial_response = resp @@ -1072,8 +1075,7 @@ def get_stats_data( params=args, headers=headers, ) - if resp.status_code != 200: - raise RuntimeError(_error_body(resp)) + _raise_if_not_ok(resp) body = resp.json() all_dfs.append(_handle_stats_nesting(body, geopd=GEOPANDAS)) next_token = body.get("next") From af007051498f2de0fb556f2eee475c97602371ba Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Mon, 4 May 2026 10:12:38 -0500 Subject: [PATCH 3/6] Harden _error_body and add geopandas continuation regression test Per copilot review on PR #255: - _error_body: catch JSONDecodeError when a 4xx/5xx returns plain text or HTML. Previously, _raise_if_not_ok -> _error_body -> resp.json() raised JSONDecodeError on non-JSON bodies, defeating the in-loop status check. - Tests: - Rename test_walk_pages_raises_on_non_200_in_loop to test_walk_pages_truncates_on_non_200_continuation; the assertion verifies log-and-truncate, not raise. - New test_get_stats_data_preserves_geometry_across_pages exercises the GEOPANDAS=True continuation path so a regression to geopd=False on page 2..N is caught. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/utils.py | 9 ++++- tests/waterdata_utils_test.py | 58 +++++++++++++++++++++++++++++--- 2 files changed, 62 insertions(+), 5 deletions(-) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 443d7adf..3122220e 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -332,7 +332,14 @@ def _error_body(resp: requests.Response): "403: Query request denied. Possible reasons include " "query exceeding server limits." ) - j_txt = resp.json() + try: + j_txt = resp.json() + except (json.JSONDecodeError, ValueError): + # Non-JSON 4xx/5xx bodies (HTML error pages, plain text) — fall back + # to the raw text so callers still get a useful message instead of + # the helper crashing with JSONDecodeError. + snippet = (resp.text or "").strip()[:500] + return f"{status}: {resp.reason or 'Error'}. {snippet}" return ( f"{status}: {j_txt.get('code', 'Unknown type')}. " f"{j_txt.get('description', 'No description provided')}." diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index 4b5da19f..25266347 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -1,14 +1,19 @@ from unittest import mock import pandas as pd +import pytest import requests from dataretrieval.waterdata.utils import ( + GEOPANDAS, _get_args, _walk_pages, get_stats_data, ) +if GEOPANDAS: + import geopandas as gpd + def test_get_args_basic(): local_vars = { @@ -84,13 +89,15 @@ def test_walk_pages_multiple_mocked(): assert mock_client.request.call_args[0][1] == "https://example.com/page2" -def test_walk_pages_raises_on_non_200_in_loop(): - """`_walk_pages` must surface a non-200 mid-loop, not silently truncate. +def test_walk_pages_truncates_on_non_200_continuation(): + """`_walk_pages` must truncate (not silently extend) on a non-200 page. Regression: previously any non-200 page was appended (with whatever body it had) and pagination quietly stopped because `_get_resp_data` - or `_next_req_url` raised inside the bare except. The user got a - partial result with no warning. + or `_next_req_url` raised inside the bare except. Now the explicit + status check raises inside the loop, and the existing log-and-truncate + handler converts that to a clean partial result. Page 1 is still + returned; page 2 is dropped. """ resp1 = mock.MagicMock() resp1.json.return_value = { @@ -200,3 +207,46 @@ def test_get_stats_data_truncates_on_non_200_continuation(): # Page 1 still surfaces; page 2 was caught by the in-loop status check. assert isinstance(df, pd.DataFrame) assert len(df) >= 1 + + +def _stats_feature_with_geometry(loc_id, lon, lat): + """Stats feature with a real point geometry.""" + feat = _stats_feature() + feat["id"] = loc_id + feat["geometry"] = {"type": "Point", "coordinates": [lon, lat]} + feat["properties"]["monitoring_location_id"] = loc_id + return feat + + +@pytest.mark.skipif(not GEOPANDAS, reason="geopandas not installed") +def test_get_stats_data_preserves_geometry_across_pages(): + """Pages 2..N must use ``geopd=GEOPANDAS`` so a multi-page response + stays a GeoDataFrame and doesn't lose geometry/CRS at the page-1 boundary. + + Regression: previously pages 2..N hard-coded ``geopd=False``, producing + plain DataFrames. ``pd.concat`` then silently downgraded the result to + a plain DataFrame. + """ + resp1 = mock.MagicMock() + resp1.status_code = 200 + resp1.json.return_value = _stats_body( + [_stats_feature_with_geometry("USGS-1", -89.0, 43.0)], + next_token="abc", + ) + + resp2 = mock.MagicMock() + resp2.status_code = 200 + resp2.json.return_value = _stats_body( + [_stats_feature_with_geometry("USGS-2", -90.0, 44.0)] + ) + + client = mock.MagicMock(spec=requests.Session) + client.send.return_value = resp1 + client.request.return_value = resp2 + + df, _ = get_stats_data( + args={}, service="observationNormals", expand_percentiles=False, client=client + ) + assert isinstance(df, gpd.GeoDataFrame) + assert len(df) == 2 + assert df.geometry.notna().all() From 1bfea67e11a7426f7ee1328c8171889d378609b3 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Tue, 5 May 2026 09:39:26 -0500 Subject: [PATCH 4/6] Address Copilot review on PR #255 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Update `_error_body` docstring to describe the new JSON / text-snippet fallback behavior (was: "returns the raw response text" — out of date since the helper now formats `code` + `description` from the JSON body and only falls back to a text snippet when JSON parsing fails). - Add `test_error_body_falls_back_to_text_for_non_json_responses` to exercise the JSONDecodeError fallback branch (the existing pagination tests use unconstrained `resp.json` mocks, so the fallback was never reached). - Tighten `test_get_stats_data_handles_missing_next_key`: pin to `len(df) == 1` and assert `client.request` was never called, so a regression that kept fetching after a missing next-token would be caught. - Tighten `test_get_stats_data_truncates_on_non_200_continuation`: pin to exactly 1 row and assert `client.request.call_count == 1`, so a regression that silently appended page-2 data or duplicated page-1 would still fail. Live-API verification of the dropped-geometry bug: with `state=US:42, parameter=00060, page_size=1`, page-2 contributes 1890 rows; the buggy main concat leaves 0/1890 of those rows with geometry (all NaN) while the fixed concat preserves 1890/1890. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/utils.py | 11 +++++--- tests/waterdata_utils_test.py | 44 ++++++++++++++++++++++++++++---- 2 files changed, 46 insertions(+), 9 deletions(-) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index f4c3f949..633cdf54 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -316,10 +316,13 @@ def _error_body(resp: requests.Response): Returns ------- str - The extracted error message. For status code 429, returns the 'message' - field from the JSON error object. For status code 403, returns a - predefined message indicating possible reasons for denial. For other - status codes, returns the raw response text. + The extracted error message. Status 429 returns a fixed + 'too many requests' message; status 403 returns a fixed + 'query exceeding server limits' message. For all other statuses, + the response body is parsed as JSON and the ``code`` / + ``description`` fields are formatted into a one-line message; if + the body is not JSON (HTML error page, plain text), the helper + falls back to ``": . "``. """ status = resp.status_code if status == 429: diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index e5d54a12..9acb8bf4 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -6,6 +6,7 @@ from dataretrieval.waterdata.utils import ( GEOPANDAS, + _error_body, _get_args, _handle_stats_nesting, _walk_pages, @@ -132,6 +133,29 @@ def test_walk_pages_truncates_on_non_200_continuation(): assert len(df) == 1 +# --- _error_body ------------------------------------------------------------- + + +def test_error_body_falls_back_to_text_for_non_json_responses(): + """`_error_body` must not crash on plain-text/HTML 4xx-5xx bodies. + + Regression: previously ``resp.json()`` was unconditionally called, + so an HTML error page raised ``JSONDecodeError`` from inside the + helper. Now non-JSON bodies fall back to the raw text snippet. + """ + import json as json_mod + + resp = mock.MagicMock() + resp.status_code = 502 + resp.reason = "Bad Gateway" + resp.text = "upstream timeout" + resp.json.side_effect = json_mod.JSONDecodeError("expecting value", "", 0) + + msg = _error_body(resp) + assert msg.startswith("502: Bad Gateway") + assert "upstream timeout" in msg + + # --- get_stats_data pagination ---------------------------------------------- @@ -167,7 +191,8 @@ def _stats_body(features, next_token=None): def test_get_stats_data_handles_missing_next_key(): - """A response without a ``next`` key must not raise KeyError. + """A response without a ``next`` key must not raise KeyError, and + pagination must stop without firing a continuation request. Regression: ``body["next"]`` raised when the key was absent. Now uses ``body.get("next")`` so a missing key means "no more pages". @@ -184,11 +209,18 @@ def test_get_stats_data_handles_missing_next_key(): args={}, service="observationNormals", expand_percentiles=False, client=client ) assert isinstance(df, pd.DataFrame) - assert len(df) >= 1 + # The single feature flattens to exactly 1 stats row; no continuation + # request should have been made. + assert len(df) == 1 + client.request.assert_not_called() def test_get_stats_data_truncates_on_non_200_continuation(): - """A 4xx/5xx on a continuation page must log and stop, not crash.""" + """A 4xx/5xx on a continuation page must log and stop, not crash. + + Pin the result to exactly the page-1 row so a regression that silently + appended page-2 data (or duplicated page-1) would still fail. + """ resp1 = mock.MagicMock() resp1.status_code = 200 resp1.json.return_value = _stats_body([_stats_feature()], next_token="abc") @@ -205,9 +237,11 @@ def test_get_stats_data_truncates_on_non_200_continuation(): df, _ = get_stats_data( args={}, service="observationNormals", expand_percentiles=False, client=client ) - # Page 1 still surfaces; page 2 was caught by the in-loop status check. + # Page 1's single feature flattens to exactly 1 row; page 2 must not + # contribute anything and the loop must not have looped past resp2. assert isinstance(df, pd.DataFrame) - assert len(df) >= 1 + assert len(df) == 1 + assert client.request.call_count == 1 def _stats_feature_with_geometry(loc_id, lon, lat): From 22c60de952d960381aba77e605afdd9b2b2cf232 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Tue, 5 May 2026 09:48:34 -0500 Subject: [PATCH 5/6] Drop defensive fixes; keep only the real geometry bug Live-API testing (state=US:42, parameter=00060, page_size=1 against observationNormals) confirmed only the geometry-on-continuation bug: the buggy path leaves 0/1890 page-2 rows with geometry; the fix preserves 1890/1890. The other two candidate fixes were defensive-only and could not be reproduced against the current live API, so drop them and the associated tests: - `body.get("next")` -- the live endpoint always includes the `next` key (value `null` on terminal pages), so the original `body["next"]` never KeyErrors. - Explicit non-200 raise inside the continuation loops (plus the `_error_body` JSON-decode fallback and the `_raise_if_not_ok` helper extraction) -- real failure modes already raise inside the existing `try/except` at `_handle_stats_nesting` or `body["next"]`. The silent-append window required a non-200 response whose body both JSON-decodes AND has a `features` key, which the API doesn't produce. Net diff is now one line of production code (`geopd=False` -> `geopd=GEOPANDAS`) plus one regression test. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/utils.py | 38 +++------- tests/waterdata_utils_test.py | 121 ------------------------------- 2 files changed, 11 insertions(+), 148 deletions(-) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 633cdf54..35a29b33 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -316,13 +316,10 @@ def _error_body(resp: requests.Response): Returns ------- str - The extracted error message. Status 429 returns a fixed - 'too many requests' message; status 403 returns a fixed - 'query exceeding server limits' message. For all other statuses, - the response body is parsed as JSON and the ``code`` / - ``description`` fields are formatted into a one-line message; if - the body is not JSON (HTML error page, plain text), the helper - falls back to ``": . "``. + The extracted error message. For status code 429, returns the 'message' + field from the JSON error object. For status code 403, returns a + predefined message indicating possible reasons for denial. For other + status codes, returns the raw response text. """ status = resp.status_code if status == 429: @@ -335,26 +332,13 @@ def _error_body(resp: requests.Response): "403: Query request denied. Possible reasons include " "query exceeding server limits." ) - try: - j_txt = resp.json() - except (json.JSONDecodeError, ValueError): - # Non-JSON 4xx/5xx bodies (HTML error pages, plain text) — fall back - # to the raw text so callers still get a useful message instead of - # the helper crashing with JSONDecodeError. - snippet = (resp.text or "").strip()[:500] - return f"{status}: {resp.reason or 'Error'}. {snippet}" + j_txt = resp.json() return ( f"{status}: {j_txt.get('code', 'Unknown type')}. " f"{j_txt.get('description', 'No description provided')}." ) -def _raise_if_not_ok(resp: requests.Response) -> None: - """Raise ``RuntimeError(_error_body(resp))`` for any non-200 response.""" - if resp.status_code != 200: - raise RuntimeError(_error_body(resp)) - - def _construct_api_requests( service: str, properties: list[str] | None = None, @@ -599,7 +583,8 @@ def _walk_pages( client = client or requests.Session() try: resp = client.send(req) - _raise_if_not_ok(resp) + if resp.status_code != 200: + raise RuntimeError(_error_body(resp)) # Store the initial response for metadata initial_response = resp @@ -621,7 +606,6 @@ def _walk_pages( headers=headers, data=content if method == "POST" else None, ) - _raise_if_not_ok(resp) dfs.append(_get_resp_data(resp, geopd=geopd)) curr_url = _next_req_url(resp) except Exception: # noqa: BLE001 @@ -1061,7 +1045,8 @@ def get_stats_data( try: resp = client.send(req) - _raise_if_not_ok(resp) + if resp.status_code != 200: + raise RuntimeError(_error_body(resp)) # Store the initial response for metadata initial_response = resp @@ -1075,7 +1060,7 @@ def get_stats_data( all_dfs = [_handle_stats_nesting(body, geopd=GEOPANDAS)] # Look for a next code in the response body - next_token = body.get("next") + next_token = body["next"] while next_token: args["next_token"] = next_token @@ -1087,10 +1072,9 @@ def get_stats_data( params=args, headers=headers, ) - _raise_if_not_ok(resp) body = resp.json() all_dfs.append(_handle_stats_nesting(body, geopd=GEOPANDAS)) - next_token = body.get("next") + next_token = body["next"] except Exception: # noqa: BLE001 error_text = _error_body(resp) logger.error("Request incomplete. %s", error_text) diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index 9acb8bf4..e06b20ab 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -1,12 +1,10 @@ from unittest import mock -import pandas as pd import pytest import requests from dataretrieval.waterdata.utils import ( GEOPANDAS, - _error_body, _get_args, _handle_stats_nesting, _walk_pages, @@ -91,71 +89,6 @@ def test_walk_pages_multiple_mocked(): assert mock_client.request.call_args[0][1] == "https://example.com/page2" -def test_walk_pages_truncates_on_non_200_continuation(): - """`_walk_pages` must truncate (not silently extend) on a non-200 page. - - Regression: previously any non-200 page was appended (with whatever - body it had) and pagination quietly stopped because `_get_resp_data` - or `_next_req_url` raised inside the bare except. Now the explicit - status check raises inside the loop, and the existing log-and-truncate - handler converts that to a clean partial result. Page 1 is still - returned; page 2 is dropped. - """ - resp1 = mock.MagicMock() - resp1.json.return_value = { - "numberReturned": 1, - "features": [{"id": "1", "properties": {"val": "a"}}], - "links": [], - } - resp1.headers = {} - resp1.links = {"next": {"url": "https://example.com/page2"}} - resp1.status_code = 200 - - resp2 = mock.MagicMock() - resp2.status_code = 500 - resp2.text = "error" - - mock_client = mock.MagicMock(spec=requests.Session) - mock_client.send.return_value = resp1 - mock_client.request.return_value = resp2 - - mock_req = mock.MagicMock(spec=requests.PreparedRequest) - mock_req.method = "GET" - mock_req.headers = {} - mock_req.url = "https://example.com/page1" - - df, _ = _walk_pages(geopd=False, req=mock_req, client=mock_client) - - # Page 1 still returned; page 2 logged-and-stopped after the explicit - # status check raised. The contract here is "log + truncate", same - # as the pre-fix bare-except behavior, but now the raise inside the - # loop is intentional rather than incidental. - assert len(df) == 1 - - -# --- _error_body ------------------------------------------------------------- - - -def test_error_body_falls_back_to_text_for_non_json_responses(): - """`_error_body` must not crash on plain-text/HTML 4xx-5xx bodies. - - Regression: previously ``resp.json()`` was unconditionally called, - so an HTML error page raised ``JSONDecodeError`` from inside the - helper. Now non-JSON bodies fall back to the raw text snippet. - """ - import json as json_mod - - resp = mock.MagicMock() - resp.status_code = 502 - resp.reason = "Bad Gateway" - resp.text = "upstream timeout" - resp.json.side_effect = json_mod.JSONDecodeError("expecting value", "", 0) - - msg = _error_body(resp) - assert msg.startswith("502: Bad Gateway") - assert "upstream timeout" in msg - - # --- get_stats_data pagination ---------------------------------------------- @@ -190,60 +123,6 @@ def _stats_body(features, next_token=None): return body -def test_get_stats_data_handles_missing_next_key(): - """A response without a ``next`` key must not raise KeyError, and - pagination must stop without firing a continuation request. - - Regression: ``body["next"]`` raised when the key was absent. Now - uses ``body.get("next")`` so a missing key means "no more pages". - """ - resp = mock.MagicMock() - resp.status_code = 200 - resp.json.return_value = _stats_body([_stats_feature()]) - # No "next" key at all. - - client = mock.MagicMock(spec=requests.Session) - client.send.return_value = resp - - df, _ = get_stats_data( - args={}, service="observationNormals", expand_percentiles=False, client=client - ) - assert isinstance(df, pd.DataFrame) - # The single feature flattens to exactly 1 stats row; no continuation - # request should have been made. - assert len(df) == 1 - client.request.assert_not_called() - - -def test_get_stats_data_truncates_on_non_200_continuation(): - """A 4xx/5xx on a continuation page must log and stop, not crash. - - Pin the result to exactly the page-1 row so a regression that silently - appended page-2 data (or duplicated page-1) would still fail. - """ - resp1 = mock.MagicMock() - resp1.status_code = 200 - resp1.json.return_value = _stats_body([_stats_feature()], next_token="abc") - - resp2 = mock.MagicMock() - resp2.status_code = 503 - resp2.text = "Service Unavailable" - resp2.url = "https://example.com/page2" - - client = mock.MagicMock(spec=requests.Session) - client.send.return_value = resp1 - client.request.return_value = resp2 - - df, _ = get_stats_data( - args={}, service="observationNormals", expand_percentiles=False, client=client - ) - # Page 1's single feature flattens to exactly 1 row; page 2 must not - # contribute anything and the loop must not have looped past resp2. - assert isinstance(df, pd.DataFrame) - assert len(df) == 1 - assert client.request.call_count == 1 - - def _stats_feature_with_geometry(loc_id, lon, lat): """Stats feature with a real point geometry.""" feat = _stats_feature() From b7285e6c476167699131c445fd76d06770295737 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Tue, 5 May 2026 09:55:26 -0500 Subject: [PATCH 6/6] Drop trivial regression test from stats-pagination PR The fix is a one-line `geopd=False` -> `geopd=GEOPANDAS` change that's self-evident from the diff and verified against the live API in the PR description. Drop `test_get_stats_data_preserves_geometry_across_pages` along with its helpers (`_stats_feature`, `_stats_body`, `_stats_feature_with_geometry`) and the now-unused `GEOPANDAS`, `get_stats_data`, `pytest`, and `gpd` imports. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/waterdata_utils_test.py | 83 ----------------------------------- 1 file changed, 83 deletions(-) diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index 9ff04d3b..d8d654b4 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -1,20 +1,14 @@ from unittest import mock -import pytest import requests from dataretrieval.waterdata.utils import ( - GEOPANDAS, _format_api_dates, _get_args, _handle_stats_nesting, _walk_pages, - get_stats_data, ) -if GEOPANDAS: - import geopandas as gpd - def test_get_args_basic(): local_vars = { @@ -90,83 +84,6 @@ def test_walk_pages_multiple_mocked(): assert mock_client.request.call_args[0][1] == "https://example.com/page2" -# --- get_stats_data pagination ---------------------------------------------- - - -def _stats_feature(): - """Build a single feature shaped to satisfy ``_handle_stats_nesting``.""" - return { - "type": "Feature", - "id": "USGS-1", - "geometry": None, - "properties": { - "monitoring_location_id": "USGS-1", - "data": [ - { - "parameter_code": "00060", - "unit_of_measure": "ft^3/s", - "parent_time_series_id": "abc", - "values": [{"value": 1.0}], - } - ], - }, - } - - -def _stats_body(features, next_token=None): - body = { - "type": "FeatureCollection", - "features": features, - "numberReturned": len(features), - } - if next_token is not None: - body["next"] = next_token - return body - - -def _stats_feature_with_geometry(loc_id, lon, lat): - """Stats feature with a real point geometry.""" - feat = _stats_feature() - feat["id"] = loc_id - feat["geometry"] = {"type": "Point", "coordinates": [lon, lat]} - feat["properties"]["monitoring_location_id"] = loc_id - return feat - - -@pytest.mark.skipif(not GEOPANDAS, reason="geopandas not installed") -def test_get_stats_data_preserves_geometry_across_pages(): - """Pages 2..N must use ``geopd=GEOPANDAS`` so a multi-page response - stays a GeoDataFrame and doesn't lose geometry/CRS at the page-1 boundary. - - Regression: previously pages 2..N hard-coded ``geopd=False``, producing - plain DataFrames. ``pd.concat`` then silently downgraded the result to - a plain DataFrame. - """ - resp1 = mock.MagicMock() - resp1.status_code = 200 - resp1.json.return_value = _stats_body( - [_stats_feature_with_geometry("USGS-1", -89.0, 43.0)], - next_token="abc", - ) - - resp2 = mock.MagicMock() - resp2.status_code = 200 - resp2.json.return_value = _stats_body( - [_stats_feature_with_geometry("USGS-2", -90.0, 44.0)] - ) - - client = mock.MagicMock(spec=requests.Session) - client.send.return_value = resp1 - client.request.return_value = resp2 - - df, _ = get_stats_data( - args={}, service="observationNormals", expand_percentiles=False, client=client - ) - assert isinstance(df, gpd.GeoDataFrame) - assert len(df) == 2 - assert df.geometry.notna().all() - - def test_handle_stats_nesting_tolerates_missing_drop_columns(): """If the upstream stats response shape ever changes such that one of the columns we try to drop ("type", "properties.data") is absent, the