diff --git a/NEWS.md b/NEWS.md index 0517a15a..50bd8b9b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,5 @@ +**05/06/2026:** Added `waterdata.get_ratings(...)` — wraps the new Water Data STAC catalog (`api.waterdata.usgs.gov/stac/v0/search`) for USGS stage-discharge rating curves. Returns parsed `exsa` / `base` / `corr` rating tables as a dict of DataFrames keyed by feature ID, or just the list of available STAC features when `download_and_parse=False`. Mirrors R's `read_waterdata_ratings`. + **05/06/2026:** Added `waterdata.get_field_measurements_metadata(...)` — wraps the OGC `field-measurements-metadata` collection. Returns one row per (location, parameter) field-measurement series describing its period of record, units, etc., without the underlying observations. Discrete-measurement analogue to `get_time_series_metadata`. Mirrors R's `read_waterdata_field_meta`. **05/05/2026:** Added `waterdata.get_combined_metadata(...)` — wraps the Water Data API's `combined-metadata` collection, which joins the monitoring-locations catalog with the time-series-metadata catalog and returns one row per (location, parameter, statistic) inventory entry. This is the most flexible "what data is available" endpoint in the API: any location attribute (state, HUC, site type, drainage area, well-construction depth, …) can be combined with any time-series attribute (parameter code, statistic, data type, period of record, …) in a single query. Mirrors R's `read_waterdata_combined_meta`. 
diff --git a/dataretrieval/nwis.py b/dataretrieval/nwis.py index 0bcb1d68..ec8d2537 100644 --- a/dataretrieval/nwis.py +++ b/dataretrieval/nwis.py @@ -7,12 +7,12 @@ from __future__ import annotations import warnings -from io import StringIO from json import JSONDecodeError import pandas as pd import requests +from dataretrieval.rdb import read_rdb from dataretrieval.utils import BaseMetadata from .utils import query @@ -44,6 +44,14 @@ # NAD83 _CRS = "EPSG:4269" +_NWIS_RDB_DTYPES = { + "site_no": str, + "dec_long_va": float, + "dec_lat_va": float, + "parm_cd": str, + "parameter_cd": str, +} + def _parse_json_or_raise(response: requests.Response) -> pd.DataFrame: """Parse a JSON NWIS response, raising a helpful error on HTML responses.""" @@ -1018,64 +1026,13 @@ def _read_json(json): def _read_rdb(rdb): - """ - Convert NWIS rdb table into a ``pandas.dataframe``. - - Parameters - ---------- - rdb: string - A string representation of an rdb table - - Returns - ------- - df: ``pandas.dataframe`` - A formatted pandas data frame + """Parse an NWIS RDB response and apply NWIS-specific post-processing. + Thin wrapper around :func:`dataretrieval.rdb.read_rdb` that adds the + NWIS column-dtype hints and runs :func:`format_response` (datetime + index, multi-site MultiIndex, optional GeoDataFrame). """ - if "<html" in rdb.lower() or "<!doctype" in rdb.lower(): - raise ValueError( - "Received HTML response instead of RDB. This often indicates " - "that the service has been moved or is currently unavailable." - ) - - count = 0 - lines = rdb.splitlines() - - for line in lines: - # ignore comment lines - if line.startswith("#"): - count = count + 1 - - else: - break - - if count >= len(lines): - # All lines are comments — the service returned no data rows (e.g. - # "No sites found matching all criteria"). This is a legitimate empty - # result, so return an empty DataFrame rather than raising. 
- return pd.DataFrame() - - fields = lines[count].split("\t") - fields = [field.replace(",", "").strip() for field in fields if field.strip()] - dtypes = { - "site_no": str, - "dec_long_va": float, - "dec_lat_va": float, - "parm_cd": str, - "parameter_cd": str, - } - - df = pd.read_csv( - StringIO(rdb), - delimiter="\t", - skiprows=count + 2, - names=fields, - na_values="NaN", - dtype=dtypes, - ) - - df = format_response(df) - return df + return format_response(read_rdb(rdb, dtypes=_NWIS_RDB_DTYPES)) def _check_sites_value_types(sites): diff --git a/dataretrieval/rdb.py b/dataretrieval/rdb.py new file mode 100644 index 00000000..2b52656b --- /dev/null +++ b/dataretrieval/rdb.py @@ -0,0 +1,90 @@ +"""Parser for the USGS RDB tab-separated text format. + +RDB (Relational DataBase) is the text format used by NWIS web services +and by the Water Data STAC catalog's rating-curve assets. Every RDB +file has the same shape: + +- One or more ``#``-prefixed comment lines carrying provenance metadata + (data source, retrieval timestamp, station name, parameter codes, etc.). +- A tab-separated header row naming each column. +- A second tab-separated row giving column format specs (e.g. ``5s 15s``); + it is informational only and skipped during parsing. +- Tab-separated data rows. + +This module exposes the parsing primitives that both ``dataretrieval.nwis`` +and ``dataretrieval.waterdata.ratings`` use. Callers layer their own +post-processing (NWIS-specific datetime indexing, ratings-specific +``df.attrs`` provenance, etc.) on top of the raw frame. +""" + +from __future__ import annotations + +from io import StringIO + +import pandas as pd + + +def read_rdb(text: str, dtypes: dict[str, type] | None = None) -> pd.DataFrame: + """Parse an RDB text response into a ``pandas.DataFrame``. + + Parameters + ---------- + text : str + The RDB text response from a USGS web service. 
+ dtypes : dict[str, type] or None, optional + Optional column-name to dtype hints, forwarded to + ``pandas.read_csv``. Unknown column names are silently ignored, so + callers may safely pass a dict of all columns they might be + interested in. + + Returns + ------- + pandas.DataFrame + The parsed data. An RDB consisting only of comment lines (e.g. a + "no sites found" response) returns an empty DataFrame rather than + raising. + + Raises + ------ + ValueError + If the response body looks like HTML, which usually means the + service has been moved, is degraded, or returned an error page. + """ + if "<html" in text.lower() or "<!doctype" in text.lower(): + raise ValueError( + "Received HTML response instead of RDB. This often indicates " + "that the service has been moved or is currently unavailable." + ) + + lines = text.splitlines() + header_idx = next( + (i for i, line in enumerate(lines) if not line.startswith("#")), + len(lines), + ) + if header_idx == len(lines): + # All lines are comments — a legitimate empty result. + return pd.DataFrame() + + fields = [f.replace(",", "").strip() for f in lines[header_idx].split("\t")] + fields = [f for f in fields if f] + + return pd.read_csv( + StringIO(text), + delimiter="\t", + skiprows=header_idx + 2, # +1 for header, +1 for the format-spec row + names=fields, + na_values="NaN", + dtype=dtypes, + ) + + +def extract_rdb_comment(text: str) -> list[str]: + """Return the RDB ``#``-prefixed comment block, raw and in original order. + + Each entry includes its leading ``#`` and any whitespace, matching what + R's ``dataRetrieval`` returns from ``comment(df)``. The comment block + carries provenance metadata that is otherwise lost during parsing — + data source, retrieval timestamp, parameter codes, rating id and + last-shifted timestamp for ratings, etc. 
+ """ + return [line for line in text.splitlines() if line.startswith("#")] diff --git a/dataretrieval/waterdata/__init__.py b/dataretrieval/waterdata/__init__.py index 4ea7475a..28510b70 100644 --- a/dataretrieval/waterdata/__init__.py +++ b/dataretrieval/waterdata/__init__.py @@ -30,6 +30,7 @@ ) from .filters import FILTER_LANG from .nearest import get_nearest_continuous +from .ratings import get_ratings from .types import ( CODE_SERVICES, PROFILE_LOOKUP, @@ -54,6 +55,7 @@ "get_latest_daily", "get_monitoring_locations", "get_nearest_continuous", + "get_ratings", "get_reference_table", "get_samples", "get_samples_summary", diff --git a/dataretrieval/waterdata/ratings.py b/dataretrieval/waterdata/ratings.py new file mode 100644 index 00000000..a1d0a3bb --- /dev/null +++ b/dataretrieval/waterdata/ratings.py @@ -0,0 +1,262 @@ +"""USGS rating-curve retrieval via the Water Data STAC catalog. + +Wraps ``https://api.waterdata.usgs.gov/stac/v0/search`` and the per-feature +RDB downloads that follow. The STAC endpoint hosts standard NWIS rating +files (``exsa``, ``base``, ``corr``) for active streamgages — see the +service overview at https://api.waterdata.usgs.gov/docs/stac/ and the +WDFN announcement at https://waterdata.usgs.gov/blog/wdfn-rating-curves/. + +The R analogue is ``read_waterdata_ratings`` in +https://github.com/DOI-USGS/dataRetrieval/. 
+""" + +from __future__ import annotations + +import logging +import os +from typing import Any, Iterable, Literal, get_args + +import pandas as pd +import requests + +from dataretrieval.rdb import extract_rdb_comment, read_rdb + +from .utils import _DURATION_RE, BASE_URL, _default_headers, _format_api_dates + +logger = logging.getLogger(__name__) + +STAC_URL = f"{BASE_URL}/stac/v0" + +RATING_FILE_TYPE = Literal["exsa", "base", "corr"] +_VALID_FILE_TYPES = get_args(RATING_FILE_TYPE) + + +def get_ratings( + monitoring_location_id: str | list[str] | None = None, + file_type: RATING_FILE_TYPE | list[RATING_FILE_TYPE] = "exsa", + file_path: str | None = None, + time: str | list[str] | None = None, + bbox: list[float] | None = None, + limit: int = 10000, + download_and_parse: bool = True, + ssl_check: bool = True, +) -> dict[str, pd.DataFrame] | list[dict[str, Any]]: + """Get USGS stage-discharge rating curves from the Water Data STAC catalog. + + Returns the current rating tables for one or more active USGS streamgages. + The catalog hosts three file types: + + - ``"exsa"`` — expanded shift-adjusted rating (default). Adds a ``SHIFT`` + column to ``"base"`` indicating the current shift for each ``INDEP``. + - ``"base"`` — three columns: ``INDEP`` (typically gage height, ft); + ``DEP`` (typically discharge, ft^3/s); ``STOR`` ("``*``" marks fixed + points of the rating). + - ``"corr"`` — three columns: ``INDEP``; ``CORR`` (correction for that + value); ``CORRINDEP`` (corrected INDEP). + + See https://api.waterdata.usgs.gov/docs/stac/ for the upstream service + docs and https://waterdata.usgs.gov/blog/wdfn-rating-curves/ for the + background announcement. The R analogue is ``read_waterdata_ratings`` + in https://github.com/DOI-USGS/dataRetrieval/. + + Parameters + ---------- + monitoring_location_id : string or list of strings, optional + One or more identifiers in ``AGENCY-ID`` form (e.g. + ``"USGS-01104475"``). 
If omitted, the spatial / temporal filters + determine the result set. + file_type : ``"exsa"``, ``"base"``, ``"corr"``, or a list, default ``"exsa"`` + Which rating file(s) to request. + file_path : string, optional + Directory the downloaded RDB files are written to. If ``None`` + (the default), the parsed ``DataFrame`` is returned without + persisting the bytes to disk; ``df.attrs["url"]`` still records + where each rating came from. + time : string or list of strings, optional + STAC ``datetime`` filter (passed through verbatim under that name) + — a single date / datetime, or an interval (``"start/end"``, + optionally half-bounded with ``..``). ISO 8601 *durations* + (``"P1M"``, ``"PT36H"``, …) are **not** supported by the + rating-curve service; passing one raises ``ValueError``. + bbox : list of numbers, optional + Only features whose geometry intersects the bounding box are + selected. Format: ``[xmin, ymin, xmax, ymax]`` in CRS 4326 + (longitude / latitude, west-south-east-north). + limit : int, default 10000 + Page size for the STAC ``/search`` request (capped at 10000). + download_and_parse : bool, default ``True`` + If ``True``, download every matching RDB file and parse it into a + ``DataFrame``. If ``False``, return the raw list of STAC feature + dicts so the caller can inspect what's available before pulling + bytes. + ssl_check : bool, default ``True`` + Verify the server's SSL certificate. + + Returns + ------- + dict[str, pandas.DataFrame] or list[dict] + When ``download_and_parse=True`` (the default), a dict keyed by + feature ID (e.g. ``"USGS-01104475.exsa.rdb"``) mapping to a parsed + ``DataFrame``. Each frame carries provenance in + ``df.attrs["comment"]`` (the RDB ``#``-prefixed header lines, like + rating id, parameter, last-shifted timestamp) and + ``df.attrs["url"]`` (the asset URL it was fetched from). When + ``download_and_parse=False``, the raw list of STAC feature dicts + as returned by the search endpoint. 
+ + Raises + ------ + ValueError + For an unrecognized ``file_type`` value or an ISO 8601 duration in + ``time``. + + Examples + -------- + .. code:: + + >>> # Default exsa ratings for two sites + >>> ratings = dataretrieval.waterdata.get_ratings( + ... monitoring_location_id=["USGS-01104475", "USGS-01104460"], + ... file_type="exsa", + ... ) + >>> ratings["USGS-01104475.exsa.rdb"].head() + + >>> # Both exsa and corr files for the same two sites + >>> ratings = dataretrieval.waterdata.get_ratings( + ... monitoring_location_id=["USGS-01104475", "USGS-01104460"], + ... file_type=["exsa", "corr"], + ... ) + + >>> # Bounding-box query, listing what's available without downloading + >>> features = dataretrieval.waterdata.get_ratings( + ... bbox=[-95.0, 40.0, -92.0, 42.0], + ... download_and_parse=False, + ... ) + + >>> # Restrict to features in a date range (durations not supported) + >>> features = dataretrieval.waterdata.get_ratings( + ... bbox=[-95.0, 40.0, -92.0, 42.0], + ... time=["2026-04-29", ".."], + ... download_and_parse=False, + ... ) + + """ + file_types = _as_list(file_type) + invalid = [ft for ft in file_types if ft not in _VALID_FILE_TYPES] + if invalid: + raise ValueError( + f"Invalid file_type {invalid!r}; " + f"valid options are {list(_VALID_FILE_TYPES)}." + ) + + if time is not None and any(_DURATION_RE.match(str(v)) for v in _as_list(time)): + raise ValueError( + "ISO 8601 durations (e.g. 'P7D') are not supported in `time` " + "for the rating-curve service. Provide a date or interval instead." + ) + time_str = _format_api_dates(time) if time is not None else None + + # Mirror R: pin file_type server-side only when one type is requested. 
+ server_file_type = file_types[0] if len(file_types) == 1 else None + filter_str = _build_filter(monitoring_location_id, server_file_type) + + features = _search(filter_str, time_str, bbox, limit, ssl_check) + + if not download_and_parse: + return features + + requested = set(file_types) + matching = [ + f for f in features if f.get("properties", {}).get("file_type") in requested + ] + + if file_path is not None: + os.makedirs(file_path, exist_ok=True) + + out: dict[str, pd.DataFrame] = {} + for feature in matching: + fid = feature["id"] + try: + out[fid] = _download_and_parse(feature, file_path, ssl_check) + except (requests.RequestException, ValueError, OSError) as e: + logger.warning("Failed to download / parse %s: %s", fid, e) + + return out + + +def _as_list(x: str | Iterable[str]) -> list[str]: + """Normalize a string or iterable-of-strings to a list.""" + return [x] if isinstance(x, str) else list(x) + + +def _quote_cql_str(value: str) -> str: + """Escape a single-quoted CQL literal by doubling embedded quotes. + + Defends against malformed filters / injection on arbitrary user input, + even though valid USGS monitoring-location IDs cannot contain a quote. + """ + return value.replace("'", "''") + + +def _build_filter( + monitoring_location_id: str | list[str] | None, + file_type: str | None, +) -> str | None: + """Compose the CQL filter sent to STAC ``/search``. + + Returns ``None`` when neither argument constrains the search. 
+ """ + parts: list[str] = [] + if monitoring_location_id is not None: + ids = _as_list(monitoring_location_id) + joined = "', '".join(_quote_cql_str(i) for i in ids) + parts.append(f"monitoring_location_id IN ('{joined}')") + if file_type is not None: + parts.append(f"file_type = '{_quote_cql_str(file_type)}'") + return " AND ".join(parts) if parts else None + + +def _search( + filter_str: str | None, + time_str: str | None, + bbox: list[float] | None, + limit: int, + ssl_check: bool, +) -> list[dict[str, Any]]: + """Run a single STAC ``/search`` request and return its features.""" + params: dict[str, Any] = {"limit": limit} + if filter_str is not None: + params["filter"] = filter_str + if time_str is not None: + params["datetime"] = time_str + if bbox is not None: + params["bbox"] = ",".join(map(str, bbox)) + + response = requests.get( + f"{STAC_URL}/search", + params=params, + headers=_default_headers(), + verify=ssl_check, + ) + response.raise_for_status() + return response.json().get("features", []) + + +def _download_and_parse( + feature: dict[str, Any], + file_path: str | None, + ssl_check: bool, +) -> pd.DataFrame: + """Fetch the feature's data asset, parse RDB, optionally persist to disk.""" + url = feature["assets"]["data"]["href"] + response = requests.get(url, headers=_default_headers(), verify=ssl_check) + response.raise_for_status() + + if file_path is not None: + with open(os.path.join(file_path, feature["id"]), "w") as f: + f.write(response.text) + + df = read_rdb(response.text) + df.attrs["comment"] = extract_rdb_comment(response.text) + df.attrs["url"] = url + return df diff --git a/tests/nwis_test.py b/tests/nwis_test.py index c52775a4..a42ba509 100644 --- a/tests/nwis_test.py +++ b/tests/nwis_test.py @@ -325,43 +325,24 @@ def test_variable_info_deprecated(self): class TestReadRdb: - """Tests for the _read_rdb helper. + """Tests for the NWIS-specific _read_rdb wrapper. - Notes - ----- - Related to GitHub Issue #171. 
+ The format-agnostic parser is exercised in tests/rdb_test.py; this + class pins the wrapper-specific contract — that an empty parser + result flows through format_response without crashing (issue #171). """ - # Minimal valid RDB response with one data row - _VALID_RDB = "# comment\nsite_no\tvalue\n5s\t10n\n01491000\t42\n" - - # NWIS response when no sites match the query criteria - _NO_SITES_RDB = ( - "# //Output-Format: RDB\n" - "# //Response-Status: OK\n" - "# //Response-Message: No sites found matching all criteria\n" - ) - - def test_valid_rdb_returns_dataframe(self): - """_read_rdb returns a DataFrame for a well-formed RDB response.""" - df = _read_rdb(self._VALID_RDB) - assert isinstance(df, pd.DataFrame) - assert "site_no" in df.columns - - def test_no_sites_returns_empty_dataframe(self): - """_read_rdb returns an empty DataFrame when NWIS finds no matching sites. - - A "No sites found" response is a legitimate empty result, not an error, - so callers can check ``df.empty`` rather than catching an exception. - Regression test for issue #171 (previously raised IndexError). + def test_no_sites_flows_through_format_response(self): + """A "No sites found" response is a legitimate empty result, not an + error, so callers can check ``df.empty`` rather than catching an + exception. Regression for issue #171 (previously raised IndexError), + which now also covers the empty-frame path through ``format_response``. 
""" - df = _read_rdb(self._NO_SITES_RDB) - assert isinstance(df, pd.DataFrame) - assert df.empty - - def test_all_comments_returns_empty_dataframe(self): - """_read_rdb returns an empty DataFrame when the response has only comments.""" - rdb = "# just a comment\n# another comment\n" - df = _read_rdb(rdb) + no_sites_rdb = ( + "# //Output-Format: RDB\n" + "# //Response-Status: OK\n" + "# //Response-Message: No sites found matching all criteria\n" + ) + df = _read_rdb(no_sites_rdb) assert isinstance(df, pd.DataFrame) assert df.empty diff --git a/tests/rdb_test.py b/tests/rdb_test.py new file mode 100644 index 00000000..99f46ff5 --- /dev/null +++ b/tests/rdb_test.py @@ -0,0 +1,65 @@ +import pandas as pd +import pytest + +from dataretrieval.rdb import extract_rdb_comment, read_rdb + +# A minimally complete RDB: comment block, header row, format-spec row, +# data rows. Both NWIS responses and ratings RDBs share this shape. +_BASIC_RDB = """\ +# header line one +# header line two +agency_cd\tsite_no\tINDEP\tDEP +5s\t15s\t10n\t10n +USGS\t01104475\t0.10\t0.0 +USGS\t01104475\t0.20\t0.5 +USGS\t01104475\t0.30\t1.2 +""" + + +def test_read_rdb_parses_basic_shape(): + df = read_rdb(_BASIC_RDB) + assert list(df.columns) == ["agency_cd", "site_no", "INDEP", "DEP"] + assert len(df) == 3 + assert df["INDEP"].tolist() == [0.10, 0.20, 0.30] + + +def test_read_rdb_skips_format_spec_row(): + """The "5s 15s 10n 10n" row is metadata, not data.""" + df = read_rdb(_BASIC_RDB) + # If the format-spec row had been treated as data, df would have 4 rows + # and "5s" / "15s" would appear in the parsed values. + assert "5s" not in df["agency_cd"].tolist() + + +def test_read_rdb_dtype_hints_applied(): + """Caller-supplied dtype hints are forwarded to pandas; unknown names ignored.""" + df = read_rdb(_BASIC_RDB, dtypes={"site_no": str, "DEP": float, "unknown": int}) + # Without the str hint, pandas would parse "01104475" as int and drop the + # leading zero. 
Check the values, not the dtype name (which varies across + # pandas versions: object, StringDtype, etc.). + assert df["site_no"].iloc[0] == "01104475" + assert df["DEP"].dtype == float + + +def test_read_rdb_empty_when_only_comments(): + """All-comments input is a legitimate "no data" response, not an error.""" + df = read_rdb("# only a comment\n# and another\n") + assert isinstance(df, pd.DataFrame) + assert df.empty + + +def test_read_rdb_raises_on_html_response(): + """If the service returns an HTML error page, surface it loudly.""" + with pytest.raises(ValueError, match="HTML"): + read_rdb("<html><body>Service Unavailable</body></html>") + with pytest.raises(ValueError, match="HTML"): + read_rdb("<!DOCTYPE html>\n<html>...</html>") + + +def test_extract_rdb_comment_returns_only_hash_lines(): + comments = extract_rdb_comment(_BASIC_RDB) + assert comments == ["# header line one", "# header line two"] + + +def test_extract_rdb_comment_empty_when_no_comments(): + assert extract_rdb_comment("a\tb\n1\t2\n") == [] diff --git a/tests/waterdata_ratings_test.py b/tests/waterdata_ratings_test.py new file mode 100644 index 00000000..fcead65d --- /dev/null +++ b/tests/waterdata_ratings_test.py @@ -0,0 +1,185 @@ +import sys +from urllib.parse import parse_qs, urlsplit + +import pandas as pd +import pytest + +if sys.version_info < (3, 10): + pytest.skip("Skip entire module on Python < 3.10", allow_module_level=True) + +from dataretrieval.waterdata import get_ratings +from dataretrieval.waterdata.ratings import _build_filter + + +def test_build_filter_single_site_single_type(): + f = _build_filter("USGS-01104475", "exsa") + assert f == "monitoring_location_id IN ('USGS-01104475') AND file_type = 'exsa'" + + +def test_build_filter_multi_site_no_type(): + f = _build_filter(["USGS-A", "USGS-B"], None) + assert f == "monitoring_location_id IN ('USGS-A', 'USGS-B')" + + +def test_build_filter_no_site_single_type(): + f = _build_filter(None, "corr") + assert f == "file_type = 'corr'" + + +def test_build_filter_empty_returns_none(): + 
assert _build_filter(None, None) is None + + +def test_get_ratings_rejects_invalid_file_type(): + with pytest.raises(ValueError, match="Invalid file_type"): + get_ratings(monitoring_location_id="USGS-01104475", file_type="bogus") + + +def test_get_ratings_rejects_iso_8601_duration_in_time(): + """STAC ratings doesn't accept ISO 8601 durations; surface a clear error.""" + with pytest.raises(ValueError, match=r"durations.*not supported"): + get_ratings( + monitoring_location_id="USGS-01104475", + time="P7D", + ) + + +def test_build_filter_escapes_quotes(): + """Defends against malformed CQL or injection if an ID contains a quote.""" + f = _build_filter("USGS-x'-y", None) + assert f == "monitoring_location_id IN ('USGS-x''-y')" + + +_SAMPLE_RDB = """\ +# header line one +# header line two +agency_cd\tsite_no\tINDEP\tDEP +5s\t15s\t10n\t10n +USGS\t01104475\t0.10\t0.0 +USGS\t01104475\t0.20\t0.5 +USGS\t01104475\t0.30\t1.2 +""" + + +def _stub_search_response(): + return { + "features": [ + { + "id": "USGS-01104475.exsa.rdb", + "properties": {"file_type": "exsa"}, + "assets": { + "data": { + "href": "https://api.waterdata.usgs.gov/stac-files/ratings/USGS.01104475.exsa.rdb" + } + }, + } + ] + } + + +def test_get_ratings_mocked_search_and_download(requests_mock, tmp_path): + """End-to-end happy path with mocked STAC search + RDB download.""" + requests_mock.get( + "https://api.waterdata.usgs.gov/stac/v0/search", + json=_stub_search_response(), + ) + requests_mock.get( + "https://api.waterdata.usgs.gov/stac-files/ratings/USGS.01104475.exsa.rdb", + text=_SAMPLE_RDB, + ) + + out = get_ratings( + monitoring_location_id="USGS-01104475", + file_type="exsa", + file_path=str(tmp_path), + ) + assert "USGS-01104475.exsa.rdb" in out + df = out["USGS-01104475.exsa.rdb"] + assert isinstance(df, pd.DataFrame) + assert {"INDEP", "DEP"}.issubset(df.columns) + assert len(df) == 3 + + # Server-side filter should pin the single requested file_type. 
+ sent = requests_mock.request_history[0] + qs = parse_qs(urlsplit(sent.url).query) + assert "file_type = 'exsa'" in qs["filter"][0] + assert "monitoring_location_id IN ('USGS-01104475')" in qs["filter"][0] + + +def test_get_ratings_attaches_rdb_comment_and_url(requests_mock, tmp_path): + """Each parsed frame should carry its RDB header + source URL in df.attrs.""" + requests_mock.get( + "https://api.waterdata.usgs.gov/stac/v0/search", + json=_stub_search_response(), + ) + asset_url = ( + "https://api.waterdata.usgs.gov/stac-files/ratings/USGS.01104475.exsa.rdb" + ) + requests_mock.get(asset_url, text=_SAMPLE_RDB) + + out = get_ratings( + monitoring_location_id="USGS-01104475", + file_type="exsa", + file_path=str(tmp_path), + ) + df = out["USGS-01104475.exsa.rdb"] + # The fixture has two `# ...` lines at the top; both should land in attrs. + assert df.attrs["comment"] == [ + "# header line one", + "# header line two", + ] + assert df.attrs["url"] == asset_url + + +def test_get_ratings_download_and_parse_false_returns_features(requests_mock): + requests_mock.get( + "https://api.waterdata.usgs.gov/stac/v0/search", + json=_stub_search_response(), + ) + features = get_ratings( + monitoring_location_id="USGS-01104475", + download_and_parse=False, + ) + assert isinstance(features, list) + assert features[0]["id"] == "USGS-01104475.exsa.rdb" + + +def test_get_ratings_multi_type_filters_via_property(requests_mock, tmp_path): + """File_type list: server filter omits it; local filter reads the property.""" + requests_mock.get( + "https://api.waterdata.usgs.gov/stac/v0/search", + json={ + "features": [ + { + "id": "USGS-X.exsa.rdb", + "properties": {"file_type": "exsa"}, + "assets": {"data": {"href": "https://x.example/X.exsa.rdb"}}, + }, + { + "id": "USGS-X.base.rdb", + "properties": {"file_type": "base"}, + "assets": {"data": {"href": "https://x.example/X.base.rdb"}}, + }, + { + "id": "USGS-X.corr.rdb", + "properties": {"file_type": "corr"}, + "assets": {"data": {"href": 
"https://x.example/X.corr.rdb"}}, + }, + ] + }, + ) + # Only mock the two URLs we expect to be downloaded. + requests_mock.get("https://x.example/X.exsa.rdb", text=_SAMPLE_RDB) + requests_mock.get("https://x.example/X.corr.rdb", text=_SAMPLE_RDB) + + out = get_ratings( + monitoring_location_id="USGS-X", + file_type=["exsa", "corr"], + file_path=str(tmp_path), + ) + assert set(out) == {"USGS-X.exsa.rdb", "USGS-X.corr.rdb"} + + # Server-side filter must NOT include file_type for multi-type requests. + search_req = requests_mock.request_history[0] + qs = parse_qs(urlsplit(search_req.url).query) + assert "file_type" not in qs["filter"][0]