Skip to content
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
**05/06/2026:** Added `waterdata.get_ratings(...)` — wraps the new Water Data STAC catalog (`api.waterdata.usgs.gov/stac/v0/search`) for USGS stage-discharge rating curves. Returns parsed `exsa` / `base` / `corr` rating tables as a dict of DataFrames keyed by feature ID, or just the list of available STAC features when `download_and_parse=False`. Mirrors R's `read_waterdata_ratings`.

**05/06/2026:** Added `waterdata.get_field_measurements_metadata(...)` — wraps the OGC `field-measurements-metadata` collection. Returns one row per (location, parameter) field-measurement series describing its period of record, units, etc., without the underlying observations. Discrete-measurement analogue to `get_time_series_metadata`. Mirrors R's `read_waterdata_field_meta`.

**05/05/2026:** Added `waterdata.get_combined_metadata(...)` — wraps the Water Data API's `combined-metadata` collection, which joins the monitoring-locations catalog with the time-series-metadata catalog and returns one row per (location, parameter, statistic) inventory entry. This is the most flexible "what data is available" endpoint in the API: any location attribute (state, HUC, site type, drainage area, well-construction depth, …) can be combined with any time-series attribute (parameter code, statistic, data type, period of record, …) in a single query. Mirrors R's `read_waterdata_combined_meta`.
Expand Down
71 changes: 14 additions & 57 deletions dataretrieval/nwis.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@
from __future__ import annotations

import warnings
from io import StringIO
from json import JSONDecodeError

import pandas as pd
import requests

from dataretrieval.rdb import read_rdb
from dataretrieval.utils import BaseMetadata

from .utils import query
Expand Down Expand Up @@ -44,6 +44,14 @@
# NAD83
_CRS = "EPSG:4269"

# Column-dtype hints forwarded to pandas.read_csv when parsing NWIS RDB
# tables. Site and parameter codes are forced to str (presumably because
# they are zero-padded identifiers that numeric parsing would mangle —
# standard USGS convention); decimal coordinates are parsed as floats.
_NWIS_RDB_DTYPES = {
    "site_no": str,
    "dec_long_va": float,
    "dec_lat_va": float,
    "parm_cd": str,
    "parameter_cd": str,
}


def _parse_json_or_raise(response: requests.Response) -> pd.DataFrame:
"""Parse a JSON NWIS response, raising a helpful error on HTML responses."""
Expand Down Expand Up @@ -1018,64 +1026,13 @@ def _read_json(json):


def _read_rdb(rdb):
"""
Convert NWIS rdb table into a ``pandas.dataframe``.

Parameters
----------
rdb: string
A string representation of an rdb table

Returns
-------
df: ``pandas.dataframe``
A formatted pandas data frame
"""Parse an NWIS RDB response and apply NWIS-specific post-processing.

Thin wrapper around :func:`dataretrieval.rdb.read_rdb` that adds the
NWIS column-dtype hints and runs :func:`format_response` (datetime
index, multi-site MultiIndex, optional GeoDataFrame).
"""
if "<html>" in rdb.lower() or "<!doctype html>" in rdb.lower():
raise ValueError(
"Received HTML response instead of RDB. This often indicates "
"that the service has been moved or is currently unavailable."
)

count = 0
lines = rdb.splitlines()

for line in lines:
# ignore comment lines
if line.startswith("#"):
count = count + 1

else:
break

if count >= len(lines):
# All lines are comments — the service returned no data rows (e.g.
# "No sites found matching all criteria"). This is a legitimate empty
# result, so return an empty DataFrame rather than raising.
return pd.DataFrame()

fields = lines[count].split("\t")
fields = [field.replace(",", "").strip() for field in fields if field.strip()]
dtypes = {
"site_no": str,
"dec_long_va": float,
"dec_lat_va": float,
"parm_cd": str,
"parameter_cd": str,
}

df = pd.read_csv(
StringIO(rdb),
delimiter="\t",
skiprows=count + 2,
names=fields,
na_values="NaN",
dtype=dtypes,
)

df = format_response(df)
return df
return format_response(read_rdb(rdb, dtypes=_NWIS_RDB_DTYPES))


def _check_sites_value_types(sites):
Expand Down
90 changes: 90 additions & 0 deletions dataretrieval/rdb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
"""Parser for the USGS RDB tab-separated text format.

RDB (Relational DataBase) is the text format used by NWIS web services
and by the Water Data STAC catalog's rating-curve assets. Every RDB
file has the same shape:

- One or more ``#``-prefixed comment lines carrying provenance metadata
(data source, retrieval timestamp, station name, parameter codes, etc.).
- A tab-separated header row naming each column.
- A second tab-separated row giving column format specs (e.g. ``5s 15s``);
it is informational only and skipped during parsing.
- Tab-separated data rows.

This module exposes the parsing primitives that both ``dataretrieval.nwis``
and ``dataretrieval.waterdata.ratings`` use. Callers layer their own
post-processing (NWIS-specific datetime indexing, ratings-specific
``df.attrs`` provenance, etc.) on top of the raw frame.
"""

from __future__ import annotations

from io import StringIO

import pandas as pd


def read_rdb(text: str, dtypes: dict[str, type] | None = None) -> pd.DataFrame:
    """Parse an RDB text response into a ``pandas.DataFrame``.

    Parameters
    ----------
    text : str
        The RDB text response from a USGS web service.
    dtypes : dict[str, type] or None, optional
        Optional column-name to dtype hints, forwarded to
        ``pandas.read_csv``. Unknown column names are silently ignored, so
        callers may safely pass a dict of all columns they might be
        interested in.

    Returns
    -------
    pandas.DataFrame
        The parsed data. An RDB consisting only of comment lines (e.g. a
        "no sites found" response) returns an empty DataFrame, and an RDB
        with a header but no data rows returns an empty DataFrame carrying
        the header's column names, rather than raising.

    Raises
    ------
    ValueError
        If the response body looks like HTML, which usually means the
        service has been moved, is degraded, or returned an error page.
    """
    # Lowercase once; the original computed text.lower() twice.
    lowered = text.lower()
    if "<html>" in lowered or "<!doctype html>" in lowered:
        raise ValueError(
            "Received HTML response instead of RDB. This often indicates "
            "that the service has been moved or is currently unavailable."
        )

    lines = text.splitlines()
    # First non-comment line is the tab-separated header row.
    header_idx = next(
        (i for i, line in enumerate(lines) if not line.startswith("#")),
        len(lines),
    )
    if header_idx == len(lines):
        # All lines are comments — a legitimate empty result.
        return pd.DataFrame()

    fields = [f.replace(",", "").strip() for f in lines[header_idx].split("\t")]
    fields = [f for f in fields if f]

    try:
        return pd.read_csv(
            StringIO(text),
            delimiter="\t",
            skiprows=header_idx + 2,  # +1 for header, +1 for the format-spec row
            names=fields,
            na_values="NaN",
            dtype=dtypes,
        )
    except pd.errors.EmptyDataError:
        # Header + format-spec row but zero data rows: on some pandas
        # versions read_csv raises when skiprows consumes the whole input.
        # Treat it as a legitimate empty result that still carries the
        # column names.
        return pd.DataFrame(columns=fields)


def extract_rdb_comment(text: str) -> list[str]:
    """Collect the ``#``-prefixed comment lines of an RDB body, in order.

    Entries keep their leading ``#`` and surrounding whitespace untouched,
    mirroring what R's ``dataRetrieval`` exposes via ``comment(df)``. The
    comment block holds provenance metadata that parsing otherwise drops —
    data source, retrieval timestamp, parameter codes, rating id and
    last-shifted timestamp for ratings, etc.
    """
    comments: list[str] = []
    for raw_line in text.splitlines():
        if raw_line.startswith("#"):
            comments.append(raw_line)
    return comments
2 changes: 2 additions & 0 deletions dataretrieval/waterdata/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
)
from .filters import FILTER_LANG
from .nearest import get_nearest_continuous
from .ratings import get_ratings
from .types import (
CODE_SERVICES,
PROFILE_LOOKUP,
Expand All @@ -54,6 +55,7 @@
"get_latest_daily",
"get_monitoring_locations",
"get_nearest_continuous",
"get_ratings",
"get_reference_table",
"get_samples",
"get_samples_summary",
Expand Down
Loading
Loading