4 changes: 4 additions & 0 deletions NEWS.md
@@ -1,3 +1,7 @@
**05/07/2026:** Bumped the declared minimum Python version from **3.8** to **3.9** (`pyproject.toml`'s `requires-python` and the ruff target). This brings the manifest in line with what was already being tested — CI's matrix has long covered only 3.9, 3.13, and 3.14, the `waterdata` test module already skipped itself on Python < 3.10, and several modules already use 3.9-only stdlib (e.g. `zoneinfo`). Users on 3.8 will no longer be able to install the package; please upgrade.

**05/07/2026:** `waterdata.get_samples()` and `wqp.get_results()` now append a derived `<prefix>DateTime` UTC column for every Date/Time/TimeZone triplet in the response (e.g. `Activity_StartDate` + `Activity_StartTime` + `Activity_StartTimeZone` → `Activity_StartDateTime`). Both the WQX3 (`<X>Date`/`<X>Time`/`<X>TimeZone`) and legacy WQP (`<X>Date`/`<X>Time/Time`/`<X>Time/TimeZoneCode`) shapes are recognized; abbreviations like EST/EDT/CST/PST resolve to a UTC `Timestamp`, unknown codes resolve to `NaT`, and the original triplet columns are preserved. Returned rows are also now sorted by `Activity_StartDateTime` (or the legacy `ActivityStartDateTime`) — the underlying APIs return rows in an unstable order. Mirrors R's `create_dateTime` and end-of-pipeline sort. Closes #266.
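A minimal usage sketch of the derived column (the query parameters below are illustrative, not taken from this changelog):

```python
from dataretrieval import wqp

# Legacy WQP shape: ActivityStartDate + ActivityStartTime/Time +
# ActivityStartTime/TimeZoneCode combine into ActivityStartDateTime (UTC).
df, md = wqp.get_results(
    siteid="USGS-01594440",       # illustrative site
    characteristicName="pH",      # illustrative characteristic
)
# tz-aware UTC Timestamps; NaT wherever the timezone code is unrecognized
print(df["ActivityStartDateTime"].head())
```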

**05/06/2026:** Each remaining active function in `dataretrieval.nwis` now emits a per-function `DeprecationWarning` naming the `waterdata` replacement to migrate to (visible the first time users call each getter). The `nwis` module is scheduled for removal on or after **2027-05-06**.
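A short sketch of surfacing the new warnings (the exact replacement named depends on the getter; which `waterdata` function each warning points at is not spelled out here):

```python
import warnings

from dataretrieval import nwis

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")  # surface the warning on every call, not just the first
    df, md = nwis.get_dv(sites="01491000", start="2024-01-01", end="2024-01-31")

# Assumes the first recorded warning is the deprecation notice.
print(caught[0].category.__name__)  # DeprecationWarning
print(caught[0].message)            # names the waterdata function to migrate to
```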

**05/06/2026:** Added `waterdata.get_ratings(...)` — wraps the new Water Data STAC catalog (`api.waterdata.usgs.gov/stac/v0/search`) for USGS stage-discharge rating curves. Returns parsed `exsa` / `base` / `corr` rating tables as a dict of DataFrames keyed by feature ID, or just the list of available STAC features when `download_and_parse=False`. Mirrors R's `read_waterdata_ratings`.
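A hedged sketch of both modes (only `download_and_parse` is documented above; the `monitoring_location_id` parameter name and example site are assumptions based on the R counterpart):

```python
from dataretrieval import waterdata

# Discovery only: the list of available STAC features for a site.
features = waterdata.get_ratings(
    monitoring_location_id="USGS-01594440",  # assumed parameter name
    download_and_parse=False,
)

# Full retrieval: a dict of parsed rating-table DataFrames keyed by feature ID.
tables = waterdata.get_ratings(monitoring_location_id="USGS-01594440")
first_id = next(iter(tables))
print(tables[first_id].head())  # e.g. an `exsa` stage-discharge table
```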
102 changes: 102 additions & 0 deletions dataretrieval/utils.py
@@ -94,6 +94,108 @@ def format_datetime(df, date_field, time_field, tz_field):
return df


# (time-suffix, tz-suffix) pairs that follow a "<prefix>Date" column.
_TIME_TZ_SUFFIXES = (
# WQX3 / Samples, e.g.
# Activity_StartDate / Activity_StartTime / Activity_StartTimeZone
("Time", "TimeZone"),
# Legacy WQP (slash-separated), e.g.
# ActivityStartDate / ActivityStartTime/Time / ActivityStartTime/TimeZoneCode
("Time/Time", "Time/TimeZoneCode"),
)


def _build_utc_datetime(
date_series: pd.Series, time_series: pd.Series, tz_series: pd.Series
) -> pd.Series:
"""Combine date + time + tz-abbreviation columns into a UTC pandas Series.

Unknown timezone codes (and rows missing any of the three values) yield
``NaT``. The input columns are not mutated.
"""
offsets = tz_series.map(tz)
combined = (
date_series.astype("string")
+ " "
+ time_series.astype("string")
+ " "
+ offsets.astype("string")
)
return pd.to_datetime(
combined, format="%Y-%m-%d %H:%M:%S %z", utc=True, errors="coerce"
)


def _attach_datetime_columns(df: pd.DataFrame) -> pd.DataFrame:
"""Add ``<prefix>DateTime`` UTC columns for any Date/Time/TimeZone triplets
and sort the frame by the activity-start datetime.

Detects two naming patterns that appear in USGS Samples and Water Quality
Portal CSV responses:

* **WQX3** — ``<prefix>Date``, ``<prefix>Time``, ``<prefix>TimeZone``
* **Legacy WQP** — ``<prefix>Date``, ``<prefix>Time/Time``,
``<prefix>Time/TimeZoneCode``

For every triplet present, a new ``<prefix>DateTime`` column is appended
holding a UTC ``Timestamp`` (offsets resolved via
:data:`dataretrieval.codes.tz`). The original Date/Time/TimeZone columns
are left intact, and an existing ``<prefix>DateTime`` column is never
overwritten.

Rows are sorted (and the index reset) by the canonical activity-start
datetime when present — ``Activity_StartDateTime`` (WQX3) or
``ActivityStartDateTime`` (legacy WQP) — falling back to the first
detected ``*Date`` column. Mirrors R ``dataRetrieval``'s
end-of-pipeline sort in ``importWQP.R``.

Parameters
----------
df : ``pandas.DataFrame``
DataFrame returned from a Samples or WQP CSV endpoint.

Returns
-------
df : ``pandas.DataFrame``
A new DataFrame with derivable ``<prefix>DateTime`` columns appended
and rows sorted by the activity-start datetime (if any date column
was detected).
"""
columns = set(df.columns)
new_columns = {}
first_date_col = None
for col in df.columns:
if not col.endswith("Date"):
continue
if first_date_col is None:
first_date_col = col
prefix = col.removesuffix("Date")
target = prefix + "DateTime"
if target in columns or target in new_columns:
continue
for time_suffix, tz_suffix in _TIME_TZ_SUFFIXES:
time_col = prefix + time_suffix
tz_col = prefix + tz_suffix
if time_col in columns and tz_col in columns:
new_columns[target] = _build_utc_datetime(
df[col], df[time_col], df[tz_col]
)
break
if new_columns:
# Concat in one shot — per-column assignment on a wide CSV-derived
# frame triggers pandas' fragmentation PerformanceWarning.
df = pd.concat([df, pd.DataFrame(new_columns, index=df.index)], axis=1)
if "Activity_StartDateTime" in df.columns:
sort_key = "Activity_StartDateTime"
elif "ActivityStartDateTime" in df.columns:
sort_key = "ActivityStartDateTime"
else:
sort_key = first_date_col
if sort_key is not None:
df = df.sort_values(by=sort_key, ignore_index=True)
return df


class BaseMetadata:
"""Base class for metadata.

13 changes: 11 additions & 2 deletions dataretrieval/waterdata/api.py
@@ -16,7 +16,7 @@
import requests
from requests.models import PreparedRequest

from dataretrieval.utils import BaseMetadata, to_str
from dataretrieval.utils import BaseMetadata, _attach_datetime_columns, to_str
from dataretrieval.waterdata.filters import FILTER_LANG
from dataretrieval.waterdata.types import (
CODE_SERVICES,
@@ -2266,7 +2266,15 @@ def get_samples(
Returns
-------
df : ``pandas.DataFrame``
Formatted data returned from the API query.
Formatted data returned from the API query. For each
``<prefix>Date`` / ``<prefix>Time`` / ``<prefix>TimeZone`` triplet in
the response (e.g. ``Activity_StartDate``, ``Activity_StartTime``,
``Activity_StartTimeZone``), an additional ``<prefix>DateTime`` column
is appended holding a UTC ``Timestamp`` derived from the three. The
original Date/Time/TimeZone columns are left intact; rows whose
timezone abbreviation is not recognized get ``NaT`` in the derived
column. Rows are sorted by ``Activity_StartDateTime`` when present
(the API's default order is unstable).
md : :obj:`dataretrieval.utils.Metadata`
Custom ``dataretrieval`` metadata object pertaining to the query.

@@ -2323,6 +2331,7 @@ def get_samples(
response.raise_for_status()

df = pd.read_csv(StringIO(response.text), delimiter=",")
df = _attach_datetime_columns(df)

return df, BaseMetadata(response)

3 changes: 2 additions & 1 deletion dataretrieval/waterdata/ratings.py
@@ -14,7 +14,8 @@

import logging
import os
from typing import Any, Iterable, Literal, get_args
from collections.abc import Iterable
from typing import Any, Literal, get_args

import pandas as pd
import requests
2 changes: 1 addition & 1 deletion dataretrieval/waterdata/utils.py
@@ -6,10 +6,10 @@
import re
from datetime import datetime
from typing import Any, get_args
from zoneinfo import ZoneInfo

import pandas as pd
import requests
from zoneinfo import ZoneInfo

from dataretrieval import __version__
from dataretrieval.utils import BaseMetadata
12 changes: 10 additions & 2 deletions dataretrieval/wqp.py
@@ -17,7 +17,7 @@

import pandas as pd

from .utils import BaseMetadata, query
from .utils import BaseMetadata, _attach_datetime_columns, query

if TYPE_CHECKING:
from pandas import DataFrame
@@ -101,7 +101,14 @@ def get_results(
Returns
-------
df : ``pandas.DataFrame``
Formatted data returned from the API query.
Formatted data returned from the API query. For each
``<prefix>Date`` / ``<prefix>Time`` / ``<prefix>TimeZone`` triplet in
the response (legacy WQP uses ``<prefix>Time/Time`` and
``<prefix>Time/TimeZoneCode``), an additional ``<prefix>DateTime``
column is appended holding a UTC ``Timestamp``. Original triplet
columns are preserved; unrecognized timezone codes yield ``NaT``.
Rows are sorted by ``ActivityStartDateTime`` (or ``Activity_StartDateTime``
for WQX3 responses) when present.
md : :obj:`dataretrieval.utils.Metadata`
Custom ``dataretrieval`` metadata object pertaining to the query.

@@ -147,6 +154,7 @@ def get_results(
response = query(url, kwargs, delimiter=";", ssl_check=ssl_check)

df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False)
df = _attach_datetime_columns(df)
return df, WQP_Metadata(response)


4 changes: 2 additions & 2 deletions pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
name = "dataretrieval"
description = "Discover and retrieve water data from U.S. federal hydrologic web services."
readme = "README.md"
requires-python = ">=3.8"
requires-python = ">=3.9"
keywords = ["USGS", "water data"]
license = "CC0-1.0"
license-files = ["LICENSE.md"]
@@ -63,7 +63,7 @@ repository = "https://github.com/DOI-USGS/dataretrieval-python.git"
write_to = "dataretrieval/_version.py"

[tool.ruff]
target-version = "py38"
target-version = "py39"
extend-exclude = ["demos"]

[tool.ruff.lint]
58 changes: 58 additions & 0 deletions tests/utils_test.py
@@ -97,3 +97,61 @@ def test_to_str_custom_delimiter(self):

def test_to_str_non_iterable(self):
assert utils.to_str(123) is None


class Test_attach_datetime_columns:
"""Tests of _attach_datetime_columns, which derives <prefix>DateTime UTC
columns from Date/Time/TimeZone triplets in Samples and WQP CSVs."""

def test_wqx3_triplet_resolves_to_utc(self):
df = pd.DataFrame(
{
"Activity_StartDate": ["2024-01-09", "2024-02-15"],
"Activity_StartTime": ["10:00:00", "14:30:00"],
"Activity_StartTimeZone": ["PST", "EST"],
}
)
df = utils._attach_datetime_columns(df)
assert df["Activity_StartDateTime"][0] == pd.Timestamp(
"2024-01-09 18:00:00", tz="UTC"
)
assert df["Activity_StartDateTime"][1] == pd.Timestamp(
"2024-02-15 19:30:00", tz="UTC"
)
assert df["Activity_StartTimeZone"].tolist() == ["PST", "EST"]

def test_legacy_wqp_triplet_resolves_to_utc(self):
df = pd.DataFrame(
{
"ActivityStartDate": ["2024-01-09"],
"ActivityStartTime/Time": ["10:00:00"],
"ActivityStartTime/TimeZoneCode": ["PST"],
}
)
df = utils._attach_datetime_columns(df)
assert df["ActivityStartDateTime"][0] == pd.Timestamp(
"2024-01-09 18:00:00", tz="UTC"
)

def test_unknown_timezone_is_NaT(self):
df = pd.DataFrame(
{
"Activity_StartDate": ["2024-01-09"],
"Activity_StartTime": ["10:00:00"],
"Activity_StartTimeZone": ["BOGUS"],
}
)
df = utils._attach_datetime_columns(df)
assert df["Activity_StartDateTime"].isna().all()

def test_existing_datetime_column_not_overwritten(self):
df = pd.DataFrame(
{
"Activity_StartDate": ["2024-01-09"],
"Activity_StartTime": ["10:00:00"],
"Activity_StartTimeZone": ["PST"],
"Activity_StartDateTime": ["preexisting"],
}
)
df = utils._attach_datetime_columns(df)
assert df["Activity_StartDateTime"].tolist() == ["preexisting"]
79 changes: 47 additions & 32 deletions tests/waterdata_filters_test.py
@@ -190,14 +190,18 @@ def fake_walk_pages(*_args, **_kwargs):
frame = pd.DataFrame({"id": [f"chunk-{idx}"], "value": [idx]})
return frame, _fake_response()

with mock.patch(
"dataretrieval.waterdata.utils._construct_api_requests",
side_effect=fake_construct_api_requests,
), mock.patch(
"dataretrieval.waterdata.utils._walk_pages", side_effect=fake_walk_pages
), mock.patch(
"dataretrieval.waterdata.filters._effective_filter_budget",
return_value=_CQL_FILTER_CHUNK_LEN,
with (
mock.patch(
"dataretrieval.waterdata.utils._construct_api_requests",
side_effect=fake_construct_api_requests,
),
mock.patch(
"dataretrieval.waterdata.utils._walk_pages", side_effect=fake_walk_pages
),
mock.patch(
"dataretrieval.waterdata.filters._effective_filter_budget",
return_value=_CQL_FILTER_CHUNK_LEN,
),
):
df, _ = get_continuous(
monitoring_location_id="USGS-07374525",
@@ -239,14 +243,18 @@ def fake_walk_pages(*_args, **_kwargs):
frame = pd.DataFrame({"id": ["shared-feature"], "value": [1]})
return frame, _fake_response()

with mock.patch(
"dataretrieval.waterdata.utils._construct_api_requests",
return_value=_fake_prepared_request(),
), mock.patch(
"dataretrieval.waterdata.utils._walk_pages", side_effect=fake_walk_pages
), mock.patch(
"dataretrieval.waterdata.filters._effective_filter_budget",
return_value=_CQL_FILTER_CHUNK_LEN,
with (
mock.patch(
"dataretrieval.waterdata.utils._construct_api_requests",
return_value=_fake_prepared_request(),
),
mock.patch(
"dataretrieval.waterdata.utils._walk_pages", side_effect=fake_walk_pages
),
mock.patch(
"dataretrieval.waterdata.filters._effective_filter_budget",
return_value=_CQL_FILTER_CHUNK_LEN,
),
):
df, _ = get_continuous(
monitoring_location_id="USGS-07374525",
@@ -293,14 +301,18 @@ def fake_walk_pages(*_args, **_kwargs):
)
return frame, _fake_response()

with mock.patch(
"dataretrieval.waterdata.utils._construct_api_requests",
return_value=_fake_prepared_request(),
), mock.patch(
"dataretrieval.waterdata.utils._walk_pages", side_effect=fake_walk_pages
), mock.patch(
"dataretrieval.waterdata.filters._effective_filter_budget",
return_value=_CQL_FILTER_CHUNK_LEN,
with (
mock.patch(
"dataretrieval.waterdata.utils._construct_api_requests",
return_value=_fake_prepared_request(),
),
mock.patch(
"dataretrieval.waterdata.utils._walk_pages", side_effect=fake_walk_pages
),
mock.patch(
"dataretrieval.waterdata.filters._effective_filter_budget",
return_value=_CQL_FILTER_CHUNK_LEN,
),
):
df, _ = get_continuous(
monitoring_location_id="USGS-07374525",
@@ -434,14 +446,17 @@ def fake_construct_api_requests(**kwargs):
sent_filters.append(kwargs.get("filter"))
return _fake_prepared_request()

with mock.patch(
"dataretrieval.waterdata.utils._construct_api_requests",
side_effect=fake_construct_api_requests,
), mock.patch(
"dataretrieval.waterdata.utils._walk_pages",
return_value=(
pd.DataFrame({"id": ["row-1"], "value": [1]}),
_fake_response(),
with (
mock.patch(
"dataretrieval.waterdata.utils._construct_api_requests",
side_effect=fake_construct_api_requests,
),
mock.patch(
"dataretrieval.waterdata.utils._walk_pages",
return_value=(
pd.DataFrame({"id": ["row-1"], "value": [1]}),
_fake_response(),
),
),
):
get_continuous(