diff --git a/NEWS.md b/NEWS.md index 2faaeb42..246ede15 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +**05/07/2026:** Bumped the declared minimum Python version from **3.8** to **3.9** (`pyproject.toml`'s `requires-python` and the ruff target). This brings the manifest in line with what was already being tested — CI's matrix has long covered only 3.9, 3.13, and 3.14, the `waterdata` test module already skipped itself on Python < 3.10, and several modules already use 3.9-only stdlib (e.g. `zoneinfo`). Users on 3.8 will no longer be able to install the package; please upgrade. + +**05/07/2026:** `waterdata.get_samples()` and `wqp.get_results()` now append a derived `DateTime` UTC column for every Date/Time/TimeZone triplet in the response (e.g. `Activity_StartDate` + `Activity_StartTime` + `Activity_StartTimeZone` → `Activity_StartDateTime`). Both the WQX3 (`Date`/`Time`/`TimeZone`) and legacy WQP (`Date`/`Time/Time`/`Time/TimeZoneCode`) shapes are recognized; abbreviations like EST/EDT/CST/PST resolve to a UTC `Timestamp`, unknown codes resolve to `NaT`, and the original triplet columns are preserved. Returned rows are also now sorted by `Activity_StartDateTime` (or the legacy `ActivityStartDateTime`) — the underlying APIs return rows in an unstable order. Mirrors R's `create_dateTime` and end-of-pipeline sort. Closes #266. + **05/06/2026:** Each remaining active function in `dataretrieval.nwis` now emits a per-function `DeprecationWarning` naming the `waterdata` replacement to migrate to (visible the first time users call each getter). The `nwis` module is scheduled for removal on or after **2027-05-06**. **05/06/2026:** Added `waterdata.get_ratings(...)` — wraps the new Water Data STAC catalog (`api.waterdata.usgs.gov/stac/v0/search`) for USGS stage-discharge rating curves. Returns parsed `exsa` / `base` / `corr` rating tables as a dict of DataFrames keyed by feature ID, or just the list of available STAC features when `download_and_parse=False`. Mirrors R's `read_waterdata_ratings`. diff --git a/dataretrieval/utils.py b/dataretrieval/utils.py index 4aa76a61..76bbb6ad 100644 --- a/dataretrieval/utils.py +++ b/dataretrieval/utils.py @@ -94,6 +94,108 @@ def format_datetime(df, date_field, time_field, tz_field): return df +# (time-suffix, tz-suffix) pairs that follow a "Date" column. +_TIME_TZ_SUFFIXES = ( + # WQX3 / Samples, e.g. + # Activity_StartDate / Activity_StartTime / Activity_StartTimeZone + ("Time", "TimeZone"), + # Legacy WQP (slash-separated), e.g. + # ActivityStartDate / ActivityStartTime/Time / ActivityStartTime/TimeZoneCode + ("Time/Time", "Time/TimeZoneCode"), +) + + +def _build_utc_datetime( + date_series: pd.Series, time_series: pd.Series, tz_series: pd.Series +) -> pd.Series: + """Combine date + time + tz-abbreviation columns into a UTC pandas Series. + + Unknown timezone codes (and rows missing any of the three values) yield + ``NaT``. The input columns are not mutated. + """ + offsets = tz_series.map(tz) + combined = ( + date_series.astype("string") + + " " + + time_series.astype("string") + + " " + + offsets.astype("string") + ) + return pd.to_datetime( + combined, format="%Y-%m-%d %H:%M:%S %z", utc=True, errors="coerce" + ) + + +def _attach_datetime_columns(df: pd.DataFrame) -> pd.DataFrame: + """Add ``DateTime`` UTC columns for any Date/Time/TimeZone triplets + and sort the frame by the activity-start datetime. 
+ + Detects two naming patterns that appear in USGS Samples and Water Quality + Portal CSV responses: + + * **WQX3** — ``Date``, ``Time``, ``TimeZone`` + * **Legacy WQP** — ``Date``, ``Time/Time``, + ``Time/TimeZoneCode`` + + For every triplet present, a new ``DateTime`` column is appended + holding a UTC ``Timestamp`` (offsets resolved via + :data:`dataretrieval.codes.tz`). The original Date/Time/TimeZone columns + are left intact, and an existing ``DateTime`` column is never + overwritten. + + Rows are sorted (and the index reset) by the canonical activity-start + datetime when present — ``Activity_StartDateTime`` (WQX3) or + ``ActivityStartDateTime`` (legacy WQP) — falling back to the first + detected ``*Date`` column. Mirrors R ``dataRetrieval``'s + end-of-pipeline sort in ``importWQP.R``. + + Parameters + ---------- + df : ``pandas.DataFrame`` + DataFrame returned from a Samples or WQP CSV endpoint. + + Returns + ------- + df : ``pandas.DataFrame`` + A new DataFrame with derivable ``DateTime`` columns appended + and rows sorted by the activity-start datetime (if any date column + was detected). + """ + columns = set(df.columns) + new_columns = {} + first_date_col = None + for col in df.columns: + if not col.endswith("Date"): + continue + if first_date_col is None: + first_date_col = col + prefix = col.removesuffix("Date") + target = prefix + "DateTime" + if target in columns or target in new_columns: + continue + for time_suffix, tz_suffix in _TIME_TZ_SUFFIXES: + time_col = prefix + time_suffix + tz_col = prefix + tz_suffix + if time_col in columns and tz_col in columns: + new_columns[target] = _build_utc_datetime( + df[col], df[time_col], df[tz_col] + ) + break + if new_columns: + # Concat in one shot — per-column assignment on a wide CSV-derived + # frame triggers pandas' fragmentation PerformanceWarning. + df = pd.concat([df, pd.DataFrame(new_columns, index=df.index)], axis=1) + if "Activity_StartDateTime" in df.columns: + sort_key = "Activity_StartDateTime" + elif "ActivityStartDateTime" in df.columns: + sort_key = "ActivityStartDateTime" + else: + sort_key = first_date_col + if sort_key is not None: + df = df.sort_values(by=sort_key, ignore_index=True) + return df + + class BaseMetadata: """Base class for metadata. diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index 886a989c..bca8dc40 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -16,7 +16,7 @@ import requests from requests.models import PreparedRequest -from dataretrieval.utils import BaseMetadata, to_str +from dataretrieval.utils import BaseMetadata, _attach_datetime_columns, to_str from dataretrieval.waterdata.filters import FILTER_LANG from dataretrieval.waterdata.types import ( CODE_SERVICES, @@ -2266,7 +2266,15 @@ def get_samples( Returns ------- df : ``pandas.DataFrame`` - Formatted data returned from the API query. + Formatted data returned from the API query. For each + ``Date`` / ``Time`` / ``TimeZone`` triplet in + the response (e.g. ``Activity_StartDate``, ``Activity_StartTime``, + ``Activity_StartTimeZone``), an additional ``DateTime`` column + is appended holding a UTC ``Timestamp`` derived from the three. The + original Date/Time/TimeZone columns are left intact; rows whose + timezone abbreviation is not recognized resolve to ``NaT``. Rows are + sorted by ``Activity_StartDateTime`` when present (the API's default + order is unstable). md : :obj:`dataretrieval.utils.Metadata` Custom ``dataretrieval`` metadata object pertaining to the query. 
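To make the new conversion concrete, here is a minimal, self-contained sketch of the recipe `_build_utc_datetime` applies to a WQX3-shaped triplet, runnable outside the package. The `tz` mapping below is a hand-picked stand-in for `dataretrieval.codes.tz` (whose full contents are not shown in this diff); the column names follow the `Activity_Start*` example from the NEWS entry.

```python
import pandas as pd

# Stand-in for dataretrieval.codes.tz (assumed subset; the real mapping is
# not shown in this diff).
tz = {"EST": "-0500", "EDT": "-0400", "CST": "-0600", "PST": "-0800"}

df = pd.DataFrame(
    {
        "Activity_StartDate": ["2024-01-09", "2024-02-15", "2024-03-01"],
        "Activity_StartTime": ["10:00:00", "14:30:00", "09:00:00"],
        "Activity_StartTimeZone": ["PST", "EST", "BOGUS"],
    }
)

# Same recipe as _build_utc_datetime: map abbreviation -> numeric offset,
# join the three parts into one string, and let pandas parse it as UTC.
offsets = df["Activity_StartTimeZone"].map(tz)
combined = (
    df["Activity_StartDate"].astype("string")
    + " "
    + df["Activity_StartTime"].astype("string")
    + " "
    + offsets.astype("string")
)
df["Activity_StartDateTime"] = pd.to_datetime(
    combined, format="%Y-%m-%d %H:%M:%S %z", utc=True, errors="coerce"
)

print(df["Activity_StartDateTime"].tolist())
# [Timestamp('2024-01-09 18:00:00+0000', tz='UTC'),   # 10:00 PST
#  Timestamp('2024-02-15 19:30:00+0000', tz='UTC'),   # 14:30 EST
#  NaT]                                               # unknown code coerced
```

Note that `errors="coerce"` is what turns both unrecognized abbreviations and rows missing any of the three values into `NaT` rather than raising.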
@@ -2323,6 +2331,7 @@ def get_samples( response.raise_for_status() df = pd.read_csv(StringIO(response.text), delimiter=",") + df = _attach_datetime_columns(df) return df, BaseMetadata(response) diff --git a/dataretrieval/waterdata/ratings.py b/dataretrieval/waterdata/ratings.py index a1d0a3bb..f5a1a0ff 100644 --- a/dataretrieval/waterdata/ratings.py +++ b/dataretrieval/waterdata/ratings.py @@ -14,7 +14,8 @@ import logging import os -from typing import Any, Iterable, Literal, get_args +from collections.abc import Iterable +from typing import Any, Literal, get_args import pandas as pd import requests diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 784f2969..413da7dd 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -6,10 +6,10 @@ import re from datetime import datetime from typing import Any, get_args +from zoneinfo import ZoneInfo import pandas as pd import requests -from zoneinfo import ZoneInfo from dataretrieval import __version__ from dataretrieval.utils import BaseMetadata diff --git a/dataretrieval/wqp.py b/dataretrieval/wqp.py index 24e1737e..8cfc6ca1 100644 --- a/dataretrieval/wqp.py +++ b/dataretrieval/wqp.py @@ -17,7 +17,7 @@ import pandas as pd -from .utils import BaseMetadata, query +from .utils import BaseMetadata, _attach_datetime_columns, query if TYPE_CHECKING: from pandas import DataFrame @@ -101,7 +101,14 @@ def get_results( Returns ------- df : ``pandas.DataFrame`` - Formatted data returned from the API query. + Formatted data returned from the API query. For each + ``Date`` / ``Time`` / ``TimeZone`` triplet in + the response (legacy WQP uses ``Time/Time`` and + ``Time/TimeZoneCode``), an additional ``DateTime`` + column is appended holding a UTC ``Timestamp``. Original triplet + columns are preserved; unrecognized timezone codes yield ``NaT``. + Rows are sorted by ``ActivityStartDateTime`` (or ``Activity_StartDateTime`` + for WQX3 responses) when present. md : :obj:`dataretrieval.utils.Metadata` Custom ``dataretrieval`` metadata object pertaining to the query. @@ -147,6 +154,7 @@ def get_results( response = query(url, kwargs, delimiter=";", ssl_check=ssl_check) df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False) + df = _attach_datetime_columns(df) return df, WQP_Metadata(response) diff --git a/pyproject.toml b/pyproject.toml index 1322dcc3..35edcc5e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta" name = "dataretrieval" description = "Discover and retrieve water data from U.S. federal hydrologic web services." 
readme = "README.md" -requires-python = ">=3.8" +requires-python = ">=3.9" keywords = ["USGS", "water data"] license = "CC0-1.0" license-files = ["LICENSE.md"] @@ -63,7 +63,7 @@ repository = "https://github.com/DOI-USGS/dataretrieval-python.git" write_to = "dataretrieval/_version.py" [tool.ruff] -target-version = "py38" +target-version = "py39" extend-exclude = ["demos"] [tool.ruff.lint] diff --git a/tests/utils_test.py b/tests/utils_test.py index 4cb9b383..2c350b2b 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -97,3 +97,61 @@ def test_to_str_custom_delimiter(self): def test_to_str_non_iterable(self): assert utils.to_str(123) is None + + +class Test_attach_datetime_columns: + """Tests of _attach_datetime_columns, which derives DateTime UTC + columns from Date/Time/TimeZone triplets in Samples and WQP CSVs.""" + + def test_wqx3_triplet_resolves_to_utc(self): + df = pd.DataFrame( + { + "Activity_StartDate": ["2024-01-09", "2024-02-15"], + "Activity_StartTime": ["10:00:00", "14:30:00"], + "Activity_StartTimeZone": ["PST", "EST"], + } + ) + df = utils._attach_datetime_columns(df) + assert df["Activity_StartDateTime"][0] == pd.Timestamp( + "2024-01-09 18:00:00", tz="UTC" + ) + assert df["Activity_StartDateTime"][1] == pd.Timestamp( + "2024-02-15 19:30:00", tz="UTC" + ) + assert df["Activity_StartTimeZone"].tolist() == ["PST", "EST"] + + def test_legacy_wqp_triplet_resolves_to_utc(self): + df = pd.DataFrame( + { + "ActivityStartDate": ["2024-01-09"], + "ActivityStartTime/Time": ["10:00:00"], + "ActivityStartTime/TimeZoneCode": ["PST"], + } + ) + df = utils._attach_datetime_columns(df) + assert df["ActivityStartDateTime"][0] == pd.Timestamp( + "2024-01-09 18:00:00", tz="UTC" + ) + + def test_unknown_timezone_is_NaT(self): + df = pd.DataFrame( + { + "Activity_StartDate": ["2024-01-09"], + "Activity_StartTime": ["10:00:00"], + "Activity_StartTimeZone": ["BOGUS"], + } + ) + df = utils._attach_datetime_columns(df) + assert df["Activity_StartDateTime"].isna().all() + + def test_existing_datetime_column_not_overwritten(self): + df = pd.DataFrame( + { + "Activity_StartDate": ["2024-01-09"], + "Activity_StartTime": ["10:00:00"], + "Activity_StartTimeZone": ["PST"], + "Activity_StartDateTime": ["preexisting"], + } + ) + df = utils._attach_datetime_columns(df) + assert df["Activity_StartDateTime"].tolist() == ["preexisting"] diff --git a/tests/waterdata_filters_test.py b/tests/waterdata_filters_test.py index 21eb6c1b..545f7039 100644 --- a/tests/waterdata_filters_test.py +++ b/tests/waterdata_filters_test.py @@ -190,14 +190,18 @@ def fake_walk_pages(*_args, **_kwargs): frame = pd.DataFrame({"id": [f"chunk-{idx}"], "value": [idx]}) return frame, _fake_response() - with mock.patch( - "dataretrieval.waterdata.utils._construct_api_requests", - side_effect=fake_construct_api_requests, - ), mock.patch( - "dataretrieval.waterdata.utils._walk_pages", side_effect=fake_walk_pages - ), mock.patch( - "dataretrieval.waterdata.filters._effective_filter_budget", - return_value=_CQL_FILTER_CHUNK_LEN, + with ( + mock.patch( + "dataretrieval.waterdata.utils._construct_api_requests", + side_effect=fake_construct_api_requests, + ), + mock.patch( + "dataretrieval.waterdata.utils._walk_pages", side_effect=fake_walk_pages + ), + mock.patch( + "dataretrieval.waterdata.filters._effective_filter_budget", + return_value=_CQL_FILTER_CHUNK_LEN, + ), ): df, _ = get_continuous( monitoring_location_id="USGS-07374525", @@ -239,14 +243,18 @@ def fake_walk_pages(*_args, **_kwargs): frame = pd.DataFrame({"id": 
["shared-feature"], "value": [1]}) return frame, _fake_response() - with mock.patch( - "dataretrieval.waterdata.utils._construct_api_requests", - return_value=_fake_prepared_request(), - ), mock.patch( - "dataretrieval.waterdata.utils._walk_pages", side_effect=fake_walk_pages - ), mock.patch( - "dataretrieval.waterdata.filters._effective_filter_budget", - return_value=_CQL_FILTER_CHUNK_LEN, + with ( + mock.patch( + "dataretrieval.waterdata.utils._construct_api_requests", + return_value=_fake_prepared_request(), + ), + mock.patch( + "dataretrieval.waterdata.utils._walk_pages", side_effect=fake_walk_pages + ), + mock.patch( + "dataretrieval.waterdata.filters._effective_filter_budget", + return_value=_CQL_FILTER_CHUNK_LEN, + ), ): df, _ = get_continuous( monitoring_location_id="USGS-07374525", @@ -293,14 +301,18 @@ def fake_walk_pages(*_args, **_kwargs): ) return frame, _fake_response() - with mock.patch( - "dataretrieval.waterdata.utils._construct_api_requests", - return_value=_fake_prepared_request(), - ), mock.patch( - "dataretrieval.waterdata.utils._walk_pages", side_effect=fake_walk_pages - ), mock.patch( - "dataretrieval.waterdata.filters._effective_filter_budget", - return_value=_CQL_FILTER_CHUNK_LEN, + with ( + mock.patch( + "dataretrieval.waterdata.utils._construct_api_requests", + return_value=_fake_prepared_request(), + ), + mock.patch( + "dataretrieval.waterdata.utils._walk_pages", side_effect=fake_walk_pages + ), + mock.patch( + "dataretrieval.waterdata.filters._effective_filter_budget", + return_value=_CQL_FILTER_CHUNK_LEN, + ), ): df, _ = get_continuous( monitoring_location_id="USGS-07374525", @@ -434,14 +446,17 @@ def fake_construct_api_requests(**kwargs): sent_filters.append(kwargs.get("filter")) return _fake_prepared_request() - with mock.patch( - "dataretrieval.waterdata.utils._construct_api_requests", - side_effect=fake_construct_api_requests, - ), mock.patch( - "dataretrieval.waterdata.utils._walk_pages", - return_value=( - pd.DataFrame({"id": ["row-1"], "value": [1]}), - _fake_response(), + with ( + mock.patch( + "dataretrieval.waterdata.utils._construct_api_requests", + side_effect=fake_construct_api_requests, + ), + mock.patch( + "dataretrieval.waterdata.utils._walk_pages", + return_value=( + pd.DataFrame({"id": ["row-1"], "value": [1]}), + _fake_response(), + ), ), ): get_continuous( diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index 1edf012e..b53ee296 100644 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -54,11 +54,13 @@ def test_mock_get_samples(requests_mock): monitoringLocationIdentifier="USGS-05406500", ) assert type(df) is DataFrame - assert df.size == 12127 + # 181 source columns + 6 derived DateTime columns + assert df.shape == (67, 187) assert md.url == request_url assert isinstance(md.query_time, datetime.timedelta) assert md.header == {"mock_header": "value"} assert md.comment is None + assert df["Activity_StartDateTime"].notna().any() def test_mock_get_samples_summary(requests_mock): @@ -127,7 +129,7 @@ def test_samples_activity(): monitoringLocationIdentifier="USGS-06719505", ) assert len(df) > 0 - assert len(df.columns) == 95 + assert len(df.columns) == 97 assert "Location_HUCTwelveDigitCode" in df.columns diff --git a/tests/wqp_test.py b/tests/wqp_test.py index a337f7ec..f432ab26 100644 --- a/tests/wqp_test.py +++ b/tests/wqp_test.py @@ -33,11 +33,12 @@ def test_get_results(requests_mock): startDateHi="09-30-2011", ) assert type(df) is DataFrame - assert df.size == 315 + assert df.shape == (5, 65) assert md.url == 
request_url assert isinstance(md.query_time, datetime.timedelta) assert md.header == {"mock_header": "value"} assert md.comment is None + assert df["ActivityStartDateTime"].notna().all() def test_get_results_WQX3(requests_mock): @@ -58,11 +59,12 @@ def test_get_results_WQX3(requests_mock): startDateHi="09-30-2011", ) assert type(df) is DataFrame - assert df.size == 900 + assert df.shape == (5, 186) assert md.url == request_url assert isinstance(md.query_time, datetime.timedelta) assert md.header == {"mock_header": "value"} assert md.comment is None + assert df["Activity_StartDateTime"].notna().all() def test_what_sites(requests_mock):
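For the legacy WQP shape and the end-of-pipeline sort exercised by the tests above, a sketch calling the helper directly (it is private and new in this diff, so downstream code would normally see this behavior only through `wqp.get_results()`); the import mirrors the test module's `from dataretrieval import utils`:

```python
import pandas as pd

from dataretrieval import utils

# Legacy slash-separated WQP columns, deliberately out of chronological order.
df = pd.DataFrame(
    {
        "ActivityStartDate": ["2024-06-01", "2024-01-09"],
        "ActivityStartTime/Time": ["08:00:00", "10:00:00"],
        "ActivityStartTime/TimeZoneCode": ["CST", "PST"],
    }
)

df = utils._attach_datetime_columns(df)

# A derived UTC column is appended, the original triplet survives, and the
# frame comes back sorted by ActivityStartDateTime with the index reset.
assert "ActivityStartTime/TimeZoneCode" in df.columns
assert df["ActivityStartDate"].tolist() == ["2024-01-09", "2024-06-01"]
assert df["ActivityStartDateTime"].is_monotonic_increasing
```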