From 8aae7b41532c0fa3250bbd8b3aafac5b5d56dd88 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Thu, 7 May 2026 07:03:50 -0500 Subject: [PATCH 01/12] Parse Date/Time/TimeZone triplets in samples and WQP responses Add a shared utils.attach_datetime_columns helper that scans a CSV-derived DataFrame for Date / Time / TimeZone triplets and appends a derived DateTime UTC column for each one, leaving the original triplet columns intact. Recognizes both the WQX3 / Samples naming (Activity_StartDate, Activity_StartTime, Activity_StartTimeZone) and the legacy WQP naming (ActivityStartDate, ActivityStartTime/Time, ActivityStartTime/TimeZoneCode). Mirrors R dataRetrieval's create_dateTime. Wired into waterdata.get_samples and wqp.get_results. Closes #266. Co-Authored-By: Claude Opus 4.7 (1M context) --- NEWS.md | 2 + dataretrieval/utils.py | 92 ++++++++++++++++++++++++++++ dataretrieval/waterdata/api.py | 11 +++- dataretrieval/wqp.py | 10 ++- tests/utils_test.py | 107 +++++++++++++++++++++++++++++++++ tests/waterdata_test.py | 15 ++++- tests/wqp_test.py | 12 +++- 7 files changed, 241 insertions(+), 8 deletions(-) diff --git a/NEWS.md b/NEWS.md index 2faaeb42..8ddd3282 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,5 @@ +**05/07/2026:** `waterdata.get_samples()` and `wqp.get_results()` now append a derived `DateTime` UTC column for every Date/Time/TimeZone triplet in the response (e.g. `Activity_StartDate` + `Activity_StartTime` + `Activity_StartTimeZone` → `Activity_StartDateTime`). Both the WQX3 (`Date`/`Time`/`TimeZone`) and legacy WQP (`Date`/`Time/Time`/`Time/TimeZoneCode`) shapes are recognized; abbreviations like EST/EDT/CST/PST resolve to a UTC `Timestamp`, unknown codes resolve to `NaT`, and the original triplet columns are preserved. Mirrors R's `create_dateTime` behavior. Closes #266. 
+ **05/06/2026:** Each remaining active function in `dataretrieval.nwis` now emits a per-function `DeprecationWarning` naming the `waterdata` replacement to migrate to (visible the first time users call each getter). The `nwis` module is scheduled for removal on or after **2027-05-06**. **05/06/2026:** Added `waterdata.get_ratings(...)` — wraps the new Water Data STAC catalog (`api.waterdata.usgs.gov/stac/v0/search`) for USGS stage-discharge rating curves. Returns parsed `exsa` / `base` / `corr` rating tables as a dict of DataFrames keyed by feature ID, or just the list of available STAC features when `download_and_parse=False`. Mirrors R's `read_waterdata_ratings`. diff --git a/dataretrieval/utils.py b/dataretrieval/utils.py index 4aa76a61..d8827bf4 100644 --- a/dataretrieval/utils.py +++ b/dataretrieval/utils.py @@ -94,6 +94,98 @@ def format_datetime(df, date_field, time_field, tz_field): return df +# Triplet patterns we recognize in WQP and Samples CSV responses. Each entry +# defines how to derive the time/timezone column names from a date column, and +# the suffix to strip when forming the new DateTime column name. +_DATETIME_TRIPLET_PATTERNS = ( + # WQX3 / Samples: Activity_StartDate, Activity_StartTime, Activity_StartTimeZone + { + "date_suffix": "Date", + "time_from_date": lambda d: d[: -len("Date")] + "Time", + "tz_from_date": lambda d: d[: -len("Date")] + "TimeZone", + }, + # Legacy WQP: Date, Time/Time, Time/TimeZoneCode + { + "date_suffix": "Date", + "time_from_date": lambda d: d[: -len("Date")] + "Time/Time", + "tz_from_date": lambda d: d[: -len("Date")] + "Time/TimeZoneCode", + }, +) + + +def _build_utc_datetime(date_series, time_series, tz_series): + """Combine date + time + tz-abbreviation columns into a UTC pandas Series. + + Unknown timezone codes (and rows missing any of the three values) yield + ``NaT``. The input columns are not mutated. 
+ """ + offsets = tz_series.map(tz) + combined = ( + date_series.astype("string") + + " " + + time_series.astype("string") + + " " + + offsets.astype("string") + ) + # Rows where any input is missing produce a string containing ""; mark + # those so pd.to_datetime returns NaT rather than guessing. + invalid = ( + date_series.isna() | time_series.isna() | tz_series.isna() | offsets.isna() + ) + combined = combined.mask(invalid) + return pd.to_datetime(combined, format="mixed", utc=True, errors="coerce") + + +def attach_datetime_columns(df): + """Add ``DateTime`` UTC columns for any Date/Time/TimeZone triplets. + + Detects two naming patterns that appear in USGS Samples and Water Quality + Portal CSV responses: + + * **WQX3** — ``Date``, ``Time``, ``TimeZone`` + * **Legacy WQP** — ``Date``, ``Time/Time``, + ``Time/TimeZoneCode`` + + For every triplet present, a new ``DateTime`` column is appended + holding a UTC ``Timestamp`` (offsets resolved via + :data:`dataretrieval.codes.tz`). The original Date/Time/TimeZone columns + are left intact, and an existing ``DateTime`` column is never + overwritten. + + Parameters + ---------- + df : ``pandas.DataFrame`` + DataFrame returned from a Samples or WQP CSV endpoint. + + Returns + ------- + df : ``pandas.DataFrame`` + A DataFrame with any derivable ``DateTime`` columns appended. + Callers should use the returned value (the helper may concatenate + rather than mutate in place). 
+ """ + columns = set(df.columns) + new_columns = {} + for col in df.columns: + if not col.endswith("Date"): + continue + for pattern in _DATETIME_TRIPLET_PATTERNS: + time_col = pattern["time_from_date"](col) + tz_col = pattern["tz_from_date"](col) + if time_col not in columns or tz_col not in columns: + continue + target = col[: -len("Date")] + "DateTime" + if target in columns or target in new_columns: + break + new_columns[target] = _build_utc_datetime(df[col], df[time_col], df[tz_col]) + break + if not new_columns: + return df + # Concat in one shot — appending columns one-by-one to a wide CSV-derived + # frame triggers pandas' fragmentation PerformanceWarning. + return pd.concat([df, pd.DataFrame(new_columns, index=df.index)], axis=1) + + class BaseMetadata: """Base class for metadata. diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index 886a989c..60ce70d1 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -16,7 +16,7 @@ import requests from requests.models import PreparedRequest -from dataretrieval.utils import BaseMetadata, to_str +from dataretrieval.utils import BaseMetadata, attach_datetime_columns, to_str from dataretrieval.waterdata.filters import FILTER_LANG from dataretrieval.waterdata.types import ( CODE_SERVICES, @@ -2266,7 +2266,13 @@ def get_samples( Returns ------- df : ``pandas.DataFrame`` - Formatted data returned from the API query. + Formatted data returned from the API query. For each + ``Date`` / ``Time`` / ``TimeZone`` triplet in + the response (e.g. ``Activity_StartDate``, ``Activity_StartTime``, + ``Activity_StartTimeZone``), an additional ``DateTime`` column + is appended holding a UTC ``Timestamp`` derived from the three. The + original Date/Time/TimeZone columns are left intact; rows whose + timezone abbreviation is not recognized resolve to ``NaT``. md : :obj:`dataretrieval.utils.Metadata` Custom ``dataretrieval`` metadata object pertaining to the query. 
@@ -2323,6 +2329,7 @@ def get_samples( response.raise_for_status() df = pd.read_csv(StringIO(response.text), delimiter=",") + df = attach_datetime_columns(df) return df, BaseMetadata(response) diff --git a/dataretrieval/wqp.py b/dataretrieval/wqp.py index 24e1737e..6df145e6 100644 --- a/dataretrieval/wqp.py +++ b/dataretrieval/wqp.py @@ -17,7 +17,7 @@ import pandas as pd -from .utils import BaseMetadata, query +from .utils import BaseMetadata, attach_datetime_columns, query if TYPE_CHECKING: from pandas import DataFrame @@ -101,7 +101,12 @@ def get_results( Returns ------- df : ``pandas.DataFrame`` - Formatted data returned from the API query. + Formatted data returned from the API query. For each + ``Date`` / ``Time`` / ``TimeZone`` triplet in + the response (legacy WQP uses ``Time/Time`` and + ``Time/TimeZoneCode``), an additional ``DateTime`` + column is appended holding a UTC ``Timestamp``. Original triplet + columns are preserved; unrecognized timezone codes yield ``NaT``. md : :obj:`dataretrieval.utils.Metadata` Custom ``dataretrieval`` metadata object pertaining to the query. 
@@ -147,6 +152,7 @@ def get_results( response = query(url, kwargs, delimiter=";", ssl_check=ssl_check) df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False) + df = attach_datetime_columns(df) return df, WQP_Metadata(response) diff --git a/tests/utils_test.py b/tests/utils_test.py index 4cb9b383..7a15af60 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -97,3 +97,110 @@ def test_to_str_custom_delimiter(self): def test_to_str_non_iterable(self): assert utils.to_str(123) is None + + +class Test_attach_datetime_columns: + """Tests of attach_datetime_columns, which derives DateTime UTC + columns from Date/Time/TimeZone triplets in Samples and WQP CSVs.""" + + def test_wqx3_triplet_resolves_to_utc(self): + """The Samples / WQX3 pattern (Activity_Start*) is detected and the + resulting DateTime is converted to UTC.""" + df = pd.DataFrame( + { + "Activity_StartDate": ["2024-01-09", "2024-02-15"], + "Activity_StartTime": ["10:00:00", "14:30:00"], + "Activity_StartTimeZone": ["PST", "EST"], + } + ) + df = utils.attach_datetime_columns(df) + assert "Activity_StartDateTime" in df.columns + # PST is UTC-8 → 10:00 PST is 18:00 UTC + assert df["Activity_StartDateTime"][0] == pd.Timestamp( + "2024-01-09 18:00:00", tz="UTC" + ) + # EST is UTC-5 → 14:30 EST is 19:30 UTC + assert df["Activity_StartDateTime"][1] == pd.Timestamp( + "2024-02-15 19:30:00", tz="UTC" + ) + # Original columns are preserved + assert df["Activity_StartTimeZone"].tolist() == ["PST", "EST"] + + def test_legacy_wqp_triplet_resolves_to_utc(self): + """The legacy WQP pattern (slash-separated time/tz columns) is also + detected.""" + df = pd.DataFrame( + { + "ActivityStartDate": ["2024-01-09"], + "ActivityStartTime/Time": ["10:00:00"], + "ActivityStartTime/TimeZoneCode": ["PST"], + } + ) + df = utils.attach_datetime_columns(df) + assert "ActivityStartDateTime" in df.columns + assert df["ActivityStartDateTime"][0] == pd.Timestamp( + "2024-01-09 18:00:00", tz="UTC" + ) + + def 
test_unknown_timezone_is_NaT(self): + """Unknown timezone codes resolve to NaT rather than raising.""" + df = pd.DataFrame( + { + "Activity_StartDate": ["2024-01-09"], + "Activity_StartTime": ["10:00:00"], + "Activity_StartTimeZone": ["BOGUS"], + } + ) + df = utils.attach_datetime_columns(df) + assert df["Activity_StartDateTime"].isna().all() + + def test_missing_time_or_tz_is_NaT(self): + """Rows with a missing time or tz produce NaT but don't poison others.""" + df = pd.DataFrame( + { + "Activity_StartDate": ["2024-01-09", "2024-02-15"], + "Activity_StartTime": ["10:00:00", None], + "Activity_StartTimeZone": ["PST", "EST"], + } + ) + df = utils.attach_datetime_columns(df) + assert df["Activity_StartDateTime"][0] == pd.Timestamp( + "2024-01-09 18:00:00", tz="UTC" + ) + assert pd.isna(df["Activity_StartDateTime"][1]) + + def test_existing_datetime_column_not_overwritten(self): + """An existing DateTime column is left alone.""" + df = pd.DataFrame( + { + "Activity_StartDate": ["2024-01-09"], + "Activity_StartTime": ["10:00:00"], + "Activity_StartTimeZone": ["PST"], + "Activity_StartDateTime": ["preexisting"], + } + ) + df = utils.attach_datetime_columns(df) + assert df["Activity_StartDateTime"].tolist() == ["preexisting"] + + def test_multiple_triplets_handled(self): + """All Date/Time/TimeZone triplets in the frame get DateTime columns.""" + df = pd.DataFrame( + { + "Activity_StartDate": ["2024-01-09"], + "Activity_StartTime": ["10:00:00"], + "Activity_StartTimeZone": ["PST"], + "LabInfo_AnalysisStartDate": ["2024-01-10"], + "LabInfo_AnalysisStartTime": ["09:00:00"], + "LabInfo_AnalysisStartTimeZone": ["EST"], + } + ) + df = utils.attach_datetime_columns(df) + assert "Activity_StartDateTime" in df.columns + assert "LabInfo_AnalysisStartDateTime" in df.columns + + def test_lone_date_column_left_alone(self): + """A Date column without matching Time/TimeZone columns is ignored.""" + df = pd.DataFrame({"LastChangeDate": ["2024-01-09"]}) + df = 
utils.attach_datetime_columns(df) + assert "LastChangeDateTime" not in df.columns + assert list(df.columns) == ["LastChangeDate"] diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index 1edf012e..4ddb3155 100644 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -1,6 +1,7 @@ import datetime import sys +import pandas as pd import pytest from pandas import DataFrame @@ -54,11 +55,20 @@ def test_mock_get_samples(requests_mock): monitoringLocationIdentifier="USGS-05406500", ) assert type(df) is DataFrame - assert df.size == 12127 + # 67 rows × 181 source columns + 6 derived DateTime columns + assert df.shape == (67, 187) assert md.url == request_url assert isinstance(md.query_time, datetime.timedelta) assert md.header == {"mock_header": "value"} assert md.comment is None + # The Activity start triplet is parsed into a UTC Timestamp column. + assert "Activity_StartDateTime" in df.columns + # Row 0 is "2023-08-22 08:50:00 CDT" → 13:50 UTC. + assert df["Activity_StartDateTime"].iloc[0] == pd.Timestamp( + "2023-08-22 13:50:00", tz="UTC" + ) + # Original triplet columns are preserved. + assert df["Activity_StartTimeZone"].iloc[0] == "CDT" def test_mock_get_samples_summary(requests_mock): @@ -127,7 +137,8 @@ def test_samples_activity(): monitoringLocationIdentifier="USGS-06719505", ) assert len(df) > 0 - assert len(df.columns) == 95 + # 95 columns from the API plus 2 derived DateTime columns. 
+ assert len(df.columns) == 97 assert "Location_HUCTwelveDigitCode" in df.columns diff --git a/tests/wqp_test.py b/tests/wqp_test.py index a337f7ec..cbd772ff 100644 --- a/tests/wqp_test.py +++ b/tests/wqp_test.py @@ -33,11 +33,15 @@ def test_get_results(requests_mock): startDateHi="09-30-2011", ) assert type(df) is DataFrame - assert df.size == 315 + # 5 rows × 63 source columns + 2 derived DateTime columns + assert df.shape == (5, 65) assert md.url == request_url assert isinstance(md.query_time, datetime.timedelta) assert md.header == {"mock_header": "value"} assert md.comment is None + # Legacy WQP triplets (slash-separated) are parsed into UTC. + assert "ActivityStartDateTime" in df.columns + assert df["ActivityStartDateTime"].notna().all() def test_get_results_WQX3(requests_mock): @@ -58,11 +62,15 @@ def test_get_results_WQX3(requests_mock): startDateHi="09-30-2011", ) assert type(df) is DataFrame - assert df.size == 900 + # 5 rows × 180 source columns + 6 derived DateTime columns + assert df.shape == (5, 186) assert md.url == request_url assert isinstance(md.query_time, datetime.timedelta) assert md.header == {"mock_header": "value"} assert md.comment is None + # WQX3 WQP triplets are parsed into UTC. + assert "Activity_StartDateTime" in df.columns + assert df["Activity_StartDateTime"].notna().all() def test_what_sites(requests_mock): From 68eb573fd6cbe8af86c774e337637ab074cf2352 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Thu, 7 May 2026 07:14:29 -0500 Subject: [PATCH 02/12] Address /simplify findings on attach_datetime_columns - Replace the lambda-laden _DATETIME_TRIPLET_PATTERNS dict with a flat _TIME_TZ_SUFFIXES tuple of (time_suffix, tz_suffix) pairs; the unused date_suffix field is gone. - Use str.removesuffix("Date") for the prefix swap and resolve the target column name once before iterating patterns, hoisting the existence check out of the inner loop. 
- Drop the redundant -mask in _build_utc_datetime; errors="coerce" already turns rows with missing inputs into NaT. - Switch pd.to_datetime from format="mixed" to a fixed "%Y-%m-%d %H:%M:%S %z" so pandas doesn't probe formats per row. - Trim WHAT-comments and per-test docstrings so the new tests match the noise level of the surrounding Test_to_str class. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/utils.py | 57 +++++++++++++++-------------------------- tests/utils_test.py | 15 ----------- tests/waterdata_test.py | 7 +---- tests/wqp_test.py | 6 ----- 4 files changed, 21 insertions(+), 64 deletions(-) diff --git a/dataretrieval/utils.py b/dataretrieval/utils.py index d8827bf4..4be188a1 100644 --- a/dataretrieval/utils.py +++ b/dataretrieval/utils.py @@ -94,23 +94,9 @@ def format_datetime(df, date_field, time_field, tz_field): return df -# Triplet patterns we recognize in WQP and Samples CSV responses. Each entry -# defines how to derive the time/timezone column names from a date column, and -# the suffix to strip when forming the new DateTime column name. -_DATETIME_TRIPLET_PATTERNS = ( - # WQX3 / Samples: Activity_StartDate, Activity_StartTime, Activity_StartTimeZone - { - "date_suffix": "Date", - "time_from_date": lambda d: d[: -len("Date")] + "Time", - "tz_from_date": lambda d: d[: -len("Date")] + "TimeZone", - }, - # Legacy WQP: Date, Time/Time, Time/TimeZoneCode - { - "date_suffix": "Date", - "time_from_date": lambda d: d[: -len("Date")] + "Time/Time", - "tz_from_date": lambda d: d[: -len("Date")] + "Time/TimeZoneCode", - }, -) +# (time-suffix, tz-suffix) pairs that follow a "Date" column. +# First entry is WQX3 / Samples, second is legacy WQP (slash-separated). 
+_TIME_TZ_SUFFIXES = (("Time", "TimeZone"), ("Time/Time", "Time/TimeZoneCode")) def _build_utc_datetime(date_series, time_series, tz_series): @@ -127,13 +113,9 @@ def _build_utc_datetime(date_series, time_series, tz_series): + " " + offsets.astype("string") ) - # Rows where any input is missing produce a string containing ""; mark - # those so pd.to_datetime returns NaT rather than guessing. - invalid = ( - date_series.isna() | time_series.isna() | tz_series.isna() | offsets.isna() + return pd.to_datetime( + combined, format="%Y-%m-%d %H:%M:%S %z", utc=True, errors="coerce" ) - combined = combined.mask(invalid) - return pd.to_datetime(combined, format="mixed", utc=True, errors="coerce") def attach_datetime_columns(df): @@ -160,29 +142,30 @@ def attach_datetime_columns(df): Returns ------- df : ``pandas.DataFrame`` - A DataFrame with any derivable ``DateTime`` columns appended. - Callers should use the returned value (the helper may concatenate - rather than mutate in place). + A new DataFrame with any derivable ``DateTime`` columns + appended (or the original frame if no triplets were found). 
""" columns = set(df.columns) new_columns = {} for col in df.columns: if not col.endswith("Date"): continue - for pattern in _DATETIME_TRIPLET_PATTERNS: - time_col = pattern["time_from_date"](col) - tz_col = pattern["tz_from_date"](col) - if time_col not in columns or tz_col not in columns: - continue - target = col[: -len("Date")] + "DateTime" - if target in columns or target in new_columns: + prefix = col.removesuffix("Date") + target = prefix + "DateTime" + if target in columns or target in new_columns: + continue + for time_suffix, tz_suffix in _TIME_TZ_SUFFIXES: + time_col = prefix + time_suffix + tz_col = prefix + tz_suffix + if time_col in columns and tz_col in columns: + new_columns[target] = _build_utc_datetime( + df[col], df[time_col], df[tz_col] + ) break - new_columns[target] = _build_utc_datetime(df[col], df[time_col], df[tz_col]) - break if not new_columns: return df - # Concat in one shot — appending columns one-by-one to a wide CSV-derived - # frame triggers pandas' fragmentation PerformanceWarning. + # Concat in one shot — per-column assignment on a wide CSV-derived frame + # triggers pandas' fragmentation PerformanceWarning. 
return pd.concat([df, pd.DataFrame(new_columns, index=df.index)], axis=1) diff --git a/tests/utils_test.py b/tests/utils_test.py index 7a15af60..8f52acd9 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -104,8 +104,6 @@ class Test_attach_datetime_columns: columns from Date/Time/TimeZone triplets in Samples and WQP CSVs.""" def test_wqx3_triplet_resolves_to_utc(self): - """The Samples / WQX3 pattern (Activity_Start*) is detected and the - resulting DateTime is converted to UTC.""" df = pd.DataFrame( { "Activity_StartDate": ["2024-01-09", "2024-02-15"], @@ -114,21 +112,15 @@ def test_wqx3_triplet_resolves_to_utc(self): } ) df = utils.attach_datetime_columns(df) - assert "Activity_StartDateTime" in df.columns - # PST is UTC-8 → 10:00 PST is 18:00 UTC assert df["Activity_StartDateTime"][0] == pd.Timestamp( "2024-01-09 18:00:00", tz="UTC" ) - # EST is UTC-5 → 14:30 EST is 19:30 UTC assert df["Activity_StartDateTime"][1] == pd.Timestamp( "2024-02-15 19:30:00", tz="UTC" ) - # Original columns are preserved assert df["Activity_StartTimeZone"].tolist() == ["PST", "EST"] def test_legacy_wqp_triplet_resolves_to_utc(self): - """The legacy WQP pattern (slash-separated time/tz columns) is also - detected.""" df = pd.DataFrame( { "ActivityStartDate": ["2024-01-09"], @@ -137,13 +129,11 @@ def test_legacy_wqp_triplet_resolves_to_utc(self): } ) df = utils.attach_datetime_columns(df) - assert "ActivityStartDateTime" in df.columns assert df["ActivityStartDateTime"][0] == pd.Timestamp( "2024-01-09 18:00:00", tz="UTC" ) def test_unknown_timezone_is_NaT(self): - """Unknown timezone codes resolve to NaT rather than raising.""" df = pd.DataFrame( { "Activity_StartDate": ["2024-01-09"], @@ -155,7 +145,6 @@ def test_unknown_timezone_is_NaT(self): assert df["Activity_StartDateTime"].isna().all() def test_missing_time_or_tz_is_NaT(self): - """Rows with a missing time or tz produce NaT but don't poison others.""" df = pd.DataFrame( { "Activity_StartDate": ["2024-01-09", 
"2024-02-15"], @@ -170,7 +159,6 @@ def test_missing_time_or_tz_is_NaT(self): assert pd.isna(df["Activity_StartDateTime"][1]) def test_existing_datetime_column_not_overwritten(self): - """An existing DateTime column is left alone.""" df = pd.DataFrame( { "Activity_StartDate": ["2024-01-09"], @@ -183,7 +171,6 @@ def test_existing_datetime_column_not_overwritten(self): assert df["Activity_StartDateTime"].tolist() == ["preexisting"] def test_multiple_triplets_handled(self): - """All Date/Time/TimeZone triplets in the frame get DateTime columns.""" df = pd.DataFrame( { "Activity_StartDate": ["2024-01-09"], @@ -199,8 +186,6 @@ def test_multiple_triplets_handled(self): assert "LabInfo_AnalysisStartDateTime" in df.columns def test_lone_date_column_left_alone(self): - """A Date column without matching Time/TimeZone columns is ignored.""" df = pd.DataFrame({"LastChangeDate": ["2024-01-09"]}) df = utils.attach_datetime_columns(df) - assert "LastChangeDateTime" not in df.columns assert list(df.columns) == ["LastChangeDate"] diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index 4ddb3155..493c73ff 100644 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -55,19 +55,15 @@ def test_mock_get_samples(requests_mock): monitoringLocationIdentifier="USGS-05406500", ) assert type(df) is DataFrame - # 67 rows × 181 source columns + 6 derived DateTime columns assert df.shape == (67, 187) assert md.url == request_url assert isinstance(md.query_time, datetime.timedelta) assert md.header == {"mock_header": "value"} assert md.comment is None - # The Activity start triplet is parsed into a UTC Timestamp column. - assert "Activity_StartDateTime" in df.columns - # Row 0 is "2023-08-22 08:50:00 CDT" → 13:50 UTC. + # Row 0 of the fixture is "2023-08-22 08:50:00 CDT" → 13:50 UTC. assert df["Activity_StartDateTime"].iloc[0] == pd.Timestamp( "2023-08-22 13:50:00", tz="UTC" ) - # Original triplet columns are preserved. 
assert df["Activity_StartTimeZone"].iloc[0] == "CDT" @@ -137,7 +133,6 @@ def test_samples_activity(): monitoringLocationIdentifier="USGS-06719505", ) assert len(df) > 0 - # 95 columns from the API plus 2 derived DateTime columns. assert len(df.columns) == 97 assert "Location_HUCTwelveDigitCode" in df.columns diff --git a/tests/wqp_test.py b/tests/wqp_test.py index cbd772ff..f432ab26 100644 --- a/tests/wqp_test.py +++ b/tests/wqp_test.py @@ -33,14 +33,11 @@ def test_get_results(requests_mock): startDateHi="09-30-2011", ) assert type(df) is DataFrame - # 5 rows × 63 source columns + 2 derived DateTime columns assert df.shape == (5, 65) assert md.url == request_url assert isinstance(md.query_time, datetime.timedelta) assert md.header == {"mock_header": "value"} assert md.comment is None - # Legacy WQP triplets (slash-separated) are parsed into UTC. - assert "ActivityStartDateTime" in df.columns assert df["ActivityStartDateTime"].notna().all() @@ -62,14 +59,11 @@ def test_get_results_WQX3(requests_mock): startDateHi="09-30-2011", ) assert type(df) is DataFrame - # 5 rows × 180 source columns + 6 derived DateTime columns assert df.shape == (5, 186) assert md.url == request_url assert isinstance(md.query_time, datetime.timedelta) assert md.header == {"mock_header": "value"} assert md.comment is None - # WQX3 WQP triplets are parsed into UTC. - assert "Activity_StartDateTime" in df.columns assert df["Activity_StartDateTime"].notna().all() From 69594758f97565bc75e4539160fc87848828f9a6 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Thu, 7 May 2026 07:42:18 -0500 Subject: [PATCH 03/12] Drop str.removesuffix to keep py38 compatibility MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The /simplify pass introduced col.removesuffix("Date"), but the project declares requires-python = ">=3.8" (and the ruff target is py38), and removesuffix was added in Python 3.9 — so the helper would AttributeError at first call on a 3.8 interpreter. 
Revert to the slice form. Reported by Copilot review on PR #272. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataretrieval/utils.py b/dataretrieval/utils.py index 4be188a1..018e47aa 100644 --- a/dataretrieval/utils.py +++ b/dataretrieval/utils.py @@ -150,7 +150,7 @@ def attach_datetime_columns(df): for col in df.columns: if not col.endswith("Date"): continue - prefix = col.removesuffix("Date") + prefix = col[: -len("Date")] target = prefix + "DateTime" if target in columns or target in new_columns: continue From 3a6cad9a8aa8d550acbd51725747156892ffbe80 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Thu, 7 May 2026 07:52:09 -0500 Subject: [PATCH 04/12] Bump declared Python floor to 3.9 to match CI CI's matrix already tests only Python 3.9 / 3.13 / 3.14 (and the waterdata test module skips itself on <3.10), but pyproject.toml still declared requires-python = ">=3.8" and ruff was targeting py38. Bring the manifest in line with reality: - requires-python = ">=3.9" - [tool.ruff] target-version = "py39" That unblocks col.removesuffix("Date") in attach_datetime_columns (restored), and surfaces two pre-existing pyupgrade fixes that ruff now applies under the py39 target: - dataretrieval/waterdata/ratings.py: import Iterable from collections.abc instead of typing. - dataretrieval/waterdata/utils.py: zoneinfo is stdlib on 3.9+, so the ZoneInfo import moves into the stdlib group. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/utils.py | 2 +- dataretrieval/waterdata/ratings.py | 3 ++- dataretrieval/waterdata/utils.py | 2 +- pyproject.toml | 4 ++-- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/dataretrieval/utils.py b/dataretrieval/utils.py index 018e47aa..4be188a1 100644 --- a/dataretrieval/utils.py +++ b/dataretrieval/utils.py @@ -150,7 +150,7 @@ def attach_datetime_columns(df): for col in df.columns: if not col.endswith("Date"): continue - prefix = col[: -len("Date")] + prefix = col.removesuffix("Date") target = prefix + "DateTime" if target in columns or target in new_columns: continue diff --git a/dataretrieval/waterdata/ratings.py b/dataretrieval/waterdata/ratings.py index a1d0a3bb..f5a1a0ff 100644 --- a/dataretrieval/waterdata/ratings.py +++ b/dataretrieval/waterdata/ratings.py @@ -14,7 +14,8 @@ import logging import os -from typing import Any, Iterable, Literal, get_args +from collections.abc import Iterable +from typing import Any, Literal, get_args import pandas as pd import requests diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 784f2969..413da7dd 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -6,10 +6,10 @@ import re from datetime import datetime from typing import Any, get_args +from zoneinfo import ZoneInfo import pandas as pd import requests -from zoneinfo import ZoneInfo from dataretrieval import __version__ from dataretrieval.utils import BaseMetadata diff --git a/pyproject.toml b/pyproject.toml index 1322dcc3..35edcc5e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta" name = "dataretrieval" description = "Discover and retrieve water data from U.S. federal hydrologic web services." 
readme = "README.md" -requires-python = ">=3.8" +requires-python = ">=3.9" keywords = ["USGS", "water data"] license = "CC0-1.0" license-files = ["LICENSE.md"] @@ -63,7 +63,7 @@ repository = "https://github.com/DOI-USGS/dataretrieval-python.git" write_to = "dataretrieval/_version.py" [tool.ruff] -target-version = "py38" +target-version = "py39" extend-exclude = ["demos"] [tool.ruff.lint] From 2f7cf10661f25da360a162cc50034e171ca5a96b Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Thu, 7 May 2026 08:19:00 -0500 Subject: [PATCH 05/12] Apply ruff format under the new py39 target Bumping ruff target-version from py38 to py39 made the formatter prefer parenthesized context managers (a 3.9-PEG-parser feature). The CI lint job picked up the resulting drift in tests/waterdata_filters_test.py; apply the formatter to bring it back in line. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/waterdata_filters_test.py | 79 ++++++++++++++++++++------------- 1 file changed, 47 insertions(+), 32 deletions(-) diff --git a/tests/waterdata_filters_test.py b/tests/waterdata_filters_test.py index 21eb6c1b..545f7039 100644 --- a/tests/waterdata_filters_test.py +++ b/tests/waterdata_filters_test.py @@ -190,14 +190,18 @@ def fake_walk_pages(*_args, **_kwargs): frame = pd.DataFrame({"id": [f"chunk-{idx}"], "value": [idx]}) return frame, _fake_response() - with mock.patch( - "dataretrieval.waterdata.utils._construct_api_requests", - side_effect=fake_construct_api_requests, - ), mock.patch( - "dataretrieval.waterdata.utils._walk_pages", side_effect=fake_walk_pages - ), mock.patch( - "dataretrieval.waterdata.filters._effective_filter_budget", - return_value=_CQL_FILTER_CHUNK_LEN, + with ( + mock.patch( + "dataretrieval.waterdata.utils._construct_api_requests", + side_effect=fake_construct_api_requests, + ), + mock.patch( + "dataretrieval.waterdata.utils._walk_pages", side_effect=fake_walk_pages + ), + mock.patch( + "dataretrieval.waterdata.filters._effective_filter_budget", + 
return_value=_CQL_FILTER_CHUNK_LEN, + ), ): df, _ = get_continuous( monitoring_location_id="USGS-07374525", @@ -239,14 +243,18 @@ def fake_walk_pages(*_args, **_kwargs): frame = pd.DataFrame({"id": ["shared-feature"], "value": [1]}) return frame, _fake_response() - with mock.patch( - "dataretrieval.waterdata.utils._construct_api_requests", - return_value=_fake_prepared_request(), - ), mock.patch( - "dataretrieval.waterdata.utils._walk_pages", side_effect=fake_walk_pages - ), mock.patch( - "dataretrieval.waterdata.filters._effective_filter_budget", - return_value=_CQL_FILTER_CHUNK_LEN, + with ( + mock.patch( + "dataretrieval.waterdata.utils._construct_api_requests", + return_value=_fake_prepared_request(), + ), + mock.patch( + "dataretrieval.waterdata.utils._walk_pages", side_effect=fake_walk_pages + ), + mock.patch( + "dataretrieval.waterdata.filters._effective_filter_budget", + return_value=_CQL_FILTER_CHUNK_LEN, + ), ): df, _ = get_continuous( monitoring_location_id="USGS-07374525", @@ -293,14 +301,18 @@ def fake_walk_pages(*_args, **_kwargs): ) return frame, _fake_response() - with mock.patch( - "dataretrieval.waterdata.utils._construct_api_requests", - return_value=_fake_prepared_request(), - ), mock.patch( - "dataretrieval.waterdata.utils._walk_pages", side_effect=fake_walk_pages - ), mock.patch( - "dataretrieval.waterdata.filters._effective_filter_budget", - return_value=_CQL_FILTER_CHUNK_LEN, + with ( + mock.patch( + "dataretrieval.waterdata.utils._construct_api_requests", + return_value=_fake_prepared_request(), + ), + mock.patch( + "dataretrieval.waterdata.utils._walk_pages", side_effect=fake_walk_pages + ), + mock.patch( + "dataretrieval.waterdata.filters._effective_filter_budget", + return_value=_CQL_FILTER_CHUNK_LEN, + ), ): df, _ = get_continuous( monitoring_location_id="USGS-07374525", @@ -434,14 +446,17 @@ def fake_construct_api_requests(**kwargs): sent_filters.append(kwargs.get("filter")) return _fake_prepared_request() - with mock.patch( - 
"dataretrieval.waterdata.utils._construct_api_requests", - side_effect=fake_construct_api_requests, - ), mock.patch( - "dataretrieval.waterdata.utils._walk_pages", - return_value=( - pd.DataFrame({"id": ["row-1"], "value": [1]}), - _fake_response(), + with ( + mock.patch( + "dataretrieval.waterdata.utils._construct_api_requests", + side_effect=fake_construct_api_requests, + ), + mock.patch( + "dataretrieval.waterdata.utils._walk_pages", + return_value=( + pd.DataFrame({"id": ["row-1"], "value": [1]}), + _fake_response(), + ), ), ): get_continuous( From c63561949f23d2d5b77a135f1b3eb73785669c87 Mon Sep 17 00:00:00 2001 From: Timothy Hodson <34148978+thodson-usgs@users.noreply.github.com> Date: Thu, 7 May 2026 08:45:28 -0500 Subject: [PATCH 06/12] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- tests/waterdata_filters_test.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/tests/waterdata_filters_test.py b/tests/waterdata_filters_test.py index 545f7039..2e378ffa 100644 --- a/tests/waterdata_filters_test.py +++ b/tests/waterdata_filters_test.py @@ -446,17 +446,14 @@ def fake_construct_api_requests(**kwargs): sent_filters.append(kwargs.get("filter")) return _fake_prepared_request() - with ( - mock.patch( - "dataretrieval.waterdata.utils._construct_api_requests", - side_effect=fake_construct_api_requests, - ), - mock.patch( - "dataretrieval.waterdata.utils._walk_pages", - return_value=( - pd.DataFrame({"id": ["row-1"], "value": [1]}), - _fake_response(), - ), + with mock.patch( + "dataretrieval.waterdata.utils._construct_api_requests", + side_effect=fake_construct_api_requests, + ), mock.patch( + "dataretrieval.waterdata.utils._walk_pages", + return_value=( + pd.DataFrame({"id": ["row-1"], "value": [1]}), + _fake_response(), ), ): get_continuous( From dd03edf8c36a0a56c6791b6f7154eab2d55f2485 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Thu, 7 
May 2026 08:46:58 -0500 Subject: [PATCH 07/12] Note Python 3.9 floor bump in NEWS The previous commit (3a6cad9) raised requires-python from >=3.8 to >=3.9 to align pyproject with what CI actually tested. That is a breaking change for any downstream user still on 3.8, so call it out in the changelog. Reported by Copilot review on PR #272. Co-Authored-By: Claude Opus 4.7 (1M context) --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 8ddd3282..50440268 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,5 @@ +**05/07/2026:** Bumped the declared minimum Python version from **3.8** to **3.9** (`pyproject.toml`'s `requires-python` and the ruff target). This brings the manifest in line with what was already being tested — CI's matrix has long covered only 3.9, 3.13, and 3.14, the `waterdata` test module already skipped itself on Python < 3.10, and several modules already use 3.9-only stdlib (e.g. `zoneinfo`). Users on 3.8 will no longer be able to install the package; please upgrade. + **05/07/2026:** `waterdata.get_samples()` and `wqp.get_results()` now append a derived `DateTime` UTC column for every Date/Time/TimeZone triplet in the response (e.g. `Activity_StartDate` + `Activity_StartTime` + `Activity_StartTimeZone` → `Activity_StartDateTime`). Both the WQX3 (`Date`/`Time`/`TimeZone`) and legacy WQP (`Date`/`Time/Time`/`Time/TimeZoneCode`) shapes are recognized; abbreviations like EST/EDT/CST/PST resolve to a UTC `Timestamp`, unknown codes resolve to `NaT`, and the original triplet columns are preserved. Mirrors R's `create_dateTime` behavior. Closes #266. **05/06/2026:** Each remaining active function in `dataretrieval.nwis` now emits a per-function `DeprecationWarning` naming the `waterdata` replacement to migrate to (visible the first time users call each getter). The `nwis` module is scheduled for removal on or after **2027-05-06**. 
From f198449a83df8d856882c79fd715984cd4ca673c Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Thu, 7 May 2026 08:49:16 -0500 Subject: [PATCH 08/12] Restore consistent parenthesized-with form The Copilot autofix in c635619 reverted one of four parenthesized-with blocks back to the chained form, leaving the file inconsistent under the project's ruff target (py39 prefers the parenthesized form per the 3.9 PEG parser). Re-running ruff format restores all four blocks to the canonical form so ruff format --check passes again. The "parenthesized with is 3.10+ only" concern is technically incorrect on this codebase: the 3.9 PEG parser accepts it, and the last CI run on 3a6cad9 / 2f7cf10 passed test (ubuntu-latest, 3.9) and test (windows-latest, 3.9) with this syntax in place. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/waterdata_filters_test.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/tests/waterdata_filters_test.py b/tests/waterdata_filters_test.py index 2e378ffa..545f7039 100644 --- a/tests/waterdata_filters_test.py +++ b/tests/waterdata_filters_test.py @@ -446,14 +446,17 @@ def fake_construct_api_requests(**kwargs): sent_filters.append(kwargs.get("filter")) return _fake_prepared_request() - with mock.patch( - "dataretrieval.waterdata.utils._construct_api_requests", - side_effect=fake_construct_api_requests, - ), mock.patch( - "dataretrieval.waterdata.utils._walk_pages", - return_value=( - pd.DataFrame({"id": ["row-1"], "value": [1]}), - _fake_response(), + with ( + mock.patch( + "dataretrieval.waterdata.utils._construct_api_requests", + side_effect=fake_construct_api_requests, + ), + mock.patch( + "dataretrieval.waterdata.utils._walk_pages", + return_value=( + pd.DataFrame({"id": ["row-1"], "value": [1]}), + _fake_response(), + ), ), ): get_continuous( From c1e937a6cfde7fdc3c070237e6d9860d1b71b93d Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Thu, 7 May 2026 10:47:12 -0500 Subject: [PATCH 09/12] Mark 
attach_datetime_columns as private and add type hints MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The helper is purely an internal post-processing step inside get_samples / get_results — users have no reason to call it directly, and dataretrieval/__init__.py's `from dataretrieval.utils import *` was leaking it into the public API surface as `dataretrieval.attach_datetime_columns`. Underscore-prefix it and update the two call sites plus the unit tests. Also annotate _attach_datetime_columns and _build_utc_datetime with pd.DataFrame / pd.Series / pd.Series → pd.Series signatures, matching the typing style already used in dataretrieval/waterdata/utils.py. Addresses self-review of PR #272. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/utils.py | 6 ++++-- dataretrieval/waterdata/api.py | 4 ++-- dataretrieval/wqp.py | 4 ++-- tests/utils_test.py | 16 ++++++++-------- 4 files changed, 16 insertions(+), 14 deletions(-) diff --git a/dataretrieval/utils.py b/dataretrieval/utils.py index 4be188a1..ede32dc7 100644 --- a/dataretrieval/utils.py +++ b/dataretrieval/utils.py @@ -99,7 +99,9 @@ def format_datetime(df, date_field, time_field, tz_field): _TIME_TZ_SUFFIXES = (("Time", "TimeZone"), ("Time/Time", "Time/TimeZoneCode")) -def _build_utc_datetime(date_series, time_series, tz_series): +def _build_utc_datetime( + date_series: pd.Series, time_series: pd.Series, tz_series: pd.Series +) -> pd.Series: """Combine date + time + tz-abbreviation columns into a UTC pandas Series. Unknown timezone codes (and rows missing any of the three values) yield @@ -118,7 +120,7 @@ def _build_utc_datetime(date_series, time_series, tz_series): ) -def attach_datetime_columns(df): +def _attach_datetime_columns(df: pd.DataFrame) -> pd.DataFrame: """Add ``DateTime`` UTC columns for any Date/Time/TimeZone triplets. 
Detects two naming patterns that appear in USGS Samples and Water Quality diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index 60ce70d1..a5e35c7a 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -16,7 +16,7 @@ import requests from requests.models import PreparedRequest -from dataretrieval.utils import BaseMetadata, attach_datetime_columns, to_str +from dataretrieval.utils import BaseMetadata, _attach_datetime_columns, to_str from dataretrieval.waterdata.filters import FILTER_LANG from dataretrieval.waterdata.types import ( CODE_SERVICES, @@ -2329,7 +2329,7 @@ def get_samples( response.raise_for_status() df = pd.read_csv(StringIO(response.text), delimiter=",") - df = attach_datetime_columns(df) + df = _attach_datetime_columns(df) return df, BaseMetadata(response) diff --git a/dataretrieval/wqp.py b/dataretrieval/wqp.py index 6df145e6..dd822310 100644 --- a/dataretrieval/wqp.py +++ b/dataretrieval/wqp.py @@ -17,7 +17,7 @@ import pandas as pd -from .utils import BaseMetadata, attach_datetime_columns, query +from .utils import BaseMetadata, _attach_datetime_columns, query if TYPE_CHECKING: from pandas import DataFrame @@ -152,7 +152,7 @@ def get_results( response = query(url, kwargs, delimiter=";", ssl_check=ssl_check) df = pd.read_csv(StringIO(response.text), delimiter=",", low_memory=False) - df = attach_datetime_columns(df) + df = _attach_datetime_columns(df) return df, WQP_Metadata(response) diff --git a/tests/utils_test.py b/tests/utils_test.py index 8f52acd9..760401ca 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -100,7 +100,7 @@ def test_to_str_non_iterable(self): class Test_attach_datetime_columns: - """Tests of attach_datetime_columns, which derives DateTime UTC + """Tests of _attach_datetime_columns, which derives DateTime UTC columns from Date/Time/TimeZone triplets in Samples and WQP CSVs.""" def test_wqx3_triplet_resolves_to_utc(self): @@ -111,7 +111,7 @@ def 
test_wqx3_triplet_resolves_to_utc(self): "Activity_StartTimeZone": ["PST", "EST"], } ) - df = utils.attach_datetime_columns(df) + df = utils._attach_datetime_columns(df) assert df["Activity_StartDateTime"][0] == pd.Timestamp( "2024-01-09 18:00:00", tz="UTC" ) @@ -128,7 +128,7 @@ def test_legacy_wqp_triplet_resolves_to_utc(self): "ActivityStartTime/TimeZoneCode": ["PST"], } ) - df = utils.attach_datetime_columns(df) + df = utils._attach_datetime_columns(df) assert df["ActivityStartDateTime"][0] == pd.Timestamp( "2024-01-09 18:00:00", tz="UTC" ) @@ -141,7 +141,7 @@ def test_unknown_timezone_is_NaT(self): "Activity_StartTimeZone": ["BOGUS"], } ) - df = utils.attach_datetime_columns(df) + df = utils._attach_datetime_columns(df) assert df["Activity_StartDateTime"].isna().all() def test_missing_time_or_tz_is_NaT(self): @@ -152,7 +152,7 @@ def test_missing_time_or_tz_is_NaT(self): "Activity_StartTimeZone": ["PST", "EST"], } ) - df = utils.attach_datetime_columns(df) + df = utils._attach_datetime_columns(df) assert df["Activity_StartDateTime"][0] == pd.Timestamp( "2024-01-09 18:00:00", tz="UTC" ) @@ -167,7 +167,7 @@ def test_existing_datetime_column_not_overwritten(self): "Activity_StartDateTime": ["preexisting"], } ) - df = utils.attach_datetime_columns(df) + df = utils._attach_datetime_columns(df) assert df["Activity_StartDateTime"].tolist() == ["preexisting"] def test_multiple_triplets_handled(self): @@ -181,11 +181,11 @@ def test_multiple_triplets_handled(self): "LabInfo_AnalysisStartTimeZone": ["EST"], } ) - df = utils.attach_datetime_columns(df) + df = utils._attach_datetime_columns(df) assert "Activity_StartDateTime" in df.columns assert "LabInfo_AnalysisStartDateTime" in df.columns def test_lone_date_column_left_alone(self): df = pd.DataFrame({"LastChangeDate": ["2024-01-09"]}) - df = utils.attach_datetime_columns(df) + df = utils._attach_datetime_columns(df) assert list(df.columns) == ["LastChangeDate"] From 2ba6084ebe33cc72a2d47eed51951b2f110d21ce Mon Sep 17 
00:00:00 2001 From: thodson-usgs Date: Thu, 7 May 2026 10:57:42 -0500 Subject: [PATCH 10/12] Self-document _TIME_TZ_SUFFIXES with example column names MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The two-tuple constant identifying the WQP/Samples Date/Time/TimeZone naming patterns was previously labeled "WQX3 / Samples" and "legacy WQP" — accurate for someone steeped in USGS jargon, opaque for a maintainer reading the file cold. Spell out an example column-name triplet next to each entry so the constant is self-explanatory. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/utils.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/dataretrieval/utils.py b/dataretrieval/utils.py index ede32dc7..a0b19348 100644 --- a/dataretrieval/utils.py +++ b/dataretrieval/utils.py @@ -95,8 +95,14 @@ def format_datetime(df, date_field, time_field, tz_field): # (time-suffix, tz-suffix) pairs that follow a "Date" column. -# First entry is WQX3 / Samples, second is legacy WQP (slash-separated). -_TIME_TZ_SUFFIXES = (("Time", "TimeZone"), ("Time/Time", "Time/TimeZoneCode")) +_TIME_TZ_SUFFIXES = ( + # WQX3 / Samples, e.g. + # Activity_StartDate / Activity_StartTime / Activity_StartTimeZone + ("Time", "TimeZone"), + # Legacy WQP (slash-separated), e.g. + # ActivityStartDate / ActivityStartTime/Time / ActivityStartTime/TimeZoneCode + ("Time/Time", "Time/TimeZoneCode"), +) def _build_utc_datetime( From 1d69a3429f07bb4587bd78c6f85ee3d1904ec857 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Fri, 8 May 2026 16:17:52 -0500 Subject: [PATCH 11/12] Sort samples/WQP rows by activity-start datetime Per ldecicco-USGS's review of #272: R dataRetrieval ends its WQP/Samples pipeline by sorting the returned table on the activity-start datetime, because the API's natural order is unstable. Mirror that here. _attach_datetime_columns now picks a sort key with the same precedence R uses: 1. 
Activity_StartDateTime (WQX3 / Samples) 2. ActivityStartDateTime (legacy WQP) 3. first detected *Date column (fallback) The sort runs in addition to (and after) the DateTime-column derivation, and uses ignore_index=True to match the convention in dataretrieval/waterdata/utils.py::_sort_rows. Three new unit tests cover each branch of the precedence; the existing mock samples test was updated to assert the new monotonic-increasing iloc[0] row from the fixture (2023-06-20 09:25 CDT, the earliest sample in the file). Co-Authored-By: Claude Opus 4.7 (1M context) --- NEWS.md | 2 +- dataretrieval/utils.py | 35 ++++++++++++++++++++++++-------- dataretrieval/waterdata/api.py | 4 +++- dataretrieval/wqp.py | 2 ++ tests/utils_test.py | 37 ++++++++++++++++++++++++++++++++++ tests/waterdata_test.py | 6 ++++-- 6 files changed, 74 insertions(+), 12 deletions(-) diff --git a/NEWS.md b/NEWS.md index 50440268..246ede15 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,6 @@ **05/07/2026:** Bumped the declared minimum Python version from **3.8** to **3.9** (`pyproject.toml`'s `requires-python` and the ruff target). This brings the manifest in line with what was already being tested — CI's matrix has long covered only 3.9, 3.13, and 3.14, the `waterdata` test module already skipped itself on Python < 3.10, and several modules already use 3.9-only stdlib (e.g. `zoneinfo`). Users on 3.8 will no longer be able to install the package; please upgrade. -**05/07/2026:** `waterdata.get_samples()` and `wqp.get_results()` now append a derived `DateTime` UTC column for every Date/Time/TimeZone triplet in the response (e.g. `Activity_StartDate` + `Activity_StartTime` + `Activity_StartTimeZone` → `Activity_StartDateTime`). Both the WQX3 (`Date`/`Time`/`TimeZone`) and legacy WQP (`Date`/`Time/Time`/`Time/TimeZoneCode`) shapes are recognized; abbreviations like EST/EDT/CST/PST resolve to a UTC `Timestamp`, unknown codes resolve to `NaT`, and the original triplet columns are preserved. 
Mirrors R's `create_dateTime` behavior. Closes #266. +**05/07/2026:** `waterdata.get_samples()` and `wqp.get_results()` now append a derived `DateTime` UTC column for every Date/Time/TimeZone triplet in the response (e.g. `Activity_StartDate` + `Activity_StartTime` + `Activity_StartTimeZone` → `Activity_StartDateTime`). Both the WQX3 (`Date`/`Time`/`TimeZone`) and legacy WQP (`Date`/`Time/Time`/`Time/TimeZoneCode`) shapes are recognized; abbreviations like EST/EDT/CST/PST resolve to a UTC `Timestamp`, unknown codes resolve to `NaT`, and the original triplet columns are preserved. Returned rows are also now sorted by `Activity_StartDateTime` (or the legacy `ActivityStartDateTime`) — the underlying APIs return rows in an unstable order. Mirrors R's `create_dateTime` and end-of-pipeline sort. Closes #266. **05/06/2026:** Each remaining active function in `dataretrieval.nwis` now emits a per-function `DeprecationWarning` naming the `waterdata` replacement to migrate to (visible the first time users call each getter). The `nwis` module is scheduled for removal on or after **2027-05-06**. diff --git a/dataretrieval/utils.py b/dataretrieval/utils.py index a0b19348..76bbb6ad 100644 --- a/dataretrieval/utils.py +++ b/dataretrieval/utils.py @@ -127,7 +127,8 @@ def _build_utc_datetime( def _attach_datetime_columns(df: pd.DataFrame) -> pd.DataFrame: - """Add ``DateTime`` UTC columns for any Date/Time/TimeZone triplets. + """Add ``DateTime`` UTC columns for any Date/Time/TimeZone triplets + and sort the frame by the activity-start datetime. Detects two naming patterns that appear in USGS Samples and Water Quality Portal CSV responses: @@ -142,6 +143,12 @@ def _attach_datetime_columns(df: pd.DataFrame) -> pd.DataFrame: are left intact, and an existing ``DateTime`` column is never overwritten. 
+ Rows are sorted (and the index reset) by the canonical activity-start + datetime when present — ``Activity_StartDateTime`` (WQX3) or + ``ActivityStartDateTime`` (legacy WQP) — falling back to the first + detected ``*Date`` column. Mirrors R ``dataRetrieval``'s + end-of-pipeline sort in ``importWQP.R``. + Parameters ---------- df : ``pandas.DataFrame`` @@ -150,14 +157,18 @@ def _attach_datetime_columns(df: pd.DataFrame) -> pd.DataFrame: Returns ------- df : ``pandas.DataFrame`` - A new DataFrame with any derivable ``DateTime`` columns - appended (or the original frame if no triplets were found). + A new DataFrame with derivable ``DateTime`` columns appended + and rows sorted by the activity-start datetime (if any date column + was detected). """ columns = set(df.columns) new_columns = {} + first_date_col = None for col in df.columns: if not col.endswith("Date"): continue + if first_date_col is None: + first_date_col = col prefix = col.removesuffix("Date") target = prefix + "DateTime" if target in columns or target in new_columns: @@ -170,11 +181,19 @@ def _attach_datetime_columns(df: pd.DataFrame) -> pd.DataFrame: df[col], df[time_col], df[tz_col] ) break - if not new_columns: - return df - # Concat in one shot — per-column assignment on a wide CSV-derived frame - # triggers pandas' fragmentation PerformanceWarning. - return pd.concat([df, pd.DataFrame(new_columns, index=df.index)], axis=1) + if new_columns: + # Concat in one shot — per-column assignment on a wide CSV-derived + # frame triggers pandas' fragmentation PerformanceWarning. 
+ df = pd.concat([df, pd.DataFrame(new_columns, index=df.index)], axis=1) + if "Activity_StartDateTime" in df.columns: + sort_key = "Activity_StartDateTime" + elif "ActivityStartDateTime" in df.columns: + sort_key = "ActivityStartDateTime" + else: + sort_key = first_date_col + if sort_key is not None: + df = df.sort_values(by=sort_key, ignore_index=True) + return df class BaseMetadata: diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index a5e35c7a..bca8dc40 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -2272,7 +2272,9 @@ def get_samples( ``Activity_StartTimeZone``), an additional ``DateTime`` column is appended holding a UTC ``Timestamp`` derived from the three. The original Date/Time/TimeZone columns are left intact; rows whose - timezone abbreviation is not recognized resolve to ``NaT``. + timezone abbreviation is not recognized resolve to ``NaT``. Rows are + sorted by ``Activity_StartDateTime`` when present (the API's default + order is unstable). md : :obj:`dataretrieval.utils.Metadata` Custom ``dataretrieval`` metadata object pertaining to the query. diff --git a/dataretrieval/wqp.py b/dataretrieval/wqp.py index dd822310..8cfc6ca1 100644 --- a/dataretrieval/wqp.py +++ b/dataretrieval/wqp.py @@ -107,6 +107,8 @@ def get_results( ``Time/TimeZoneCode``), an additional ``DateTime`` column is appended holding a UTC ``Timestamp``. Original triplet columns are preserved; unrecognized timezone codes yield ``NaT``. + Rows are sorted by ``ActivityStartDateTime`` (or ``Activity_StartDateTime`` + for WQX3 responses) when present. md : :obj:`dataretrieval.utils.Metadata` Custom ``dataretrieval`` metadata object pertaining to the query. 
diff --git a/tests/utils_test.py b/tests/utils_test.py index 760401ca..a2c4cf9b 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -189,3 +189,40 @@ def test_lone_date_column_left_alone(self): df = pd.DataFrame({"LastChangeDate": ["2024-01-09"]}) df = utils._attach_datetime_columns(df) assert list(df.columns) == ["LastChangeDate"] + + def test_rows_sorted_by_wqx3_activity_start(self): + df = pd.DataFrame( + { + "Activity_StartDate": ["2024-03-01", "2024-01-09", "2024-02-15"], + "Activity_StartTime": ["10:00:00", "10:00:00", "10:00:00"], + "Activity_StartTimeZone": ["UTC", "UTC", "UTC"], + "marker": ["c", "a", "b"], + } + ) + df = utils._attach_datetime_columns(df) + assert df["marker"].tolist() == ["a", "b", "c"] + assert df.index.tolist() == [0, 1, 2] + + def test_rows_sorted_by_legacy_activity_start_when_wqx3_absent(self): + df = pd.DataFrame( + { + "ActivityStartDate": ["2024-03-01", "2024-01-09"], + "ActivityStartTime/Time": ["10:00:00", "10:00:00"], + "ActivityStartTime/TimeZoneCode": ["UTC", "UTC"], + "marker": ["b", "a"], + } + ) + df = utils._attach_datetime_columns(df) + assert df["marker"].tolist() == ["a", "b"] + + def test_rows_sorted_by_first_date_column_as_fallback(self): + # No triplet → no DateTime column added, but rows still sort by the + # first *Date column found (mirrors R's importWQP.R fallback). + df = pd.DataFrame( + { + "LastChangeDate": ["2024-03-01", "2024-01-09"], + "marker": ["b", "a"], + } + ) + df = utils._attach_datetime_columns(df) + assert df["marker"].tolist() == ["a", "b"] diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index 493c73ff..7ae7d557 100644 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -60,11 +60,13 @@ def test_mock_get_samples(requests_mock): assert isinstance(md.query_time, datetime.timedelta) assert md.header == {"mock_header": "value"} assert md.comment is None - # Row 0 of the fixture is "2023-08-22 08:50:00 CDT" → 13:50 UTC. 
+ # Rows now come back sorted by Activity_StartDateTime; the earliest in + # the fixture is "2023-06-20 09:25:00 CDT" → 14:25 UTC. assert df["Activity_StartDateTime"].iloc[0] == pd.Timestamp( - "2023-08-22 13:50:00", tz="UTC" + "2023-06-20 14:25:00", tz="UTC" ) assert df["Activity_StartTimeZone"].iloc[0] == "CDT" + assert df["Activity_StartDateTime"].is_monotonic_increasing def test_mock_get_samples_summary(requests_mock): From 59df88b115a2d52adba1d2dc40f76fa5f344e530 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Fri, 8 May 2026 16:24:15 -0500 Subject: [PATCH 12/12] Trim trivial tests on _attach_datetime_columns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop 6 unit tests and a handful of sort-specific integration assertions: - test_missing_time_or_tz_is_NaT: redundant with test_unknown_timezone_is_NaT — both exercise the same NaT coercion path through pd.to_datetime. - test_multiple_triplets_handled: the samples_results.txt mock fixture already has 6 triplets, so the integration test exercises this. - test_lone_date_column_left_alone: trivially obvious from the loop body. - test_rows_sorted_by_wqx3_activity_start, _legacy_*, _first_date_column_*: per maintainer feedback, the sort behavior doesn't need dedicated test coverage on a private helper. In test_mock_get_samples, drop the iloc[0]/is_monotonic_increasing assertions and replace with a minimal "DateTime column has at least one parsed timestamp" check. The shape assertion (67, 187) still proves all 6 derived DateTime columns were appended. Also drop the now-unused `import pandas as pd` from tests/waterdata_test.py. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/utils_test.py | 71 ----------------------------------------- tests/waterdata_test.py | 10 ++---- 2 files changed, 2 insertions(+), 79 deletions(-) diff --git a/tests/utils_test.py b/tests/utils_test.py index a2c4cf9b..2c350b2b 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -144,20 +144,6 @@ def test_unknown_timezone_is_NaT(self): df = utils._attach_datetime_columns(df) assert df["Activity_StartDateTime"].isna().all() - def test_missing_time_or_tz_is_NaT(self): - df = pd.DataFrame( - { - "Activity_StartDate": ["2024-01-09", "2024-02-15"], - "Activity_StartTime": ["10:00:00", None], - "Activity_StartTimeZone": ["PST", "EST"], - } - ) - df = utils._attach_datetime_columns(df) - assert df["Activity_StartDateTime"][0] == pd.Timestamp( - "2024-01-09 18:00:00", tz="UTC" - ) - assert pd.isna(df["Activity_StartDateTime"][1]) - def test_existing_datetime_column_not_overwritten(self): df = pd.DataFrame( { @@ -169,60 +155,3 @@ def test_existing_datetime_column_not_overwritten(self): ) df = utils._attach_datetime_columns(df) assert df["Activity_StartDateTime"].tolist() == ["preexisting"] - - def test_multiple_triplets_handled(self): - df = pd.DataFrame( - { - "Activity_StartDate": ["2024-01-09"], - "Activity_StartTime": ["10:00:00"], - "Activity_StartTimeZone": ["PST"], - "LabInfo_AnalysisStartDate": ["2024-01-10"], - "LabInfo_AnalysisStartTime": ["09:00:00"], - "LabInfo_AnalysisStartTimeZone": ["EST"], - } - ) - df = utils._attach_datetime_columns(df) - assert "Activity_StartDateTime" in df.columns - assert "LabInfo_AnalysisStartDateTime" in df.columns - - def test_lone_date_column_left_alone(self): - df = pd.DataFrame({"LastChangeDate": ["2024-01-09"]}) - df = utils._attach_datetime_columns(df) - assert list(df.columns) == ["LastChangeDate"] - - def test_rows_sorted_by_wqx3_activity_start(self): - df = pd.DataFrame( - { - "Activity_StartDate": ["2024-03-01", "2024-01-09", "2024-02-15"], - 
"Activity_StartTime": ["10:00:00", "10:00:00", "10:00:00"], - "Activity_StartTimeZone": ["UTC", "UTC", "UTC"], - "marker": ["c", "a", "b"], - } - ) - df = utils._attach_datetime_columns(df) - assert df["marker"].tolist() == ["a", "b", "c"] - assert df.index.tolist() == [0, 1, 2] - - def test_rows_sorted_by_legacy_activity_start_when_wqx3_absent(self): - df = pd.DataFrame( - { - "ActivityStartDate": ["2024-03-01", "2024-01-09"], - "ActivityStartTime/Time": ["10:00:00", "10:00:00"], - "ActivityStartTime/TimeZoneCode": ["UTC", "UTC"], - "marker": ["b", "a"], - } - ) - df = utils._attach_datetime_columns(df) - assert df["marker"].tolist() == ["a", "b"] - - def test_rows_sorted_by_first_date_column_as_fallback(self): - # No triplet → no DateTime column added, but rows still sort by the - # first *Date column found (mirrors R's importWQP.R fallback). - df = pd.DataFrame( - { - "LastChangeDate": ["2024-03-01", "2024-01-09"], - "marker": ["b", "a"], - } - ) - df = utils._attach_datetime_columns(df) - assert df["marker"].tolist() == ["a", "b"] diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index 7ae7d557..b53ee296 100644 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -1,7 +1,6 @@ import datetime import sys -import pandas as pd import pytest from pandas import DataFrame @@ -55,18 +54,13 @@ def test_mock_get_samples(requests_mock): monitoringLocationIdentifier="USGS-05406500", ) assert type(df) is DataFrame + # 181 source columns + 6 derived DateTime columns assert df.shape == (67, 187) assert md.url == request_url assert isinstance(md.query_time, datetime.timedelta) assert md.header == {"mock_header": "value"} assert md.comment is None - # Rows now come back sorted by Activity_StartDateTime; the earliest in - # the fixture is "2023-06-20 09:25:00 CDT" → 14:25 UTC. 
- assert df["Activity_StartDateTime"].iloc[0] == pd.Timestamp( - "2023-06-20 14:25:00", tz="UTC" - ) - assert df["Activity_StartTimeZone"].iloc[0] == "CDT" - assert df["Activity_StartDateTime"].is_monotonic_increasing + assert df["Activity_StartDateTime"].notna().any() def test_mock_get_samples_summary(requests_mock):