diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py
index 444c1a4b..d78387b4 100644
--- a/dataretrieval/waterdata/utils.py
+++ b/dataretrieval/waterdata/utils.py
@@ -128,8 +128,60 @@ def _switch_properties_id(properties: list[str] | None, id_name: str, service: s
     return [p for p in properties if p not in ["geometry", service_id]]
 
 
+_DATETIME_FORMATS = (
+    "%Y-%m-%dT%H:%M:%S.%f%z",
+    "%Y-%m-%dT%H:%M:%S%z",
+    "%Y-%m-%dT%H:%M:%S.%f",
+    "%Y-%m-%dT%H:%M:%S",
+    "%Y-%m-%d %H:%M:%S.%f",
+    "%Y-%m-%d %H:%M:%S",
+    "%Y-%m-%d",
+)
+
+# Anchored to ``P`` + digit so a normal word containing ``p`` (e.g. ``"Apr"``)
+# isn't mis-classified as an ISO 8601 duration; the optional ``T`` admits
+# ``PT36H``, and IGNORECASE keeps lower-case input (``pt36h``) passing through.
+_DURATION_RE = re.compile(r"^PT?\d", re.IGNORECASE)
+
+
+def _parse_datetime(value: str) -> datetime | None:
+    """Parse a single datetime string against the supported formats.
+
+    Returns a ``datetime`` (tz-aware iff the input carried a UTC offset),
+    or ``None`` if no format matched.
+    """
+    # ``%z`` does not reliably accept every spelling of the UTC designator
+    # (lower-case ``z`` in particular), so rewrite ``Z``/``z`` to ``+00:00``.
+    candidate = value[:-1] + "+00:00" if value.endswith(("Z", "z")) else value
+    for fmt in _DATETIME_FORMATS:
+        try:
+            return datetime.strptime(candidate, fmt)
+        except ValueError:
+            continue
+    return None
+
+
+def _format_one(dt, *, date: bool, local_tz) -> str | None:
+    """Format a single datetime element for inclusion in the API time arg.
+
+    Blank/NA elements render as ``".."`` (an open range bound) and
+    unparseable ones return ``None``. With ``date=True`` the calendar date
+    is emitted as written; otherwise naive values are assumed to be in
+    ``local_tz`` and the result is converted to UTC.
+    """
+    if pd.isna(dt) or dt == "" or dt is None:
+        return ".."
+    parsed = _parse_datetime(dt)
+    if parsed is None:
+        return None
+    if date:
+        return parsed.strftime("%Y-%m-%d")
+    aware = parsed if parsed.tzinfo is not None else parsed.replace(tzinfo=local_tz)
+    return aware.astimezone(ZoneInfo("UTC")).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+
 def _format_api_dates(
-    datetime_input: str | list[str], date: bool = False
+    datetime_input: str | list[str | None] | None, date: bool = False
 ) -> str | None:
     """
     Formats date or datetime input(s) for use with an API.
@@ -139,10 +191,12 @@ def _format_api_dates(
 
     Parameters
     ----------
-    datetime_input : Union[str, List[str]]
+    datetime_input : Union[str, List[Optional[str]], None]
         A single date/datetime string or a list of one or two date/datetime
-        strings. Accepts formats like "%Y-%m-%d %H:%M:%S", ISO 8601, or relative
-        periods (e.g., "P7D").
+        strings. Accepts formats like "%Y-%m-%d %H:%M:%S", ISO 8601 (with or
+        without ``Z``/numeric offset), or relative periods (e.g., "P7D" /
+        "PT36H"). Range endpoints may be ``None``/``NaN``/empty to denote a
+        half-bounded range.
     date : bool, optional
         If True, uses only the date portion ("YYYY-MM-DD"). If False (default),
         returns full datetime in UTC ISO 8601 format ("YYYY-MM-DDTHH:MM:SSZ").
@@ -164,12 +218,16 @@ def _format_api_dates(
 
     Notes
     -----
-    - Handles blank or NA values by returning None.
-    - Supports relative period strings (e.g., "P7D") and passes them through
-      unchanged.
+    - A single blank/NA value returns None. In a two-value range, a blank/NA
+      endpoint is rendered as ``".."`` to denote an open bound (e.g.
+      ``"2024-01-01/.."``); the range is only None when *every* element is
+      blank/NA or any non-NA element fails to parse.
+    - Supports ISO 8601 durations such as "P7D" and "PT36H" and pre-formatted
+      intervals containing ``"/"``; both are passed through unchanged.
     - Converts datetimes to UTC and formats as ISO 8601 with 'Z' suffix when
-      `date` is False.
-    - For date ranges, replaces "nan" with ".." in the output.
+      `date` is False. Inputs with an explicit offset (``Z`` or ``+HH:MM``) are
+      converted from that offset to UTC; naive inputs are interpreted in the
+      local time zone for backwards compatibility.
     """
     # Get timezone
     local_timezone = datetime.now().astimezone().tzinfo
@@ -182,48 +240,25 @@ def _format_api_dates(
     if all(pd.isna(dt) or dt == "" or dt is None for dt in datetime_input):
         return None
 
-    if len(datetime_input) <= 2:
-        # If the list is of length 1, first look for things like "P7D" or dates
-        # already formatted in ISO08601. Otherwise, try to coerce to datetime
-        if len(datetime_input) == 1 and (
-            re.search(r"P", datetime_input[0], re.IGNORECASE)
-            or "/" in datetime_input[0]
-        ):
-            return datetime_input[0]
-        # Otherwise, use list comprehension to parse dates
-        else:
-            try:
-                # Parse to naive datetime
-                parsed_dates = [
-                    datetime.strptime(dt, "%Y-%m-%d %H:%M:%S")  # noqa: DTZ007
-                    for dt in datetime_input
-                ]
-            except ValueError:
-                # Parse to date only
-                try:
-                    parsed_dates = [
-                        datetime.strptime(dt, "%Y-%m-%d")  # noqa: DTZ007
-                        for dt in datetime_input
-                    ]
-                except ValueError:
-                    return None
-            # If the service only accepts dates for this input, not
-            # datetimes (e.g. "daily"), return just the dates separated by a
-            # "/", otherwise, return the datetime in UTC format.
-            if date:
-                return "/".join(dt.strftime("%Y-%m-%d") for dt in parsed_dates)
-            else:
-                parsed_locals = [
-                    dt.replace(tzinfo=local_timezone) for dt in parsed_dates
-                ]
-                formatted = "/".join(
-                    dt.astimezone(ZoneInfo("UTC")).strftime("%Y-%m-%dT%H:%M:%SZ")
-                    for dt in parsed_locals
-                )
-                return formatted
-    else:
+    if len(datetime_input) > 2:
         raise ValueError("datetime_input should only include 1-2 values")
 
+    # Pass through duration ("P7D", "PT36H") and pre-formatted interval ("a/b")
+    # strings untouched.
+    if len(datetime_input) == 1 and isinstance(datetime_input[0], str):
+        single = datetime_input[0]
+        if _DURATION_RE.match(single) or "/" in single:
+            return single
+
+    # Half-bounded ranges: NA endpoints render as ".."; any unparseable non-NA
+    # element invalidates the range.
+    formatted = [
+        _format_one(dt, date=date, local_tz=local_timezone) for dt in datetime_input
+    ]
+    if any(f is None for f in formatted):
+        return None
+    return "/".join(formatted)
+
 
 def _cql2_param(args: dict[str, Any]) -> str:
     """
diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py
index c533431d..d8d654b4 100644
--- a/tests/waterdata_utils_test.py
+++ b/tests/waterdata_utils_test.py
@@ -3,6 +3,7 @@
 import requests
 
 from dataretrieval.waterdata.utils import (
+    _format_api_dates,
     _get_args,
     _handle_stats_nesting,
     _walk_pages,
@@ -111,3 +112,77 @@ def test_handle_stats_nesting_tolerates_missing_drop_columns():
 
     assert len(df) == 1
     assert df["monitoring_location_id"].iloc[0] == "USGS-12345"
+
+
+# --- _format_api_dates -------------------------------------------------------
+
+
+def test_format_api_dates_iso8601_with_z():
+    """ISO 8601 datetimes with a 'Z' suffix must be parsed, not dropped to None."""
+    assert _format_api_dates("2018-02-12T23:20:50Z") == "2018-02-12T23:20:50Z"
+
+
+def test_format_api_dates_iso8601_with_fractional_seconds():
+    assert _format_api_dates("2018-02-12T23:20:50.123Z") == "2018-02-12T23:20:50Z"
+
+
+def test_format_api_dates_iso8601_with_offset():
+    """Numeric offsets must be converted to UTC."""
+    assert _format_api_dates("2018-02-12T19:20:50-04:00") == "2018-02-12T23:20:50Z"
+
+
+def test_format_api_dates_iso8601_pair():
+    """A list of two ISO 8601 datetimes must be parsed into a UTC interval."""
+    result = _format_api_dates(["2018-02-12T23:20:50Z", "2018-03-18T12:31:12Z"])
+    assert result == "2018-02-12T23:20:50Z/2018-03-18T12:31:12Z"
+
+
+def test_format_api_dates_passthrough_interval():
+    assert _format_api_dates("2018-02-12T00:00:00Z/..") == "2018-02-12T00:00:00Z/.."
+
+
+def test_format_api_dates_passthrough_duration():
+    assert _format_api_dates("P7D") == "P7D"
+
+
+def test_format_api_dates_passthrough_time_only_duration():
+    """ISO 8601 time-only durations (PT...) are passed through unchanged."""
+    assert _format_api_dates("PT36H") == "PT36H"
+
+
+def test_format_api_dates_word_with_p_is_not_a_duration():
+    """Strings containing the letter 'p' must not be misclassified as durations."""
+    assert _format_api_dates("Apr") is None
+
+
+def test_format_api_dates_date_only():
+    assert _format_api_dates("2024-01-01", date=True) == "2024-01-01"
+
+
+def test_format_api_dates_date_only_pair():
+    assert (
+        _format_api_dates(["2024-01-01", "2024-02-01"], date=True)
+        == "2024-01-01/2024-02-01"
+    )
+
+
+def test_format_api_dates_space_separated_still_works():
+    """The legacy space-separated format must still parse."""
+    assert _format_api_dates("2024-01-01 00:00:00", date=True) == "2024-01-01"
+
+
+def test_format_api_dates_open_ended_range_with_none():
+    """A None / NaN endpoint becomes '..' in the output range."""
+    assert _format_api_dates(["2024-01-01", None], date=True) == "2024-01-01/.."
+    assert _format_api_dates([None, "2024-01-01"], date=True) == "../2024-01-01"
+
+
+def test_format_api_dates_lowercase_duration_passthrough():
+    """Duration designators are matched case-insensitively, as before the rewrite."""
+    assert _format_api_dates("p7d") == "p7d"
+    assert _format_api_dates("pt36h") == "pt36h"
+
+
+def test_format_api_dates_lowercase_z_suffix():
+    """A lower-case ``z`` UTC designator is treated like ``Z``."""
+    assert _format_api_dates("2018-02-12T23:20:50z") == "2018-02-12T23:20:50Z"