Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 9 additions & 11 deletions dataretrieval/waterdata/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -713,18 +713,16 @@ def _arrange_cols(
# Rename id column to output_id
df = df.rename(columns={"id": output_id})

# If properties are provided, filter to only those columns
# plus geometry if skip_geometry is False
if properties and not all(pd.isna(properties)):
# Make sure geometry stays in the dataframe if skip_geometry is False
if "geometry" in df.columns and "geometry" not in properties:
properties.append("geometry")
# id is technically a valid column from the service, but these
# functions make the name more specific. So, if someone requests
# 'id', give them the output_id column
if "id" in properties:
properties[properties.index("id")] = output_id
df = df.loc[:, [col for col in properties if col in df.columns]]
# Don't alias the caller's list — we mutate below.
local_properties = list(properties)
if "geometry" in df.columns and "geometry" not in local_properties:
local_properties.append("geometry")
# 'id' is a valid service column, but expose it under the
# service-specific output_id name instead.
if "id" in local_properties:
local_properties[local_properties.index("id")] = output_id
df = df.loc[:, [col for col in local_properties if col in df.columns]]

# Move meaningless-to-user, extra id columns to the end
# of the dataframe, if they exist
Expand Down
46 changes: 46 additions & 0 deletions tests/waterdata_utils_test.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from unittest import mock

import pandas as pd
import requests

from dataretrieval.waterdata.utils import (
_arrange_cols,
_format_api_dates,
_get_args,
_handle_stats_nesting,
Expand Down Expand Up @@ -114,6 +116,50 @@ def test_handle_stats_nesting_tolerates_missing_drop_columns():
assert df["monitoring_location_id"].iloc[0] == "USGS-12345"


# --- _arrange_cols ----------------------------------------------------------


def test_arrange_cols_does_not_mutate_caller_properties():
"""`_arrange_cols` must not mutate the caller's `properties` list.

Regression: previously the function did
``properties.append("geometry")`` and
``properties[properties.index("id")] = output_id`` in place, so the
caller's list grew and was rewritten across successive calls.
"""
df = pd.DataFrame(
{
"id": ["a", "b"],
"value": [1.0, 2.0],
"geometry": ["p1", "p2"],
}
)
properties = ["id", "value"]
snapshot = list(properties)

_arrange_cols(df, properties, output_id="daily_id")
_arrange_cols(df, properties, output_id="daily_id")

assert properties == snapshot, (
f"caller's properties list was mutated: {properties!r} != {snapshot!r}"
)


def test_arrange_cols_swaps_id_in_returned_columns():
"""`'id'` in `properties` should still resolve to the output_id column."""
df = pd.DataFrame({"id": ["a"], "value": [1.0]})
result = _arrange_cols(df, ["id", "value"], output_id="daily_id")
assert "daily_id" in result.columns
assert "id" not in result.columns


def test_arrange_cols_keeps_geometry_when_present():
"""Geometry must come along even if the caller didn't list it."""
df = pd.DataFrame({"id": ["a"], "value": [1.0], "geometry": ["p1"]})
result = _arrange_cols(df, ["value"], output_id="daily_id")
assert "geometry" in result.columns


# --- _format_api_dates -------------------------------------------------------


Expand Down
Loading