diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index f35496b7..784f2969 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -713,18 +713,16 @@ def _arrange_cols( # Rename id column to output_id df = df.rename(columns={"id": output_id}) - # If properties are provided, filter to only those columns - # plus geometry if skip_geometry is False if properties and not all(pd.isna(properties)): - # Make sure geometry stays in the dataframe if skip_geometry is False - if "geometry" in df.columns and "geometry" not in properties: - properties.append("geometry") - # id is technically a valid column from the service, but these - # functions make the name more specific. So, if someone requests - # 'id', give them the output_id column - if "id" in properties: - properties[properties.index("id")] = output_id - df = df.loc[:, [col for col in properties if col in df.columns]] + # Don't alias the caller's list — we mutate below. + local_properties = list(properties) + if "geometry" in df.columns and "geometry" not in local_properties: + local_properties.append("geometry") + # 'id' is a valid service column, but expose it under the + # service-specific output_id name instead. + if "id" in local_properties: + local_properties[local_properties.index("id")] = output_id + df = df.loc[:, [col for col in local_properties if col in df.columns]] # Move meaningless-to-user, extra id columns to the end # of the dataframe, if they exist diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index d8d654b4..8151ab37 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -1,8 +1,10 @@ from unittest import mock +import pandas as pd import requests from dataretrieval.waterdata.utils import ( + _arrange_cols, _format_api_dates, _get_args, _handle_stats_nesting, @@ -114,6 +116,50 @@ def test_handle_stats_nesting_tolerates_missing_drop_columns(): assert df["monitoring_location_id"].iloc[0] == "USGS-12345" +# --- _arrange_cols ---------------------------------------------------------- + + +def test_arrange_cols_does_not_mutate_caller_properties(): + """`_arrange_cols` must not mutate the caller's `properties` list. + + Regression: previously the function did + ``properties.append("geometry")`` and + ``properties[properties.index("id")] = output_id`` in place, so the + caller's list grew and was rewritten across successive calls. + """ + df = pd.DataFrame( + { + "id": ["a", "b"], + "value": [1.0, 2.0], + "geometry": ["p1", "p2"], + } + ) + properties = ["id", "value"] + snapshot = list(properties) + + _arrange_cols(df, properties, output_id="daily_id") + _arrange_cols(df, properties, output_id="daily_id") + + assert properties == snapshot, ( + f"caller's properties list was mutated: {properties!r} != {snapshot!r}" + ) + + +def test_arrange_cols_swaps_id_in_returned_columns(): + """`'id'` in `properties` should still resolve to the output_id column.""" + df = pd.DataFrame({"id": ["a"], "value": [1.0]}) + result = _arrange_cols(df, ["id", "value"], output_id="daily_id") + assert "daily_id" in result.columns + assert "id" not in result.columns + + +def test_arrange_cols_keeps_geometry_when_present(): + """Geometry must come along even if the caller didn't list it.""" + df = pd.DataFrame({"id": ["a"], "value": [1.0], "geometry": ["p1"]}) + result = _arrange_cols(df, ["value"], output_id="daily_id") + assert "geometry" in result.columns + + # --- _format_api_dates -------------------------------------------------------