diff --git a/tests/preprocessing/test_staypoints.py b/tests/preprocessing/test_staypoints.py
index d0cb1311..4ba4c267 100644
--- a/tests/preprocessing/test_staypoints.py
+++ b/tests/preprocessing/test_staypoints.py
@@ -258,6 +258,53 @@ def test_dbscan_user_dataset(self):
         assert loc_dataset_num == 1
         assert loc_user_num == 2
 
+    def test_user_locations_with_singleton_and_multi_staypoints(self):
+        """User-level location generation should handle singleton and multi-staypoint locations together.
+
+        The optimized user-level path skips the expensive dissolve operation for locations that contain only one
+        staypoint, while still dissolving locations with multiple staypoints. This test keeps both cases in one input
+        so center, extent, and staypoint linkage regressions are visible.
+        """
+        started_at = pd.Timestamp("1971-01-01 00:00:00", tz="utc")
+        finished_at = pd.Timestamp("1971-01-01 01:00:00", tz="utc")
+        sp = ti.Staypoints(
+            gpd.GeoDataFrame(
+                [
+                    {"id": 0, "user_id": 0, "started_at": started_at, "finished_at": finished_at, "geom": Point(0, 0)},
+                    {"id": 1, "user_id": 0, "started_at": started_at, "finished_at": finished_at, "geom": Point(0, 0)},
+                    {
+                        "id": 2,
+                        "user_id": 0,
+                        "started_at": started_at,
+                        "finished_at": finished_at,
+                        "geom": Point(100, 100),
+                    },
+                    {
+                        "id": 3,
+                        "user_id": 1,
+                        "started_at": started_at,
+                        "finished_at": finished_at,
+                        "geom": Point(200, 200),
+                    },
+                ],
+                geometry="geom",
+                crs="EPSG:2056",
+            ).set_index("id")
+        )
+
+        linked_sp, locs = sp.generate_locations(
+            method="dbscan", epsilon=1, num_samples=1, distance_metric="euclidean", agg_level="user"
+        )
+
+        assert len(locs) == 3
+        assert linked_sp.loc[0, "location_id"] == linked_sp.loc[1, "location_id"]
+        assert linked_sp.loc[2, "location_id"] != linked_sp.loc[0, "location_id"]
+        assert linked_sp.loc[3, "location_id"] != linked_sp.loc[0, "location_id"]
+        assert locs.geometry.notna().all()
+        assert locs["extent"].notna().all()
+        assert locs.loc[linked_sp.loc[2, "location_id"], "center"] == Point(100, 100)
+        assert locs.loc[linked_sp.loc[3, "location_id"], "center"] == Point(200, 200)
+
     def test_crs(self, example_staypoints):
         """Test whether the crs of the output locations is set correctly."""
         sp = example_staypoints
diff --git a/trackintel/geogr/distances.py b/trackintel/geogr/distances.py
index 5602defe..3d677480 100644
--- a/trackintel/geogr/distances.py
+++ b/trackintel/geogr/distances.py
@@ -1,7 +1,7 @@
 import itertools
 import math
 import warnings
-from math import cos, pi
+from math import pi
 
 import numpy as np
 import pandas as pd
@@ -207,7 +207,7 @@
     --------
     >>> meters_to_decimal_degrees(500.0, 47.410)
     """
-    return meters / (111.32 * 1000.0 * cos(latitude * (pi / 180.0)))
+    return meters / (111.32 * 1000.0 * np.cos(latitude * (pi / 180.0)))
 
 
 def check_gdf_planar(gdf, transform=False):
diff --git a/trackintel/preprocessing/staypoints.py b/trackintel/preprocessing/staypoints.py
index a35325d4..bedf5b3d 100644
--- a/trackintel/preprocessing/staypoints.py
+++ b/trackintel/preprocessing/staypoints.py
@@ -3,9 +3,10 @@
 import pandas as pd
 from sklearn.cluster import DBSCAN
 import warnings
+from tqdm import tqdm
 
 from trackintel import Staypoints, Locations
-from trackintel.geogr import meters_to_decimal_degrees, check_gdf_planar
+from trackintel.geogr import check_gdf_planar, meters_to_decimal_degrees
 from trackintel.preprocessing.util import applyParallel, angle_centroid_multipoints
 
 
@@ -98,14 +99,22 @@
     db = DBSCAN(eps=eps, min_samples=num_samples, algorithm="ball_tree", metric=distance_metric)
 
     if agg_level == "user":
-        sp = applyParallel(
-            sp.groupby("user_id", as_index=False),
-            _gen_locs_dbscan,
-            n_jobs=n_jobs,
-            print_progress=print_progress,
-            distance_metric=distance_metric,
-            db=db,
-        )
+        grouped = sp.groupby("user_id", as_index=False)
+        if n_jobs == 1:
+            result_list = [
+                _gen_locs_dbscan(group, distance_metric=distance_metric, db=db)
+                for _, group in tqdm(grouped, disable=not print_progress)
+            ]
+            sp = pd.concat(result_list) if result_list else sp.iloc[:0].copy()
+        else:
+            sp = applyParallel(
+                grouped,
+                _gen_locs_dbscan,
+                n_jobs=n_jobs,
+                print_progress=print_progress,
+                distance_metric=distance_metric,
+                db=db,
+            )
 
         # keeping track of noise labels
         sp_non_noise_labels = sp[sp["location_id"] != -1]
@@ -133,10 +142,22 @@
         _gen_locs_dbscan(sp, db=db, distance_metric=distance_metric)
 
     ### create locations as grouped staypoints
-    temp_sp = sp[["user_id", "location_id", sp.geometry.name]]
+    temp_sp = sp.loc[sp["location_id"] != -1, ["user_id", "location_id", sp.geometry.name]]
     if agg_level == "user":
-        # directly dissolve by 'user_id' and 'location_id'
-        locs = temp_sp.dissolve(by=["user_id", "location_id"], as_index=False)
+        # Dissolve only multi-staypoint locations. Singleton locations already have the target geometry.
+        duplicate_mask = temp_sp.duplicated(subset=["user_id", "location_id"], keep=False)
+        singleton_locs = temp_sp.loc[~duplicate_mask].copy()
+        multi_rows = temp_sp.loc[duplicate_mask]
+        if len(multi_rows) > 0:
+            multi_locs = multi_rows.dissolve(by=["user_id", "location_id"], as_index=False)
+            locs = gpd.GeoDataFrame(
+                pd.concat([singleton_locs, multi_locs], ignore_index=True),
+                geometry=geo_col,
+                crs=temp_sp.crs,
+            )
+        else:
+            locs = gpd.GeoDataFrame(singleton_locs, geometry=geo_col, crs=temp_sp.crs)
+        locs = locs.sort_values(["user_id", "location_id"])
     else:
         ## generate user-location pairs with same geometries across users
         # get user-location pairs
@@ -146,9 +167,6 @@
         # merge pairs with location geometries
         locs = geom_gdf.merge(locs, on="location_id", how="right")
 
-    # filter staypoints not belonging to locations
-    locs = locs.loc[locs["location_id"] != -1]
-
     if check_gdf_planar(locs):
         locs["center"] = locs.geometry.centroid
     else:
@@ -161,11 +179,13 @@
     # We create a buffer of distance epsilon around the convex_hull to denote location extent
     # Perform meter to decimal conversion if the distance metric is haversine
     if distance_metric == "haversine":
-        locs["extent"] = locs.apply(
-            lambda p: p["extent"].buffer(meters_to_decimal_degrees(epsilon, p["center"].y)),
-            axis=1,
-            result_type="reduce",
-        )
+        buffer_distance = meters_to_decimal_degrees(epsilon, locs["center"].y.to_numpy())
+        with warnings.catch_warnings():
+            warnings.filterwarnings(
+                "ignore",
+                message="Geometry is in a geographic CRS. Results from 'buffer' are likely incorrect.",
+            )
+            locs["extent"] = locs["extent"].buffer(buffer_distance)
     else:
         locs["extent"] = locs["extent"].buffer(epsilon)
 