Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions tests/preprocessing/test_staypoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,53 @@ def test_dbscan_user_dataset(self):
assert loc_dataset_num == 1
assert loc_user_num == 2

def test_user_locations_with_singleton_and_multi_staypoints(self):
    """User-level location generation should handle singleton and multi-staypoint locations together.

    The optimized user-level path skips the expensive dissolve operation for locations that contain only one
    staypoint, while still dissolving locations with multiple staypoints. This test keeps both cases in one input
    so center, extent, and staypoint linkage regressions are visible.
    """
    t_start = pd.Timestamp("1971-01-01 00:00:00", tz="utc")
    t_end = pd.Timestamp("1971-01-01 01:00:00", tz="utc")

    def _sp_row(sp_id, user, point):
        # Build one staypoint record; all rows share the same time window.
        return {
            "id": sp_id,
            "user_id": user,
            "started_at": t_start,
            "finished_at": t_end,
            "geom": point,
        }

    records = [
        _sp_row(0, 0, Point(0, 0)),       # multi-staypoint cluster for user 0 ...
        _sp_row(1, 0, Point(0, 0)),       # ... (two identical points)
        _sp_row(2, 0, Point(100, 100)),   # singleton location for user 0
        _sp_row(3, 1, Point(200, 200)),   # singleton location for user 1
    ]
    gdf = gpd.GeoDataFrame(records, geometry="geom", crs="EPSG:2056").set_index("id")
    sp = ti.Staypoints(gdf)

    linked_sp, locs = sp.generate_locations(
        method="dbscan", epsilon=1, num_samples=1, distance_metric="euclidean", agg_level="user"
    )

    # Three distinct locations: one two-staypoint cluster plus two singletons.
    assert len(locs) == 3
    loc_of = linked_sp["location_id"]
    assert loc_of.loc[0] == loc_of.loc[1]
    assert loc_of.loc[2] != loc_of.loc[0]
    assert loc_of.loc[3] != loc_of.loc[0]
    # Every location must have a geometry and an extent, singletons included.
    assert locs.geometry.notna().all()
    assert locs["extent"].notna().all()
    # Singleton locations keep the exact staypoint coordinates as their center.
    assert locs.loc[loc_of.loc[2], "center"] == Point(100, 100)
    assert locs.loc[loc_of.loc[3], "center"] == Point(200, 200)

def test_crs(self, example_staypoints):
"""Test whether the crs of the output locations is set correctly."""
sp = example_staypoints
Expand Down
4 changes: 2 additions & 2 deletions trackintel/geogr/distances.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import itertools
import math
import warnings
from math import cos, pi
from math import pi

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -207,7 +207,7 @@ def meters_to_decimal_degrees(meters, latitude):
--------
>>> meters_to_decimal_degrees(500.0, 47.410)
"""
return meters / (111.32 * 1000.0 * cos(latitude * (pi / 180.0)))
return meters / (111.32 * 1000.0 * np.cos(latitude * (pi / 180.0)))


def check_gdf_planar(gdf, transform=False):
Expand Down
60 changes: 40 additions & 20 deletions trackintel/preprocessing/staypoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
import pandas as pd
from sklearn.cluster import DBSCAN
import warnings
from tqdm import tqdm

from trackintel import Staypoints, Locations
from trackintel.geogr import meters_to_decimal_degrees, check_gdf_planar
from trackintel.geogr import check_gdf_planar, meters_to_decimal_degrees
from trackintel.preprocessing.util import applyParallel, angle_centroid_multipoints


Expand Down Expand Up @@ -98,14 +99,22 @@ def generate_locations(
db = DBSCAN(eps=eps, min_samples=num_samples, algorithm="ball_tree", metric=distance_metric)

if agg_level == "user":
sp = applyParallel(
sp.groupby("user_id", as_index=False),
_gen_locs_dbscan,
n_jobs=n_jobs,
print_progress=print_progress,
distance_metric=distance_metric,
db=db,
)
grouped = sp.groupby("user_id", as_index=False)
if n_jobs == 1:
result_list = [
_gen_locs_dbscan(group, distance_metric=distance_metric, db=db)
for _, group in tqdm(grouped, disable=not print_progress)
]
sp = pd.concat(result_list) if result_list else sp.iloc[:0].copy()
else:
sp = applyParallel(
grouped,
_gen_locs_dbscan,
n_jobs=n_jobs,
print_progress=print_progress,
distance_metric=distance_metric,
db=db,
)

# keeping track of noise labels
sp_non_noise_labels = sp[sp["location_id"] != -1]
Expand Down Expand Up @@ -133,10 +142,22 @@ def generate_locations(
_gen_locs_dbscan(sp, db=db, distance_metric=distance_metric)

### create locations as grouped staypoints
temp_sp = sp[["user_id", "location_id", sp.geometry.name]]
temp_sp = sp.loc[sp["location_id"] != -1, ["user_id", "location_id", sp.geometry.name]]
if agg_level == "user":
# directly dissolve by 'user_id' and 'location_id'
locs = temp_sp.dissolve(by=["user_id", "location_id"], as_index=False)
# Dissolve only multi-staypoint locations. Singleton locations already have the target geometry.
duplicate_mask = temp_sp.duplicated(subset=["user_id", "location_id"], keep=False)
singleton_locs = temp_sp.loc[~duplicate_mask].copy()
multi_rows = temp_sp.loc[duplicate_mask]
if len(multi_rows) > 0:
multi_locs = multi_rows.dissolve(by=["user_id", "location_id"], as_index=False)
locs = gpd.GeoDataFrame(
pd.concat([singleton_locs, multi_locs], ignore_index=True),
geometry=geo_col,
crs=temp_sp.crs,
)
else:
locs = gpd.GeoDataFrame(singleton_locs, geometry=geo_col, crs=temp_sp.crs)
locs = locs.sort_values(["user_id", "location_id"])
else:
## generate user-location pairs with same geometries across users
# get user-location pairs
Expand All @@ -146,9 +167,6 @@ def generate_locations(
# merge pairs with location geometries
locs = geom_gdf.merge(locs, on="location_id", how="right")

# filter staypoints not belonging to locations
locs = locs.loc[locs["location_id"] != -1]

if check_gdf_planar(locs):
locs["center"] = locs.geometry.centroid
else:
Expand All @@ -161,11 +179,13 @@ def generate_locations(
# We create a buffer of distance epsilon around the convex_hull to denote location extent
# Perform meter to decimal conversion if the distance metric is haversine
if distance_metric == "haversine":
locs["extent"] = locs.apply(
lambda p: p["extent"].buffer(meters_to_decimal_degrees(epsilon, p["center"].y)),
axis=1,
result_type="reduce",
)
buffer_distance = meters_to_decimal_degrees(epsilon, locs["center"].y.to_numpy())
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
message="Geometry is in a geographic CRS. Results from 'buffer' are likely incorrect.",
)
locs["extent"] = locs["extent"].buffer(buffer_distance)
else:
locs["extent"] = locs["extent"].buffer(epsilon)

Expand Down
Loading