All tweaks added

2026-06-06 18:10:35 +01:00
parent 043b3b7269
commit ec5a01d12a
92 changed files with 7517 additions and 784 deletions
@@ -0,0 +1,190 @@
+"""
+Route matching: identifies when multiple activities were on the same route.
+Uses a bounding-box pre-filter + dynamic time warping (DTW) for GPS track similarity.
+"""
+import math
+from typing import Optional
+import polyline as polyline_lib
+import numpy as np
+
+
+def decode_polyline_to_coords(encoded: str) -> list[tuple[float, float]]:
+    return polyline_lib.decode(encoded)
+
+
+def bounding_boxes_overlap(bb1: dict, bb2: dict, tolerance_deg: float = 0.005) -> bool:
+    """Quick check: do two bounding boxes overlap (with a tolerance margin)?"""
+    return (
+        bb1["min_lat"] - tolerance_deg <= bb2["max_lat"] + tolerance_deg and
+        bb1["max_lat"] + tolerance_deg >= bb2["min_lat"] - tolerance_deg and
+        bb1["min_lon"] - tolerance_deg <= bb2["max_lon"] + tolerance_deg and
+        bb1["max_lon"] + tolerance_deg >= bb2["min_lon"] - tolerance_deg
+    )
+
+
+def sample_coords(coords: list[tuple], n: int = 100) -> list[tuple]:
+    """Downsample a track to n evenly-spaced points for DTW efficiency."""
+    if len(coords) <= n:
+        return coords
+    indices = [int(i * (len(coords) - 1) / (n - 1)) for i in range(n)]
+    return [coords[i] for i in indices]
+
+
+def dtw_distance(track1: list[tuple], track2: list[tuple]) -> float:
+    """
+    Compute DTW distance between two GPS tracks.
+    Each point is (lat, lon). Returns average distance in metres per matched pair.
+    """
+    n, m = len(track1), len(track2)
+    dtw = np.full((n + 1, m + 1), np.inf)
+    dtw[0][0] = 0.0
+
+    for i in range(1, n + 1):
+        for j in range(1, m + 1):
+            cost = haversine_m(track1[i-1], track2[j-1])
+            dtw[i][j] = cost + min(dtw[i-1][j], dtw[i][j-1], dtw[i-1][j-1])
+
+    return dtw[n][m] / max(n, m)
+
+
+def haversine_m(p1: tuple, p2: tuple) -> float:
+    R = 6371000
+    lat1, lon1 = math.radians(p1[0]), math.radians(p1[1])
+    lat2, lon2 = math.radians(p2[0]), math.radians(p2[1])
+    dlat = lat2 - lat1
+    dlon = lon2 - lon1
+    a = math.sin(dlat/2)**2 + math.cos(lat1)*math.cos(lat2)*math.sin(dlon/2)**2
+    return 2 * R * math.asin(math.sqrt(a))
+
+
+def routes_are_similar(
+    poly1: str,
+    poly2: str,
+    bb1: Optional[dict],
+    bb2: Optional[dict],
+    dtw_threshold_m: float = 80.0,
+) -> bool:
+    """
+    Returns True if two activities are on sufficiently similar routes.
+    First does a cheap bounding box check, then DTW on downsampled tracks.
+    """
+    if bb1 and bb2:
+        if not bounding_boxes_overlap(bb1, bb2):
+            return False
+
+    try:
+        coords1 = sample_coords(decode_polyline_to_coords(poly1), 60)
+        coords2 = sample_coords(decode_polyline_to_coords(poly2), 60)
+    except Exception:
+        return False
+
+    if not coords1 or not coords2:
+        return False
+
+    dist = dtw_distance(coords1, coords2)
+    return dist < dtw_threshold_m
+
+
+def find_segment_times(
+    data_points: list[dict],
+    start_dist_m: float,
+    end_dist_m: float,
+) -> Optional[float]:
+    """
+    Given activity data points (with cumulative distance_m),
+    find the time to traverse from start_dist_m to end_dist_m.
+    Returns duration in seconds, or None if not found.
+    """
+    start_time = None
+    end_time = None
+
+    for p in data_points:
+        dist = p.get("distance_m")
+        ts = p.get("timestamp")
+        if dist is None or ts is None:
+            continue
+
+        if start_time is None and dist >= start_dist_m:
+            start_time = ts
+
+        if start_time is not None and dist >= end_dist_m:
+            end_time = ts
+            break
+
+    if start_time and end_time:
+        from datetime import datetime
+        t1 = datetime.fromisoformat(start_time) if isinstance(start_time, str) else start_time
+        t2 = datetime.fromisoformat(end_time) if isinstance(end_time, str) else end_time
+        return (t2 - t1).total_seconds()
+
+    return None
+
+
+def find_best_split_time(
+    data_points: list[dict],
+    target_distance_m: float,
+) -> Optional[float]:
+    """
+    Find the best (fastest) time over any target_distance_m window within an activity.
+    E.g. fastest 1km split in a 10km run.
+    Returns duration in seconds.
+    """
+    points_with_dist = [
+        p for p in data_points
+        if p.get("distance_m") is not None and p.get("timestamp") is not None
+    ]
+
+    if not points_with_dist:
+        return None
+
+    best = None
+    j = 0
+
+    for i, start_p in enumerate(points_with_dist):
+        start_dist = start_p["distance_m"]
+        start_ts = start_p["timestamp"]
+
+        # Advance j until distance covered >= target
+        while j < len(points_with_dist):
+            end_p = points_with_dist[j]
+            covered = end_p["distance_m"] - start_dist
+            if covered >= target_distance_m:
+                from datetime import datetime
+                t1 = datetime.fromisoformat(start_ts) if isinstance(start_ts, str) else start_ts
+                t2 = datetime.fromisoformat(end_p["timestamp"]) if isinstance(end_p["timestamp"], str) else end_p["timestamp"]
+                duration = (t2 - t1).total_seconds()
+                if best is None or duration < best:
+                    best = duration
+                break
+            j += 1
+
+        if j >= len(points_with_dist):
+            break
+
+    return best
+
+
+# Benchmark distances for best-split computation, as (distance in metres,
+# display label) pairs in ascending order. compute_best_splits iterates this
+# list and uses the label as the key of its result dict.
+STANDARD_DISTANCES = [
+    (400, "400m"),
+    (800, "800m"),
+    (1000, "1k"),
+    (1609.34, "1 mile"),
+    (3000, "3k"),
+    (5000, "5k"),
+    (10000, "10k"),
+    (21097.5, "Half marathon"),
+    (42195, "Marathon"),
+    (50000, "50k"),
+    (100000, "100k"),
+]
+
+
+def compute_best_splits(data_points: list[dict], total_distance_m: float) -> dict[str, float]:
+    """Compute best split times for all standard distances that fit within the activity."""
+    results = {}
+    for dist_m, label in STANDARD_DISTANCES:
+        if total_distance_m >= dist_m * 0.95:  # allow 5% tolerance
+            best = find_best_split_time(data_points, dist_m)
+            if best:
+                results[label] = best
+    return results