Initial Commit

This commit is contained in:
2026-06-06 13:23:33 +01:00
commit 1a0d45dd67
58 changed files with 5268 additions and 0 deletions
View File
+341
View File
@@ -0,0 +1,341 @@
"""
Parses Garmin .fit files and GPX files into normalized activity data.
Handles full Strava and Garmin data export archives.
"""
import os
import zipfile
import json
import math
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional
import fitparse
import gpxpy
import polyline as polyline_lib
def haversine_distance(lat1, lon1, lat2, lon2) -> float:
    """Great-circle distance in metres between two GPS points.

    All four arguments are decimal degrees. The haversine formula is
    evaluated on a sphere with a radius of 6,371,000 m.
    """
    earth_radius_m = 6371000
    lat1_rad = math.radians(lat1)
    lat2_rad = math.radians(lat2)
    half_dlat = math.radians(lat2 - lat1) / 2
    half_dlon = math.radians(lon2 - lon1) / 2
    # Squared half-chord length between the two points on the unit sphere.
    chord = (
        math.sin(half_dlat) ** 2
        + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(half_dlon) ** 2
    )
    return 2 * earth_radius_m * math.asin(math.sqrt(chord))
def semicircles_to_degrees(sc: int) -> float:
    """Convert a coordinate expressed in semicircles to decimal degrees.

    The scale is 180 degrees per 2**31 semicircles.
    """
    degrees_per_semicircle = 180 / 2**31
    return sc * degrees_per_semicircle
def parse_fit_file(filepath: str) -> dict:
    """Parse a Garmin .fit file and return a normalized activity dict.

    Every message in the file is read: ``session`` fields supply the
    activity summary, each ``lap`` message becomes an entry in ``laps`` and
    each non-empty ``record`` message becomes an entry in ``data_points``.
    Naive timestamps are tagged as UTC and record positions are converted
    from semicircles to degrees.

    Args:
        filepath: Path of the .fit file to read.

    Returns:
        A dict holding the summary metrics, the encoded ``polyline`` and
        ``bounding_box`` of the GPS track (both ``None`` without GPS data),
        ``source_type`` set to ``"fit"``, and the ``data_points`` and
        ``laps`` lists. Metrics missing from the file are ``None``.
    """
    def as_utc(moment):
        # Naive values are tagged as UTC; None and aware values pass through.
        if moment and moment.tzinfo is None:
            return moment.replace(tzinfo=timezone.utc)
        return moment

    fit = fitparse.FitFile(filepath)
    session = {}
    laps = []
    data_points = []
    for message in fit.get_messages():
        kind = message.name
        if kind == "session":
            # Fields of several session messages are merged; later values win.
            session.update({field.name: field.value for field in message})
        elif kind == "lap":
            laps.append({field.name: field.value for field in message})
        elif kind == "record":
            point = {field.name: field.value for field in message}
            if not point:
                continue
            # Convert semicircles to degrees
            for key in ("position_lat", "position_long"):
                if point.get(key) is not None:
                    point[key] = semicircles_to_degrees(point[key])
            data_points.append(point)

    # Build normalized output. The sport field can be present but None, or a
    # non-string value, so it is normalized to text before any str method.
    raw_sport = session.get("sport")
    sport_label = str(raw_sport) if raw_sport is not None else None
    sport = (sport_label or "generic").lower()
    sport_map = {
        "running": "running", "cycling": "cycling", "swimming": "swimming",
        "hiking": "hiking", "walking": "walking", "generic": "other",
        "open_water_swimming": "swimming", "trail_running": "running",
    }
    sport_type = sport_map.get(sport, sport)
    start_time = as_utc(session.get("start_time"))

    # Build GPS track for polyline
    coords = [
        (p["position_lat"], p["position_long"])
        for p in data_points
        if p.get("position_lat") is not None and p.get("position_long") is not None
    ]
    encoded_polyline = polyline_lib.encode(coords) if coords else None
    bounding_box = _bounding_box(coords)

    # Calculate cumulative distance if not in FIT
    cumulative_dist = 0.0
    prev_fix = None
    normalized_points = []
    for p in data_points:
        ts = as_utc(p.get("timestamp"))
        lat = p.get("position_lat")
        lon = p.get("position_long")
        # Explicit None checks: 0.0 is a valid latitude/longitude.
        has_fix = lat is not None and lon is not None
        dist = p.get("distance")
        if dist is not None:
            # Trust the device's own odometer and resynchronize to it.
            cumulative_dist = float(dist)
        elif has_fix and prev_fix is not None:
            cumulative_dist += haversine_distance(prev_fix[0], prev_fix[1], lat, lon)
            dist = cumulative_dist
        if has_fix:
            prev_fix = (lat, lon)
        normalized_points.append({
            "timestamp": ts.isoformat() if ts else None,
            "latitude": lat,
            "longitude": lon,
            "altitude_m": p.get("altitude"),
            "heart_rate": p.get("heart_rate"),
            "cadence": p.get("cadence"),
            "speed_ms": p.get("speed"),
            "power": p.get("power"),
            "temperature_c": p.get("temperature"),
            "distance_m": dist,
        })

    # Parse laps
    normalized_laps = []
    for lap_number, lap in enumerate(laps, start=1):
        lap_start = as_utc(lap.get("start_time"))
        normalized_laps.append({
            "lap_number": lap_number,
            "start_time": lap_start.isoformat() if lap_start else None,
            "duration_s": _safe_float(lap.get("total_elapsed_time")),
            "distance_m": _safe_float(lap.get("total_distance")),
            "avg_heart_rate": _safe_float(lap.get("avg_heart_rate")),
            "avg_cadence": _safe_float(lap.get("avg_cadence")),
            "avg_speed_ms": _safe_float(lap.get("avg_speed")),
            "avg_power": _safe_float(lap.get("avg_power")),
        })

    date_label = start_time.strftime("%Y-%m-%d") if start_time else ""
    return {
        "name": (sport_label or "Activity").title() + " " + date_label,
        "sport_type": sport_type,
        "start_time": start_time.isoformat() if start_time else None,
        "distance_m": _safe_float(session.get("total_distance")),
        "duration_s": _safe_float(session.get("total_elapsed_time")),
        "elevation_gain_m": _safe_float(session.get("total_ascent")),
        "elevation_loss_m": _safe_float(session.get("total_descent")),
        "avg_heart_rate": _safe_float(session.get("avg_heart_rate")),
        "max_heart_rate": _safe_float(session.get("max_heart_rate")),
        "avg_cadence": _safe_float(session.get("avg_cadence")),
        "avg_power": _safe_float(session.get("avg_power")),
        "normalized_power": _safe_float(session.get("normalized_power")),
        "avg_speed_ms": _safe_float(session.get("avg_speed")),
        "max_speed_ms": _safe_float(session.get("max_speed")),
        "avg_temperature_c": _safe_float(session.get("avg_temperature")),
        "calories": _safe_float(session.get("total_calories")),
        "training_stress_score": _safe_float(session.get("training_stress_score")),
        # This used to be filled from "estimated_sweat_loss", which is a
        # different quantity. No VO2max field is read here, so report None
        # instead of a mislabelled value.
        # NOTE(review): wire up the device-specific VO2max field once known.
        "vo2max_estimate": None,
        "polyline": encoded_polyline,
        "bounding_box": bounding_box,
        "source_type": "fit",
        "data_points": normalized_points,
        "laps": normalized_laps,
    }
def _gpx_extension_values(point) -> dict:
    """Collect numeric <extensions> values of a GPX point, keyed by bare tag name."""
    values = {}
    for ext in point.extensions or ():
        # iter() yields the element itself and all of its descendants, so
        # both top-level tags and tags nested in a wrapper element are read.
        for node in ext.iter():
            tag = node.tag
            if not isinstance(tag, str):
                continue  # comments / processing instructions have no string tag
            tag = tag.split("}")[-1]  # drop the "{namespace}" prefix, if any
            try:
                values[tag] = float(node.text)
            except (ValueError, TypeError):
                # Non-numeric or empty text (e.g. a wrapper element): skip it.
                pass
    return values
def parse_gpx_file(filepath: str) -> dict:
    """Parse a GPX file into a normalized activity dict.

    Only the first track of the file is read. Point timestamps without a
    timezone are tagged as UTC. Distance is accumulated with
    ``haversine_distance`` between consecutive points, elevation gain/loss
    is summed from consecutive altitude differences, and the duration is
    the span between the first and last timestamped points.

    Args:
        filepath: Path of the GPX file to read.

    Returns:
        A dict with the same keys as the FIT parser's output, with
        ``source_type`` set to ``"gpx"`` and an empty ``laps`` list.
        Metrics that are not derived here are ``None``.

    Raises:
        ValueError: If the file contains no tracks.
    """
    # GPX is XML; read it as UTF-8 rather than the locale's default encoding.
    with open(filepath, encoding="utf-8") as f:
        gpx = gpxpy.parse(f)
    track = gpx.tracks[0] if gpx.tracks else None
    if not track:
        raise ValueError("No tracks found in GPX file")

    data_points = []
    for segment in track.segments:
        for pt in segment.points:
            ts = pt.time
            if ts and ts.tzinfo is None:
                ts = ts.replace(tzinfo=timezone.utc)
            extensions = _gpx_extension_values(pt)
            # Explicit None check so a 0 degree reading is not replaced.
            temperature = extensions.get("temp")
            if temperature is None:
                temperature = extensions.get("atemp")
            data_points.append({
                "timestamp": ts.isoformat() if ts else None,
                "latitude": pt.latitude,
                "longitude": pt.longitude,
                "altitude_m": pt.elevation,
                "heart_rate": extensions.get("hr"),
                "cadence": extensions.get("cad"),
                "speed_ms": extensions.get("speed"),
                "power": extensions.get("power"),
                "temperature_c": temperature,
                "distance_m": None,
            })

    def has_fix(p):
        # 0.0 is a valid coordinate, so test for None rather than truthiness.
        return p["latitude"] is not None and p["longitude"] is not None

    # Calculate distance and elevation
    coords = [(p["latitude"], p["longitude"]) for p in data_points if has_fix(p)]
    encoded_polyline = polyline_lib.encode(coords) if coords else None
    bounding_box = _bounding_box(coords)

    # Add cumulative distance
    total_dist = 0.0
    prev = None
    for p in data_points:
        if has_fix(p):
            if prev is not None:
                total_dist += haversine_distance(prev[0], prev[1], p["latitude"], p["longitude"])
            prev = (p["latitude"], p["longitude"])
        p["distance_m"] = total_dist

    uphill, downhill = 0.0, 0.0
    # Keep sea-level (0.0) samples; dropping them would distort the totals.
    alts = [p["altitude_m"] for p in data_points if p["altitude_m"] is not None]
    for lower, upper in zip(alts, alts[1:]):
        diff = upper - lower
        if diff > 0:
            uphill += diff
        else:
            downhill += abs(diff)

    # A heart rate of 0 is treated as "no reading", hence the truthiness test.
    hrs = [p["heart_rate"] for p in data_points if p["heart_rate"]]

    # Use the first and last points that actually carry a timestamp, so a
    # missing timestamp at either end cannot crash the duration calculation.
    stamps = [p["timestamp"] for p in data_points if p["timestamp"]]
    start_time_str = stamps[0] if stamps else None
    start_dt = datetime.fromisoformat(start_time_str) if start_time_str else None
    end_dt = datetime.fromisoformat(stamps[-1]) if stamps else None
    duration = (end_dt - start_dt).total_seconds() if (start_dt and end_dt) else None

    sport = "running"  # GPX doesn't always include sport; default to running
    if track.type:
        sport = track.type.lower()

    return {
        "name": track.name or gpx.name or f"Activity {start_dt.date() if start_dt else ''}",
        "sport_type": sport,
        "start_time": start_time_str,
        "distance_m": total_dist,
        "duration_s": duration,
        "elevation_gain_m": uphill,
        "elevation_loss_m": downhill,
        "avg_heart_rate": (sum(hrs) / len(hrs)) if hrs else None,
        "max_heart_rate": max(hrs) if hrs else None,
        "avg_cadence": None,
        "avg_power": None,
        "normalized_power": None,
        "avg_speed_ms": (total_dist / duration) if (total_dist and duration) else None,
        "max_speed_ms": None,
        "avg_temperature_c": None,
        "calories": None,
        "training_stress_score": None,
        "vo2max_estimate": None,
        "polyline": encoded_polyline,
        "bounding_box": bounding_box,
        "source_type": "gpx",
        "data_points": data_points,
        "laps": [],
    }
def parse_strava_export(export_dir: str) -> list[dict]:
    """Parse a full Strava data export directory.

    Structure: activities.csv + activities/ folder with .gpx/.fit files,
    optionally gzip-compressed (.gpx.gz/.fit.gz). Files are processed in
    sorted order and suffixes are matched case-insensitively. Other file
    types are ignored. A file that fails to parse is reported on stdout
    and skipped, so one bad file does not abort the import.

    Args:
        export_dir: Root directory of the unpacked Strava export.

    Returns:
        One normalized activity dict per parsed file, with ``source_type``
        set to ``"strava_fit"`` or ``"strava_gpx"``. The list is empty when
        the ``activities`` folder is missing.
    """
    import gzip
    import shutil
    import tempfile

    activities = []
    activities_dir = Path(export_dir) / "activities"
    if not activities_dir.is_dir():
        return activities

    for fname in sorted(activities_dir.iterdir()):
        suffixes = [s.lower() for s in fname.suffixes]
        compressed = bool(suffixes) and suffixes[-1] == ".gz"
        if compressed:
            suffixes = suffixes[:-1]
        kind = suffixes[-1] if suffixes else ""
        if kind not in (".fit", ".gpx"):
            continue
        try:
            if compressed:
                # The parsers take a file path, so inflate to a temp file first.
                with tempfile.TemporaryDirectory() as tmp_dir:
                    unpacked = Path(tmp_dir) / fname.stem
                    with gzip.open(fname, "rb") as src, open(unpacked, "wb") as dst:
                        shutil.copyfileobj(src, dst)
                    target = str(unpacked)
                    act = parse_fit_file(target) if kind == ".fit" else parse_gpx_file(target)
            else:
                target = str(fname)
                act = parse_fit_file(target) if kind == ".fit" else parse_gpx_file(target)
            act["source_type"] = "strava_" + kind[1:]
            activities.append(act)
        except Exception as e:
            # Deliberate best-effort: report the failure and keep going.
            print(f"Error parsing {fname}: {e}")
    return activities
def calculate_hr_zones(data_points: list[dict], max_hr: float) -> dict:
    """Return the percentage of heart-rate samples falling in each HR zone.

    Zones are defined as fractions of ``max_hr``: z1 below 60%, z2 below
    70%, z3 below 80%, z4 below 90% and z5 for everything else. Points
    without a (truthy) ``heart_rate`` are ignored. An empty dict is returned
    when ``max_hr`` is falsy or no point carries a heart rate.
    """
    if not max_hr:
        return {}

    # Exclusive upper limit of each zone; anything at or above the last is z5.
    ceilings = (("z1", 0.6), ("z2", 0.7), ("z3", 0.8), ("z4", 0.9))
    counts = {"z1": 0, "z2": 0, "z3": 0, "z4": 0, "z5": 0}
    samples = 0
    for point in data_points:
        rate = point.get("heart_rate")
        if not rate:
            continue
        fraction = rate / max_hr
        samples += 1
        for zone, ceiling in ceilings:
            if fraction < ceiling:
                break
        else:
            zone = "z5"
        counts[zone] += 1

    if not samples:
        return {}
    return {zone: round(hits / samples * 100, 1) for zone, hits in counts.items()}
def _safe_float(val) -> Optional[float]:
try:
return float(val) if val is not None else None
except (TypeError, ValueError):
return None
def _bounding_box(coords: list[tuple]) -> Optional[dict]:
if not coords:
return None
lats = [c[0] for c in coords]
lons = [c[1] for c in coords]
return {
"min_lat": min(lats), "max_lat": max(lats),
"min_lon": min(lons), "max_lon": max(lons),
}
+190
View File
@@ -0,0 +1,190 @@
"""
Route matching: identifies when multiple activities were on the same route.
Uses a bounding-box pre-filter + dynamic time warping (DTW) for GPS track similarity.
"""
import math
from typing import Optional
import polyline as polyline_lib
import numpy as np
def decode_polyline_to_coords(encoded: str) -> list[tuple[float, float]]:
    """Decode an encoded polyline string into its list of coordinate pairs.

    Thin wrapper around ``polyline.decode``; any error it raises propagates.
    """
    coords = polyline_lib.decode(encoded)
    return coords
def bounding_boxes_overlap(bb1: dict, bb2: dict, tolerance_deg: float = 0.005) -> bool:
    """Cheap pre-filter: do two bounding boxes overlap once padded?

    Both boxes are grown by ``tolerance_deg`` on every side before the
    comparison, so boxes up to twice the tolerance apart still count as
    overlapping. Each box is a dict with ``min_lat``, ``max_lat``,
    ``min_lon`` and ``max_lon``.
    """
    for low_key, high_key in (("min_lat", "max_lat"), ("min_lon", "max_lon")):
        # The padded intervals must intersect on this axis.
        if not bb1[low_key] - tolerance_deg <= bb2[high_key] + tolerance_deg:
            return False
        if not bb1[high_key] + tolerance_deg >= bb2[low_key] - tolerance_deg:
            return False
    return True
def sample_coords(coords: list[tuple], n: int = 100) -> list[tuple]:
    """Downsample a track to ``n`` evenly-spaced points for DTW efficiency.

    Args:
        coords: The track's points, in order.
        n: Maximum number of points to keep.

    Returns:
        ``coords`` itself (not a copy) when it already has at most ``n``
        points. Otherwise a new list of ``n`` points spread evenly by index,
        always including the first and, for ``n >= 2``, the last point.
        ``n <= 0`` gives an empty list and ``n == 1`` gives the first point.
    """
    total = len(coords)
    if total <= n:
        return coords
    if n <= 0:
        return []
    if n == 1:
        # The even-spacing formula divides by (n - 1); handle this case apart.
        return [coords[0]]
    last = total - 1
    # Integer arithmetic keeps the indices exact for any track length.
    return [coords[i * last // (n - 1)] for i in range(n)]
def dtw_distance(track1: list[tuple], track2: list[tuple]) -> float:
    """Compute the DTW distance between two GPS tracks.

    Each point is (lat, lon). The pairwise cost is ``haversine_m`` in
    metres, and the accumulated cost of the best alignment is divided by
    the length of the longer track.

    Args:
        track1: First track.
        track2: Second track.

    Returns:
        The normalized alignment cost in metres as a plain ``float``.
        ``inf`` is returned when either track is empty, since an empty
        track cannot be aligned with anything.
    """
    n, m = len(track1), len(track2)
    if n == 0 or m == 0:
        # Avoids a 0/0 division (NaN) when both tracks are empty.
        return float("inf")
    # dtw[i, j] is the cheapest cost of aligning the first i points of
    # track1 with the first j points of track2.
    dtw = np.full((n + 1, m + 1), np.inf)
    dtw[0, 0] = 0.0
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            cost = haversine_m(track1[i - 1], track2[j - 1])
            dtw[i, j] = cost + min(dtw[i - 1, j], dtw[i, j - 1], dtw[i - 1, j - 1])
    return float(dtw[n, m] / max(n, m))
def haversine_m(p1: tuple, p2: tuple) -> float:
    """Great-circle distance in metres between two ``(lat, lon)`` points.

    Coordinates are decimal degrees; the haversine formula is evaluated on
    a sphere with a radius of 6,371,000 m.
    """
    earth_radius_m = 6371000
    lat_a = math.radians(p1[0])
    lon_a = math.radians(p1[1])
    lat_b = math.radians(p2[0])
    lon_b = math.radians(p2[1])
    half_dlat = (lat_b - lat_a) / 2
    half_dlon = (lon_b - lon_a) / 2
    # Squared half-chord length between the two points on the unit sphere.
    chord = (
        math.sin(half_dlat) ** 2
        + math.cos(lat_a) * math.cos(lat_b) * math.sin(half_dlon) ** 2
    )
    return 2 * earth_radius_m * math.asin(math.sqrt(chord))
def routes_are_similar(
    poly1: str,
    poly2: str,
    bb1: Optional[dict],
    bb2: Optional[dict],
    dtw_threshold_m: float = 80.0,
) -> bool:
    """Decide whether two activities follow sufficiently similar routes.

    When both bounding boxes are given, non-overlapping boxes reject the
    pair immediately. Otherwise both polylines are decoded, downsampled to
    at most 60 points, and compared with DTW; the routes match when the DTW
    distance is below ``dtw_threshold_m``. A polyline that cannot be decoded
    or decodes to nothing makes the result False.
    """
    if bb1 and bb2 and not bounding_boxes_overlap(bb1, bb2):
        return False

    try:
        track_a, track_b = [
            sample_coords(decode_polyline_to_coords(encoded), 60)
            for encoded in (poly1, poly2)
        ]
    except Exception:
        # Undecodable input is treated as "not the same route".
        return False

    if not (track_a and track_b):
        return False
    return dtw_distance(track_a, track_b) < dtw_threshold_m
def find_segment_times(
    data_points: list[dict],
    start_dist_m: float,
    end_dist_m: float,
) -> Optional[float]:
    """Time taken to travel from ``start_dist_m`` to ``end_dist_m``.

    Scans the points in order, using their cumulative ``distance_m``. The
    segment starts at the first point at or beyond ``start_dist_m`` and
    ends at the first point from there on at or beyond ``end_dist_m``.
    Points missing a distance or timestamp are skipped. Timestamps may be
    ISO strings or datetime objects.

    Returns the duration in seconds, or ``None`` if either end of the
    segment is never reached.
    """
    from datetime import datetime

    def as_datetime(stamp):
        return datetime.fromisoformat(stamp) if isinstance(stamp, str) else stamp

    entered_at = None
    left_at = None
    for sample in data_points:
        travelled = sample.get("distance_m")
        stamp = sample.get("timestamp")
        if travelled is None or stamp is None:
            continue
        if entered_at is None:
            if not travelled >= start_dist_m:
                continue  # still before the segment start
            entered_at = stamp
        if travelled >= end_dist_m:
            left_at = stamp
            break

    if not (entered_at and left_at):
        return None
    began = as_datetime(entered_at)
    ended = as_datetime(left_at)
    return (ended - began).total_seconds()
def find_best_split_time(
    data_points: list[dict],
    target_distance_m: float,
) -> Optional[float]:
    """Fastest time over any ``target_distance_m`` stretch of an activity.

    E.g. the fastest 1km split in a 10km run. Only points that have both a
    cumulative ``distance_m`` and a ``timestamp`` are considered. For each
    start point the window ends at the first point that is at least the
    target distance further on; the end index only moves forward (two
    pointer scan). Timestamps may be ISO strings or datetime objects.

    Returns the shortest window duration in seconds, or ``None`` when no
    window covers the target distance.
    """
    from datetime import datetime

    def as_datetime(stamp):
        return datetime.fromisoformat(stamp) if isinstance(stamp, str) else stamp

    usable = [
        sample for sample in data_points
        if sample.get("distance_m") is not None and sample.get("timestamp") is not None
    ]
    if not usable:
        return None

    count = len(usable)
    fastest = None
    end_idx = 0
    for opener in usable:
        origin = opener["distance_m"]
        # Advance the window end until enough distance has been covered.
        while end_idx < count and not (
            usable[end_idx]["distance_m"] - origin >= target_distance_m
        ):
            end_idx += 1
        if end_idx >= count:
            # No later start point can reach the target distance either.
            break
        began = as_datetime(opener["timestamp"])
        ended = as_datetime(usable[end_idx]["timestamp"])
        elapsed = (ended - began).total_seconds()
        if fastest is None or elapsed < fastest:
            fastest = elapsed
    return fastest
# (distance in metres, display label) pairs for which best splits are computed.
STANDARD_DISTANCES = [
    (400, "400m"),
    (800, "800m"),
    (1000, "1k"),
    (1609.34, "1 mile"),
    (3000, "3k"),
    (5000, "5k"),
    (10000, "10k"),
    (21097.5, "Half marathon"),
    (42195, "Marathon"),
    (50000, "50k"),
    (100000, "100k"),
]
def compute_best_splits(data_points: list[dict], total_distance_m: float) -> dict[str, float]:
    """Compute best split times for all standard distances that fit within the activity.

    Args:
        data_points: Activity points carrying cumulative ``distance_m`` and
            ``timestamp`` values.
        total_distance_m: Total activity distance, used to skip standard
            distances that are clearly too long (5% tolerance). ``None``
            means the total is unknown, in which case every standard
            distance is tried against the points themselves.

    Returns:
        A dict mapping each standard distance label to its best time in
        seconds. Distances with no qualifying window (or a zero duration)
        are omitted.
    """
    results = {}
    for dist_m, label in STANDARD_DISTANCES:
        # allow 5% tolerance; an unknown total skips this pre-filter instead
        # of failing on a None comparison
        if total_distance_m is not None and not total_distance_m >= dist_m * 0.95:
            continue
        best = find_best_split_time(data_points, dist_m)
        if best:
            results[label] = best
    return results