Initial Commit
This commit is contained in:
@@ -0,0 +1,341 @@
|
||||
"""
|
||||
Parses Garmin .fit files and GPX files into normalized activity data.
|
||||
Handles full Strava and Garmin data export archives.
|
||||
"""
|
||||
import os
|
||||
import zipfile
|
||||
import json
|
||||
import math
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
import fitparse
|
||||
import gpxpy
|
||||
import polyline as polyline_lib
|
||||
|
||||
|
||||
def haversine_distance(lat1, lon1, lat2, lon2) -> float:
    """Return the great-circle distance in metres between two GPS points.

    Uses the haversine formula on a sphere of radius 6,371,000 m.

    Args:
        lat1: Latitude of the first point, in decimal degrees.
        lon1: Longitude of the first point, in decimal degrees.
        lat2: Latitude of the second point, in decimal degrees.
        lon2: Longitude of the second point, in decimal degrees.

    Returns:
        The distance along the sphere's surface, in metres.
    """
    earth_radius_m = 6371000
    phi1, phi2 = math.radians(lat1), math.radians(lat2)
    dphi = math.radians(lat2 - lat1)
    dlam = math.radians(lon2 - lon1)
    a = (
        math.sin(dphi / 2) ** 2
        + math.cos(phi1) * math.cos(phi2) * math.sin(dlam / 2) ** 2
    )
    # Floating-point rounding (near-antipodal points) or out-of-range
    # latitudes can push `a` marginally outside [0, 1], which would make
    # sqrt()/asin() raise ValueError. Clamp to the valid domain.
    a = min(1.0, max(0.0, a))
    return 2 * earth_radius_m * math.asin(math.sqrt(a))
|
||||
|
||||
|
||||
def semicircles_to_degrees(sc: int) -> float:
    """Convert a semicircle angle value to decimal degrees.

    The scale is 180 degrees per 2**31 semicircles.
    """
    degrees_per_semicircle = 180 / 2**31
    return sc * degrees_per_semicircle
|
||||
|
||||
|
||||
def parse_fit_file(filepath: str) -> dict:
    """Parse a Garmin .fit file and return normalized activity dict.

    Reads every message in the file: ``session`` fields become the activity
    summary, ``lap`` messages become laps and ``record`` messages become the
    per-sample data points (positions converted from semicircles to degrees).

    Args:
        filepath: Path of the .fit file, passed to ``fitparse.FitFile``.

    Returns:
        A dict with summary fields (name, sport_type, start_time, distance,
        duration, heart rate, power, ...), an encoded ``polyline``, a
        ``bounding_box``, ``source_type`` set to ``"fit"``, plus the
        ``data_points`` and ``laps`` lists. Values missing from the file are
        ``None``.
    """
    fit = fitparse.FitFile(filepath)

    data_points = []
    laps = []
    session = {}

    for record in fit.get_messages():
        name = record.name

        if name == "session":
            # Fields from every session message are merged into one dict.
            for f in record:
                session[f.name] = f.value

        elif name == "lap":
            laps.append({f.name: f.value for f in record})

        elif name == "record":
            point = {f.name: f.value for f in record}
            if point:
                # Convert semicircles to degrees
                for key in ("position_lat", "position_long"):
                    if point.get(key) is not None:
                        point[key] = semicircles_to_degrees(point[key])
                data_points.append(point)

    # Build normalized output
    raw_sport = session.get("sport")
    # A present-but-None sport is treated like a missing one instead of
    # being stringified to "none".
    sport = str(raw_sport).lower() if raw_sport is not None else "generic"
    sport_map = {
        "running": "running", "cycling": "cycling", "swimming": "swimming",
        "hiking": "hiking", "walking": "walking", "generic": "other",
        "open_water_swimming": "swimming", "trail_running": "running",
    }
    sport_type = sport_map.get(sport, sport)

    start_time = session.get("start_time")
    if start_time and start_time.tzinfo is None:
        # Naive timestamps are labelled as UTC.
        start_time = start_time.replace(tzinfo=timezone.utc)

    # Build GPS track for polyline
    coords = [
        (p["position_lat"], p["position_long"])
        for p in data_points
        if p.get("position_lat") is not None and p.get("position_long") is not None
    ]

    encoded_polyline = polyline_lib.encode(coords) if coords else None
    bounding_box = _bounding_box(coords)

    # Calculate cumulative distance if not in FIT
    cumulative_dist = 0.0
    prev_lat, prev_lon = None, None
    normalized_points = []
    for p in data_points:
        ts = p.get("timestamp")
        if ts and ts.tzinfo is None:
            ts = ts.replace(tzinfo=timezone.utc)

        lat = p.get("position_lat")
        lon = p.get("position_long")
        # Compare against None explicitly: 0.0 is a valid coordinate
        # (equator / prime meridian) and must not be treated as "no fix".
        has_fix = lat is not None and lon is not None

        dist = p.get("distance")
        if dist is not None:
            # Device-reported distance wins and re-bases the running total.
            cumulative_dist = float(dist)
        elif has_fix and prev_lat is not None and prev_lon is not None:
            cumulative_dist += haversine_distance(prev_lat, prev_lon, lat, lon)
            dist = cumulative_dist

        if has_fix:
            prev_lat, prev_lon = lat, lon

        normalized_points.append({
            "timestamp": ts.isoformat() if ts else None,
            "latitude": lat,
            "longitude": lon,
            "altitude_m": p.get("altitude"),
            "heart_rate": p.get("heart_rate"),
            "cadence": p.get("cadence"),
            "speed_ms": p.get("speed"),
            "power": p.get("power"),
            "temperature_c": p.get("temperature"),
            "distance_m": dist,
        })

    # Parse laps
    normalized_laps = []
    for lap_number, lap in enumerate(laps, start=1):
        ls = lap.get("start_time")
        if ls and ls.tzinfo is None:
            ls = ls.replace(tzinfo=timezone.utc)
        normalized_laps.append({
            "lap_number": lap_number,
            "start_time": ls.isoformat() if ls else None,
            "duration_s": _safe_float(lap.get("total_elapsed_time")),
            "distance_m": _safe_float(lap.get("total_distance")),
            "avg_heart_rate": _safe_float(lap.get("avg_heart_rate")),
            "avg_cadence": _safe_float(lap.get("avg_cadence")),
            "avg_speed_ms": _safe_float(lap.get("avg_speed")),
            "avg_power": _safe_float(lap.get("avg_power")),
        })

    # str() guards against a non-string sport value, which would otherwise
    # crash on .title().
    sport_label = str(raw_sport).title() if raw_sport is not None else "Activity"
    date_label = start_time.strftime("%Y-%m-%d") if start_time else ""

    return {
        "name": sport_label + " " + date_label,
        "sport_type": sport_type,
        "start_time": start_time.isoformat() if start_time else None,
        "distance_m": _safe_float(session.get("total_distance")),
        "duration_s": _safe_float(session.get("total_elapsed_time")),
        "elevation_gain_m": _safe_float(session.get("total_ascent")),
        "elevation_loss_m": _safe_float(session.get("total_descent")),
        "avg_heart_rate": _safe_float(session.get("avg_heart_rate")),
        "max_heart_rate": _safe_float(session.get("max_heart_rate")),
        "avg_cadence": _safe_float(session.get("avg_cadence")),
        "avg_power": _safe_float(session.get("avg_power")),
        "normalized_power": _safe_float(session.get("normalized_power")),
        "avg_speed_ms": _safe_float(session.get("avg_speed")),
        "max_speed_ms": _safe_float(session.get("max_speed")),
        "avg_temperature_c": _safe_float(session.get("avg_temperature")),
        "calories": _safe_float(session.get("total_calories")),
        "training_stress_score": _safe_float(session.get("training_stress_score")),
        # NOTE(review): this reads the sweat-loss field into a VO2max slot,
        # which looks like a wrong mapping; kept as-is until the correct
        # source field is confirmed. Original remark: varies by device.
        "vo2max_estimate": _safe_float(session.get("estimated_sweat_loss")),
        "polyline": encoded_polyline,
        "bounding_box": bounding_box,
        "source_type": "fit",
        "data_points": normalized_points,
        "laps": normalized_laps,
    }
|
||||
|
||||
|
||||
def parse_gpx_file(filepath: str) -> dict:
    """Parse a GPX file into normalized activity dict.

    Only the first track in the file is read. Per-point extension values
    (heart rate, cadence, speed, power, temperature) are picked up by tag
    name; distance, elevation gain/loss, duration and average speed are
    computed from the points themselves.

    Args:
        filepath: Path of the GPX file to read.

    Returns:
        A dict with the same keys as the FIT parser's output, with
        ``source_type`` set to ``"gpx"`` and an empty ``laps`` list.

    Raises:
        ValueError: If the file contains no tracks.
    """
    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(filepath, encoding="utf-8") as f:
        gpx = gpxpy.parse(f)

    track = gpx.tracks[0] if gpx.tracks else None
    if not track:
        raise ValueError("No tracks found in GPX file")

    data_points = []
    for segment in track.segments:
        for pt in segment.points:
            ts = pt.time
            if ts and ts.tzinfo is None:
                # Naive timestamps are labelled as UTC.
                ts = ts.replace(tzinfo=timezone.utc)

            extensions = {}
            for ext in pt.extensions or ():
                # Walk the element itself and all descendants so values
                # stored directly under <extensions> (e.g. <power>) are
                # found as well as those nested in a wrapper element.
                for node in ext.iter():
                    tag = node.tag
                    if not isinstance(tag, str):
                        continue  # comments / processing instructions
                    tag = tag.split("}")[-1]  # drop the XML namespace
                    try:
                        extensions[tag] = float(node.text)
                    except (ValueError, TypeError):
                        # Best effort: skip empty or non-numeric elements.
                        continue

            temperature = extensions.get("temp")
            if temperature is None:
                # Fall back only when "temp" is absent; 0.0 is a real value.
                temperature = extensions.get("atemp")

            data_points.append({
                "timestamp": ts.isoformat() if ts else None,
                "latitude": pt.latitude,
                "longitude": pt.longitude,
                "altitude_m": pt.elevation,
                "heart_rate": extensions.get("hr"),
                "cadence": extensions.get("cad"),
                "speed_ms": extensions.get("speed"),
                "power": extensions.get("power"),
                "temperature_c": temperature,
                "distance_m": None,
            })

    # Calculate distance and elevation
    # Compare against None explicitly: 0.0 latitude/longitude is valid.
    coords = [
        (p["latitude"], p["longitude"])
        for p in data_points
        if p["latitude"] is not None and p["longitude"] is not None
    ]
    encoded_polyline = polyline_lib.encode(coords) if coords else None
    bounding_box = _bounding_box(coords)

    # Add cumulative distance
    total_dist = 0.0
    prev = None
    for p in data_points:
        if p["latitude"] is not None and p["longitude"] is not None:
            if prev is not None:
                total_dist += haversine_distance(
                    prev[0], prev[1], p["latitude"], p["longitude"]
                )
            prev = (p["latitude"], p["longitude"])
        p["distance_m"] = total_dist

    uphill, downhill = 0.0, 0.0
    # Keep 0 m (sea level) samples; only drop missing elevations.
    alts = [p["altitude_m"] for p in data_points if p["altitude_m"] is not None]
    for prev_alt, alt in zip(alts, alts[1:]):
        diff = alt - prev_alt
        if diff > 0:
            uphill += diff
        else:
            downhill += abs(diff)

    # Zero/missing heart-rate samples are excluded from the statistics.
    hrs = [p["heart_rate"] for p in data_points if p["heart_rate"]]

    # Use the first and last points that actually carry a timestamp so a
    # point without <time> cannot crash fromisoformat().
    timestamps = [p["timestamp"] for p in data_points if p["timestamp"]]
    start_time_str = timestamps[0] if timestamps else None
    start_dt = datetime.fromisoformat(start_time_str) if start_time_str else None
    end_dt = datetime.fromisoformat(timestamps[-1]) if timestamps else None
    duration = (end_dt - start_dt).total_seconds() if (start_dt and end_dt) else None

    sport = "running"  # GPX doesn't always include sport; default to running
    if track.type:
        sport = track.type.lower()

    return {
        "name": track.name or gpx.name or f"Activity {start_dt.date() if start_dt else ''}",
        "sport_type": sport,
        "start_time": start_time_str,
        "distance_m": total_dist,
        "duration_s": duration,
        "elevation_gain_m": uphill,
        "elevation_loss_m": downhill,
        "avg_heart_rate": (sum(hrs) / len(hrs)) if hrs else None,
        "max_heart_rate": max(hrs) if hrs else None,
        "avg_cadence": None,
        "avg_power": None,
        "normalized_power": None,
        "avg_speed_ms": (total_dist / duration) if (total_dist and duration) else None,
        "max_speed_ms": None,
        "avg_temperature_c": None,
        "calories": None,
        "training_stress_score": None,
        "vo2max_estimate": None,
        "polyline": encoded_polyline,
        "bounding_box": bounding_box,
        "source_type": "gpx",
        "data_points": data_points,
        "laps": [],
    }
|
||||
|
||||
|
||||
def parse_strava_export(export_dir: str) -> list[dict]:
    """
    Parse a full Strava data export directory.
    Structure: activities.csv + activities/ folder with .gpx/.fit.gz files

    Files in ``activities/`` are processed in sorted order. ``.fit`` and
    ``.gpx`` files are parsed directly; gzip-compressed ``.fit.gz`` /
    ``.gpx.gz`` files are decompressed to a temporary file first. Suffixes
    are matched case-insensitively and every other file is ignored.

    Args:
        export_dir: Root directory of the unpacked export.

    Returns:
        One activity dict per successfully parsed file, with ``source_type``
        set to ``"strava_fit"`` or ``"strava_gpx"``. Returns an empty list
        when the ``activities`` folder does not exist. A file that fails to
        parse is reported on stdout and skipped.
    """
    import gzip
    import shutil
    import tempfile

    activities = []
    activities_dir = Path(export_dir) / "activities"

    if not activities_dir.exists():
        return activities

    for fname in sorted(activities_dir.iterdir()):
        suffixes = [s.lower() for s in fname.suffixes]
        compressed = bool(suffixes) and suffixes[-1] == ".gz"
        if compressed:
            suffixes = suffixes[:-1]
        kind = suffixes[-1] if suffixes else ""
        if kind not in (".fit", ".gpx"):
            continue

        tmp_path = None
        try:
            path = fname
            if compressed:
                # The parsers take a filesystem path, so inflate to a
                # temporary file carrying the real extension.
                with gzip.open(fname, "rb") as src, tempfile.NamedTemporaryFile(
                    suffix=kind, delete=False
                ) as dst:
                    tmp_path = Path(dst.name)
                    shutil.copyfileobj(src, dst)
                path = tmp_path

            if kind == ".fit":
                act = parse_fit_file(str(path))
            else:
                act = parse_gpx_file(str(path))
            act["source_type"] = "strava_" + kind[1:]
            activities.append(act)
        except Exception as e:
            # Best effort: one bad file must not abort the whole import.
            print(f"Error parsing {fname}: {e}")
        finally:
            if tmp_path is not None:
                tmp_path.unlink(missing_ok=True)

    return activities
|
||||
|
||||
|
||||
def calculate_hr_zones(data_points: list[dict], max_hr: float) -> dict:
    """Return the share of heart-rate samples falling in each HR zone.

    Each data point with a truthy ``heart_rate`` counts as one sample and is
    placed in a zone by its fraction of ``max_hr``: below 0.6 is z1, below
    0.7 is z2, below 0.8 is z3, below 0.9 is z4, anything else is z5.

    Args:
        data_points: Dicts that may carry a ``heart_rate`` value.
        max_hr: Maximum heart rate used as the 100% reference.

    Returns:
        A dict mapping ``z1``..``z5`` to a percentage rounded to one
        decimal, or an empty dict when ``max_hr`` is falsy or no point has a
        heart rate.
    """
    if not max_hr:
        return {}

    # Exclusive upper limit of each zone, as a fraction of max_hr.
    ceilings = (("z1", 0.6), ("z2", 0.7), ("z3", 0.8), ("z4", 0.9))
    counts = {"z1": 0, "z2": 0, "z3": 0, "z4": 0, "z5": 0}
    samples = 0

    for point in data_points:
        bpm = point.get("heart_rate")
        if not bpm:
            continue
        fraction = bpm / max_hr
        samples += 1
        for zone, ceiling in ceilings:
            if fraction < ceiling:
                counts[zone] += 1
                break
        else:
            # At or above the last ceiling.
            counts["z5"] += 1

    if not samples:
        return {}
    return {zone: round(hits / samples * 100, 1) for zone, hits in counts.items()}
|
||||
|
||||
|
||||
def _safe_float(val) -> Optional[float]:
    """Convert ``val`` to ``float``; return ``None`` if it is ``None`` or
    the conversion raises ``TypeError``/``ValueError``."""
    if val is None:
        return None
    try:
        return float(val)
    except (TypeError, ValueError):
        return None
|
||||
|
||||
|
||||
def _bounding_box(coords: list[tuple]) -> Optional[dict]:
    """Return the min/max latitude and longitude of ``coords``.

    Each item is indexed as ``(lat, lon)``. Returns ``None`` when ``coords``
    is empty.
    """
    if not coords:
        return None

    lat_values, lon_values = [], []
    for point in coords:
        lat_values.append(point[0])
        lon_values.append(point[1])

    south, north = min(lat_values), max(lat_values)
    west, east = min(lon_values), max(lon_values)
    return {
        "min_lat": south, "max_lat": north,
        "min_lon": west, "max_lon": east,
    }
|
||||
Reference in New Issue
Block a user