GPS Waypoint Deduplication

This article explains how I removed duplicates from my Osmand favourites.

My favourite map app is Osmand. It draws nice maps that I find much easier to read than the competition (Bing, Google, Apple maps). Best of all: the data comes from OpenStreetMap and is free & open.

Pretty much the only downside that I sometimes face is that there is no way to synchronise favourites between installations of Osmand (like on my phone & tablet). You can export them to a file, sync that, and import it in the other app, but that can create duplicates.

Below you find the Python script I wrote to get rid of those duplicates. It is quite dumb, and just keeps one of the duplicates – there is no merging of info in the waypoints. The logic is like this:

Flow diagram of the waypoint deduplication.

In otherwords, two waypoints are merged into one if they

  • are in the same spot, and
  • have the same name.

This merging of waypoints is quite dumb. Only one of the waypoints is kept, and the others are discarded. This is based on the number of XML elements in each; only the one with the most elements (like name, type, elevation, creation date) is kept.

The script is below. Adjust INPUT_FILE and OUTPUT_FILE to your needs.

#!/usr/bin/env python3

import math
from pathlib import Path
import xml.etree.ElementTree as ET

MY_DIR = Path(__file__).resolve().parent
INPUT_FILE = MY_DIR / "favourites-2022-12-02.gpx"
OUTPUT_FILE = MY_DIR / "favourites-sorted.gpx"

NS_GPX = "http://www.topografix.com/GPX/1/1"
TAG_GPX = "{%s}gpx" % NS_GPX
TAG_WPT = "{%s}wpt" % NS_GPX
GROUP_DIST_EPSILON_KM = 0.005

GpsCoord = tuple[float, float]
Amsterdam = (52.3792313, 4.9002002)


def main():
    # Register default namespace BEFORE parsing the file, otherwise we cannot
    # write the GPX namespace as default. Source:
    # https://stackoverflow.com/a/18340978/875379
    ET.register_namespace("", NS_GPX)
    ET.register_namespace("osmand", "https://osmand.net")
    ET.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")

    tree = ET.parse(INPUT_FILE)
    root = tree.getroot()

    print(f"Root tag: {root.tag}")
    if root.tag != TAG_GPX:
        raise SystemExit("This is not a GPX file")

    waypoints = [child for child in root if child.tag == TAG_WPT]
    print(f"{len(waypoints)} waypoints before sorting & filtering")

    sorted_wpts = sorted(waypoints, key=gps_sorting_key)
    grouped_wpts = group_wpts(sorted_wpts)
    output_wpts = merge_wpts(grouped_wpts)
    print(f"{len(output_wpts)} waypoints after sorting & filtering")

    # Remove all unsorted waypoints, then re-add the sorted ones.
    for wpt in waypoints:
        root.remove(wpt)
    for wpt in output_wpts:
        root.append(wpt)

    ET.indent(tree)
    tree.write(
        OUTPUT_FILE,
        encoding="UTF-8",
        xml_declaration=True,
    )


def merge_wpts(grouped_waypoints: list[list[ET.Element]]) -> list[ET.Element]:
    waypoints: list[ET.Element] = []
    for group in grouped_waypoints:
        # Due to the ordering by number of XML elements, the one with the most
        # info wins.
        waypoints.append(group[0])
    return waypoints


def group_wpts(waypoints: list[ET.Element]) -> list[list[ET.Element]]:
    """Group waypoints by similarity.

    Assumes the waypoints have been sorted by geographical likeness, i.e.
    to-be-grouped waypoints should be sequential in the list.
    """

    groups: list[list[ET.Element]] = []
    group: list[ET.Element] = []
    previous: ET.Element | None = None

    for wpt in waypoints:
        if is_same_group(previous, wpt):
            group.append(wpt)
        else:
            # Found a new group.
            groups.append(group)
            group = [wpt]
        previous = wpt

    if group:
        groups.append(group)
    return groups


def is_same_group(wpt1: ET.Element | None, wpt2: ET.Element) -> bool:
    """Return whether these waypoints belong to the same group.

    Always returns True if the first element is None.

    >>> is_same_group(None, ET.Element('wpt'))
    True

    >>> # Different GPS coordinates
    >>> wpt1 = ET.Element('wpt', lat=Amsterdam[0], lon=Amsterdam[1])
    >>> wpt2 = ET.Element('wpt', lat=52.3935660, lon=4.9119863)
    >>> is_same_group(wpt1, wpt1)
    True
    >>> is_same_group(wpt1, wpt2)  # Difference in location.
    False
    >>> wpt3 = ET.Element('wpt', lat=Amsterdam[0], lon=Amsterdam[1], name="Amsterdam")
    >>> is_same_group(wpt1, wpt3)  # Difference in name.
    False

    >>> wpt4 = ET.Element('wpt', lat=52.3935660, lon=4.9119863)
    >>> wpt5 = ET.Element('wpt', lat=52.3935600, lon=4.9120000)
    >>> is_same_group(wpt4, wpt5)  # Difference in location small enough.
    True
    """

    if wpt1 is None:
        return True

    # different names -> not the same
    if wpt1.attrib.get("name") != wpt2.attrib.get("name"):
        return False

    return waypoint_distance(wpt1, wpt2) <= GROUP_DIST_EPSILON_KM


def waypoint_distance(wpt1: ET.Element, wpt2: ET.Element) -> float:
    """Return distance between wapoints, in kilometers."""

    lat1 = float(wpt1.attrib["lat"])
    lon1 = float(wpt1.attrib["lon"])
    lat2 = float(wpt2.attrib["lat"])
    lon2 = float(wpt2.attrib["lon"])
    return haversine_distance((lat1, lon1), (lat2, lon2))


def haversine_distance(coord1: GpsCoord, coord2: GpsCoord) -> float:
    """Return distance between GPS coordinates, in kilometers."""

    # convert decimal degrees to radians
    lat1, lon1, lat2, lon2 = map(math.radians, coord1 + coord2)

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = (
        math.sin(dlat / 2) ** 2
        + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    )
    c = 2 * math.asin(math.sqrt(a))

    # 6371 km is the radius of the Earth
    km = 6371 * c
    return km


def gps_sorting_key(gps_waypoint: ET.Element) -> tuple[float, int]:
    """Sorting key for GPS waypoints.

    Sorts by distance to Amsterdam (smallest first), and by number of child
    elements (biggest first). The latter is meant to sort elements with the most
    information first.

    >>> wpt = ET.Element('wpt', lat=Amsterdam[0], lon=Amsterdam[1])
    >>> gps_sorting_key(wpt)
    (0.0, 0)

    >>> wpt = ET.Element('wpt', lat=52.3935660, lon=4.9119863)
    >>> gps_sorting_key(wpt)
    (1.78339, 0)

    >>> wpt = ET.Element('wpt', lat=52.3935660, lon=4.9119863)
    >>> wpt.append(ET.Element('ele'))
    >>> wpt.append(ET.Element('name'))
    >>> gps_sorting_key(wpt)
    (1.78339, -2)

    """
    lat = float(gps_waypoint.attrib["lat"])
    lon = float(gps_waypoint.attrib["lon"])
    dist_km = haversine_distance(Amsterdam, (lat, lon))

    # Round the distance to cm, that's precise enough and also avoids floating
    # point issues in the doctests.
    return round(dist_km, 5), -len(gps_waypoint)


if __name__ == "__main__":
    import doctest

    num_failed, num_tests = doctest.testmod()
    if num_failed > 0 or num_tests == 0:
        raise SystemExit("Self-test failed, aborting")
    main()