virtual-vehicle · tgoelles · May 11, 2026 · May 11, 2026 · May 11, 2026 · May 11, 2026
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -17,8 +17,9 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [ "3.11", "3.12", "3.13" ] # Define Python versions as strings
-        pandas-version: [ "2", "3" ]
+        python-version: ["3.11", "3.12", "3.13"] # Define Python versions as strings
+        pandas-version: ["2", "3"]
+        numba-enabled: ["false", "true"]
     env:
       UV_PYTHON: ${{ matrix.python-version }}
 
@@ -32,7 +33,11 @@ jobs:
         uses: astral-sh/setup-uv@v5
 
       - name: Install the project
-        run: uv sync --locked --all-extras --dev
+        run: uv sync --locked --dev
+
+      - name: Install optional numba
+        if: matrix.numba-enabled == 'true'
+        run: uv pip install "numba>=0.65.1,<1.0.0"
 
       - name: Install pandas ${{ matrix.pandas-version }}
         run: |

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -6,6 +6,26 @@ The format is based on `Keep a Changelog <https://keepachangelog.com/en/1.0.0/>`
 and this project adheres to `Semantic Versioning <https://semver.org/spec/v2.0.0.html>`_.
 
 
+Unreleased
+-------------
+
+Added
+~~~~~~
+- ``numba`` is now an optional dependency. Install with ``pip install pointcloudset[numba]`` to enable JIT-accelerated clustering.
+
+Fixed
+~~~~~~~
+- Fixed a major bug on larger pointclouds when radiusoutlier used too much memory and the process was killed.
+- Fixed a similar bug with get_cluster where the process was killed on larger pointclouds due to too much memory usage.
+- Improved speed of clustering with numba by about 10x on larger pointclouds.
+- ``filter("radiusoutlier", ...)`` now uses KDTree neighbour counts directly instead of materializing per-point neighbour lists, preserving the expected outlier semantics while avoiding large temporary allocations on dense point clouds.
+
+Changed
+~~~~~~~
+- Internal refactor: moved the main ``PointCloud.get_cluster()`` implementation and chunk-budget helper from ``pointcloudset.pointcloud`` to ``pointcloudset.cluster`` to keep the PointCloud class focused. No user-facing behaviour changes.
+- ``PointCloud.get_cluster()`` now uses a KDTree + incremental union-find DBSCAN implementation. Core connectivity is built without materializing a global edge list, keeping memory bounded by per-point neighbourhood queries while preserving cluster labels and ``take_cluster(-1, labels)`` noise handling. The union-find inner loops are JIT-compiled with Numba when the optional ``numba`` extra is installed (``pip install pointcloudset[numba]``); a pure-Python fallback is used automatically when Numba is not available.
+
+
 0.13.0 - (2026-05-05)
 -------------
 

diff --git a/README.rst b/README.rst
@@ -111,6 +111,17 @@ Install python package with pip:
 
    pip install pointcloudset
 
+Optional extras
+~~~~~~~~~~~~~~~~
+
+For faster clustering on large point clouds, install the optional ``numba`` extra to enable JIT-accelerated union-find in ``PointCloud.get_cluster()``:
+
+.. code-block:: console
+
+   pip install pointcloudset[numba]
+
+Without this extra, a pure-Python fallback is used automatically with no change in behaviour.
+
 Installation with Docker
 ################################################
 

diff --git a/doc/sphinx/source/description_python_api.rst b/doc/sphinx/source/description_python_api.rst
@@ -26,10 +26,23 @@ As a common user these Classes are enough to tackle most proplems. If you want t
 the package than you should have a look at the other modules.
 
  Datasets and PointCloud use the functions of the following modules:
+    * :mod:`pointcloudset.cluster`
     * :mod:`pointcloudset.diff`
     * :mod:`pointcloudset.filter`
     * :mod:`pointcloudset.geometry`
     * :mod:`pointcloudset.io`
     * :mod:`pointcloudset.plot`
 
+.. note::
+
+    ``pointcloudset.cluster`` provides the DBSCAN-based clustering used by
+    ``PointCloud.get_cluster()`` via ``pointcloudset.cluster.get_cluster_labels()``.
+    The union-find inner loops are JIT-compiled
+    with `Numba <https://numba.readthedocs.io/>`_ when the optional extra is
+    installed::
+
+      pip install pointcloudset[numba]
+
+    Without Numba a pure-Python fallback is used automatically.
+
 For a more detailed documentation see the section below.
diff --git a/doc/sphinx/source/tutorial_notebooks/reading_las_pcd.ipynb b/doc/sphinx/source/tutorial_notebooks/reading_las_pcd.ipynb
@@ -22,9 +22,6 @@
    "source": [
     "from pathlib import Path\n",
     "\n",
-    "import laspy\n",
-    "import numpy as np\n",
-    "\n",
     "import pointcloudset as pcs"
    ]
   },
@@ -44,8 +41,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "las_pc = pcs.PointCloud.from_file(testlas)\n",
-    "pcd_pc = pcs.PointCloud.from_file(testpcd)"
+    "from locale import normalize\n",
+    "\n",
+    "from numpy.random import normal\n",
+    "\n",
+    "\n",
+    "las_pc = pcs.PointCloud.from_file(testlas, normalize_xyz=True)\n",
+    "pcd_pc = pcs.PointCloud.from_file(testpcd, normalize_xyz=True)"
    ]
   },
   {

diff --git a/doc/sphinx/source/tutorial_notebooks/usage.ipynb b/doc/sphinx/source/tutorial_notebooks/usage.ipynb
@@ -337,7 +337,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "testpointcloud2 = pcs.PointCloud.from_file(lasfile)"
+    "testpointcloud2 = pcs.PointCloud.from_file(lasfile, normalize_xyz=True)"
    ]
   },
   {
@@ -641,13 +641,6 @@
    "source": [
     "result.agg({\"x difference\": \"max\"}, \"pointcloud\")"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {

diff --git a/pyproject.toml b/pyproject.toml
@@ -48,6 +48,12 @@ dependencies = [
     "nbformat",
     "pypcd4>=1.4.3",
 ]
+
+[project.optional-dependencies]
+numba = [
+    "numba>=0.65.1,<1.0.0",
+]
+
 [dependency-groups]
 dev = [
     "pytest",

diff --git a/src/pointcloudset/cluster/__init__.py b/src/pointcloudset/cluster/__init__.py
@@ -0,0 +1,122 @@
+from __future__ import annotations
+
+import numpy as np
+import pandas
+from scipy.spatial import KDTree
+
+from pointcloudset.cluster.numba import roots_for_positions, union_pairs
+from pointcloudset.config import (
+    GET_CLUSTER_BORDER_QUERY_CHUNK_SIZE,
+    GET_CLUSTER_CORE_QUERY_CHUNK_SIZE,
+    GET_CLUSTER_MEMORY_BUDGET_MB,
+)
+
+
+def _budgeted_chunk_size(requested: int, max_neighbors: int, budget_bytes: int) -> int:
+    if requested < 1:
+        return 1
+    if max_neighbors < 1:
+        return requested
+
+    # Conservative estimate for list-of-lists neighbour materialization.
+    est_bytes_per_neighbor = 64
+    est_list_overhead = 128
+    safety_fraction = 0.60
+    per_point_bytes = est_list_overhead + max_neighbors * est_bytes_per_neighbor
+    usable_budget = max(1, int(budget_bytes * safety_fraction))
+    max_points = max(1, usable_budget // max(1, per_point_bytes))
+    return max(1, min(requested, max_points))
+
+
+def get_cluster_labels(xyz: np.ndarray, eps: float, min_points: int) -> pandas.DataFrame:
+    """Return DBSCAN labels for ``xyz`` coordinates.
+
+    The implementation follows canonical DBSCAN with core connectivity via
+    union-find and border attachment in a second pass.
+    """
+    n = len(xyz)
+    tree = KDTree(xyz)
+
+    # Stage 1: identify core points via count-only batch query (no edge storage).
+    # Chunking keeps the count array allocation predictable for very large clouds.
+    counts = np.empty(n, dtype=np.intp)
+    chunk = max(1, min(n, 100_000))
+    for start in range(0, n, chunk):
+        end = min(start + chunk, n)
+        counts[start:end] = tree.query_ball_point(xyz[start:end], eps, workers=-1, return_length=True)
+    is_core = counts >= min_points
+    budget_bytes = int(GET_CLUSTER_MEMORY_BUDGET_MB * 1024 * 1024)
+
+    if not is_core.any():
+        return pandas.DataFrame(np.full(n, -1, dtype=np.intp), columns=["cluster"])
+
+    # Stage 2: build core connectivity incrementally with union-find.
+    # Keep DSU arrays only for core points to reduce memory and improve cache locality.
+    core_idx = np.flatnonzero(is_core).astype(np.intp, copy=False)
+    n_core = len(core_idx)
+
+    core_pos = np.full(n, -1, dtype=np.intp)
+    core_pos[core_idx] = np.arange(n_core, dtype=np.intp)
+
+    parent = np.arange(n_core, dtype=np.intp)
+    rank = np.zeros(n_core, dtype=np.uint8)
+
+    # Query in chunks for throughput while keeping memory bounded.
+    # Each chunk only materializes neighbour lists for that chunk.
+    core_max_neighbors = int(counts[core_idx].max()) if n_core > 0 else 0
+    edge_chunk_limit = _budgeted_chunk_size(GET_CLUSTER_CORE_QUERY_CHUNK_SIZE, core_max_neighbors, budget_bytes)
+    edge_chunk = min(n_core, edge_chunk_limit)
+    for start in range(0, n_core, edge_chunk):
+        end = min(start + edge_chunk, n_core)
+        batch_idx = core_idx[start:end]
+        nbr_lists = tree.query_ball_point(xyz[batch_idx], eps, workers=-1)
+        left_pairs: list[int] = []
+        right_pairs: list[int] = []
+        for local_i, nbrs in enumerate(nbr_lists):
+            i_global = int(batch_idx[local_i])
+            i_core = int(core_pos[i_global])
+            for j_global in nbrs:
+                if j_global <= i_global:
+                    continue
+                j_core = int(core_pos[j_global])
+                if j_core >= 0:
+                    left_pairs.append(i_core)
+                    right_pairs.append(j_core)
+
+        if left_pairs:
+            left_arr = np.asarray(left_pairs, dtype=np.intp)
+            right_arr = np.asarray(right_pairs, dtype=np.intp)
+            union_pairs(parent, rank, left_arr, right_arr)
+
+    # Stage 3: assign contiguous labels to connected core components.
+    labels = np.full(n, -1, dtype=np.intp)
+    core_positions = np.arange(n_core, dtype=np.intp)
+    roots = roots_for_positions(parent, core_positions)
+    _, inverse = np.unique(roots, return_inverse=True)
+    labels[core_idx] = inverse.astype(np.intp, copy=False)
+
+    # Stage 4: attach border points (non-core, but within eps of a core point).
+    # Standard DBSCAN ambiguity: a border point reachable from multiple clusters
+    # is assigned to the first encountered - matching open3d's behaviour.
+    non_core_idx = np.flatnonzero(~is_core).astype(np.intp, copy=False)
+    non_core_max_neighbors = int(counts[non_core_idx].max()) if len(non_core_idx) > 0 else 0
+    border_chunk_limit = _budgeted_chunk_size(
+        GET_CLUSTER_BORDER_QUERY_CHUNK_SIZE,
+        non_core_max_neighbors,
+        budget_bytes,
+    )
+    border_chunk = min(len(non_core_idx), border_chunk_limit) if len(non_core_idx) > 0 else 1
+    for start in range(0, len(non_core_idx), border_chunk):
+        end = min(start + border_chunk, len(non_core_idx))
+        batch_idx = non_core_idx[start:end]
+        nbr_lists = tree.query_ball_point(xyz[batch_idx], eps, workers=-1)
+        for local_i, nbrs in enumerate(nbr_lists):
+            i_global = int(batch_idx[local_i])
+            for j_global in nbrs:
+                if is_core[j_global]:
+                    labels[i_global] = labels[int(j_global)]
+                    break
+
+    del core_pos
+
+    return pandas.DataFrame(labels, columns=["cluster"])
diff --git a/src/pointcloudset/cluster/numba.py b/src/pointcloudset/cluster/numba.py
@@ -0,0 +1,96 @@
+from __future__ import annotations
+
+import importlib
+
+import numpy as np
+
+
+def njit(*_args, **_kwargs):
+    def _decorator(func):
+        return func
+
+    return _decorator
+
+
+_numba = None
+try:
+    _numba = importlib.import_module("numba")
+except ImportError:
+    HAS_NUMBA = False
+else:
+    HAS_NUMBA = True
+    njit = _numba.njit
+
+
+def _find_root_python(parent: np.ndarray, i: int) -> int:
+    while parent[i] != i:
+        parent[i] = parent[parent[i]]
+        i = int(parent[i])
+    return i
+
+
+def _union_pairs_python(parent: np.ndarray, rank: np.ndarray, left: np.ndarray, right: np.ndarray) -> None:
+    for idx in range(len(left)):
+        root_a = _find_root_python(parent, int(left[idx]))
+        root_b = _find_root_python(parent, int(right[idx]))
+        if root_a == root_b:
+            continue
+        if rank[root_a] < rank[root_b]:
+            parent[root_a] = root_b
+        elif rank[root_a] > rank[root_b]:
+            parent[root_b] = root_a
+        else:
+            parent[root_b] = root_a
+            rank[root_a] += 1
+
+
+def _roots_for_positions_python(parent: np.ndarray, positions: np.ndarray) -> np.ndarray:
+    roots = np.empty(len(positions), dtype=np.intp)
+    for idx in range(len(positions)):
+        roots[idx] = _find_root_python(parent, int(positions[idx]))
+    return roots
+
+
+if HAS_NUMBA:
+
+    @njit(cache=True)
+    def _find_root_numba(parent: np.ndarray, i: int) -> int:
+        while parent[i] != i:
+            parent[i] = parent[parent[i]]
+            i = parent[i]
+        return i
+
+    @njit(cache=True)
+    def _union_pairs_numba(parent: np.ndarray, rank: np.ndarray, left: np.ndarray, right: np.ndarray) -> None:
+        for idx in range(len(left)):
+            root_a = _find_root_numba(parent, left[idx])
+            root_b = _find_root_numba(parent, right[idx])
+            if root_a == root_b:
+                continue
+            if rank[root_a] < rank[root_b]:
+                parent[root_a] = root_b
+            elif rank[root_a] > rank[root_b]:
+                parent[root_b] = root_a
+            else:
+                parent[root_b] = root_a
+                rank[root_a] += 1
+
+    @njit(cache=True)
+    def _roots_for_positions_numba(parent: np.ndarray, positions: np.ndarray) -> np.ndarray:
+        roots = np.empty(len(positions), dtype=np.intp)
+        for idx in range(len(positions)):
+            roots[idx] = _find_root_numba(parent, positions[idx])
+        return roots
+
+
+def union_pairs(parent: np.ndarray, rank: np.ndarray, left: np.ndarray, right: np.ndarray) -> None:
+    if HAS_NUMBA:
+        _union_pairs_numba(parent, rank, left, right)
+    else:
+        _union_pairs_python(parent, rank, left, right)
+
+
+def roots_for_positions(parent: np.ndarray, positions: np.ndarray) -> np.ndarray:
+    if HAS_NUMBA:
+        return _roots_for_positions_numba(parent, positions)
+    return _roots_for_positions_python(parent, positions)
diff --git a/src/pointcloudset/config.py b/src/pointcloudset/config.py
@@ -13,3 +13,8 @@
 }
 
 PLOTLYSIZELIMIT = 300000
+
+# Fixed cluster chunk sizes used by PointCloud.get_cluster.
+GET_CLUSTER_CORE_QUERY_CHUNK_SIZE = 1024
+GET_CLUSTER_BORDER_QUERY_CHUNK_SIZE = 4096
+GET_CLUSTER_MEMORY_BUDGET_MB = 1536.0