Genesis-Embodied-AI · Kashu7100 · Jun 8, 2026 · Jun 11, 2026
@@ -64,6 +64,17 @@ class BVHContext:
     # node loads instead of scattering over n_env identical trees. Recomputed on every rebuild.
     shared_across_envs: bool = False
 
+    # Tree routing. ``env_bvh_idx`` (n_env,) gives the BVH batch each env casts against; ``batch_repr_env``
+    # (n_batches,) gives the env whose geometry builds each tree slot. Together they express all three layouts:
+    #   per-env   : n_batches=n_env, env_bvh_idx=arange,    batch_repr_env=arange
+    #   shared    : n_batches=1,     env_bvh_idx=0,         batch_repr_env=[0]
+    #   N grouped : n_batches=N,     env_bvh_idx=group_id,  batch_repr_env=one env per group  (N distinct << n_env)
+    env_bvh_idx: "torch.Tensor | None" = None
+    batch_repr_env: "torch.Tensor | None" = None
+    # Compile-time thread-mapping selector for the cast: env-major (coalesced) when several envs read the same tree
+    # (shared / grouped allocation, or per-env trees detected identical), ray-major otherwise (distinct trees).
+    env_major: bool = False
+
 
 class RaycastContext(SharedSensorContext):
     """
@@ -101,6 +112,36 @@ def _compute_visual_raycast_mask(solver: "KinematicSolver") -> np.ndarray:
         vface_vgeom_idx = qd_to_numpy(solver.vfaces_info.vgeom_idx)
         return vgeom_enabled[vface_vgeom_idx].astype(np.int8)
 
+    @staticmethod
+    def _static_geometry_groups(solver: "RigidSolver"):
+        """Group envs by identical static collision geometry, returning ``(n_groups, env_bvh_idx, batch_repr_env)``.
+
+        Heterogeneity in Genesis is expressed via per-env active-geom ranges in batched ``links_info`` (one variant
+        block per env, see ``_balanced_variant_mapping``). Two envs cast against the same geometry iff every link's
+        ``(geom_start, geom_end)`` matches, so grouping by that per-env signature yields the distinct geometries:
+        a single group when the solver isn't batched (homogeneous), and N groups for N variants across the envs.
+        ``env_bvh_idx[b]`` is env ``b``'s group; ``batch_repr_env[g]`` is the lowest env index in group ``g`` (its
+        verts/ranges build that tree). Groups are numbered in order of first appearance (ascending representative
+        env), so when every env has its own geometry (``n_groups == n_envs``) the result is exactly the identity
+        ``arange`` routing used by the per-env layout. Caller guarantees, via ``shared_static_raycast_bvh``, that
+        the static geometry stays grouped as built (no per-env ``set_pos`` divergence), so the cheap range
+        signature suffices.
+        """
+        B = solver._B
+        if not solver._options.batch_links_info:
+            # Unbatched link info → every env shares one geom range → a single shared tree.
+            env_bvh_idx = torch.zeros(B, dtype=gs.tc_int, device=gs.device)
+            return 1, env_bvh_idx, torch.zeros(1, dtype=gs.tc_int, device=gs.device)
+        gstart = qd_to_torch(solver.links_info.geom_start)  # (n_links, B)
+        gend = qd_to_torch(solver.links_info.geom_end)
+        sig = torch.cat([gstart, gend], dim=0).t().contiguous()  # (B, 2*n_links) per-env geometry signature
+        uniq, inv = torch.unique(sig, dim=0, return_inverse=True)
+        n_groups = int(uniq.shape[0])
+        inv = inv.to(torch.int64)
+        # Lowest env index per group (deterministic; any group member is a valid representative since the group is
+        # geometry-identical, but a stable choice keeps builds reproducible).
+        env_ids = torch.arange(B, dtype=torch.int64, device=inv.device)
+        repr_env = torch.full((n_groups,), B, dtype=torch.int64, device=inv.device)
+        repr_env.scatter_reduce_(0, inv, env_ids, reduce="amin", include_self=True)
+        # ``torch.unique`` numbers groups by sorted signature, not by env order, so with all-distinct envs the raw
+        # routing would be a permutation rather than identity. Renumber groups by ascending representative env:
+        # ``order`` lists old group ids in new order, and its argsort is the old-id → new-id lookup.
+        order = torch.argsort(repr_env)
+        rank = torch.argsort(order)
+        env_bvh_idx = rank[inv].to(gs.tc_int)
+        return n_groups, env_bvh_idx, repr_env[order].to(gs.tc_int)
+
     def activate(self):
         """
         Build the per-(solver, mesh-type) BVHs on first activation; idempotent. Rigid solvers get a collision BVH
@@ -120,16 +161,39 @@ def activate(self):
             maybe_static = all(link.is_fixed for link in solver.links)
             if isinstance(solver, RigidSolver):
                 n_faces = solver.faces_info.geom_idx.shape[0]
-                aabb = AABB(n_batches=n_envs, n_aabbs=n_faces)
+                # A static collision mesh whose geometry is env-identical (or shared by groups of envs) produces
+                # bit-identical per-env BVHs, so one tree per distinct geometry serves its envs — dropping the
+                # n_envs-fold replication of nodes / aabbs / morton codes / radix-sort scratch, which dominates GPU
+                # memory for a high-poly terrain. Opt-in (shared_static_raycast_bvh) because env-identity is a
+                # runtime property (per-env set_pos on a fixed body diverges geometry after build) that can't be
+                # auto-proven at allocation time; the flag is a caller guarantee the static geometry stays grouped
+                # as built. With N distinct geometries across n_envs (N << n_envs, e.g. a terrain curriculum), the
+                # BVH collapses from n_envs trees to N. Without the flag, the per-env path keeps the runtime
+                # shared-across-envs detection below.
+                if maybe_static and solver._options.shared_static_raycast_bvh:
+                    n_groups, env_bvh_idx, batch_repr_env = self._static_geometry_groups(solver)
+                else:
+                    n_groups = n_envs
+                    env_bvh_idx = torch.arange(n_envs, dtype=gs.tc_int, device=gs.device)
+                    batch_repr_env = env_bvh_idx
+                aabb = AABB(n_batches=n_groups, n_aabbs=n_faces)
                 bvh = LBVH(aabb, max_n_query_result_per_aabb=0, n_radix_sort_groups=64)
-                self._bvh_contexts.append(BVHContext(solver, bvh, aabb, None, maybe_static))
+                entry = BVHContext(solver, bvh, aabb, None, maybe_static)
+                entry.env_bvh_idx = env_bvh_idx
+                entry.batch_repr_env = batch_repr_env
+                entry.env_major = n_groups < n_envs
+                self._bvh_contexts.append(entry)
             n_vfaces = solver.vfaces_info.vgeom_idx.shape[0]
             if n_vfaces > 0:
                 mask = self._compute_visual_raycast_mask(solver)
                 if mask.any():
                     aabb = AABB(n_batches=n_envs, n_aabbs=n_vfaces)
                     bvh = LBVH(aabb, max_n_query_result_per_aabb=0, n_radix_sort_groups=64)
-                    self._bvh_contexts.append(BVHContext(solver, bvh, aabb, mask, maybe_static))
+                    entry = BVHContext(solver, bvh, aabb, mask, maybe_static)
+                    # Visual BVH stays per-env (the shared-static optimization targets the collision terrain).
+                    entry.env_bvh_idx = torch.arange(n_envs, dtype=gs.tc_int, device=gs.device)
+                    entry.batch_repr_env = entry.env_bvh_idx
+                    self._bvh_contexts.append(entry)
 
         # Lazily watch each static BVH (collision or visual) for GEOMETRY changes. ``update`` polls its
         # rebuild_subscriber so an explicit set_pos / set_quat / set_vverts on the otherwise-immovable geometry forces
@@ -168,6 +232,7 @@ def update(self):
                     free_verts_state=entry.solver.free_verts_state,
                     fixed_verts_state=entry.solver.fixed_verts_state,
                     links_info=entry.solver.links_info,
+                    batch_repr_env=entry.batch_repr_env,
                     static_rigid_sim_config=entry.solver._static_rigid_sim_config,
                     aabb_state=entry.aabb,
                 )
@@ -189,19 +254,29 @@ def update(self):
                 )
                 entry.bvh.build()
             entry.needs_rebuild = False
-            # The per-env trees are bit-identical - so the cast can read one shared copy (batch 0) - exactly when the
-            # per-face AABBs they are built from match across envs. Comparing that build input directly (rather than a
-            # proxy like link poses or raw verts) captures per-env pose, batched verts, and any per-env geometry
-            # selection at once - so it stays correct whatever feeds the AABBs. A single-env solver gains nothing.
-            if entry.maybe_static and entry.aabb.n_batches > 1:
+            # Tree routing. A grouped/shared allocation (n_batches < n_envs) fixed its routing at activate time and
+            # is left as-is. For the per-env allocation (n_batches == n_envs) we recover the runtime shared-geometry
+            # optimization: if the freshly-built per-env trees are bit-identical, route every env to batch 0 (a
+            # coalesced read, no extra memory). Comparing the built AABBs directly captures per-env pose, batched
+            # verts and per-env geometry selection at once. Only static geometry can be shared this way.
+            n_envs = entry.solver._B
+            if entry.maybe_static and entry.aabb.n_batches == n_envs and n_envs > 1:
                 aabb_min = qd_to_torch(entry.aabb.aabbs.min)
                 aabb_max = qd_to_torch(entry.aabb.aabbs.max)
                 entry.shared_across_envs = bool(
                     torch.equal(aabb_min, aabb_min[:1].expand_as(aabb_min))
                     and torch.equal(aabb_max, aabb_max[:1].expand_as(aabb_max))
                 )
+                if entry.shared_across_envs:
+                    entry.env_bvh_idx = torch.zeros(n_envs, dtype=gs.tc_int, device=gs.device)
+                    entry.env_major = True
+                else:
+                    entry.env_bvh_idx = torch.arange(n_envs, dtype=gs.tc_int, device=gs.device)
+                    entry.env_major = False
             else:
-                entry.shared_across_envs = False
+                # Grouped (N<n_envs, routing set at activate) or a single shared tree (n_batches==1, shared by
+                # construction). The non-static per-env case keeps its activate-time identity routing.
+                entry.shared_across_envs = entry.aabb.n_batches < n_envs or n_envs <= 1
 
     def reset(self, envs_idx):
         # A reset may change otherwise-static geometry (re-randomized terrain, teleported obstacles), so force every
@@ -390,10 +465,11 @@ def _update_raw_data(
                 shared_metadata.sensor_cache_offsets,
                 shared_metadata.sensor_point_offsets,
                 shared_metadata.sensor_point_counts,
+                entry.env_bvh_idx,
                 raw_data_T,
                 gs.EPS,
                 i > 0,
-                entry.shared_across_envs,
+                entry.env_major,
             )
             if entry.raycast_mask is None:
                 kernel_cast_rays(

@@ -480,6 +480,18 @@ class RigidOptions(Options):
         Broadphase traversal strategy. ``SAP`` (sweep-and-prune) or ``ALL_VS_ALL`` (parallel pair iteration). Defaults
         to ``None`` (auto: ``SAP`` on CPU or when hibernation/heterogeneous entities are enabled, ``ALL_VS_ALL`` on GPU
         otherwise). See ``gs.broadphase_traversal`` for details on each strategy.
+    shared_static_raycast_bvh : bool, optional
+        Optimization for raycast sensors (``Raycaster`` / ``DepthCamera``) over large static scenes. When True, the
+        raycast BVH built over the solver's static (fully fixed) collision geometry is allocated **per distinct
+        geometry** rather than per env: envs with identical static geometry share one tree (each reads the tree for
+        its geometry). A homogeneous scene collapses to a single shared tree; a heterogeneous scene with N variants
+        across the envs (N << n_envs, e.g. a terrain curriculum) collapses from n_envs trees to N. For a high-poly
+        static terrain the per-env tree replication is the dominant raycast memory cost, so this cuts BVH
+        memory by roughly n_envs/N and lifts the env-count ceiling. It is a caller guarantee that each env's static
+        collision geometry stays as built: do not enable it if you give fixed entities per-env poses (e.g. per-env
+        ``set_pos`` on a fixed body) after build, as the shared tree would not reflect the divergence. Defaults to
+        False, which keeps per-env trees and the runtime "shared across envs" auto-detection. Only affects
+        raycasting, not physics.
 
     Warning
     -------
@@ -504,6 +516,10 @@ class RigidOptions(Options):
     batch_joints_info: StrictBool = False
     batch_dofs_info: StrictBool = False
 
+    # raycast: allocate one static-geometry BVH per distinct geometry (shared by the envs that use it) instead of
+    # one per env. Caller guarantees each env's static collision geometry stays as built. See the class docstring.
+    shared_static_raycast_bvh: StrictBool = False
+
     # constraint solver
     constraint_solver: gs.constraint_solver = gs.constraint_solver.Newton
     iterations: PositiveInt = 50