AcademySoftwareFoundation
diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatchQuery.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DispatchQuery.java
Lines changed: 2 additions & 1 deletion
diff --git a/rust/Cargo.toml b/rust/Cargo.toml
Lines changed: 8 additions & 0 deletions
diff --git a/rust/config/scheduler.yaml b/rust/config/scheduler.yaml
Lines changed: 94 additions & 6 deletions
diff --git a/rust/crates/scheduler/Cargo.toml b/rust/crates/scheduler/Cargo.toml
Lines changed: 4 additions & 0 deletions
diff --git a/rust/crates/scheduler/src/accounting/recompute.rs b/rust/crates/scheduler/src/accounting/recompute.rs
Lines changed: 26 additions & 1 deletion
diff --git a/rust/crates/scheduler/src/accounting/redis_client.rs b/rust/crates/scheduler/src/accounting/redis_client.rs
Lines changed: 22 additions & 5 deletions
diff --git a/rust/crates/scheduler/src/cluster.rs b/rust/crates/scheduler/src/cluster.rs
Lines changed: 10 additions & 1 deletion
diff --git a/rust/crates/scheduler/src/cluster_key.rs b/rust/crates/scheduler/src/cluster_key.rs
Lines changed: 20 additions & 0 deletions
@@ -1243,7 +1243,8 @@ private static final String replaceQueryForFifo(String query) {
 
     /**
      * Looks for shows that are under their burst for a particular type of proc. The show has to be
-     * at least one whole proc under their burst to be considered for booking.
+     * at least one whole proc under their burst to be considered for booking. Scheduler-managed
+     * shows are excluded; their dispatch is owned by the standalone Rust scheduler.
      */
     // spotless:off
     public static final String FIND_SHOWS =
 
@@ -35,5 +35,13 @@ tracing = "0.1.40"
 tracing-appender = "0.2.3"
 tracing-rolling-file = "0.1.2"
 tracing-subscriber = { version = "0.3.18", features = ["ansi", "env-filter"] }
+# Pin time to 0.3.47: the 0.3.48 release adds an impl that collides with
+# sentry-types' blanket `impl<T> From<T> for LogAttribute`, producing an
+# E0119 "conflicting implementations" error when building the scheduler
+# (which pulls both `sentry` and, via tracing-appender, `time/formatting`).
+# Since Cargo.lock is gitignored, CI resolves fresh and would otherwise float
+# to the broken 0.3.48. Remove the pin once sentry-types/time resolve this
+# upstream. Referenced by the scheduler crate so the constraint takes effect.
+time = "=0.3.47"
 structopt = "0.3.26"
 home = "=0.5.9"
@@ -158,15 +158,103 @@ queue:
   #   # Default: 3
   #   job_buffer_size: 3
 
-  # Host booking strategy configuration
+  # ---------------------------------------------------------------------------
+  # Host booking strategy. Two variants:
+  #
+  # Saturation (default) — legacy first-fit. Saturation flags control B-tree
+  # iteration direction (saturated-first vs spread-first per dimension).
   # host_booking_strategy:
-  #   # Enable core saturation booking
-  #   # Default: true
+  #   type: saturation
   #   core_saturation: true
-  #
-  #   # Enable memory saturation booking
-  #   # Default: false
   #   memory_saturation: false
+  #
+  # Epvm — E-PVM stranding score; picks the lowest-scoring host among up to
+  # `max_candidates` scanned. Iteration is always saturated-first under Epvm,
+  # independent of the Saturation flags. `weights` express per-dimension
+  # importance ratios (W3 normalization, see scheduler design doc).
+  # host_booking_strategy:
+  #   type: epvm
+  #   max_candidates: 500
+  #   weights:
+  #     cores: 1.0
+  #     mem: 1.0
+  #     gpus: 2.0
+  #     gpu_mem: 1.0
+  #     gpu_count_reservation: 2.0
+  #     gpu_mem_reservation: 2.0
+  #
+  # --- gpu_*_reservation (soft-reservation penalty) --------------------------
+  # Applied ONLY when the layer requests no GPUs (gpus_min == 0). Penalizes GPU
+  # hosts to nudge non-GPU work onto non-GPU hosts. The penalty is split into
+  # two independent knobs so operators control the count-vs-memory balance:
+  #
+  #   penalty = gpu_count_reservation * idle_gpus
+  #           + gpu_mem_reservation   * idle_gpu_memory_gb
+  #
+  # Unlike the stranding weights these scale raw capacity (not a normalized
+  # stranding term). A typical default is 2.0/2.0 — strong enough to push GPU
+  # hosts behind non-GPU hosts for non-GPU work, weak enough not to swamp the
+  # cores/mem stranding signal. On hosts with a lot of GPU memory the mem
+  # penalty will tend to dominate; raise `gpu_count_reservation` (or lower
+  # `gpu_mem_reservation`) to rebalance.
+  #
+  # --- Example: equal-weight baseline ----------------------------------------
+  # Cores, memory, and GPU-memory stranding count equally; GPU (`gpus`) stranding
+  # counts double; non-GPU work mildly prefers non-GPU hosts. Matches the defaults.
+  # host_booking_strategy:
+  #   type: epvm
+  #   max_candidates: 500
+  #   weights:
+  #     cores: 1.0
+  #     mem: 1.0
+  #     gpus: 2.0
+  #     gpu_mem: 1.0
+  #     gpu_count_reservation: 2.0
+  #     gpu_mem_reservation: 2.0
+  #
+  # --- Example: GPU-scarce farm ----------------------------------------------
+  # Protect GPU hosts aggressively for GPU work and make GPU stranding very
+  # expensive (don't waste a GPU on a low-GPU layer). Reservation favors count
+  # heavily since GPUs themselves are the scarce resource.
+  # host_booking_strategy:
+  #   type: epvm
+  #   max_candidates: 500
+  #   weights:
+  #     cores: 1.0
+  #     mem: 1.0
+  #     gpus: 8.0
+  #     gpu_mem: 2.0
+  #     gpu_count_reservation: 10.0
+  #     gpu_mem_reservation: 1.0
+  #
+  # --- Example: memory-tight farm --------------------------------------------
+  # Penalize memory stranding more than cores so jobs with high mem:core
+  # ratios go to high-memory hosts first.
+  # host_booking_strategy:
+  #   type: epvm
+  #   max_candidates: 500
+  #   weights:
+  #     cores: 1.0
+  #     mem: 3.0
+  #     gpus: 2.0
+  #     gpu_mem: 1.0
+  #     gpu_count_reservation: 2.0
+  #     gpu_mem_reservation: 2.0
+  #
+  # --- Example: cores-only (debugging sanity check) --------------------------
+  # Scoring degenerates to "pack cores tightest" — useful for diagnosing
+  # whether other dimensions are responsible for a placement regression.
+  # host_booking_strategy:
+  #   type: epvm
+  #   max_candidates: 500
+  #   weights:
+  #     cores: 1.0
+  #     mem: 0.0
+  #     gpus: 0.0
+  #     gpu_mem: 0.0
+  #     gpu_count_reservation: 0.0
+  #     gpu_mem_reservation: 0.0
+  # ---------------------------------------------------------------------------
 
   # Soft memory limit multiplier for frame memory requirements
   # Used as a threshold to determine if a frame can be dispatched based on available memory
 
@@ -54,6 +54,9 @@ lazy_static = "1.5"
 moka = { version = "0.12.10", features = ["future"] }
 prometheus = "0.13"
 sentry = { version = "0.47", features = ["tracing"] }
+# Version constraint only — see the `time` pin note in the workspace Cargo.toml.
+# Not used directly; declared so the `=0.3.47` pin applies to resolution.
+time = { workspace = true }
 axum = "0.7"
 tower-http = { version = "0.5", features = ["trace"] }
 urlencoding = "2.1"
@@ -69,5 +72,6 @@ tokio-test = "0.4"
 tracing-test = "0.2"
 serial_test = "3.0"
 rand = "0.8"
+proptest = "1.5"
 testcontainers = "0.23"
 testcontainers-modules = { version = "0.11", features = ["redis"] }
@@ -40,6 +40,7 @@ use crate::accounting::redis_client::ReseedOp;
 use crate::accounting::AccountingService;
 use crate::config::CONFIG;
 use crate::dao::ResourceAccountingDao;
+use crate::metrics;
 use crate::models::CoreSize;
 
 pub fn spawn_loop(service: Arc<AccountingService>) {
@@ -51,11 +52,35 @@ pub fn spawn_loop(service: Arc<AccountingService>) {
                 return;
             }
         };
-        let mut interval = time::interval(CONFIG.accounting.recompute_interval);
+        let interval_dur = CONFIG.accounting.recompute_interval;
+        let mut interval = time::interval(interval_dur);
         // Skip the immediate first tick - bootstrap reseed already ran at startup.
         interval.tick().await;
+        // Dispatch heartbeat baseline: snapshot the session counters so the first
+        // logged delta only covers events after this point.
+        let mut last_dispatched = metrics::frames_dispatched_session();
+        let mut last_limit_exceeded = metrics::resource_limit_exceeded_session();
         loop {
             interval.tick().await;
+
+            // Dispatch heartbeat: the aggregate INFO that replaces the demoted
+            // per-frame dispatch logs. Decoupled from the accounting reseed below.
+            let current_dispatched = metrics::frames_dispatched_session();
+            let dispatched_delta = current_dispatched.saturating_sub(last_dispatched);
+            last_dispatched = current_dispatched;
+
+            let current_limit_exceeded = metrics::resource_limit_exceeded_session();
+            let limit_exceeded_delta =
+                current_limit_exceeded.saturating_sub(last_limit_exceeded);
+            last_limit_exceeded = current_limit_exceeded;
+
+            info!(
+                "Dispatched {} frames in the last {}ms ({} resource-limit-exceeded)",
+                dispatched_delta,
+                interval_dur.as_millis(),
+                limit_exceeded_delta
+            );
+
             let result = AssertUnwindSafe(async {
                 if let Err(err) = run_once(&service, &pg_dao).await {
                     warn!("Recompute cycle failed: {err}");
 
@@ -139,11 +139,14 @@ impl RedisAccounting {
         Ok(v.unwrap_or(0))
     }
 
-    /// Reads the subscription hash's booked cores + burst in one round-trip. Missing
-    /// keys/fields are treated as `(0, 0)`. Currently used by `AccountingService::
-    /// subscription_can_book` for diagnostic visibility and reachable future use; the
-    /// dispatcher hot path relies on the authoritative Lua booking check instead.
-    #[allow(dead_code)]
+    /// Reads the subscription hash's booked cores + burst in one round-trip from
+    /// `acct:sub:{show_id}:{alloc_id}` (fields `int_cores`, `burst`, both in
+    /// cores — the centicore→core conversion happens once at the reseed write
+    /// boundary, see the `lua.rs` unit invariant). Missing keys/fields are
+    /// treated as `(0, 0)`. Non-authoritative: the dispatcher's Lua
+    /// `BOOK_OR_FORCE` call remains the source of truth for the booking
+    /// decision; this is a snapshot suitable for optimistic pre-filters and
+    /// scoring inputs.
     pub async fn read_sub_counters(
         &self,
         show_id: uuid::Uuid,
@@ -156,6 +159,20 @@ impl RedisAccounting {
         let burst = values.get(1).copied().flatten().unwrap_or(0);
         Ok((booked, burst))
     }
+
+    /// Reads the `int_cores` field of the `acct:job:{job_id}` hash: the job's live booked
+    /// cores, in cores (Redis accounting counters are never stored in centicores; see the
+    /// `lua.rs` unit invariant). Returns 0 when the key or field is missing. Used by the
+    /// E-PVM placement snapshot in `MatchingService::process_layer` (design Branch 2a).
+    pub async fn read_job_cores_in_use(
+        &self,
+        job_id: uuid::Uuid,
+    ) -> Result<i64, AccountingError> {
+        let mut conn = self.conn.clone();
+        let key = format!("acct:job:{}", job_id);
+        let v: Option<i64> = conn.hget(&key, "int_cores").await?;
+        Ok(v.unwrap_or(0)) // absent key/field reads as zero booked cores
+    }
 }
 
 /// Parses the raw `redis::Value` returned by the `BOOK_OR_FORCE` Lua script (see
 
@@ -260,14 +260,19 @@ impl ClusterFeed {
                     let facility_id = cluster.facility_id;
                     let show_id = parse_uuid(&cluster.show_id);
                     match cluster.ttype.as_str() {
-                        // Each alloc tag becomes its own cluster
+                        // Each alloc tag becomes its own cluster. Carry pk_alloc
+                        // through Tag so the matcher can snapshot the
+                        // (show, alloc) subscription burst from Redis before
+                        // host checkout (see `MatchingService::process_layer`).
                         "ALLOC" => {
+                            let alloc_id = cluster.alloc_id.as_deref().map(parse_uuid);
                             clusters.push(Cluster::single_tag(
                                 facility_id,
                                 show_id,
                                 Tag {
                                     name: cluster.tag,
                                     ttype: TagType::Alloc,
+                                    alloc_id,
                                 },
                             ));
                         }
@@ -279,6 +284,7 @@ impl ClusterFeed {
                                 .insert(Tag {
                                     name: cluster.tag,
                                     ttype: TagType::Manual,
+                                    alloc_id: None,
                                 });
                         }
                         "HOSTNAME" => {
@@ -288,6 +294,7 @@ impl ClusterFeed {
                                 .insert(Tag {
                                     name: cluster.tag,
                                     ttype: TagType::HostName,
+                                    alloc_id: None,
                                 });
                         }
                         "HARDWARE" => {
@@ -297,6 +304,7 @@ impl ClusterFeed {
                                 .insert(Tag {
                                     name: cluster.tag,
                                     ttype: TagType::Hardware,
+                                    alloc_id: None,
                                 });
                         }
                         _ => (),
@@ -775,6 +783,7 @@ mod tests {
             Tag {
                 name: tag.to_string(),
                 ttype: TagType::Alloc,
+                alloc_id: None,
             },
         )
     }
 
@@ -21,10 +21,30 @@ pub enum TagType {
     Hardware,
 }
 
+/// IDENTITY NOTE: the derived `Hash`/`PartialEq`/`Eq`/`PartialOrd`/`Ord` impls
+/// cover every field, so `alloc_id` participates in tag identity and ordering.
+/// A `Tag{name, Alloc, Some(uuid)}` and a `Tag{name, Alloc, None}` with the same
+/// `name`/`ttype` are therefore *distinct* keys in `BTreeSet<Tag>` and
+/// `HashMap<ClusterKey, _>`. Today the DB-loaded path produces `Some(uuid)` for
+/// alloc tags and the CLI override path produces `None`; the two are not mixed
+/// within a single set, so the mismatch is latent. The CLI override path for
+/// alloc tags is being discontinued in the next stage, after which every
+/// `TagType::Alloc` tag carries a resolved `alloc_id` and this becomes a non-issue.
 #[derive(Serialize, Deserialize, Debug, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)]
 pub struct Tag {
     pub name: String,
     pub ttype: TagType,
+    /// `pk_alloc` (allocation UUID) when this tag was loaded from the database as a
+    /// `TagType::Alloc` cluster tag. Populated by `cluster.rs::load_clusters` on the
+    /// `"ALLOC"` arm and consumed by `MatchingService::process_layer` to read the
+    /// per-(show, alloc) subscription burst snapshot from Redis before host checkout.
+    ///
+    /// `None` for non-alloc tags (manual / hostname / hardware) and for CLI-built
+    /// tags whose str_tag → pk_alloc mapping isn't resolved at startup; those paths
+    /// fall back to the burst-unaware behavior. `#[serde(default)]` additionally
+    /// yields `None` when the field is absent from deserialized input.
+    #[serde(default)]
+    pub alloc_id: Option<Uuid>,
 }
 
 impl std::ops::Deref for Tag {