AcademySoftwareFoundation
diff --git a/rust/Cargo.toml b/rust/Cargo.toml
Lines changed: 8 additions & 0 deletions
diff --git a/rust/crates/scheduler/Cargo.toml b/rust/crates/scheduler/Cargo.toml
Lines changed: 3 additions & 0 deletions
diff --git a/rust/crates/scheduler/src/accounting/recompute.rs b/rust/crates/scheduler/src/accounting/recompute.rs
Lines changed: 26 additions & 1 deletion
diff --git a/rust/crates/scheduler/src/accounting/redis_client.rs b/rust/crates/scheduler/src/accounting/redis_client.rs
Lines changed: 6 additions & 3 deletions
diff --git a/rust/crates/scheduler/src/config/mod.rs b/rust/crates/scheduler/src/config/mod.rs
Lines changed: 8 additions & 0 deletions
diff --git a/rust/crates/scheduler/src/dao/host_dao.rs b/rust/crates/scheduler/src/dao/host_dao.rs
Lines changed: 12 additions & 34 deletions
diff --git a/rust/crates/scheduler/src/dao/layer_dao.rs b/rust/crates/scheduler/src/dao/layer_dao.rs
Lines changed: 41 additions & 39 deletions
@@ -35,5 +35,13 @@ tracing = "0.1.40"
 tracing-appender = "0.2.3"
 tracing-rolling-file = "0.1.2"
 tracing-subscriber = { version = "0.3.18", features = ["ansi", "env-filter"] }
+# Pin time to 0.3.47: the 0.3.48 release adds an impl that collides with
+# sentry-types' blanket `impl<T> From<T> for LogAttribute`, producing an
+# E0119 "conflicting implementations" error when building the scheduler
+# (which pulls both `sentry` and, via tracing-appender, `time/formatting`).
+# Since Cargo.lock is gitignored, CI resolves fresh and would otherwise float
+# to the broken 0.3.48. Remove the pin once sentry-types/time resolve this
+# upstream. Referenced by the scheduler crate so the constraint takes effect.
+time = "=0.3.47"
 structopt = "0.3.26"
 home = "=0.5.9"
@@ -54,6 +54,9 @@ lazy_static = "1.5"
 moka = { version = "0.12.10", features = ["future"] }
 prometheus = "0.13"
 sentry = { version = "0.47", features = ["tracing"] }
+# Version constraint only — see the `time` pin note in the workspace Cargo.toml.
+# Not used directly; declared so the `=0.3.47` pin applies to resolution.
+time = { workspace = true }
 axum = "0.7"
 tower-http = { version = "0.5", features = ["trace"] }
 urlencoding = "2.1"
 
@@ -40,6 +40,7 @@ use crate::accounting::redis_client::ReseedOp;
 use crate::accounting::AccountingService;
 use crate::config::CONFIG;
 use crate::dao::ResourceAccountingDao;
+use crate::metrics;
 use crate::models::CoreSize;
 
 pub fn spawn_loop(service: Arc<AccountingService>) {
@@ -51,11 +52,35 @@ pub fn spawn_loop(service: Arc<AccountingService>) {
                 return;
             }
         };
-        let mut interval = time::interval(CONFIG.accounting.recompute_interval);
+        let interval_dur = CONFIG.accounting.recompute_interval;
+        let mut interval = time::interval(interval_dur);
         // Skip the immediate first tick - bootstrap reseed already ran at startup.
         interval.tick().await;
+        // Dispatch heartbeat baseline: snapshot the session counters so the first
+        // logged delta only covers events after this point.
+        let mut last_dispatched = metrics::frames_dispatched_session();
+        let mut last_limit_exceeded = metrics::resource_limit_exceeded_session();
         loop {
             interval.tick().await;
+
+            // Dispatch heartbeat: the aggregate INFO that replaces the demoted
+            // per-frame dispatch logs. Decoupled from the accounting reseed below.
+            let current_dispatched = metrics::frames_dispatched_session();
+            let dispatched_delta = current_dispatched.saturating_sub(last_dispatched);
+            last_dispatched = current_dispatched;
+
+            let current_limit_exceeded = metrics::resource_limit_exceeded_session();
+            let limit_exceeded_delta =
+                current_limit_exceeded.saturating_sub(last_limit_exceeded);
+            last_limit_exceeded = current_limit_exceeded;
+
+            info!(
+                "Dispatched {} frames in the last {}ms ({} resource-limit-exceeded)",
+                dispatched_delta,
+                interval_dur.as_millis(),
+                limit_exceeded_delta
+            );
+
             let result = AssertUnwindSafe(async {
                 if let Err(err) = run_once(&service, &pg_dao).await {
                     warn!("Recompute cycle failed: {err}");
 
@@ -141,7 +141,9 @@ impl RedisAccounting {
 
     /// Reads the subscription hash's booked cores + burst in one round-trip from
     /// `acct:sub:{show_id}:{alloc_id}` (fields `int_cores`, `burst`, both in
-    /// centicores). Missing keys/fields are treated as `(0, 0)`. Non-authoritative:
+    /// cores — the centicore→core conversion happens once at the reseed write
+    /// boundary, see the `lua.rs` unit invariant). Missing keys/fields are
+    /// treated as `(0, 0)`. Non-authoritative:
     /// the dispatcher's Lua `BOOK_OR_FORCE` call remains the source of truth for
     /// the booking decision; this is a snapshot suitable for optimistic pre-filters
     /// and scoring inputs.
@@ -158,8 +160,9 @@ impl RedisAccounting {
         Ok((booked, burst))
     }
 
-    /// Reads `acct:job:{job_id}` `int_cores` (live booked cores, in centicores).
-    /// Returns 0 when the key/field is missing. Used by the E-PVM placement
+    /// Reads `acct:job:{job_id}` `int_cores` (live booked cores, in cores — see
+    /// the `lua.rs` unit invariant; Redis accounting counters are never stored
+    /// in centicores). Returns 0 when the key/field is missing. Used by the E-PVM placement
     /// snapshot in `MatchingService::process_layer` (design Branch 2a).
     pub async fn read_job_cores_in_use(
         &self,
 
@@ -146,6 +146,13 @@ pub struct QueueConfig {
     /// The reload only swaps the live set when it actually changed.
     #[serde(with = "humantime_serde")]
     pub cluster_reload_interval: Duration,
+    /// Duration a cluster sleeps after a pass found jobs but dispatched zero
+    /// frames (saturated farm: no host candidate fits any pending layer).
+    /// Keeps the loop from re-querying jobs and layers continuously while
+    /// nothing can be placed. Should stay in the same order of magnitude as
+    /// the host cache refresh interval so freed hosts are picked up promptly.
+    #[serde(with = "humantime_serde")]
+    pub cluster_saturated_sleep: Duration,
     pub stream: StreamConfig,
     /// Maximum number of jobs returned per cluster pass. Caps the per-pass
     /// dispatch cost so a big-show cluster doesn't iterate thousands of jobs
@@ -176,6 +183,7 @@ impl Default for QueueConfig {
             job_back_off_duration: Duration::from_secs(300),
             cluster_empty_sleep: Duration::from_secs(30),
             cluster_reload_interval: Duration::from_secs(120),
+            cluster_saturated_sleep: Duration::from_secs(5),
             stream: StreamConfig::default(),
             max_jobs_per_cluster_pass: 20,
             manual_tags_chunk_size: 50,
 
@@ -272,9 +272,11 @@ impl HostDao {
 
     /// Acquires an advisory lock on a host to prevent concurrent dispatch.
     ///
-    /// Uses PostgreSQL's advisory lock mechanism to ensure only one dispatcher
-    /// can modify a host's resources at a time. The lock is based on a hash
-    /// of the host ID string.
+    /// Uses PostgreSQL's transaction-scoped advisory lock mechanism to ensure
+    /// only one dispatcher can modify a host's resources at a time. The lock is
+    /// based on a hash of the host ID string and is released automatically when
+    /// the surrounding transaction commits or rolls back — there is no unlock
+    /// call, so the lock can never leak onto a pooled connection.
     ///
     /// # Arguments
     /// * `host_id` - The UUID of the host to lock
@@ -289,40 +291,14 @@ impl HostDao {
         host_id: &Uuid,
     ) -> Result<bool> {
         trace!("Locking {}", host_id);
-        sqlx::query_scalar::<_, bool>("SELECT pg_try_advisory_lock(hashtext($1))")
+        sqlx::query_scalar::<_, bool>("SELECT pg_try_advisory_xact_lock(hashtext($1))")
             .bind(host_id.to_string())
             .fetch_one(&mut **transaction)
             .await
             .into_diagnostic()
             .wrap_err("Failed to acquire advisory lock")
     }
 
-    /// Releases an advisory lock on a host after dispatch completion.
-    ///
-    /// Releases the PostgreSQL advisory lock that was acquired during
-    /// the dispatch process, allowing other dispatchers to access the host.
-    ///
-    /// # Arguments
-    /// * `host_id` - The UUID of the host to unlock
-    ///
-    /// # Returns
-    /// * `Ok(true)` - Lock successfully released
-    /// * `Ok(false)` - Lock was not held by this process
-    /// * `Err(miette::Error)` - Database operation failed
-    pub async fn unlock(
-        &self,
-        transaction: &mut Transaction<'_, Postgres>,
-        host_id: &Uuid,
-    ) -> Result<bool> {
-        trace!("Unlocking {}", host_id);
-        sqlx::query_scalar::<_, bool>("SELECT pg_advisory_unlock(hashtext($1))")
-            .bind(host_id.to_string())
-            .fetch_one(&mut **transaction)
-            .await
-            .into_diagnostic()
-            .wrap_err("Failed to release advisory lock")
-    }
-
     /// Updates a host's available resource counts after frame dispatch.
     ///
     /// Modifies the host's idle resource counters in the database to reflect
@@ -348,7 +324,8 @@ impl HostDao {
                 .bind(virtual_proc.cores_reserved.value())
                 .bind((virtual_proc.memory_reserved.as_u64() / KB) as i64)
                 .bind(virtual_proc.gpus_reserved as i32)
-                .bind(virtual_proc.gpu_memory_reserved.as_u64() as i64)
+                // GPU memory is stored in KB in the database, like main memory
+                .bind((virtual_proc.gpu_memory_reserved.as_u64() / KB) as i64)
                 .bind(host_id.to_string())
                 .fetch_optional(&mut **transaction)
                 .await
@@ -362,7 +339,7 @@ impl HostDao {
         if CONFIG.host_cache.update_stat_on_book {
             sqlx::query(UPDATE_HOST_STAT)
                 .bind((virtual_proc.memory_reserved.as_u64() / KB) as i64)
-                .bind(virtual_proc.gpu_memory_reserved.as_u64() as i64)
+                .bind((virtual_proc.gpu_memory_reserved.as_u64() / KB) as i64)
                 .bind(host_id.to_string())
                 .execute(&mut **transaction)
                 .await
@@ -395,7 +372,8 @@ impl HostDao {
             .bind(virtual_proc.cores_reserved.value())
             .bind((virtual_proc.memory_reserved.as_u64() / KB) as i64)
             .bind(virtual_proc.gpus_reserved as i32)
-            .bind(virtual_proc.gpu_memory_reserved.as_u64() as i64)
+            // GPU memory is stored in KB in the database, like main memory
+            .bind((virtual_proc.gpu_memory_reserved.as_u64() / KB) as i64)
             .bind(host_id.to_string())
             .execute(&mut **transaction)
             .await
@@ -404,7 +382,7 @@ impl HostDao {
         if CONFIG.host_cache.update_stat_on_book {
             sqlx::query(RESTORE_HOST_STAT)
                 .bind((virtual_proc.memory_reserved.as_u64() / KB) as i64)
-                .bind(virtual_proc.gpu_memory_reserved.as_u64() as i64)
+                .bind((virtual_proc.gpu_memory_reserved.as_u64() / KB) as i64)
                 .bind(host_id.to_string())
                 .execute(&mut **transaction)
                 .await
 
@@ -243,7 +243,7 @@ SELECT DISTINCT
     l.int_gpus_min,
     l.int_gpu_mem_min,
     l.str_tags,
-    jr.int_max_cores AS int_job_max_cores,
+    jr.int_max_cores::bigint AS int_job_max_cores,
     je.job_env,
     le.layer_env,
     l.int_dispatch_order,
@@ -364,33 +364,45 @@ impl LayerDao {
     ///
     /// * `Vec<DispatchLayer>` - Structured layers with grouped frames
     fn group_layers_and_frames(&self, models: Vec<LayerWithFramesModel>) -> Vec<DispatchLayer> {
-        let mut layers_map: HashMap<String, (DispatchLayerModel, Vec<DispatchFrameModel>)> =
-            HashMap::new();
+        // Rows arrive ordered by (layer.int_dispatch_order, frame order) from the SQL
+        // query. Group by layer while preserving that ordering: a HashMap maps the
+        // layer key to its slot in the output Vec, so layers keep their dispatch
+        // priority and frames keep their dispatch order within each layer.
+        let mut layer_slots: HashMap<String, usize> = HashMap::new();
+        let mut layers: Vec<(DispatchLayerModel, Vec<DispatchFrameModel>)> = Vec::new();
 
         for model in models {
-            // Extract layer data
-            let layer_model = DispatchLayerModel {
-                pk_layer: model.pk_layer.clone(),
-                pk_job: model.pk_job.clone(),
-                pk_facility: model.pk_facility.clone(),
-                pk_show: model.pk_show.clone(),
-                pk_folder: model.pk_folder.clone(),
-                pk_dept: model.pk_dept.clone(),
-                str_name: model.layer_name.clone(),
-                str_job_name: model.job_name.clone(),
-                str_os: model.str_os.clone(),
-                int_cores_min: model.int_cores_min,
-                int_mem_min: model.int_mem_min,
-                b_threadable: model.b_threadable,
-                int_gpus_min: model.int_gpus_min,
-                int_gpu_mem_min: model.int_gpu_mem_min,
-                str_tags: model.str_tags.clone(),
-                int_job_max_cores: model.int_job_max_cores,
+            let slot = match layer_slots.get(&model.pk_layer) {
+                Some(slot) => *slot,
+                None => {
+                    let layer_model = DispatchLayerModel {
+                        pk_layer: model.pk_layer.clone(),
+                        pk_job: model.pk_job.clone(),
+                        pk_facility: model.pk_facility.clone(),
+                        pk_show: model.pk_show.clone(),
+                        pk_folder: model.pk_folder.clone(),
+                        pk_dept: model.pk_dept.clone(),
+                        str_name: model.layer_name.clone(),
+                        str_job_name: model.job_name.clone(),
+                        str_os: model.str_os.clone(),
+                        int_cores_min: model.int_cores_min,
+                        int_mem_min: model.int_mem_min,
+                        b_threadable: model.b_threadable,
+                        int_gpus_min: model.int_gpus_min,
+                        int_gpu_mem_min: model.int_gpu_mem_min,
+                        str_tags: model.str_tags.clone(),
+                        int_job_max_cores: model.int_job_max_cores,
+                    };
+                    layers.push((layer_model, vec![]));
+                    let slot = layers.len() - 1;
+                    layer_slots.insert(model.pk_layer.clone(), slot);
+                    slot
+                }
             };
 
             // Extract frame data (if present)
-            let frame_model = if let Some(pk_frame) = model.pk_frame {
-                Some(DispatchFrameModel {
+            if let Some(pk_frame) = model.pk_frame {
+                let frame_model = DispatchFrameModel {
                     pk_frame,
                     str_frame_name: model.str_frame_name.unwrap_or_default(),
                     pk_show: model.pk_show.clone(),
@@ -418,26 +430,16 @@ impl LayerDao {
                     int_version: model.int_version.unwrap_or(1),
                     str_loki_url: model.str_loki_url,
                     ts_updated: model.ts_updated,
-                    job_env: model.job_env.0.clone(),
-                    layer_env: model.layer_env.0.clone(),
-                })
-            } else {
-                None
-            };
-
-            // Group by layer_id
-            let layer_entry = layers_map
-                .entry(model.pk_layer.clone())
-                .or_insert((layer_model, vec![]));
-
-            if let Some(frame) = frame_model {
-                layer_entry.1.push(frame);
+                    job_env: model.job_env.0,
+                    layer_env: model.layer_env.0,
+                };
+                layers[slot].1.push(frame_model);
             }
         }
 
         // Convert to DispatchLayer objects
-        layers_map
-            .into_values()
+        layers
+            .into_iter()
             .map(|(layer_model, frame_models)| DispatchLayer::new(layer_model, frame_models))
             .collect()
     }