2 changes: 1 addition & 1 deletion Cargo.lock


44 changes: 44 additions & 0 deletions DIARY.md
@@ -30,6 +30,50 @@ A log of the development journey building Vixy - an Ethereum EL/CL proxy in Rust

<!-- Add new entries below this line, newest first -->

### 2026-01-23 - Configurable Health Check Retry Logic

**What I did:**
- Added configurable health check retry logic to prevent nodes from being marked unhealthy on transient failures
- Added a `health_check_max_failures` config field to the `Global` struct (default: 3)
- Added `consecutive_failures` tracking to both `ElNodeState` and `ClNodeState`
- Updated the health calculation functions to only mark nodes unhealthy after the configured number of consecutive failures
- Nodes reset their failure counter when they recover
- Updated all test code to work with the new logic
- Added comprehensive tests for retry and recovery scenarios

**Key Implementation Details:**
- Config field: `health_check_max_failures` in `[global]` section
- Node state tracking: `consecutive_failures: u32` field
- Health calculation logic:
  - If a check passes: reset `consecutive_failures` to 0 and mark the node healthy
  - If a check fails: increment `consecutive_failures`
  - Only mark the node unhealthy once `consecutive_failures` reaches the threshold
- Updated function signatures: `calculate_el_health(node, chain_head, max_lag, max_failures)` (see the sketch after this list)
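
Only the CL side of this change appears in the diff further down; below is a minimal sketch of what the EL counterpart described above might look like. The flow mirrors the CL version, but the `ElNodeState` field names used here (`block_number`, `check_ok`) are assumptions for illustration, not taken from the actual codebase.

```rust
/// Sketch only - the real implementation lives elsewhere in the crate and may differ.
pub fn calculate_el_health(
    node: &mut ElNodeState,
    chain_head: u64,
    max_lag: u64,
    max_failures: u32,
) {
    // Lag is how far this node's head is behind the observed chain head.
    node.lag = chain_head.saturating_sub(node.block_number);

    // A check passes when the node reports healthy and is within the lag threshold.
    let check_passed = node.check_ok && node.lag <= max_lag;

    if check_passed {
        // Any success clears the failure streak and restores health immediately.
        node.consecutive_failures = 0;
        node.is_healthy = true;
    } else {
        // Failures accumulate; the node only flips to unhealthy once the
        // configured threshold of consecutive failures is reached.
        node.consecutive_failures += 1;
        if node.consecutive_failures >= max_failures {
            node.is_healthy = false;
        }
    }
}
```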

**Tests Added:**
- test_el_node_consecutive_failures_reset_on_recovery
- test_cl_node_consecutive_failures_reset_on_recovery
- Updated existing tests to verify retry behavior (3 failures before unhealthy)

**Challenges faced:**
- Had to update all test helpers across multiple modules (http.rs, selection.rs, ws.rs, monitor.rs)
- Needed to ensure the retry logic works correctly for both EL and CL nodes
- Had to update both health check functions and all test code simultaneously

**How I solved it:**
- Added the consecutive_failures field to both node state structs
- Modified health calculation to track failures and only mark unhealthy after threshold
- Updated all test helpers to include the new fields
- Updated tests to verify the retry behavior works correctly

**What I learned:**
- Rust's type system makes refactoring safe - the compiler caught every struct initializer that was missing the new field
- Configurable retry logic prevents flapping between healthy/unhealthy states
- Resetting the counter on success allows nodes to recover naturally
- Comprehensive tests ensure the retry logic works as expected

**Mood:** Accomplished - this is a valuable feature for production resilience!

### 2026-01-21 - Fixed WSS/TLS Connection Support

**What I did:**
4 changes: 4 additions & 0 deletions config.example.toml
@@ -17,6 +17,10 @@ proxy_timeout_ms = 30000
# Maximum number of retry attempts for failed proxy requests
max_retries = 2

# Number of consecutive health check failures before marking node as unhealthy
# This prevents transient failures from immediately marking a node as unhealthy
health_check_max_failures = 3

[metrics]
# Enable or disable Prometheus metrics
enabled = true
3 changes: 3 additions & 0 deletions src/config.rs
@@ -32,6 +32,8 @@ pub struct Global {
pub proxy_timeout_ms: u64,
/// Maximum number of retry attempts for failed proxy requests
pub max_retries: u32,
/// Number of consecutive health check failures before marking node as unhealthy
pub health_check_max_failures: u32,
}

/// Metrics configuration settings
@@ -62,6 +64,7 @@ impl Default for Global {
health_check_interval_ms: 1000,
proxy_timeout_ms: 30000,
max_retries: 2,
health_check_max_failures: 3,
}
}
}
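
To show how the new field might be consumed, here is a hypothetical wiring sketch (not part of this diff): a helper that feeds the configured threshold into the CL health calculation for every node. The helper name, the `use` paths, and the source of `max_lag` are guesses based on the file layout; only the `calculate_cl_health` and `update_cl_chain_head` signatures come from the change itself.

```rust
use crate::config::Global;
use crate::health::cl::{calculate_cl_health, update_cl_chain_head, ClNodeState};

/// Hypothetical helper: recompute health for all CL nodes using the
/// configured consecutive-failure threshold.
fn refresh_cl_health(cl_nodes: &mut [ClNodeState], max_lag: u64, global: &Global) {
    // Derive the current chain head from the tracked nodes.
    let chain_head = update_cl_chain_head(cl_nodes);

    // Each node only flips to unhealthy after `health_check_max_failures`
    // consecutive failed checks, and recovers on its first successful check.
    for node in cl_nodes.iter_mut() {
        calculate_cl_health(node, chain_head, max_lag, global.health_check_max_failures);
    }
}
```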
110 changes: 100 additions & 10 deletions src/health/cl.rs
@@ -81,12 +81,32 @@ pub fn update_cl_chain_head(nodes: &[ClNodeState]) -> u64 {
}

/// Calculate health status for a CL node based on chain head and max lag
pub fn calculate_cl_health(node: &mut ClNodeState, chain_head: u64, max_lag: u64) {
pub fn calculate_cl_health(
node: &mut ClNodeState,
chain_head: u64,
max_lag: u64,
max_failures: u32,
) {
// Calculate lag (how far behind the node is from chain head)
node.lag = chain_head.saturating_sub(node.slot);

// Node is healthy if health endpoint is OK AND lag is within threshold
node.is_healthy = node.health_ok && node.lag <= max_lag;
// Determine if this check passed (health endpoint OK AND lag is within threshold)
let check_passed = node.health_ok && node.lag <= max_lag;

if check_passed {
// Reset consecutive failures on success
node.consecutive_failures = 0;
node.is_healthy = true;
} else {
// Increment consecutive failures
node.consecutive_failures += 1;

// Only mark as unhealthy if we've exceeded the threshold
if node.consecutive_failures >= max_failures {
node.is_healthy = false;
}
// Otherwise, keep the current health status (might still be healthy from before)
}
}

#[cfg(test)]
@@ -205,6 +225,7 @@ mod tests {
health_ok,
is_healthy: false,
lag: 0,
consecutive_failures: 0,
}
}

@@ -213,37 +234,73 @@
let mut node = make_cl_node("test", 1000, true);
let chain_head = 1005;

calculate_cl_health(&mut node, chain_head, 10);
calculate_cl_health(&mut node, chain_head, 10, 3);

assert_eq!(node.lag, 5, "Lag should be chain_head - slot");
}

#[test]
fn test_cl_node_unhealthy_when_health_fails() {
let mut node = make_cl_node("test", 1000, false); // health_ok = false
node.is_healthy = true; // Start as healthy
let chain_head = 1000;
let max_lag = 3;

calculate_cl_health(&mut node, chain_head, max_lag);
// First failure
calculate_cl_health(&mut node, chain_head, max_lag, 3);
assert_eq!(node.consecutive_failures, 1);
assert!(
node.is_healthy,
"Node should still be healthy after 1 failure"
);

// Second failure
calculate_cl_health(&mut node, chain_head, max_lag, 3);
assert_eq!(node.consecutive_failures, 2);
assert!(
node.is_healthy,
"Node should still be healthy after 2 failures"
);

// Third failure
calculate_cl_health(&mut node, chain_head, max_lag, 3);
assert_eq!(node.consecutive_failures, 3);
assert!(
!node.is_healthy,
"Node should be unhealthy when health_ok is false"
"Node should be unhealthy after 3 failures"
);
assert_eq!(node.lag, 0);
}

#[test]
fn test_cl_node_unhealthy_when_lagging() {
let mut node = make_cl_node("test", 990, true); // health_ok = true
node.is_healthy = true; // Start as healthy
let chain_head = 1000;
let max_lag = 3;

calculate_cl_health(&mut node, chain_head, max_lag);
// First failure - still healthy
calculate_cl_health(&mut node, chain_head, max_lag, 3);
assert_eq!(node.consecutive_failures, 1);
assert!(
node.is_healthy,
"Node should still be healthy after 1 failure"
);

// Second failure - still healthy
calculate_cl_health(&mut node, chain_head, max_lag, 3);
assert_eq!(node.consecutive_failures, 2);
assert!(
node.is_healthy,
"Node should still be healthy after 2 failures"
);

// Third failure - now unhealthy
calculate_cl_health(&mut node, chain_head, max_lag, 3);
assert_eq!(node.consecutive_failures, 3);
assert!(
!node.is_healthy,
"Node should be unhealthy when lag > max_lag"
"Node should be unhealthy after 3 failures"
);
assert_eq!(node.lag, 10);
}
@@ -254,7 +311,7 @@ mod tests {
let chain_head = 1000;
let max_lag = 3;

calculate_cl_health(&mut node, chain_head, max_lag);
calculate_cl_health(&mut node, chain_head, max_lag, 3);

assert!(
node.is_healthy,
@@ -269,7 +326,7 @@ mod tests {
let chain_head = 1000;
let max_lag = 3;

calculate_cl_health(&mut node, chain_head, max_lag);
calculate_cl_health(&mut node, chain_head, max_lag, 3);

assert!(
node.is_healthy,
@@ -278,6 +335,39 @@
assert_eq!(node.lag, 3);
}

#[test]
fn test_cl_node_consecutive_failures_reset_on_recovery() {
let mut node = make_cl_node("test", 990, true);
node.is_healthy = true; // Start as healthy
let chain_head = 1000;
let max_lag = 3;

// First failure - still healthy
calculate_cl_health(&mut node, chain_head, max_lag, 3);
assert_eq!(node.consecutive_failures, 1);
assert!(
node.is_healthy,
"Node should still be healthy after 1 failure"
);

// Second failure - still healthy
calculate_cl_health(&mut node, chain_head, max_lag, 3);
assert_eq!(node.consecutive_failures, 2);
assert!(
node.is_healthy,
"Node should still be healthy after 2 failures"
);

// Recovery - node catches up
node.slot = 1000;
calculate_cl_health(&mut node, chain_head, max_lag, 3);
assert_eq!(
node.consecutive_failures, 0,
"Consecutive failures should reset on success"
);
assert!(node.is_healthy, "Node should be healthy after recovery");
}

// =========================================================================
// update_cl_chain_head tests
// =========================================================================