2 changes: 1 addition & 1 deletion assets/lab/environments/AGENTS.md
@@ -576,7 +576,7 @@ class MyGameEnv(vf.MultiTurnEnv):
return state.get("lives", 1) <= 0
```

`MultiTurnEnv` includes built-in stop conditions for errors, prompt length limits, `max_turns`, and `max_total_completion_tokens` by default.
`MultiTurnEnv` includes built-in stop conditions for errors, prompt length limits, `max_turns`, `timeout_seconds`, and `max_total_completion_tokens` by default.

Execution order can be controlled with `priority` (higher runs first). This is useful for checking cheap conditions before expensive ones:

2 changes: 1 addition & 1 deletion docs/environments.md
@@ -570,7 +570,7 @@ class MyGameEnv(vf.MultiTurnEnv):
return state.get("lives", 1) <= 0
```

`MultiTurnEnv` includes built-in stop conditions for errors, prompt length limits, `max_turns`, and `max_total_completion_tokens` by default.
`MultiTurnEnv` includes built-in stop conditions for errors, prompt length limits, `max_turns`, `timeout_seconds`, and `max_total_completion_tokens` by default.

Execution order can be controlled with `priority` (higher runs first). This is useful for checking cheap conditions before expensive ones:
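Generically, priority ordering just means evaluating conditions in descending priority and stopping at the first one that fires. A minimal sketch of the idea (an illustration only, not the verifiers API):

```python
def first_stop(conditions, state):
    """conditions: list of (name, priority, check) tuples; higher priority runs first."""
    for name, _priority, check in sorted(conditions, key=lambda c: c[1], reverse=True):
        if check(state):
            return name
    return None

# A cheap turn counter (priority 10) is checked before an expensive scan (priority 0),
# so the scan only runs when the cheap condition has not already stopped the rollout.
conditions = [
    ("expensive_scan", 0, lambda s: len(s.get("trajectory", [])) > 100),
    ("max_turns_reached", 10, lambda s: s["turn"] >= s["max_turns"]),
]
```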

4 changes: 2 additions & 2 deletions docs/evaluation.md
@@ -73,10 +73,10 @@ The `--env-args` flag passes arguments to your `load_environment()` function:
prime eval run my-env -a '{"difficulty": "hard", "num_examples": 100}'
```

The `--extra-env-kwargs` flag passes arguments directly to the environment constructor, useful for overriding defaults like `max_turns` which may not be exposed via `load_environment()`:
The `--extra-env-kwargs` flag passes arguments directly to the environment constructor. This is useful for overriding defaults like `max_turns` or setting rollout limits like `timeout_seconds`, which may not be exposed via `load_environment()`:

```bash
prime eval run my-env -x '{"max_turns": 20}'
prime eval run my-env -x '{"max_turns": 20, "timeout_seconds": 600}'
```

#### Executor autoscaling
9 changes: 7 additions & 2 deletions docs/reference.md
@@ -327,7 +327,12 @@ Single-response Q&A tasks. Inherits from `Environment`.

```python
class MultiTurnEnv(Environment):
def __init__(self, max_turns: int = -1, **kwargs): ...
def __init__(
self,
max_turns: int = -1,
timeout_seconds: float | None = None,
**kwargs,
): ...
```

Multi-turn interactions. Subclasses must implement `env_response`.
@@ -339,7 +344,7 @@ async def env_response(self, messages: Messages, state: State, **kwargs) -> Mess
"""Generate environment feedback after model turn."""
```

**Built-in stop conditions:** `has_error`, `prompt_too_long`, `max_turns_reached`, `max_total_completion_tokens_reached`, `has_final_env_response`
**Built-in stop conditions:** `has_error`, `prompt_too_long`, `max_turns_reached`, `timeout_reached`, `max_total_completion_tokens_reached`, `has_final_env_response`
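These conditions compose: per this PR's `CliAgentEnv` tests, a rollout that times out is not reported as agent-completed even when the completion flag is set. A synchronous approximation of that interplay (the real check is an async method on the environment):

```python
def agent_completed(state: dict) -> bool:
    """Sketch: a timed-out rollout no longer counts as agent-completed."""
    return bool(state.get("agent_completed")) and not state.get("timed_out", False)
```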

**Hooks:**

2 changes: 1 addition & 1 deletion environments/AGENTS.md
@@ -576,7 +576,7 @@ class MyGameEnv(vf.MultiTurnEnv):
return state.get("lives", 1) <= 0
```

`MultiTurnEnv` includes built-in stop conditions for errors, prompt length limits, `max_turns`, and `max_total_completion_tokens` by default.
`MultiTurnEnv` includes built-in stop conditions for errors, prompt length limits, `max_turns`, `timeout_seconds`, and `max_total_completion_tokens` by default.

Execution order can be controlled with `priority` (higher runs first). This is useful for checking cheap conditions before expensive ones:

2 changes: 1 addition & 1 deletion skills/evaluate-environments/SKILL.md
@@ -89,7 +89,7 @@ prime eval run my-env -a '{"difficulty":"hard"}'
```
2. Override constructor kwargs:
```bash
prime eval run my-env -x '{"max_turns":20}'
prime eval run my-env -x '{"max_turns":20,"timeout_seconds":600}'
```
3. Save extra state columns:
```bash
55 changes: 52 additions & 3 deletions tests/test_cli_agent_env.py
@@ -1,6 +1,8 @@
"""Tests for CliAgentEnv and HarborEnv."""

import asyncio
import tempfile
import time
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch

@@ -63,6 +65,7 @@ def test_init_basic(self, sample_dataset):
assert env.docker_image == "python:3.11-slim"
assert env.interception_port == 8765
assert env.timeout_seconds == 3600.0
assert env.timeout_reached.__func__ is vf.MultiTurnEnv.timeout_reached

def test_init_custom_config(self, sample_dataset):
"""Test initialization with custom configuration."""
@@ -130,6 +133,9 @@ async def test_agent_completed_stop_condition(self, sample_dataset):
state = {"agent_completed": True}
assert await env.agent_completed(state) is True

state = {"agent_completed": True, "timed_out": True}
assert await env.agent_completed(state) is False

@pytest.mark.asyncio
async def test_timeout_reached_stop_condition(self, sample_dataset):
"""Test the timeout_reached stop condition."""
@@ -139,13 +145,56 @@ async def test_timeout_reached_stop_condition(self, sample_dataset):
rubric=vf.Rubric(),
timeout_seconds=10.0,
)
import time

state = {"timing": {"start_time": time.time()}}
state = {
"timing": {"start_time": time.time()},
"_start_perf_counter": time.perf_counter(),
}
assert await env.timeout_reached(state) is False

state = {"timing": {"start_time": time.time() - 20}}
state = {
"timing": {"start_time": time.time() - 20},
"_start_perf_counter": time.perf_counter() - 20,
}
assert await env.timeout_reached(state) is True
assert state["timed_out"] is True
assert state["is_truncated"] is True

def test_disabled_timeout_omits_sandbox_timeout(self, sample_dataset):
"""Disabling rollout timeout should not send a zero-minute sandbox timeout."""
env = vf.CliAgentEnv(
run_command="python agent.py",
dataset=sample_dataset,
rubric=vf.Rubric(),
timeout_seconds=None,
)

resources = env.get_sandbox_resources({})

assert "timeout_minutes" not in resources

@pytest.mark.asyncio
async def test_poll_next_request_exits_on_rollout_timeout(self, sample_dataset):
"""Polling should unblock when the inherited rollout timeout is reached."""
env = vf.CliAgentEnv(
run_command="python agent.py",
dataset=sample_dataset,
rubric=vf.Rubric(),
timeout_seconds=0.01,
poll_interval=0.001,
)
state = {
"request_id_queue": asyncio.Queue(),
"agent_completed": False,
"timing": {"start_time": time.time() - 1.0},
"_start_perf_counter": time.perf_counter() - 1.0,
}

request_id = await env._poll_next_request(state)

assert request_id is None
assert state["timed_out"] is True
assert state["is_truncated"] is True

@pytest.mark.asyncio
async def test_env_response_returns_empty(self, sample_dataset):
25 changes: 25 additions & 0 deletions tests/test_eval_cli.py
@@ -230,6 +230,20 @@ def test_cli_temperature_not_added_when_none(monkeypatch, run_cli):
assert "temperature" not in sa


def test_cli_extra_env_kwargs_support_timeout_seconds(monkeypatch, run_cli):
captured = run_cli(
monkeypatch,
{
"extra_env_kwargs": {"timeout_seconds": 30, "foo": "bar"},
},
)

assert captured["configs"][0].extra_env_kwargs == {
"timeout_seconds": 30,
"foo": "bar",
}


def test_cli_headers_table_and_list_merge(monkeypatch, run_cli):
captured = run_cli(
monkeypatch,
@@ -872,6 +886,17 @@ def test_load_toml_config_global_values_with_per_eval_override():
assert result[1]["num_examples"] == 50 # per-eval override


def test_load_toml_config_with_extra_env_kwargs():
with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
f.write(
'[[eval]]\nenv_id = "env1"\n[eval.extra_env_kwargs]\ntimeout_seconds = 600\n'
)
f.flush()
result = load_toml_config(Path(f.name))

assert result[0]["extra_env_kwargs"] == {"timeout_seconds": 600}


def test_load_toml_config_invalid_global_field():
"""Invalid global field raises ValueError."""
with tempfile.NamedTemporaryFile(suffix=".toml", delete=False, mode="w") as f:
116 changes: 116 additions & 0 deletions tests/test_multiturn_env.py
@@ -1,5 +1,8 @@
"""Tests for the MultiTurnEnv class."""

import asyncio
import time

import pytest
from datasets import Dataset

@@ -12,6 +15,7 @@ class TestMultiTurnEnv:
def test_multiturn_env_initialization(self, mock_multiturn_env):
"""Test MultiTurnEnv initialization."""
assert mock_multiturn_env.max_turns == 3
assert mock_multiturn_env.timeout_seconds is None
assert mock_multiturn_env.message_type == "chat" # Default from parent

def test_multiturn_env_default_max_turns(self, mock_client, sample_chat_dataset):
@@ -26,6 +30,38 @@ def test_multiturn_env_default_max_turns(self, mock_client, sample_chat_dataset)
rubric=Rubric(),
)
assert env.max_turns == -1 # Default value
assert env.timeout_seconds is None

@pytest.mark.asyncio
async def test_timeout_reached_stop_condition(
self, mock_client, sample_chat_dataset
):
"""Test the timeout_reached stop condition."""
from tests.conftest import SimpleMultiTurnEnv

env = SimpleMultiTurnEnv(
client=mock_client,
model="test-model",
dataset=sample_chat_dataset,
parser=Parser(),
rubric=Rubric(),
timeout_seconds=10.0,
)

state: State = {
"timing": {"start_time": time.time()},
"_start_perf_counter": time.perf_counter(),
}
assert await env.timeout_reached(state) is False
assert state.get("timed_out") is None

state = {
"timing": {"start_time": time.time() - 20},
"_start_perf_counter": time.perf_counter() - 20,
}
assert await env.timeout_reached(state) is True
assert state["timed_out"] is True
assert state["is_truncated"] is True

@pytest.mark.asyncio
async def test_basic_multiturn_rollout(self, mock_multiturn_env, make_input):
@@ -103,6 +139,86 @@ async def test_max_turns_limiting(self, mock_multiturn_env_max_turns, make_input
assert completion[1]["role"] == "user"
assert completion[2]["role"] == "assistant"

@pytest.mark.asyncio
async def test_timeout_seconds_limits_rollout(
self, mock_client, sample_chat_dataset, make_input
):
"""Test that rollout stops when the wall-clock timeout is reached."""

class SlowMultiTurnEnv(MultiTurnEnv):
async def env_response(self, messages, state, **kwargs): # type: ignore[override]
return [{"role": "user", "content": "Continue"}]

async def add_model_response(self, state, prompt_messages, response): # type: ignore[override]
await super().add_model_response(state, prompt_messages, response)
await asyncio.sleep(0.05)

env = SlowMultiTurnEnv(
client=mock_client,
model="test-model",
dataset=sample_chat_dataset,
parser=Parser(),
rubric=Rubric(),
timeout_seconds=0.01,
)
mock_client.set_default_response("Still going")

prompt = [{"role": "user", "content": "Start conversation"}]
state = await env.rollout(
input=make_input(prompt=prompt, answer="target_answer"),
client=mock_client,
model="test-model",
)

assert len(state["trajectory"]) == 1
assert state["timed_out"] is True
assert state["is_completed"] is True
assert state["is_truncated"] is True
assert state["stop_condition"] == "timeout_reached"
completion = state["completion"]
assert len(completion) == 1
assert completion[0]["role"] == "assistant"
assert completion[0]["content"] == "Still going"

@pytest.mark.asyncio
async def test_timeout_seconds_limits_setup(
self, mock_client, sample_chat_dataset, make_input
):
"""Test that the rollout timeout applies while setup is in flight."""

class SlowSetupEnv(MultiTurnEnv):
async def setup_state(self, state): # type: ignore[override]
await asyncio.sleep(1)
return state

async def env_response(self, messages, state, **kwargs): # type: ignore[override]
return [{"role": "user", "content": "Continue"}]

env = SlowSetupEnv(
client=mock_client,
model="test-model",
dataset=sample_chat_dataset,
parser=Parser(),
rubric=Rubric(),
timeout_seconds=0.01,
)

state = await env.rollout(
input=make_input(
prompt=[{"role": "user", "content": "Start conversation"}],
answer="target_answer",
),
client=mock_client,
model="test-model",
)

assert state["timed_out"] is True
assert state["is_completed"] is True
assert state["is_truncated"] is True
assert state["stop_condition"] == "timeout_reached"
assert state["trajectory"] == []
assert state["completion"] == []

@pytest.mark.asyncio
async def test_override_is_completed_respects_max_turns(
self, mock_client, sample_chat_dataset, make_input
5 changes: 3 additions & 2 deletions verifiers/envs/environment.py
@@ -618,6 +618,7 @@ async def init_state(
total_ms=0.0,
start_time=time.time(),
)
state["_start_perf_counter"] = time.perf_counter()
return state

@abstractmethod
@@ -663,8 +664,8 @@ async def _render_stop(self, state: State, condition) -> bool:
return False

async def _render_timing(self, state: State):
start_time = state["timing"]["start_time"]
end_time = time.time()
start_time = state.get("_start_perf_counter", state["timing"]["start_time"])
end_time = time.perf_counter()

Fallback mixes incompatible clock sources, silently breaking timeouts

Medium Severity

The fallback in state.get("_start_perf_counter", state["timing"]["start_time"]) mixes two incompatible clock sources. state["timing"]["start_time"] is set via time.time() (~1.7 billion epoch seconds), but it's subtracted from time.perf_counter() (a much smaller monotonic value). If the fallback ever triggers, the result is a large negative number: timeout_reached silently never fires, and _render_timing records wildly negative millisecond values. The same broken pattern appears in all three files.

Reviewed by Cursor Bugbot for commit f4ea7c9.

state["timing"]["generation_ms"] = (end_time - start_time) * 1000
state["timing"]["total_ms"] = (end_time - start_time) * 1000
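The bot's concern is easy to reproduce in isolation: `time.time()` returns epoch seconds (~1.7e9 today), while `time.perf_counter()` is monotonic with an arbitrary, much smaller origin, so subtracting one from the other yields a large negative "elapsed" value and a `elapsed >= timeout_seconds` comparison can never fire:

```python
import time

epoch_start = time.time()  # ~1.7e9 epoch seconds on current systems
elapsed = time.perf_counter() - epoch_start  # mixes two incompatible clocks
# On any realistic system perf_counter's origin is far below the epoch clock,
# so the mixed subtraction is hugely negative and the timeout is dead code.
assert elapsed < 0
```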
