Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 62 additions & 22 deletions verifiers/envs/experimental/composable/tasksets/swe/r2e_gym.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,9 +116,15 @@ def _extract_gold_patch(


def _process_example(x):
info = {**x}
# Expose generic instance_id / repo aliases so TaskSet.validate() can
# surface them in JSONL output (R2E-Gym rows natively use commit_hash
# and repo_name).
info.setdefault("instance_id", x.get("commit_hash"))
info.setdefault("repo", x.get("repo_name"))
return {
"question": x["problem_statement"],
"info": {**x},
"info": info,
"answer": "",
}

Expand Down Expand Up @@ -178,6 +184,7 @@ def __init__(
ds_num_proc: int | None = 8,
ds_keep_in_memory: bool = True,
timeout_minutes: int = 60,
hide_tests_from_agent: bool = True,
):
"""
Args:
Expand All @@ -186,6 +193,15 @@ def __init__(
post-``_process_example`` rows, so predicates see the
``{"question", "info", "answer", ...}`` shape (e.g.
``"lambda x: x['info']['repo_name'] == 'pandas-dev/pandas'"``).
hide_tests_from_agent: When True (default), ``setup()`` tars
``/r2e_tests`` off to the host and removes it from the
sandbox so the running agent can't read the ground-truth
tests; ``_run_tests()`` uploads the archive back at scoring
time. Required for fair agent rollouts. Set False when no
agent is running (e.g., ``TaskSet.validate()``) to swap in
an in-sandbox ``mv /r2e_tests <repo_path>/r2e_tests`` instead;
this eliminates the per-row tar/download/upload roundtrip and
cuts setup cost by an order of magnitude.
"""
self.dataset_name = dataset_name
self.repo_path = repo_path
Expand All @@ -194,6 +210,7 @@ def __init__(
self.ds_num_proc = ds_num_proc
self.ds_keep_in_memory = ds_keep_in_memory
self.timeout_minutes = timeout_minutes
self.hide_tests_from_agent = hide_tests_from_agent
super().__init__(
dataset=self._build_dataset(),
name="swe/r2e",
Expand Down Expand Up @@ -248,7 +265,15 @@ def get_env_vars(self) -> dict[str, str]:
}

async def setup(self, state) -> None:
"""Symlink venv, clean pycache, download r2e_tests to host and remove from sandbox."""
"""Symlink venv, clean pycache, stage r2e_tests for scoring.

If ``hide_tests_from_agent`` (default), tars ``/r2e_tests`` off to
the host and removes it from the sandbox so the agent can't read
the tests while working; ``_run_tests()`` uploads the archive back
for scoring. If False (no-agent flows like validate), just runs
``mv /r2e_tests <repo_path>/r2e_tests`` in-sandbox — no host I/O,
much faster setup.
"""
sandbox_client = state["sandbox_client"]
sandbox_id = state["sandbox_id"]

Expand Down Expand Up @@ -299,7 +324,13 @@ async def _exec(
except Exception as e:
logger.warning(f"Continuing without deleting pycache: {e!r}")

# Download r2e_tests to host, remove from sandbox
if not self.hide_tests_from_agent:
# Fast-path: no agent is running, so tests can live in
# /testbed/r2e_tests from the start. No host roundtrip.
await _exec(f"mv /r2e_tests {self.repo_path}/r2e_tests", timeout=60)
return

# Agent-safe path: stash tests on host, remove from sandbox.
remote_archive = "/tmp/r2e_tests.tar.gz"
local_archive_path = str(Path("/tmp") / f"r2e_tests_{sandbox_id}.tar.gz")
await _exec(f"tar -C / -czf {remote_archive} r2e_tests", timeout=300)
Expand All @@ -320,29 +351,38 @@ async def _run_tests(
state: dict,
test_timeout: int,
) -> str:
"""Upload cached r2e_tests, run run_tests.sh, return test output."""
# Upload cached r2e_tests archive back to sandbox
"""Restore r2e_tests into /testbed if needed, run run_tests.sh, return output.

With ``hide_tests_from_agent=True`` (default), setup() parked the
tests on the host — upload + extract now. With False, setup()
already moved them into ``/testbed/r2e_tests`` in-sandbox, so
there's nothing to restore.
"""
local_archive_path = state.get("r2e_tests_archive_local_path")
if not local_archive_path or not Path(local_archive_path).exists():
raise RuntimeError(
f"Missing cached r2e_tests archive: {local_archive_path}"
if local_archive_path and Path(local_archive_path).exists():
remote_archive = "/tmp/r2e_tests_roundtrip.tar.gz"
await sandbox_client.upload_file(
sandbox_id=sandbox_id,
file_path=remote_archive,
local_file_path=local_archive_path,
timeout=300,
)
remote_archive = "/tmp/r2e_tests_roundtrip.tar.gz"
await sandbox_client.upload_file(
sandbox_id=sandbox_id,
file_path=remote_archive,
local_file_path=local_archive_path,
timeout=300,
)
results = await sandbox_client.execute_command(
sandbox_id, f"tar -C {self.repo_path} -xzf {remote_archive}", timeout=300
)
if results.exit_code != 0:
results = await sandbox_client.execute_command(
sandbox_id,
f"tar -C {self.repo_path} -xzf {remote_archive}",
timeout=300,
)
if results.exit_code != 0:
raise RuntimeError(
f"Failed to extract r2e_tests: exit_code={results.exit_code}"
)
Path(local_archive_path).unlink(missing_ok=True)
del state["r2e_tests_archive_local_path"]
elif self.hide_tests_from_agent:
raise RuntimeError(
f"Failed to extract r2e_tests: exit_code={results.exit_code}"
f"Missing cached r2e_tests archive: {local_archive_path}"
)
Path(local_archive_path).unlink(missing_ok=True)
del state["r2e_tests_archive_local_path"]
# else: fast-path — setup() already placed tests at /testbed/r2e_tests.

# Build env vars string
env_str = " ".join(f"{k}={v}" for k, v in self.get_env_vars().items())
Expand Down
Loading