Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/evaluation.md
Original file line number Diff line number Diff line change
Expand Up @@ -208,8 +208,8 @@ When evaluating multiple environments, the display shows an overview panel at th
| Flag | Short | Default | Description |
|------|-------|---------|-------------|
| `--verbose` | `-v` | false | Enable debug logging |
| `--fullscreen` | `-f` | false | Use alternate screen buffer (fullscreen) for the Rich display |
| `--disable-tui` | `-d` | false | Disable Rich display; use normal logging and tqdm progress |
| `--tui` | `-u` | false | Use alternate screen mode (TUI) for display |
| `--debug` | `-d` | false | Disable Rich display; use normal logging and tqdm progress |
| `--abbreviated-summary` | `-A` | false | Abbreviated summary: show settings and stats, skip example prompts |
| `--output-dir` | `-o` | — | Custom output directory for evaluation results and logs |
| `--save-results` | `-s` | false | Save results to disk |
Expand Down
4 changes: 2 additions & 2 deletions tests/test_eval_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,8 @@ def _run_cli(
"hf_hub_dataset_name": "",
"extra_env_kwargs": {},
"max_retries": 0,
"fullscreen": False,
"disable_tui": False,
"tui": False,
"debug": False,
"abbreviated_summary": False,
"heartbeat_url": None,
}
Expand Down
23 changes: 8 additions & 15 deletions verifiers/scripts/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,14 +392,14 @@ def build_parser() -> argparse.ArgumentParser:
help='Extra environment as JSON object (e.g., \'{"key": "value", "num": 42}\'). Passed to environment constructor.',
)
parser.add_argument(
"--fullscreen",
"-f",
"--tui",
"-u",
default=False,
action="store_true",
help="Use fullscreen (alternate-screen) mode for the Rich live evaluation display",
help="Use TUI mode for live evaluation display",
)
parser.add_argument(
"--disable-tui",
"--debug",
"-d",
default=False,
action="store_true",
Expand Down Expand Up @@ -449,14 +449,7 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
def main(argv: list[str] | None = None):
args = parse_args(argv)

if args.disable_tui and args.fullscreen:
raise SystemExit(
"error: --disable-tui and --fullscreen are mutually exclusive "
"(--disable-tui turns off the Rich display entirely; --fullscreen only "
"controls whether the Rich display uses the alternate screen buffer)."
)

if args.disable_tui: # only set up console logging when TUI is disabled
if args.debug: # only set up console logging in debug mode
setup_logging(get_log_level(args.verbose))

# Build raw configs: both paths produce list[dict]
Expand Down Expand Up @@ -738,7 +731,7 @@ def build_eval_config(raw: dict) -> EvalConfig:
num_workers=raw.get("num_workers", "auto"),
disable_env_server=raw.get("disable_env_server", False),
verbose=raw.get("verbose", False),
disable_tui=raw.get("disable_tui", False),
debug=raw.get("debug", False),
state_columns=raw.get("state_columns", []),
save_results=raw.get("save_results", False),
resume_path=resume_path,
Expand Down Expand Up @@ -767,13 +760,13 @@ def build_eval_config(raw: dict) -> EvalConfig:
eval_run_config = EvalRunConfig(
evals=eval_configs, heartbeat_url=args.heartbeat_url
)
if args.disable_tui:
if args.debug:
asyncio.run(run_evaluations(eval_run_config))
else:
asyncio.run(
run_evaluations_tui(
eval_run_config,
fullscreen=args.fullscreen,
tui_mode=args.tui,
compact=args.abbreviated_summary,
)
)
Expand Down
2 changes: 1 addition & 1 deletion verifiers/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -534,7 +534,7 @@ class EvalConfig(BaseModel):
disable_env_server: bool = False
# logging
verbose: bool = False
disable_tui: bool = False
debug: bool = False
# saving
output_dir: str | None = None
state_columns: list[str] | None = None
Expand Down
2 changes: 1 addition & 1 deletion verifiers/utils/display_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ def refresh(self) -> None:

def get_log_hint(self) -> Text | None:
"""Return an optional hint for viewing full logs."""
return Text("full logs: --disable-tui", style="dim")
return Text("full logs: --debug", style="dim")

def _make_log_panel(self) -> Panel:
"""Create a panel showing recent log messages with placeholder lines."""
Expand Down
13 changes: 6 additions & 7 deletions verifiers/utils/eval_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,7 @@ def load_toml_config(
"disable_env_server",
# logging
"verbose",
"disable_tui",
"debug",
# saving
"output_dir",
"state_columns",
Expand Down Expand Up @@ -818,7 +818,7 @@ async def run_evaluation(
num_workers=num_workers,
log_level=get_log_level(config.verbose),
log_dir=log_dir,
console_logging=config.disable_tui,
console_logging=config.debug,
)
if on_log_file is not None:
from verifiers.serve import EnvServer
Expand Down Expand Up @@ -916,14 +916,13 @@ async def run_evaluations(config: EvalRunConfig) -> None:


async def run_evaluations_tui(
config: EvalRunConfig, fullscreen: bool = False, compact: bool = False
config: EvalRunConfig, tui_mode: bool = True, compact: bool = False
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The default value of the `tui_mode` parameter was changed from `False` to `True`

Medium Severity

The run_evaluations_tui function's tui_mode parameter default was changed from False (as fullscreen had before this revert) to True. This means any caller that doesn't explicitly pass tui_mode will now get alternate-screen (fullscreen) mode by default, which is the opposite of the previous behavior. The CLI call site in eval.py passes args.tui explicitly so it's unaffected, but this changes the public API contract for anyone calling the function directly.

Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit cd863d7. Configure here.

) -> None:
"""Run multi-environment evaluation with a Rich display.

Args:
config: Evaluation run configuration.
fullscreen: If True, use alternate screen buffer (--fullscreen flag).
If False, refresh in-place.
tui_mode: If True, use alternate screen (--tui flag). If False, refresh in-place.
compact: If True, show compact summary (settings + stats, skip example prompts).
"""
from verifiers.utils.eval_display import EvalDisplay, is_tty
Expand All @@ -940,7 +939,7 @@ async def run_evaluations_tui(

heart = Heartbeat(config.heartbeat_url)

display = EvalDisplay(config.evals, screen=fullscreen, compact=compact)
display = EvalDisplay(config.evals, screen=tui_mode, compact=compact)

async def run_with_progress(
env_config: EvalConfig, env_idx: int
Expand Down Expand Up @@ -1044,7 +1043,7 @@ def refresh_loop() -> None:
)

display.refresh()
if fullscreen:
if tui_mode:
await display.wait_for_exit()
finally:
refresh_stop.set()
Expand Down
Loading