Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 63 additions & 0 deletions environments/api-design/api-design.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import verifiers as vf
from datasets import Dataset


def load_environment(
    num_train_examples: int = 300,
    num_eval_examples: int = 50,
    **kwargs,
):
    """
    API Design Environment.

    Evaluates an LLM's ability to design RESTful APIs.

    Args:
        num_train_examples: number of training rows to materialize.
        num_eval_examples: number of evaluation rows to materialize.
        **kwargs: ignored; accepted for loader-interface compatibility.

    Returns:
        A configured ``vf.SingleTurnEnv``.
    """

    # BUG FIX: the column must be "question", not "prompt". When a
    # system_prompt is supplied, the framework asserts that an existing
    # "prompt" column holds a list of chat messages, so a plain string under
    # "prompt" raises AssertionError at initialization. A string under
    # "question" is wrapped into a messages list automatically.
    base_train = [
        {
            "question": "Design a REST API for a todo list application with CRUD operations.",
            "answer": "Endpoints: GET /todos, POST /todos, GET /todos/:id, PUT /todos/:id, DELETE /todos/:id. Use JSON for request/response bodies.",
            "domain": "productivity",
        },
        {
            "question": "Design a REST API for user authentication with JWT tokens.",
            "answer": "Endpoints: POST /auth/register, POST /auth/login, POST /auth/refresh, GET /auth/profile. Use JWT for tokens, bcrypt for passwords.",
            "domain": "security",
        },
    ]
    # Repeat the seed rows past the requested size, then slice exactly below.
    train_data = base_train * (num_train_examples // len(base_train) + 1)

    eval_data = [
        {
            "question": "Design a REST API for an e-commerce product catalog.",
            "answer": "Endpoints: GET /products, POST /products, GET /products/:id, PUT /products/:id, DELETE /products/:id, GET /products/search, GET /categories.",
            "domain": "e-commerce",
        },
    ] * (num_eval_examples + 1)

    dataset = Dataset.from_list(train_data[:num_train_examples])
    eval_dataset = Dataset.from_list(eval_data[:num_eval_examples])

    def score_api_design(completion, answer):
        """Heuristic 0..1 score for presence of key REST-design elements."""
        completion_text = completion[-1]["content"] if completion else ""
        text = completion_text.lower()
        score = 0.0
        if "endpoint" in text or "route" in text:
            score += 0.3  # mentions endpoints/routes at all
        if "get" in text and "post" in text:
            score += 0.3  # covers at least read + create verbs
        if "json" in text:
            score += 0.2  # specifies a body format
        if ":" in completion_text:  # path parameters like /todos/:id
            score += 0.2
        return min(1.0, score)

    rubric = vf.Rubric(funcs=[score_api_design])

    return vf.SingleTurnEnv(
        dataset=dataset,
        eval_dataset=eval_dataset,
        system_prompt="You are an API design expert. Design clean, RESTful APIs following best practices.",
        rubric=rubric,
        message_type="chat",
    )
20 changes: 20 additions & 0 deletions environments/api-design/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Packaging metadata for the api-design evaluation environment.
[project]
name = "api-design"
description = "API design evaluation environment"
tags = ["coding", "analysis", "train", "eval"]
version = "0.1.0"
requires-python = ">=3.11"
dependencies = [
    "verifiers>=0.1.8",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

# Ship only the environment module and this manifest in the wheel.
[tool.hatch.build]
include = ["api-design.py", "pyproject.toml"]

# Default evaluation settings consumed by the verifiers CLI.
[tool.verifiers.eval]
num_examples = 20
rollouts_per_example = 1
21 changes: 21 additions & 0 deletions environments/code_review/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Code Review Environment

Evaluates an LLM's ability to review code for bugs, security issues, and improvements.

## Features

- Multi-language support (Python, JavaScript, etc.)
- Bug detection scoring
- Code quality assessment
- Security issue identification

## Usage

```bash
prime env install code-review
prime eval run code-review -m openai/gpt-4.1-mini
```

## Tags

`code-review` `coding` `analysis` `train` `eval`
63 changes: 63 additions & 0 deletions environments/code_review/code_review.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import verifiers as vf
from datasets import Dataset


def load_environment(
    num_train_examples: int = 500,
    num_eval_examples: int = 100,
    **kwargs,
):
    """
    Code Review Environment.

    Evaluates an LLM's ability to review code for bugs, style issues,
    and improvements.

    Args:
        num_train_examples: number of training rows to materialize.
        num_eval_examples: number of evaluation rows to materialize.
        **kwargs: ignored; accepted for loader-interface compatibility.

    Returns:
        A configured ``vf.SingleTurnEnv``.
    """

    # BUG FIX: the column must be "question", not "prompt". When a
    # system_prompt is supplied, the framework asserts that an existing
    # "prompt" column holds a list of chat messages, so a plain string under
    # "prompt" raises AssertionError at initialization. A string under
    # "question" is wrapped into a messages list automatically.
    base_train = [
        {
            "question": "Review this Python code for bugs and improvements:\n\ndef calculate_average(numbers):\n    total = 0\n    for n in numbers:\n        total += n\n    return total / len(numbers)",
            "answer": "Bug: Division by zero if numbers is empty. Should check len(numbers) > 0 first.",
            "language": "python",
        },
        {
            "question": "Review this JavaScript code:\n\nfunction fetchData(url) {\n    fetch(url).then(response => response.json());\n}",
            "answer": "Missing error handling. Should add .catch() for error handling. Also missing return statement.",
            "language": "javascript",
        },
        {
            "question": "Review this Python function:\n\ndef process_data(data):\n    result = []\n    for item in data:\n        if item > 0:\n            result.append(item * 2)\n    return result",
            "answer": "Could use list comprehension: return [item * 2 for item in data if item > 0]. More Pythonic.",
            "language": "python",
        },
    ]
    # Repeat the seed rows past the requested size, then slice exactly below.
    train_data = base_train * (num_train_examples // len(base_train) + 1)

    eval_data = [
        {
            "question": "Review this code for security issues:\n\nimport os\ndef read_file(filename):\n    return open(filename).read()",
            "answer": "Security issue: No path validation. Could allow directory traversal attacks. Should validate filename.",
            "language": "python",
        },
    ] * (num_eval_examples + 1)

    dataset = Dataset.from_list(train_data[:num_train_examples])
    eval_dataset = Dataset.from_list(eval_data[:num_eval_examples])

    def score_review(completion, answer):
        """Score review quality via keyword overlap with the reference answer."""
        completion_text = completion[-1]["content"] if completion else ""
        # Simple keyword matching - can be improved
        answer_keywords = set(answer.lower().split())
        completion_keywords = set(completion_text.lower().split())
        overlap = len(answer_keywords & completion_keywords) / len(answer_keywords) if answer_keywords else 0
        return min(1.0, overlap * 2)  # Scale up

    rubric = vf.Rubric(funcs=[score_review])

    return vf.SingleTurnEnv(
        dataset=dataset,
        eval_dataset=eval_dataset,
        system_prompt="You are an expert code reviewer. Analyze the code and identify bugs, security issues, and potential improvements.",
        rubric=rubric,
        message_type="chat",
    )
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

New environments missing from environments/README.md

Low Severity

Three new environments (code_review, api-design, sql-query) are added to the environments/ folder but environments/README.md is not updated to list them. The project rules require that any PR adding or removing an environment must update environments/README.md to reflect the change under the appropriate category/pattern section.

Additional Locations (2)
Fix in Cursor Fix in Web

Triggered by project rule: BugBot Instructions

Reviewed by Cursor Bugbot for commit 439fa61. Configure here.

20 changes: 20 additions & 0 deletions environments/code_review/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Packaging metadata for the code-review evaluation environment.
[project]
name = "code-review"
description = "Code review environment for evaluating LLM's ability to identify bugs and improvements"
tags = ["code-review", "coding", "analysis", "train", "eval"]
version = "0.1.0"
requires-python = ">=3.11"
dependencies = [
    "verifiers>=0.1.8",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

# Ship only the environment module and this manifest in the wheel.
[tool.hatch.build]
include = ["code_review.py", "pyproject.toml"]

# Default evaluation settings consumed by the verifiers CLI.
[tool.verifiers.eval]
num_examples = 20
rollouts_per_example = 1
20 changes: 20 additions & 0 deletions environments/sql-query/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Packaging metadata for the sql-query evaluation environment.
[project]
name = "sql-query"
description = "SQL query writing environment"
tags = ["coding", "analysis", "train", "eval"]
version = "0.1.0"
requires-python = ">=3.11"
dependencies = [
    "verifiers>=0.1.8",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

# Ship only the environment module and this manifest in the wheel.
[tool.hatch.build]
include = ["sql-query.py", "pyproject.toml"]

# Default evaluation settings consumed by the verifiers CLI.
[tool.verifiers.eval]
num_examples = 20
rollouts_per_example = 1
62 changes: 62 additions & 0 deletions environments/sql-query/sql-query.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import verifiers as vf
from datasets import Dataset


def load_environment(
    num_train_examples: int = 400,
    num_eval_examples: int = 80,
    **kwargs,
):
    """
    SQL Query Environment.

    Evaluates an LLM's ability to write SQL queries.

    Args:
        num_train_examples: number of training rows to materialize.
        num_eval_examples: number of evaluation rows to materialize.
        **kwargs: ignored; accepted for loader-interface compatibility.

    Returns:
        A configured ``vf.SingleTurnEnv``.
    """

    # BUG FIX: the column must be "question", not "prompt". When a
    # system_prompt is supplied, the framework asserts that an existing
    # "prompt" column holds a list of chat messages, so a plain string under
    # "prompt" raises AssertionError at initialization. A string under
    # "question" is wrapped into a messages list automatically.
    base_train = [
        {
            "question": "Write a SQL query to find all users who registered in the last 30 days.",
            "answer": "SELECT * FROM users WHERE created_at >= DATE_SUB(NOW(), INTERVAL 30 DAY);",
            "difficulty": "easy",
        },
        {
            "question": "Write a SQL query to find the top 5 customers by total order amount.",
            "answer": "SELECT c.name, SUM(o.amount) as total FROM customers c JOIN orders o ON c.id = o.customer_id GROUP BY c.id ORDER BY total DESC LIMIT 5;",
            "difficulty": "medium",
        },
    ]
    # Repeat the seed rows past the requested size, then slice exactly below.
    train_data = base_train * (num_train_examples // len(base_train) + 1)

    eval_data = [
        {
            "question": "Write a SQL query to find products that have never been ordered.",
            "answer": "SELECT p.* FROM products p LEFT JOIN order_items oi ON p.id = oi.product_id WHERE oi.product_id IS NULL;",
            "difficulty": "medium",
        },
    ] * (num_eval_examples + 1)

    dataset = Dataset.from_list(train_data[:num_train_examples])
    eval_dataset = Dataset.from_list(eval_data[:num_eval_examples])

    def score_sql(completion, answer):
        """Heuristic 0..1 score for presence of core SQL clauses."""
        completion_text = completion[-1]["content"] if completion else ""
        text = completion_text.lower()
        score = 0.0
        if "select" in text:
            score += 0.3  # has a projection
        if "from" in text:
            score += 0.2  # names a source table
        if "where" in text or "join" in text:
            score += 0.3  # filters or relates tables
        if "group by" in text or "order by" in text:
            score += 0.2  # aggregates or orders results
        return min(1.0, score)

    rubric = vf.Rubric(funcs=[score_sql])

    return vf.SingleTurnEnv(
        dataset=dataset,
        eval_dataset=eval_dataset,
        system_prompt="You are a SQL expert. Write efficient, correct SQL queries.",
        rubric=rubric,
        message_type="chat",
    )