-
Notifications
You must be signed in to change notification settings - Fork 535
feat: add code-review environment #1152
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,63 @@ | ||
| import verifiers as vf | ||
| from datasets import Dataset | ||
|
|
||
|
|
||
def load_environment(
    num_train_examples: int = 300,
    num_eval_examples: int = 50,
    **kwargs,
):
    """
    API Design Environment

    Evaluates LLM's ability to design RESTful APIs.

    Args:
        num_train_examples: Number of training rows to materialize.
        num_eval_examples: Number of evaluation rows to materialize.
        **kwargs: Accepted for loader-interface compatibility; unused.

    Returns:
        A configured ``vf.SingleTurnEnv``.
    """
    # Use "question" (not "prompt") for string-valued prompts: when a
    # system_prompt is also supplied, the framework asserts that an existing
    # "prompt" column is already a list of chat messages, so a plain string
    # under "prompt" raises AssertionError at init. "question" columns are
    # wrapped into a messages list automatically.
    train_data = [
        {
            "question": "Design a REST API for a todo list application with CRUD operations.",
            "answer": "Endpoints: GET /todos, POST /todos, GET /todos/:id, PUT /todos/:id, DELETE /todos/:id. Use JSON for request/response bodies.",
            "domain": "productivity"
        },
        {
            "question": "Design a REST API for user authentication with JWT tokens.",
            "answer": "Endpoints: POST /auth/register, POST /auth/login, POST /auth/refresh, GET /auth/profile. Use JWT for tokens, bcrypt for passwords.",
            "domain": "security"
        },
    ] * (num_train_examples // 2 + 1)  # over-replicate, then slice to exact size below

    eval_data = [
        {
            "question": "Design a REST API for an e-commerce product catalog.",
            "answer": "Endpoints: GET /products, POST /products, GET /products/:id, PUT /products/:id, DELETE /products/:id, GET /products/search, GET /categories.",
            "domain": "e-commerce"
        },
    ] * (num_eval_examples + 1)

    dataset = Dataset.from_list(train_data[:num_train_examples])
    eval_dataset = Dataset.from_list(eval_data[:num_eval_examples])

    def score_api_design(completion, answer):
        """Heuristic 0-1 score for presence of key REST API design elements."""
        completion_text = completion[-1]["content"] if completion else ""
        score = 0.0
        # Mentions endpoints/routes at all.
        if "endpoint" in completion_text.lower() or "route" in completion_text.lower():
            score += 0.3
        # Covers both read and write HTTP verbs.
        if "get" in completion_text.lower() and "post" in completion_text.lower():
            score += 0.3
        # Specifies a request/response body format.
        if "json" in completion_text.lower():
            score += 0.2
        # Path parameters (e.g. /todos/:id).
        if ":" in completion_text:
            score += 0.2
        return min(1.0, score)

    rubric = vf.Rubric(funcs=[score_api_design])

    return vf.SingleTurnEnv(
        dataset=dataset,
        eval_dataset=eval_dataset,
        system_prompt="You are an API design expert. Design clean, RESTful APIs following best practices.",
        rubric=rubric,
        message_type="chat",
    )
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,20 @@ | ||
# Packaging metadata for the api-design verifiers environment.
[project]
name = "api-design"
description = "API design evaluation environment"
tags = ["coding", "analysis", "train", "eval"]
version = "0.1.0"
requires-python = ">=3.11"
dependencies = [
    "verifiers>=0.1.8",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build]
# NOTE(review): hyphenated module filename — confirm the environment file is
# really named "api-design.py" (the sibling code-review env uses an
# underscored filename, "code_review.py").
include = ["api-design.py", "pyproject.toml"]

# Default evaluation settings consumed by the verifiers eval runner.
[tool.verifiers.eval]
num_examples = 20
rollouts_per_example = 1
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,21 @@ | ||
# Code Review Environment

Evaluates LLM's ability to review code for bugs, security issues, and improvements.

## Features

- Multi-language support (Python, JavaScript, etc.)
- Bug detection scoring
- Code quality assessment
- Security issue identification

## Usage

```bash
prime env install code-review
prime eval run code-review -m openai/gpt-4.1-mini
```

## Tags

`code-review` `coding` `analysis` `train` `eval`
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,63 @@ | ||
| import verifiers as vf | ||
| from datasets import Dataset | ||
|
|
||
|
|
||
def load_environment(
    num_train_examples: int = 500,
    num_eval_examples: int = 100,
    **kwargs,
):
    """
    Code Review Environment

    Evaluates LLM's ability to review code for bugs, style issues, and improvements.

    Args:
        num_train_examples: Number of training rows to materialize.
        num_eval_examples: Number of evaluation rows to materialize.
        **kwargs: Accepted for loader-interface compatibility; unused.

    Returns:
        A configured ``vf.SingleTurnEnv``.
    """
    # Use "question" (not "prompt") for string-valued prompts: when a
    # system_prompt is also supplied, the framework asserts that an existing
    # "prompt" column is already a list of chat messages, so a plain string
    # under "prompt" raises AssertionError at init. "question" columns are
    # wrapped into a messages list automatically.
    train_data = [
        {
            "question": "Review this Python code for bugs and improvements:\n\ndef calculate_average(numbers):\n    total = 0\n    for n in numbers:\n        total += n\n    return total / len(numbers)",
            "answer": "Bug: Division by zero if numbers is empty. Should check len(numbers) > 0 first.",
            "language": "python"
        },
        {
            "question": "Review this JavaScript code:\n\nfunction fetchData(url) {\n    fetch(url).then(response => response.json());\n}",
            "answer": "Missing error handling. Should add .catch() for error handling. Also missing return statement.",
            "language": "javascript"
        },
        {
            "question": "Review this Python function:\n\ndef process_data(data):\n    result = []\n    for item in data:\n        if item > 0:\n            result.append(item * 2)\n    return result",
            "answer": "Could use list comprehension: return [item * 2 for item in data if item > 0]. More Pythonic.",
            "language": "python"
        },
    ] * (num_train_examples // 3 + 1)  # over-replicate, then slice to exact size below

    eval_data = [
        {
            "question": "Review this code for security issues:\n\nimport os\ndef read_file(filename):\n    return open(filename).read()",
            "answer": "Security issue: No path validation. Could allow directory traversal attacks. Should validate filename.",
            "language": "python"
        },
    ] * (num_eval_examples + 1)

    dataset = Dataset.from_list(train_data[:num_train_examples])
    eval_dataset = Dataset.from_list(eval_data[:num_eval_examples])

    def score_review(completion, answer):
        """Score review quality by keyword overlap with the reference answer."""
        completion_text = completion[-1]["content"] if completion else ""
        # Simple keyword matching - can be improved.
        answer_keywords = set(answer.lower().split())
        completion_keywords = set(completion_text.lower().split())
        overlap = len(answer_keywords & completion_keywords) / len(answer_keywords) if answer_keywords else 0
        return min(1.0, overlap * 2)  # scale up: 50% keyword overlap -> full score

    rubric = vf.Rubric(funcs=[score_review])

    return vf.SingleTurnEnv(
        dataset=dataset,
        eval_dataset=eval_dataset,
        system_prompt="You are an expert code reviewer. Analyze the code and identify bugs, security issues, and potential improvements.",
        rubric=rubric,
        message_type="chat",
    )
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. New environments missing from
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,20 @@ | ||
| [project] | ||
| name = "code-review" | ||
| description = "Code review environment for evaluating LLM's ability to identify bugs and improvements" | ||
| tags = ["code-review", "coding", "analysis", "train", "eval"] | ||
| version = "0.1.0" | ||
| requires-python = ">=3.11" | ||
| dependencies = [ | ||
| "verifiers>=0.1.8", | ||
| ] | ||
|
|
||
| [build-system] | ||
| requires = ["hatchling"] | ||
| build-backend = "hatchling.build" | ||
|
|
||
| [tool.hatch.build] | ||
| include = ["code_review.py", "pyproject.toml"] | ||
|
|
||
| [tool.verifiers.eval] | ||
| num_examples = 20 | ||
| rollouts_per_example = 1 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,20 @@ | ||
| [project] | ||
| name = "sql-query" | ||
| description = "SQL query writing environment" | ||
| tags = ["coding", "analysis", "train", "eval"] | ||
| version = "0.1.0" | ||
| requires-python = ">=3.11" | ||
| dependencies = [ | ||
| "verifiers>=0.1.8", | ||
| ] | ||
|
|
||
| [build-system] | ||
| requires = ["hatchling"] | ||
| build-backend = "hatchling.build" | ||
|
|
||
| [tool.hatch.build] | ||
| include = ["sql-query.py", "pyproject.toml"] | ||
|
|
||
| [tool.verifiers.eval] | ||
| num_examples = 20 | ||
| rollouts_per_example = 1 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,62 @@ | ||
| import verifiers as vf | ||
| from datasets import Dataset | ||
|
|
||
|
|
||
def load_environment(
    num_train_examples: int = 400,
    num_eval_examples: int = 80,
    **kwargs,
):
    """
    SQL Query Environment

    Evaluates LLM's ability to write SQL queries.

    Args:
        num_train_examples: Number of training rows to materialize.
        num_eval_examples: Number of evaluation rows to materialize.
        **kwargs: Accepted for loader-interface compatibility; unused.

    Returns:
        A configured ``vf.SingleTurnEnv``.
    """
    # Use "question" (not "prompt") for string-valued prompts: when a
    # system_prompt is also supplied, the framework asserts that an existing
    # "prompt" column is already a list of chat messages, so a plain string
    # under "prompt" raises AssertionError at init. "question" columns are
    # wrapped into a messages list automatically.
    train_data = [
        {
            "question": "Write a SQL query to find all users who registered in the last 30 days.",
            "answer": "SELECT * FROM users WHERE created_at >= DATE_SUB(NOW(), INTERVAL 30 DAY);",
            "difficulty": "easy"
        },
        {
            "question": "Write a SQL query to find the top 5 customers by total order amount.",
            "answer": "SELECT c.name, SUM(o.amount) as total FROM customers c JOIN orders o ON c.id = o.customer_id GROUP BY c.id ORDER BY total DESC LIMIT 5;",
            "difficulty": "medium"
        },
    ] * (num_train_examples // 2 + 1)  # over-replicate, then slice to exact size below

    eval_data = [
        {
            "question": "Write a SQL query to find products that have never been ordered.",
            "answer": "SELECT p.* FROM products p LEFT JOIN order_items oi ON p.id = oi.product_id WHERE oi.product_id IS NULL;",
            "difficulty": "medium"
        },
    ] * (num_eval_examples + 1)

    dataset = Dataset.from_list(train_data[:num_train_examples])
    eval_dataset = Dataset.from_list(eval_data[:num_eval_examples])

    def score_sql(completion, answer):
        """Heuristic 0-1 score for presence of core SQL clauses."""
        completion_text = completion[-1]["content"] if completion else ""
        score = 0.0
        if "select" in completion_text.lower():
            score += 0.3
        if "from" in completion_text.lower():
            score += 0.2
        # Filtering or joining tables.
        if "where" in completion_text.lower() or "join" in completion_text.lower():
            score += 0.3
        # Aggregation or ordering.
        if "group by" in completion_text.lower() or "order by" in completion_text.lower():
            score += 0.2
        return min(1.0, score)

    rubric = vf.Rubric(funcs=[score_sql])

    return vf.SingleTurnEnv(
        dataset=dataset,
        eval_dataset=eval_dataset,
        system_prompt="You are a SQL expert. Write efficient, correct SQL queries.",
        rubric=rubric,
        message_type="chat",
    )


There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Dataset uses a plain `prompt` string, causing an assertion crash — High Severity
All three new environments use
"prompt"as the dataset column key with a plain string value, but also pass asystem_prompttoSingleTurnEnv. The framework's_ensure_promptmethod, when it finds apromptcolumn already present andsystem_promptis set, asserts thatpromptmust be a list of messages — causing anAssertionErrorat initialization. The column key needs to be"question"instead of"prompt"so the framework properly wraps the string into a messages list. All existing environments that use string-valued prompts use"question"for this reason.Additional Locations (2)
environments/api-design/api-design.py#L15-L35environments/sql-query/sql-query.py#L15-L35Reviewed by Cursor Bugbot for commit 439fa61. Configure here.