Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 63 additions & 0 deletions environments/api-design/api-design.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import verifiers as vf
from datasets import Dataset


def load_environment(
    num_train_examples: int = 300,
    num_eval_examples: int = 50,
    **kwargs,
):
    """
    API Design Environment.

    Evaluates an LLM's ability to design RESTful APIs.

    Args:
        num_train_examples: number of training rows to materialize.
        num_eval_examples: number of evaluation rows to materialize.
        **kwargs: ignored; accepted for loader-interface compatibility.

    Returns:
        A configured ``vf.SingleTurnEnv``.
    """

    # BUG FIX: the column must be "question", not "prompt". When a
    # system_prompt is supplied, the framework asserts that an existing
    # "prompt" column holds a list of chat messages, so a plain string under
    # "prompt" raises AssertionError at initialization. A string under
    # "question" is wrapped into a messages list automatically.
    base_train = [
        {
            "question": "Design a REST API for a todo list application with CRUD operations.",
            "answer": "Endpoints: GET /todos, POST /todos, GET /todos/:id, PUT /todos/:id, DELETE /todos/:id. Use JSON for request/response bodies.",
            "domain": "productivity",
        },
        {
            "question": "Design a REST API for user authentication with JWT tokens.",
            "answer": "Endpoints: POST /auth/register, POST /auth/login, POST /auth/refresh, GET /auth/profile. Use JWT for tokens, bcrypt for passwords.",
            "domain": "security",
        },
    ]
    # Repeat the seed rows past the requested size, then slice exactly below.
    train_data = base_train * (num_train_examples // len(base_train) + 1)

    eval_data = [
        {
            "question": "Design a REST API for an e-commerce product catalog.",
            "answer": "Endpoints: GET /products, POST /products, GET /products/:id, PUT /products/:id, DELETE /products/:id, GET /products/search, GET /categories.",
            "domain": "e-commerce",
        },
    ] * (num_eval_examples + 1)

    dataset = Dataset.from_list(train_data[:num_train_examples])
    eval_dataset = Dataset.from_list(eval_data[:num_eval_examples])

    def score_api_design(completion, answer):
        """Heuristic 0..1 score for presence of key REST-design elements."""
        completion_text = completion[-1]["content"] if completion else ""
        text = completion_text.lower()
        score = 0.0
        if "endpoint" in text or "route" in text:
            score += 0.3  # mentions endpoints/routes at all
        if "get" in text and "post" in text:
            score += 0.3  # covers at least read + create verbs
        if "json" in text:
            score += 0.2  # specifies a body format
        if ":" in completion_text:  # path parameters like /todos/:id
            score += 0.2
        return min(1.0, score)

    rubric = vf.Rubric(funcs=[score_api_design])

    return vf.SingleTurnEnv(
        dataset=dataset,
        eval_dataset=eval_dataset,
        system_prompt="You are an API design expert. Design clean, RESTful APIs following best practices.",
        rubric=rubric,
        message_type="chat",
    )
20 changes: 20 additions & 0 deletions environments/api-design/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Packaging metadata for the api-design evaluation environment.
[project]
name = "api-design"
description = "API design evaluation environment"
tags = ["coding", "analysis", "train", "eval"]
version = "0.1.0"
requires-python = ">=3.11"
dependencies = [
    "verifiers>=0.1.8",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

# Ship only the environment module and this manifest in the wheel.
[tool.hatch.build]
include = ["api-design.py", "pyproject.toml"]

# Default evaluation settings consumed by the verifiers CLI.
[tool.verifiers.eval]
num_examples = 20
rollouts_per_example = 1
21 changes: 21 additions & 0 deletions environments/code_review/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Code Review Environment

Evaluates an LLM's ability to review code for bugs, security issues, and improvements.

## Features

- Multi-language support (Python, JavaScript, etc.)
- Bug detection scoring
- Code quality assessment
- Security issue identification

## Usage

```bash
prime env install code-review
prime eval run code-review -m openai/gpt-4.1-mini
```

## Tags

`code-review` `coding` `analysis` `train` `eval`
63 changes: 63 additions & 0 deletions environments/code_review/code_review.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import verifiers as vf
from datasets import Dataset


def load_environment(
    num_train_examples: int = 500,
    num_eval_examples: int = 100,
    **kwargs,
):
    """
    Code Review Environment.

    Evaluates an LLM's ability to review code for bugs, style issues,
    and improvements.

    Args:
        num_train_examples: number of training rows to materialize.
        num_eval_examples: number of evaluation rows to materialize.
        **kwargs: ignored; accepted for loader-interface compatibility.

    Returns:
        A configured ``vf.SingleTurnEnv``.
    """

    # BUG FIX: the column must be "question", not "prompt". When a
    # system_prompt is supplied, the framework asserts that an existing
    # "prompt" column holds a list of chat messages, so a plain string under
    # "prompt" raises AssertionError at initialization. A string under
    # "question" is wrapped into a messages list automatically.
    base_train = [
        {
            "question": "Review this Python code for bugs and improvements:\n\ndef calculate_average(numbers):\n    total = 0\n    for n in numbers:\n        total += n\n    return total / len(numbers)",
            "answer": "Bug: Division by zero if numbers is empty. Should check len(numbers) > 0 first.",
            "language": "python",
        },
        {
            "question": "Review this JavaScript code:\n\nfunction fetchData(url) {\n    fetch(url).then(response => response.json());\n}",
            "answer": "Missing error handling. Should add .catch() for error handling. Also missing return statement.",
            "language": "javascript",
        },
        {
            "question": "Review this Python function:\n\ndef process_data(data):\n    result = []\n    for item in data:\n        if item > 0:\n            result.append(item * 2)\n    return result",
            "answer": "Could use list comprehension: return [item * 2 for item in data if item > 0]. More Pythonic.",
            "language": "python",
        },
    ]
    # Repeat the seed rows past the requested size, then slice exactly below.
    train_data = base_train * (num_train_examples // len(base_train) + 1)

    eval_data = [
        {
            "question": "Review this code for security issues:\n\nimport os\ndef read_file(filename):\n    return open(filename).read()",
            "answer": "Security issue: No path validation. Could allow directory traversal attacks. Should validate filename.",
            "language": "python",
        },
    ] * (num_eval_examples + 1)

    dataset = Dataset.from_list(train_data[:num_train_examples])
    eval_dataset = Dataset.from_list(eval_data[:num_eval_examples])

    def score_review(completion, answer):
        """Score review quality via keyword overlap with the reference answer."""
        completion_text = completion[-1]["content"] if completion else ""
        # Simple keyword matching - can be improved
        answer_keywords = set(answer.lower().split())
        completion_keywords = set(completion_text.lower().split())
        overlap = len(answer_keywords & completion_keywords) / len(answer_keywords) if answer_keywords else 0
        return min(1.0, overlap * 2)  # Scale up

    rubric = vf.Rubric(funcs=[score_review])

    return vf.SingleTurnEnv(
        dataset=dataset,
        eval_dataset=eval_dataset,
        system_prompt="You are an expert code reviewer. Analyze the code and identify bugs, security issues, and potential improvements.",
        rubric=rubric,
        message_type="chat",
    )
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

New environments missing from environments/README.md

Low Severity

Three new environments (code_review, api-design, sql-query) are added to the environments/ folder but environments/README.md is not updated to list them. The project rules require that any PR adding or removing an environment must update environments/README.md to reflect the change under the appropriate category/pattern section.

Additional Locations (2)
Fix in Cursor Fix in Web

Triggered by project rule: BugBot Instructions

Reviewed by Cursor Bugbot for commit 439fa61. Configure here.

20 changes: 20 additions & 0 deletions environments/code_review/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Packaging metadata for the code-review evaluation environment.
[project]
name = "code-review"
description = "Code review environment for evaluating LLM's ability to identify bugs and improvements"
tags = ["code-review", "coding", "analysis", "train", "eval"]
version = "0.1.0"
requires-python = ">=3.11"
dependencies = [
    "verifiers>=0.1.8",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

# Ship only the environment module and this manifest in the wheel.
[tool.hatch.build]
include = ["code_review.py", "pyproject.toml"]

# Default evaluation settings consumed by the verifiers CLI.
[tool.verifiers.eval]
num_examples = 20
rollouts_per_example = 1
20 changes: 20 additions & 0 deletions environments/sql-query/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Packaging metadata for the sql-query evaluation environment.
[project]
name = "sql-query"
description = "SQL query writing environment"
tags = ["coding", "analysis", "train", "eval"]
version = "0.1.0"
requires-python = ">=3.11"
dependencies = [
    "verifiers>=0.1.8",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

# Ship only the environment module and this manifest in the wheel.
[tool.hatch.build]
include = ["sql-query.py", "pyproject.toml"]

# Default evaluation settings consumed by the verifiers CLI.
[tool.verifiers.eval]
num_examples = 20
rollouts_per_example = 1
62 changes: 62 additions & 0 deletions environments/sql-query/sql-query.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import verifiers as vf
from datasets import Dataset


def load_environment(
    num_train_examples: int = 400,
    num_eval_examples: int = 80,
    **kwargs,
):
    """
    SQL Query Environment.

    Evaluates an LLM's ability to write SQL queries.

    Args:
        num_train_examples: number of training rows to materialize.
        num_eval_examples: number of evaluation rows to materialize.
        **kwargs: ignored; accepted for loader-interface compatibility.

    Returns:
        A configured ``vf.SingleTurnEnv``.
    """

    # BUG FIX: the column must be "question", not "prompt". When a
    # system_prompt is supplied, the framework asserts that an existing
    # "prompt" column holds a list of chat messages, so a plain string under
    # "prompt" raises AssertionError at initialization. A string under
    # "question" is wrapped into a messages list automatically.
    base_train = [
        {
            "question": "Write a SQL query to find all users who registered in the last 30 days.",
            "answer": "SELECT * FROM users WHERE created_at >= DATE_SUB(NOW(), INTERVAL 30 DAY);",
            "difficulty": "easy",
        },
        {
            "question": "Write a SQL query to find the top 5 customers by total order amount.",
            "answer": "SELECT c.name, SUM(o.amount) as total FROM customers c JOIN orders o ON c.id = o.customer_id GROUP BY c.id ORDER BY total DESC LIMIT 5;",
            "difficulty": "medium",
        },
    ]
    # Repeat the seed rows past the requested size, then slice exactly below.
    train_data = base_train * (num_train_examples // len(base_train) + 1)

    eval_data = [
        {
            "question": "Write a SQL query to find products that have never been ordered.",
            "answer": "SELECT p.* FROM products p LEFT JOIN order_items oi ON p.id = oi.product_id WHERE oi.product_id IS NULL;",
            "difficulty": "medium",
        },
    ] * (num_eval_examples + 1)

    dataset = Dataset.from_list(train_data[:num_train_examples])
    eval_dataset = Dataset.from_list(eval_data[:num_eval_examples])

    def score_sql(completion, answer):
        """Heuristic 0..1 score for presence of core SQL clauses."""
        completion_text = completion[-1]["content"] if completion else ""
        text = completion_text.lower()
        score = 0.0
        if "select" in text:
            score += 0.3  # has a projection
        if "from" in text:
            score += 0.2  # names a source table
        if "where" in text or "join" in text:
            score += 0.3  # filters or relates tables
        if "group by" in text or "order by" in text:
            score += 0.2  # aggregates or orders results
        return min(1.0, score)

    rubric = vf.Rubric(funcs=[score_sql])

    return vf.SingleTurnEnv(
        dataset=dataset,
        eval_dataset=eval_dataset,
        system_prompt="You are a SQL expert. Write efficient, correct SQL queries.",
        rubric=rubric,
        message_type="chat",
    )