Merge pull request #1 from fswair/test_durations

fswair · web-flow · commit 67ae47ea25bb · 2025-11-29T16:45:24.000+03:00
v0.2.5 - improvements & fixes
diff --git a/.gitignore b/.gitignore
@@ -31,7 +31,9 @@ env/
 src/vowel/ai.py
 AGENT_*.md
 tests/
+functions/
 eval_gen.py
+demo.txt
 
 # IDE
 .vscode/
diff --git a/README.md b/README.md
@@ -582,7 +582,150 @@ pattern: "^prefix"
 pattern: "suffix$"
 ```
 
-### 8. LLM Judge Evaluator
+### 8. Raises Evaluator (Exception Testing)
+
+Tests that a function raises a specific exception. Similar to pytest's `pytest.raises`, this evaluator verifies both the exception type and optionally the exception message pattern. This is a **case-level only** evaluator.
+
+```yaml
+function_name:
+  dataset:
+    - case:
+        input: invalid_value
+        raises: ExceptionType # Required: exception type name
+        match: "pattern" # Optional: regex pattern for exception message
+```
+
+**Important Notes:**
+
+- `raises` is **case-level only** - cannot be used as a global evaluator
+- `match` can only be used together with `raises`
+- When `raises` is specified, the test expects an exception and will fail if the function returns normally
+- Global evaluators (type checks, assertions, etc.) are automatically skipped for exception cases
+
+**Examples:**
+
+```yaml
+# Basic exception testing
+calculate_discount:
+  evals:
+    IsFloat:
+      type: float
+  dataset:
+    - case:
+        id: "valid_calculation"
+        inputs: [100.0, 20.0]
+        expected: 80.0
+
+    - case:
+        id: "negative_price"
+        inputs: [-100.0, 20.0]
+        raises: ValueError
+        match: "must be positive" # Checks exception message
+
+    - case:
+        id: "invalid_discount"
+        inputs: [100.0, 150.0]
+        raises: ValueError # Just checks type, not message
+
+# Division by zero
+divide:
+  evals:
+    IsNumber:
+      type: "int | float"
+  dataset:
+    - case:
+        inputs: [10, 2]
+        expected: 5.0
+
+    - case:
+        inputs: [10, 0]
+        raises: ZeroDivisionError
+
+# Type validation
+parse_age:
+  dataset:
+    - case:
+        input: "25"
+        expected: 25
+
+    - case:
+        input: "invalid"
+        raises: ValueError
+        match: "invalid literal"
+
+    - case:
+        input: -5
+        raises: ValueError
+        match: "age must be positive"
+
+# Key errors
+get_config_value:
+  dataset:
+    - case:
+        input: "api_key"
+        expected: "secret_key_123"
+
+    - case:
+        input: "nonexistent_key"
+        raises: KeyError
+        match: "nonexistent_key"
+
+# Multiple exception types
+process_data:
+  dataset:
+    - case:
+        input: { "valid": "data" }
+        expected: "processed"
+
+    - case:
+        input: null
+        raises: TypeError
+        match: "NoneType"
+
+    - case:
+        input: []
+        raises: ValueError
+        match: "empty"
+
+    - case:
+        input: { "invalid": "format" }
+        raises: KeyError
+
+# Index errors
+get_element:
+  dataset:
+    - case:
+        inputs: [[1, 2, 3], 1]
+        expected: 2
+
+    - case:
+        inputs: [[1, 2, 3], 10]
+        raises: IndexError
+        match: "out of range"
+```
+
+**How it works:**
+
+1. When `raises` is present in a case, the framework wraps the function execution in a `try`/`except` block
+2. If an exception is raised:
+   - Checks if exception type matches `raises`
+   - If `match` is provided, validates exception message against the regex pattern
+   - Global evaluators are skipped (they would fail on exception dict)
+3. If no exception is raised when `raises` is specified, the test fails
+4. If exception type doesn't match, the test fails and shows actual vs expected
+
+**Common Exception Types:**
+
+- `ValueError`: Invalid value/argument
+- `TypeError`: Wrong type
+- `KeyError`: Missing dictionary key
+- `IndexError`: List/array index out of range
+- `ZeroDivisionError`: Division by zero
+- `AttributeError`: Missing attribute
+- `FileNotFoundError`: File doesn't exist
+- `RuntimeError`: Generic runtime error
+
+### 9. LLM Judge Evaluator
 
 Uses a Language Model to evaluate outputs based on a custom rubric. Ideal for semantic evaluation, quality assessment, and cases where rule-based checking is insufficient.
 
diff --git a/examples/functions.py b/examples/functions.py
@@ -21,7 +21,9 @@ def validate_email(email: str) -> bool:
     return bool(re.match(pattern, email))
 
 
-def calculate_discount(price: float, discount_percent: float) -> float:
+async def calculate_discount(price: float, discount_percent: float) -> float:
+    """Return ``price`` reduced by ``discount_percent``, rounded to two decimals.
+
+    Declared ``async``, so calling it produces a coroutine that must be awaited
+    to obtain the result (or the exception).
+
+    Raises:
+        ValueError: If ``price`` is below zero (a price of exactly 0 is
+            accepted), or if ``discount_percent`` is outside the range 0-100.
+    """
+    # The price check runs first, so a negative price is reported even when
+    # the discount is out of range as well.
+    if price < 0:
+        raise ValueError("Price must be positive")
     if discount_percent < 0 or discount_percent > 100:
         raise ValueError("Discount must be between 0 and 100")
     return round(price * (1 - discount_percent / 100), 2)
@@ -152,3 +154,18 @@ async def _run():
         return result.output
 
     return asyncio.run(_run())
+
+
+def binary_search(sorted_list: list[int], target: int) -> int:
+    """Binary search for target in sorted_list (ascending). Returns index or -1 if not found.
+
+    Args:
+        sorted_list: Integers in ascending order. The ordering is assumed,
+            not checked.
+        target: Value to look for.
+
+    Returns:
+        An index ``i`` such that ``sorted_list[i] == target``, or ``-1`` when
+        no element matches (an empty list yields ``-1``). If the value occurs
+        more than once, the first index probed is returned, which is not
+        necessarily the first occurrence.
+    """
+    # Inclusive bounds of the range that may still contain target.
+    low = 0
+    high = len(sorted_list) - 1
+    while low <= high:
+        mid = low + (high - low) // 2
+        if sorted_list[mid] == target:
+            return mid
+        elif sorted_list[mid] < target:
+            # target, if present, lies to the right of mid.
+            low = mid + 1
+        else:
+            # target, if present, lies to the left of mid.
+            high = mid - 1
+    return -1
diff --git a/examples/test_raises.yml b/examples/test_raises.yml
@@ -0,0 +1,21 @@
+examples.functions.calculate_discount:
+  evals:
+    IsFloat:
+      type: float
+  dataset:
+    - case:
+        id: "valid_discount"
+        inputs: [100.0, 20.0]
+        expected: 80.0
+
+    - case:
+        id: "negative_price_raises"
+        inputs: [-100.0, 20.0]
+        raises: ValueError
+        match: "must be positive"
+
+    - case:
+        id: "invalid_discount_raises"
+        inputs: [100.0, 150.0] # valid price, discount above 100
+        raises: ValueError
+        match: "between 0 and 100"
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "vowel"
-version = "0.2.4"
+version = "0.2.5"
 description = "A modular evaluation framework for testing functions with YAML-based specifications"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -25,10 +25,12 @@ classifiers = [
 ]
 
 dependencies = [
+    "click",
+    "python-dotenv", # distribution that provides the `dotenv` module
+    "pyyaml",
     "pydantic>=2.0.0",
-    "pydantic-evals>=0.2.0",
-    "pyyaml>=6.0.0",
-    "click>=8.0.0",
+    "pydantic-ai>=1.0.0",
+    "pydantic-evals>=1.0.0",
 ]
 
 [project.optional-dependencies]
diff --git a/src/vowel/__init__.py b/src/vowel/__init__.py
@@ -1,14 +1,23 @@
 """
 vowel - A modular evaluation framework for testing functions with YAML-based specifications.
 """
+import importlib.metadata
 
-__version__ = "0.1.0"
+__version__ = importlib.metadata.version("vowel")
 
 from .eval_types import EvalsFile
 from .runner import RunEvals
-from .utils import (EvalResult, EvalSummary, load_evals, load_evals_file,
-                    load_evals_from_dict, load_evals_from_object,
-                    load_evals_from_yaml_string, run_evals, to_dataset)
+from .utils import (
+    EvalResult,
+    EvalSummary,
+    load_evals,
+    load_evals_file,
+    load_evals_from_dict,
+    load_evals_from_object,
+    load_evals_from_yaml_string,
+    run_evals,
+    to_dataset,
+)
 
 __all__ = [
     "load_evals_file",
diff --git a/src/vowel/cli.py b/src/vowel/cli.py
@@ -1,25 +1,9 @@
-import importlib.util
-import os
-import sys
-
+import click
 import dotenv
 
-dotenv.load_dotenv()
-
-if "--debug" in sys.argv:
-    if importlib.util.find_spec("logfire"):
-        import logfire
-
-        logfire.configure()
-        logfire.instrument_pydantic_ai()
-    else:
-        raise ImportError(
-            "Debug mode enabled but logfire is not installed. Please install logfire or disable debug mode."
-        )
-
 from pathlib import Path
 
-import click
+dotenv.load_dotenv()
 
 
 @click.command()
diff --git a/src/vowel/eval_types.py b/src/vowel/eval_types.py
@@ -110,6 +110,20 @@ class PatternMatchCase(BaseModel):
     )
 
 
+class RaisesCase(BaseModel):
+    """Exception raising evaluation case. Validates that the function raises a specific exception."""
+
+    # Required: the expected exception, given by its class name as a string.
+    raises: str = Field(
+        description="Expected exception type as string (e.g., 'ValueError', 'KeyError', 'TypeError').",
+        examples=["ValueError", "TypeError", "KeyError", "ZeroDivisionError", "IndexError"],
+    )
+    # Optional: None (the default) means no message pattern was supplied.
+    match: Optional[str] = Field(
+        default=None,
+        description="Optional regex pattern to match against the exception message.",
+        examples=["invalid input", "must be positive", "not found"],
+    )
+
+
 class LLMJudgeCase(BaseModel):
     """LLM Judge evaluation case. Uses an LLM to evaluate the output based on a rubric."""
 
@@ -183,7 +197,7 @@ class MatchCase(BaseModel):
         ),
         examples=[5, "hello", [1, 2, 3], {"x": 10, "y": 20}, {"name": "test", "value": 42}],
     )
-    inputs: Optional[list[Any]] = Field(
+    inputs: Optional[list | dict] = Field(
         default=None,
         description=(
             "Multiple input values to pass to the function as separate arguments (*args). "
@@ -226,6 +240,23 @@ class MatchCase(BaseModel):
         default=True,
         description="Whether the regex pattern matching should be case-sensitive (only used if pattern is specified).",
     )
+    raises: Optional[str] = Field(
+        default=None,
+        description="Expected exception type for this case. If specified, the test expects the function to raise this exception.",
+        examples=["ValueError", "TypeError", "KeyError", "ZeroDivisionError"],
+    )
+    match: Optional[str] = Field(
+        default=None,
+        description="Optional regex pattern to match against the exception message (only used if raises is specified).",
+        examples=["invalid input", "must be positive", "not found"],
+    )
+
+    @field_validator("match")
+    @classmethod
+    def validate_match_requires_raises(cls, v, info):
+        """Reject a ``match`` pattern that is supplied without ``raises``.
+
+        Returns the value unchanged when it is None or when ``raises`` is set;
+        otherwise raises ValueError.
+        """
+        # NOTE(review): `raises` is read from info.data, which presumably only
+        # holds fields validated before `match` - keep `raises` declared first.
+        if v is not None and info.data.get("raises") is None:
+            raise ValueError("'match' can only be used together with 'raises'")
+        return v
 
     @field_validator("inputs")
     @classmethod
@@ -260,6 +291,10 @@ def has_assertion(self) -> bool:
     def has_pattern(self) -> bool:
         return self.pattern is not None
 
+    @property
+    def has_raises(self) -> bool:
+        """Whether this case sets ``raises`` (i.e. the field is not None)."""
+        return self.raises is not None
+
 
 class EvalCase(BaseModel):
     """Internal representation of an evaluation case with its data."""
diff --git a/src/vowel/evals.py b/src/vowel/evals.py
diff --git a/src/vowel/runner.py b/src/vowel/runner.py
diff --git a/src/vowel/utils.py b/src/vowel/utils.py