Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
############################
# Azure OpenAI (required)
############################
AZURE_OPENAI_API_VERSION=2024-02-15-preview
AZURE_OPENAI_ENDPOINT=https://your-resource-name.openai.azure.com/
AZURE_OPENAI_KEY=your-azure-openai-api-key
DEPLOYMENT_NAME=your-chat-model-deployment

############################
# MLflow (required for evaluation)
############################
# Required by evaluation/run_evaluation.py
MLFLOW_TRACKING_URI=azureml://<region>.api.azureml.ms/mlflow/v1.0/subscriptions/<subscription-id>/resourceGroups/<resource-group>/providers/Microsoft.MachineLearningServices/workspaces/<workspace-name>
MLFLOW_EXPERIMENT_NAME=ContractMap-Evaluation
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,11 @@ Optional arguments:
- `--truth-set /path/to/truth.csv` to use a different truth set file.
- `--prompt system_prompt_v2.md` to choose a prompt from `prompts/`.
- `--list-prompts` to print available prompt files.
- `--mlflow-tracking-uri <azureml://...>` to set tracking server (or use `MLFLOW_TRACKING_URI`).
- `--mlflow-experiment-name ContractMap-Evaluation` to set experiment (or use `MLFLOW_EXPERIMENT_NAME`).
- `--mlflow-run-name my-run` to set a custom run name.

`run_evaluation.py` always logs to MLflow (params, metrics, prompt, and results CSV).

### From a Jupyter notebook

Expand Down
127 changes: 105 additions & 22 deletions evaluation/run_evaluation.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
import argparse
import asyncio
import os
import sys
import time
from pathlib import Path
from typing import Any
from dotenv import load_dotenv

import pandas as pd

load_dotenv()

REPO_ROOT = Path(__file__).resolve().parent.parent
SRC_DIR = REPO_ROOT / "src"
PROMPTS_DIR = REPO_ROOT / "prompts"
Expand All @@ -16,7 +23,6 @@
sys.path.insert(0, str(SRC_DIR))



def _available_prompt_files() -> list[Path]:
    """Return all Markdown prompt files in PROMPTS_DIR, sorted by path."""
    return sorted(path for path in PROMPTS_DIR.glob("*.md") if path.is_file())
Expand Down Expand Up @@ -56,7 +62,9 @@ def _load_truth_set(truth_set_path: Path) -> pd.DataFrame:
return df.dropna(subset=["Description", "Category"]).reset_index(drop=True)


async def _classify_description(description: str, mapper: str, prompt_file: Path) -> str:
async def _classify_description(
description: str, mapper: str, prompt_file: Path
) -> str:
if mapper == "v1":
from core.classification_v1 import contract_mapper

Expand All @@ -66,18 +74,39 @@ async def _classify_description(description: str, mapper: str, prompt_file: Path
)

from core.classification_v2 import contract_mapper_v2

return await contract_mapper_v2(
user_contract_description=description,
system_prompt_file_location=prompt_file,
)


def _get_mlflow_module() -> Any | None:
"""Import Azure MLflow module (azureml-mlflow extends standard mlflow)."""
try:
import importlib

return importlib.import_module("mlflow")
except ModuleNotFoundError:
return None


async def run_evaluation(
truth_set_path: Path,
mapper: str,
prompt_name: str | None,
output_path: Path | None,
mlflow_tracking_uri: str | None = None,
mlflow_experiment_name: str | None = None,
mlflow_run_name: str | None = None,
) -> None:
mlflow_module = _get_mlflow_module()
if mlflow_module is None:
raise ModuleNotFoundError(
"Azure MLflow is not installed. Install with 'pip install azureml-mlflow' "
"and authenticate using 'az login'."
)

prompt_file = _resolve_prompt_file(prompt_name=prompt_name, mapper=mapper)
df = _load_truth_set(truth_set_path=truth_set_path)

Expand All @@ -87,29 +116,62 @@ async def run_evaluation(
predictions: list[str] = []
correct = 0

for description, expected in zip(descriptions, categories):
prediction = await _classify_description(
description=description,
mapper=mapper,
prompt_file=prompt_file,
)
predictions.append(prediction)
is_correct = prediction == expected
correct += int(is_correct)
print(f"expected: {expected} | predicted: {prediction} | correct: {is_correct}")

accuracy = (correct / len(predictions)) * 100 if predictions else 0.0
print(f"\n{mapper.upper()} accuracy: {accuracy:.2f}% on {len(predictions)} samples")

if output_path is None:
output_path = REPO_ROOT / f"data/results/eval_{mapper}_{prompt_file.stem}.csv"

output_path.parent.mkdir(parents=True, exist_ok=True)
result_df = df[["Category", "Description"]].copy()
result_df["AI classification"] = predictions
result_df["correct"] = result_df["Category"] == result_df["AI classification"]
result_df.to_csv(output_path, index=False)
print(f"Saved results to: {output_path}")

tracking_uri = mlflow_tracking_uri or os.getenv("MLFLOW_TRACKING_URI")
if not tracking_uri:
raise ValueError(
"MLflow tracking URI is required. "
"Set --mlflow-tracking-uri or MLFLOW_TRACKING_URI."
)
experiment_name = mlflow_experiment_name or os.getenv(
"MLFLOW_EXPERIMENT_NAME", "ContractMap-Evaluation"
)
mlflow_module.set_tracking_uri(tracking_uri)
mlflow_module.set_experiment(experiment_name)
run_context = mlflow_module.start_run(run_name=mlflow_run_name)

with run_context:
mlflow_module.log_param("mapper", mapper)
mlflow_module.log_param("truth_set_path", str(truth_set_path.resolve()))
mlflow_module.log_param("prompt_name", prompt_name or prompt_file.name)
mlflow_module.log_param("prompt_path", str(prompt_file.resolve()))
mlflow_module.log_param("num_samples", len(descriptions))
mlflow_module.log_artifact(str(prompt_file.resolve()), artifact_path="prompts")

start_time = time.perf_counter()
for description, expected in zip(descriptions, categories):
prediction = await _classify_description(
description=description,
mapper=mapper,
prompt_file=prompt_file,
)
predictions.append(prediction)
is_correct = prediction == expected
correct += int(is_correct)
print(
f"expected: {expected} | predicted: {prediction} | correct: {is_correct}"
)

elapsed_seconds = time.perf_counter() - start_time
accuracy = (correct / len(predictions)) * 100 if predictions else 0.0
print(
f"\n{mapper.upper()} accuracy: {accuracy:.2f}% on {len(predictions)} samples"
)

result_df = df[["Category", "Description"]].copy()
result_df["AI classification"] = predictions
result_df["correct"] = result_df["Category"] == result_df["AI classification"]
result_df.to_csv(output_path, index=False)
print(f"Saved results to: {output_path}")

mlflow_module.log_metric("accuracy_percent", accuracy)
mlflow_module.log_metric("accuracy_fraction", accuracy / 100)
mlflow_module.log_metric("correct_predictions", correct)
mlflow_module.log_metric("evaluation_duration_seconds", elapsed_seconds)
mlflow_module.log_artifact(str(output_path.resolve()), artifact_path="results")


def _build_arg_parser() -> argparse.ArgumentParser:
Expand Down Expand Up @@ -145,6 +207,24 @@ def _build_arg_parser() -> argparse.ArgumentParser:
action="store_true",
help="List available prompt files in prompts/ and exit.",
)
parser.add_argument(
"--mlflow-tracking-uri",
type=str,
default=None,
help="Azure MLflow tracking URI (azureml:// scheme). Defaults to MLFLOW_TRACKING_URI env var.",
)
parser.add_argument(
"--mlflow-experiment-name",
type=str,
default=None,
help="Optional Azure MLflow experiment name. Defaults to MLFLOW_EXPERIMENT_NAME or ContractMap-Evaluation.",
)
parser.add_argument(
"--mlflow-run-name",
type=str,
default=None,
help="Optional Azure MLflow run name.",
)
return parser


Expand All @@ -164,6 +244,9 @@ def main() -> None:
mapper=args.mapper,
prompt_name=args.prompt,
output_path=args.output,
mlflow_tracking_uri=args.mlflow_tracking_uri,
mlflow_experiment_name=args.mlflow_experiment_name,
mlflow_run_name=args.mlflow_run_name,
)
)

Expand Down
4 changes: 4 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,7 @@ fastapi>=0.116.1,<0.117.0
uvicorn>=0.35.0,<0.36.0
pytest
ruff==0.15.1
pre-commit
azureml-mlflow
azure-ai-ml
azure-identity
92 changes: 92 additions & 0 deletions tests/test_run_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,43 @@
import evaluation.run_evaluation as run_evaluation


class _DummyRunContext:
def __enter__(self):
return self

def __exit__(self, exc_type, exc, tb):
return False


class FakeMLflow:
def __init__(self):
self.tracking_uri = None
self.experiment_name = None
self.start_run_name = None
self.params: dict[str, object] = {}
self.metrics: dict[str, float] = {}
self.artifacts: list[tuple[str, str]] = []

def set_tracking_uri(self, uri: str) -> None:
self.tracking_uri = uri

def set_experiment(self, name: str) -> None:
self.experiment_name = name

def start_run(self, run_name: str | None = None):
self.start_run_name = run_name
return _DummyRunContext()

def log_param(self, key: str, value) -> None:
self.params[key] = value

def log_metric(self, key: str, value: float) -> None:
self.metrics[key] = value

def log_artifact(self, path: str, artifact_path: str | None = None) -> None:
self.artifacts.append((path, artifact_path))


def test_load_truth_set_raises_when_required_columns_missing(tmp_path):
truth_set = tmp_path / "truth.csv"
pd.DataFrame({"Description": ["desc only"]}).to_csv(truth_set, index=False)
Expand Down Expand Up @@ -49,13 +86,15 @@ async def fake_classify(description: str, mapper: str, prompt_file: Path) -> str
monkeypatch.setattr(run_evaluation, "REPO_ROOT", repo_root)
monkeypatch.setattr(run_evaluation, "_resolve_prompt_file", lambda *args, **kwargs: prompt_file)
monkeypatch.setattr(run_evaluation, "_classify_description", fake_classify)
monkeypatch.setattr(run_evaluation, "_get_mlflow_module", lambda: FakeMLflow())

asyncio.run(
run_evaluation.run_evaluation(
truth_set_path=truth_set,
mapper="v2",
prompt_name=None,
output_path=None,
mlflow_tracking_uri="azureml://unit-test",
)
)

Expand All @@ -65,3 +104,56 @@ async def fake_classify(description: str, mapper: str, prompt_file: Path) -> str
result_df = pd.read_csv(output_csv)
assert result_df["AI classification"].tolist() == ["cat-a", "wrong-cat"]
assert result_df["correct"].tolist() == [True, False]


def test_run_evaluation_logs_mlflow_params_metrics_and_artifacts(monkeypatch, tmp_path):
    """End-to-end check that run_evaluation records params, metrics, and artifacts."""
    # Arrange: a two-row truth set, a prompt file, and an output location.
    truth_csv = tmp_path / "truth.csv"
    pd.DataFrame(
        {
            "Description": ["desc-a", "desc-b"],
            "Category": ["cat-a", "cat-b"],
        }
    ).to_csv(truth_csv, index=False)

    prompt_path = tmp_path / "prompts" / "custom_prompt.md"
    prompt_path.parent.mkdir(parents=True, exist_ok=True)
    prompt_path.write_text("prompt")
    results_csv = tmp_path / "results.csv"

    async def fake_classify(description: str, mapper: str, prompt_file: Path) -> str:
        # desc-a is classified correctly, desc-b is not -> 50% accuracy.
        return {"desc-a": "cat-a", "desc-b": "wrong-cat"}[description]

    recorder = FakeMLflow()
    monkeypatch.setattr(
        run_evaluation, "_resolve_prompt_file", lambda *args, **kwargs: prompt_path
    )
    monkeypatch.setattr(run_evaluation, "_classify_description", fake_classify)
    monkeypatch.setattr(run_evaluation, "_get_mlflow_module", lambda: recorder)

    # Act.
    asyncio.run(
        run_evaluation.run_evaluation(
            truth_set_path=truth_csv,
            mapper="v2",
            prompt_name=None,
            output_path=results_csv,
            mlflow_tracking_uri="http://mlflow.local:5000",
            mlflow_experiment_name="ContractMap-Evaluation-Test",
            mlflow_run_name="unit-test-run",
        )
    )

    # Assert: connection settings reached the MLflow module.
    assert recorder.tracking_uri == "http://mlflow.local:5000"
    assert recorder.experiment_name == "ContractMap-Evaluation-Test"
    assert recorder.start_run_name == "unit-test-run"

    # Assert: run parameters were logged.
    assert recorder.params["mapper"] == "v2"
    assert recorder.params["truth_set_path"] == str(truth_csv.resolve())
    assert recorder.params["prompt_name"] == "custom_prompt.md"
    assert recorder.params["prompt_path"] == str(prompt_path.resolve())
    assert recorder.params["num_samples"] == 2

    # Assert: metrics reflect one correct prediction out of two.
    assert recorder.metrics["accuracy_percent"] == 50.0
    assert recorder.metrics["accuracy_fraction"] == 0.5
    assert recorder.metrics["correct_predictions"] == 1
    assert "evaluation_duration_seconds" in recorder.metrics

    # Assert: the prompt and results CSV were logged as artifacts.
    assert (str(prompt_path.resolve()), "prompts") in recorder.artifacts
    assert (str(results_csv.resolve()), "results") in recorder.artifacts
Loading