Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
############################
# Azure OpenAI (required)
############################
AZURE_OPENAI_API_VERSION=2024-02-15-preview
AZURE_OPENAI_ENDPOINT=https://your-resource-name.openai.azure.com/
AZURE_OPENAI_KEY=your-azure-openai-api-key
DEPLOYMENT_NAME=your-chat-model-deployment

############################
# MLflow (required for evaluation)
############################
# Required by evaluation/run_evaluation.py
MLFLOW_TRACKING_URI=azureml://<region>.api.azureml.ms/mlflow/v1.0/subscriptions/<subscription-id>/resourceGroups/<resource-group>/providers/Microsoft.MachineLearningServices/workspaces/<workspace-name>
MLFLOW_EXPERIMENT_NAME=ContractMap-Evaluation
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,11 @@ Optional arguments:
- `--truth-set /path/to/truth.csv` to use a different truth set file.
- `--prompt system_prompt_v2.md` to choose a prompt from `prompts/`.
- `--list-prompts` to print available prompt files.
- `--mlflow-tracking-uri <azureml://...>` to set tracking server (or use `MLFLOW_TRACKING_URI`).
- `--mlflow-experiment-name ContractMap-Evaluation` to set experiment (or use `MLFLOW_EXPERIMENT_NAME`).
- `--mlflow-run-name my-run` to set a custom run name.

`run_evaluation.py` always logs to MLflow (params, metrics, prompt, and results CSV).

### From a Jupyter notebook

Expand Down
127 changes: 105 additions & 22 deletions evaluation/run_evaluation.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
import argparse
import asyncio
import os
import sys
import time
from pathlib import Path
from typing import Any
from dotenv import load_dotenv

import pandas as pd

load_dotenv()

REPO_ROOT = Path(__file__).resolve().parent.parent
SRC_DIR = REPO_ROOT / "src"
PROMPTS_DIR = REPO_ROOT / "prompts"
Expand All @@ -16,7 +23,6 @@
sys.path.insert(0, str(SRC_DIR))



def _available_prompt_files() -> list[Path]:
    """Return all Markdown prompt files in PROMPTS_DIR, sorted by path."""
    return sorted(path for path in PROMPTS_DIR.glob("*.md") if path.is_file())
Expand Down Expand Up @@ -56,7 +62,9 @@ def _load_truth_set(truth_set_path: Path) -> pd.DataFrame:
return df.dropna(subset=["Description", "Category"]).reset_index(drop=True)


async def _classify_description(description: str, mapper: str, prompt_file: Path) -> str:
async def _classify_description(
description: str, mapper: str, prompt_file: Path
) -> str:
if mapper == "v1":
from core.classification_v1 import contract_mapper

Expand All @@ -66,18 +74,39 @@ async def _classify_description(description: str, mapper: str, prompt_file: Path
)

from core.classification_v2 import contract_mapper_v2

return await contract_mapper_v2(
user_contract_description=description,
system_prompt_file_location=prompt_file,
)


def _get_mlflow_module() -> Any | None:
"""Import Azure MLflow module (azureml-mlflow extends standard mlflow)."""
try:
import importlib

return importlib.import_module("mlflow")
except ModuleNotFoundError:
return None


async def run_evaluation(
truth_set_path: Path,
mapper: str,
prompt_name: str | None,
output_path: Path | None,
mlflow_tracking_uri: str | None = None,
mlflow_experiment_name: str | None = None,
mlflow_run_name: str | None = None,
) -> None:
mlflow_module = _get_mlflow_module()
if mlflow_module is None:
raise ModuleNotFoundError(
"Azure MLflow is not installed. Install with 'pip install azureml-mlflow' "
"and authenticate using 'az login'."
)

prompt_file = _resolve_prompt_file(prompt_name=prompt_name, mapper=mapper)
df = _load_truth_set(truth_set_path=truth_set_path)

Expand All @@ -87,29 +116,62 @@ async def run_evaluation(
predictions: list[str] = []
correct = 0

for description, expected in zip(descriptions, categories):
prediction = await _classify_description(
description=description,
mapper=mapper,
prompt_file=prompt_file,
)
predictions.append(prediction)
is_correct = prediction == expected
correct += int(is_correct)
print(f"expected: {expected} | predicted: {prediction} | correct: {is_correct}")

accuracy = (correct / len(predictions)) * 100 if predictions else 0.0
print(f"\n{mapper.upper()} accuracy: {accuracy:.2f}% on {len(predictions)} samples")

if output_path is None:
output_path = REPO_ROOT / f"data/results/eval_{mapper}_{prompt_file.stem}.csv"

output_path.parent.mkdir(parents=True, exist_ok=True)
result_df = df[["Category", "Description"]].copy()
result_df["AI classification"] = predictions
result_df["correct"] = result_df["Category"] == result_df["AI classification"]
result_df.to_csv(output_path, index=False)
print(f"Saved results to: {output_path}")

tracking_uri = mlflow_tracking_uri or os.getenv("MLFLOW_TRACKING_URI")
if not tracking_uri:
raise ValueError(
"MLflow tracking URI is required. "
"Set --mlflow-tracking-uri or MLFLOW_TRACKING_URI."
)
experiment_name = mlflow_experiment_name or os.getenv(
"MLFLOW_EXPERIMENT_NAME", "ContractMap-Evaluation"
)
mlflow_module.set_tracking_uri(tracking_uri)
mlflow_module.set_experiment(experiment_name)
run_context = mlflow_module.start_run(run_name=mlflow_run_name)

with run_context:
mlflow_module.log_param("mapper", mapper)
mlflow_module.log_param("truth_set_path", str(truth_set_path.resolve()))
mlflow_module.log_param("prompt_name", prompt_name or prompt_file.name)
mlflow_module.log_param("prompt_path", str(prompt_file.resolve()))
mlflow_module.log_param("num_samples", len(descriptions))
mlflow_module.log_artifact(str(prompt_file.resolve()), artifact_path="prompts")

start_time = time.perf_counter()
for description, expected in zip(descriptions, categories):
prediction = await _classify_description(
description=description,
mapper=mapper,
prompt_file=prompt_file,
)
predictions.append(prediction)
is_correct = prediction == expected
correct += int(is_correct)
print(
f"expected: {expected} | predicted: {prediction} | correct: {is_correct}"
)

elapsed_seconds = time.perf_counter() - start_time
accuracy = (correct / len(predictions)) * 100 if predictions else 0.0
print(
f"\n{mapper.upper()} accuracy: {accuracy:.2f}% on {len(predictions)} samples"
)

result_df = df[["Category", "Description"]].copy()
result_df["AI classification"] = predictions
result_df["correct"] = result_df["Category"] == result_df["AI classification"]
result_df.to_csv(output_path, index=False)
print(f"Saved results to: {output_path}")

mlflow_module.log_metric("accuracy_percent", accuracy)
mlflow_module.log_metric("accuracy_fraction", accuracy / 100)
mlflow_module.log_metric("correct_predictions", correct)
mlflow_module.log_metric("evaluation_duration_seconds", elapsed_seconds)
mlflow_module.log_artifact(str(output_path.resolve()), artifact_path="results")


def _build_arg_parser() -> argparse.ArgumentParser:
Expand Down Expand Up @@ -145,6 +207,24 @@ def _build_arg_parser() -> argparse.ArgumentParser:
action="store_true",
help="List available prompt files in prompts/ and exit.",
)
parser.add_argument(
"--mlflow-tracking-uri",
type=str,
default=None,
help="Azure MLflow tracking URI (azureml:// scheme). Defaults to MLFLOW_TRACKING_URI env var.",
)
parser.add_argument(
"--mlflow-experiment-name",
type=str,
default=None,
help="Optional Azure MLflow experiment name. Defaults to MLFLOW_EXPERIMENT_NAME or ContractMap-Evaluation.",
)
parser.add_argument(
"--mlflow-run-name",
type=str,
default=None,
help="Optional Azure MLflow run name.",
)
return parser


Expand All @@ -164,6 +244,9 @@ def main() -> None:
mapper=args.mapper,
prompt_name=args.prompt,
output_path=args.output,
mlflow_tracking_uri=args.mlflow_tracking_uri,
mlflow_experiment_name=args.mlflow_experiment_name,
mlflow_run_name=args.mlflow_run_name,
)
)

Expand Down
4 changes: 4 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,7 @@ fastapi>=0.116.1,<0.117.0
uvicorn>=0.35.0,<0.36.0
pytest
ruff==0.15.1
pre-commit
azureml-mlflow
azure-ai-ml
azure-identity
92 changes: 92 additions & 0 deletions tests/test_run_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,43 @@
import evaluation.run_evaluation as run_evaluation


class _DummyRunContext:
def __enter__(self):
return self

def __exit__(self, exc_type, exc, tb):
return False


class FakeMLflow:
def __init__(self):
self.tracking_uri = None
self.experiment_name = None
self.start_run_name = None
self.params: dict[str, object] = {}
self.metrics: dict[str, float] = {}
self.artifacts: list[tuple[str, str]] = []

def set_tracking_uri(self, uri: str) -> None:
self.tracking_uri = uri

def set_experiment(self, name: str) -> None:
self.experiment_name = name

def start_run(self, run_name: str | None = None):
self.start_run_name = run_name
return _DummyRunContext()

def log_param(self, key: str, value) -> None:
self.params[key] = value

def log_metric(self, key: str, value: float) -> None:
self.metrics[key] = value

def log_artifact(self, path: str, artifact_path: str | None = None) -> None:
self.artifacts.append((path, artifact_path))


def test_load_truth_set_raises_when_required_columns_missing(tmp_path):
truth_set = tmp_path / "truth.csv"
pd.DataFrame({"Description": ["desc only"]}).to_csv(truth_set, index=False)
Expand Down Expand Up @@ -49,13 +86,15 @@ async def fake_classify(description: str, mapper: str, prompt_file: Path) -> str
monkeypatch.setattr(run_evaluation, "REPO_ROOT", repo_root)
monkeypatch.setattr(run_evaluation, "_resolve_prompt_file", lambda *args, **kwargs: prompt_file)
monkeypatch.setattr(run_evaluation, "_classify_description", fake_classify)
monkeypatch.setattr(run_evaluation, "_get_mlflow_module", lambda: FakeMLflow())

asyncio.run(
run_evaluation.run_evaluation(
truth_set_path=truth_set,
mapper="v2",
prompt_name=None,
output_path=None,
mlflow_tracking_uri="azureml://unit-test",
)
)

Expand All @@ -65,3 +104,56 @@ async def fake_classify(description: str, mapper: str, prompt_file: Path) -> str
result_df = pd.read_csv(output_csv)
assert result_df["AI classification"].tolist() == ["cat-a", "wrong-cat"]
assert result_df["correct"].tolist() == [True, False]


def test_run_evaluation_logs_mlflow_params_metrics_and_artifacts(monkeypatch, tmp_path):
    """End-to-end check that run_evaluation records params, metrics, and artifacts."""
    # Arrange: a two-row truth set, a prompt file, and an output location.
    truth_csv = tmp_path / "truth.csv"
    pd.DataFrame(
        {
            "Description": ["desc-a", "desc-b"],
            "Category": ["cat-a", "cat-b"],
        }
    ).to_csv(truth_csv, index=False)

    prompt_path = tmp_path / "prompts" / "custom_prompt.md"
    prompt_path.parent.mkdir(parents=True, exist_ok=True)
    prompt_path.write_text("prompt")
    results_csv = tmp_path / "results.csv"

    async def fake_classify(description: str, mapper: str, prompt_file: Path) -> str:
        # desc-a is classified correctly, desc-b is not -> 50% accuracy.
        return {"desc-a": "cat-a", "desc-b": "wrong-cat"}[description]

    recorder = FakeMLflow()
    monkeypatch.setattr(
        run_evaluation, "_resolve_prompt_file", lambda *args, **kwargs: prompt_path
    )
    monkeypatch.setattr(run_evaluation, "_classify_description", fake_classify)
    monkeypatch.setattr(run_evaluation, "_get_mlflow_module", lambda: recorder)

    # Act.
    asyncio.run(
        run_evaluation.run_evaluation(
            truth_set_path=truth_csv,
            mapper="v2",
            prompt_name=None,
            output_path=results_csv,
            mlflow_tracking_uri="http://mlflow.local:5000",
            mlflow_experiment_name="ContractMap-Evaluation-Test",
            mlflow_run_name="unit-test-run",
        )
    )

    # Assert: connection settings reached the MLflow module.
    assert recorder.tracking_uri == "http://mlflow.local:5000"
    assert recorder.experiment_name == "ContractMap-Evaluation-Test"
    assert recorder.start_run_name == "unit-test-run"

    # Assert: run parameters were logged.
    assert recorder.params["mapper"] == "v2"
    assert recorder.params["truth_set_path"] == str(truth_csv.resolve())
    assert recorder.params["prompt_name"] == "custom_prompt.md"
    assert recorder.params["prompt_path"] == str(prompt_path.resolve())
    assert recorder.params["num_samples"] == 2

    # Assert: metrics reflect one correct prediction out of two.
    assert recorder.metrics["accuracy_percent"] == 50.0
    assert recorder.metrics["accuracy_fraction"] == 0.5
    assert recorder.metrics["correct_predictions"] == 1
    assert "evaluation_duration_seconds" in recorder.metrics

    # Assert: the prompt and results CSV were logged as artifacts.
    assert (str(prompt_path.resolve()), "prompts") in recorder.artifacts
    assert (str(results_csv.resolve()), "results") in recorder.artifacts
Loading