# autoevolve/evolve.py (MrTsepa/autoevolve, main branch)
"""
autoevolve — evolution loop for automated iterative improvement.

Mutate -> Evaluate -> Promote -> Archive

A coding agent creates candidate variants of an artifact (a bot, prompt, strategy,
or model config), evaluates them through head-to-head comparison, promotes the
winners based on Elo ratings, and archives everything for traceability.

This file defines the core loop and its building blocks.
"""

from __future__ import annotations

import json
from dataclasses import dataclass
from pathlib import Path
from typing import Protocol, runtime_checkable


# ── Data ────────────────────────────────────────────────────


@dataclass
class MatchResult:
    """Head-to-head record produced by evaluating artifact ``a`` against ``b``."""

    # Labels for the two sides of the comparison.
    a: str
    b: str
    # Number of wins credited to each side.
    wins_a: int
    wins_b: int
    # Optional per-side means; what is averaged is up to the evaluator
    # (not visible in this module).
    mean_a: float | None = None
    mean_b: float | None = None
    # Optional free-form annotation.
    note: str | None = None

    def to_dict(self) -> dict:
        """Return a plain-dict form of this result.

        The four required fields are always present. ``mean_a`` / ``mean_b``
        are included only when they are not ``None``; ``note`` is included
        only when it is truthy (so an empty string is dropped as well).
        """
        payload = {
            "a": self.a,
            "b": self.b,
            "wins_a": self.wins_a,
            "wins_b": self.wins_b,
        }
        for key in ("mean_a", "mean_b"):
            value = getattr(self, key)
            if value is not None:
                payload[key] = value
        if self.note:
            payload["note"] = self.note
        return payload


# ── Protocols ───────────────────────────────────────────────


@runtime_checkable
class Artifact(Protocol):
    """Structural type for anything the loop can version and compare.

    An object satisfies this protocol by exposing a ``version`` attribute.
    """

    @property
    def version(self) -> str:
        """Version label identifying this artifact."""
        ...


@runtime_checkable
class Evaluator(Protocol):
    """Structural type for objects that compare two artifacts."""

    def evaluate(self, a: Artifact, b: Artifact, n_games: int) -> MatchResult:
        """Compare ``a`` with ``b`` over ``n_games`` and return a MatchResult."""
        ...


@runtime_checkable
class Mutator(Protocol):
    """Structural type for objects that derive a new candidate from a parent."""

    def mutate(self, parent: Artifact) -> Artifact:
        """Return a new candidate artifact based on ``parent``."""
        ...


# ── Database ────────────────────────────────────────────────


def load_db(path: str | Path = "matches.json") -> dict:
    """Load the match database from a JSON file.

    Args:
        path: Location of the JSON database file.

    Returns:
        The parsed database. If the file does not exist, a fresh empty
        database ``{"matches": [], "versions": {}}`` is returned. If the file
        holds a JSON object that lacks ``"matches"`` or ``"versions"``, the
        missing key is filled with an empty default so that ``record`` can
        append to it without raising ``KeyError``.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    # Read first and handle the "missing" case afterwards: this avoids the
    # window between an exists() check and the read.
    try:
        raw = Path(path).read_text(encoding="utf-8")
    except FileNotFoundError:
        return {"matches": [], "versions": {}}

    db = json.loads(raw)
    # Only normalise JSON objects; any other top-level value is passed
    # through unchanged, exactly as before.
    if isinstance(db, dict):
        db.setdefault("matches", [])
        db.setdefault("versions", {})
    return db


def save_db(db: dict, path: str | Path = "matches.json"):
    """Write the match database to ``path`` as indented JSON.

    Missing parent directories are created first. The output uses a
    two-space indent and ends with a single trailing newline.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    # Serialise before opening the file so a serialisation error cannot
    # leave a truncated file behind.
    serialized = f"{json.dumps(db, indent=2)}\n"
    with target.open("w") as handle:
        handle.write(serialized)


def record(db: dict, result: MatchResult) -> dict:
    """Add one match result to ``db`` and return the same ``db`` object.

    The result's dict form is appended to ``db["matches"]``, and each of the
    two participants gets an empty entry in ``db["versions"]`` unless it is
    already registered there (existing entries are left untouched).
    """
    db["matches"].append(result.to_dict())
    participants = (result.a, result.b)
    for version in participants:
        db["versions"].setdefault(version, {})
    return db


# ── Evolution loop ──────────────────────────────────────────


def evolve(
    seed: Artifact,
    mutator: Mutator,
    evaluator: Evaluator,
    *,
    db_path: str | Path = "matches.json",
    n_games: int = 100,
    n_candidates: int = 1,
    max_generations: int = 50,
    on_step: callable | None = None,
) -> dict:
    """
    Run the Mutate-Evaluate-Promote-Archive loop.

    Each generation:
      1. Generate — mutator creates n_candidates variants from current best
      2. Evaluate — each candidate plays n_games against current best
      3. Promote — recompute ratings, crown new best if earned
      4. Archive — save DB, call on_step callback

    Args:
        seed: Starting artifact; it is the first "current best".
        mutator: Produces candidate variants from the current best artifact.
        evaluator: Compares a candidate against the current best.
        db_path: JSON file the match database is loaded from and saved to.
        n_games: Passed through to ``evaluator.evaluate`` for every match.
        n_candidates: Number of candidates generated per generation.
        max_generations: Number of generations to run.
        on_step: Optional callback invoked once per generation with keyword
            arguments ``gen``, ``best`` (top-rated version label), ``ratings``
            and ``db``.

    Returns the final database.
    """
    # Imported inside the function, as the module does not import it at top
    # level.
    from ratings import compute_ratings

    db = load_db(db_path)

    # The ratings only yield version labels, so remember every artifact seen
    # in this run by its version; that lets a promoted label be mapped back
    # to the object that must be mutated and played against next.
    known = {seed.version: seed}
    best = seed
    best_version = seed.version

    for gen in range(max_generations):
        # GENERATE — mutate the current best, not the original seed.
        candidates = [mutator.mutate(best) for _ in range(n_candidates)]

        # EVALUATE — every candidate of this generation faces the same best.
        for candidate in candidates:
            known[candidate.version] = candidate
            result = evaluator.evaluate(candidate, best, n_games)
            record(db, result)

        # PROMOTE
        save_db(db, db_path)
        ratings, _ = compute_ratings(db)
        if ratings:
            # Label reported to the callback: top-rated entry overall, which
            # may stem from an earlier run stored in the same database.
            best_version = max(ratings, key=ratings.get)
            # Parent for the next generation: top-rated artifact we actually
            # hold. NOTE(review): assumes ratings are keyed by the same
            # version labels the artifacts expose — confirm against
            # compute_ratings and the evaluator. If none match, the previous
            # best is kept.
            rated_known = [version for version in known if version in ratings]
            if rated_known:
                best = known[max(rated_known, key=ratings.get)]

        # ARCHIVE
        if on_step:
            on_step(gen=gen, best=best_version, ratings=ratings, db=db)

    return db