-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathevolve.py
More file actions
148 lines (106 loc) · 4.23 KB
/
Copy pathevolve.py
File metadata and controls
148 lines (106 loc) · 4.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
"""
autoevolve — evolution loop for automated iterative improvement.
Mutate -> Evaluate -> Promote -> Archive
A coding agent creates candidate variants of an artifact (a bot, prompt, strategy,
or model config), evaluates them through head-to-head comparison, promotes the
winners based on Elo ratings, and archives everything for traceability.
This file defines the core loop and its building blocks.
"""
from __future__ import annotations

import json
from collections.abc import Callable
from dataclasses import dataclass
from pathlib import Path
from typing import Protocol, runtime_checkable
# ── Data ────────────────────────────────────────────────────
@dataclass
class MatchResult:
    """Record of one head-to-head evaluation between artifacts ``a`` and ``b``."""

    # Identifiers of the two sides and their win counts (always serialized).
    a: str
    b: str
    wins_a: int
    wins_b: int
    # Optional extras; each is left out of ``to_dict()`` when unset.
    mean_a: float | None = None
    mean_b: float | None = None
    note: str | None = None

    def to_dict(self) -> dict:
        """Return a plain dict of this result.

        The four required fields are always present. ``mean_a`` / ``mean_b``
        are included only when they are not ``None`` (so ``0.0`` is kept),
        and ``note`` only when it is truthy (an empty string is dropped).
        """
        payload = {key: getattr(self, key) for key in ("a", "b", "wins_a", "wins_b")}
        for key in ("mean_a", "mean_b"):
            value = getattr(self, key)
            if value is not None:
                payload[key] = value
        if self.note:
            payload["note"] = self.note
        return payload
# ── Protocols ───────────────────────────────────────────────
@runtime_checkable
class Artifact(Protocol):
    """Structural type for anything that can be versioned and compared.

    Any object exposing a ``version`` attribute satisfies this protocol;
    no inheritance is required.
    """

    @property
    def version(self) -> str:
        """String identifying this artifact."""
        ...
@runtime_checkable
class Evaluator(Protocol):
    """Structural type for a head-to-head comparator of two artifacts."""

    def evaluate(self, a: Artifact, b: Artifact, n_games: int) -> MatchResult:
        """Compare ``a`` against ``b`` over ``n_games`` and return the MatchResult."""
        ...
@runtime_checkable
class Mutator(Protocol):
    """Structural type for a producer of new candidate artifacts."""

    def mutate(self, parent: Artifact) -> Artifact:
        """Return a new candidate artifact derived from ``parent``."""
        ...
# ── Database ────────────────────────────────────────────────
def load_db(path: str | Path = "matches.json") -> dict:
    """Load the match database from a JSON file.

    Args:
        path: Location of the database file.

    Returns:
        The decoded database. It always contains a ``"matches"`` list and a
        ``"versions"`` dict: a missing file yields an empty database, and a
        file lacking either key has the empty default filled in.

    Raises:
        ValueError: If the file is not valid JSON (``json.JSONDecodeError``
            is a ``ValueError`` subclass) or its top-level value is not a
            JSON object.
    """
    try:
        # Read directly instead of checking exists() first: one syscall less
        # and no window in which the file can vanish between check and read.
        # JSON is UTF-8 by specification, so do not rely on the locale default.
        raw = Path(path).read_text(encoding="utf-8")
    except (FileNotFoundError, NotADirectoryError):
        return {"matches": [], "versions": {}}

    db = json.loads(raw)
    if not isinstance(db, dict):
        raise ValueError(
            f"{path}: expected a JSON object at top level, got {type(db).__name__}"
        )
    # Callers index both keys unconditionally, so guarantee they exist.
    db.setdefault("matches", [])
    db.setdefault("versions", {})
    return db
def save_db(db: dict, path: str | Path = "matches.json") -> None:
    """Save the match database to a JSON file.

    The file is written as 2-space-indented JSON followed by a newline.
    Missing parent directories are created. The data is first written to a
    sibling ``<name>.tmp`` file and then moved over the target, so an
    interruption mid-write cannot leave a truncated database behind.

    Args:
        db: JSON-serializable database to persist.
        path: Destination file.

    Raises:
        TypeError: If ``db`` contains values ``json.dumps`` cannot serialize
            (the existing file, if any, is left untouched).
        OSError: If the directory or file cannot be written.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(db, indent=2) + "\n"

    # Same directory as the target so the final rename stays on one filesystem.
    tmp = target.with_name(target.name + ".tmp")
    try:
        tmp.write_text(payload, encoding="utf-8")
        tmp.replace(target)
    finally:
        # No-op after a successful replace; removes the leftover on failure.
        tmp.unlink(missing_ok=True)
def record(db: dict, result: MatchResult) -> dict:
    """Append ``result`` to ``db["matches"]`` and register both versions.

    Each of ``result.a`` and ``result.b`` gets an empty-dict entry in
    ``db["versions"]`` unless one already exists; existing entries are left
    as they are. The database is modified in place and also returned.
    """
    db["matches"].append(result.to_dict())
    known_versions = db["versions"]
    known_versions.setdefault(result.a, {})
    known_versions.setdefault(result.b, {})
    return db
# ── Evolution loop ──────────────────────────────────────────
def evolve(
    seed: Artifact,
    mutator: Mutator,
    evaluator: Evaluator,
    *,
    db_path: str | Path = "matches.json",
    n_games: int = 100,
    n_candidates: int = 1,
    max_generations: int = 50,
    on_step: Callable[..., object] | None = None,
) -> dict:
    """
    Run the Mutate-Evaluate-Promote-Archive loop.

    Each generation:
    1. Generate — mutator creates n_candidates variants from current best
    2. Evaluate — each candidate plays n_games against current best
    3. Promote  — recompute ratings, crown new best if earned
    4. Archive  — save DB, call on_step callback

    Args:
        seed: Starting artifact; it is the champion of the first generation.
        mutator: Produces candidates from the current champion.
        evaluator: Plays each candidate against the current champion.
        db_path: JSON match database, loaded at start and saved every
            generation.
        n_games: Number of games passed to ``evaluator.evaluate``.
        n_candidates: Candidates generated per generation.
        max_generations: Number of generations to run.
        on_step: Optional callback invoked once per generation with the
            keyword arguments ``gen``, ``best``, ``ratings`` and ``db``.

    Returns:
        The final database.
    """
    # Kept as a function-local import, exactly as before, so importing this
    # module does not require the project's ``ratings`` module.
    from ratings import compute_ratings

    db = load_db(db_path)
    champion = seed
    best_version = seed.version
    # Artifacts created or supplied during this run, keyed by version. The
    # database only stores version strings, so this is the only way to get
    # from a rating back to an object that can be mutated and evaluated.
    known = {seed.version: seed}

    for gen in range(max_generations):
        # GENERATE — from the current champion, not always from the seed.
        candidates = [mutator.mutate(champion) for _ in range(n_candidates)]

        # EVALUATE
        for candidate in candidates:
            known[candidate.version] = candidate
            record(db, evaluator.evaluate(candidate, champion, n_games))

        # PROMOTE
        save_db(db, db_path)
        # NOTE(review): ``ratings`` is assumed to map version -> rating;
        # confirm against ``ratings.compute_ratings``.
        ratings, _ = compute_ratings(db)
        if ratings:
            # Reported best: top-rated version overall (may come from an
            # earlier run whose artifact is not available here).
            best_version = max(ratings, key=ratings.get)
            # Working champion: top-rated among artifacts we actually hold.
            # ``known`` is insertion-ordered, so ties favour the incumbent.
            rated = [version for version in known if version in ratings]
            if rated:
                champion = known[max(rated, key=ratings.get)]

        # ARCHIVE
        if on_step is not None:
            on_step(gen=gen, best=best_version, ratings=ratings, db=db)
    return db