Skip to content

Commit 671c0cc

Browse files
Qard and claude committed
Add classifier support to eval
Mirrors the Ruby SDK's classifier port (braintrustdata/braintrust-sdk-ruby#154) and the canonical classifier spec at braintrust-spec/docs/features/classifiers.md. Classifiers return structured Classification items (id, optional label, optional metadata) instead of numeric scores. They run alongside scorers, their failures are non-fatal, and at least one of scorers/classifiers is required (relaxes the prior scorers-required check). New public types: Classification, Classifier (+ Classifier.of / .single factories), TracedClassifier. Eval gains a classifiers(...) builder method and a runClassifier helper that emits classifier spans with type=classifier, purpose=scorer; per-case classifications aggregate onto the root eval span as braintrust.classifications, and classifier exceptions land in braintrust.metadata.classifier_errors. Also fixes an inverted-condition bug in TestHarness.ensureRemoteDataset's post-rebuild verify check (threw when datasets matched). Re-recorded the braintrust VCR cassettes from scratch so the new ClassifierEvalTest integration tests replay without scenario collisions; third-party AI provider cassettes are untouched. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent e110611 commit 671c0cc

416 files changed

Lines changed: 5217 additions & 4101 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
package dev.braintrust.eval;
2+
3+
import java.util.Map;
4+
import javax.annotation.Nullable;
5+
6+
/**
7+
* A single structured classification produced by a {@link Classifier}.
8+
*
9+
* <p>Unlike a {@link Score} (numeric 0-1), a Classification carries a stable id, an optional
10+
* display label, and optional metadata. The {@code name} acts as the grouping key in the aggregated
11+
* result map; when {@code name} is {@code null} or blank, the owning classifier's resolved name is
12+
* used instead.
13+
*
14+
* @param name optional grouping key; defaults to the owning classifier's resolved name when null or
15+
* blank
16+
* @param id stable identifier for the classification (required)
17+
* @param label optional display label
18+
* @param metadata optional arbitrary metadata
19+
*/
20+
public record Classification(
21+
@Nullable String name,
22+
String id,
23+
@Nullable String label,
24+
@Nullable Map<String, Object> metadata) {
25+
26+
public static Classification of(String id) {
27+
return new Classification(null, id, null, null);
28+
}
29+
30+
public static Classification of(String id, String label) {
31+
return new Classification(null, id, label, null);
32+
}
33+
34+
public static Classification of(String name, String id, String label) {
35+
return new Classification(name, id, label, null);
36+
}
37+
}
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
package dev.braintrust.eval;
2+
3+
import java.util.List;
4+
import java.util.function.Function;
5+
6+
/**
7+
* A classifier categorizes and labels eval outputs, producing zero or more structured {@link
8+
* Classification} items.
9+
*
10+
* <p>Classifiers run independently from {@link Scorer}s. Each Classifier exposes a name (used as
11+
* the span name and as the default grouping key for classifications whose own {@code name} is
12+
* blank).
13+
*
14+
* @param <INPUT> type of the input data
15+
* @param <OUTPUT> type of the output data
16+
*/
17+
public interface Classifier<INPUT, OUTPUT> {
18+
String INVALID_CLASSIFICATION_MESSAGE =
19+
"When returning structured classifier results, each classification must be a non-empty"
20+
+ " object.";
21+
22+
String getName();
23+
24+
/**
25+
* Classifies the result of a successful task execution.
26+
*
27+
* @param taskResult the task output and originating dataset case
28+
* @return zero or more classifications. An empty list means "no classifications for this case".
29+
*/
30+
List<Classification> classify(TaskResult<INPUT, OUTPUT> taskResult);
31+
32+
/**
33+
* Creates a classifier from a function that returns a (possibly empty or null) list of
34+
* classifications.
35+
*
36+
* <p>A {@code null} return value is treated as no classifications. Each returned {@link
37+
* Classification} must have a non-blank {@code id}; otherwise the classifier throws an
38+
* exception (which the eval runner records but does not abort on).
39+
*/
40+
static <INPUT, OUTPUT> Classifier<INPUT, OUTPUT> of(
41+
String classifierName,
42+
Function<TaskResult<INPUT, OUTPUT>, List<Classification>> classifierFn) {
43+
return new Classifier<>() {
44+
@Override
45+
public String getName() {
46+
return classifierName;
47+
}
48+
49+
@Override
50+
public List<Classification> classify(TaskResult<INPUT, OUTPUT> taskResult) {
51+
var result = classifierFn.apply(taskResult);
52+
if (result == null) {
53+
return List.of();
54+
}
55+
for (var item : result) {
56+
validate(item);
57+
}
58+
return result;
59+
}
60+
};
61+
}
62+
63+
/**
64+
* Creates a classifier from a function that returns a single classification.
65+
*
66+
* <p>A {@code null} return value is treated as no classifications.
67+
*/
68+
static <INPUT, OUTPUT> Classifier<INPUT, OUTPUT> single(
69+
String classifierName,
70+
Function<TaskResult<INPUT, OUTPUT>, Classification> classifierFn) {
71+
return new Classifier<>() {
72+
@Override
73+
public String getName() {
74+
return classifierName;
75+
}
76+
77+
@Override
78+
public List<Classification> classify(TaskResult<INPUT, OUTPUT> taskResult) {
79+
var item = classifierFn.apply(taskResult);
80+
if (item == null) {
81+
return List.of();
82+
}
83+
validate(item);
84+
return List.of(item);
85+
}
86+
};
87+
}
88+
89+
/**
90+
* Validates a single classification: it must have a non-blank id. Throws with the spec-mandated
91+
* wording on failure.
92+
*/
93+
private static void validate(Classification item) {
94+
if (item == null || item.id() == null || item.id().isBlank()) {
95+
throw new IllegalArgumentException(INVALID_CLASSIFICATION_MESSAGE + " Got: " + item);
96+
}
97+
}
98+
}

braintrust-sdk/src/main/java/dev/braintrust/eval/Eval.java

Lines changed: 125 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ public final class Eval<INPUT, OUTPUT> {
4242
private final @Nonnull Dataset<INPUT, OUTPUT> dataset;
4343
private final @Nonnull Task<INPUT, OUTPUT> task;
4444
private final @Nonnull List<Scorer<INPUT, OUTPUT>> scorers;
45+
private final @Nonnull List<Classifier<INPUT, OUTPUT>> classifiers;
4546
private final @Nonnull List<String> tags;
4647
private final @Nonnull Map<String, Object> metadata;
4748
private final @Nonnull Parameters parameters;
@@ -58,6 +59,7 @@ private Eval(Builder<INPUT, OUTPUT> builder) {
5859
this.dataset = builder.dataset;
5960
this.task = Objects.requireNonNull(builder.task);
6061
this.scorers = List.copyOf(builder.scorers);
62+
this.classifiers = List.copyOf(builder.classifiers);
6163
this.tags = List.copyOf(builder.tags);
6264
this.metadata = Map.copyOf(builder.metadata);
6365
this.parameters = builder.buildParameters();
@@ -172,6 +174,42 @@ private void evalOne(String experimentId, DatasetCase<INPUT, OUTPUT> datasetCase
172174
for (var scorer : scorers) {
173175
runScorer(experimentId, rootSpan, scorer, taskResult, trace);
174176
}
177+
178+
// run classifiers - one span per classifier. Classifier exceptions are non-fatal:
179+
// they are recorded on the classifier span and surfaced in the root span's metadata
180+
// under `classifier_errors`, but do not abort the eval or affect other classifiers/
181+
// scorers. Classifiers only run when the task succeeded (no scoreForTaskException
182+
// analogue).
183+
if (!classifiers.isEmpty()) {
184+
Map<String, List<Map<String, Object>>> caseClassifications = new LinkedHashMap<>();
185+
Map<String, String> classifierErrors = new LinkedHashMap<>();
186+
for (int i = 0; i < classifiers.size(); i++) {
187+
var classifier = classifiers.get(i);
188+
var classifierName = classifier.getName();
189+
if (classifierName == null || classifierName.isBlank()) {
190+
classifierName = "classifier_" + i;
191+
}
192+
runClassifier(
193+
experimentId,
194+
classifier,
195+
classifierName,
196+
taskResult,
197+
trace,
198+
caseClassifications,
199+
classifierErrors);
200+
}
201+
if (!caseClassifications.isEmpty()) {
202+
rootSpan.setAttribute(
203+
"braintrust.classifications", toJson(caseClassifications));
204+
}
205+
if (!classifierErrors.isEmpty()) {
206+
Map<String, Object> mergedMetadata =
207+
new LinkedHashMap<>(datasetCase.metadata());
208+
mergedMetadata.put("classifier_errors", classifierErrors);
209+
rootSpan.setAttribute(
210+
AttributeKey.stringKey("braintrust.metadata"), toJson(mergedMetadata));
211+
}
212+
}
175213
} finally {
176214
rootSpan.end();
177215
}
@@ -236,6 +274,84 @@ private void runScoreForTaskException(
236274
}
237275
}
238276

277+
/**
278+
* Runs a classifier inside its own span. Exceptions are recorded on the classifier span and
279+
* surfaced via {@code classifierErrors}; they do not propagate.
280+
*/
281+
private void runClassifier(
282+
String experimentId,
283+
Classifier<INPUT, OUTPUT> classifier,
284+
String resolvedName,
285+
TaskResult<INPUT, OUTPUT> taskResult,
286+
BrainstoreTrace trace,
287+
Map<String, List<Map<String, Object>>> caseClassifications,
288+
Map<String, String> classifierErrors) {
289+
var classifierSpan =
290+
tracer.spanBuilder(resolvedName)
291+
.setAttribute(PARENT, "experiment_id:" + experimentId)
292+
.startSpan();
293+
try (var unused =
294+
BraintrustContext.ofExperiment(experimentId, classifierSpan).makeCurrent()) {
295+
Map<String, Object> spanAttrs = new LinkedHashMap<>();
296+
spanAttrs.put("type", "classifier");
297+
spanAttrs.put("name", resolvedName);
298+
spanAttrs.put("purpose", "scorer");
299+
classifierSpan.setAttribute("braintrust.span_attributes", toJson(spanAttrs));
300+
301+
List<Classification> classifications;
302+
try {
303+
if (classifier instanceof TracedClassifier<INPUT, OUTPUT> tracedClassifier) {
304+
classifications = tracedClassifier.classify(taskResult, trace);
305+
} else {
306+
classifications = classifier.classify(taskResult);
307+
}
308+
if (classifications == null) {
309+
classifications = List.of();
310+
}
311+
} catch (Exception e) {
312+
classifierSpan.setStatus(StatusCode.ERROR, e.getMessage());
313+
classifierSpan.recordException(e);
314+
log.debug("Classifier '{}' threw exception", resolvedName, e);
315+
classifierErrors.put(
316+
resolvedName, e.getMessage() == null ? e.toString() : e.getMessage());
317+
return;
318+
}
319+
320+
// Group results by resolved item name (item.name, falling back to the classifier
321+
// name when blank). Same map is logged to the classifier span and merged into the
322+
// per-case aggregate logged on the root span.
323+
Map<String, List<Map<String, Object>>> outputByName = new LinkedHashMap<>();
324+
for (var item : classifications) {
325+
var itemName = item.name();
326+
if (itemName == null || itemName.isBlank()) {
327+
itemName = resolvedName;
328+
}
329+
var itemMap = toClassificationItem(item);
330+
outputByName.computeIfAbsent(itemName, k -> new ArrayList<>()).add(itemMap);
331+
caseClassifications.computeIfAbsent(itemName, k -> new ArrayList<>()).add(itemMap);
332+
}
333+
classifierSpan.setAttribute("braintrust.output_json", toJson(outputByName));
334+
} finally {
335+
classifierSpan.end();
336+
}
337+
}
338+
339+
/**
340+
* Converts a {@link Classification} to the wire-format {@code ClassificationItem}: drops {@code
341+
* name}, includes {@code label} and {@code metadata} only when present.
342+
*/
343+
private static Map<String, Object> toClassificationItem(Classification c) {
344+
Map<String, Object> m = new LinkedHashMap<>();
345+
m.put("id", c.id());
346+
if (c.label() != null) {
347+
m.put("label", c.label());
348+
}
349+
if (c.metadata() != null) {
350+
m.put("metadata", c.metadata());
351+
}
352+
return m;
353+
}
354+
239355
/** Validates and records scores on the score span and root span. */
240356
private void recordScores(
241357
Span scoreSpan, Span rootSpan, Scorer<INPUT, OUTPUT> scorer, List<Score> scores) {
@@ -276,6 +392,7 @@ public static final class Builder<INPUT, OUTPUT> {
276392
private @Nullable Tracer tracer = null;
277393
private @Nullable Task<INPUT, OUTPUT> task;
278394
private @Nonnull List<Scorer<INPUT, OUTPUT>> scorers = List.of();
395+
private @Nonnull List<Classifier<INPUT, OUTPUT>> classifiers = List.of();
279396
private @Nonnull List<ParameterDef<?>> parameterDefs = List.of();
280397
private @Nonnull Map<String, Object> parameterValues = Map.of();
281398
private @Nonnull List<String> tags = List.of();
@@ -291,8 +408,8 @@ public Eval<INPUT, OUTPUT> build() {
291408
if (projectId == null) {
292409
projectId = config.defaultProjectId().orElse(null);
293410
}
294-
if (scorers.isEmpty()) {
295-
throw new RuntimeException("must provide at least one scorer");
411+
if (scorers.isEmpty() && classifiers.isEmpty()) {
412+
throw new RuntimeException("must provide at least one scorer or classifier");
296413
}
297414
if (null == apiClient) {
298415
apiClient = BraintrustOpenApiClient.of(config);
@@ -380,6 +497,12 @@ public final Builder<INPUT, OUTPUT> scorers(Scorer<INPUT, OUTPUT>... scorers) {
380497
return this;
381498
}
382499

500+
@SafeVarargs
501+
public final Builder<INPUT, OUTPUT> classifiers(Classifier<INPUT, OUTPUT>... classifiers) {
502+
this.classifiers = List.of(classifiers);
503+
return this;
504+
}
505+
383506
/** Sets tags for the experiment. */
384507
public Builder<INPUT, OUTPUT> tags(List<String> tags) {
385508
this.tags = List.copyOf(tags);
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
package dev.braintrust.eval;
2+
3+
import dev.braintrust.trace.BrainstoreTrace;
4+
import java.util.List;
5+
6+
/**
7+
* A classifier that receives access to the full distributed trace of the task that was evaluated.
8+
*
9+
* <p>Implement this interface when your classifier needs to examine intermediate LLM calls, tool
10+
* invocations, or other spans produced during task execution — not just the final {@link
11+
* TaskResult}.
12+
*
13+
* @param <INPUT> type of the input data
14+
* @param <OUTPUT> type of the output data
15+
*/
16+
public interface TracedClassifier<INPUT, OUTPUT> extends Classifier<INPUT, OUTPUT> {
17+
18+
/**
19+
* Classifies the task result using the distributed trace for additional context. Called instead
20+
* of {@link Classifier#classify(TaskResult)} when a {@link BrainstoreTrace} is available.
21+
*
22+
* @param taskResult the task output and originating dataset case
23+
* @param trace lazy access to the distributed trace spans for this eval case
24+
* @return zero or more classifications
25+
*/
26+
List<Classification> classify(TaskResult<INPUT, OUTPUT> taskResult, BrainstoreTrace trace);
27+
28+
/**
29+
* {@inheritDoc}
30+
*
31+
* <p>When used inside an {@link Eval}, this overload is never called — {@link
32+
* #classify(TaskResult, BrainstoreTrace)} is dispatched instead. This default implementation
33+
* throws {@link UnsupportedOperationException} to surface any accidental direct calls.
34+
*/
35+
@Override
36+
default List<Classification> classify(TaskResult<INPUT, OUTPUT> taskResult) {
37+
throw new UnsupportedOperationException(
38+
"traced classifier classify method directly called. This is likely an accident. If"
39+
+ " you wish to support this, your implementation must override this method.");
40+
}
41+
}

0 commit comments

Comments
 (0)