Skip to content
Merged
Show file tree
Hide file tree
Changes from 48 commits
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
08a15de
Messing with a better estimation system
onmyraedar Jun 1, 2026
2e310d3
Fix type hints
onmyraedar Jun 1, 2026
d0be13f
Improved branch weighting algorithm
onmyraedar Jun 1, 2026
c59da6e
Clean up job cost estimator
onmyraedar Jun 1, 2026
672f015
Small file renaming + import updates
onmyraedar Jun 1, 2026
d240892
Allow characters per token overrides; unify logic
onmyraedar Jun 2, 2026
426cc6a
Smarter question estimation based on type
onmyraedar Jun 2, 2026
d4c453c
Token dataclass + QTE tests
onmyraedar Jun 2, 2026
a3d6cd7
Rename input_tokens to prompt_tokens
onmyraedar Jun 2, 2026
e9f368c
Question estimator tests
onmyraedar Jun 2, 2026
e53eabc
Reach probability & job cost estimate tests
onmyraedar Jun 2, 2026
496887d
Fix test
onmyraedar Jun 2, 2026
6c03358
Fix functional test
onmyraedar Jun 2, 2026
9209f7a
Add compute test
onmyraedar Jun 2, 2026
7d71e92
Add .md method to JobCostEstimator
onmyraedar Jun 2, 2026
b1f10ad
Add credits + model summary to job cost estimate
onmyraedar Jun 2, 2026
2f42550
Add .describe() methods to estimators for use in generating Markdown …
onmyraedar Jun 3, 2026
4977f3d
QE describe tests + better descriptions for manual overrides
onmyraedar Jun 3, 2026
27fdb79
Take out assumptions section now that we have .describe() methods
onmyraedar Jun 3, 2026
6486c4f
Description should reflect that overrides are merged with base estimate
onmyraedar Jun 3, 2026
24aa0d5
Don't show skip logic warning if the survey has no skip rules
onmyraedar Jun 3, 2026
30d5cc8
Estimate clarifications
onmyraedar Jun 3, 2026
65ea28a
Accurate estimator description for offloaded files
onmyraedar Jun 3, 2026
67163a1
Fall back to 1,000 tokens for offloaded files by default
onmyraedar Jun 3, 2026
8ed0010
Get image dimensions for estimates, when we can
onmyraedar Jun 3, 2026
59fce5c
Use proper OpenAI image estimation
onmyraedar Jun 3, 2026
3d13dfb
Refactor FileStoreEstimator to use type-based classes
onmyraedar Jun 4, 2026
e1ce17f
Add AnthropicImageEstimator
onmyraedar Jun 4, 2026
b3ade3e
Add GoogleImageEstimator
onmyraedar Jun 4, 2026
1683b40
Separate file for service-based image estimators
onmyraedar Jun 4, 2026
c8c1aba
Override refactor; calibrate_from_results
onmyraedar Jun 4, 2026
eb3b3fe
Update tests
onmyraedar Jun 4, 2026
99b1a5a
OpenAI PDF estimator v1
onmyraedar Jun 4, 2026
372463c
Anthropic PDF estimator v1
onmyraedar Jun 4, 2026
629dcf5
Add Google PDF estimator v1
onmyraedar Jun 4, 2026
3815961
More PDF algorithm calibration
onmyraedar Jun 4, 2026
33229e7
Improve file estimates: general improvements, PDFs, images
onmyraedar Jun 6, 2026
51575d0
Fix tests
onmyraedar Jun 6, 2026
9ff19f6
Model calibration should be True by default
onmyraedar Jun 6, 2026
3aa1c5b
Calibrate thinking tokens
onmyraedar Jun 6, 2026
8012785
Better skill
onmyraedar Jun 6, 2026
e4962e6
Delete skill (moved to ep-agent)
onmyraedar Jun 6, 2026
10bdb65
Merge remote-tracking branch 'origin/main' into humanize_file_upload
onmyraedar Jun 6, 2026
2b533db
Update estimate_remote_job_cost docstring & types
onmyraedar Jun 6, 2026
f642094
Greptile fixes
onmyraedar Jun 7, 2026
294b482
More small fixes
onmyraedar Jun 7, 2026
235bc2a
Fix reach -> cost impact; add regression test
onmyraedar Jun 7, 2026
ada6742
Fix EOS reach double-count
onmyraedar Jun 7, 2026
e67270e
Add image estimator tests; ensure minimum of one patch
onmyraedar Jun 9, 2026
f9d01a1
Skip calibration if all values are None
onmyraedar Jun 9, 2026
b33a2fc
Fix chars_per_token inconsistency with PDF estimator
onmyraedar Jun 9, 2026
46204a7
Include reach probabilities in summary; add regression test
onmyraedar Jun 9, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions edsl/jobs/cost_estimation/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from .question_token_estimate import QuestionTokenEstimate
from .cost_estimation_constants import (
EDSL_DEFAULT_CHARS_PER_TOKEN,
TokenAmount,
TokenRatio,
)
from .job_cost_estimate import JobCostEstimate
from .question_estimators import (
QuestionEstimator,
ZeroCostEstimator,
FreeTextStyleEstimator,
StructuredAnswerEstimator,
DemandEstimator,
MatrixEstimator,
DefaultEstimator,
DEFAULT_ESTIMATORS,
)
from .file_store_estimator import FileStoreEstimator
from .job_cost_estimator import JobCostEstimator
from .token_override import TokenOverride
from .cost_estimate_calibration import calibrate_from_results

# Public API of the cost_estimation package: every name listed here is
# imported above from one of the package's submodules and re-exported.
__all__ = [
    "QuestionTokenEstimate",
    "JobCostEstimate",
    "EDSL_DEFAULT_CHARS_PER_TOKEN",
    "TokenAmount",
    "TokenRatio",
    "QuestionEstimator",
    "ZeroCostEstimator",
    "FreeTextStyleEstimator",
    "StructuredAnswerEstimator",
    "DemandEstimator",
    "MatrixEstimator",
    "DefaultEstimator",
    "DEFAULT_ESTIMATORS",
    "FileStoreEstimator",
    "JobCostEstimator",
    "TokenOverride",
    "calibrate_from_results",
]
134 changes: 134 additions & 0 deletions edsl/jobs/cost_estimation/cost_estimate_calibration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
from __future__ import annotations
from typing import TYPE_CHECKING

from .token_override import TokenOverride

if TYPE_CHECKING:
from ...results import Results


def calibrate_from_results(
    results: "Results",
    percentile: int = 75,
    by_model: bool = True,
) -> dict[str, TokenOverride | list[TokenOverride]]:
    """Derive token overrides from a pilot Results object.

    Computes the given percentile of actual output tokens per question (and
    optionally per service/model), returning a dict ready to pass as
    token_overrides to JobCostEstimator.estimate_cost().

    Calibrates both answer_tokens (from raw_model_response.{q}_output_tokens)
    and thinking_tokens (from raw_model_response.{q}_thinking_tokens) when
    thinking token data is present.

    A question whose output-token column contains no observed (non-null)
    values is omitted from the returned dict instead of being "calibrated"
    to 0 tokens from zero samples.

    Args:
        results: a completed Results object from a pilot run
        percentile: which percentile of observed output tokens to use (default 75).
            Use 50 for median or a higher value (75-90) for a conservative
            budget estimate.
        by_model: if True (default), return per-(service, model) overrides so each
            model gets its own calibrated estimate; if False, pool all models
            into one global override per question

    Returns:
        dict[str, TokenOverride | list[TokenOverride]] ready for token_overrides=.
        Keys are question names; values are a list of per-(service, model)
        overrides when by_model is True, otherwise a single pooled override.
    """
    prefix = "raw_model_response."
    output_suffix = "_output_tokens"
    thinking_suffix = "_thinking_tokens"
    service_col = "model.inference_service"
    model_col = "model.model"

    # Map question name -> full column name by stripping the shared prefix
    # and the metric-specific suffix from each matching column.
    output_cols = {
        c[len(prefix) : -len(output_suffix)]: c
        for c in results.columns
        if c.startswith(prefix) and c.endswith(output_suffix)
    }
    thinking_cols = {
        c[len(prefix) : -len(thinking_suffix)]: c
        for c in results.columns
        if c.startswith(prefix) and c.endswith(thinking_suffix)
    }

    overrides: dict[str, TokenOverride | list[TokenOverride]] = {}

    for q, output_col in output_cols.items():
        thinking_col = thinking_cols.get(q)

        if by_model:
            select_cols = [output_col, service_col, model_col]
            if thinking_col:
                select_cols.insert(1, thinking_col)
            df = results.select(*select_cols).to_pandas()
            # Rows without an observed output-token count carry no signal.
            df = df.dropna(subset=[output_col])
            entries: list[TokenOverride] = []
            for (svc, mdl), grp in df.groupby([service_col, model_col]):
                output_vals = grp[output_col].tolist()
                thinking_tokens = None
                if thinking_col:
                    thinking_vals = grp[thinking_col].dropna().tolist()
                    if thinking_vals:
                        thinking_tokens = _percentile(thinking_vals, percentile)
                entries.append(
                    TokenOverride(
                        answer_tokens=_percentile(output_vals, percentile),
                        thinking_tokens=thinking_tokens,
                        service=svc,
                        model=mdl,
                        note=f"calibrated from pilot (n={len(output_vals)}, p{percentile})",
                    )
                )
            # No groups means no usable data: emit nothing rather than an
            # empty override list for this question.
            if entries:
                overrides[q] = entries
        else:
            df = results.select(output_col).to_pandas().dropna(subset=[output_col])
            output_vals = df[output_col].tolist()
            if not output_vals:
                # Nothing was observed for this question; a 0-token override
                # with n=0 would not be a calibration, so skip it.
                continue
            thinking_tokens = None
            if thinking_col:
                thinking_df = (
                    results.select(thinking_col)
                    .to_pandas()
                    .dropna(subset=[thinking_col])
                )
                thinking_vals = thinking_df[thinking_col].tolist()
                if thinking_vals:
                    thinking_tokens = _percentile(thinking_vals, percentile)
            overrides[q] = TokenOverride(
                answer_tokens=_percentile(output_vals, percentile),
                thinking_tokens=thinking_tokens,
                note=f"calibrated from pilot (n={len(output_vals)}, p{percentile})",
            )

    return overrides


def _percentile(values: list[float], p: int) -> int:
"""Return the p-th percentile of values using linear interpolation.

Computes a float index into the sorted list, then interpolates between
the two surrounding values. This matches numpy.percentile(method='linear')
and correctly returns the average of the two middle elements for even-length
lists at p=50 (e.g. [10, 20, 30, 40] -> 25, not 30).

Args:
values: list of numeric values
p: percentile to compute, 0-100 inclusive

Returns:
Interpolated percentile value truncated to int, or 0 for an empty list.
"""
if not values:
return 0
sorted_values = sorted(values)
count = len(sorted_values)
# A float index in [0, count-1] that maps p=0 to the first element
# and p=100 to the last, with fractional positions in between.
float_index = (count - 1) * p / 100
lower_idx = int(float_index)
upper_idx = min(lower_idx + 1, count - 1)
# How far float_index sits between lower_idx and upper_idx (0.0 to 1.0).
fraction = float_index - lower_idx
interpolated = sorted_values[lower_idx] + fraction * (
sorted_values[upper_idx] - sorted_values[lower_idx]
)
return int(interpolated)
22 changes: 22 additions & 0 deletions edsl/jobs/cost_estimation/cost_estimation_constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from __future__ import annotations
from dataclasses import dataclass

# Default characters-per-token ratio.
# NOTE(review): presumably the fallback used to convert character counts into
# token estimates when no override is supplied; confirm against the callers.
EDSL_DEFAULT_CHARS_PER_TOKEN = 4


@dataclass(frozen=True)
class TokenAmount:
    """Fixed token count, independent of input length.

    Immutable (frozen) value object; _resolve_token_spec returns ``value``
    unchanged for this kind of spec.
    """

    # Absolute number of tokens.
    value: int


@dataclass(frozen=True)
class TokenRatio:
    """Token count as a fraction of input tokens.

    Immutable (frozen) value object; _resolve_token_spec turns it into a
    concrete count as ``int(input_tokens * value)``.
    """

    # Multiplier applied to the input token count.
    value: float


def _resolve_token_spec(spec: TokenAmount | TokenRatio, input_tokens: int) -> int:
    """Turn a token spec into a concrete token count.

    Args:
        spec: either a fixed amount or a ratio of the input tokens
        input_tokens: number of input tokens a ratio is applied to

    Returns:
        ``spec.value`` as-is for anything that is not a TokenRatio; for a
        TokenRatio, the product of input_tokens and the ratio, truncated
        to int.
    """
    if not isinstance(spec, TokenRatio):
        # Fixed amounts do not depend on the input length.
        return spec.value
    scaled = input_tokens * spec.value
    return int(scaled)
Loading
Loading