-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrun.sh
More file actions
executable file
·188 lines (164 loc) · 6.79 KB
/
run.sh
File metadata and controls
executable file
·188 lines (164 loc) · 6.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
#!/usr/bin/env bash
set -euo pipefail
# Automated pipeline wrapper around inspect eval-set.
# Reads configuration from pipeline/config.env (already exported by Makefile).
#
# Usage:
#   make run            # run all model x persona combinations
#   make run DRY_RUN=1  # print commands without executing
REPO_ROOT="$(cd "$(dirname "$0")" && pwd)"
source "$REPO_ROOT/scripts/run-experiments/setup_aws_env.sh"

TASK="eval_task.py@data_science_benchmark"
LIMIT="${LIMIT:-}"      # set LIMIT=1 for smoke tests
DRY_RUN="${DRY_RUN:-}"  # non-empty => only print commands, do not execute

# Resolve run directory from .generated/run_id.txt (or RUN_ID env var).
if [[ -z "${RUN_ID:-}" ]]; then
  RUN_ID_FILE="$REPO_ROOT/.generated/run_id.txt"
  if [[ -f "$RUN_ID_FILE" ]]; then
    RUN_ID="$(tr -d '[:space:]' < "$RUN_ID_FILE")"
  else
    echo "ERROR: No run ID found. Run 'make prepare' first." >&2
    exit 1
  fi
fi
RUN_DIR="$REPO_ROOT/runs/$RUN_ID"
echo "Run ID: $RUN_ID"
echo "Run dir: $RUN_DIR"

# datasets.json lives inside the run directory (created by `make prepare`).
DATASET_PATH_ARG="$RUN_DIR/datasets.json"
if [[ ! -f "$DATASET_PATH_ARG" ]]; then
  echo "ERROR: datasets.json not found at $DATASET_PATH_ARG. Run 'make prepare' first." >&2
  exit 1
fi
echo "Using dataset file: $DATASET_PATH_ARG"

# Fail with a clear message if the Makefile did not export the matrix vars;
# under 'set -u' an unset var would otherwise abort at the 'read' below with
# an opaque "unbound variable" error.
: "${MODELS:?MODELS must be set (space-separated model list; see pipeline/config.env)}"
: "${PERSONAS:?PERSONAS must be set (space-separated persona list; see pipeline/config.env)}"
read -ra MODEL_ARR <<< "$MODELS"
read -ra PERSONA_ARR <<< "$PERSONAS"

# All workspaces go inside the run directory.
# WORKSPACE_MIRROR_DIR is read by compose.yaml for the Docker volume mount.
export WORKSPACE_MIRROR_DIR="$RUN_DIR/workspaces"
export INSPECT_EVAL_WORKSPACE_DIR="$WORKSPACE_MIRROR_DIR"
mkdir -p "$WORKSPACE_MIRROR_DIR"
# BUG FIX: per-model logs are appended under runlogs/ (via 'tee -a'), but the
# directory was never created anywhere in this script; the first tee would
# fail. Create it up front.
mkdir -p "$RUN_DIR/runlogs"
# ---------------------------------------------------------------------------
# run_model_personas: run all personas for a single model (sequential).
# Called once per model, potentially in parallel across models.
#
# Globals (read): RUN_DIR, PERSONA_ARR, TASK, JUDGE_MODEL, DATASET_PATH_ARG,
#   EPOCHS, MAX_CONNS, MAX_RETRIES, TIMEOUT, RETRY_ATTEMPTS, RETRY_WAIT,
#   RETRY_CONN, MAX_TASKS, LIMIT, DRY_RUN, INSPECT_EVAL_WORKSPACE_DIR
# Arguments: $1 - model identifier (e.g. provider/model-name)
# Outputs:   appends all progress to $RUN_DIR/runlogs/<safe-model>.log
# Returns:   2 when a retry was requested but no retriable logs exist;
#            otherwise the status of the last inspect invocation.
# ---------------------------------------------------------------------------
run_model_personas() {
  local model="$1"
  local SAFE_MODEL
  # Sanitize the model name so it is safe to use as a path component.
  SAFE_MODEL=$(echo "$model" | sed 's/[^A-Za-z0-9._-]/_/g')
  local MODEL_LOG="$RUN_DIR/runlogs/${SAFE_MODEL}.log"
  # BUG FIX: ensure the log directory exists before the first 'tee -a';
  # nothing else guarantees runlogs/ has been created.
  mkdir -p "$RUN_DIR/runlogs"
  for prompt in "${PERSONA_ARR[@]}"; do
    echo "==========================================" | tee -a "$MODEL_LOG"
    echo "Eval-Set: model=${model} judge=${JUDGE_MODEL} prompt=${prompt}" | tee -a "$MODEL_LOG"
    echo "==========================================" | tee -a "$MODEL_LOG"
    local LOG_DIR="$RUN_DIR/logs/${SAFE_MODEL}/${prompt}"
    # Read eval-set-id if present (for retry)
    local EVAL_SET_ID=""
    if [[ -f "$LOG_DIR/.eval-set-id" ]]; then
      EVAL_SET_ID="$(tr -d '[:space:]' < "$LOG_DIR/.eval-set-id")"
    fi
    echo "Log dir: $LOG_DIR" | tee -a "$MODEL_LOG"
    if [[ -z "$EVAL_SET_ID" ]]; then
      echo "No .eval-set-id found -> running fresh eval-set" | tee -a "$MODEL_LOG"
      # CMD_STR mirrors the real invocation below and is only used for the
      # dry-run printout; keep the two in sync when editing flags.
      local CMD_STR="DOCKER_BUILDKIT=1 inspect eval-set $TASK --model $model --model-role judge=$JUDGE_MODEL -T prompt=$prompt"
      [[ -n "$DATASET_PATH_ARG" ]] && CMD_STR+=" -T dataset_path=$DATASET_PATH_ARG"
      CMD_STR+=" --epochs $EPOCHS --max-connections $MAX_CONNS --max-retries $MAX_RETRIES --timeout $TIMEOUT"
      CMD_STR+=" --retry-attempts $RETRY_ATTEMPTS --retry-wait $RETRY_WAIT --retry-connections $RETRY_CONN"
      CMD_STR+=" --max-tasks $MAX_TASKS --no-fail-on-error True --retry-on-error 5"
      [[ -n "${LIMIT:-}" ]] && CMD_STR+=" --limit $LIMIT"
      CMD_STR+=" --log-dir $LOG_DIR"
      if [[ -n "$DRY_RUN" ]]; then
        echo "[DRY RUN] $CMD_STR" | tee -a "$MODEL_LOG"
      else
        DOCKER_BUILDKIT=1 inspect eval-set "$TASK" \
          --model "$model" \
          --model-role judge="$JUDGE_MODEL" \
          -T prompt="$prompt" \
          ${DATASET_PATH_ARG:+-T dataset_path="$DATASET_PATH_ARG"} \
          --epochs "$EPOCHS" \
          --max-connections "$MAX_CONNS" \
          --max-retries "$MAX_RETRIES" \
          --timeout "$TIMEOUT" \
          --retry-attempts "$RETRY_ATTEMPTS" \
          --retry-wait "$RETRY_WAIT" \
          --retry-connections "$RETRY_CONN" \
          --max-tasks "$MAX_TASKS" \
          --no-fail-on-error True \
          --retry-on-error 5 \
          ${LIMIT:+--limit "$LIMIT"} \
          --log-dir "$LOG_DIR" 2>&1 | tee -a "$MODEL_LOG"
      fi
    else
      echo "Found eval-set-id: $EVAL_SET_ID -> retrying failed items" | tee -a "$MODEL_LOG"
      if [[ -n "$DRY_RUN" ]]; then
        echo "[DRY RUN] inspect eval-retry --log-dir $LOG_DIR ..." | tee -a "$MODEL_LOG"
      else
        # Collect retriable logs: *.eval plus *.json files that contain an
        # "eval" key (skipping the eval-set index file logs.json).
        # BUG FIX: 'nullglob' was enabled and never restored, leaking into
        # the rest of the shell; scope it to this glob only.
        shopt -s nullglob
        local CANDIDATES=( "$LOG_DIR"/*.eval "$LOG_DIR"/*.json )
        shopt -u nullglob
        local CLEANED=()
        local f base
        for f in "${CANDIDATES[@]}"; do
          base="$(basename "$f")"
          [[ "$base" == "logs.json" ]] && continue
          if [[ "$f" == *.json ]]; then
            if jq -e 'has("eval")' "$f" >/dev/null 2>&1; then
              CLEANED+=("$f")
            fi
          else
            CLEANED+=("$f")
          fi
        done
        if [[ ${#CLEANED[@]} -eq 0 ]]; then
          echo "ERROR: No retriable eval logs in $LOG_DIR. Run a fresh eval-set first." | tee -a "$MODEL_LOG"
          return 2
        fi
        DOCKER_BUILDKIT=1 inspect eval-retry \
          --log-dir "$LOG_DIR" \
          --retry-on-error=3 \
          --no-fail-on-error True \
          --max-connections "$MAX_CONNS" \
          --timeout "$TIMEOUT" \
          "${CLEANED[@]}" 2>&1 | tee -a "$MODEL_LOG"
      fi
    fi
    if [[ -z "$DRY_RUN" ]]; then
      # Containers may create root-owned files in the workspace; reclaim
      # ownership (best-effort: ignore failures, e.g. no sudo available).
      sudo chown -R "$(whoami):$(id -gn)" "$INSPECT_EVAL_WORKSPACE_DIR" 2>/dev/null || true
    fi
    echo | tee -a "$MODEL_LOG"
  done
}
# Export the worker function and its configuration for child shells.
export -f run_model_personas
export TASK JUDGE_MODEL DATASET_PATH_ARG EPOCHS MAX_CONNS MAX_RETRIES
export TIMEOUT RETRY_ATTEMPTS RETRY_WAIT RETRY_CONN MAX_TASKS LIMIT DRY_RUN
# NOTE(review): bash cannot export array variables, so 'export PERSONA_ARR'
# is a no-op. It is harmless here because run_model_personas is launched
# with '&' (a forked subshell), which inherits arrays without export — but
# this would break if the launch ever switched to xargs/parallel/exec.
export REPO_ROOT PERSONA_ARR RUN_DIR RUN_ID WORKSPACE_MIRROR_DIR INSPECT_EVAL_WORKSPACE_DIR
# ---------------------------------------------------------------------------
# Launch all models in parallel (different models use separate compute).
# Personas within each model run sequentially.
# ---------------------------------------------------------------------------
MODEL_PIDS=()
for model in "${MODEL_ARR[@]}"; do
  SAFE_MODEL=$(echo "$model" | sed 's/[^A-Za-z0-9._-]/_/g')
  echo "Launching model: $model (log: $RUN_DIR/runlogs/${SAFE_MODEL}.log)"
  run_model_personas "$model" &
  MODEL_PIDS+=($!)
done
echo ""
echo "All ${#MODEL_ARR[@]} models launched in parallel (PIDs: ${MODEL_PIDS[*]})"
echo "Waiting for all to finish..."
# Wait for each job individually so failures can be counted; note that a
# failing background job does not trip 'set -e' on its own — the explicit
# 'wait "$pid"' is what surfaces its exit status.
FAILED=0
for pid in "${MODEL_PIDS[@]}"; do
  if ! wait "$pid"; then
    echo "WARNING: Model process $pid exited with error"
    FAILED=$((FAILED + 1))
  fi
done
echo ""
if [[ $FAILED -gt 0 ]]; then
  echo "$FAILED model(s) had errors. Check individual model logs in $RUN_DIR/runlogs/"
else
  echo "All eval-sets completed successfully!"
fi
if [[ -z "$DRY_RUN" ]]; then
  "$REPO_ROOT/scripts/run-experiments/cleanup_permissions.sh"
fi
# BUG FIX: the script previously exited 0 even when some models failed, so
# 'make run' (and any CI wrapper) could not detect eval errors. Propagate
# failure after cleanup has run.
if [[ $FAILED -gt 0 ]]; then
  exit 1
fi