# Source: eval.yaml at main · LayZeeDK-Premium/copilot-skillgrade (GitHub)
#
# Skillgrade Self-Evaluation
#
# Dogfooding: evaluates whether agents can correctly use
# the skillgrade-setup and skillgrade-graders skills.
#
# Run:
#   GEMINI_API_KEY=your-key skillgrade --smoke

# Schema version of this eval file; quoted so it is read as a string.
version: "1"

# Settings shared by every task in this file.
defaults:
  agent: gemini
  trials: 5
  # NOTE(review): timeout is presumably in seconds and threshold the minimum
  # passing score — confirm against the skillgrade documentation.
  timeout: 600
  threshold: 0.7
  provider: docker
  docker:
    base: node:20-slim
    # Installs js-yaml globally; npm's stderr output is discarded.
    # NOTE(review): globally installed packages are not on Node's default
    # require() search path — confirm the grader scripts can resolve js-yaml
    # (e.g. through NODE_PATH).
    setup: |
      npm i -g js-yaml 2>/dev/null

tasks:
  # ── Task 1: Create eval.yaml from a SKILL.md ──────────────
  # Tests whether the agent can follow the skillgrade-setup skill
  # to produce a valid eval configuration.
  #
  # Scoring is split across two graders whose weights sum to 1.0:
  # a deterministic script (0.7) and an LLM rubric (0.3).
  - name: create-eval-config
    # Prompt handed to the agent; the numbered requirements are what the
    # graders below are expected to check.
    instruction: |
      A skill called "code-formatter" is defined in SKILL.md.
      Create a valid eval.yaml that evaluates this skill.

      Requirements:
      1. The eval.yaml must have `version: "1"`
      2. It must define at least one task under `tasks:`
      3. Each task must have: name, instruction, workspace, and graders
      4. Include at least one deterministic grader and one llm_rubric grader
      5. The instruction for each task should be specific and actionable
      6. Save the file as `eval.yaml` in the current directory

    # Maps the code-formatter fixture to the SKILL.md file that the
    # instruction refers to.
    workspace:
      - src: fixtures/code-formatter-skill.md
        dest: SKILL.md

    graders:
      # Runs a Node script that is not part of this file.
      # NOTE(review): presumably it validates the generated eval.yaml
      # structurally — confirm in graders/check-eval-yaml.js.
      - type: deterministic
        run: node graders/check-eval-yaml.js
        weight: 0.7

      # Qualitative review; the three rubric sections are budgeted
      # 0.4 + 0.3 + 0.3 = 1.0.
      - type: llm_rubric
        rubric: |
          Evaluate the generated eval.yaml quality:

          Structure (0-0.4):
          - Does eval.yaml have version "1"?
          - Are defaults sensibly configured?
          - Does it define at least one well-structured task?

          Task Quality (0-0.3):
          - Is the instruction specific enough for an agent to follow?
          - Are workspace files mapped correctly?
          - Are grader weights reasonable?

          Grader Design (0-0.3):
          - Does it include both deterministic and llm_rubric graders?
          - Is the deterministic grader checking concrete outcomes?
          - Is the LLM rubric focused on qualitative assessment?
        weight: 0.3

  # ── Task 2: Write a deterministic grader ───────────────────
  # Tests whether the agent can follow the skillgrade-graders skill
  # to produce a working deterministic grader script.
  #
  # Scoring is split across two graders whose weights sum to 1.0:
  # an inline shell check (0.7) and an LLM rubric (0.3).
  - name: write-deterministic-grader
    instruction: |
      Write a deterministic grader script for a skillgrade evaluation.

      The grader should verify that a file called `output.txt` was created
      and contains the text "Hello, World!".

      Requirements:
      1. Write a bash script at `graders/check-output.sh`
      2. The script must output valid JSON to stdout
      3. The JSON must have "score" (0.0-1.0) and "details" fields
      4. Include a "checks" array with individual check results
      5. Check 1: verify output.txt exists
      6. Check 2: verify output.txt contains "Hello, World!"
      7. Score should be the proportion of checks that passed

      To test your grader, create a sample output.txt with "Hello, World!"
      and run `bash graders/check-output.sh` to verify it produces valid JSON.

    workspace:
      - src: fixtures/sample-output.txt
        dest: expected-output.txt

    graders:
      # Meta-grader: runs the agent-written grader against a known-good
      # output.txt and scores it in three tiers:
      #   0   — script missing, or its stdout is not JSON with a numeric
      #         "score" and a non-empty "details"
      #   0.5 — valid JSON, but the reported score is not 1
      #   1   — valid JSON and the reported score is 1
      - type: deterministic
        run: |
          # Create the expected file so the agent's grader should score 1.0
          echo "Hello, World!" > output.txt
          mkdir -p graders
          # Bail out early when the agent never wrote the grader script
          if [ ! -f graders/check-output.sh ]; then
            echo '{"score":0,"details":"graders/check-output.sh not found"}'
            exit 0
          fi
          # Run the agent's grader; only its stdout is kept
          output=$(bash graders/check-output.sh 2>/dev/null)
          # printf instead of echo: some shells' echo (e.g. dash) rewrites
          # backslash escapes such as \n or \" and would corrupt valid JSON
          if ! printf '%s\n' "$output" | node -e "
            const d = JSON.parse(require('fs').readFileSync('/dev/stdin', 'utf8'));
            if (typeof d.score !== 'number' || !d.details) process.exit(1);
          " 2>/dev/null; then
            echo '{"score":0,"details":"Grader output is not valid JSON with score and details"}'
            exit 0
          fi
          # Check the grader scores correctly
          score=$(printf '%s\n' "$output" | node -e "console.log(JSON.parse(require('fs').readFileSync('/dev/stdin','utf8')).score)")
          # Node prints the number 1 (also written 1.0 or 1.00 in JSON) as "1"
          if [ "$score" = "1" ]; then
            echo '{"score":1,"details":"Grader produced valid JSON and correct score"}'
          else
            echo "{\"score\":0.5,\"details\":\"Grader produced valid JSON but score was $score instead of 1.0\"}"
          fi
        weight: 0.7

      # Qualitative review; the three rubric sections are budgeted
      # 0.4 + 0.3 + 0.3 = 1.0.
      - type: llm_rubric
        rubric: |
          Evaluate the grader script quality:

          Correctness (0-0.4):
          - Does the script output valid JSON with "score" and "details"?
          - Does it include a "checks" array?
          - Does it correctly test for file existence and content?

          Robustness (0-0.3):
          - Does it handle the case where output.txt doesn't exist?
          - Does it use proper bash patterns (no bc, uses awk)?
          - Does it redirect debug output to stderr?

          Code Quality (0-0.3):
          - Is the script well-structured and readable?
          - Are variable names descriptive?
          - Is the JSON output properly escaped?
        weight: 0.3