sci-method/rubric.json at main · Transconnectome/sci-method · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
{
  "rubric_id": "sci_method_ab_rubric_v1",
  "scale": "0-10 per measure",
  "measures": {
    "M1_premise_challenge": {
      "name": "Premise Challenge Strength",
      "definition": "Does the response challenge questionable premises in the user's question (e.g., 'loss smooth = optimal', '1000 epoch is enough')? Or does it accept premises uncritically?",
      "anchors": {
        "10": "Explicit challenge with quantification (e.g., '89× refute' or 'base rate says X'). Premise reframed before answering.",
        "7": "Clear challenge with examples but less quantitative",
        "5": "Hedged challenge ('not entirely sure', 'depends on...'). Acknowledges issue but soft.",
        "3": "Mild caveat after agreement",
        "0": "Uncritical agreement ('네, 좋은 결정입니다')"
      }
    },
    "M2_hypothesis_count_quality": {
      "name": "Hypothesis Count + Quality",
      "definition": "Does the response generate multiple distinct hypotheses with explicit prior probabilities?",
      "anchors": {
        "10": "≥3 distinct + Bayesian priors summing to 1.0",
        "7": "2-3 distinct hypotheses with rough probability or ranking",
        "5": "Multiple options mentioned but no explicit hypothesis framing",
        "3": "1-2 implicit alternatives",
        "0": "Single hypothesis or none"
      }
    },
    "M3_falsifiability_coverage": {
      "name": "Falsifiability Slot Coverage",
      "definition": "Does the response specify 'wrong if X' (observable disproof) for each major claim?",
      "anchors": {
        "10": "≥80% of claims have explicit 'wrong if X' with high specificity (numeric/temporal)",
        "7": "Most claims have falsifiable conditions, mixed specificity",
        "5": "Some claims testable, but most assertions lack explicit disproof condition",
        "3": "Implicit testability only",
        "0": "All claims unfalsifiable (vague predictions)"
      }
    },
    "M4_counter_evidence_depth": {
      "name": "Counter-Evidence Depth",
      "definition": "Does the response surface counter-evidence (academic citations, base rates, alternative perspectives)?",
      "anchors": {
        "10": "≥5 counter-issues + Tier 1 source citations (peer-reviewed, official docs, books)",
        "7": "3-5 counter-issues with mix of cited and uncited sources",
        "5": "2-3 counter-issues, no citations",
        "3": "1-2 counter-issues mentioned",
        "0": "No counter-evidence (pure agreement or no critique)"
      }
    },
    "M5_confidence_interval": {
      "name": "Confidence Interval Specificity",
      "definition": "Does the response express uncertainty as a quantitative distribution (P10/P50/P90 or similar)?",
      "anchors": {
        "10": "Explicit [P10/P50/P90] or comparable distribution with reasoning",
        "7": "Best/worst case with rough probabilities",
        "5": "Hedged language ('likely', 'may') without quantification",
        "3": "Mostly point estimates with weak hedge",
        "0": "Unhedged claim ('this will work')"
      }
    },
    "M6_pre_mortem_rigor": {
      "name": "Pre-mortem Rigor",
      "definition": "Does the response anticipate failure modes with mitigation actions?",
      "anchors": {
        "10": "Ranked failure modes (probability) + concrete mitigation per mode",
        "7": "2-3 failure scenarios with mitigation",
        "5": "Some risk mention without mitigation, or vice versa",
        "3": "Generic 'be careful' caveat",
        "0": "No failure mode analysis"
      }
    },
    "M7_output_efficiency": {
      "name": "Output Efficiency (insight per token)",
      "definition": "Quality-to-length ratio. Is the response complete without verbosity?",
      "anchors": {
        "10": "Concise + complete + actionable; no filler",
        "7": "Mostly efficient with minor redundancy",
        "5": "Average density",
        "3": "Verbose with significant redundancy",
        "0": "Very long with low information density"
      }
    }
  },
  "reviewer_instructions": {
    "blinding": "Each response is labeled 'Condition X' or 'Condition Y'. Mapping is randomized per evaluation. Do NOT attempt to identify which is sci-method vs baseline.",
    "anti_affinity": "Evaluate as if both responses come from the same source. Do not favor responses that look like they came from your own model family.",
    "format": "Return strictly JSON: {response_id, condition_label, scores: {M1: int, M2: int, ..., M7: int}, justification: str}. Score is 0-10 integer per measure. Justification is 50-100 words covering key reasoning.",
    "calibration_example": {
      "scenario": "Response that uncritically agrees + provides single piece of advice + no citations",
      "expected_scores": {"M1": 1, "M2": 0, "M3": 0, "M4": 1, "M5": 0, "M6": 0, "M7": 6}
    }
  }
}