{
  "index": "AEQUARA AI Trust Index",
  "version": "v1.2",
  "as_of": "2026-06-10",
  "generator": "ai-genome/trust_index.py",
  "sources": {
    "knowledge": "data/calibration_pairs.csv",
    "forecasting": "forecasting/data/forecasting_pairs.csv"
  },
  "trust_formula": "round(100 * (1 - mean ECE over knowledge+forecasting))",
  "bootstrap_B": 600,
  "seed": 20260606,
  "metrics": [
    "ece",
    "ece_adaptive",
    "mce",
    "brier",
    "brier_rel",
    "brier_res",
    "brier_unc",
    "slope",
    "intercept",
    "auroc",
    "recal_ece",
    "ece_reduction"
  ],
  "n_models": 18,
  "n_pairs": 13171,
  "min_pairs_per_substrate": 100,
  "excluded": [],
  "domains": {
    "knowledge": [
      "econ_business",
      "humanities_social",
      "law_politics",
      "medicine_health",
      "stem"
    ],
    "forecasting": [
      "economics",
      "health",
      "politics",
      "science_tech",
      "society"
    ]
  },
  "ranking_axes": [
    {
      "id": "trust",
      "label": "Trust",
      "key": "trust",
      "scope": "model",
      "dir": "desc",
      "help": "100\u00b7(1 \u2212 mean ECE), knowledge + forecasting"
    },
    {
      "id": "honest",
      "label": "Honest confidence",
      "key": "slope",
      "scope": "sub",
      "dir": "near1",
      "help": "logistic slope \u2014 1.0 ideal, <1 overconfident, >1 under"
    },
    {
      "id": "tail",
      "label": "Tail \u226590%",
      "key": "tail.ece",
      "scope": "sub",
      "dir": "asc",
      "help": "calibration error when it claims \u226590% confidence"
    },
    {
      "id": "auroc",
      "label": "Discrimination",
      "key": "auroc",
      "scope": "sub",
      "dir": "desc",
      "help": "AUROC \u2014 ranks correct above incorrect"
    },
    {
      "id": "resolution",
      "label": "Resolution",
      "key": "brier_res",
      "scope": "sub",
      "dir": "desc",
      "help": "how much its probabilities move with outcomes"
    },
    {
      "id": "stability",
      "label": "Cross-domain",
      "key": "xdom_stability",
      "scope": "model",
      "dir": "asc",
      "help": "spread of ECE across domains \u2014 lower = trustworthy everywhere"
    },
    {
      "id": "fixable",
      "label": "Fixability",
      "key": "ece_reduction",
      "scope": "sub",
      "dir": "desc",
      "help": "ECE removed by the 5-fold isotonic bias-corrector"
    },
    {
      "id": "selective",
      "label": "Selective risk",
      "key": "aurc",
      "scope": "sub",
      "dir": "asc",
      "help": "risk-coverage area \u2014 lower = confidence is a better stop-signal"
    },
    {
      "id": "cost",
      "label": "Cost / call",
      "key": "usd_per_call",
      "scope": "model",
      "dir": "asc",
      "help": "list price per call (anchor, not per-run)"
    }
  ],
  "behavioral_status": {
    "sycophancy": {
      "included": false,
      "median_n": 12,
      "spread": 0.0,
      "distinct": 1,
      "reason": "under-covered (median n=12 < 100); no discrimination (spread=0.0, 1 distinct)"
    },
    "abstention": {
      "included": false,
      "median_n": 16,
      "spread": 0.0,
      "distinct": 1,
      "reason": "under-covered (median n=16 < 100); no discrimination (spread=0.0, 1 distinct)"
    },
    "hallucination": {
      "included": false,
      "median_n": 10,
      "spread": 0.3,
      "distinct": 4,
      "reason": "under-covered (median n=10 < 100)"
    }
  },
  "lineage": {
    "Claude Opus": [
      {
        "alias": "opus-4-5",
        "display": "Claude Opus 4.5",
        "version": "4.5",
        "rank": 11,
        "trust": 89,
        "trust_ci": [
          87.4,
          90.9
        ],
        "fc_ece": 0.0824,
        "fc_slope": 1.0411,
        "fc_auroc": 0.9073,
        "kn_ece": 0.1281
      },
      {
        "alias": "opus-4-7",
        "display": "Claude Opus 4.7",
        "version": "4.7",
        "rank": 5,
        "trust": 92,
        "trust_ci": [
          89.1,
          93.0
        ],
        "fc_ece": 0.0543,
        "fc_slope": 0.7896,
        "fc_auroc": 0.8784,
        "kn_ece": 0.1095
      },
      {
        "alias": "opus",
        "display": "Claude Opus 4.8",
        "version": "4.8",
        "rank": 6,
        "trust": 92,
        "trust_ci": [
          89.1,
          92.9
        ],
        "fc_ece": 0.0676,
        "fc_slope": 0.84,
        "fc_auroc": 0.8943,
        "kn_ece": 0.0997
      }
    ],
    "Claude Sonnet": [
      {
        "alias": "sonnet-4-5",
        "display": "Claude Sonnet 4.5",
        "version": "4.5",
        "rank": 8,
        "trust": 91,
        "trust_ci": [
          88.2,
          91.8
        ],
        "fc_ece": 0.0688,
        "fc_slope": 0.9173,
        "fc_auroc": 0.8584,
        "kn_ece": 0.1176
      },
      {
        "alias": "sonnet",
        "display": "Claude Sonnet 4.6",
        "version": "4.6",
        "rank": 12,
        "trust": 89,
        "trust_ci": [
          87.0,
          90.6
        ],
        "fc_ece": 0.063,
        "fc_slope": 1.0875,
        "fc_auroc": 0.9042,
        "kn_ece": 0.1503
      }
    ]
  },
  "models": [
    {
      "rank": 1,
      "model": "fable-5",
      "display": "Claude Fable 5",
      "provider": "Anthropic",
      "flag": null,
      "trust": 94,
      "trust_ci": [
        91.8,
        94.9
      ],
      "comb_ece": 0.0606,
      "xdom_stability": 0.1006,
      "usd_per_call": 0.236,
      "latency_s": 17,
      "cost_tier": "premium",
      "pareto_trust_cost": false,
      "knowledge": {
        "n": 290,
        "acc": 0.9414,
        "overconf": -0.0713,
        "ece": 0.0741,
        "ece_adaptive": 0.0713,
        "mce": 0.4,
        "brier": 0.0494,
        "brier_rel": 0.0097,
        "brier_res": 0.015,
        "brier_unc": 0.0552,
        "slope": 1.4818,
        "intercept": 0.4247,
        "auroc": 0.8483,
        "recal_ece": 0.0262,
        "ece_reduction": 0.0479,
        "sharpness": 0.0161,
        "tail": {
          "n": 172,
          "thresh": 0.9,
          "conf": 0.9473,
          "acc": 0.9884,
          "gap": -0.0411,
          "ece": 0.0411
        },
        "aurc": 0.0136,
        "by_domain": {
          "econ_business": {
            "n": 80,
            "ece": 0.0746,
            "acc": 0.9
          },
          "humanities_social": {
            "n": 80,
            "ece": 0.0814,
            "acc": 0.95
          },
          "law_politics": {
            "n": 80,
            "ece": 0.0969,
            "acc": 0.9625
          },
          "medicine_health": {
            "n": 5,
            "ece": 0.192,
            "acc": 1.0
          },
          "stem": {
            "n": 45,
            "ece": 0.0844,
            "acc": 0.9556
          }
        },
        "reliability": [
          {
            "mid": 0.25,
            "p": 0.2833,
            "o": 0.3333,
            "n": 3
          },
          {
            "mid": 0.45,
            "p": 0.4,
            "o": 0.0,
            "n": 1
          },
          {
            "mid": 0.55,
            "p": 0.55,
            "o": 0.5714,
            "n": 14
          },
          {
            "mid": 0.65,
            "p": 0.6893,
            "o": 0.9286,
            "n": 14
          },
          {
            "mid": 0.75,
            "p": 0.75,
            "o": 1.0,
            "n": 16
          },
          {
            "mid": 0.85,
            "p": 0.8397,
            "o": 0.9286,
            "n": 70
          },
          {
            "mid": 0.95,
            "p": 0.9473,
            "o": 0.9884,
            "n": 172
          }
        ]
      },
      "forecasting": {
        "n": 413,
        "acc": 0.3438,
        "overconf": 0.0435,
        "ece": 0.0471,
        "ece_adaptive": 0.0436,
        "mce": 0.35,
        "brier": 0.0743,
        "brier_rel": 0.0061,
        "brier_res": 0.1577,
        "brier_unc": 0.2256,
        "slope": 0.9312,
        "intercept": -0.6257,
        "auroc": 0.9481,
        "recal_ece": 0.0508,
        "ece_reduction": -0.0037,
        "sharpness": 0.1707,
        "tail": {
          "n": 116,
          "thresh": 0.9,
          "conf": 0.954,
          "acc": 0.9052,
          "gap": 0.0488,
          "ece": 0.0488
        },
        "aurc": 0.3507,
        "by_domain": {
          "economics": {
            "n": 90,
            "ece": 0.1104,
            "acc": 0.3222
          },
          "health": {
            "n": 53,
            "ece": 0.0768,
            "acc": 0.3774
          },
          "politics": {
            "n": 90,
            "ece": 0.0334,
            "acc": 0.2556
          },
          "science_tech": {
            "n": 90,
            "ece": 0.0821,
            "acc": 0.3333
          },
          "society": {
            "n": 90,
            "ece": 0.0267,
            "acc": 0.4444
          }
        },
        "reliability": [
          {
            "mid": 0.05,
            "p": 0.0382,
            "o": 0.0246,
            "n": 203
          },
          {
            "mid": 0.15,
            "p": 0.1162,
            "o": 0.0769,
            "n": 26
          },
          {
            "mid": 0.25,
            "p": 0.2476,
            "o": 0.1429,
            "n": 21
          },
          {
            "mid": 0.35,
            "p": 0.35,
            "o": 0.0,
            "n": 2
          },
          {
            "mid": 0.45,
            "p": 0.4167,
            "o": 0.6667,
            "n": 3
          },
          {
            "mid": 0.55,
            "p": 0.5667,
            "o": 0.3333,
            "n": 9
          },
          {
            "mid": 0.65,
            "p": 0.6775,
            "o": 0.375,
            "n": 8
          },
          {
            "mid": 0.75,
            "p": 0.735,
            "o": 0.5,
            "n": 4
          },
          {
            "mid": 0.85,
            "p": 0.8533,
            "o": 0.8095,
            "n": 21
          },
          {
            "mid": 0.95,
            "p": 0.954,
            "o": 0.9052,
            "n": 116
          }
        ]
      },
      "behavioral": null
    },
    {
      "rank": 2,
      "model": "deepseek",
      "display": "DeepSeek-V3",
      "provider": "DeepSeek",
      "flag": null,
      "trust": 94,
      "trust_ci": [
        91.0,
        94.9
      ],
      "comb_ece": 0.0607,
      "xdom_stability": 0.0554,
      "usd_per_call": 0.0027,
      "latency_s": 13,
      "cost_tier": "cheap",
      "pareto_trust_cost": true,
      "knowledge": {
        "n": 400,
        "acc": 0.89,
        "overconf": -0.0469,
        "ece": 0.0468,
        "ece_adaptive": 0.0468,
        "mce": 0.2,
        "brier": 0.0931,
        "brier_rel": 0.0028,
        "brier_res": 0.0076,
        "brier_unc": 0.0979,
        "slope": 1.232,
        "intercept": 0.1137,
        "auroc": 0.7533,
        "recal_ece": 0.0268,
        "ece_reduction": 0.0201,
        "sharpness": 0.0103,
        "tail": {
          "n": 173,
          "thresh": 0.9,
          "conf": 0.931,
          "acc": 0.9769,
          "gap": -0.0459,
          "ece": 0.0459
        },
        "aurc": 0.0415,
        "by_domain": {
          "econ_business": {
            "n": 80,
            "ece": 0.0839,
            "acc": 0.8875
          },
          "humanities_social": {
            "n": 80,
            "ece": 0.0389,
            "acc": 0.9
          },
          "law_politics": {
            "n": 80,
            "ece": 0.0851,
            "acc": 0.9375
          },
          "medicine_health": {
            "n": 80,
            "ece": 0.0716,
            "acc": 0.8875
          },
          "stem": {
            "n": 80,
            "ece": 0.0535,
            "acc": 0.8375
          }
        },
        "reliability": [
          {
            "mid": 0.25,
            "p": 0.3,
            "o": 0.5,
            "n": 2
          },
          {
            "mid": 0.55,
            "p": 0.5889,
            "o": 0.7778,
            "n": 9
          },
          {
            "mid": 0.65,
            "p": 0.6895,
            "o": 0.7442,
            "n": 43
          },
          {
            "mid": 0.75,
            "p": 0.75,
            "o": 0.8,
            "n": 35
          },
          {
            "mid": 0.85,
            "p": 0.829,
            "o": 0.8623,
            "n": 138
          },
          {
            "mid": 0.95,
            "p": 0.931,
            "o": 0.9769,
            "n": 173
          }
        ]
      },
      "forecasting": {
        "n": 445,
        "acc": 0.3506,
        "overconf": 0.0552,
        "ece": 0.0745,
        "ece_adaptive": 0.0681,
        "mce": 0.2158,
        "brier": 0.167,
        "brier_rel": 0.0097,
        "brier_res": 0.0691,
        "brier_unc": 0.2277,
        "slope": 0.6416,
        "intercept": -0.4714,
        "auroc": 0.8177,
        "recal_ece": 0.057,
        "ece_reduction": 0.0175,
        "sharpness": 0.1088,
        "tail": {
          "n": 64,
          "thresh": 0.9,
          "conf": 0.9502,
          "acc": 0.7344,
          "gap": 0.2158,
          "ece": 0.2158
        },
        "aurc": 0.4303,
        "by_domain": {
          "economics": {
            "n": 90,
            "ece": 0.1153,
            "acc": 0.3222
          },
          "health": {
            "n": 86,
            "ece": 0.1331,
            "acc": 0.3953
          },
          "politics": {
            "n": 90,
            "ece": 0.1089,
            "acc": 0.2556
          },
          "science_tech": {
            "n": 90,
            "ece": 0.1182,
            "acc": 0.3333
          },
          "society": {
            "n": 89,
            "ece": 0.0685,
            "acc": 0.4494
          }
        },
        "reliability": [
          {
            "mid": 0.05,
            "p": 0.0461,
            "o": 0.1061,
            "n": 66
          },
          {
            "mid": 0.15,
            "p": 0.1212,
            "o": 0.087,
            "n": 92
          },
          {
            "mid": 0.25,
            "p": 0.25,
            "o": 0.2065,
            "n": 92
          },
          {
            "mid": 0.35,
            "p": 0.35,
            "o": 0.3333,
            "n": 3
          },
          {
            "mid": 0.45,
            "p": 0.4061,
            "o": 0.4146,
            "n": 41
          },
          {
            "mid": 0.55,
            "p": 0.5893,
            "o": 0.5714,
            "n": 14
          },
          {
            "mid": 0.65,
            "p": 0.6922,
            "o": 0.5625,
            "n": 32
          },
          {
            "mid": 0.75,
            "p": 0.75,
            "o": 0.6667,
            "n": 3
          },
          {
            "mid": 0.85,
            "p": 0.8474,
            "o": 0.7632,
            "n": 38
          },
          {
            "mid": 0.95,
            "p": 0.9502,
            "o": 0.7344,
            "n": 64
          }
        ]
      },
      "behavioral": {
        "sycophancy": 0.0,
        "syc_conf_erosion": 0.0,
        "n_syc": 4,
        "n_syc_held": 4,
        "abstention": 1.0,
        "n_abs": 16,
        "abs_attempt_answerable": 1.0,
        "abs_abstain_unanswerable": 1.0,
        "hallucination": 0.1,
        "n_hal": 10,
        "hal_overreject_controls": 0.0
      }
    },
    {
      "rank": 3,
      "model": "deepseek-r1",
      "display": "DeepSeek-R1",
      "provider": "DeepSeek",
      "flag": null,
      "trust": 92,
      "trust_ci": [
        89.9,
        93.7
      ],
      "comb_ece": 0.0756,
      "xdom_stability": 0.0472,
      "usd_per_call": 0.004,
      "latency_s": 30,
      "cost_tier": "cheap",
      "pareto_trust_cost": false,
      "knowledge": {
        "n": 370,
        "acc": 0.9432,
        "overconf": -0.0729,
        "ece": 0.0729,
        "ece_adaptive": 0.0729,
        "mce": 0.2667,
        "brier": 0.0534,
        "brier_rel": 0.0086,
        "brier_res": 0.008,
        "brier_unc": 0.0535,
        "slope": 1.1956,
        "intercept": 0.7719,
        "auroc": 0.8053,
        "recal_ece": 0.018,
        "ece_reduction": 0.0549,
        "sharpness": 0.0153,
        "tail": {
          "n": 236,
          "thresh": 0.9,
          "conf": 0.9402,
          "acc": 0.9746,
          "gap": -0.0344,
          "ece": 0.0344
        },
        "aurc": 0.015,
        "by_domain": {
          "econ_business": {
            "n": 80,
            "ece": 0.0986,
            "acc": 0.925
          },
          "humanities_social": {
            "n": 80,
            "ece": 0.0669,
            "acc": 0.9375
          },
          "law_politics": {
            "n": 80,
            "ece": 0.0841,
            "acc": 0.95
          },
          "medicine_health": {
            "n": 70,
            "ece": 0.0703,
            "acc": 0.9429
          },
          "stem": {
            "n": 60,
            "ece": 0.0612,
            "acc": 0.9667
          }
        },
        "reliability": [
          {
            "mid": 0.25,
            "p": 0.2375,
            "o": 0.25,
            "n": 4
          },
          {
            "mid": 0.45,
            "p": 0.4,
            "o": 0.6667,
            "n": 3
          },
          {
            "mid": 0.55,
            "p": 0.59,
            "o": 0.8,
            "n": 10
          },
          {
            "mid": 0.65,
            "p": 0.6967,
            "o": 0.8333,
            "n": 30
          },
          {
            "mid": 0.75,
            "p": 0.75,
            "o": 1.0,
            "n": 6
          },
          {
            "mid": 0.85,
            "p": 0.8235,
            "o": 0.9506,
            "n": 81
          },
          {
            "mid": 0.95,
            "p": 0.9402,
            "o": 0.9746,
            "n": 236
          }
        ]
      },
      "forecasting": {
        "n": 447,
        "acc": 0.349,
        "overconf": 0.0691,
        "ece": 0.0783,
        "ece_adaptive": 0.0798,
        "mce": 0.1344,
        "brier": 0.1324,
        "brier_rel": 0.0086,
        "brier_res": 0.1035,
        "brier_unc": 0.2272,
        "slope": 0.754,
        "intercept": -0.6096,
        "auroc": 0.8667,
        "recal_ece": 0.0491,
        "ece_reduction": 0.0291,
        "sharpness": 0.1342,
        "tail": {
          "n": 96,
          "thresh": 0.9,
          "conf": 0.9469,
          "acc": 0.8125,
          "gap": 0.1344,
          "ece": 0.1344
        },
        "aurc": 0.3878,
        "by_domain": {
          "economics": {
            "n": 90,
            "ece": 0.1539,
            "acc": 0.3222
          },
          "health": {
            "n": 87,
            "ece": 0.104,
            "acc": 0.3908
          },
          "politics": {
            "n": 90,
            "ece": 0.1081,
            "acc": 0.2556
          },
          "science_tech": {
            "n": 90,
            "ece": 0.114,
            "acc": 0.3333
          },
          "society": {
            "n": 90,
            "ece": 0.0969,
            "acc": 0.4444
          }
        },
        "reliability": [
          {
            "mid": 0.05,
            "p": 0.0427,
            "o": 0.0714,
            "n": 70
          },
          {
            "mid": 0.15,
            "p": 0.1205,
            "o": 0.0846,
            "n": 130
          },
          {
            "mid": 0.25,
            "p": 0.2305,
            "o": 0.1017,
            "n": 59
          },
          {
            "mid": 0.35,
            "p": 0.35,
            "o": 0.3333,
            "n": 6
          },
          {
            "mid": 0.45,
            "p": 0.4317,
            "o": 0.4167,
            "n": 12
          },
          {
            "mid": 0.55,
            "p": 0.5643,
            "o": 0.5714,
            "n": 7
          },
          {
            "mid": 0.65,
            "p": 0.69,
            "o": 0.6,
            "n": 20
          },
          {
            "mid": 0.75,
            "p": 0.75,
            "o": 0.75,
            "n": 4
          },
          {
            "mid": 0.85,
            "p": 0.8298,
            "o": 0.6977,
            "n": 43
          },
          {
            "mid": 0.95,
            "p": 0.9469,
            "o": 0.8125,
            "n": 96
          }
        ]
      },
      "behavioral": {
        "sycophancy": 0.0,
        "syc_conf_erosion": 0.0,
        "n_syc": 4,
        "n_syc_held": 4,
        "abstention": 1.0,
        "n_abs": 16,
        "abs_attempt_answerable": 1.0,
        "abs_abstain_unanswerable": 1.0,
        "hallucination": 0.0,
        "n_hal": 10,
        "hal_overreject_controls": 0.0
      }
    },
    {
      "rank": 4,
      "model": "gemini",
      "display": "Gemini 2.5 Flash",
      "provider": "Google",
      "flag": null,
      "trust": 92,
      "trust_ci": [
        89.6,
        93.9
      ],
      "comb_ece": 0.0788,
      "xdom_stability": 0.0534,
      "usd_per_call": 0.0048,
      "latency_s": 47,
      "cost_tier": "cheap",
      "pareto_trust_cost": false,
      "knowledge": {
        "n": 312,
        "acc": 0.9167,
        "overconf": -0.0038,
        "ece": 0.0314,
        "ece_adaptive": 0.03,
        "mce": 0.6,
        "brier": 0.0572,
        "brier_rel": 0.0027,
        "brier_res": 0.0214,
        "brier_unc": 0.0764,
        "slope": 1.4106,
        "intercept": -0.8002,
        "auroc": 0.8577,
        "recal_ece": 0.0158,
        "ece_reduction": 0.0156,
        "sharpness": 0.0159,
        "tail": {
          "n": 257,
          "thresh": 0.9,
          "conf": 0.9498,
          "acc": 0.9689,
          "gap": -0.019,
          "ece": 0.019
        },
        "aurc": 0.0196,
        "by_domain": {
          "econ_business": {
            "n": 61,
            "ece": 0.0385,
            "acc": 0.8852
          },
          "humanities_social": {
            "n": 62,
            "ece": 0.0355,
            "acc": 0.9355
          },
          "law_politics": {
            "n": 70,
            "ece": 0.0661,
            "acc": 0.8857
          },
          "medicine_health": {
            "n": 59,
            "ece": 0.0547,
            "acc": 0.9661
          },
          "stem": {
            "n": 60,
            "ece": 0.0438,
            "acc": 0.9167
          }
        },
        "reliability": [
          {
            "mid": 0.05,
            "p": 0.03,
            "o": 0.0,
            "n": 3
          },
          {
            "mid": 0.15,
            "p": 0.15,
            "o": 0.0,
            "n": 2
          },
          {
            "mid": 0.45,
            "p": 0.4,
            "o": 1.0,
            "n": 1
          },
          {
            "mid": 0.65,
            "p": 0.6833,
            "o": 0.6667,
            "n": 3
          },
          {
            "mid": 0.75,
            "p": 0.75,
            "o": 0.625,
            "n": 8
          },
          {
            "mid": 0.85,
            "p": 0.8387,
            "o": 0.7632,
            "n": 38
          },
          {
            "mid": 0.95,
            "p": 0.9498,
            "o": 0.9689,
            "n": 257
          }
        ]
      },
      "forecasting": {
        "n": 442,
        "acc": 0.3484,
        "overconf": 0.1263,
        "ece": 0.1263,
        "ece_adaptive": 0.1288,
        "mce": 0.25,
        "brier": 0.1628,
        "brier_rel": 0.0216,
        "brier_res": 0.0861,
        "brier_unc": 0.227,
        "slope": 0.7678,
        "intercept": -0.8685,
        "auroc": 0.8533,
        "recal_ece": 0.0546,
        "ece_reduction": 0.0717,
        "sharpness": 0.1254,
        "tail": {
          "n": 87,
          "thresh": 0.9,
          "conf": 0.9408,
          "acc": 0.7471,
          "gap": 0.1937,
          "ece": 0.1937
        },
        "aurc": 0.4111,
        "by_domain": {
          "economics": {
            "n": 90,
            "ece": 0.1426,
            "acc": 0.3222
          },
          "health": {
            "n": 90,
            "ece": 0.1333,
            "acc": 0.3889
          },
          "politics": {
            "n": 90,
            "ece": 0.1429,
            "acc": 0.2556
          },
          "science_tech": {
            "n": 90,
            "ece": 0.1766,
            "acc": 0.3333
          },
          "society": {
            "n": 82,
            "ece": 0.1005,
            "acc": 0.4512
          }
        },
        "reliability": [
          {
            "mid": 0.05,
            "p": 0.0454,
            "o": 0.029,
            "n": 69
          },
          {
            "mid": 0.15,
            "p": 0.132,
            "o": 0.0833,
            "n": 84
          },
          {
            "mid": 0.25,
            "p": 0.2452,
            "o": 0.125,
            "n": 56
          },
          {
            "mid": 0.35,
            "p": 0.35,
            "o": 0.1,
            "n": 10
          },
          {
            "mid": 0.45,
            "p": 0.4167,
            "o": 0.1905,
            "n": 21
          },
          {
            "mid": 0.55,
            "p": 0.585,
            "o": 0.5,
            "n": 10
          },
          {
            "mid": 0.65,
            "p": 0.6708,
            "o": 0.52,
            "n": 25
          },
          {
            "mid": 0.75,
            "p": 0.7532,
            "o": 0.5263,
            "n": 19
          },
          {
            "mid": 0.85,
            "p": 0.8334,
            "o": 0.6557,
            "n": 61
          },
          {
            "mid": 0.95,
            "p": 0.9408,
            "o": 0.7471,
            "n": 87
          }
        ]
      },
      "behavioral": {
        "sycophancy": 0.0,
        "syc_conf_erosion": 0.83,
        "n_syc": 12,
        "n_syc_held": 12,
        "abstention": 1.0,
        "n_abs": 16,
        "abs_attempt_answerable": 1.0,
        "abs_abstain_unanswerable": 1.0,
        "hallucination": 0.0,
        "n_hal": 10,
        "hal_overreject_controls": 0.0
      }
    },
    {
      "rank": 5,
      "model": "opus-4-7",
      "display": "Claude Opus 4.7",
      "provider": "Anthropic",
      "flag": null,
      "trust": 92,
      "trust_ci": [
        89.1,
        93.0
      ],
      "comb_ece": 0.0819,
      "xdom_stability": 0.087,
      "usd_per_call": 0.118,
      "latency_s": 17,
      "cost_tier": "premium",
      "pareto_trust_cost": false,
      "knowledge": {
        "n": 395,
        "acc": 0.8962,
        "overconf": -0.1062,
        "ece": 0.1095,
        "ece_adaptive": 0.1062,
        "mce": 0.35,
        "brier": 0.0905,
        "brier_rel": 0.0137,
        "brier_res": 0.0164,
        "brier_unc": 0.093,
        "slope": 1.5748,
        "intercept": 0.4427,
        "auroc": 0.8227,
        "recal_ece": 0.027,
        "ece_reduction": 0.0825,
        "sharpness": 0.0204,
        "tail": {
          "n": 131,
          "thresh": 0.9,
          "conf": 0.9316,
          "acc": 0.9924,
          "gap": -0.0608,
          "ece": 0.0608
        },
        "aurc": 0.0309,
        "by_domain": {
          "econ_business": {
            "n": 75,
            "ece": 0.1156,
            "acc": 0.8267
          },
          "humanities_social": {
            "n": 80,
            "ece": 0.135,
            "acc": 0.9125
          },
          "law_politics": {
            "n": 80,
            "ece": 0.1731,
            "acc": 0.95
          },
          "medicine_health": {
            "n": 80,
            "ece": 0.1281,
            "acc": 0.9125
          },
          "stem": {
            "n": 80,
            "ece": 0.0447,
            "acc": 0.875
          }
        },
        "reliability": [
          {
            "mid": 0.25,
            "p": 0.3,
            "o": 0.0,
            "n": 1
          },
          {
            "mid": 0.35,
            "p": 0.35,
            "o": 0.0,
            "n": 1
          },
          {
            "mid": 0.45,
            "p": 0.4042,
            "o": 0.5,
            "n": 12
          },
          {
            "mid": 0.55,
            "p": 0.5659,
            "o": 0.7273,
            "n": 44
          },
          {
            "mid": 0.65,
            "p": 0.7,
            "o": 0.8281,
            "n": 64
          },
          {
            "mid": 0.75,
            "p": 0.7538,
            "o": 0.9231,
            "n": 39
          },
          {
            "mid": 0.85,
            "p": 0.8292,
            "o": 0.9417,
            "n": 103
          },
          {
            "mid": 0.95,
            "p": 0.9316,
            "o": 0.9924,
            "n": 131
          }
        ]
      },
      "forecasting": {
        "n": 450,
        "acc": 0.3489,
        "overconf": 0.0285,
        "ece": 0.0543,
        "ece_adaptive": 0.0492,
        "mce": 0.1771,
        "brier": 0.1299,
        "brier_rel": 0.0052,
        "brier_res": 0.1022,
        "brier_unc": 0.2272,
        "slope": 0.7896,
        "intercept": -0.2862,
        "auroc": 0.8784,
        "recal_ece": 0.0508,
        "ece_reduction": 0.0035,
        "sharpness": 0.1304,
        "tail": {
          "n": 71,
          "thresh": 0.9,
          "conf": 0.9325,
          "acc": 0.8873,
          "gap": 0.0452,
          "ece": 0.0452
        },
        "aurc": 0.3793,
        "by_domain": {
          "economics": {
            "n": 90,
            "ece": 0.0799,
            "acc": 0.3222
          },
          "health": {
            "n": 90,
            "ece": 0.0734,
            "acc": 0.3889
          },
          "politics": {
            "n": 90,
            "ece": 0.0918,
            "acc": 0.2556
          },
          "science_tech": {
            "n": 90,
            "ece": 0.1191,
            "acc": 0.3333
          },
          "society": {
            "n": 90,
            "ece": 0.1166,
            "acc": 0.4444
          }
        },
        "reliability": [
          {
            "mid": 0.05,
            "p": 0.0433,
            "o": 0.0723,
            "n": 166
          },
          {
            "mid": 0.15,
            "p": 0.1272,
            "o": 0.125,
            "n": 64
          },
          {
            "mid": 0.25,
            "p": 0.2316,
            "o": 0.1053,
            "n": 19
          },
          {
            "mid": 0.35,
            "p": 0.35,
            "o": 0.2667,
            "n": 15
          },
          {
            "mid": 0.45,
            "p": 0.4,
            "o": 0.45,
            "n": 20
          },
          {
            "mid": 0.55,
            "p": 0.5571,
            "o": 0.5,
            "n": 14
          },
          {
            "mid": 0.65,
            "p": 0.689,
            "o": 0.5238,
            "n": 21
          },
          {
            "mid": 0.75,
            "p": 0.7486,
            "o": 0.5714,
            "n": 21
          },
          {
            "mid": 0.85,
            "p": 0.8367,
            "o": 0.7436,
            "n": 39
          },
          {
            "mid": 0.95,
            "p": 0.9325,
            "o": 0.8873,
            "n": 71
          }
        ]
      },
      "behavioral": {
        "sycophancy": 0.0,
        "syc_conf_erosion": -2.0,
        "n_syc": 4,
        "n_syc_held": 4,
        "abstention": 1.0,
        "n_abs": 16,
        "abs_attempt_answerable": 1.0,
        "abs_abstain_unanswerable": 1.0,
        "hallucination": 0.0,
        "n_hal": 10,
        "hal_overreject_controls": 0.0
      }
    },
    {
      "rank": 6,
      "model": "opus",
      "display": "Claude Opus 4.8",
      "provider": "Anthropic",
      "flag": null,
      "trust": 92,
      "trust_ci": [
        89.1,
        92.9
      ],
      "comb_ece": 0.0836,
      "xdom_stability": 0.1086,
      "usd_per_call": 0.118,
      "latency_s": 17,
      "cost_tier": "premium",
      "pareto_trust_cost": false,
      "knowledge": {
        "n": 400,
        "acc": 0.9175,
        "overconf": -0.0997,
        "ece": 0.0997,
        "ece_adaptive": 0.0997,
        "mce": 0.2417,
        "brier": 0.0816,
        "brier_rel": 0.0132,
        "brier_res": 0.0067,
        "brier_unc": 0.0757,
        "slope": 1.1952,
        "intercept": 0.8146,
        "auroc": 0.7876,
        "recal_ece": 0.0328,
        "ece_reduction": 0.0669,
        "sharpness": 0.0191,
        "tail": {
          "n": 161,
          "thresh": 0.9,
          "conf": 0.94,
          "acc": 0.9814,
          "gap": -0.0414,
          "ece": 0.0414
        },
        "aurc": 0.0256,
        "by_domain": {
          "econ_business": {
            "n": 80,
            "ece": 0.0865,
            "acc": 0.85
          },
          "humanities_social": {
            "n": 80,
            "ece": 0.0774,
            "acc": 0.8875
          },
          "law_politics": {
            "n": 80,
            "ece": 0.1686,
            "acc": 0.9625
          },
          "medicine_health": {
            "n": 80,
            "ece": 0.1442,
            "acc": 0.9625
          },
          "stem": {
            "n": 80,
            "ece": 0.0629,
            "acc": 0.925
          }
        },
        "reliability": [
          {
            "mid": 0.25,
            "p": 0.3,
            "o": 0.5,
            "n": 2
          },
          {
            "mid": 0.45,
            "p": 0.425,
            "o": 0.6667,
            "n": 6
          },
          {
            "mid": 0.55,
            "p": 0.5577,
            "o": 0.7692,
            "n": 39
          },
          {
            "mid": 0.65,
            "p": 0.6965,
            "o": 0.8261,
            "n": 46
          },
          {
            "mid": 0.75,
            "p": 0.75,
            "o": 0.9032,
            "n": 31
          },
          {
            "mid": 0.85,
            "p": 0.8313,
            "o": 0.9391,
            "n": 115
          },
          {
            "mid": 0.95,
            "p": 0.94,
            "o": 0.9814,
            "n": 161
          }
        ]
      },
      "forecasting": {
        "n": 450,
        "acc": 0.3489,
        "overconf": 0.0676,
        "ece": 0.0676,
        "ece_adaptive": 0.0707,
        "mce": 0.3988,
        "brier": 0.131,
        "brier_rel": 0.0121,
        "brier_res": 0.1077,
        "brier_unc": 0.2272,
        "slope": 0.84,
        "intercept": -0.5878,
        "auroc": 0.8943,
        "recal_ece": 0.0447,
        "ece_reduction": 0.0229,
        "sharpness": 0.1387,
        "tail": {
          "n": 91,
          "thresh": 0.9,
          "conf": 0.9342,
          "acc": 0.8681,
          "gap": 0.066,
          "ece": 0.066
        },
        "aurc": 0.3576,
        "by_domain": {
          "economics": {
            "n": 90,
            "ece": 0.1489,
            "acc": 0.3222
          },
          "health": {
            "n": 90,
            "ece": 0.109,
            "acc": 0.3889
          },
          "politics": {
            "n": 90,
            "ece": 0.0374,
            "acc": 0.2556
          },
          "science_tech": {
            "n": 90,
            "ece": 0.1163,
            "acc": 0.3333
          },
          "society": {
            "n": 90,
            "ece": 0.0903,
            "acc": 0.4444
          }
        },
        "reliability": [
          {
            "mid": 0.05,
            "p": 0.05,
            "o": 0.0473,
            "n": 148
          },
          {
            "mid": 0.15,
            "p": 0.1256,
            "o": 0.1111,
            "n": 63
          },
          {
            "mid": 0.25,
            "p": 0.2329,
            "o": 0.1786,
            "n": 28
          },
          {
            "mid": 0.35,
            "p": 0.35,
            "o": 0.1818,
            "n": 11
          },
          {
            "mid": 0.45,
            "p": 0.41,
            "o": 0.2667,
            "n": 15
          },
          {
            "mid": 0.55,
            "p": 0.5367,
            "o": 0.4667,
            "n": 15
          },
          {
            "mid": 0.65,
            "p": 0.6908,
            "o": 0.5833,
            "n": 12
          },
          {
            "mid": 0.75,
            "p": 0.7518,
            "o": 0.3529,
            "n": 17
          },
          {
            "mid": 0.85,
            "p": 0.8292,
            "o": 0.66,
            "n": 50
          },
          {
            "mid": 0.95,
            "p": 0.9342,
            "o": 0.8681,
            "n": 91
          }
        ]
      },
      "behavioral": {
        "sycophancy": 0.0,
        "syc_conf_erosion": 0.75,
        "n_syc": 4,
        "n_syc_held": 4,
        "abstention": 1.0,
        "n_abs": 16,
        "abs_attempt_answerable": 1.0,
        "abs_abstain_unanswerable": 1.0,
        "hallucination": 0.0,
        "n_hal": 10,
        "hal_overreject_controls": 0.0
      }
    },
    {
      "rank": 7,
      "model": "gemini-pro",
      "display": "Gemini 2.5 Pro",
      "provider": "Google",
      "flag": null,
      "trust": 91,
      "trust_ci": [
        88.5,
        92.5
      ],
      "comb_ece": 0.0928,
      "xdom_stability": 0.0727,
      "usd_per_call": 0.02,
      "latency_s": 60,
      "cost_tier": "mid",
      "pareto_trust_cost": false,
      "knowledge": {
        "n": 212,
        "acc": 0.9623,
        "overconf": -0.042,
        "ece": 0.0443,
        "ece_adaptive": 0.0652,
        "mce": 0.75,
        "brier": 0.0341,
        "brier_rel": 0.0083,
        "brier_res": 0.0102,
        "brier_unc": 0.0363,
        "slope": 0.8518,
        "intercept": 1.2381,
        "auroc": 0.7166,
        "recal_ece": 0.0166,
        "ece_reduction": 0.0278,
        "sharpness": 0.0135,
        "tail": {
          "n": 182,
          "thresh": 0.9,
          "conf": 0.9536,
          "acc": 0.9725,
          "gap": -0.019,
          "ece": 0.019
        },
        "aurc": 0.0147,
        "by_domain": {
          "econ_business": {
            "n": 50,
            "ece": 0.0398,
            "acc": 0.92
          },
          "humanities_social": {
            "n": 42,
            "ece": 0.0771,
            "acc": 0.9762
          },
          "law_politics": {
            "n": 50,
            "ece": 0.0312,
            "acc": 0.96
          },
          "medicine_health": {
            "n": 35,
            "ece": 0.0743,
            "acc": 0.9714
          },
          "stem": {
            "n": 35,
            "ece": 0.0426,
            "acc": 1.0
          }
        },
        "reliability": [
          {
            "mid": 0.05,
            "p": 0.05,
            "o": 0.0,
            "n": 1
          },
          {
            "mid": 0.25,
            "p": 0.25,
            "o": 1.0,
            "n": 1
          },
          {
            "mid": 0.45,
            "p": 0.4,
            "o": 0.3333,
            "n": 3
          },
          {
            "mid": 0.55,
            "p": 0.6,
            "o": 1.0,
            "n": 1
          },
          {
            "mid": 0.65,
            "p": 0.6667,
            "o": 1.0,
            "n": 3
          },
          {
            "mid": 0.75,
            "p": 0.75,
            "o": 1.0,
            "n": 2
          },
          {
            "mid": 0.85,
            "p": 0.8395,
            "o": 1.0,
            "n": 19
          },
          {
            "mid": 0.95,
            "p": 0.9536,
            "o": 0.9725,
            "n": 182
          }
        ]
      },
      "forecasting": {
        "n": 409,
        "acc": 0.3423,
        "overconf": 0.1413,
        "ece": 0.1413,
        "ece_adaptive": 0.1413,
        "mce": 0.2794,
        "brier": 0.1612,
        "brier_rel": 0.0248,
        "brier_res": 0.0902,
        "brier_unc": 0.2251,
        "slope": 0.9862,
        "intercept": -0.9617,
        "auroc": 0.8647,
        "recal_ece": 0.0662,
        "ece_reduction": 0.0751,
        "sharpness": 0.1009,
        "tail": {
          "n": 46,
          "thresh": 0.9,
          "conf": 0.9415,
          "acc": 0.8261,
          "gap": 0.1154,
          "ece": 0.1154
        },
        "aurc": 0.4148,
        "by_domain": {
          "economics": {
            "n": 86,
            "ece": 0.1673,
            "acc": 0.314
          },
          "health": {
            "n": 78,
            "ece": 0.1532,
            "acc": 0.3846
          },
          "politics": {
            "n": 81,
            "ece": 0.2095,
            "acc": 0.2469
          },
          "science_tech": {
            "n": 84,
            "ece": 0.2081,
            "acc": 0.3333
          },
          "society": {
            "n": 80,
            "ece": 0.11,
            "acc": 0.4375
          }
        },
        "reliability": [
          {
            "mid": 0.05,
            "p": 0.0459,
            "o": 0.0,
            "n": 44
          },
          {
            "mid": 0.15,
            "p": 0.1334,
            "o": 0.0323,
            "n": 62
          },
          {
            "mid": 0.25,
            "p": 0.2453,
            "o": 0.1228,
            "n": 57
          },
          {
            "mid": 0.35,
            "p": 0.3487,
            "o": 0.0833,
            "n": 24
          },
          {
            "mid": 0.45,
            "p": 0.4128,
            "o": 0.1724,
            "n": 29
          },
          {
            "mid": 0.55,
            "p": 0.5794,
            "o": 0.4706,
            "n": 17
          },
          {
            "mid": 0.65,
            "p": 0.6738,
            "o": 0.5897,
            "n": 39
          },
          {
            "mid": 0.75,
            "p": 0.7481,
            "o": 0.4688,
            "n": 32
          },
          {
            "mid": 0.85,
            "p": 0.8441,
            "o": 0.678,
            "n": 59
          },
          {
            "mid": 0.95,
            "p": 0.9415,
            "o": 0.8261,
            "n": 46
          }
        ]
      },
      "behavioral": {
        "sycophancy": 0.0,
        "syc_conf_erosion": -0.83,
        "n_syc": 12,
        "n_syc_held": 12,
        "abstention": 1.0,
        "n_abs": 16,
        "abs_attempt_answerable": 1.0,
        "abs_abstain_unanswerable": 1.0,
        "hallucination": 0.0,
        "n_hal": 10,
        "hal_overreject_controls": 0.0
      }
    },
    {
      "rank": 8,
      "model": "sonnet-4-5",
      "display": "Claude Sonnet 4.5",
      "provider": "Anthropic",
      "flag": null,
      "trust": 91,
      "trust_ci": [
        88.2,
        91.8
      ],
      "comb_ece": 0.0932,
      "xdom_stability": 0.041,
      "usd_per_call": 0.071,
      "latency_s": 17,
      "cost_tier": "premium",
      "pareto_trust_cost": false,
      "knowledge": {
        "n": 400,
        "acc": 0.9225,
        "overconf": -0.1176,
        "ece": 0.1176,
        "ece_adaptive": 0.1176,
        "mce": 0.425,
        "brier": 0.079,
        "brier_rel": 0.0182,
        "brier_res": 0.0113,
        "brier_unc": 0.0715,
        "slope": 1.3782,
        "intercept": 0.8299,
        "auroc": 0.8239,
        "recal_ece": 0.014,
        "ece_reduction": 0.1036,
        "sharpness": 0.0191,
        "tail": {
          "n": 134,
          "thresh": 0.9,
          "conf": 0.9376,
          "acc": 0.9925,
          "gap": -0.0549,
          "ece": 0.0549
        },
        "aurc": 0.0199,
        "by_domain": {
          "econ_business": {
            "n": 80,
            "ece": 0.1205,
            "acc": 0.8875
          },
          "humanities_social": {
            "n": 80,
            "ece": 0.1323,
            "acc": 0.925
          },
          "law_politics": {
            "n": 80,
            "ece": 0.1357,
            "acc": 0.925
          },
          "medicine_health": {
            "n": 80,
            "ece": 0.1455,
            "acc": 0.9625
          },
          "stem": {
            "n": 80,
            "ece": 0.1302,
            "acc": 0.9125
          }
        },
        "reliability": [
          {
            "mid": 0.25,
            "p": 0.23,
            "o": 0.4,
            "n": 5
          },
          {
            "mid": 0.35,
            "p": 0.35,
            "o": 0.5,
            "n": 2
          },
          {
            "mid": 0.45,
            "p": 0.45,
            "o": 0.875,
            "n": 8
          },
          {
            "mid": 0.55,
            "p": 0.5625,
            "o": 0.5833,
            "n": 12
          },
          {
            "mid": 0.65,
            "p": 0.6812,
            "o": 0.8615,
            "n": 65
          },
          {
            "mid": 0.75,
            "p": 0.749,
            "o": 0.8644,
            "n": 59
          },
          {
            "mid": 0.85,
            "p": 0.8317,
            "o": 0.9739,
            "n": 115
          },
          {
            "mid": 0.95,
            "p": 0.9376,
            "o": 0.9925,
            "n": 134
          }
        ]
      },
      "forecasting": {
        "n": 446,
        "acc": 0.3498,
        "overconf": 0.0579,
        "ece": 0.0688,
        "ece_adaptive": 0.0662,
        "mce": 0.1706,
        "brier": 0.146,
        "brier_rel": 0.0067,
        "brier_res": 0.0884,
        "brier_unc": 0.2274,
        "slope": 0.9173,
        "intercept": -0.4177,
        "auroc": 0.8584,
        "recal_ece": 0.0354,
        "ece_reduction": 0.0334,
        "sharpness": 0.0992,
        "tail": {
          "n": 42,
          "thresh": 0.9,
          "conf": 0.9367,
          "acc": 0.8571,
          "gap": 0.0795,
          "ece": 0.0795
        },
        "aurc": 0.4042,
        "by_domain": {
          "economics": {
            "n": 90,
            "ece": 0.1351,
            "acc": 0.3222
          },
          "health": {
            "n": 86,
            "ece": 0.0872,
            "acc": 0.3953
          },
          "politics": {
            "n": 90,
            "ece": 0.0812,
            "acc": 0.2556
          },
          "science_tech": {
            "n": 90,
            "ece": 0.09,
            "acc": 0.3333
          },
          "society": {
            "n": 90,
            "ece": 0.1382,
            "acc": 0.4444
          }
        },
        "reliability": [
          {
            "mid": 0.05,
            "p": 0.0585,
            "o": 0.0208,
            "n": 96
          },
          {
            "mid": 0.15,
            "p": 0.1404,
            "o": 0.1304,
            "n": 69
          },
          {
            "mid": 0.25,
            "p": 0.2467,
            "o": 0.1852,
            "n": 54
          },
          {
            "mid": 0.35,
            "p": 0.35,
            "o": 0.2308,
            "n": 39
          },
          {
            "mid": 0.45,
            "p": 0.45,
            "o": 0.36,
            "n": 25
          },
          {
            "mid": 0.55,
            "p": 0.5477,
            "o": 0.6154,
            "n": 13
          },
          {
            "mid": 0.65,
            "p": 0.6603,
            "o": 0.7059,
            "n": 34
          },
          {
            "mid": 0.75,
            "p": 0.7464,
            "o": 0.5758,
            "n": 33
          },
          {
            "mid": 0.85,
            "p": 0.8473,
            "o": 0.7317,
            "n": 41
          },
          {
            "mid": 0.95,
            "p": 0.9367,
            "o": 0.8571,
            "n": 42
          }
        ]
      },
      "behavioral": {
        "sycophancy": 0.0,
        "syc_conf_erosion": -1.0,
        "n_syc": 12,
        "n_syc_held": 12,
        "abstention": 1.0,
        "n_abs": 16,
        "abs_attempt_answerable": 1.0,
        "abs_abstain_unanswerable": 1.0,
        "hallucination": 0.0,
        "n_hal": 10,
        "hal_overreject_controls": 0.0
      }
    },
    {
      "rank": 9,
      "model": "openai",
      "display": "GPT-4o",
      "provider": "OpenAI",
      "flag": null,
      "trust": 90,
      "trust_ci": [
        87.2,
        91.3
      ],
      "comb_ece": 0.1025,
      "xdom_stability": 0.0625,
      "usd_per_call": 0.03,
      "latency_s": 20,
      "cost_tier": "mid",
      "pareto_trust_cost": false,
      "knowledge": {
        "n": 380,
        "acc": 0.8789,
        "overconf": -0.0739,
        "ece": 0.0768,
        "ece_adaptive": 0.0739,
        "mce": 0.1292,
        "brier": 0.1015,
        "brier_rel": 0.0069,
        "brier_res": 0.0118,
        "brier_unc": 0.1064,
        "slope": 1.526,
        "intercept": -0.0071,
        "auroc": 0.7493,
        "recal_ece": 0.0281,
        "ece_reduction": 0.0487,
        "sharpness": 0.0106,
        "tail": {
          "n": 105,
          "thresh": 0.9,
          "conf": 0.9219,
          "acc": 0.981,
          "gap": -0.059,
          "ece": 0.059
        },
        "aurc": 0.0526,
        "by_domain": {
          "econ_business": {
            "n": 80,
            "ece": 0.085,
            "acc": 0.8375
          },
          "humanities_social": {
            "n": 80,
            "ece": 0.0969,
            "acc": 0.9
          },
          "law_politics": {
            "n": 80,
            "ece": 0.1269,
            "acc": 0.9375
          },
          "medicine_health": {
            "n": 80,
            "ece": 0.1027,
            "acc": 0.9125
          },
          "stem": {
            "n": 60,
            "ece": 0.1112,
            "acc": 0.7833
          }
        },
        "reliability": [
          {
            "mid": 0.55,
            "p": 0.582,
            "o": 0.56,
            "n": 25
          },
          {
            "mid": 0.65,
            "p": 0.6862,
            "o": 0.8154,
            "n": 65
          },
          {
            "mid": 0.75,
            "p": 0.75,
            "o": 0.7805,
            "n": 41
          },
          {
            "mid": 0.85,
            "p": 0.8278,
            "o": 0.9167,
            "n": 144
          },
          {
            "mid": 0.95,
            "p": 0.9219,
            "o": 0.981,
            "n": 105
          }
        ]
      },
      "forecasting": {
        "n": 442,
        "acc": 0.3484,
        "overconf": 0.1271,
        "ece": 0.1282,
        "ece_adaptive": 0.1357,
        "mce": 0.3,
        "brier": 0.1772,
        "brier_rel": 0.024,
        "brier_res": 0.0757,
        "brier_unc": 0.227,
        "slope": 0.7159,
        "intercept": -0.7764,
        "auroc": 0.8231,
        "recal_ece": 0.0476,
        "ece_reduction": 0.0807,
        "sharpness": 0.1096,
        "tail": {
          "n": 79,
          "thresh": 0.9,
          "conf": 0.9316,
          "acc": 0.6835,
          "gap": 0.2481,
          "ece": 0.2481
        },
        "aurc": 0.4457,
        "by_domain": {
          "economics": {
            "n": 90,
            "ece": 0.1806,
            "acc": 0.3222
          },
          "health": {
            "n": 90,
            "ece": 0.1154,
            "acc": 0.3889
          },
          "politics": {
            "n": 90,
            "ece": 0.1631,
            "acc": 0.2556
          },
          "science_tech": {
            "n": 90,
            "ece": 0.1986,
            "acc": 0.3333
          },
          "society": {
            "n": 82,
            "ece": 0.1567,
            "acc": 0.4512
          }
        },
        "reliability": [
          {
            "mid": 0.05,
            "p": 0.0476,
            "o": 0.04,
            "n": 50
          },
          {
            "mid": 0.15,
            "p": 0.1122,
            "o": 0.0811,
            "n": 74
          },
          {
            "mid": 0.25,
            "p": 0.2461,
            "o": 0.1184,
            "n": 76
          },
          {
            "mid": 0.45,
            "p": 0.4,
            "o": 0.1,
            "n": 20
          },
          {
            "mid": 0.55,
            "p": 0.5767,
            "o": 0.4167,
            "n": 60
          },
          {
            "mid": 0.65,
            "p": 0.6967,
            "o": 0.6333,
            "n": 30
          },
          {
            "mid": 0.75,
            "p": 0.75,
            "o": 1.0,
            "n": 1
          },
          {
            "mid": 0.85,
            "p": 0.826,
            "o": 0.6923,
            "n": 52
          },
          {
            "mid": 0.95,
            "p": 0.9316,
            "o": 0.6835,
            "n": 79
          }
        ]
      },
      "behavioral": {
        "sycophancy": 0.0,
        "syc_conf_erosion": 1.25,
        "n_syc": 12,
        "n_syc_held": 12,
        "abstention": 1.0,
        "n_abs": 16,
        "abs_attempt_answerable": 1.0,
        "abs_abstain_unanswerable": 1.0,
        "hallucination": 0.2,
        "n_hal": 10,
        "hal_overreject_controls": 0.0
      }
    },
    {
      "rank": 10,
      "model": "groq-qwen",
      "display": "Qwen3 32B",
      "provider": "Alibaba / Groq",
      "flag": null,
      "trust": 89,
      "trust_ci": [
        86.4,
        91.5
      ],
      "comb_ece": 0.1052,
      "xdom_stability": 0.0975,
      "usd_per_call": 0.0,
      "latency_s": 25,
      "cost_tier": "free",
      "pareto_trust_cost": true,
      "knowledge": {
        "n": 170,
        "acc": 0.8765,
        "overconf": -0.0639,
        "ece": 0.0934,
        "ece_adaptive": 0.0741,
        "mce": 0.45,
        "brier": 0.0962,
        "brier_rel": 0.0118,
        "brier_res": 0.0252,
        "brier_unc": 0.1083,
        "slope": 1.5749,
        "intercept": -0.0914,
        "auroc": 0.8143,
        "recal_ece": 0.0332,
        "ece_reduction": 0.0601,
        "sharpness": 0.0146,
        "tail": {
          "n": 67,
          "thresh": 0.9,
          "conf": 0.9243,
          "acc": 0.9851,
          "gap": -0.0607,
          "ece": 0.0607
        },
        "aurc": 0.0369,
        "by_domain": {
          "econ_business": {
            "n": 35,
            "ece": 0.0986,
            "acc": 0.8571
          },
          "humanities_social": {
            "n": 55,
            "ece": 0.14,
            "acc": 0.8182
          },
          "law_politics": {
            "n": 30,
            "ece": 0.1883,
            "acc": 0.9333
          },
          "medicine_health": {
            "n": 35,
            "ece": 0.0734,
            "acc": 0.8857
          },
          "stem": {
            "n": 15,
            "ece": 0.1867,
            "acc": 1.0
          }
        },
        "reliability": [
          {
            "mid": 0.45,
            "p": 0.45,
            "o": 0.0,
            "n": 1
          },
          {
            "mid": 0.55,
            "p": 0.5722,
            "o": 0.6667,
            "n": 18
          },
          {
            "mid": 0.65,
            "p": 0.6978,
            "o": 0.6087,
            "n": 23
          },
          {
            "mid": 0.75,
            "p": 0.75,
            "o": 1.0,
            "n": 13
          },
          {
            "mid": 0.85,
            "p": 0.826,
            "o": 0.9167,
            "n": 48
          },
          {
            "mid": 0.95,
            "p": 0.9243,
            "o": 0.9851,
            "n": 67
          }
        ]
      },
      "forecasting": {
        "n": 434,
        "acc": 0.3456,
        "overconf": 0.0991,
        "ece": 0.1171,
        "ece_adaptive": 0.1138,
        "mce": 0.35,
        "brier": 0.1989,
        "brier_rel": 0.0238,
        "brier_res": 0.0535,
        "brier_unc": 0.2262,
        "slope": 0.6614,
        "intercept": -0.5934,
        "auroc": 0.7665,
        "recal_ece": 0.0451,
        "ece_reduction": 0.0719,
        "sharpness": 0.0926,
        "tail": {
          "n": 43,
          "thresh": 0.9,
          "conf": 0.9202,
          "acc": 0.8372,
          "gap": 0.083,
          "ece": 0.083
        },
        "aurc": 0.4582,
        "by_domain": {
          "economics": {
            "n": 90,
            "ece": 0.1829,
            "acc": 0.3222
          },
          "health": {
            "n": 86,
            "ece": 0.1323,
            "acc": 0.3837
          },
          "politics": {
            "n": 90,
            "ece": 0.1319,
            "acc": 0.2556
          },
          "science_tech": {
            "n": 90,
            "ece": 0.1541,
            "acc": 0.3333
          },
          "society": {
            "n": 78,
            "ece": 0.1028,
            "acc": 0.4487
          }
        },
        "reliability": [
          {
            "mid": 0.05,
            "p": 0.0459,
            "o": 0.0,
            "n": 22
          },
          {
            "mid": 0.15,
            "p": 0.1285,
            "o": 0.1667,
            "n": 102
          },
          {
            "mid": 0.25,
            "p": 0.2376,
            "o": 0.1828,
            "n": 93
          },
          {
            "mid": 0.35,
            "p": 0.35,
            "o": 0.0,
            "n": 3
          },
          {
            "mid": 0.45,
            "p": 0.4,
            "o": 0.2353,
            "n": 17
          },
          {
            "mid": 0.55,
            "p": 0.5611,
            "o": 0.5,
            "n": 36
          },
          {
            "mid": 0.65,
            "p": 0.6976,
            "o": 0.4048,
            "n": 42
          },
          {
            "mid": 0.75,
            "p": 0.7386,
            "o": 0.5714,
            "n": 28
          },
          {
            "mid": 0.85,
            "p": 0.8167,
            "o": 0.5208,
            "n": 48
          },
          {
            "mid": 0.95,
            "p": 0.9202,
            "o": 0.8372,
            "n": 43
          }
        ]
      },
      "behavioral": {
        "sycophancy": 0.0,
        "syc_conf_erosion": -4.33,
        "n_syc": 12,
        "n_syc_held": 12,
        "abstention": 1.0,
        "n_abs": 16,
        "abs_attempt_answerable": 1.0,
        "abs_abstain_unanswerable": 1.0,
        "hallucination": 0.1,
        "n_hal": 10,
        "hal_overreject_controls": 0.0
      }
    },
    {
      "rank": 11,
      "model": "opus-4-5",
      "display": "Claude Opus 4.5",
      "provider": "Anthropic",
      "flag": null,
      "trust": 89,
      "trust_ci": [
        87.4,
        90.9
      ],
      "comb_ece": 0.1053,
      "xdom_stability": 0.0724,
      "usd_per_call": 0.118,
      "latency_s": 17,
      "cost_tier": "premium",
      "pareto_trust_cost": false,
      "knowledge": {
        "n": 400,
        "acc": 0.95,
        "overconf": -0.1236,
        "ece": 0.1281,
        "ece_adaptive": 0.1236,
        "mce": 0.35,
        "brier": 0.0569,
        "brier_rel": 0.0202,
        "brier_res": 0.0108,
        "brier_unc": 0.0475,
        "slope": 2.1633,
        "intercept": 0.3635,
        "auroc": 0.8662,
        "recal_ece": 0.0182,
        "ece_reduction": 0.1099,
        "sharpness": 0.0137,
        "tail": {
          "n": 141,
          "thresh": 0.9,
          "conf": 0.9357,
          "acc": 1.0,
          "gap": -0.0643,
          "ece": 0.0643
        },
        "aurc": 0.0087,
        "by_domain": {
          "econ_business": {
            "n": 80,
            "ece": 0.169,
            "acc": 0.9375
          },
          "humanities_social": {
            "n": 80,
            "ece": 0.1062,
            "acc": 0.925
          },
          "law_politics": {
            "n": 80,
            "ece": 0.1466,
            "acc": 0.9625
          },
          "medicine_health": {
            "n": 80,
            "ece": 0.1595,
            "acc": 0.975
          },
          "stem": {
            "n": 80,
            "ece": 0.1034,
            "acc": 0.95
          }
        },
        "reliability": [
          {
            "mid": 0.25,
            "p": 0.275,
            "o": 0.0,
            "n": 2
          },
          {
            "mid": 0.35,
            "p": 0.35,
            "o": 0.0,
            "n": 1
          },
          {
            "mid": 0.45,
            "p": 0.45,
            "o": 0.5,
            "n": 4
          },
          {
            "mid": 0.55,
            "p": 0.5636,
            "o": 0.9091,
            "n": 11
          },
          {
            "mid": 0.65,
            "p": 0.6756,
            "o": 0.875,
            "n": 32
          },
          {
            "mid": 0.75,
            "p": 0.7469,
            "o": 0.9149,
            "n": 94
          },
          {
            "mid": 0.85,
            "p": 0.8512,
            "o": 0.9826,
            "n": 115
          },
          {
            "mid": 0.95,
            "p": 0.9357,
            "o": 1.0,
            "n": 141
          }
        ]
      },
      "forecasting": {
        "n": 450,
        "acc": 0.3489,
        "overconf": 0.0824,
        "ece": 0.0824,
        "ece_adaptive": 0.0824,
        "mce": 0.2857,
        "brier": 0.1252,
        "brier_rel": 0.0095,
        "brier_res": 0.1098,
        "brier_unc": 0.2272,
        "slope": 1.0411,
        "intercept": -0.7006,
        "auroc": 0.9073,
        "recal_ece": 0.0333,
        "ece_reduction": 0.0491,
        "sharpness": 0.1221,
        "tail": {
          "n": 63,
          "thresh": 0.9,
          "conf": 0.9394,
          "acc": 0.8889,
          "gap": 0.0505,
          "ece": 0.0505
        },
        "aurc": 0.355,
        "by_domain": {
          "economics": {
            "n": 90,
            "ece": 0.1382,
            "acc": 0.3222
          },
          "health": {
            "n": 90,
            "ece": 0.1347,
            "acc": 0.3889
          },
          "politics": {
            "n": 90,
            "ece": 0.0598,
            "acc": 0.2556
          },
          "science_tech": {
            "n": 90,
            "ece": 0.1173,
            "acc": 0.3333
          },
          "society": {
            "n": 90,
            "ece": 0.0591,
            "acc": 0.4444
          }
        },
        "reliability": [
          {
            "mid": 0.05,
            "p": 0.0519,
            "o": 0.0089,
            "n": 112
          },
          {
            "mid": 0.15,
            "p": 0.1377,
            "o": 0.0822,
            "n": 73
          },
          {
            "mid": 0.25,
            "p": 0.2493,
            "o": 0.1951,
            "n": 41
          },
          {
            "mid": 0.35,
            "p": 0.35,
            "o": 0.1538,
            "n": 26
          },
          {
            "mid": 0.45,
            "p": 0.45,
            "o": 0.4,
            "n": 10
          },
          {
            "mid": 0.55,
            "p": 0.5714,
            "o": 0.2857,
            "n": 7
          },
          {
            "mid": 0.65,
            "p": 0.6633,
            "o": 0.5238,
            "n": 21
          },
          {
            "mid": 0.75,
            "p": 0.7446,
            "o": 0.62,
            "n": 50
          },
          {
            "mid": 0.85,
            "p": 0.8528,
            "o": 0.7234,
            "n": 47
          },
          {
            "mid": 0.95,
            "p": 0.9394,
            "o": 0.8889,
            "n": 63
          }
        ]
      },
      "behavioral": {
        "sycophancy": 0.0,
        "syc_conf_erosion": -1.0,
        "n_syc": 4,
        "n_syc_held": 4,
        "abstention": 1.0,
        "n_abs": 16,
        "abs_attempt_answerable": 1.0,
        "abs_abstain_unanswerable": 1.0,
        "hallucination": 0.0,
        "n_hal": 10,
        "hal_overreject_controls": 0.0
      }
    },
    {
      "rank": 12,
      "model": "sonnet",
      "display": "Claude Sonnet 4.6",
      "provider": "Anthropic",
      "flag": null,
      "trust": 89,
      "trust_ci": [
        87.0,
        90.6
      ],
      "comb_ece": 0.1066,
      "xdom_stability": 0.0672,
      "usd_per_call": 0.071,
      "latency_s": 17,
      "cost_tier": "premium",
      "pareto_trust_cost": false,
      "knowledge": {
        "n": 395,
        "acc": 0.9291,
        "overconf": -0.1485,
        "ece": 0.1503,
        "ece_adaptive": 0.1485,
        "mce": 0.35,
        "brier": 0.083,
        "brier_rel": 0.0298,
        "brier_res": 0.0128,
        "brier_unc": 0.0659,
        "slope": 1.5757,
        "intercept": 1.1579,
        "auroc": 0.852,
        "recal_ece": 0.0166,
        "ece_reduction": 0.1336,
        "sharpness": 0.0271,
        "tail": {
          "n": 139,
          "thresh": 0.9,
          "conf": 0.9337,
          "acc": 0.9928,
          "gap": -0.0591,
          "ece": 0.0591
        },
        "aurc": 0.016,
        "by_domain": {
          "econ_business": {
            "n": 80,
            "ece": 0.1422,
            "acc": 0.875
          },
          "humanities_social": {
            "n": 80,
            "ece": 0.1709,
            "acc": 0.9375
          },
          "law_politics": {
            "n": 80,
            "ece": 0.1675,
            "acc": 0.925
          },
          "medicine_health": {
            "n": 80,
            "ece": 0.1738,
            "acc": 0.9375
          },
          "stem": {
            "n": 75,
            "ece": 0.1133,
            "acc": 0.9733
          }
        },
        "reliability": [
          {
            "mid": 0.25,
            "p": 0.2375,
            "o": 0.25,
            "n": 4
          },
          {
            "mid": 0.35,
            "p": 0.35,
            "o": 0.0,
            "n": 1
          },
          {
            "mid": 0.45,
            "p": 0.44,
            "o": 0.72,
            "n": 25
          },
          {
            "mid": 0.55,
            "p": 0.5531,
            "o": 0.8571,
            "n": 35
          },
          {
            "mid": 0.65,
            "p": 0.6545,
            "o": 0.85,
            "n": 40
          },
          {
            "mid": 0.75,
            "p": 0.7347,
            "o": 0.9492,
            "n": 59
          },
          {
            "mid": 0.85,
            "p": 0.8411,
            "o": 0.9783,
            "n": 92
          },
          {
            "mid": 0.95,
            "p": 0.9337,
            "o": 0.9928,
            "n": 139
          }
        ]
      },
      "forecasting": {
        "n": 450,
        "acc": 0.3489,
        "overconf": 0.0141,
        "ece": 0.063,
        "ece_adaptive": 0.0439,
        "mce": 0.1545,
        "brier": 0.1167,
        "brier_rel": 0.0061,
        "brier_res": 0.1158,
        "brier_unc": 0.2272,
        "slope": 1.0875,
        "intercept": -0.0881,
        "auroc": 0.9042,
        "recal_ece": 0.0495,
        "ece_reduction": 0.0134,
        "sharpness": 0.1011,
        "tail": {
          "n": 29,
          "thresh": 0.9,
          "conf": 0.9476,
          "acc": 0.7931,
          "gap": 0.1545,
          "ece": 0.1545
        },
        "aurc": 0.3767,
        "by_domain": {
          "economics": {
            "n": 90,
            "ece": 0.1271,
            "acc": 0.3222
          },
          "health": {
            "n": 90,
            "ece": 0.0531,
            "acc": 0.3889
          },
          "politics": {
            "n": 90,
            "ece": 0.0974,
            "acc": 0.2556
          },
          "science_tech": {
            "n": 90,
            "ece": 0.0953,
            "acc": 0.3333
          },
          "society": {
            "n": 90,
            "ece": 0.0974,
            "acc": 0.4444
          }
        },
        "reliability": [
          {
            "mid": 0.05,
            "p": 0.0517,
            "o": 0.0267,
            "n": 150
          },
          {
            "mid": 0.15,
            "p": 0.1375,
            "o": 0.0702,
            "n": 57
          },
          {
            "mid": 0.25,
            "p": 0.2488,
            "o": 0.2,
            "n": 40
          },
          {
            "mid": 0.35,
            "p": 0.3515,
            "o": 0.25,
            "n": 20
          },
          {
            "mid": 0.45,
            "p": 0.4289,
            "o": 0.5556,
            "n": 18
          },
          {
            "mid": 0.55,
            "p": 0.555,
            "o": 0.6944,
            "n": 36
          },
          {
            "mid": 0.65,
            "p": 0.636,
            "o": 0.6,
            "n": 15
          },
          {
            "mid": 0.75,
            "p": 0.7294,
            "o": 0.7143,
            "n": 49
          },
          {
            "mid": 0.85,
            "p": 0.8417,
            "o": 0.9444,
            "n": 36
          },
          {
            "mid": 0.95,
            "p": 0.9476,
            "o": 0.7931,
            "n": 29
          }
        ]
      },
      "behavioral": {
        "sycophancy": 0.0,
        "syc_conf_erosion": -0.75,
        "n_syc": 12,
        "n_syc_held": 12,
        "abstention": 1.0,
        "n_abs": 16,
        "abs_attempt_answerable": 1.0,
        "abs_abstain_unanswerable": 1.0,
        "hallucination": 0.0,
        "n_hal": 10,
        "hal_overreject_controls": 0.0
      }
    },
    {
      "rank": 13,
      "model": "mistral",
      "display": "Mistral Large",
      "provider": "Mistral",
      "flag": null,
      "trust": 89,
      "trust_ci": [
        86.3,
        91.2
      ],
      "comb_ece": 0.1081,
      "xdom_stability": 0.1069,
      "usd_per_call": 0.025,
      "latency_s": 22,
      "cost_tier": "mid",
      "pareto_trust_cost": false,
      "knowledge": {
        "n": 210,
        "acc": 0.9048,
        "overconf": -0.0777,
        "ece": 0.0796,
        "ece_adaptive": 0.0846,
        "mce": 0.15,
        "brier": 0.0828,
        "brier_rel": 0.0073,
        "brier_res": 0.0085,
        "brier_unc": 0.0862,
        "slope": 1.7509,
        "intercept": -0.181,
        "auroc": 0.7897,
        "recal_ece": 0.0587,
        "ece_reduction": 0.021,
        "sharpness": 0.0102,
        "tail": {
          "n": 72,
          "thresh": 0.9,
          "conf": 0.9322,
          "acc": 0.9861,
          "gap": -0.0539,
          "ece": 0.0539
        },
        "aurc": 0.0359,
        "by_domain": {
          "econ_business": {
            "n": 25,
            "ece": 0.1408,
            "acc": 0.96
          },
          "humanities_social": {
            "n": 65,
            "ece": 0.0735,
            "acc": 0.8615
          },
          "law_politics": {
            "n": 50,
            "ece": 0.087,
            "acc": 0.92
          },
          "medicine_health": {
            "n": 25,
            "ece": 0.1636,
            "acc": 0.92
          },
          "stem": {
            "n": 45,
            "ece": 0.0796,
            "acc": 0.9111
          }
        },
        "reliability": [
          {
            "mid": 0.55,
            "p": 0.5778,
            "o": 0.5556,
            "n": 9
          },
          {
            "mid": 0.65,
            "p": 0.69,
            "o": 0.84,
            "n": 25
          },
          {
            "mid": 0.75,
            "p": 0.7555,
            "o": 0.8485,
            "n": 33
          },
          {
            "mid": 0.85,
            "p": 0.8335,
            "o": 0.9155,
            "n": 71
          },
          {
            "mid": 0.95,
            "p": 0.9322,
            "o": 0.9861,
            "n": 72
          }
        ]
      },
      "forecasting": {
        "n": 434,
        "acc": 0.3456,
        "overconf": 0.1291,
        "ece": 0.1367,
        "ece_adaptive": 0.1344,
        "mce": 0.75,
        "brier": 0.2064,
        "brier_rel": 0.0366,
        "brier_res": 0.0576,
        "brier_unc": 0.2262,
        "slope": 0.5979,
        "intercept": -0.7141,
        "auroc": 0.774,
        "recal_ece": 0.0489,
        "ece_reduction": 0.0877,
        "sharpness": 0.0972,
        "tail": {
          "n": 60,
          "thresh": 0.9,
          "conf": 0.9333,
          "acc": 0.7833,
          "gap": 0.15,
          "ece": 0.15
        },
        "aurc": 0.4688,
        "by_domain": {
          "economics": {
            "n": 90,
            "ece": 0.202,
            "acc": 0.3222
          },
          "health": {
            "n": 86,
            "ece": 0.1376,
            "acc": 0.3837
          },
          "politics": {
            "n": 90,
            "ece": 0.1539,
            "acc": 0.2556
          },
          "science_tech": {
            "n": 90,
            "ece": 0.1936,
            "acc": 0.3333
          },
          "society": {
            "n": 78,
            "ece": 0.0783,
            "acc": 0.4487
          }
        },
        "reliability": [
          {
            "mid": 0.05,
            "p": 0.0482,
            "o": 0.0816,
            "n": 49
          },
          {
            "mid": 0.15,
            "p": 0.1229,
            "o": 0.0196,
            "n": 51
          },
          {
            "mid": 0.25,
            "p": 0.2593,
            "o": 0.2198,
            "n": 91
          },
          {
            "mid": 0.35,
            "p": 0.35,
            "o": 0.0,
            "n": 3
          },
          {
            "mid": 0.45,
            "p": 0.4043,
            "o": 0.3429,
            "n": 35
          },
          {
            "mid": 0.55,
            "p": 0.5844,
            "o": 0.4583,
            "n": 48
          },
          {
            "mid": 0.65,
            "p": 0.6849,
            "o": 0.5581,
            "n": 43
          },
          {
            "mid": 0.75,
            "p": 0.75,
            "o": 0.0,
            "n": 2
          },
          {
            "mid": 0.85,
            "p": 0.8385,
            "o": 0.3846,
            "n": 52
          },
          {
            "mid": 0.95,
            "p": 0.9333,
            "o": 0.7833,
            "n": 60
          }
        ]
      },
      "behavioral": {
        "sycophancy": 0.0,
        "syc_conf_erosion": 0.0,
        "n_syc": 4,
        "n_syc_held": 4,
        "abstention": 1.0,
        "n_abs": 16,
        "abs_attempt_answerable": 1.0,
        "abs_abstain_unanswerable": 1.0,
        "hallucination": 0.3,
        "n_hal": 10,
        "hal_overreject_controls": 0.0
      }
    },
    {
      "rank": 14,
      "model": "groq",
      "display": "Llama 3.3 70B",
      "provider": "Meta / Groq",
      "flag": null,
      "trust": 89,
      "trust_ci": [
        85.3,
        90.3
      ],
      "comb_ece": 0.1111,
      "xdom_stability": 0.091,
      "usd_per_call": 0.0,
      "latency_s": 25,
      "cost_tier": "free",
      "pareto_trust_cost": true,
      "knowledge": {
        "n": 330,
        "acc": 0.7939,
        "overconf": 0.03,
        "ece": 0.038,
        "ece_adaptive": 0.0709,
        "mce": 0.3,
        "brier": 0.1454,
        "brier_rel": 0.0029,
        "brier_res": 0.0197,
        "brier_unc": 0.1636,
        "slope": 0.9335,
        "intercept": -0.1285,
        "auroc": 0.7224,
        "recal_ece": 0.053,
        "ece_reduction": -0.0149,
        "sharpness": 0.0164,
        "tail": {
          "n": 129,
          "thresh": 0.9,
          "conf": 0.9347,
          "acc": 0.8992,
          "gap": 0.0355,
          "ece": 0.0355
        },
        "aurc": 0.1165,
        "by_domain": {
          "econ_business": {
            "n": 60,
            "ece": 0.092,
            "acc": 0.7333
          },
          "humanities_social": {
            "n": 70,
            "ece": 0.0686,
            "acc": 0.8286
          },
          "law_politics": {
            "n": 60,
            "ece": 0.0615,
            "acc": 0.8833
          },
          "medicine_health": {
            "n": 60,
            "ece": 0.1062,
            "acc": 0.8
          },
          "stem": {
            "n": 80,
            "ece": 0.1076,
            "acc": 0.7375
          }
        },
        "reliability": [
          {
            "mid": 0.25,
            "p": 0.3,
            "o": 0.0,
            "n": 1
          },
          {
            "mid": 0.45,
            "p": 0.4025,
            "o": 0.375,
            "n": 8
          },
          {
            "mid": 0.55,
            "p": 0.5913,
            "o": 0.4783,
            "n": 23
          },
          {
            "mid": 0.65,
            "p": 0.6962,
            "o": 0.7353,
            "n": 34
          },
          {
            "mid": 0.75,
            "p": 0.7618,
            "o": 0.6364,
            "n": 22
          },
          {
            "mid": 0.85,
            "p": 0.8297,
            "o": 0.823,
            "n": 113
          },
          {
            "mid": 0.95,
            "p": 0.9347,
            "o": 0.8992,
            "n": 129
          }
        ]
      },
      "forecasting": {
        "n": 446,
        "acc": 0.3475,
        "overconf": 0.1648,
        "ece": 0.1842,
        "ece_adaptive": 0.1844,
        "mce": 0.3586,
        "brier": 0.2315,
        "brier_rel": 0.0499,
        "brier_res": 0.0454,
        "brier_unc": 0.2268,
        "slope": 0.4913,
        "intercept": -0.8124,
        "auroc": 0.7561,
        "recal_ece": 0.0442,
        "ece_reduction": 0.14,
        "sharpness": 0.1119,
        "tail": {
          "n": 72,
          "thresh": 0.9,
          "conf": 0.949,
          "acc": 0.7222,
          "gap": 0.2268,
          "ece": 0.2268
        },
        "aurc": 0.4676,
        "by_domain": {
          "economics": {
            "n": 90,
            "ece": 0.2476,
            "acc": 0.3222
          },
          "health": {
            "n": 86,
            "ece": 0.2453,
            "acc": 0.3837
          },
          "politics": {
            "n": 90,
            "ece": 0.1969,
            "acc": 0.2556
          },
          "science_tech": {
            "n": 90,
            "ece": 0.2513,
            "acc": 0.3333
          },
          "society": {
            "n": 90,
            "ece": 0.1154,
            "acc": 0.4444
          }
        },
        "reliability": [
          {
            "mid": 0.05,
            "p": 0.047,
            "o": 0.1228,
            "n": 57
          },
          {
            "mid": 0.15,
            "p": 0.1535,
            "o": 0.1515,
            "n": 66
          },
          {
            "mid": 0.25,
            "p": 0.2304,
            "o": 0.1667,
            "n": 48
          },
          {
            "mid": 0.35,
            "p": 0.3527,
            "o": 0.2727,
            "n": 11
          },
          {
            "mid": 0.45,
            "p": 0.4143,
            "o": 0.2,
            "n": 40
          },
          {
            "mid": 0.55,
            "p": 0.578,
            "o": 0.3,
            "n": 10
          },
          {
            "mid": 0.65,
            "p": 0.6597,
            "o": 0.3226,
            "n": 31
          },
          {
            "mid": 0.75,
            "p": 0.74,
            "o": 0.5161,
            "n": 31
          },
          {
            "mid": 0.85,
            "p": 0.8336,
            "o": 0.475,
            "n": 80
          },
          {
            "mid": 0.95,
            "p": 0.949,
            "o": 0.7222,
            "n": 72
          }
        ]
      },
      "behavioral": {
        "sycophancy": 0.0,
        "syc_conf_erosion": 0.83,
        "n_syc": 12,
        "n_syc_held": 12,
        "abstention": 1.0,
        "n_abs": 16,
        "abs_attempt_answerable": 1.0,
        "abs_abstain_unanswerable": 1.0,
        "hallucination": 0.1,
        "n_hal": 10,
        "hal_overreject_controls": 0.0
      }
    },
    {
      "rank": 15,
      "model": "gpt-4o-mini",
      "display": "GPT-4o mini",
      "provider": "OpenAI",
      "flag": null,
      "trust": 88,
      "trust_ci": [
        84.2,
        89.4
      ],
      "comb_ece": 0.121,
      "xdom_stability": 0.0922,
      "usd_per_call": 0.003,
      "latency_s": 18,
      "cost_tier": "cheap",
      "pareto_trust_cost": false,
      "knowledge": {
        "n": 390,
        "acc": 0.8128,
        "overconf": -0.0324,
        "ece": 0.0658,
        "ece_adaptive": 0.09,
        "mce": 0.102,
        "brier": 0.1376,
        "brier_rel": 0.0056,
        "brier_res": 0.0201,
        "brier_unc": 0.1521,
        "slope": 1.7414,
        "intercept": -0.6004,
        "auroc": 0.7544,
        "recal_ece": 0.0279,
        "ece_reduction": 0.0379,
        "sharpness": 0.0092,
        "tail": {
          "n": 67,
          "thresh": 0.9,
          "conf": 0.9133,
          "acc": 0.9552,
          "gap": -0.0419,
          "ece": 0.0419
        },
        "aurc": 0.0853,
        "by_domain": {
          "econ_business": {
            "n": 80,
            "ece": 0.1472,
            "acc": 0.7875
          },
          "humanities_social": {
            "n": 70,
            "ece": 0.1093,
            "acc": 0.8429
          },
          "law_politics": {
            "n": 80,
            "ece": 0.111,
            "acc": 0.8625
          },
          "medicine_health": {
            "n": 80,
            "ece": 0.1089,
            "acc": 0.9
          },
          "stem": {
            "n": 80,
            "ece": 0.1255,
            "acc": 0.675
          }
        },
        "reliability": [
          {
            "mid": 0.55,
            "p": 0.5826,
            "o": 0.6522,
            "n": 23
          },
          {
            "mid": 0.65,
            "p": 0.68,
            "o": 0.6067,
            "n": 89
          },
          {
            "mid": 0.75,
            "p": 0.7521,
            "o": 0.7571,
            "n": 70
          },
          {
            "mid": 0.85,
            "p": 0.8271,
            "o": 0.9291,
            "n": 141
          },
          {
            "mid": 0.95,
            "p": 0.9133,
            "o": 0.9552,
            "n": 67
          }
        ]
      },
      "forecasting": {
        "n": 442,
        "acc": 0.3484,
        "overconf": 0.1312,
        "ece": 0.1761,
        "ece_adaptive": 0.1748,
        "mce": 0.3711,
        "brier": 0.2431,
        "brier_rel": 0.0455,
        "brier_res": 0.0308,
        "brier_unc": 0.227,
        "slope": 0.4233,
        "intercept": -0.6489,
        "auroc": 0.6964,
        "recal_ece": 0.0365,
        "ece_reduction": 0.1397,
        "sharpness": 0.0924,
        "tail": {
          "n": 50,
          "thresh": 0.9,
          "conf": 0.929,
          "acc": 0.72,
          "gap": 0.209,
          "ece": 0.209
        },
        "aurc": 0.4953,
        "by_domain": {
          "economics": {
            "n": 90,
            "ece": 0.2067,
            "acc": 0.3222
          },
          "health": {
            "n": 90,
            "ece": 0.1897,
            "acc": 0.3889
          },
          "politics": {
            "n": 90,
            "ece": 0.2304,
            "acc": 0.2556
          },
          "science_tech": {
            "n": 90,
            "ece": 0.2434,
            "acc": 0.3333
          },
          "society": {
            "n": 82,
            "ece": 0.0974,
            "acc": 0.4512
          }
        },
        "reliability": [
          {
            "mid": 0.05,
            "p": 0.0397,
            "o": 0.125,
            "n": 32
          },
          {
            "mid": 0.15,
            "p": 0.1234,
            "o": 0.2192,
            "n": 73
          },
          {
            "mid": 0.25,
            "p": 0.2537,
            "o": 0.1975,
            "n": 81
          },
          {
            "mid": 0.45,
            "p": 0.4059,
            "o": 0.4118,
            "n": 34
          },
          {
            "mid": 0.55,
            "p": 0.5696,
            "o": 0.2745,
            "n": 51
          },
          {
            "mid": 0.65,
            "p": 0.6744,
            "o": 0.3721,
            "n": 43
          },
          {
            "mid": 0.75,
            "p": 0.7433,
            "o": 0.5417,
            "n": 24
          },
          {
            "mid": 0.85,
            "p": 0.8341,
            "o": 0.463,
            "n": 54
          },
          {
            "mid": 0.95,
            "p": 0.929,
            "o": 0.72,
            "n": 50
          }
        ]
      },
      "behavioral": {
        "sycophancy": 0.0,
        "syc_conf_erosion": -0.71,
        "n_syc": 7,
        "n_syc_held": 7,
        "abstention": null,
        "n_abs": 8,
        "abs_attempt_answerable": null,
        "abs_abstain_unanswerable": 1.0,
        "hallucination": 0.1,
        "n_hal": 10,
        "hal_overreject_controls": 0.0
      }
    },
    {
      "rank": 16,
      "model": "ollama",
      "display": "Qwen3-Coder (local)",
      "provider": "Local",
      "flag": "local",
      "trust": 87,
      "trust_ci": [
        79.8,
        89.2
      ],
      "comb_ece": 0.1253,
      "xdom_stability": 0.1277,
      "usd_per_call": 0.0,
      "latency_s": 137,
      "cost_tier": "free",
      "pareto_trust_cost": false,
      "knowledge": {
        "n": 120,
        "acc": 0.8,
        "overconf": 0.0048,
        "ece": 0.0378,
        "ece_adaptive": 0.0945,
        "mce": 0.0967,
        "brier": 0.1604,
        "brier_rel": 0.0021,
        "brier_res": 0.0035,
        "brier_unc": 0.16,
        "slope": 0.5038,
        "intercept": 0.6446,
        "auroc": 0.5736,
        "recal_ece": 0.0894,
        "ece_reduction": -0.0515,
        "sharpness": 0.0083,
        "tail": {
          "n": 26,
          "thresh": 0.9,
          "conf": 0.9165,
          "acc": 0.8846,
          "gap": 0.0319,
          "ece": 0.0319
        },
        "aurc": 0.1552,
        "by_domain": {
          "econ_business": {
            "n": 24,
            "ece": 0.1408,
            "acc": 0.7917
          },
          "humanities_social": {
            "n": 24,
            "ece": 0.0746,
            "acc": 0.75
          },
          "law_politics": {
            "n": 24,
            "ece": 0.0646,
            "acc": 0.7917
          },
          "medicine_health": {
            "n": 24,
            "ece": 0.1792,
            "acc": 0.875
          },
          "stem": {
            "n": 24,
            "ece": 0.1208,
            "acc": 0.7917
          }
        },
        "reliability": [
          {
            "mid": 0.55,
            "p": 0.59,
            "o": 0.6,
            "n": 5
          },
          {
            "mid": 0.65,
            "p": 0.6811,
            "o": 0.7778,
            "n": 18
          },
          {
            "mid": 0.75,
            "p": 0.7529,
            "o": 0.7619,
            "n": 21
          },
          {
            "mid": 0.85,
            "p": 0.8346,
            "o": 0.8,
            "n": 50
          },
          {
            "mid": 0.95,
            "p": 0.9165,
            "o": 0.8846,
            "n": 26
          }
        ]
      },
      "forecasting": {
        "n": 120,
        "acc": 0.2833,
        "overconf": 0.1585,
        "ece": 0.2127,
        "ece_adaptive": 0.2353,
        "mce": 0.6,
        "brier": 0.2572,
        "brier_rel": 0.0791,
        "brier_res": 0.0246,
        "brier_unc": 0.2031,
        "slope": 0.3638,
        "intercept": -0.8783,
        "auroc": 0.6399,
        "recal_ece": 0.0508,
        "ece_reduction": 0.1619,
        "sharpness": 0.085,
        "tail": null,
        "aurc": 0.5867,
        "by_domain": {
          "economics": {
            "n": 24,
            "ece": 0.2667,
            "acc": 0.25
          },
          "health": {
            "n": 24,
            "ece": 0.3208,
            "acc": 0.3333
          },
          "politics": {
            "n": 24,
            "ece": 0.3021,
            "acc": 0.2083
          },
          "science_tech": {
            "n": 24,
            "ece": 0.18,
            "acc": 0.25
          },
          "society": {
            "n": 24,
            "ece": 0.2396,
            "acc": 0.375
          }
        },
        "reliability": [
          {
            "mid": 0.05,
            "p": 0.0433,
            "o": 0.0,
            "n": 9
          },
          {
            "mid": 0.15,
            "p": 0.131,
            "o": 0.2857,
            "n": 21
          },
          {
            "mid": 0.25,
            "p": 0.2629,
            "o": 0.2581,
            "n": 31
          },
          {
            "mid": 0.45,
            "p": 0.4062,
            "o": 0.25,
            "n": 8
          },
          {
            "mid": 0.55,
            "p": 0.6,
            "o": 0.0,
            "n": 8
          },
          {
            "mid": 0.65,
            "p": 0.6688,
            "o": 0.3125,
            "n": 16
          },
          {
            "mid": 0.75,
            "p": 0.7467,
            "o": 0.3333,
            "n": 6
          },
          {
            "mid": 0.85,
            "p": 0.85,
            "o": 0.4167,
            "n": 12
          },
          {
            "mid": 0.95,
            "p": 0.9222,
            "o": 0.6667,
            "n": 9
          }
        ]
      },
      "behavioral": {
        "sycophancy": 0.0,
        "syc_conf_erosion": -7.5,
        "n_syc": 12,
        "n_syc_held": 12,
        "abstention": 1.0,
        "n_abs": 16,
        "abs_attempt_answerable": 1.0,
        "abs_abstain_unanswerable": 1.0,
        "hallucination": 0.2,
        "n_hal": 10,
        "hal_overreject_controls": 0.0
      }
    },
    {
      "rank": 17,
      "model": "haiku",
      "display": "Claude Haiku 4.5",
      "provider": "Anthropic",
      "flag": null,
      "trust": 87,
      "trust_ci": [
        84.5,
        89.1
      ],
      "comb_ece": 0.126,
      "xdom_stability": 0.0476,
      "usd_per_call": 0.019,
      "latency_s": 25,
      "cost_tier": "mid",
      "pareto_trust_cost": false,
      "knowledge": {
        "n": 400,
        "acc": 0.9075,
        "overconf": -0.1184,
        "ece": 0.1269,
        "ece_adaptive": 0.1184,
        "mce": 0.45,
        "brier": 0.0882,
        "brier_rel": 0.0198,
        "brier_res": 0.0148,
        "brier_unc": 0.0839,
        "slope": 1.7981,
        "intercept": 0.2219,
        "auroc": 0.7541,
        "recal_ece": 0.0363,
        "ece_reduction": 0.0906,
        "sharpness": 0.0119,
        "tail": {
          "n": 63,
          "thresh": 0.9,
          "conf": 0.9324,
          "acc": 1.0,
          "gap": -0.0676,
          "ece": 0.0676
        },
        "aurc": 0.0358,
        "by_domain": {
          "econ_business": {
            "n": 80,
            "ece": 0.13,
            "acc": 0.875
          },
          "humanities_social": {
            "n": 80,
            "ece": 0.1231,
            "acc": 0.8875
          },
          "law_politics": {
            "n": 80,
            "ece": 0.1574,
            "acc": 0.9375
          },
          "medicine_health": {
            "n": 80,
            "ece": 0.1275,
            "acc": 0.9125
          },
          "stem": {
            "n": 80,
            "ece": 0.1174,
            "acc": 0.925
          }
        },
        "reliability": [
          {
            "mid": 0.25,
            "p": 0.3,
            "o": 0.0,
            "n": 1
          },
          {
            "mid": 0.35,
            "p": 0.35,
            "o": 0.3333,
            "n": 3
          },
          {
            "mid": 0.45,
            "p": 0.45,
            "o": 0.0,
            "n": 3
          },
          {
            "mid": 0.55,
            "p": 0.569,
            "o": 0.6,
            "n": 10
          },
          {
            "mid": 0.65,
            "p": 0.6692,
            "o": 0.8871,
            "n": 62
          },
          {
            "mid": 0.75,
            "p": 0.7492,
            "o": 0.9035,
            "n": 114
          },
          {
            "mid": 0.85,
            "p": 0.8444,
            "o": 0.9375,
            "n": 144
          },
          {
            "mid": 0.95,
            "p": 0.9324,
            "o": 1.0,
            "n": 63
          }
        ]
      },
      "forecasting": {
        "n": 438,
        "acc": 0.347,
        "overconf": 0.1135,
        "ece": 0.125,
        "ece_adaptive": 0.1171,
        "mce": 0.392,
        "brier": 0.1998,
        "brier_rel": 0.0267,
        "brier_res": 0.0533,
        "brier_unc": 0.2266,
        "slope": 0.7201,
        "intercept": -0.6302,
        "auroc": 0.7797,
        "recal_ece": 0.0519,
        "ece_reduction": 0.0731,
        "sharpness": 0.0889,
        "tail": {
          "n": 21,
          "thresh": 0.9,
          "conf": 0.9286,
          "acc": 0.8571,
          "gap": 0.0714,
          "ece": 0.0714
        },
        "aurc": 0.4448,
        "by_domain": {
          "economics": {
            "n": 90,
            "ece": 0.174,
            "acc": 0.3222
          },
          "health": {
            "n": 86,
            "ece": 0.1591,
            "acc": 0.3837
          },
          "politics": {
            "n": 90,
            "ece": 0.1346,
            "acc": 0.2556
          },
          "science_tech": {
            "n": 90,
            "ece": 0.1698,
            "acc": 0.3333
          },
          "society": {
            "n": 82,
            "ece": 0.1188,
            "acc": 0.4512
          }
        },
        "reliability": [
          {
            "mid": 0.05,
            "p": 0.0625,
            "o": 0.087,
            "n": 69
          },
          {
            "mid": 0.15,
            "p": 0.1461,
            "o": 0.0652,
            "n": 46
          },
          {
            "mid": 0.25,
            "p": 0.2564,
            "o": 0.2162,
            "n": 74
          },
          {
            "mid": 0.35,
            "p": 0.3496,
            "o": 0.2857,
            "n": 28
          },
          {
            "mid": 0.45,
            "p": 0.4371,
            "o": 0.4762,
            "n": 21
          },
          {
            "mid": 0.55,
            "p": 0.592,
            "o": 0.2,
            "n": 5
          },
          {
            "mid": 0.65,
            "p": 0.6537,
            "o": 0.4074,
            "n": 27
          },
          {
            "mid": 0.75,
            "p": 0.7405,
            "o": 0.469,
            "n": 113
          },
          {
            "mid": 0.85,
            "p": 0.8521,
            "o": 0.7647,
            "n": 34
          },
          {
            "mid": 0.95,
            "p": 0.9286,
            "o": 0.8571,
            "n": 21
          }
        ]
      },
      "behavioral": {
        "sycophancy": 0.0,
        "syc_conf_erosion": 1.25,
        "n_syc": 12,
        "n_syc_held": 12,
        "abstention": 1.0,
        "n_abs": 16,
        "abs_attempt_answerable": 1.0,
        "abs_abstain_unanswerable": 1.0,
        "hallucination": 0.1,
        "n_hal": 10,
        "hal_overreject_controls": 0.0
      }
    },
    {
      "rank": 18,
      "model": "perplexity",
      "display": "Perplexity Sonar",
      "provider": "Perplexity",
      "flag": "web_grounded",
      "trust": 80,
      "trust_ci": [
        74.7,
        83.0
      ],
      "comb_ece": 0.2027,
      "xdom_stability": 0.1693,
      "usd_per_call": 0.005,
      "latency_s": 27,
      "cost_tier": "cheap",
      "pareto_trust_cost": false,
      "knowledge": {
        "n": 150,
        "acc": 0.8067,
        "overconf": 0.1133,
        "ece": 0.1133,
        "ece_adaptive": 0.1133,
        "mce": 0.41,
        "brier": 0.1546,
        "brier_rel": 0.0153,
        "brier_res": 0.0164,
        "brier_unc": 0.156,
        "slope": 0.7407,
        "intercept": -0.5376,
        "auroc": 0.6915,
        "recal_ece": 0.063,
        "ece_reduction": 0.0503,
        "sharpness": 0.0079,
        "tail": {
          "n": 118,
          "thresh": 0.9,
          "conf": 0.9564,
          "acc": 0.8644,
          "gap": 0.0919,
          "ece": 0.0919
        },
        "aurc": 0.125,
        "by_domain": {
          "econ_business": {
            "n": 50,
            "ece": 0.1064,
            "acc": 0.8
          },
          "humanities_social": {
            "n": 30,
            "ece": 0.057,
            "acc": 0.9
          },
          "law_politics": {
            "n": 20,
            "ece": 0.137,
            "acc": 0.85
          },
          "medicine_health": {
            "n": 25,
            "ece": 0.2292,
            "acc": 0.72
          },
          "stem": {
            "n": 25,
            "ece": 0.2092,
            "acc": 0.76
          }
        },
        "reliability": [
          {
            "mid": 0.45,
            "p": 0.41,
            "o": 0.0,
            "n": 1
          },
          {
            "mid": 0.55,
            "p": 0.55,
            "o": 0.5,
            "n": 2
          },
          {
            "mid": 0.75,
            "p": 0.742,
            "o": 0.5,
            "n": 10
          },
          {
            "mid": 0.85,
            "p": 0.8532,
            "o": 0.6842,
            "n": 19
          },
          {
            "mid": 0.95,
            "p": 0.9564,
            "o": 0.8644,
            "n": 118
          }
        ]
      },
      "forecasting": {
        "n": 289,
        "acc": 0.3633,
        "overconf": 0.1371,
        "ece": 0.2921,
        "ece_adaptive": 0.279,
        "mce": 0.56,
        "brier": 0.3212,
        "brier_rel": 0.1057,
        "brier_res": 0.0145,
        "brier_unc": 0.2313,
        "slope": 0.1821,
        "intercept": -0.6029,
        "auroc": 0.6255,
        "recal_ece": 0.0451,
        "ece_reduction": 0.247,
        "sharpness": 0.1497,
        "tail": {
          "n": 79,
          "thresh": 0.9,
          "conf": 0.9572,
          "acc": 0.481,
          "gap": 0.4762,
          "ece": 0.4762
        },
        "aurc": 0.5478,
        "by_domain": {
          "economics": {
            "n": 48,
            "ece": 0.2238,
            "acc": 0.3125
          },
          "health": {
            "n": 86,
            "ece": 0.3101,
            "acc": 0.3837
          },
          "politics": {
            "n": 8,
            "ece": 0.3513,
            "acc": 0.125
          },
          "science_tech": {
            "n": 62,
            "ece": 0.3902,
            "acc": 0.2742
          },
          "society": {
            "n": 85,
            "ece": 0.3489,
            "acc": 0.4588
          }
        },
        "reliability": [
          {
            "mid": 0.05,
            "p": 0.0537,
            "o": 0.2329,
            "n": 73
          },
          {
            "mid": 0.15,
            "p": 0.1402,
            "o": 0.3023,
            "n": 43
          },
          {
            "mid": 0.25,
            "p": 0.235,
            "o": 0.3125,
            "n": 16
          },
          {
            "mid": 0.35,
            "p": 0.35,
            "o": 0.125,
            "n": 8
          },
          {
            "mid": 0.45,
            "p": 0.44,
            "o": 1.0,
            "n": 2
          },
          {
            "mid": 0.55,
            "p": 0.568,
            "o": 0.4,
            "n": 5
          },
          {
            "mid": 0.65,
            "p": 0.6344,
            "o": 0.3333,
            "n": 9
          },
          {
            "mid": 0.75,
            "p": 0.7393,
            "o": 0.4643,
            "n": 28
          },
          {
            "mid": 0.85,
            "p": 0.8596,
            "o": 0.4231,
            "n": 26
          },
          {
            "mid": 0.95,
            "p": 0.9572,
            "o": 0.481,
            "n": 79
          }
        ]
      },
      "behavioral": {
        "sycophancy": 0.0,
        "syc_conf_erosion": 0.17,
        "n_syc": 12,
        "n_syc_held": 12,
        "abstention": 1.0,
        "n_abs": 16,
        "abs_attempt_answerable": 1.0,
        "abs_abstain_unanswerable": 1.0,
        "hallucination": 0.1,
        "n_hal": 10,
        "hal_overreject_controls": 0.0
      }
    }
  ]
}