diff --git a/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md b/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md
index 4bce1fcc..66a3fdda 100644
--- a/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md
+++ b/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md
@@ -134,6 +134,12 @@ METR, the primary producer of governance-relevant capability benchmarks, explici
 
 METR's January 2026 evaluation of GPT-5 placed its autonomous replication and adaptation capability at 2h17m (50% time horizon), far below catastrophic risk thresholds. In the same month, AISLE (an AI system) autonomously discovered 12 OpenSSL CVEs including a 30-year-old bug through fully autonomous operation. This is direct evidence that formal pre-deployment evaluations are not capturing operational dangerous autonomy that is already deployed at commercial scale.
 
+### Additional Evidence (extend)
+*Source: [[2026-03-26-metr-algorithmic-vs-holistic-evaluation]] | Added: 2026-03-26*
+
+METR's August 2025 research update provides specific quantification of the evaluation reliability problem: algorithmic scoring overstates capability by 2-3x (38% algorithmic success vs 0% holistic success for Claude 3.7 Sonnet on software tasks), and HCAST benchmark version instability of ~50% between annual versions means even the measurement instrument itself is unstable. METR explicitly acknowledges their own evaluations 'may substantially overestimate' real-world capability.
+
+
 
 
 
diff --git a/inbox/queue/.extraction-debug/2026-03-26-metr-algorithmic-vs-holistic-evaluation.json b/inbox/queue/.extraction-debug/2026-03-26-metr-algorithmic-vs-holistic-evaluation.json
new file mode 100644
index 00000000..0989cdb7
--- /dev/null
+++ b/inbox/queue/.extraction-debug/2026-03-26-metr-algorithmic-vs-holistic-evaluation.json
@@ -0,0 +1,34 @@
+{
+  "rejected_claims": [
+    {
+      "filename": "algorithmic-benchmark-scoring-overstates-ai-capability-by-2-3x-versus-holistic-human-review-because-automated-metrics-measure-core-implementation-while-missing-documentation-testing-and-code-quality.md",
+      "issues": [
+        "missing_attribution_extractor"
+      ]
+    },
+    {
+      "filename": "capability-benchmark-version-instability-creates-governance-discontinuity-because-HCAST-time-horizon-estimates-shifted-50-percent-between-annual-versions-making-safety-thresholds-a-moving-target.md",
+      "issues": [
+        "missing_attribution_extractor"
+      ]
+    }
+  ],
+  "validation_stats": {
+    "total": 2,
+    "kept": 0,
+    "fixed": 4,
+    "rejected": 2,
+    "fixes_applied": [
+      "algorithmic-benchmark-scoring-overstates-ai-capability-by-2-3x-versus-holistic-human-review-because-automated-metrics-measure-core-implementation-while-missing-documentation-testing-and-code-quality.md:set_created:2026-03-26",
+      "algorithmic-benchmark-scoring-overstates-ai-capability-by-2-3x-versus-holistic-human-review-because-automated-metrics-measure-core-implementation-while-missing-documentation-testing-and-code-quality.md:stripped_wiki_link:AI-capability-and-reliability-are-independent-dimensions-bec",
+      "capability-benchmark-version-instability-creates-governance-discontinuity-because-HCAST-time-horizon-estimates-shifted-50-percent-between-annual-versions-making-safety-thresholds-a-moving-target.md:set_created:2026-03-26",
+      "capability-benchmark-version-instability-creates-governance-discontinuity-because-HCAST-time-horizon-estimates-shifted-50-percent-between-annual-versions-making-safety-thresholds-a-moving-target.md:stripped_wiki_link:Anthropics-RSP-rollback-under-commercial-pressure-is-the-fir"
+    ],
+    "rejections": [
+      "algorithmic-benchmark-scoring-overstates-ai-capability-by-2-3x-versus-holistic-human-review-because-automated-metrics-measure-core-implementation-while-missing-documentation-testing-and-code-quality.md:missing_attribution_extractor",
+      "capability-benchmark-version-instability-creates-governance-discontinuity-because-HCAST-time-horizon-estimates-shifted-50-percent-between-annual-versions-making-safety-thresholds-a-moving-target.md:missing_attribution_extractor"
+    ]
+  },
+  "model": "anthropic/claude-sonnet-4.5",
+  "date": "2026-03-26"
+}
\ No newline at end of file
diff --git a/inbox/queue/2026-03-26-metr-algorithmic-vs-holistic-evaluation.md b/inbox/queue/2026-03-26-metr-algorithmic-vs-holistic-evaluation.md
index 11971c50..edf6c2df 100644
--- a/inbox/queue/2026-03-26-metr-algorithmic-vs-holistic-evaluation.md
+++ b/inbox/queue/2026-03-26-metr-algorithmic-vs-holistic-evaluation.md
@@ -7,9 +7,13 @@ date: 2025-08-12
 domain: ai-alignment
 secondary_domains: []
 format: blog
-status: unprocessed
+status: enrichment
 priority: high
 tags: [METR, HCAST, algorithmic-scoring, holistic-evaluation, benchmark-reality-gap, SWE-bench, governance-thresholds, capability-measurement]
+processed_by: theseus
+processed_date: 2026-03-26
+enrichments_applied: ["pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md"]
+extraction_model: "anthropic/claude-sonnet-4.5"
 ---
 
 ## Content
@@ -54,3 +58,11 @@ METR's current formal thresholds for "catastrophic risk" scrutiny:
 PRIMARY CONNECTION: [[scalable oversight degrades rapidly as capability gaps grow with debate achieving only 50 percent success at moderate gaps]]
 WHY ARCHIVED: Empirical validation that the *measurement infrastructure* for AI governance is systematically unreliable — extends session 13/14's benchmark-reality gap finding with specific numbers and the source organization explicitly acknowledging the problem
 EXTRACTION HINT: Focus on the governance implication: METR's own evaluations, which are used to set safety thresholds, may overstate real-world capability by 2-3x in software domains — and the benchmark is unstable enough to shift 50%+ between annual versions
+
+
+## Key Facts
+- METR's formal thresholds for catastrophic risk scrutiny: 80% time horizon exceeding 8 hours on high-context tasks, or 50% time horizon exceeding 40 hours on software engineering/ML tasks
+- GPT-5's 50% time horizon as of January 2026: 2 hours 17 minutes (far below 40-hour threshold)
+- METR's 131-day doubling time estimate from prior reports is derived from benchmark performance that may substantially overestimate real-world capability
+- SWE-Bench Verified success rates for frontier models: around 70-75%
+- METR is incorporating holistic assessment elements into formal evaluations: assurance checklists, reasoning trace analysis, situational awareness testing
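
The Key Facts added above make the governance arithmetic checkable. Below is a minimal back-of-envelope sketch, assuming a simple exponential time-horizon model with METR's 131-day doubling time (an illustrative extrapolation, not METR's forecasting methodology) and a hypothetical measurement date of 2026-01-15. It estimates when GPT-5's measured 2h17m 50% time horizon would cross the 40-hour threshold, then shows how far the 2-3x algorithmic overstatement and ~50% HCAST version instability move that crossing.

```python
import math
from datetime import date, timedelta

# Figures taken from this change's Key Facts; the exponential extrapolation
# itself is an illustrative assumption, not METR's forecasting methodology.
DOUBLING_DAYS = 131                  # METR's prior doubling-time estimate
MEASURED_HORIZON_H = 2 + 17 / 60     # GPT-5 50% time horizon, Jan 2026 (2h17m)
THRESHOLD_H = 40                     # METR 50%-horizon threshold, SWE/ML tasks
MEASURED_ON = date(2026, 1, 15)      # hypothetical measurement date

def days_to_threshold(horizon_h: float) -> float:
    """Days for an exponentially growing time horizon to reach THRESHOLD_H."""
    return DOUBLING_DAYS * math.log2(THRESHOLD_H / horizon_h)

# Naive extrapolation straight from the algorithmic score.
naive = days_to_threshold(MEASURED_HORIZON_H)
crossing = MEASURED_ON + timedelta(days=round(naive))
print(f"naive 40h crossing: {naive:.0f} days out (~{crossing})")

# If algorithmic scoring overstates real capability 2-3x, the holistic
# horizon is lower and the crossing slips by 1.0-1.6 doubling periods.
for overstatement in (2, 3):
    slip = days_to_threshold(MEASURED_HORIZON_H / overstatement) - naive
    print(f"{overstatement}x overstatement pushes crossing out {slip:+.0f} days")

# ~50% HCAST version instability shifts the measured horizon itself,
# moving the crossing by roughly 0.6-1.0 doubling periods either way.
for factor in (0.5, 1.5):
    shift = days_to_threshold(MEASURED_HORIZON_H * factor) - naive
    print(f"x{factor} version shift moves crossing {shift:+.0f} days")
```

Under these assumptions the naive crossing lands roughly 540 days out, while the overstatement and version-instability corrections move it anywhere from about 77 days earlier to 208 days later. A threshold-crossing estimate that swings by most of a year on measurement artifacts alone is the "moving target" problem in miniature.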