From 4d5266d74ff934365af7ed963c89d0f8eb5283b5 Mon Sep 17 00:00:00 2001 From: Teleo Agents Date: Thu, 26 Mar 2026 00:48:42 +0000 Subject: [PATCH] extract: 2026-03-26-metr-gpt5-evaluation-time-horizon Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70> --- ...ernance-built-on-unreliable-foundations.md | 6 ++++ ...-26-metr-gpt5-evaluation-time-horizon.json | 35 +++++++++++++++++++ ...03-26-metr-gpt5-evaluation-time-horizon.md | 17 ++++++++- 3 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 inbox/queue/.extraction-debug/2026-03-26-metr-gpt5-evaluation-time-horizon.json diff --git a/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md b/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md index 66a3fddaa..4a72236b6 100644 --- a/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md +++ b/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md @@ -139,6 +139,12 @@ METR's January 2026 evaluation of GPT-5 placed its autonomous replication and ad METR's August 2025 research update provides specific quantification of the evaluation reliability problem: algorithmic scoring overstates capability by 2-3x (38% algorithmic success vs 0% holistic success for Claude 3.7 Sonnet on software tasks), and HCAST benchmark version instability of ~50% between annual versions means even the measurement instrument itself is unstable. METR explicitly acknowledges their own evaluations 'may substantially overestimate' real-world capability. 
+### Additional Evidence (extend) +*Source: [[2026-03-26-metr-gpt5-evaluation-time-horizon]] | Added: 2026-03-26* + +METR's HCAST benchmark showed 50-57% volatility in time horizon estimates between v1.0 and v1.1 for the same models, demonstrating that pre-deployment evaluation metrics are unstable at the measurement level independent of capability changes. This adds a new failure mode beyond prediction-deployment gaps: the evaluation instruments themselves lack measurement reliability. + + diff --git a/inbox/queue/.extraction-debug/2026-03-26-metr-gpt5-evaluation-time-horizon.json b/inbox/queue/.extraction-debug/2026-03-26-metr-gpt5-evaluation-time-horizon.json new file mode 100644 index 000000000..83b2b4490 --- /dev/null +++ b/inbox/queue/.extraction-debug/2026-03-26-metr-gpt5-evaluation-time-horizon.json @@ -0,0 +1,35 @@ +{ + "rejected_claims": [ + { + "filename": "metr-time-horizon-benchmarks-show-50-percent-instability-between-versions-making-governance-thresholds-unreliable.md", + "issues": [ + "missing_attribution_extractor" + ] + }, + { + "filename": "current-frontier-models-evaluate-17x-below-metrs-catastrophic-autonomy-threshold-for-ai-research-automation.md", + "issues": [ + "missing_attribution_extractor" + ] + } + ], + "validation_stats": { + "total": 2, + "kept": 0, + "fixed": 5, + "rejected": 2, + "fixes_applied": [ + "metr-time-horizon-benchmarks-show-50-percent-instability-between-versions-making-governance-thresholds-unreliable.md:set_created:2026-03-26", + "metr-time-horizon-benchmarks-show-50-percent-instability-between-versions-making-governance-thresholds-unreliable.md:stripped_wiki_link:pre-deployment-AI-evaluations-do-not-predict-real-world-risk", + "current-frontier-models-evaluate-17x-below-metrs-catastrophic-autonomy-threshold-for-ai-research-automation.md:set_created:2026-03-26", + 
"current-frontier-models-evaluate-17x-below-metrs-catastrophic-autonomy-threshold-for-ai-research-automation.md:stripped_wiki_link:three-conditions-gate-AI-takeover-risk-autonomy-robotics-and", + "current-frontier-models-evaluate-17x-below-metrs-catastrophic-autonomy-threshold-for-ai-research-automation.md:stripped_wiki_link:AI-models-distinguish-testing-from-deployment-environments-p" + ], + "rejections": [ + "metr-time-horizon-benchmarks-show-50-percent-instability-between-versions-making-governance-thresholds-unreliable.md:missing_attribution_extractor", + "current-frontier-models-evaluate-17x-below-metrs-catastrophic-autonomy-threshold-for-ai-research-automation.md:missing_attribution_extractor" + ] + }, + "model": "anthropic/claude-sonnet-4.5", + "date": "2026-03-26" +} \ No newline at end of file diff --git a/inbox/queue/2026-03-26-metr-gpt5-evaluation-time-horizon.md b/inbox/queue/2026-03-26-metr-gpt5-evaluation-time-horizon.md index bf791129d..db774d68a 100644 --- a/inbox/queue/2026-03-26-metr-gpt5-evaluation-time-horizon.md +++ b/inbox/queue/2026-03-26-metr-gpt5-evaluation-time-horizon.md @@ -7,9 +7,13 @@ date: 2026-01-01 domain: ai-alignment secondary_domains: [] format: report -status: unprocessed +status: enrichment priority: medium tags: [METR, GPT-5, time-horizon, capability-thresholds, safety-evaluation, holistic-evaluation, governance-thresholds, catastrophic-risk] +processed_by: theseus +processed_date: 2026-03-26 +enrichments_applied: ["pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md"] +extraction_model: "anthropic/claude-sonnet-4.5" --- ## Content @@ -59,3 +63,14 @@ This suggests ~50% volatility in time horizon estimates between benchmark versio PRIMARY CONNECTION: [[scalable oversight degrades rapidly as capability gaps grow with debate achieving only 50 percent success at moderate gaps]] WHY ARCHIVED: Provides formal numerical calibration of where current frontier 
models sit relative to governance thresholds — essential context for evaluating B1's "greatest outstanding problem" claim. The finding (2h17m vs 40-hour threshold) partially challenges alarmist interpretations while the 50%+ benchmark instability maintains the governance concern EXTRACTION HINT: Separate claims: (1) "Current frontier models evaluate at ~17x below METR's catastrophic risk threshold for autonomous AI R&D" — calibrating B1; (2) "METR's time horizon benchmark shifted 50-57% between v1.0 and v1.1 versions, making governance thresholds derived from it a moving target" — the reliability problem + + +## Key Facts +- GPT-5 achieved a 50% time horizon of 2 hours 17 minutes on METR's HCAST evaluation +- GPT-5's 80% time horizon was below 8 hours +- METR's catastrophic risk thresholds are: 8 hours (80% threshold for heightened scrutiny) and 40 hours (50% threshold for strong concern) +- HCAST v1.1 contains 228 tasks as of January 2026 +- Between HCAST v1.0 and v1.1, GPT-4 1106's time horizon estimate dropped 57% +- Between HCAST v1.0 and v1.1, GPT-5's time horizon estimate rose 55% +- METR's evaluation methodology includes assurance checklists, reasoning trace analysis, and situational awareness testing +- METR evaluations are used by OpenAI, Anthropic, and other frontier labs for safety milestone assessments