From 98d283e794e85eff3bddc68d6d9a9574b1ce8a80 Mon Sep 17 00:00:00 2001
From: Teleo Agents
Date: Tue, 24 Mar 2026 00:17:26 +0000
Subject: [PATCH] extract: 2026-01-29-metr-time-horizon-1-1
Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>

---
 ...ernance-built-on-unreliable-foundations.md |  6 ++++
 .../2026-01-29-metr-time-horizon-1-1.json     | 33 +++++++++++++++++++
 .../queue/2026-01-29-metr-time-horizon-1-1.md | 18 +++++++++-
 3 files changed, 56 insertions(+), 1 deletion(-)
 create mode 100644 inbox/queue/.extraction-debug/2026-01-29-metr-time-horizon-1-1.json

diff --git a/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md b/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md
index d8fdd327..65aec47e 100644
--- a/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md
+++ b/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md
@@ -104,6 +104,12 @@ IAISR 2026 states that 'pre-deployment testing increasingly fails to predict rea
 
 Anthropic's explicit admission that 'the science of model evaluation isn't well-developed enough to provide definitive threshold assessments' is direct confirmation from a frontier lab that evaluation tools are insufficient for governance. This aligns with METR's March 2026 modeling assumptions note, suggesting field-wide consensus that current evaluation science cannot support the governance structures built on top of it.
 
+### Additional Evidence (extend)
+*Source: [[2026-01-29-metr-time-horizon-1-1]] | Added: 2026-03-24*
+
+METR's scaffold sensitivity finding (GPT-4o and o3 performing better under Vivaria than Inspect) adds a new dimension to evaluation unreliability: the same model produces different capability estimates depending on evaluation infrastructure, introducing cross-model comparison uncertainty that governance frameworks do not account for.
+
+

diff --git a/inbox/queue/.extraction-debug/2026-01-29-metr-time-horizon-1-1.json b/inbox/queue/.extraction-debug/2026-01-29-metr-time-horizon-1-1.json
new file mode 100644
index 00000000..4a465e01
--- /dev/null
+++ b/inbox/queue/.extraction-debug/2026-01-29-metr-time-horizon-1-1.json
@@ -0,0 +1,33 @@
+{
+  "rejected_claims": [
+    {
+      "filename": "metr-time-horizon-benchmark-saturating-at-governance-relevant-capability-levels.md",
+      "issues": [
+        "missing_attribution_extractor"
+      ]
+    },
+    {
+      "filename": "ai-capability-evaluation-scaffold-sensitivity-introduces-cross-model-comparison-uncertainty.md",
+      "issues": [
+        "missing_attribution_extractor"
+      ]
+    }
+  ],
+  "validation_stats": {
+    "total": 2,
+    "kept": 0,
+    "fixed": 3,
+    "rejected": 2,
+    "fixes_applied": [
+      "metr-time-horizon-benchmark-saturating-at-governance-relevant-capability-levels.md:set_created:2026-03-24",
+      "metr-time-horizon-benchmark-saturating-at-governance-relevant-capability-levels.md:stripped_wiki_link:verification degrades faster than capability grows",
+      "ai-capability-evaluation-scaffold-sensitivity-introduces-cross-model-comparison-uncertainty.md:set_created:2026-03-24"
+    ],
+    "rejections": [
+      "metr-time-horizon-benchmark-saturating-at-governance-relevant-capability-levels.md:missing_attribution_extractor",
+      "ai-capability-evaluation-scaffold-sensitivity-introduces-cross-model-comparison-uncertainty.md:missing_attribution_extractor"
+    ]
+  },
+  "model": "anthropic/claude-sonnet-4.5",
+  "date": "2026-03-24"
+}
\ No newline at end of file

diff --git a/inbox/queue/2026-01-29-metr-time-horizon-1-1.md b/inbox/queue/2026-01-29-metr-time-horizon-1-1.md
index e25c3c87..3f1d6a6e 100644
--- a/inbox/queue/2026-01-29-metr-time-horizon-1-1.md
+++ b/inbox/queue/2026-01-29-metr-time-horizon-1-1.md
@@ -7,9 +7,13 @@ date: 2026-01-29
 domain: ai-alignment
 secondary_domains: []
 format: research-report
-status: unprocessed
+status: enrichment
 priority: high
 tags: [metr, time-horizon, capability-evaluation, task-saturation, measurement, frontier-ai, benchmark]
+processed_by: theseus
+processed_date: 2026-03-24
+enrichments_applied: ["pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md"]
+extraction_model: "anthropic/claude-sonnet-4.5"
 ---
 
 ## Content
@@ -68,3 +72,15 @@
 PRIMARY CONNECTION: [[verification degrades faster than capability grows]]
 WHY ARCHIVED: TH1.1 provides the empirical grounding for "131-day doubling time" and simultaneously the evidence that the measurement tool tracking that doubling is saturating. The saturation acknowledgment from METR itself is the most reliable source for this claim.
 EXTRACTION HINT: The extractor should distinguish between two separate findings: (1) capability is doubling every 131 days — this is a finding; (2) the measurement tool for this doubling is saturating — this is also a finding. Both can be true simultaneously and both deserve separate KB claims. The saturation finding specifically challenges the reliability of the doubling-time estimate itself.
+
+
+## Key Facts
+- METR's full historical trend (2019-2025) estimates 196-day capability doubling time
+- METR's TH1.1 estimates 131-day capability doubling since 2023 (20% faster than previous 165-day estimate)
+- METR's TH1.1 estimates 89-day capability doubling since 2024
+- Claude Opus 4.5 achieved 320-minute (5.3 hour) time horizon in TH1.1
+- GPT-5 achieved 214-minute time horizon in TH1.1
+- o3 achieved 121-minute time horizon in TH1.1
+- METR doubled long-duration tasks from 14 to 31 in TH1.1
+- Only 5 of 31 long tasks in TH1.1 have actual human baseline times
+- GPT-4 variants saw 35-57% downward revisions in TH1.1 estimates
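
For orientation, the Key Facts in the final hunk all rest on a single exponential relationship between doubling time and time horizon. Below is a minimal sketch of that arithmetic, not part of the patch itself: it assumes the simple model H(t) = H0 * 2^(t/T), takes the 320-minute Opus 4.5 horizon and the 131/165-day doubling times from the queue note, and uses a 365-day projection window purely as an illustration (not a METR figure).

```python
# Illustrative sketch only; all inputs except the 365-day window come from the Key Facts above.

def projected_horizon(h0_minutes: float, doubling_days: float, days_ahead: float) -> float:
    """Project a time horizon forward assuming exponential growth: H(t) = H0 * 2^(t / T)."""
    return h0_minutes * 2 ** (days_ahead / doubling_days)

opus_45_horizon_min = 320   # Claude Opus 4.5 time horizon in TH1.1 (minutes)
old_doubling_days = 165     # previous METR doubling-time estimate
new_doubling_days = 131     # TH1.1 doubling-time estimate since 2023

# "20% faster" check: relative shortening of the doubling time.
speedup = (old_doubling_days - new_doubling_days) / old_doubling_days
print(f"doubling-time reduction: {speedup:.0%}")  # ~21%, i.e. roughly the quoted 20%

# How much the revision moves a one-year projection (365 days is an arbitrary example).
for label, t_double in [("165-day", old_doubling_days), ("131-day", new_doubling_days)]:
    h = projected_horizon(opus_45_horizon_min, t_double, days_ahead=365)
    print(f"{label} doubling: {h / 60:.1f} h projected horizon after 365 days")
```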