diff --git a/inbox/queue/.extraction-debug/2026-01-29-metr-time-horizon-1-1-methodology-update.json b/inbox/queue/.extraction-debug/2026-01-29-metr-time-horizon-1-1-methodology-update.json new file mode 100644 index 00000000..f7c744c5 --- /dev/null +++ b/inbox/queue/.extraction-debug/2026-01-29-metr-time-horizon-1-1-methodology-update.json @@ -0,0 +1,36 @@ +{ + "rejected_claims": [ + { + "filename": "ai-autonomous-capability-doubling-every-131-days-creates-structural-governance-lag.md", + "issues": [ + "missing_attribution_extractor" + ] + }, + { + "filename": "evaluation-infrastructure-saturates-at-capability-levels-where-oversight-matters-most.md", + "issues": [ + "missing_attribution_extractor" + ] + } + ], + "validation_stats": { + "total": 2, + "kept": 0, + "fixed": 6, + "rejected": 2, + "fixes_applied": [ + "ai-autonomous-capability-doubling-every-131-days-creates-structural-governance-lag.md:set_created:2026-03-23", + "ai-autonomous-capability-doubling-every-131-days-creates-structural-governance-lag.md:stripped_wiki_link:verification degrades faster than capability grows", + "evaluation-infrastructure-saturates-at-capability-levels-where-oversight-matters-most.md:set_created:2026-03-23", + "evaluation-infrastructure-saturates-at-capability-levels-where-oversight-matters-most.md:stripped_wiki_link:verification degrades faster than capability grows", + "evaluation-infrastructure-saturates-at-capability-levels-where-oversight-matters-most.md:stripped_wiki_link:economic forces push humans out of every cognitive loop wher", + "evaluation-infrastructure-saturates-at-capability-levels-where-oversight-matters-most.md:stripped_wiki_link:human verification bandwidth is the binding constraint on AG" + ], + "rejections": [ + "ai-autonomous-capability-doubling-every-131-days-creates-structural-governance-lag.md:missing_attribution_extractor", + "evaluation-infrastructure-saturates-at-capability-levels-where-oversight-matters-most.md:missing_attribution_extractor" + ] + 
}, + "model": "anthropic/claude-sonnet-4.5", + "date": "2026-03-23" +} \ No newline at end of file diff --git a/inbox/queue/2026-01-29-metr-time-horizon-1-1-methodology-update.md b/inbox/queue/2026-01-29-metr-time-horizon-1-1-methodology-update.md index a9ab04dc..fcf370f0 100644 --- a/inbox/queue/2026-01-29-metr-time-horizon-1-1-methodology-update.md +++ b/inbox/queue/2026-01-29-metr-time-horizon-1-1-methodology-update.md @@ -7,9 +7,12 @@ date: 2026-01-29 domain: ai-alignment secondary_domains: [] format: blog-post -status: unprocessed +status: enrichment priority: high tags: [metr, time-horizon, capability-measurement, evaluation-methodology, autonomy, scaling, saturation] +processed_by: theseus +processed_date: 2026-03-23 +extraction_model: "anthropic/claude-sonnet-4.5" --- ## Content @@ -65,3 +68,15 @@ METR published an updated version of their autonomous AI capability measurement PRIMARY CONNECTION: [[scalable oversight degrades rapidly as capability gaps grow with debate achieving only 50 percent success at moderate gaps]] WHY ARCHIVED: Quantifies the capability-governance gap with the most precise measurement available; reveals measurement infrastructure itself is failing for frontier models EXTRACTION HINT: Two claims possible — one on the doubling rate as governance timeline mismatch; one on evaluation saturation as a new instance of B4. Check whether the doubling rate number updates or supersedes existing claims. 
+ + +## Key Facts +- METR Time Horizon 1.1 expanded the task suite from 170 to 228 tasks (34% growth) +- Long tasks (8+ hours) more than doubled from 14 to 31 in the updated framework +- Only 5 of 31 long tasks have measured human baseline times; the remainder use estimates +- Claude Opus 4.6 (Feb 2026): ~719 minutes (~12 hours) 50% success horizon, later revised to ~14.5 hours +- GPT-5.2 (Dec 2025): ~352 minutes 50% success horizon +- Claude Opus 4.5 (Nov 2025): ~320 minutes (revised up from 289 minutes) +- GPT-4 Turbo (2024): 3-10 minutes 50% success horizon +- Infrastructure migrated from in-house Vivaria to open-source Inspect framework (UK AI Security Institute) +- Different model versions use varying scaffolds: modular-public, flock-public, triframe_inspect