From e2dc9f54f07e2354a352925ddddacbc79f5852d4 Mon Sep 17 00:00:00 2001 From: Teleo Agents Date: Thu, 19 Mar 2026 00:33:50 +0000 Subject: [PATCH] extract: 2026-03-00-metr-aisi-pre-deployment-evaluation-practice Pentagon-Agent: Epimetheus <968B2991-E2DF-4006-B962-F5B0A0CC8ACA> --- ...ernance-built-on-unreliable-foundations.md | 6 +++++ ...si-pre-deployment-evaluation-practice.json | 26 +++++++++++++++++++ ...aisi-pre-deployment-evaluation-practice.md | 13 +++++++++- 3 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 inbox/queue/.extraction-debug/2026-03-00-metr-aisi-pre-deployment-evaluation-practice.json diff --git a/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md b/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md index acc452c2..d8235603 100644 --- a/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md +++ b/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md @@ -32,6 +32,12 @@ The problem compounds the alignment challenge: even if safety research produces - Risk management remains "largely voluntary" while regulatory regimes begin formalizing requirements based on these unreliable evaluation methods - The report identifies this as a structural governance problem, not a technical limitation that engineering can solve + +### Additional Evidence (extend) +*Source: [[2026-03-00-metr-aisi-pre-deployment-evaluation-practice]] | Added: 2026-03-19* + +The voluntary-collaborative model adds a selection bias dimension to evaluation unreliability: evaluations only happen when labs consent, meaning the sample of evaluated models is systematically biased toward labs 
confident in their safety measures. Labs with weaker safety practices can avoid evaluation entirely. + --- Relevant Notes: diff --git a/inbox/queue/.extraction-debug/2026-03-00-metr-aisi-pre-deployment-evaluation-practice.json b/inbox/queue/.extraction-debug/2026-03-00-metr-aisi-pre-deployment-evaluation-practice.json new file mode 100644 index 00000000..192b18cc --- /dev/null +++ b/inbox/queue/.extraction-debug/2026-03-00-metr-aisi-pre-deployment-evaluation-practice.json @@ -0,0 +1,26 @@ +{ + "rejected_claims": [ + { + "filename": "pre-deployment-ai-evaluation-operates-on-voluntary-collaborative-model-where-labs-can-decline-without-consequence.md", + "issues": [ + "missing_attribution_extractor" + ] + } + ], + "validation_stats": { + "total": 1, + "kept": 0, + "fixed": 3, + "rejected": 1, + "fixes_applied": [ + "pre-deployment-ai-evaluation-operates-on-voluntary-collaborative-model-where-labs-can-decline-without-consequence.md:set_created:2026-03-19", + "pre-deployment-ai-evaluation-operates-on-voluntary-collaborative-model-where-labs-can-decline-without-consequence.md:stripped_wiki_link:voluntary-safety-pledges-cannot-survive-competitive-pressure", + "pre-deployment-ai-evaluation-operates-on-voluntary-collaborative-model-where-labs-can-decline-without-consequence.md:stripped_wiki_link:only-binding-regulation-with-enforcement-teeth-changes-front" + ], + "rejections": [ + "pre-deployment-ai-evaluation-operates-on-voluntary-collaborative-model-where-labs-can-decline-without-consequence.md:missing_attribution_extractor" + ] + }, + "model": "anthropic/claude-sonnet-4.5", + "date": "2026-03-19" +} \ No newline at end of file diff --git a/inbox/queue/2026-03-00-metr-aisi-pre-deployment-evaluation-practice.md b/inbox/queue/2026-03-00-metr-aisi-pre-deployment-evaluation-practice.md index 9019a480..4dea7f98 100644 --- a/inbox/queue/2026-03-00-metr-aisi-pre-deployment-evaluation-practice.md +++ b/inbox/queue/2026-03-00-metr-aisi-pre-deployment-evaluation-practice.md @@ -7,9 
+7,13 @@ date: 2026-03-01 domain: ai-alignment secondary_domains: [] format: article -status: unprocessed +status: enrichment priority: medium tags: [evaluation-infrastructure, pre-deployment, METR, AISI, voluntary-collaborative, Inspect, Claude-Opus-4-6, cyber-evaluation] +processed_by: theseus +processed_date: 2026-03-19 +enrichments_applied: ["pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md"] +extraction_model: "anthropic/claude-sonnet-4.5" --- ## Content @@ -61,3 +65,10 @@ PRIMARY CONNECTION: [[safe AI development requires building alignment mechanisms WHY ARCHIVED: Documents the actual state of pre-deployment AI evaluation practice in early 2026. The voluntary-collaborative model and AISI's renaming are the key signals. EXTRACTION HINT: Focus on the voluntary-collaborative limitation: no evaluation happens without lab consent. Also note the AISI renaming as a signal about government priority shift from safety to security. + + +## Key Facts +- METR reviewed Anthropic's Claude Opus 4.6 sabotage risk report on March 12, 2026 +- UK AISI was renamed from 'AI Safety Institute' to 'AI Security Institute' in February 2025 +- UK AISI tested 7 LLMs on custom cyber ranges as of March 16, 2026 +- METR maintains a Frontier AI Safety Policies repository covering Amazon, Anthropic, Google DeepMind, Meta, Microsoft, and OpenAI