diff --git a/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md b/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md index b88c23b0..269d6dce 100644 --- a/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md +++ b/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md @@ -44,6 +44,12 @@ The voluntary-collaborative model adds a selection bias dimension to evaluation Agents of Chaos study provides concrete empirical evidence: 11 documented case studies of security vulnerabilities (unauthorized compliance, identity spoofing, cross-agent propagation, destructive actions) that emerged only in realistic multi-agent deployment with persistent memory and system access—none of which would be detected by static single-agent benchmarks. The study explicitly argues that current evaluation paradigms are insufficient for realistic deployment conditions. + +### Additional Evidence (extend) +*Source: [[2026-03-00-metr-aisi-pre-deployment-evaluation-practice]] | Added: 2026-03-19* + +METR and UK AISI evaluations as of March 2026 focus primarily on sabotage risk and cyber capabilities (METR's Claude Opus 4.6 sabotage assessment, AISI's cyber range testing of 7 LLMs). This narrow scope may miss alignment-relevant risks that don't manifest as sabotage or cyber threats. The evaluation infrastructure is optimizing for measurable near-term risks rather than harder-to-operationalize catastrophic scenarios. + --- Relevant Notes: diff --git a/inbox/queue/.extraction-debug/2026-03-00-metr-aisi-pre-deployment-evaluation-practice.json b/inbox/queue/.extraction-debug/2026-03-00-metr-aisi-pre-deployment-evaluation-practice.json index 192b18cc..0a9198ef 100644 --- a/inbox/queue/.extraction-debug/2026-03-00-metr-aisi-pre-deployment-evaluation-practice.json +++ b/inbox/queue/.extraction-debug/2026-03-00-metr-aisi-pre-deployment-evaluation-practice.json @@ -1,24 +1,34 @@ { "rejected_claims": [ { - "filename": "pre-deployment-ai-evaluation-operates-on-voluntary-collaborative-model-where-labs-can-decline-without-consequence.md", + "filename": "pre-deployment-AI-evaluation-operates-on-voluntary-collaborative-model-where-labs-can-decline-without-consequence.md", + "issues": [ + "missing_attribution_extractor" + ] + }, + { + "filename": "UK-AISI-renaming-to-Security-Institute-signals-government-priority-shift-from-existential-safety-to-cybersecurity-threats.md", "issues": [ "missing_attribution_extractor" ] } ], "validation_stats": { - "total": 1, + "total": 2, "kept": 0, - "fixed": 3, - "rejected": 1, + "fixed": 6, + "rejected": 2, "fixes_applied": [ - "pre-deployment-ai-evaluation-operates-on-voluntary-collaborative-model-where-labs-can-decline-without-consequence.md:set_created:2026-03-19", - "pre-deployment-ai-evaluation-operates-on-voluntary-collaborative-model-where-labs-can-decline-without-consequence.md:stripped_wiki_link:voluntary-safety-pledges-cannot-survive-competitive-pressure", - "pre-deployment-ai-evaluation-operates-on-voluntary-collaborative-model-where-labs-can-decline-without-consequence.md:stripped_wiki_link:only-binding-regulation-with-enforcement-teeth-changes-front" + "pre-deployment-AI-evaluation-operates-on-voluntary-collaborative-model-where-labs-can-decline-without-consequence.md:set_created:2026-03-19", + "pre-deployment-AI-evaluation-operates-on-voluntary-collaborative-model-where-labs-can-decline-without-consequence.md:stripped_wiki_link:voluntary-safety-pledges-cannot-survive-competitive-pressure", + "pre-deployment-AI-evaluation-operates-on-voluntary-collaborative-model-where-labs-can-decline-without-consequence.md:stripped_wiki_link:only-binding-regulation-with-enforcement-teeth-changes-front", + "UK-AISI-renaming-to-Security-Institute-signals-government-priority-shift-from-existential-safety-to-cybersecurity-threats.md:set_created:2026-03-19", + "UK-AISI-renaming-to-Security-Institute-signals-government-priority-shift-from-existential-safety-to-cybersecurity-threats.md:stripped_wiki_link:government-designation-of-safety-conscious-AI-labs-as-supply", + "UK-AISI-renaming-to-Security-Institute-signals-government-priority-shift-from-existential-safety-to-cybersecurity-threats.md:stripped_wiki_link:compute-export-controls-are-the-most-impactful-AI-governance" ], "rejections": [ - "pre-deployment-ai-evaluation-operates-on-voluntary-collaborative-model-where-labs-can-decline-without-consequence.md:missing_attribution_extractor" + "pre-deployment-AI-evaluation-operates-on-voluntary-collaborative-model-where-labs-can-decline-without-consequence.md:missing_attribution_extractor", + "UK-AISI-renaming-to-Security-Institute-signals-government-priority-shift-from-existential-safety-to-cybersecurity-threats.md:missing_attribution_extractor" ] }, "model": "anthropic/claude-sonnet-4.5", diff --git a/inbox/queue/2026-03-00-metr-aisi-pre-deployment-evaluation-practice.md b/inbox/queue/2026-03-00-metr-aisi-pre-deployment-evaluation-practice.md index 38ba72fc..3fb59be4 100644 --- a/inbox/queue/2026-03-00-metr-aisi-pre-deployment-evaluation-practice.md +++ b/inbox/queue/2026-03-00-metr-aisi-pre-deployment-evaluation-practice.md @@ -7,13 +7,17 @@ date: 2026-03-01 domain: ai-alignment secondary_domains: [] format: article -status: unprocessed +status: enrichment priority: medium tags: [evaluation-infrastructure, pre-deployment, METR, AISI, voluntary-collaborative, Inspect, Claude-Opus-4-6, cyber-evaluation] processed_by: theseus processed_date: 2026-03-19 enrichments_applied: ["pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md"] extraction_model: "anthropic/claude-sonnet-4.5" +processed_by: theseus +processed_date: 2026-03-19 +enrichments_applied: ["pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md"] +extraction_model: "anthropic/claude-sonnet-4.5" --- ## Content @@ -72,3 +76,14 @@ EXTRACTION HINT: Focus on the voluntary-collaborative limitation: no evaluation - UK AISI was renamed from 'AI Safety Institute' to 'AI Security Institute' in 2026 - UK AISI tested 7 LLMs on custom cyber ranges as of March 16, 2026 - METR maintains a Frontier AI Safety Policies repository covering Amazon, Anthropic, Google DeepMind, Meta, Microsoft, and OpenAI + + +## Key Facts +- METR reviewed Anthropic's Claude Opus 4.6 sabotage risk report on March 12, 2026 +- UK AISI tested 7 LLMs on custom cyber ranges as of March 16, 2026 +- UK AISI was renamed from 'AI Safety Institute' to 'AI Security Institute' in 2026 +- METR maintains a Frontier AI Safety Policies repository covering Amazon, Anthropic, Google DeepMind, Meta, Microsoft, and OpenAI +- UK AISI released the Inspect evaluation framework in April 2024 +- UK AISI released Inspect Scout transcript analysis tool on February 25, 2026 +- UK AISI released ControlArena library for AI control experiments on October 22, 2025 +- UK AISI conducted international joint testing exercise on agentic systems in July 2025