From 674265542011d77268c7fcc8d0c94b957fcb0d72 Mon Sep 17 00:00:00 2001 From: Teleo Agents Date: Thu, 19 Mar 2026 16:02:48 +0000 Subject: [PATCH 1/2] extract: 2026-02-23-shapira-agents-of-chaos Pentagon-Agent: Epimetheus <968B2991-E2DF-4006-B962-F5B0A0CC8ACA> --- ...l-governance-built-on-unreliable-foundations.md | 6 ++++++ .../2026-02-23-shapira-agents-of-chaos.md | 14 ++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md b/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md index 31092bfb..9cc07628 100644 --- a/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md +++ b/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md @@ -50,6 +50,12 @@ Agents of Chaos study provides concrete empirical evidence: 11 documented case s METR and UK AISI evaluations as of March 2026 focus primarily on sabotage risk and cyber capabilities (METR's Claude Opus 4.6 sabotage assessment, AISI's cyber range testing of 7 LLMs). This narrow scope may miss alignment-relevant risks that don't manifest as sabotage or cyber threats. The evaluation infrastructure is optimizing for measurable near-term risks rather than harder-to-operationalize catastrophic scenarios. + +### Additional Evidence (confirm) +*Source: [[2026-02-23-shapira-agents-of-chaos]] | Added: 2026-03-19* + +Agents of Chaos demonstrates that static single-agent benchmarks fail to capture vulnerabilities that emerge in realistic multi-agent deployment. 
The study's central argument is that pre-deployment evaluations are insufficient because they cannot test for cross-agent propagation, identity spoofing, and unauthorized compliance patterns that only manifest in multi-party environments with persistent state.
+
 
 ---
 
 Relevant Notes:
diff --git a/inbox/archive/ai-alignment/2026-02-23-shapira-agents-of-chaos.md b/inbox/archive/ai-alignment/2026-02-23-shapira-agents-of-chaos.md
index c00434a1..e74fff9f 100644
--- a/inbox/archive/ai-alignment/2026-02-23-shapira-agents-of-chaos.md
+++ b/inbox/archive/ai-alignment/2026-02-23-shapira-agents-of-chaos.md
@@ -38,3 +38,4 @@ Central argument: static single-agent benchmarks are insufficient.
 - Study conducted under both benign and adversarial conditions
 - Paper authored by 36+ researchers including Natalie Shapira, Chris Wendler, Avery Yen, Gabriele Sarti
 - Study funded/supported by ARIA Research Scaling Trust programme
+- Paper published 2026-02-23 on arXiv (2602.20021)

From 61b9a8b16e504ed83a81036cfca846d8fc9974ef Mon Sep 17 00:00:00 2001
From: Teleo Agents
Date: Thu, 19 Mar 2026 16:08:01 +0000
Subject: [PATCH 2/2] auto-fix: strip 2 broken wiki links

Pipeline auto-fixer: removed [[ ]] brackets from links that don't
resolve to existing claims in the knowledge base.
--- ...nstitutional-governance-built-on-unreliable-foundations.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md b/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md index 9cc07628..13123e43 100644 --- a/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md +++ b/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md @@ -34,7 +34,7 @@ The problem compounds the alignment challenge: even if safety research produces ### Additional Evidence (extend) -*Source: [[2026-03-00-metr-aisi-pre-deployment-evaluation-practice]] | Added: 2026-03-19* +*Source: 2026-03-00-metr-aisi-pre-deployment-evaluation-practice | Added: 2026-03-19* The voluntary-collaborative model adds a selection bias dimension to evaluation unreliability: evaluations only happen when labs consent, meaning the sample of evaluated models is systematically biased toward labs confident in their safety measures. Labs with weaker safety practices can avoid evaluation entirely. @@ -46,7 +46,7 @@ Agents of Chaos study provides concrete empirical evidence: 11 documented case s ### Additional Evidence (extend) -*Source: [[2026-03-00-metr-aisi-pre-deployment-evaluation-practice]] | Added: 2026-03-19* +*Source: 2026-03-00-metr-aisi-pre-deployment-evaluation-practice | Added: 2026-03-19* METR and UK AISI evaluations as of March 2026 focus primarily on sabotage risk and cyber capabilities (METR's Claude Opus 4.6 sabotage assessment, AISI's cyber range testing of 7 LLMs). 
This narrow scope may miss alignment-relevant risks that don't manifest as sabotage or cyber threats. The evaluation infrastructure is optimizing for measurable near-term risks rather than harder-to-operationalize catastrophic scenarios.