From 4ba0a551608cdbe718dc0d88d3da78fae7e4e6b4 Mon Sep 17 00:00:00 2001 From: Teleo Agents Date: Thu, 26 Mar 2026 00:47:57 +0000 Subject: [PATCH 1/2] extract: 2026-03-26-anthropic-activating-asl3-protections Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70> --- ...ernance-built-on-unreliable-foundations.md | 6 +++ ...anthropic-activating-asl3-protections.json | 37 +++++++++++++++++++ ...6-anthropic-activating-asl3-protections.md | 14 ++++++- 3 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 inbox/queue/.extraction-debug/2026-03-26-anthropic-activating-asl3-protections.json diff --git a/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md b/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md index 66a3fdda..d2a7f5a0 100644 --- a/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md +++ b/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md @@ -139,6 +139,12 @@ METR's January 2026 evaluation of GPT-5 placed its autonomous replication and ad METR's August 2025 research update provides specific quantification of the evaluation reliability problem: algorithmic scoring overstates capability by 2-3x (38% algorithmic success vs 0% holistic success for Claude 3.7 Sonnet on software tasks), and HCAST benchmark version instability of ~50% between annual versions means even the measurement instrument itself is unstable. METR explicitly acknowledges their own evaluations 'may substantially overestimate' real-world capability. +### Additional Evidence (extend) +*Source: [[2026-03-26-anthropic-activating-asl3-protections]] | Added: 2026-03-26* + +Anthropic explicitly acknowledged that 'dangerous capability evaluations of AI models are inherently challenging, and as models approach our thresholds of concern, it takes longer to determine their status.' This is a frontier lab publicly stating that evaluation reliability degrades precisely when it matters most—near capability thresholds. The ASL-3 activation was triggered by this evaluation uncertainty rather than confirmed capability, suggesting governance frameworks are adapting to evaluation unreliability rather than solving it. + + diff --git a/inbox/queue/.extraction-debug/2026-03-26-anthropic-activating-asl3-protections.json b/inbox/queue/.extraction-debug/2026-03-26-anthropic-activating-asl3-protections.json new file mode 100644 index 00000000..20188d9b --- /dev/null +++ b/inbox/queue/.extraction-debug/2026-03-26-anthropic-activating-asl3-protections.json @@ -0,0 +1,37 @@ +{ + "rejected_claims": [ + { + "filename": "precautionary-ai-governance-triggers-protection-escalation-when-capability-evaluation-becomes-unreliable.md", + "issues": [ + "missing_attribution_extractor" + ] + }, + { + "filename": "ai-safety-commitments-lack-independent-verification-creating-self-referential-accountability-that-cannot-detect-motivated-reasoning.md", + "issues": [ + "missing_attribution_extractor" + ] + } + ], + "validation_stats": { + "total": 2, + "kept": 0, + "fixed": 7, + "rejected": 2, + "fixes_applied": [ + "precautionary-ai-governance-triggers-protection-escalation-when-capability-evaluation-becomes-unreliable.md:set_created:2026-03-26", + "precautionary-ai-governance-triggers-protection-escalation-when-capability-evaluation-becomes-unreliable.md:stripped_wiki_link:voluntary-safety-pledges-cannot-survive-competitive-pressure", + "precautionary-ai-governance-triggers-protection-escalation-when-capability-evaluation-becomes-unreliable.md:stripped_wiki_link:safe-AI-development-requires-building-alignment-mechanisms-b", + "ai-safety-commitments-lack-independent-verification-creating-self-referential-accountability-that-cannot-detect-motivated-reasoning.md:set_created:2026-03-26", + "ai-safety-commitments-lack-independent-verification-creating-self-referential-accountability-that-cannot-detect-motivated-reasoning.md:stripped_wiki_link:voluntary-safety-pledges-cannot-survive-competitive-pressure", + "ai-safety-commitments-lack-independent-verification-creating-self-referential-accountability-that-cannot-detect-motivated-reasoning.md:stripped_wiki_link:Anthropics-RSP-rollback-under-commercial-pressure-is-the-fir", + "ai-safety-commitments-lack-independent-verification-creating-self-referential-accountability-that-cannot-detect-motivated-reasoning.md:stripped_wiki_link:AI-transparency-is-declining-not-improving-because-Stanford-" + ], + "rejections": [ + "precautionary-ai-governance-triggers-protection-escalation-when-capability-evaluation-becomes-unreliable.md:missing_attribution_extractor", + "ai-safety-commitments-lack-independent-verification-creating-self-referential-accountability-that-cannot-detect-motivated-reasoning.md:missing_attribution_extractor" + ] + }, + "model": "anthropic/claude-sonnet-4.5", + "date": "2026-03-26" +} \ No newline at end of file diff --git a/inbox/queue/2026-03-26-anthropic-activating-asl3-protections.md b/inbox/queue/2026-03-26-anthropic-activating-asl3-protections.md index e7816299..080040b3 100644 --- a/inbox/queue/2026-03-26-anthropic-activating-asl3-protections.md +++ b/inbox/queue/2026-03-26-anthropic-activating-asl3-protections.md @@ -7,9 +7,13 @@ date: 2025-05-01 domain: ai-alignment secondary_domains: [] format: blog -status: unprocessed +status: enrichment priority: high tags: [ASL-3, precautionary-governance, CBRN, capability-thresholds, RSP, measurement-uncertainty, safety-cases] +processed_by: theseus +processed_date: 2026-03-26 +enrichments_applied: ["pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md"] +extraction_model: "anthropic/claude-sonnet-4.5" --- ## Content @@ -49,3 +53,11 @@ ASL-3 protections were narrowly scoped: preventing assistance with extended, end PRIMARY CONNECTION: [[voluntary safety pledges cannot survive competitive pressure because unilateral commitments are structurally punished when competitors advance without equivalent constraints]] WHY ARCHIVED: First documented precautionary capability threshold activation — governance acting before measurement confirmation rather than after EXTRACTION HINT: Focus on the *logic* of precautionary activation (uncertainty triggers more caution) as the claim, not just the CBRN specifics — the governance principle generalizes + + +## Key Facts +- Claude Opus 4 was the first Claude model that could not be positively confirmed as below ASL-3 thresholds +- ASL-3 protections were narrowly scoped to prevent assistance with extended end-to-end CBRN workflows +- Claude Sonnet 3.7 showed measurable participant uplift on CBRN weapon acquisition tasks compared to standard internet resources +- Virology Capabilities Test performance had been steadily increasing over time across Claude model generations +- Anthropic's RSP explicitly permits deployment under a higher standard than confirmed necessary -- 2.45.2 From 14e1670e134af646f5ee93f6c35f88d988250ad1 Mon Sep 17 00:00:00 2001 From: Teleo Agents Date: Thu, 26 Mar 2026 00:48:54 +0000 Subject: [PATCH 2/2] auto-fix: strip 3 broken wiki links Pipeline auto-fixer: removed [[ ]] brackets from links that don't resolve to existing claims in the knowledge base. --- ...titutional-governance-built-on-unreliable-foundations.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md b/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md index d2a7f5a0..77a6cd94 100644 --- a/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md +++ b/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md @@ -125,17 +125,17 @@ METR's scaffold sensitivity finding (GPT-4o and o3 performing better under Vivar METR's methodology (RCT + 143 hours of screen recordings at ~10-second resolution) represents the most rigorous empirical design deployed for AI productivity research. The combination of randomized assignment, real tasks developers would normally work on, and granular behavioral decomposition sets a new standard for evaluation quality. This contrasts sharply with pre-deployment evaluations that lack real-world task context. ### Additional Evidence (confirm) -*Source: [[2026-03-25-metr-algorithmic-vs-holistic-evaluation-benchmark-inflation]] | Added: 2026-03-25* +*Source: 2026-03-25-metr-algorithmic-vs-holistic-evaluation-benchmark-inflation | Added: 2026-03-25* METR, the primary producer of governance-relevant capability benchmarks, explicitly acknowledges their own time horizon metric (which uses algorithmic scoring) likely overstates operational autonomous capability. The 131-day doubling time for dangerous autonomy may reflect benchmark performance growth rather than real-world capability growth, as the same algorithmic scoring approach that produces 70-75% SWE-Bench success yields 0% production-ready output under holistic evaluation. ### Additional Evidence (confirm) -*Source: [[2026-03-26-aisle-openssl-zero-days]] | Added: 2026-03-26* +*Source: 2026-03-26-aisle-openssl-zero-days | Added: 2026-03-26* METR's January 2026 evaluation of GPT-5 placed its autonomous replication and adaptation capability at 2h17m (50% time horizon), far below catastrophic risk thresholds. In the same month, AISLE (an AI system) autonomously discovered 12 OpenSSL CVEs including a 30-year-old bug through fully autonomous operation. This is direct evidence that formal pre-deployment evaluations are not capturing operational dangerous autonomy that is already deployed at commercial scale. ### Additional Evidence (extend) -*Source: [[2026-03-26-metr-algorithmic-vs-holistic-evaluation]] | Added: 2026-03-26* +*Source: 2026-03-26-metr-algorithmic-vs-holistic-evaluation | Added: 2026-03-26* METR's August 2025 research update provides specific quantification of the evaluation reliability problem: algorithmic scoring overstates capability by 2-3x (38% algorithmic success vs 0% holistic success for Claude 3.7 Sonnet on software tasks), and HCAST benchmark version instability of ~50% between annual versions means even the measurement instrument itself is unstable. METR explicitly acknowledges their own evaluations 'may substantially overestimate' real-world capability. -- 2.45.2