From fcd3c793e27e6d6dd3c35fc19af7ff980c1693dc Mon Sep 17 00:00:00 2001 From: Teleo Agents Date: Thu, 26 Mar 2026 00:31:58 +0000 Subject: [PATCH 1/2] extract: 2026-03-26-anthropic-activating-asl3-protections Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70> --- ...ernance-built-on-unreliable-foundations.md | 6 +++ ...anthropic-activating-asl3-protections.json | 37 +++++++++++++++++++ ...6-anthropic-activating-asl3-protections.md | 14 ++++++- 3 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 inbox/queue/.extraction-debug/2026-03-26-anthropic-activating-asl3-protections.json diff --git a/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md b/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md index 43482c04f..9997f1e20 100644 --- a/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md +++ b/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md @@ -129,6 +129,12 @@ METR's methodology (RCT + 143 hours of screen recordings at ~10-second resolutio METR, the primary producer of governance-relevant capability benchmarks, explicitly acknowledges their own time horizon metric (which uses algorithmic scoring) likely overstates operational autonomous capability. The 131-day doubling time for dangerous autonomy may reflect benchmark performance growth rather than real-world capability growth, as the same algorithmic scoring approach that produces 70-75% SWE-Bench success yields 0% production-ready output under holistic evaluation. 
+### Additional Evidence (extend) +*Source: [[2026-03-26-anthropic-activating-asl3-protections]] | Added: 2026-03-26* + +Anthropic explicitly acknowledged that 'dangerous capability evaluations of AI models are inherently challenging, and as models approach our thresholds of concern, it takes longer to determine their status.' This evaluation unreliability arises near thresholds, precisely where governance decisions matter most, creating a structural problem: the governance framework depends on measurements that become less reliable at the decision boundary. + + diff --git a/inbox/queue/.extraction-debug/2026-03-26-anthropic-activating-asl3-protections.json b/inbox/queue/.extraction-debug/2026-03-26-anthropic-activating-asl3-protections.json new file mode 100644 index 000000000..5fb0a1f48 --- /dev/null +++ b/inbox/queue/.extraction-debug/2026-03-26-anthropic-activating-asl3-protections.json @@ -0,0 +1,37 @@ +{ + "rejected_claims": [ + { + "filename": "precautionary-ai-governance-triggers-higher-protections-when-capability-evaluation-becomes-unreliable.md", + "issues": [ + "missing_attribution_extractor" + ] + }, + { + "filename": "self-referential-ai-safety-commitments-lack-independent-verification-creating-accountability-gap.md", + "issues": [ + "missing_attribution_extractor" + ] + } + ], + "validation_stats": { + "total": 2, + "kept": 0, + "fixed": 7, + "rejected": 2, + "fixes_applied": [ + "precautionary-ai-governance-triggers-higher-protections-when-capability-evaluation-becomes-unreliable.md:set_created:2026-03-26", + "precautionary-ai-governance-triggers-higher-protections-when-capability-evaluation-becomes-unreliable.md:stripped_wiki_link:voluntary-safety-pledges-cannot-survive-competitive-pressure", + "precautionary-ai-governance-triggers-higher-protections-when-capability-evaluation-becomes-unreliable.md:stripped_wiki_link:safe-AI-development-requires-building-alignment-mechanisms-b", 
"self-referential-ai-safety-commitments-lack-independent-verification-creating-accountability-gap.md:set_created:2026-03-26", + "self-referential-ai-safety-commitments-lack-independent-verification-creating-accountability-gap.md:stripped_wiki_link:voluntary-safety-pledges-cannot-survive-competitive-pressure", + "self-referential-ai-safety-commitments-lack-independent-verification-creating-accountability-gap.md:stripped_wiki_link:Anthropics-RSP-rollback-under-commercial-pressure-is-the-fir", + "self-referential-ai-safety-commitments-lack-independent-verification-creating-accountability-gap.md:stripped_wiki_link:AI-transparency-is-declining-not-improving-because-Stanford-" + ], + "rejections": [ + "precautionary-ai-governance-triggers-higher-protections-when-capability-evaluation-becomes-unreliable.md:missing_attribution_extractor", + "self-referential-ai-safety-commitments-lack-independent-verification-creating-accountability-gap.md:missing_attribution_extractor" + ] + }, + "model": "anthropic/claude-sonnet-4.5", + "date": "2026-03-26" +} \ No newline at end of file diff --git a/inbox/queue/2026-03-26-anthropic-activating-asl3-protections.md b/inbox/queue/2026-03-26-anthropic-activating-asl3-protections.md index e7816299d..5471d5643 100644 --- a/inbox/queue/2026-03-26-anthropic-activating-asl3-protections.md +++ b/inbox/queue/2026-03-26-anthropic-activating-asl3-protections.md @@ -7,9 +7,13 @@ date: 2025-05-01 domain: ai-alignment secondary_domains: [] format: blog -status: unprocessed +status: enrichment priority: high tags: [ASL-3, precautionary-governance, CBRN, capability-thresholds, RSP, measurement-uncertainty, safety-cases] +processed_by: theseus +processed_date: 2026-03-26 +enrichments_applied: ["pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md"] +extraction_model: "anthropic/claude-sonnet-4.5" --- ## Content @@ -49,3 +53,11 @@ ASL-3 protections were narrowly scoped: preventing 
assistance with extended, end PRIMARY CONNECTION: [[voluntary safety pledges cannot survive competitive pressure because unilateral commitments are structurally punished when competitors advance without equivalent constraints]] WHY ARCHIVED: First documented precautionary capability threshold activation — governance acting before measurement confirmation rather than after EXTRACTION HINT: Focus on the *logic* of precautionary activation (uncertainty triggers more caution) as the claim, not just the CBRN specifics — the governance principle generalizes + + +## Key Facts +- Claude Opus 4 was the first Anthropic model that could not be positively confirmed as below ASL-3 thresholds +- ASL-3 protections were narrowly scoped to prevent assistance with extended end-to-end CBRN workflows +- Claude Sonnet 3.7 showed measurable uplift in CBRN weapon acquisition tasks compared to internet resources, though below formal thresholds +- Virology Capabilities Test performance had been steadily increasing over time across Claude model generations +- Anthropic's RSP explicitly permits deployment under higher standards than confirmed necessary -- 2.45.2 From aa261a5e4b068c1133044519ed54ddae118b3912 Mon Sep 17 00:00:00 2001 From: Teleo Agents Date: Thu, 26 Mar 2026 00:32:46 +0000 Subject: [PATCH 2/2] auto-fix: strip 1 broken wiki links Pipeline auto-fixer: removed [[ ]] brackets from links that don't resolve to existing claims in the knowledge base. 
--- ...-institutional-governance-built-on-unreliable-foundations.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md b/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md index 9997f1e20..f66f29d63 100644 --- a/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md +++ b/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md @@ -125,7 +125,7 @@ METR's scaffold sensitivity finding (GPT-4o and o3 performing better under Vivar METR's methodology (RCT + 143 hours of screen recordings at ~10-second resolution) represents the most rigorous empirical design deployed for AI productivity research. The combination of randomized assignment, real tasks developers would normally work on, and granular behavioral decomposition sets a new standard for evaluation quality. This contrasts sharply with pre-deployment evaluations that lack real-world task context. ### Additional Evidence (confirm) -*Source: [[2026-03-25-metr-algorithmic-vs-holistic-evaluation-benchmark-inflation]] | Added: 2026-03-25* +*Source: 2026-03-25-metr-algorithmic-vs-holistic-evaluation-benchmark-inflation | Added: 2026-03-25* METR, the primary producer of governance-relevant capability benchmarks, explicitly acknowledges their own time horizon metric (which uses algorithmic scoring) likely overstates operational autonomous capability. 
The 131-day doubling time for dangerous autonomy may reflect benchmark performance growth rather than real-world capability growth, as the same algorithmic scoring approach that produces 70-75% SWE-Bench success yields 0% production-ready output under holistic evaluation. -- 2.45.2