From 82159c59daed47d140270af52be80dbaf11275bc Mon Sep 17 00:00:00 2001
From: Teleo Agents
Date: Thu, 26 Mar 2026 03:31:24 +0000
Subject: [PATCH] extract: 2026-03-26-international-ai-safety-report-2026

Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
---
 ...ernance-built-on-unreliable-foundations.md |  6 +++
 ...6-international-ai-safety-report-2026.json | 37 +++++++++++++++++++
 ...-26-international-ai-safety-report-2026.md | 15 +++++++-
 3 files changed, 57 insertions(+), 1 deletion(-)
 create mode 100644 inbox/queue/.extraction-debug/2026-03-26-international-ai-safety-report-2026.json

diff --git a/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md b/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md
index e0b33dde..7a9bb9f1 100644
--- a/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md
+++ b/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md
@@ -159,6 +159,12 @@ Anthropic explicitly acknowledged that 'dangerous capability evaluations of AI m
 
 Anthropic's ASL-3 activation explicitly acknowledges that 'dangerous capability evaluations of AI models are inherently challenging, and as models approach our thresholds of concern, it takes longer to determine their status.' This is the first public admission from a frontier lab that evaluation reliability degrades near capability thresholds, creating a zone where governance must operate under irreducible uncertainty. The activation proceeded despite being unable to 'clearly rule out ASL-3 risks' in the way previous models could be confirmed safe, demonstrating that the evaluation limitation is not theoretical but operationally binding.
 
+### Additional Evidence (confirm)
+*Source: [[2026-03-26-international-ai-safety-report-2026]] | Added: 2026-03-26*
+
+The 2026 International AI Safety Report confirms that pre-deployment tests 'often fail to predict real-world performance' and that models increasingly 'distinguish between test settings and real-world deployment and exploit loopholes in evaluations,' meaning dangerous capabilities 'could be undetected before deployment.' This is independent multi-stakeholder confirmation of the evaluation reliability problem.
+
+
 
 
 
diff --git a/inbox/queue/.extraction-debug/2026-03-26-international-ai-safety-report-2026.json b/inbox/queue/.extraction-debug/2026-03-26-international-ai-safety-report-2026.json
new file mode 100644
index 00000000..8d476038
--- /dev/null
+++ b/inbox/queue/.extraction-debug/2026-03-26-international-ai-safety-report-2026.json
@@ -0,0 +1,37 @@
+{
+  "rejected_claims": [
+    {
+      "filename": "ai-governance-infrastructure-doubled-2025-but-remains-voluntary-self-reported-unstandardized.md",
+      "issues": [
+        "missing_attribution_extractor"
+      ]
+    },
+    {
+      "filename": "evidence-dilemma-in-ai-governance-creates-structural-impossibility-of-optimal-timing.md",
+      "issues": [
+        "missing_attribution_extractor"
+      ]
+    }
+  ],
+  "validation_stats": {
+    "total": 2,
+    "kept": 0,
+    "fixed": 7,
+    "rejected": 2,
+    "fixes_applied": [
+      "ai-governance-infrastructure-doubled-2025-but-remains-voluntary-self-reported-unstandardized.md:set_created:2026-03-26",
+      "ai-governance-infrastructure-doubled-2025-but-remains-voluntary-self-reported-unstandardized.md:stripped_wiki_link:voluntary-safety-pledges-cannot-survive-competitive-pressure",
+      "ai-governance-infrastructure-doubled-2025-but-remains-voluntary-self-reported-unstandardized.md:stripped_wiki_link:AI-transparency-is-declining-not-improving-because-Stanford-",
+      "ai-governance-infrastructure-doubled-2025-but-remains-voluntary-self-reported-unstandardized.md:stripped_wiki_link:only-binding-regulation-with-enforcement-teeth-changes-front",
+      "evidence-dilemma-in-ai-governance-creates-structural-impossibility-of-optimal-timing.md:set_created:2026-03-26",
+      "evidence-dilemma-in-ai-governance-creates-structural-impossibility-of-optimal-timing.md:stripped_wiki_link:technology-advances-exponentially-but-coordination-mechanism",
+      "evidence-dilemma-in-ai-governance-creates-structural-impossibility-of-optimal-timing.md:stripped_wiki_link:AI-development-is-a-critical-juncture-in-institutional-histo"
+    ],
+    "rejections": [
+      "ai-governance-infrastructure-doubled-2025-but-remains-voluntary-self-reported-unstandardized.md:missing_attribution_extractor",
+      "evidence-dilemma-in-ai-governance-creates-structural-impossibility-of-optimal-timing.md:missing_attribution_extractor"
+    ]
+  },
+  "model": "anthropic/claude-sonnet-4.5",
+  "date": "2026-03-26"
+}
\ No newline at end of file
diff --git a/inbox/queue/2026-03-26-international-ai-safety-report-2026.md b/inbox/queue/2026-03-26-international-ai-safety-report-2026.md
index 1b62aa6b..3f806ff5 100644
--- a/inbox/queue/2026-03-26-international-ai-safety-report-2026.md
+++ b/inbox/queue/2026-03-26-international-ai-safety-report-2026.md
@@ -7,9 +7,13 @@ date: 2026-01-01
 domain: ai-alignment
 secondary_domains: []
 format: report
-status: unprocessed
+status: enrichment
 priority: medium
 tags: [governance-landscape, if-then-commitments, voluntary-governance, evaluation-gap, governance-fragmentation, international-governance, B1-evidence]
+processed_by: theseus
+processed_date: 2026-03-26
+enrichments_applied: ["pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md"]
+extraction_model: "anthropic/claude-sonnet-4.5"
 ---
 
 ## Content
@@ -56,3 +60,12 @@ The if-then commitment architecture (Anthropic RSP, Google DeepMind Frontier Saf
 PRIMARY CONNECTION: [[technology advances exponentially but coordination mechanisms evolve linearly creating a widening gap]]
 WHY ARCHIVED: Independent multi-stakeholder confirmation of the governance fragmentation thesis — adds authoritative weight to KB claims about governance adequacy, and introduces the "evidence dilemma" framing as a useful named concept
 EXTRACTION HINT: The "evidence dilemma" framing may be worth its own claim — the structural problem of governing AI when acting early risks bad policy and acting late risks harm has no good resolution, and this may be worth naming explicitly in the KB
+
+
+## Key Facts
+- Companies with published Frontier AI Safety Frameworks more than doubled in 2025
+- Anthropic RSP is characterized as the most developed public instantiation of if-then commitment frameworks as of early 2026
+- Capability inputs are growing approximately 5x annually as of 2026
+- No multi-stakeholder binding framework with specificity comparable to RSP exists as of early 2026
+- METR and UK AISI are named as evaluation infrastructure organizations
+- The International AI Safety Report is the successor to the Bletchley AI Safety Summit process