From ad990c43e4536e719b6faec17304e1873885a060 Mon Sep 17 00:00:00 2001 From: Teleo Agents Date: Fri, 20 Mar 2026 16:27:53 +0000 Subject: [PATCH] extract: 2026-03-20-bench2cop-benchmarks-insufficient-compliance Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70> --- ...moved safety language from mission statements.md | 6 ++++++ ...al-governance-built-on-unreliable-foundations.md | 6 ++++++ ...ench2cop-benchmarks-insufficient-compliance.json | 10 ++++++---- ...-bench2cop-benchmarks-insufficient-compliance.md | 13 +++++++++++++ 4 files changed, 31 insertions(+), 4 deletions(-) diff --git a/domains/ai-alignment/AI transparency is declining not improving because Stanford FMTI scores dropped 17 points in one year while frontier labs dissolved safety teams and removed safety language from mission statements.md b/domains/ai-alignment/AI transparency is declining not improving because Stanford FMTI scores dropped 17 points in one year while frontier labs dissolved safety teams and removed safety language from mission statements.md index 0ca0eab7..a580e164 100644 --- a/domains/ai-alignment/AI transparency is declining not improving because Stanford FMTI scores dropped 17 points in one year while frontier labs dissolved safety teams and removed safety language from mission statements.md +++ b/domains/ai-alignment/AI transparency is declining not improving because Stanford FMTI scores dropped 17 points in one year while frontier labs dissolved safety teams and removed safety language from mission statements.md @@ -53,6 +53,12 @@ Stanford FMTI 2024→2025 data: mean transparency score declined 17 points. Meta The Bench-2-CoP analysis reveals that even when labs do conduct evaluations, the benchmark infrastructure itself is architecturally incapable of measuring loss-of-control risks. 
This compounds the transparency decline: labs are not just hiding information, they're using evaluation tools that cannot detect the most critical failure modes even if applied honestly. + +### Additional Evidence (extend) +*Source: [[2026-03-20-bench2cop-benchmarks-insufficient-compliance]] | Added: 2026-03-20* + +The Bench-2-CoP paper reveals that even when labs do provide benchmark results for transparency, those benchmarks structurally cannot measure alignment-critical capabilities. The 195,000-question corpus analyzed had zero coverage of oversight evasion, self-replication, or autonomous AI development. This means transparency decline isn't just about labs sharing less — it's that what they're sharing (benchmark performance) doesn't measure what matters for safety. + --- Relevant Notes: diff --git a/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md b/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md index 784ae5dd..f879f9da 100644 --- a/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md +++ b/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md @@ -62,6 +62,12 @@ Agents of Chaos demonstrates that static single-agent benchmarks fail to capture Prandi et al. (2025) found that 195,000 benchmark questions provided zero coverage of oversight evasion, self-replication, and autonomous AI development capabilities. This extends the evaluation unreliability thesis by showing the gap is not just predictive validity but complete absence of measurement for alignment-critical capabilities. 
+ +### Additional Evidence (extend) +*Source: [[2026-03-20-bench2cop-benchmarks-insufficient-compliance]] | Added: 2026-03-20* + +Prandi et al. provide the specific mechanism for why pre-deployment evaluations fail: current benchmark suites concentrate 92.8% of regulatory-relevant coverage on behavioral propensities (hallucination and reliability) while providing zero coverage of the three capability classes (oversight evasion, self-replication, autonomous AI development) that matter most for loss-of-control scenarios. This isn't just that evaluations don't predict real-world risk — it's that the evaluation tools measure dimensions orthogonal to the risks regulators care about. + --- Relevant Notes: diff --git a/inbox/queue/.extraction-debug/2026-03-20-bench2cop-benchmarks-insufficient-compliance.json b/inbox/queue/.extraction-debug/2026-03-20-bench2cop-benchmarks-insufficient-compliance.json index 1f1878dc..4f8a5632 100644 --- a/inbox/queue/.extraction-debug/2026-03-20-bench2cop-benchmarks-insufficient-compliance.json +++ b/inbox/queue/.extraction-debug/2026-03-20-bench2cop-benchmarks-insufficient-compliance.json @@ -1,7 +1,7 @@ { "rejected_claims": [ { - "filename": "ai-benchmarks-provide-zero-coverage-of-loss-of-control-capabilities-making-them-structurally-insufficient-for-regulatory-compliance.md", + "filename": "current-AI-benchmarks-provide-zero-coverage-of-loss-of-control-capabilities-making-them-structurally-insufficient-for-EU-AI-Act-compliance.md", "issues": [ "missing_attribution_extractor" ] @@ -10,13 +10,15 @@ "validation_stats": { "total": 1, "kept": 0, - "fixed": 1, + "fixed": 3, "rejected": 1, "fixes_applied": [ - "ai-benchmarks-provide-zero-coverage-of-loss-of-control-capabilities-making-them-structurally-insufficient-for-regulatory-compliance.md:set_created:2026-03-20" + "current-AI-benchmarks-provide-zero-coverage-of-loss-of-control-capabilities-making-them-structurally-insufficient-for-EU-AI-Act-compliance.md:set_created:2026-03-20", + 
"current-AI-benchmarks-provide-zero-coverage-of-loss-of-control-capabilities-making-them-structurally-insufficient-for-EU-AI-Act-compliance.md:stripped_wiki_link:pre-deployment-AI-evaluations-do-not-predict-real-world-risk", + "current-AI-benchmarks-provide-zero-coverage-of-loss-of-control-capabilities-making-them-structurally-insufficient-for-EU-AI-Act-compliance.md:stripped_wiki_link:AI transparency is declining not improving because Stanford " ], "rejections": [ - "ai-benchmarks-provide-zero-coverage-of-loss-of-control-capabilities-making-them-structurally-insufficient-for-regulatory-compliance.md:missing_attribution_extractor" + "current-AI-benchmarks-provide-zero-coverage-of-loss-of-control-capabilities-making-them-structurally-insufficient-for-EU-AI-Act-compliance.md:missing_attribution_extractor" ] }, "model": "anthropic/claude-sonnet-4.5", diff --git a/inbox/queue/2026-03-20-bench2cop-benchmarks-insufficient-compliance.md b/inbox/queue/2026-03-20-bench2cop-benchmarks-insufficient-compliance.md index eb66808b..22288b10 100644 --- a/inbox/queue/2026-03-20-bench2cop-benchmarks-insufficient-compliance.md +++ b/inbox/queue/2026-03-20-bench2cop-benchmarks-insufficient-compliance.md @@ -64,3 +64,4 @@ EXTRACTION HINT: Focus on the zero-coverage finding for loss-of-control capabili - 61.6% of regulatory-relevant benchmark coverage addresses 'tendency to hallucinate' - 31.2% of regulatory-relevant benchmark coverage addresses 'lack of performance reliability' - Zero benchmark questions in the analyzed corpus covered oversight evasion, self-replication, or autonomous AI development capabilities +- Paper published August 2025 as retrospective assessment of evaluation infrastructure adequacy