diff --git a/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md b/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md index 9ea6a7ce..d8fdd327 100644 --- a/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md +++ b/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md @@ -99,6 +99,12 @@ METR recommended 'deeper investigations of evaluation awareness and obfuscated m IAISR 2026 states that 'pre-deployment testing increasingly fails to predict real-world model behavior,' providing authoritative international consensus confirmation that the evaluation-deployment gap is widening. The report explicitly connects this to dangerous capabilities going undetected, confirming the governance implications. +### Additional Evidence (confirm) +*Source: [[2026-02-24-anthropic-rsp-v3-voluntary-safety-collapse]] | Added: 2026-03-23* + +Anthropic's explicit admission that 'the science of model evaluation isn't well-developed enough to provide definitive threshold assessments' is direct confirmation from a frontier lab that evaluation tools are insufficient for governance. This aligns with METR's March 2026 modeling assumptions note, suggesting field-wide consensus that current evaluation science cannot support the governance structures built on top of it. 
+ + diff --git a/inbox/queue/.extraction-debug/2026-02-24-anthropic-rsp-v3-voluntary-safety-collapse.json b/inbox/queue/.extraction-debug/2026-02-24-anthropic-rsp-v3-voluntary-safety-collapse.json new file mode 100644 index 00000000..c5d09407 --- /dev/null +++ b/inbox/queue/.extraction-debug/2026-02-24-anthropic-rsp-v3-voluntary-safety-collapse.json @@ -0,0 +1,35 @@ +{ + "rejected_claims": [ + { + "filename": "evaluation-science-insufficiency-makes-capability-thresholds-unenforceable-before-competitive-pressure-matters.md", + "issues": [ + "missing_attribution_extractor" + ] + }, + { + "filename": "public-goals-with-open-grading-replace-binding-commitments-when-enforcement-mechanisms-fail.md", + "issues": [ + "missing_attribution_extractor" + ] + } + ], + "validation_stats": { + "total": 2, + "kept": 0, + "fixed": 5, + "rejected": 2, + "fixes_applied": [ + "evaluation-science-insufficiency-makes-capability-thresholds-unenforceable-before-competitive-pressure-matters.md:set_created:2026-03-23", + "evaluation-science-insufficiency-makes-capability-thresholds-unenforceable-before-competitive-pressure-matters.md:stripped_wiki_link:voluntary-safety-pledges-cannot-survive-competitive-pressure", + "public-goals-with-open-grading-replace-binding-commitments-when-enforcement-mechanisms-fail.md:set_created:2026-03-23", + "public-goals-with-open-grading-replace-binding-commitments-when-enforcement-mechanisms-fail.md:stripped_wiki_link:voluntary-safety-pledges-cannot-survive-competitive-pressure", + "public-goals-with-open-grading-replace-binding-commitments-when-enforcement-mechanisms-fail.md:stripped_wiki_link:only-binding-regulation-with-enforcement-teeth-changes-front" + ], + "rejections": [ + "evaluation-science-insufficiency-makes-capability-thresholds-unenforceable-before-competitive-pressure-matters.md:missing_attribution_extractor", + "public-goals-with-open-grading-replace-binding-commitments-when-enforcement-mechanisms-fail.md:missing_attribution_extractor" + ] + }, 
+ "model": "anthropic/claude-sonnet-4.5", + "date": "2026-03-23" +} \ No newline at end of file diff --git a/inbox/queue/2026-02-24-anthropic-rsp-v3-voluntary-safety-collapse.md b/inbox/queue/2026-02-24-anthropic-rsp-v3-voluntary-safety-collapse.md index 1b4b624f..f140bed1 100644 --- a/inbox/queue/2026-02-24-anthropic-rsp-v3-voluntary-safety-collapse.md +++ b/inbox/queue/2026-02-24-anthropic-rsp-v3-voluntary-safety-collapse.md @@ -7,9 +7,13 @@ date: 2026-02-24 domain: ai-alignment secondary_domains: [] format: policy-document -status: unprocessed +status: enrichment priority: high tags: [anthropic, RSP, voluntary-safety, governance, evaluation-insufficiency, race-dynamics, B1-disconfirmation] +processed_by: theseus +processed_date: 2026-03-23 +enrichments_applied: ["pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md"] +extraction_model: "anthropic/claude-sonnet-4.5" --- ## Content @@ -59,3 +63,14 @@ Hard commitments replaced by publicly-graded non-binding "public goals" (Frontie PRIMARY CONNECTION: [[voluntary safety pledges cannot survive competitive pressure because unilateral commitments are structurally punished when competitors advance without equivalent constraints]] WHY ARCHIVED: Direct empirical confirmation of two separate mechanisms causing voluntary safety commitments to fail — competitive pressure AND evaluation science insufficiency EXTRACTION HINT: The evaluation science admission may be more important than the competitive pressure angle — it suggests hard commitments cannot be defined, not just that they won't be kept + + +## Key Facts +- Anthropic published Responsible Scaling Policy v3.0 on February 24, 2026 +- RSP v3.0 removed the hard capability-threshold pause trigger that was the centerpiece of v1.0 and v2.0 +- Anthropic stated 'the science of model evaluation isn't well-developed enough to provide definitive threshold assessments' +- RSP v3.0 introduced a 
'dual-track' approach: unilateral commitments and industry recommendations +- The new policy includes Frontier Safety Roadmaps and risk reports every 3-6 months with external expert reviewer access +- CNN, Semafor, and Winbuzzer characterized the change as 'Anthropic drops hard safety limits' and 'scales back AI safety pledge' +- Semafor headline: 'Anthropic eases AI safety restrictions to avoid slowing development' +- The policy change occurred during Anthropic's conflict with the Pentagon over a 'supply chain risk' designation