diff --git a/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md b/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md index d8fdd3275..fd186c73b 100644 --- a/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md +++ b/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md @@ -104,6 +104,12 @@ IAISR 2026 states that 'pre-deployment testing increasingly fails to predict rea Anthropic's explicit admission that 'the science of model evaluation isn't well-developed enough to provide definitive threshold assessments' is direct confirmation from a frontier lab that evaluation tools are insufficient for governance. This aligns with METR's March 2026 modeling assumptions note, suggesting field-wide consensus that current evaluation science cannot support the governance structures built on top of it. +### Additional Evidence (confirm) +*Source: [[2026-02-24-anthropic-rsp-v3-0-frontier-safety-roadmap]] | Added: 2026-03-24* + +Anthropic's stated rationale for extending evaluation intervals from 3 to 6 months explicitly acknowledges that 'the science of model evaluation isn't well-developed enough' and that rushed evaluations produce lower-quality results. This is a direct admission from a frontier lab that current evaluation methodologies are insufficiently mature to support the governance structures built on them. The 'zone of ambiguity' where capabilities approached but didn't definitively pass thresholds in v2.0 demonstrates that evaluation uncertainty creates governance paralysis. 
+ + diff --git a/inbox/queue/.extraction-debug/2026-02-24-anthropic-rsp-v3-0-frontier-safety-roadmap.json b/inbox/queue/.extraction-debug/2026-02-24-anthropic-rsp-v3-0-frontier-safety-roadmap.json new file mode 100644 index 000000000..e240620c5 --- /dev/null +++ b/inbox/queue/.extraction-debug/2026-02-24-anthropic-rsp-v3-0-frontier-safety-roadmap.json @@ -0,0 +1,43 @@ +{ + "rejected_claims": [ + { + "filename": "rsp-frontier-safety-roadmaps-create-public-accountability-through-concrete-milestones-not-binding-thresholds.md", + "issues": [ + "missing_attribution_extractor" + ] + }, + { + "filename": "extended-evaluation-intervals-trade-speed-for-quality-when-evaluation-science-is-immature.md", + "issues": [ + "missing_attribution_extractor" + ] + }, + { + "filename": "redacted-risk-reports-undermine-quantified-risk-commitments-by-preventing-external-verification.md", + "issues": [ + "missing_attribution_extractor" + ] + } + ], + "validation_stats": { + "total": 3, + "kept": 0, + "fixed": 6, + "rejected": 3, + "fixes_applied": [ + "rsp-frontier-safety-roadmaps-create-public-accountability-through-concrete-milestones-not-binding-thresholds.md:set_created:2026-03-24", + "rsp-frontier-safety-roadmaps-create-public-accountability-through-concrete-milestones-not-binding-thresholds.md:stripped_wiki_link:voluntary-safety-pledges-cannot-survive-competitive-pressure", + "rsp-frontier-safety-roadmaps-create-public-accountability-through-concrete-milestones-not-binding-thresholds.md:stripped_wiki_link:Anthropics-RSP-rollback-under-commercial-pressure-is-the-fir", + "extended-evaluation-intervals-trade-speed-for-quality-when-evaluation-science-is-immature.md:set_created:2026-03-24", + "redacted-risk-reports-undermine-quantified-risk-commitments-by-preventing-external-verification.md:set_created:2026-03-24", + 
"redacted-risk-reports-undermine-quantified-risk-commitments-by-preventing-external-verification.md:stripped_wiki_link:AI-transparency-is-declining-not-improving-because-Stanford-" + ], + "rejections": [ + "rsp-frontier-safety-roadmaps-create-public-accountability-through-concrete-milestones-not-binding-thresholds.md:missing_attribution_extractor", + "extended-evaluation-intervals-trade-speed-for-quality-when-evaluation-science-is-immature.md:missing_attribution_extractor", + "redacted-risk-reports-undermine-quantified-risk-commitments-by-preventing-external-verification.md:missing_attribution_extractor" + ] + }, + "model": "anthropic/claude-sonnet-4.5", + "date": "2026-03-24" +} \ No newline at end of file diff --git a/inbox/queue/2026-02-24-anthropic-rsp-v3-0-frontier-safety-roadmap.md b/inbox/queue/2026-02-24-anthropic-rsp-v3-0-frontier-safety-roadmap.md index d786a3606..60e2419f9 100644 --- a/inbox/queue/2026-02-24-anthropic-rsp-v3-0-frontier-safety-roadmap.md +++ b/inbox/queue/2026-02-24-anthropic-rsp-v3-0-frontier-safety-roadmap.md @@ -7,9 +7,13 @@ date: 2026-02-24 domain: ai-alignment secondary_domains: [] format: policy-document -status: unprocessed +status: enrichment priority: high tags: [rsp, responsible-scaling-policy, frontier-safety-roadmap, capability-thresholds, asl, evaluation-governance, anthropic] +processed_by: theseus +processed_date: 2026-03-24 +enrichments_applied: ["pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md"] +extraction_model: "anthropic/claude-sonnet-4.5" --- ## Content @@ -66,3 +70,14 @@ PRIMARY CONNECTION: [[voluntary safety pledges cannot survive competitive pressu WHY ARCHIVED: RSP v3.0 is the primary empirical test of whether Anthropic's governance evolution is moving toward or away from structural accountability. 
The Frontier Safety Roadmap adds concrete milestones not present in v2.0, but the "moderate confidence" on interpretability and redacted Risk Reports are significant limitations. EXTRACTION HINT: Two competing claims worth developing — (1) RSP v3.0's Frontier Safety Roadmap represents a genuine governance innovation (public grading, concrete milestones, internal forcing function) that goes beyond prior voluntary commitments; (2) RSP v3.0's self-imposed, redacted, and legally-unenforceable structure cannot close the accountability gap identified by independent evaluators. These may coexist as a divergence rather than resolving to one claim. + + +## Key Facts +- RSP v3.0 effective date: February 24, 2026 +- Evaluation interval changed from 3 months (v2.0) to 6 months (v3.0) +- Frontier Safety Roadmap milestones: April 2026 (moonshot security projects), July 2026 (policy recommendations), October 2026 (alignment assessments with 'moderate confidence'), January 2027 (world-class red-teaming), July 2027 (broad security maturity) +- AI R&D capability threshold disaggregated into two thresholds: (1) ability to fully automate entry-level AI research work; (2) ability to cause dramatic acceleration in effective scaling rate +- ASL-3 safeguards remain in effect under v3.0 +- METR continues its external evaluation partnership +- October 2026 alignment assessment targets 'interpretability techniques that produce meaningful signal beyond behavioral methods alone' with 'moderate confidence' +- Anthropic describes the Frontier Safety Roadmap as a 'self-imposed public accountability mechanism rather than a legally binding contract'