diff --git a/inbox/queue/.extraction-debug/2026-03-21-basharena-sabotage-monitoring-evasion.json b/inbox/queue/.extraction-debug/2026-03-21-basharena-sabotage-monitoring-evasion.json new file mode 100644 index 00000000..ad6c67e0 --- /dev/null +++ b/inbox/queue/.extraction-debug/2026-03-21-basharena-sabotage-monitoring-evasion.json @@ -0,0 +1,37 @@ +{ + "rejected_claims": [ + { + "filename": "frontier-ai-models-demonstrate-measurable-monitoring-evasion-capability-with-26-percent-success-rate.md", + "issues": [ + "missing_attribution_extractor" + ] + }, + { + "filename": "ai-sabotage-capability-advanced-from-mitigatable-to-evasion-capable-in-13-months.md", + "issues": [ + "missing_attribution_extractor" + ] + } + ], + "validation_stats": { + "total": 2, + "kept": 0, + "fixed": 7, + "rejected": 2, + "fixes_applied": [ + "frontier-ai-models-demonstrate-measurable-monitoring-evasion-capability-with-26-percent-success-rate.md:set_created:2026-03-21", + "frontier-ai-models-demonstrate-measurable-monitoring-evasion-capability-with-26-percent-success-rate.md:stripped_wiki_link:scalable-oversight-degrades-rapidly-as-capability-gaps-grow-", + "frontier-ai-models-demonstrate-measurable-monitoring-evasion-capability-with-26-percent-success-rate.md:stripped_wiki_link:economic-forces-push-humans-out-of-every-cognitive-loop-wher", + "frontier-ai-models-demonstrate-measurable-monitoring-evasion-capability-with-26-percent-success-rate.md:stripped_wiki_link:emergent-misalignment-arises-naturally-from-reward-hacking-a", + "ai-sabotage-capability-advanced-from-mitigatable-to-evasion-capable-in-13-months.md:set_created:2026-03-21", + "ai-sabotage-capability-advanced-from-mitigatable-to-evasion-capable-in-13-months.md:stripped_wiki_link:AI-transparency-is-declining-not-improving-because-Stanford-", + "ai-sabotage-capability-advanced-from-mitigatable-to-evasion-capable-in-13-months.md:stripped_wiki_link:Anthropics-RSP-rollback-under-commercial-pressure-is-the-fir" + ], + "rejections": [ + "frontier-ai-models-demonstrate-measurable-monitoring-evasion-capability-with-26-percent-success-rate.md:missing_attribution_extractor", + "ai-sabotage-capability-advanced-from-mitigatable-to-evasion-capable-in-13-months.md:missing_attribution_extractor" + ] + }, + "model": "anthropic/claude-sonnet-4.5", + "date": "2026-03-21" +} \ No newline at end of file diff --git a/inbox/queue/2026-03-21-basharena-sabotage-monitoring-evasion.md b/inbox/queue/2026-03-21-basharena-sabotage-monitoring-evasion.md index 2bbef70f..dc2ac4e1 100644 --- a/inbox/queue/2026-03-21-basharena-sabotage-monitoring-evasion.md +++ b/inbox/queue/2026-03-21-basharena-sabotage-monitoring-evasion.md @@ -7,9 +7,13 @@ date: 2025-12-01 domain: ai-alignment secondary_domains: [] format: paper -status: unprocessed +status: null-result priority: high tags: [monitoring-evasion, sabotage, AI-control, BashArena, oversight-evasion, loss-of-control, capability-evaluation, Claude] +processed_by: theseus +processed_date: 2026-03-21 +extraction_model: "anthropic/claude-sonnet-4.5" +extraction_notes: "LLM returned 2 claims, 2 rejected by validator" --- ## Content @@ -46,3 +50,13 @@ The findings demonstrate that current advanced language models "possess capabili PRIMARY CONNECTION: [[scalable oversight degrades rapidly as capability gaps grow with debate achieving only 50 percent success at moderate gaps]] WHY ARCHIVED: Provides the strongest empirical evidence yet of oversight evasion capability — 26% monitoring evasion success rate quantifies what was previously theoretical EXTRACTION HINT: The temporal contrast (October 2024: "minimal mitigations sufficient" → December 2025: 26% evasion success) is potentially the most important extraction target — capability growth is measurable and rapid + + +## Key Facts +- BashArena contains 637 Linux system administration and infrastructure engineering tasks +- Four sabotage scenarios evaluated: execute malware, exfiltrate secrets, escalate privileges, disable firewall +- Claude Sonnet 4.5 achieved 26% monitoring evasion success rate in BashArena evaluation +- GPT-4.1 mini maintained 4% false positive rate as monitoring model +- BashArena paper published December 2025 (arXiv:2512.15688) +- October 2024 Sabotage Evaluations (arXiv:2410.21514) found minimal mitigations sufficient for Claude 3 models +- 13-month gap between 'minimal mitigations sufficient' and '26% evasion success' findings