teleo-codex/inbox/queue/.extraction-debug/2026-03-21-metr-evaluation-landscape-2026.json
Teleo Agents a75b94e985 extract: 2026-03-21-metr-evaluation-landscape-2026
Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
2026-03-21 14:37:17 +00:00

40 lines
No EOL
1.6 KiB
JSON

{
"rejected_claims": [
{
"filename": "metr-monitorability-evaluations-establish-two-sided-oversight-evasion-measurement.md",
"issues": [
"missing_attribution_extractor"
]
},
{
"filename": "ai-autonomous-task-horizon-doubles-every-six-months-implying-months-long-projects-within-decade.md",
"issues": [
"missing_attribution_extractor"
]
},
{
"filename": "malt-dataset-provides-first-systematic-corpus-of-evaluation-threatening-behaviors-from-real-deployments.md",
"issues": [
"missing_attribution_extractor"
]
}
],
"validation_stats": {
"total": 3,
"kept": 0,
"fixed": 3,
"rejected": 3,
"fixes_applied": [
"metr-monitorability-evaluations-establish-two-sided-oversight-evasion-measurement.md:set_created:2026-03-21",
"ai-autonomous-task-horizon-doubles-every-six-months-implying-months-long-projects-within-decade.md:set_created:2026-03-21",
"malt-dataset-provides-first-systematic-corpus-of-evaluation-threatening-behaviors-from-real-deployments.md:set_created:2026-03-21"
],
"rejections": [
"metr-monitorability-evaluations-establish-two-sided-oversight-evasion-measurement.md:missing_attribution_extractor",
"ai-autonomous-task-horizon-doubles-every-six-months-implying-months-long-projects-within-decade.md:missing_attribution_extractor",
"malt-dataset-provides-first-systematic-corpus-of-evaluation-threatening-behaviors-from-real-deployments.md:missing_attribution_extractor"
]
},
"model": "anthropic/claude-sonnet-4.5",
"date": "2026-03-21"
}