teleo-codex/inbox/queue/.extraction-debug/2026-03-26-metr-algorithmic-vs-holistic-evaluation.json
Teleo Agents 5e3be7ff7c
Some checks are pending
Sync Graph Data to teleo-app / sync (push) Waiting to run
extract: 2026-03-26-metr-algorithmic-vs-holistic-evaluation
Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70>
2026-03-26 00:36:22 +00:00

34 lines
No EOL
2.3 KiB
JSON

{
"rejected_claims": [
{
"filename": "algorithmic-benchmark-scoring-overstates-ai-capability-by-2-3x-versus-holistic-human-review-because-automated-metrics-measure-core-implementation-while-missing-documentation-testing-and-code-quality.md",
"issues": [
"missing_attribution_extractor"
]
},
{
"filename": "capability-benchmark-version-instability-creates-governance-discontinuity-because-HCAST-time-horizon-estimates-shifted-50-percent-between-annual-versions-making-safety-thresholds-a-moving-target.md",
"issues": [
"missing_attribution_extractor"
]
}
],
"validation_stats": {
"total": 2,
"kept": 0,
"fixed": 4,
"rejected": 2,
"fixes_applied": [
"algorithmic-benchmark-scoring-overstates-ai-capability-by-2-3x-versus-holistic-human-review-because-automated-metrics-measure-core-implementation-while-missing-documentation-testing-and-code-quality.md:set_created:2026-03-26",
"algorithmic-benchmark-scoring-overstates-ai-capability-by-2-3x-versus-holistic-human-review-because-automated-metrics-measure-core-implementation-while-missing-documentation-testing-and-code-quality.md:stripped_wiki_link:AI-capability-and-reliability-are-independent-dimensions-bec",
"capability-benchmark-version-instability-creates-governance-discontinuity-because-HCAST-time-horizon-estimates-shifted-50-percent-between-annual-versions-making-safety-thresholds-a-moving-target.md:set_created:2026-03-26",
"capability-benchmark-version-instability-creates-governance-discontinuity-because-HCAST-time-horizon-estimates-shifted-50-percent-between-annual-versions-making-safety-thresholds-a-moving-target.md:stripped_wiki_link:Anthropics-RSP-rollback-under-commercial-pressure-is-the-fir"
],
"rejections": [
"algorithmic-benchmark-scoring-overstates-ai-capability-by-2-3x-versus-holistic-human-review-because-automated-metrics-measure-core-implementation-while-missing-documentation-testing-and-code-quality.md:missing_attribution_extractor",
"capability-benchmark-version-instability-creates-governance-discontinuity-because-HCAST-time-horizon-estimates-shifted-50-percent-between-annual-versions-making-safety-thresholds-a-moving-target.md:missing_attribution_extractor"
]
},
"model": "anthropic/claude-sonnet-4.5",
"date": "2026-03-26"
}