From 96fd8d29366e29c4eb23358e950ead35b86d12da Mon Sep 17 00:00:00 2001 From: Teleo Agents Date: Wed, 25 Mar 2026 00:21:08 +0000 Subject: [PATCH] extract: 2026-03-25-metr-developer-productivity-rct-full-paper Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70> --- ...ernance-built-on-unreliable-foundations.md | 6 +++++ ...developer-productivity-rct-full-paper.json | 26 +++++++++++++++++++ ...r-developer-productivity-rct-full-paper.md | 15 ++++++++++- 3 files changed, 46 insertions(+), 1 deletion(-) create mode 100644 inbox/queue/.extraction-debug/2026-03-25-metr-developer-productivity-rct-full-paper.json diff --git a/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md b/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md index 7d9864db..7f641c40 100644 --- a/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md +++ b/domains/ai-alignment/pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md @@ -119,6 +119,12 @@ Anthropic's explicit admission that 'the science of model evaluation isn't well- METR's scaffold sensitivity finding (GPT-4o and o3 performing better under Vivaria than Inspect) adds a new dimension to evaluation unreliability: the same model produces different capability estimates depending on evaluation infrastructure, introducing cross-model comparison uncertainty that governance frameworks do not account for. 
+### Additional Evidence (extend) +*Source: [[2026-03-25-metr-developer-productivity-rct-full-paper]] | Added: 2026-03-25* + +METR's methodology (RCT + 143 hours of screen recordings at ~10-second resolution) represents the most rigorous empirical design deployed for AI productivity research. The combination of randomized assignment, real tasks developers would normally work on, and granular behavioral decomposition sets a new standard for evaluation quality. This contrasts sharply with pre-deployment evaluations that lack real-world task context. + + diff --git a/inbox/queue/.extraction-debug/2026-03-25-metr-developer-productivity-rct-full-paper.json b/inbox/queue/.extraction-debug/2026-03-25-metr-developer-productivity-rct-full-paper.json new file mode 100644 index 00000000..df5a725f --- /dev/null +++ b/inbox/queue/.extraction-debug/2026-03-25-metr-developer-productivity-rct-full-paper.json @@ -0,0 +1,26 @@ +{ + "rejected_claims": [ + { + "filename": "ai-tools-create-productivity-illusion-where-experienced-developers-work-slower-while-perceiving-speedup.md", + "issues": [ + "missing_attribution_extractor" + ] + } + ], + "validation_stats": { + "total": 1, + "kept": 0, + "fixed": 3, + "rejected": 1, + "fixes_applied": [ + "ai-tools-create-productivity-illusion-where-experienced-developers-work-slower-while-perceiving-speedup.md:set_created:2026-03-25", + "ai-tools-create-productivity-illusion-where-experienced-developers-work-slower-while-perceiving-speedup.md:stripped_wiki_link:deep-technical-expertise-is-a-greater-force-multiplier-when-", + "ai-tools-create-productivity-illusion-where-experienced-developers-work-slower-while-perceiving-speedup.md:stripped_wiki_link:the-gap-between-theoretical-AI-capability-and-observed-deplo" + ], + "rejections": [ + "ai-tools-create-productivity-illusion-where-experienced-developers-work-slower-while-perceiving-speedup.md:missing_attribution_extractor" + ] + }, + "model": "anthropic/claude-sonnet-4.5", + "date": "2026-03-25" +} 
\ No newline at end of file diff --git a/inbox/queue/2026-03-25-metr-developer-productivity-rct-full-paper.md b/inbox/queue/2026-03-25-metr-developer-productivity-rct-full-paper.md index d3b382e6..1f7dbb65 100644 --- a/inbox/queue/2026-03-25-metr-developer-productivity-rct-full-paper.md +++ b/inbox/queue/2026-03-25-metr-developer-productivity-rct-full-paper.md @@ -7,9 +7,13 @@ date: 2025-07-10 domain: ai-alignment secondary_domains: [] format: research-paper -status: unprocessed +status: enrichment priority: medium tags: [developer-productivity, RCT, benchmark-reality-gap, METR, AI-tools, slowdown, human-AI-collaboration] +processed_by: theseus +processed_date: 2026-03-25 +enrichments_applied: ["pre-deployment-AI-evaluations-do-not-predict-real-world-risk-creating-institutional-governance-built-on-unreliable-foundations.md"] +extraction_model: "anthropic/claude-sonnet-4.5" --- ## Content @@ -56,3 +60,12 @@ METR's randomized controlled trial measuring how early-2025 AI tools affect prod PRIMARY CONNECTION: [[the gap between theoretical AI capability and observed deployment is massive across all occupations]] — provides the strongest empirical evidence that expert productivity with AI tools may decline, not just lag WHY ARCHIVED: Foundation for the benchmark-reality gap analysis; also contains the strongest RCT evidence on human-AI productivity in expert domains EXTRACTION HINT: CRITICAL DISTINCTION: This RCT measures human developers using AI tools → they were slower. The "0% production-ready" finding is from METR's separate holistic evaluation of autonomous AI agents. Do NOT conflate. The RCT is primarily about human+AI productivity, the holistic evaluation is about AI-only task completion. Both matter but for different KB claims. 
+ + +## Key Facts +- METR's developer productivity RCT included 16 experienced developers from repos averaging 22k+ stars and 1M+ lines of code +- The study analyzed 246 completed issues with 143 hours of screen recordings at ~10-second resolution (29% of total hours) +- Primary AI tools tested were Cursor Pro with Claude 3.5/3.7 Sonnet +- Before the tasks, developers forecast that AI would reduce their time by 24%; after the study, they estimated a 20% reduction; the actual result was that they were 19% slower +- The study used clustered standard errors and was at the edge of statistical power with 246 issues +- Full paper published as arXiv 2507.09089 with GitHub data at METR/Measuring-Early-2025-AI-on-Exp-OSS-Devs