From 1f04b73459c876c8550610112e0f6f63e0a1847b Mon Sep 17 00:00:00 2001 From: Teleo Agents Date: Sun, 22 Mar 2026 04:16:11 +0000 Subject: [PATCH] extract: 2026-03-22-automation-bias-rct-ai-trained-physicians Pentagon-Agent: Epimetheus <3D35839A-7722-4740-B93D-51157F7D5E70> --- ... errors when overriding correct outputs.md | 6 +++++ ...mation-bias-rct-ai-trained-physicians.json | 26 +++++++++++++++++++ ...tomation-bias-rct-ai-trained-physicians.md | 18 ++++++++++++- 3 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 inbox/queue/.extraction-debug/2026-03-22-automation-bias-rct-ai-trained-physicians.json diff --git a/domains/health/human-in-the-loop clinical AI degrades to worse-than-AI-alone because physicians both de-skill from reliance and introduce errors when overriding correct outputs.md b/domains/health/human-in-the-loop clinical AI degrades to worse-than-AI-alone because physicians both de-skill from reliance and introduce errors when overriding correct outputs.md index 48a0da2a4..f5c719557 100644 --- a/domains/health/human-in-the-loop clinical AI degrades to worse-than-AI-alone because physicians both de-skill from reliance and introduce errors when overriding correct outputs.md +++ b/domains/health/human-in-the-loop clinical AI degrades to worse-than-AI-alone because physicians both de-skill from reliance and introduce errors when overriding correct outputs.md @@ -33,6 +33,12 @@ OpenEvidence's 1M daily consultations (30M+/month) with 44% of physicians expres --- +### Additional Evidence (extend) +*Source: [[2026-03-22-automation-bias-rct-ai-trained-physicians]] | Added: 2026-03-22* + +RCT evidence (NCT06963957, medRxiv August 2025) shows that even 20 hours of AI-literacy training covering LLM capabilities, prompt engineering, and critical evaluation is insufficient to prevent automation bias. 
Physicians with this training still showed significantly degraded diagnostic accuracy when presented with deliberately erroneous ChatGPT-4o recommendations in 3 of 6 clinical vignettes. The study describes this as 'voluntary deference to flawed AI output' creating 'critical patient safety risk.' The emergence of follow-on trial NCT07328815 testing 'behavioral nudges' to mitigate automation bias suggests the field recognizes training alone is insufficient. + + Relevant Notes: - [[centaur team performance depends on role complementarity not mere human-AI combination]] -- the chess centaur model does NOT generalize to clinical medicine where physician overrides degrade AI performance - [[medical LLM benchmark performance does not translate to clinical impact because physicians with and without AI access achieve similar diagnostic accuracy in randomized trials]] -- the multi-hospital RCT found similar diagnostic accuracy with/without AI; the Stanford/Harvard study found AI alone dramatically superior diff --git a/inbox/queue/.extraction-debug/2026-03-22-automation-bias-rct-ai-trained-physicians.json b/inbox/queue/.extraction-debug/2026-03-22-automation-bias-rct-ai-trained-physicians.json new file mode 100644 index 000000000..2b8f4774c --- /dev/null +++ b/inbox/queue/.extraction-debug/2026-03-22-automation-bias-rct-ai-trained-physicians.json @@ -0,0 +1,26 @@ +{ + "rejected_claims": [ + { + "filename": "ai-literacy-training-insufficient-to-prevent-automation-bias-in-physician-llm-diagnostic-settings.md", + "issues": [ + "missing_attribution_extractor" + ] + } + ], + "validation_stats": { + "total": 1, + "kept": 0, + "fixed": 3, + "rejected": 1, + "fixes_applied": [ + "ai-literacy-training-insufficient-to-prevent-automation-bias-in-physician-llm-diagnostic-settings.md:set_created:2026-03-22", + "ai-literacy-training-insufficient-to-prevent-automation-bias-in-physician-llm-diagnostic-settings.md:stripped_wiki_link:human-in-the-loop clinical AI degrades to 
worse-than-AI-alon", + "ai-literacy-training-insufficient-to-prevent-automation-bias-in-physician-llm-diagnostic-settings.md:stripped_wiki_link:medical LLM benchmark performance does not translate to clin" + ], + "rejections": [ + "ai-literacy-training-insufficient-to-prevent-automation-bias-in-physician-llm-diagnostic-settings.md:missing_attribution_extractor" + ] + }, + "model": "anthropic/claude-sonnet-4.5", + "date": "2026-03-22" +} \ No newline at end of file diff --git a/inbox/queue/2026-03-22-automation-bias-rct-ai-trained-physicians.md b/inbox/queue/2026-03-22-automation-bias-rct-ai-trained-physicians.md index 3f96fa840..36a3aee46 100644 --- a/inbox/queue/2026-03-22-automation-bias-rct-ai-trained-physicians.md +++ b/inbox/queue/2026-03-22-automation-bias-rct-ai-trained-physicians.md @@ -7,9 +7,13 @@ date: 2025-08-26 domain: health secondary_domains: [ai-alignment] format: research paper -status: unprocessed +status: enrichment priority: high tags: [automation-bias, clinical-ai-safety, physician-rct, llm-diagnostic, centaur-model, ai-literacy, chatgpt, randomized-trial] +processed_by: vida +processed_date: 2026-03-22 +enrichments_applied: ["human-in-the-loop clinical AI degrades to worse-than-AI-alone because physicians both de-skill from reliance and introduce errors when overriding correct outputs.md"] +extraction_model: "anthropic/claude-sonnet-4.5" --- ## Content @@ -55,3 +59,15 @@ Meta-analysis on LLM effect on diagnostic accuracy (medRxiv December 2025) synth PRIMARY CONNECTION: "clinical AI augments physicians but creates novel safety risks requiring centaur design" (Belief 5's centaur assumption) WHY ARCHIVED: First RCT showing that even AI-trained physicians fail to catch erroneous AI recommendations — the centaur model's "physician catches errors" safety assumption is empirically weaker than stated EXTRACTION HINT: Extract the automation-bias-despite-AI-training finding as a challenge to the centaur design assumption. 
Note the follow-on NCT07328815 trial as evidence the field recognizes the problem requires specific intervention. + + +## Key Facts +- NCT06963957 registered as 'Automation Bias in Physician-LLM Diagnostic Reasoning' +- Study timeframe: June 20 to August 15, 2025 +- Participants: Physicians registered with the Pakistan Medical and Dental Council (MBBS degrees) +- Training duration: 20 hours covering LLM capabilities, prompt engineering, and critical evaluation +- Study design: Single-blind RCT, 1:1 randomization, 6 clinical vignettes, 75-minute session +- Treatment arm: 3 of 6 vignettes had deliberate errors in ChatGPT-4o recommendations +- Related trial: JAMA Network Open 'LLM Influence on Diagnostic Reasoning' (June 2025, PMID: 2825395) +- Follow-on trial: NCT07328815 'Mitigating Automation Bias in Physician-LLM Diagnostic Reasoning Using Behavioral Nudges' +- Meta-analysis synthesizing these trials published on medRxiv, December 2025 -- 2.45.2