diff --git a/domains/ai-alignment/single-reward-rlhf-cannot-align-diverse-preferences-because-alignment-gap-grows-proportional-to-minority-distinctiveness.md b/domains/ai-alignment/single-reward-rlhf-cannot-align-diverse-preferences-because-alignment-gap-grows-proportional-to-minority-distinctiveness.md index a19a82ade..7b7364713 100644 --- a/domains/ai-alignment/single-reward-rlhf-cannot-align-diverse-preferences-because-alignment-gap-grows-proportional-to-minority-distinctiveness.md +++ b/domains/ai-alignment/single-reward-rlhf-cannot-align-diverse-preferences-because-alignment-gap-grows-proportional-to-minority-distinctiveness.md @@ -33,6 +33,12 @@ Chakraborty, Qiu, Yuan, Koppel, Manocha, Huang, Bedi, Wang. "MaxMin-RLHF: Alignm Study demonstrates that models trained on different demographic populations show measurable behavioral divergence (3-5 percentage points), providing empirical evidence that single-reward functions trained on one population systematically misalign with others. + +### Additional Evidence (extend) +*Source: [[2025-11-00-sahoo-rlhf-alignment-trilemma]] | Added: 2026-03-16* + +The formal trilemma proof quantifies the practical gap: current RLHF systems collect 10^3-10^4 samples from homogeneous annotator pools while 10^7-10^8 samples are needed for true global representation — a shortfall of roughly four orders of magnitude. The complexity bound shows this gap cannot be closed while maintaining polynomial tractability, making the alignment gap a necessary consequence of the trilemma rather than a sampling problem. 
+ --- Relevant Notes: diff --git a/inbox/archive/.extraction-debug/2025-11-00-sahoo-rlhf-alignment-trilemma.json b/inbox/archive/.extraction-debug/2025-11-00-sahoo-rlhf-alignment-trilemma.json new file mode 100644 index 000000000..80ec32372 --- /dev/null +++ b/inbox/archive/.extraction-debug/2025-11-00-sahoo-rlhf-alignment-trilemma.json @@ -0,0 +1,36 @@ +{ + "rejected_claims": [ + { + "filename": "rlhf-alignment-trilemma-is-computationally-proven-impossibility.md", + "issues": [ + "missing_attribution_extractor" + ] + }, + { + "filename": "rlhf-pathologies-are-computational-necessities-not-implementation-bugs.md", + "issues": [ + "missing_attribution_extractor" + ] + } + ], + "validation_stats": { + "total": 2, + "kept": 0, + "fixed": 6, + "rejected": 2, + "fixes_applied": [ + "rlhf-alignment-trilemma-is-computationally-proven-impossibility.md:set_created:2026-03-16", + "rlhf-alignment-trilemma-is-computationally-proven-impossibility.md:stripped_wiki_link:universal-alignment-is-mathematically-impossible-because-Arr", + "rlhf-alignment-trilemma-is-computationally-proven-impossibility.md:stripped_wiki_link:single-reward-rlhf-cannot-align-diverse-preferences-because-", + "rlhf-pathologies-are-computational-necessities-not-implementation-bugs.md:set_created:2026-03-16", + "rlhf-pathologies-are-computational-necessities-not-implementation-bugs.md:stripped_wiki_link:emergent-misalignment-arises-naturally-from-reward-hacking-a", + "rlhf-pathologies-are-computational-necessities-not-implementation-bugs.md:stripped_wiki_link:rlhf-is-implicit-social-choice-without-normative-scrutiny.md" + ], + "rejections": [ + "rlhf-alignment-trilemma-is-computationally-proven-impossibility.md:missing_attribution_extractor", + "rlhf-pathologies-are-computational-necessities-not-implementation-bugs.md:missing_attribution_extractor" + ] + }, + "model": "anthropic/claude-sonnet-4.5", + "date": "2026-03-16" +} \ No newline at end of file diff --git 
a/inbox/archive/2025-11-00-sahoo-rlhf-alignment-trilemma.md b/inbox/archive/2025-11-00-sahoo-rlhf-alignment-trilemma.md index 17c59596c..9110b0a55 100644 --- a/inbox/archive/2025-11-00-sahoo-rlhf-alignment-trilemma.md +++ b/inbox/archive/2025-11-00-sahoo-rlhf-alignment-trilemma.md @@ -7,9 +7,13 @@ date: 2025-11-01 domain: ai-alignment secondary_domains: [collective-intelligence] format: paper -status: unprocessed +status: enrichment priority: high tags: [alignment-trilemma, impossibility-result, rlhf, representativeness, robustness, tractability, preference-collapse, sycophancy] +processed_by: theseus +processed_date: 2026-03-16 +enrichments_applied: ["single-reward-rlhf-cannot-align-diverse-preferences-because-alignment-gap-grows-proportional-to-minority-distinctiveness.md"] +extraction_model: "anthropic/claude-sonnet-4.5" --- ## Content @@ -56,3 +60,11 @@ Position paper from Berkeley AI Safety Initiative, AWS/Stanford, Meta/Stanford, PRIMARY CONNECTION: [[RLHF and DPO both fail at preference diversity because they assume a single reward function can capture context-dependent human values]] WHY ARCHIVED: Formalizes our informal impossibility claim with complexity-theoretic proof — independent confirmation of Arrow's-theorem-based argument from a different mathematical tradition EXTRACTION HINT: The trilemma is the key claim. Also extract the practical gap (10^3 vs 10^8) and the "pathologies as computational necessities" framing + + +## Key Facts +- Paper presented at NeurIPS 2025 Workshop on Socially Responsible and Trustworthy Foundation Models +- Authors affiliated with Berkeley AI Safety Initiative, AWS, Stanford, Meta, and Northeastern +- Current RLHF systems collect 10^3-10^4 samples from annotator pools +- True global representation would require 10^7-10^8 samples +- Models assign >99% probability to majority opinions in current implementations