diff --git a/domains/ai-alignment/rlhf-is-implicit-social-choice-without-normative-scrutiny.md b/domains/ai-alignment/rlhf-is-implicit-social-choice-without-normative-scrutiny.md index d8d679b8..663c9193 100644 --- a/domains/ai-alignment/rlhf-is-implicit-social-choice-without-normative-scrutiny.md +++ b/domains/ai-alignment/rlhf-is-implicit-social-choice-without-normative-scrutiny.md @@ -27,6 +27,12 @@ This claim directly addresses the mechanism gap identified in [[RLHF and DPO bot The paper's proposed solution—RLCHF with explicit social welfare functions—connects to [[collective intelligence requires diversity as a structural precondition not a moral preference]] by formalizing how diverse evaluator input should be preserved rather than collapsed. + +### Additional Evidence (extend) +*Source: [[2025-06-00-li-scaling-human-judgment-community-notes-llms]] | Added: 2026-03-15* + +RLCF makes the social choice function explicit by separating generation (AI), evaluation (humans), and aggregation (bridging algorithm). Unlike RLHF where the reward model implicitly aggregates preferences during training, RLCF's bridging algorithm is a visible, auditable mechanism for combining diverse ratings. The matrix factorization approach (y_ij = w_i * x_j + b_i + c_j) makes the aggregation rule transparent: notes surface based on intercept scores that capture cross-partisan agreement. This architectural transparency enables normative scrutiny that RLHF's black-box reward models prevent. 
+ --- Relevant Notes: diff --git a/inbox/archive/.extraction-debug/2025-06-00-li-scaling-human-judgment-community-notes-llms.json b/inbox/archive/.extraction-debug/2025-06-00-li-scaling-human-judgment-community-notes-llms.json new file mode 100644 index 00000000..f0be43dc --- /dev/null +++ b/inbox/archive/.extraction-debug/2025-06-00-li-scaling-human-judgment-community-notes-llms.json @@ -0,0 +1,49 @@ +{ + "rejected_claims": [ + { + "filename": "rlcf-architecture-separates-ai-generation-from-human-evaluation-with-bridging-selection.md", + "issues": [ + "missing_attribution_extractor" + ] + }, + { + "filename": "bridging-based-consensus-mechanisms-risk-homogenization-toward-optimally-inoffensive-outputs.md", + "issues": [ + "missing_attribution_extractor" + ] + }, + { + "filename": "human-rating-authority-as-alignment-mechanism-preserves-judgment-sovereignty-while-scaling-content-generation.md", + "issues": [ + "missing_attribution_extractor" + ] + } + ], + "validation_stats": { + "total": 3, + "kept": 0, + "fixed": 12, + "rejected": 3, + "fixes_applied": [ + "rlcf-architecture-separates-ai-generation-from-human-evaluation-with-bridging-selection.md:set_created:2026-03-15", + "rlcf-architecture-separates-ai-generation-from-human-evaluation-with-bridging-selection.md:stripped_wiki_link:democratic-alignment-assemblies-produce-constitutions-as-eff", + "rlcf-architecture-separates-ai-generation-from-human-evaluation-with-bridging-selection.md:stripped_wiki_link:community-centred-norm-elicitation-surfaces-alignment-target", + "rlcf-architecture-separates-ai-generation-from-human-evaluation-with-bridging-selection.md:stripped_wiki_link:rlhf-is-implicit-social-choice-without-normative-scrutiny.md", + "bridging-based-consensus-mechanisms-risk-homogenization-toward-optimally-inoffensive-outputs.md:set_created:2026-03-15", + 
"bridging-based-consensus-mechanisms-risk-homogenization-toward-optimally-inoffensive-outputs.md:stripped_wiki_link:universal-alignment-is-mathematically-impossible-because-Arr", + "bridging-based-consensus-mechanisms-risk-homogenization-toward-optimally-inoffensive-outputs.md:stripped_wiki_link:pluralistic-alignment-must-accommodate-irreducibly-diverse-v", + "bridging-based-consensus-mechanisms-risk-homogenization-toward-optimally-inoffensive-outputs.md:stripped_wiki_link:some-disagreements-are-permanently-irreducible-because-they-", + "human-rating-authority-as-alignment-mechanism-preserves-judgment-sovereignty-while-scaling-content-generation.md:set_created:2026-03-15", + "human-rating-authority-as-alignment-mechanism-preserves-judgment-sovereignty-while-scaling-content-generation.md:stripped_wiki_link:human-in-the-loop-at-the-architectural-level-means-humans-se", + "human-rating-authority-as-alignment-mechanism-preserves-judgment-sovereignty-while-scaling-content-generation.md:stripped_wiki_link:coding-agents-cannot-take-accountability-for-mistakes-which-", + "human-rating-authority-as-alignment-mechanism-preserves-judgment-sovereignty-while-scaling-content-generation.md:stripped_wiki_link:economic-forces-push-humans-out-of-every-cognitive-loop-wher" + ], + "rejections": [ + "rlcf-architecture-separates-ai-generation-from-human-evaluation-with-bridging-selection.md:missing_attribution_extractor", + "bridging-based-consensus-mechanisms-risk-homogenization-toward-optimally-inoffensive-outputs.md:missing_attribution_extractor", + "human-rating-authority-as-alignment-mechanism-preserves-judgment-sovereignty-while-scaling-content-generation.md:missing_attribution_extractor" + ] + }, + "model": "anthropic/claude-sonnet-4.5", + "date": "2026-03-15" +} \ No newline at end of file diff --git a/inbox/archive/2025-06-00-li-scaling-human-judgment-community-notes-llms.md b/inbox/archive/2025-06-00-li-scaling-human-judgment-community-notes-llms.md index 095a911b..81f6ee2c 
100644 --- a/inbox/archive/2025-06-00-li-scaling-human-judgment-community-notes-llms.md +++ b/inbox/archive/2025-06-00-li-scaling-human-judgment-community-notes-llms.md @@ -7,9 +7,13 @@ date: 2025-06-30 domain: ai-alignment secondary_domains: [collective-intelligence] format: paper -status: unprocessed +status: enrichment priority: high tags: [RLCF, community-notes, bridging-algorithm, pluralistic-alignment, human-AI-collaboration, LLM-alignment] +processed_by: theseus +processed_date: 2026-03-15 +enrichments_applied: ["rlhf-is-implicit-social-choice-without-normative-scrutiny.md"] +extraction_model: "anthropic/claude-sonnet-4.5" --- ## Content @@ -51,3 +55,11 @@ Proposes a hybrid model for Community Notes where both humans and LLMs write notes PRIMARY CONNECTION: democratic alignment assemblies produce constitutions as effective as expert-designed ones while better representing diverse populations WHY ARCHIVED: First concrete specification of RLCF — transitions from design principle to implementable mechanism EXTRACTION HINT: Focus on the architecture (who generates, who rates, what selects) and the homogenization risk — the "optimally inoffensive" failure mode is a key tension with our bridging-based alignment thesis + + +## Key Facts +- Community Notes RLCF system uses matrix factorization: y_ij = w_i * x_j + b_i + c_j, where c_j is the bridging score +- RLCF training uses predicted intercept scores as the reward signal +- Stylistic novelty bonuses are added to bridging scores to prevent homogenization +- Paper published in Journal of Online Trust and Safety, June 2025 +- Authors identify four key risks: helpfulness hacking, declining human engagement, homogenization, and overwhelmed rater capacity