From 751432360867755dc81514c9b246a7c6c7aa7c7a Mon Sep 17 00:00:00 2001 From: Teleo Agents Date: Sun, 15 Mar 2026 19:29:03 +0000 Subject: [PATCH] extract: 2025-06-00-li-scaling-human-judgment-community-notes-llms Pentagon-Agent: Ganymede --- ...ocial-choice-without-normative-scrutiny.md | 6 ++ ...g-human-judgment-community-notes-llms.json | 58 +++++++++++++++++++ ...ing-human-judgment-community-notes-llms.md | 14 ++++- 3 files changed, 77 insertions(+), 1 deletion(-) create mode 100644 inbox/archive/.extraction-debug/2025-06-00-li-scaling-human-judgment-community-notes-llms.json diff --git a/domains/ai-alignment/rlhf-is-implicit-social-choice-without-normative-scrutiny.md b/domains/ai-alignment/rlhf-is-implicit-social-choice-without-normative-scrutiny.md index d8d679b8..dc59e956 100644 --- a/domains/ai-alignment/rlhf-is-implicit-social-choice-without-normative-scrutiny.md +++ b/domains/ai-alignment/rlhf-is-implicit-social-choice-without-normative-scrutiny.md @@ -27,6 +27,12 @@ This claim directly addresses the mechanism gap identified in [[RLHF and DPO bot The paper's proposed solution—RLCHF with explicit social welfare functions—connects to [[collective intelligence requires diversity as a structural precondition not a moral preference]] by formalizing how diverse evaluator input should be preserved rather than collapsed. + +### Additional Evidence (extend) +*Source: [[2025-06-00-li-scaling-human-judgment-community-notes-llms]] | Added: 2026-03-15* + +RLCF makes the social choice mechanism explicit through the bridging algorithm (matrix factorization with intercept scores). Unlike standard RLHF, which aggregates preferences opaquely through reward model training, RLCF uses intercepts as the training signal—a deliberate choice to optimize for cross-partisan agreement, i.e., a specific social welfare function. 
+ --- Relevant Notes: diff --git a/inbox/archive/.extraction-debug/2025-06-00-li-scaling-human-judgment-community-notes-llms.json b/inbox/archive/.extraction-debug/2025-06-00-li-scaling-human-judgment-community-notes-llms.json new file mode 100644 index 00000000..38bd99dc --- /dev/null +++ b/inbox/archive/.extraction-debug/2025-06-00-li-scaling-human-judgment-community-notes-llms.json @@ -0,0 +1,58 @@ +{ + "rejected_claims": [ + { + "filename": "rlcf-architecture-separates-ai-generation-from-human-evaluation-with-bridging-algorithm-selection.md", + "issues": [ + "missing_attribution_extractor" + ] + }, + { + "filename": "bridging-based-consensus-mechanisms-risk-homogenization-toward-optimally-inoffensive-content.md", + "issues": [ + "no_frontmatter" + ] + }, + { + "filename": "human-rating-authority-in-ai-systems-preserves-alignment-by-keeping-value-judgment-in-human-hands.md", + "issues": [ + "missing_attribution_extractor" + ] + }, + { + "filename": "stylistic-novelty-rewards-in-rlcf-balance-optimization-pressure-with-diversity-preservation.md", + "issues": [ + "missing_attribution_extractor" + ] + } + ], + "validation_stats": { + "total": 4, + "kept": 0, + "fixed": 14, + "rejected": 4, + "fixes_applied": [ + "rlcf-architecture-separates-ai-generation-from-human-evaluation-with-bridging-algorithm-selection.md:set_created:2026-03-15", + "rlcf-architecture-separates-ai-generation-from-human-evaluation-with-bridging-algorithm-selection.md:stripped_wiki_link:democratic-alignment-assemblies-produce-constitutions-as-eff", + "rlcf-architecture-separates-ai-generation-from-human-evaluation-with-bridging-algorithm-selection.md:stripped_wiki_link:community-centred-norm-elicitation-surfaces-alignment-target", + "rlcf-architecture-separates-ai-generation-from-human-evaluation-with-bridging-algorithm-selection.md:stripped_wiki_link:rlhf-is-implicit-social-choice-without-normative-scrutiny.md", + 
"bridging-based-consensus-mechanisms-risk-homogenization-toward-optimally-inoffensive-content.md:set_created:2026-03-15", + "bridging-based-consensus-mechanisms-risk-homogenization-toward-optimally-inoffensive-content.md:stripped_wiki_link:universal-alignment-is-mathematically-impossible-because-Arr", + "bridging-based-consensus-mechanisms-risk-homogenization-toward-optimally-inoffensive-content.md:stripped_wiki_link:pluralistic-alignment-must-accommodate-irreducibly-diverse-v", + "bridging-based-consensus-mechanisms-risk-homogenization-toward-optimally-inoffensive-content.md:stripped_wiki_link:some-disagreements-are-permanently-irreducible-because-they-", + "human-rating-authority-in-ai-systems-preserves-alignment-by-keeping-value-judgment-in-human-hands.md:set_created:2026-03-15", + "human-rating-authority-in-ai-systems-preserves-alignment-by-keeping-value-judgment-in-human-hands.md:stripped_wiki_link:coding-agents-cannot-take-accountability-for-mistakes-which-", + "human-rating-authority-in-ai-systems-preserves-alignment-by-keeping-value-judgment-in-human-hands.md:stripped_wiki_link:human-in-the-loop-at-the-architectural-level-means-humans-se", + "stylistic-novelty-rewards-in-rlcf-balance-optimization-pressure-with-diversity-preservation.md:set_created:2026-03-15", + "stylistic-novelty-rewards-in-rlcf-balance-optimization-pressure-with-diversity-preservation.md:stripped_wiki_link:pluralistic-ai-alignment-through-multiple-systems-preserves-", + "stylistic-novelty-rewards-in-rlcf-balance-optimization-pressure-with-diversity-preservation.md:stripped_wiki_link:high-AI-exposure-increases-collective-idea-diversity-without" + ], + "rejections": [ + "rlcf-architecture-separates-ai-generation-from-human-evaluation-with-bridging-algorithm-selection.md:missing_attribution_extractor", + "bridging-based-consensus-mechanisms-risk-homogenization-toward-optimally-inoffensive-content.md:no_frontmatter", + 
"human-rating-authority-in-ai-systems-preserves-alignment-by-keeping-value-judgment-in-human-hands.md:missing_attribution_extractor", + "stylistic-novelty-rewards-in-rlcf-balance-optimization-pressure-with-diversity-preservation.md:missing_attribution_extractor" + ] + }, + "model": "anthropic/claude-sonnet-4.5", + "date": "2026-03-15" +} \ No newline at end of file diff --git a/inbox/archive/2025-06-00-li-scaling-human-judgment-community-notes-llms.md b/inbox/archive/2025-06-00-li-scaling-human-judgment-community-notes-llms.md index 095a911b..2d14049b 100644 --- a/inbox/archive/2025-06-00-li-scaling-human-judgment-community-notes-llms.md +++ b/inbox/archive/2025-06-00-li-scaling-human-judgment-community-notes-llms.md @@ -7,9 +7,13 @@ date: 2025-06-30 domain: ai-alignment secondary_domains: [collective-intelligence] format: paper -status: unprocessed +status: enrichment priority: high tags: [RLCF, community-notes, bridging-algorithm, pluralistic-alignment, human-AI-collaboration, LLM-alignment] +processed_by: theseus +processed_date: 2026-03-15 +enrichments_applied: ["rlhf-is-implicit-social-choice-without-normative-scrutiny.md"] +extraction_model: "anthropic/claude-sonnet-4.5" --- ## Content @@ -51,3 +55,11 @@ Proposes a hybrid model for Community Notes where both humans and LLMs write not PRIMARY CONNECTION: democratic alignment assemblies produce constitutions as effective as expert-designed ones while better representing diverse populations WHY ARCHIVED: First concrete specification of RLCF — transitions from design principle to implementable mechanism EXTRACTION HINT: Focus on the architecture (who generates, who rates, what selects) and the homogenization risk — the "optimally inoffensive" failure mode is a key tension with our bridging-based alignment thesis + + +## Key Facts +- The paper proposes a hybrid Community Notes model in which both humans and LLMs write notes, but humans alone rate them +- The bridging algorithm uses matrix factorization: y_ij = w_i * x_j + b_i + 
c_j, where c_j is the bridging score +- Notes must receive support from raters with diverse viewpoints to surface +- The paper was published in the Journal of Online Trust and Safety in June 2025 +- Key risks identified: helpfulness hacking, declining human engagement, homogenization, overwhelmed rater capacity