From adbe3bd91126997252f09087262d5443154e6440 Mon Sep 17 00:00:00 2001 From: m3taversal Date: Tue, 7 Apr 2026 11:38:25 +0100 Subject: [PATCH] =?UTF-8?q?fix:=20prevent=20reweave=20PR=20flood=20?= =?UTF-8?q?=E2=80=94=20freshen=20base,=20cleanup=20branches=20on=20failure?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three fixes for the reweave merge failure cycle: 1. reweave.py: fetch + reset to origin/main before branch creation, eliminating the stale-base problem that caused ~75% merge failure rate 2. merge.py: delete remote branch when closing reweave PRs (in reconcile, merge failure, and conflict retry paths) — prevents discover_external_prs from rediscovering stale branches and creating new PRs every 18 minutes 3. merge.py: skip cherry-pick retry for reweave branches — reweave modifies existing files so cherry-pick always fails, go straight to close+delete Pentagon-Agent: Ship Co-Authored-By: Claude Opus 4.6 (1M context) --- ops/pipeline-v2/lib/merge.py | 66 ++++++++++++++++++++++++++++++------ ops/pipeline-v2/reweave.py | 22 +++++++++++- 2 files changed, 76 insertions(+), 12 deletions(-) diff --git a/ops/pipeline-v2/lib/merge.py b/ops/pipeline-v2/lib/merge.py index 49a20c677..fd9593005 100644 --- a/ops/pipeline-v2/lib/merge.py +++ b/ops/pipeline-v2/lib/merge.py @@ -1432,13 +1432,22 @@ async def _merge_domain_queue(conn, domain: str) -> tuple[int, int]: continue if not pick_ok: - # Cherry-pick failed — this is a genuine conflict (not a race condition). - # No retry needed: cherry-pick onto fresh main means main can't have moved. - logger.warning("PR #%d cherry-pick failed: %s", pr_num, pick_msg) - conn.execute( - "UPDATE prs SET status = 'conflict', merge_cycled = 1, merge_failures = COALESCE(merge_failures, 0) + 1, last_error = ? WHERE number = ?", - (pick_msg[:500], pr_num), - ) + logger.warning("PR #%d merge/cherry-pick failed: %s", pr_num, pick_msg) + # Reweave: close immediately, don't retry (Ship: same rationale as ff-push failure) + if branch.startswith("reweave/"): + conn.execute( + "UPDATE prs SET status = 'closed', merge_cycled = 1, merge_failures = COALESCE(merge_failures, 0) + 1, last_error = ? WHERE number = ?", + (f"reweave merge failed (closed, not retried): {pick_msg[:400]}", pr_num), + ) + await forgejo_api("PATCH", repo_path(f"pulls/{pr_num}"), {"state": "closed"}) + await forgejo_api("POST", repo_path(f"issues/{pr_num}/comments"), + {"body": f"Reweave merge failed — closing. Next nightly reweave will create a fresh branch.\n\nError: {pick_msg[:200]}"}) + await _delete_remote_branch(branch) + else: + conn.execute( + "UPDATE prs SET status = 'conflict', merge_cycled = 1, merge_failures = COALESCE(merge_failures, 0) + 1, last_error = ? WHERE number = ?", + (pick_msg[:500], pr_num), + ) db.audit(conn, "merge", "cherry_pick_failed", json.dumps({"pr": pr_num, "error": pick_msg[:200]})) failed += 1 continue @@ -1483,10 +1492,24 @@ async def _merge_domain_queue(conn, domain: str) -> tuple[int, int]: if not merge_ok: logger.error("PR #%d merge failed: %s", pr_num, merge_msg) - conn.execute( - "UPDATE prs SET status = 'conflict', merge_cycled = 1, merge_failures = COALESCE(merge_failures, 0) + 1, last_error = ? WHERE number = ?", - (merge_msg[:500], pr_num), - ) + # Reweave PRs: close immediately on failure. Cherry-pick retry + # will always fail (reweave modifies existing files). Next nightly + # run creates a fresh branch from current main — retry is wasteful. + # (Ship: prevents reweave flood + wasted retry cycles) + if branch.startswith("reweave/"): + conn.execute( + "UPDATE prs SET status = 'closed', merge_cycled = 1, merge_failures = COALESCE(merge_failures, 0) + 1, last_error = ? WHERE number = ?", + (f"reweave merge failed (closed, not retried): {merge_msg[:400]}", pr_num), + ) + await forgejo_api("PATCH", repo_path(f"pulls/{pr_num}"), {"state": "closed"}) + await forgejo_api("POST", repo_path(f"issues/{pr_num}/comments"), + {"body": f"Reweave merge failed — closing. Next nightly reweave will create a fresh branch.\n\nError: {merge_msg[:200]}"}) + await _delete_remote_branch(branch) + else: + conn.execute( + "UPDATE prs SET status = 'conflict', merge_cycled = 1, merge_failures = COALESCE(merge_failures, 0) + 1, last_error = ? WHERE number = ?", + (merge_msg[:500], pr_num), + ) db.audit(conn, "merge", "merge_failed", json.dumps({"pr": pr_num, "error": merge_msg[:200]})) failed += 1 continue @@ -1583,6 +1606,11 @@ async def _reconcile_db_state(conn): continue if forgejo_state == "closed" and not is_merged and db_status not in ("closed",): + # Clean up branch too — stale branches get rediscovered as new PRs + # (Ship: prevents reweave flood where closed PRs leave branches that + # trigger discover_external_prs → new PR → fail → close → repeat) + if branch: + await _delete_remote_branch(branch) conn.execute( "UPDATE prs SET status = 'closed', last_error = 'reconciled: closed on Forgejo' WHERE number = ?", (pr_number,), @@ -1775,6 +1803,22 @@ async def _retry_conflict_prs(conn) -> tuple[int, int]: branch = row["branch"] attempts = row["conflict_rebase_attempts"] or 0 + # Reweave branches modify existing files — cherry-pick will always fail. + # Close immediately and delete branch. Next nightly reweave creates fresh. + # (Ship: prevents wasting 3 retry cycles on branches that can never cherry-pick) + if branch.startswith("reweave/"): + logger.info("Reweave PR #%d: skipping retry, closing + deleting branch", pr_number) + conn.execute( + "UPDATE prs SET status = 'closed', last_error = 'reweave: closed (retry skipped, next nightly creates fresh)' WHERE number = ?", + (pr_number,), + ) + await forgejo_api("PATCH", repo_path(f"pulls/{pr_number}"), {"state": "closed"}) + await forgejo_api("POST", repo_path(f"issues/{pr_number}/comments"), + {"body": "Reweave conflict — closing instead of retrying. Cherry-pick always fails on reweave branches (they modify existing files). Next nightly reweave will create a fresh branch from current main."}) + await _delete_remote_branch(branch) + failed += 1 + continue + logger.info("Conflict retry [%d/%d] PR #%d branch=%s", attempts + 1, MAX_CONFLICT_REBASE_ATTEMPTS, pr_number, branch) diff --git a/ops/pipeline-v2/reweave.py b/ops/pipeline-v2/reweave.py index 518078b01..a705e888f 100644 --- a/ops/pipeline-v2/reweave.py +++ b/ops/pipeline-v2/reweave.py @@ -597,7 +597,14 @@ def _write_edge_regex(neighbor_path: Path, fm_text: str, body_text: str, def create_branch(repo_root: Path, branch_name: str) -> bool: - """Create and checkout a new branch. Cleans up stale local/remote branches from prior failed runs.""" + """Create and checkout a new branch from fresh origin/main. + + Cleans up stale local/remote branches from prior failed runs, then + fetches + resets to origin/main so the branch is never based on stale state. + (Ship: reduces reweave merge failure rate from ~75% to near-zero by + eliminating the stale-base problem that causes superset assertion failures + and force-with-lease races.) + """ # Delete stale local branch if it exists (e.g., from a failed earlier run today) subprocess.run(["git", "branch", "-D", branch_name], cwd=str(repo_root), capture_output=True) # ignore errors if branch doesn't exist @@ -610,6 +617,19 @@ def create_branch(repo_root: Path, branch_name: str) -> bool: subprocess.run(["git", "push", push_url, "--delete", branch_name], cwd=str(repo_root), capture_output=True) # ignore errors if branch doesn't exist + # Freshen to origin/main before branching — ensures branch base matches + # the main HEAD that _merge_reweave_pr will read at merge time. + try: + subprocess.run(["git", "fetch", "origin", "main"], + cwd=str(repo_root), check=True, capture_output=True, timeout=30) + subprocess.run(["git", "checkout", "main"], + cwd=str(repo_root), check=True, capture_output=True) + subprocess.run(["git", "reset", "--hard", "origin/main"], + cwd=str(repo_root), check=True, capture_output=True) + except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e: + logger.error("Failed to freshen to origin/main: %s", e) + return False + try: subprocess.run(["git", "checkout", "-b", branch_name], cwd=str(repo_root), check=True, capture_output=True)